├── .github └── workflows │ ├── ci.yml │ └── notice.yml ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── demo_test.t ├── files ├── Arabidopsis_thaliana.fna.gz ├── EnsemblPlants47.png ├── runtime_ram.png └── test_transcripts.fna ├── install_R_deps.R ├── lib ├── R │ └── README.md ├── cpanfileEnsembl ├── cpanfileREST └── requirements.txt ├── pangenes ├── CHANGES.txt ├── HPC.conf.sample ├── HPC.conf.sample.slurm ├── README.md ├── _chunk_chr.pl ├── _cluster_analysis.pl ├── _collinear_genes.pl ├── _cut_sequences.pl ├── _dotplot.pl ├── asciinema.txt ├── bin │ └── README.md ├── check_evidence.pl ├── check_quality.pl ├── cpanfile ├── get_pangenes.pl ├── lib │ ├── HPCluster.pm │ └── pangeneTools.pm ├── match_cluster.pl ├── pics │ ├── collinear_pangenes_minimap2.png │ ├── fixing_genemodels.png │ ├── flow-check-evidence.dia │ ├── flow-check-evidence.png │ ├── flow-get-pangenes.dia │ ├── flow-get-pangenes.png │ ├── long_model.png │ ├── pairs2clusters.png │ ├── pangene_set_nomenclature.png │ ├── pangenesPAG2023.pdf │ ├── wgaoverlap.dia │ └── wgaoverlap.png ├── plots │ ├── core_gene.tab_core_both.png │ ├── dotplot.png │ ├── haplotypes.trimmed.png │ ├── pan_gene.tab_pan.png │ ├── pangene_context.png │ └── pangene_matrix__shell.png └── rename_pangenes.pl ├── phylogenomics ├── Oryza.log ├── PlantCompUtils.pm ├── README.md ├── TODO.txt ├── downloads │ └── README.txt ├── ens_sequences.pl ├── ens_single-copy_core_genes.pl ├── ens_syntelogs.pl └── phylo_test.t ├── recipes ├── exampleAPI.pl ├── exampleBiomart.R ├── exampleCRAM.pl ├── exampleFTP.sh ├── exampleMySQL.sh ├── exampleREST.R ├── exampleREST.pl ├── exampleREST.py └── exampleVEP.sh └── repeats ├── AnnotRedRepeats.py ├── README.md ├── Red2Ensembl.py ├── bench ├── README.md ├── list.Red ├── list.cores ├── list.cores.sp ├── list.cores.sp.toplevel ├── list.cores.wheat ├── list.toplevel ├── log.Rgenes.50 ├── log.exons ├── log.gc ├── log.genes ├── log.nrplants.bed ├── log.redat.bed ├── log.repeat.N50 ├── log.repeat.length ├── 
log.repeat.overlap ├── log.updown500 ├── log.updown500.16mer ├── log.updown500.21mer ├── log.updown500.31mer ├── log.wheat.Red.bed ├── log.wheat.redat.bed ├── pfam │ └── enrich.R └── repeatmodeller │ ├── HOWTO.txt │ ├── list.cores.sp │ ├── log.Rgenes │ ├── log.Rgenes.50 │ ├── log.exons │ ├── log.genes │ ├── log.repeat.N50 │ ├── log.repeat.length │ ├── log.repeat.overlap │ └── log.updown500 └── get_repeats_ensembl.sh /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # See the NOTICE file distributed with this work for additional information 2 | # regarding copyright ownership. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | name: "CI" 17 | 18 | on: 19 | push: 20 | branches: 21 | - master 22 | pull_request: 23 | # all run steps execute from the repository root 24 | defaults: 25 | run: 26 | working-directory: ./ 27 | 28 | jobs: 29 | tests: 30 | runs-on: ubuntu-latest 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | 35 | - uses: shogo82148/actions-setup-perl@v1 36 | with: 37 | perl-version: "5.28" 38 | 39 | - name: Install dependencies 40 | run: | 41 | sudo apt-get update 42 | sudo apt-get install -y wget python3 python3-pip python3-setuptools mysql-client libmysqlclient-dev libdb-dev g++-10 bedtools r-base 43 | make install_REST 44 | make install_ensembl 45 | make install_repeats 46 | make install_pangenes 47 | # a plain export is lost when this step's shell exits; GITHUB_ENV carries it to later steps 48 | echo "PERL5LIB=$PWD/lib${PERL5LIB:+:$PERL5LIB}" >> "$GITHUB_ENV" 49 | shell: bash 50 | # the test targets below are defined in the top-level Makefile 51 | - name: Run tests 52 | run: | 53 | make test_travis 54 | make test_repeats_travis 55 | make test_pangenes 56 | -------------------------------------------------------------------------------- /.github/workflows/notice.yml: -------------------------------------------------------------------------------- 1 | # See the NOTICE file distributed with this work for additional information 2 | # regarding copyright ownership. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | name: "Update NOTICE copyright year at the start of every year" 17 | # cron fields: minute 15, hour 15, day-of-month 1, month 1, any weekday 18 | on: 19 | schedule: 20 | - cron: '15 15 1 1 *' 21 | 22 | jobs: 23 | notice_update: 24 | name: Update NOTICE copyright year 25 | runs-on: ubuntu-latest 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | # on each line of NOTICE, replace the first occurrence of last year's number with the current year 30 | - name: Update NOTICE file 31 | run: | 32 | sed -i "s/$(date +%Y --date='1 year ago')/$(date +%Y)/" NOTICE 33 | # stage only NOTICE and commit it with the message below 34 | - uses: EndBug/add-and-commit@v9 35 | with: 36 | add: 'NOTICE' 37 | message: 'Update NOTICE copyright year' 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # release versions of the third-party tools fetched by the install_* targets 2 | minimap2release = 2.24 3 | gffreadrelease = 0.12.7 4 | gmaprelease = 2021-12-17 5 | clustalorelease = 1.2.4 6 | alistatrelease = 1.14 7 | # run demo_test.t; the travis variant passes 'travis' as its argument 8 | test: 9 | perl demo_test.t 10 | 11 | test_travis: 12 | perl demo_test.t travis 13 | # remove data files from the working directory (presumably left by the recipes/tests - confirm) 14 | clean: 15 | rm -f *rachypodium* && rm -f Compara*gz 16 | rm -f new_genomes.txt && rm -f uniprot_report_EnsemblPlants.txt 17 | rm -f arabidopsis_thaliana*.tar.gz 18 | rm -f plants_species-tree*.nh 19 | rm -f oryza_sativa* 20 | # system packages via apt; the leading '-' tells make to ignore a failure of this command 21 | install: 22 | -sudo apt install -y wget mysql-client libmysqlclient-dev libdb-dev bedtools pip cpanminus 23 | 24 | install_REST: 25 | cpanm --local-lib lib --installdeps --notest --cpanfile lib/cpanfileREST . 26 | pip3 install --user requests 27 | 28 | install_biomart_r: 29 | Rscript install_R_deps.R 30 | # Perl dependencies from lib/cpanfileEnsembl, then the Ensembl and BioPerl repositories cloned into lib/ 31 | install_ensembl: 32 | cpanm --local-lib lib --installdeps --notest --cpanfile lib/cpanfileEnsembl . 
33 | cd lib && git clone https://github.com/Ensembl/ensembl.git 34 | cd lib && git clone https://github.com/Ensembl/ensembl-variation.git 35 | cd lib && git clone https://github.com/Ensembl/ensembl-funcgen.git 36 | cd lib && git clone https://github.com/Ensembl/ensembl-compara.git 37 | cd lib && git clone https://github.com/Ensembl/ensembl-metadata.git 38 | cd lib && git clone -b release-1-6-924 --depth 1 https://github.com/bioperl/bioperl-live.git 39 | # download and build minimap2 under lib/, skipped when lib/minimap2 already exists 40 | install_minimap2: 41 | if [ ! -d "lib/minimap2" ]; then \ 42 | cd lib && wget https://github.com/lh3/minimap2/releases/download/v${minimap2release}/minimap2-${minimap2release}.tar.bz2 && \ 43 | tar xfj minimap2-${minimap2release}.tar.bz2 && cd minimap2-${minimap2release} && make && cd .. && \ 44 | rm -f minimap2-${minimap2release}.tar.bz2 && ln -fs minimap2-${minimap2release} minimap2; \ 45 | fi 46 | 47 | install_Red: 48 | cd lib && git clone https://github.com/EnsemblGenomes/Red.git && cd Red/src_2.0 && make bin && make 49 | #in case you need to use an alternative g++ compiler 50 | #cd lib && git clone https://github.com/EnsemblGenomes/Red.git && cd Red/src_2.0 && make bin && make CXX=g++-10 51 | 52 | install_repeats: install_minimap2 install_Red 53 | pip3 install --user -r lib/requirements.txt 54 | cd files && wget -c https://github.com/Ensembl/plant-scripts/releases/download/v0.3/nrTEplantsJune2020.fna.bz2 && bunzip2 nrTEplantsJune2020.fna.bz2 55 | 56 | install_redat: 57 | cd files && wget -c ftp://ftpmips.helmholtz-muenchen.de/plants/REdat/mipsREdat_9.3p_ALL.fasta.gz && gunzip mipsREdat_9.3p_ALL.fasta.gz 58 | # runs only Red2Ensembl.py; test_repeats below additionally runs AnnotRedRepeats.py 59 | test_repeats_travis: 60 | cd repeats && ./Red2Ensembl.py ../files/Arabidopsis_thaliana.fna.gz test_Atha_chr4 --msk_file Atha.sm.fna 61 | 62 | test_repeats: 63 | cd repeats && ./Red2Ensembl.py ../files/Arabidopsis_thaliana.fna.gz test_Atha_chr4 --msk_file Atha.sm.fna && \ 64 | ./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna test_Atha_chr4 --bed_file test.nrTEplants.bed 65 | 66 | 
uninstall_repeats: 67 | cd files && rm -rf nrTEplantsJune2020.fna* 68 | cd lib && rm -rf Red minimap2-${minimap2release} minimap2 69 | # remove the outputs written by test_repeats 70 | clean_repeats: 71 | cd repeats && rm -rf test_Atha_chr4 Atha.sm.fna test.nrTEplants.bed 72 | 73 | # gmap takes several minutes to compile 74 | install_gmap: 75 | cd pangenes/bin && wget http://research-pub.gene.com/gmap/src/gmap-gsnap-${gmaprelease}.tar.gz && tar xfz gmap-gsnap-${gmaprelease}.tar.gz && \ 76 | cd gmap-${gmaprelease} && ./configure --prefix=${PWD}/pangenes/bin/gmap-${gmaprelease}/exe && \ 77 | make && make install && cd .. && rm -rf gmap-gsnap-${gmaprelease}.tar.gz && ln -fs gmap-${gmaprelease} gmap 78 | 79 | install_gffread: 80 | cd pangenes/bin && wget https://github.com/gpertea/gffread/releases/download/v${gffreadrelease}/gffread-${gffreadrelease}.tar.gz && \ 81 | tar xfz gffread-${gffreadrelease}.tar.gz && cd gffread-${gffreadrelease} && make && cd .. && \ 82 | rm -f gffread-${gffreadrelease}.tar.gz && ln -fs gffread-${gffreadrelease} gffread 83 | 84 | install_pangenes: install_minimap2 install_gffread install_gmap 85 | # core perl modules, DB_File not installed in Travis 86 | cpanm -v --installdeps --notest --cpanfile pangenes/cpanfile . 87 | cd files && wget -c https://github.com/Ensembl/plant-scripts/releases/download/v0.4/test_rice.tgz && tar xfz test_rice.tgz && rm -f test_rice.tgz 88 | 89 | # see https://github.com/ekg/wfmash for other options 90 | install_wfmash: 91 | -sudo apt install -y cmake libjemalloc-dev zlib1g-dev libgsl-dev libhts-dev 92 | cd pangenes/bin && git clone https://github.com/ekg/wfmash && cd wfmash && cmake -H. 
-Bbuild && cmake --build build -- -j 3 93 | 94 | install_gsalign: 95 | cd pangenes/bin && git clone https://github.com/hsinnan75/GSAlign.git && cd GSAlign && make 96 | # fetch the clustalo binary and build AliStat in pangenes/bin; ln -fs (as in the other install targets) does not fail on an existing link 97 | install_pangenes_quality: 98 | cd pangenes/bin && wget http://www.clustal.org/omega/clustalo-${clustalorelease}-Ubuntu-x86_64 && \ 99 | chmod +x clustalo-${clustalorelease}-Ubuntu-x86_64 && \ 100 | ln -fs clustalo-${clustalorelease}-Ubuntu-x86_64 clustalo && \ 101 | wget https://github.com/thomaskf/AliStat/archive/refs/tags/v${alistatrelease}.tar.gz && \ 102 | tar xfz v${alistatrelease}.tar.gz && cd AliStat-${alistatrelease} && make && cd .. && \ 103 | rm -f v${alistatrelease}.tar.gz && ln -fs AliStat-${alistatrelease} AliStat 104 | # remove what the pangenes install targets placed in pangenes/bin, lib and files 105 | uninstall_pangenes: 106 | cd pangenes/bin && rm -rf gffread-${gffreadrelease} gmap-${gmaprelease} gffread wfmash GSAlign gmap \ 107 | clustalo-${clustalorelease}-Ubuntu-x86_64 clustalo AliStat-${alistatrelease} AliStat 108 | cd lib && rm -rf minimap2-${minimap2release} minimap2 109 | cd files && rm -rf test_rice 110 | # run the pangenes scripts in sequence on files/test_rice; each command runs only if the previous one succeeded 111 | test_pangenes: 112 | cd pangenes && perl get_pangenes.pl -d ../files/test_rice && \ 113 | perl get_pangenes.pl -d ../files/test_rice -t 0 -s '^\d+$$' &&\ 114 | perl get_pangenes.pl -d ../files/test_rice -H && \ 115 | perl check_evidence.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -i gene:ONIVA01G50800.cdna.fna -f -v && \ 116 | perl match_cluster.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ \ 117 | -s ../files/test_transcripts.fna -o test_transcripts.gmap.tsv && \ 118 | perl rename_pangenes.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -o clade.consortium.1 119 | # rm -f so that a missing test_transcripts.gmap.tsv does not abort the rest of the clean-up 120 | clean_pangenes: 121 | cd pangenes && rm -rf test_rice_pangenes && rm -f test_transcripts.gmap.tsv && rm -rf clade.consortium.1 122 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | 
Ensembl 2 | Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | Copyright [2016-2025] EMBL-European Bioinformatics Institute 4 | Copyright [2021-2025] Estacion Experimental Aula Dei-CSIC 5 | 6 | This product includes software developed at: 7 | - EMBL-European Bioinformatics Institute 8 | - Wellcome Trust Sanger Institute 9 | - Estacion Experimental Aula Dei-CSIC 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Scripting analyses of genomes in Ensembl Plants 3 | 4 | This repo contains code examples for interrogating 5 | [Ensembl Plants](https://plants.ensembl.org/index.html) 6 | from your own scripts and for masking & annotating 7 | [repeats](#repeat-masking-and-annotation) and 8 | [calling pangenes](#pangenes) in plant genomes. 9 | 10 | [![Build Status](https://github.com/Ensembl/plant-scripts/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/Ensembl/plant-scripts/actions/workflows/ci.yml) 11 | 12 | - [List of recipes](#list-of-recipes) 13 | - [Dependencies of recipes](#dependencies) 14 | - [FTP](#ftp) 15 | - [MySQL](#mysql) 16 | - [Perl](#perl) 17 | - [Python](#python) 18 | - [R](#r) 19 | - [Repeat masking and annotation](#repeat-masking-and-annotation) 20 | - [Pangene analysis](#pangenes) 21 | - [Phylogenomics](#phylogenomics) 22 | - [Species tree](#species-tree) 23 | - [Citation](#citation) 24 | 25 | 26 | ## List of recipes 27 | 28 | The code for the recipes in this section can be found in folder [recipes](./recipes/). 29 | They are grouped by type (API, BioMart, CRAM, FTP, MySQL, REST & VEP) and their dependencies 30 | are explained below. 
To create your own recipes please read the appropriate documentation: 31 | 32 | | type | URLs | 33 | |---|---| 34 | | API | http://plants.ensembl.org/info/data/api.html | 35 | | BioMart | http://plants.ensembl.org/info/data/biomart/index.html | 36 | | FTP | http://plants.ensembl.org/info/data/ftp | 37 | | MySQL | http://plants.ensembl.org/info/data/mysql.html | 38 | | REST | http://plants.ensembl.org/info/data/rest.html | 39 | | VEP | http://plants.ensembl.org/info/docs/tools/vep/index.html | 40 | 41 | These are the script recipes, obtained with grep -P "^## \w\d+" recipes/example* : 42 | 43 | ``` 44 | exampleAPI.pl:## A1) Load the Registry object with details of genomes available 45 | exampleAPI.pl:## A2) Check which analyses are available for a species 46 | exampleAPI.pl:## A3) Get soft masked sequences from Arabidopsis thaliana 47 | exampleAPI.pl:## A4) Get BED file with repeats in chr4 48 | exampleAPI.pl:## A5) Find the DEAR3 gene 49 | exampleAPI.pl:## A6) Get the transcript used in Compara analyses 50 | exampleAPI.pl:## A7) Find all orthologues of a gene 51 | exampleAPI.pl:## A8) Get markers mapped on chr1D of bread wheat 52 | exampleAPI.pl:## A9) Find all syntelogues among rices 53 | exampleAPI.pl:## A10) Print all translations for otherfeatures genes 54 | 55 | exampleBiomart.R:## B1) Check plant marts and select dataset 56 | exampleBiomart.R:## B2) Check available filters and attributes 57 | exampleBiomart.R:## B3) Download GO terms associated to genes 58 | exampleBiomart.R:## B4) Get Pfam domains annotated in genes 59 | exampleBiomart.R:## B5) Get SNP consequences from a selected variation source 60 | 61 | exampleCRAM.pl:## C1) Find RNA-seq CRAM files for a genome assembly 62 | 63 | exampleFTP.sh:## F1) Download peptide sequences in FASTA format 64 | exampleFTP.sh:## F2) Download CDS nucleotide sequences in FASTA format 65 | exampleFTP.sh:## F3) Download transcripts (cDNA) in FASTA format 66 | exampleFTP.sh:## F4) Download soft-masked genomic sequences 67 | 
exampleFTP.sh:## F5) Upstream/downstream sequences 68 | exampleFTP.sh:## F6) Get mappings to UniProt proteins 69 | exampleFTP.sh:## F7) Get indexed, bgzipped VCF file with variants mapped 70 | exampleFTP.sh:## F8) Get precomputed VEP cache files 71 | exampleFTP.sh:## F9) Download all homologies in a single TSV file, several GBs 72 | exampleFTP.sh:## F10) Download UniProt report of Ensembl Plants, 73 | exampleFTP.sh:## F11) Retrieve list of new species in current release 74 | exampleFTP.sh:## F12) Get current plant species tree (cladogram) 75 | 76 | exampleMySQL.sh:## S1) Check currently supported Ensembl Genomes (EG) core schemas, 77 | exampleMySQL.sh:## S2) Count protein-coding genes of a particular species 78 | exampleMySQL.sh:## S3) Get stable_ids of transcripts used in Compara analyses 79 | exampleMySQL.sh:## S4) Get variants significantly associated to phenotypes 80 | exampleMySQL.sh:## S5) Get Triticum aestivum homeologous genes across A,B & D subgenomes 81 | exampleMySQL.sh:## S6) Count the number of whole-genome alignments of all genomes 82 | exampleMySQL.sh:## S7) Extract all the mutations and consequences for a selected wheat line 83 | exampleMySQL.sh:## S8) Get FASTA of repeated sequences from selected species 84 | exampleMySQL.sh:## S9) Get GFF of repeated sequences from selected species 85 | 86 | exampleREST:## R1) Create a HTTP client and a helper functions 87 | exampleREST:## R2) Get metadata for all plant species 88 | exampleREST:## R3) Find features overlapping genomic region 89 | exampleREST:## R4) Fetch phenotypes overlapping genomic region 90 | exampleREST:## R5) Find homologues of selected gene 91 | exampleREST:## R6) Get annotation of orthologous genes/proteins 92 | exampleREST:## R7) Fetch variant consequences for multiple variant ids 93 | exampleREST:## R8) Check consequences of SNP within CDS sequence 94 | exampleREST:## R9) Retrieve variation sources of a species 95 | exampleREST:## R10) Get soft-masked upstream sequence of gene in 
otherfeatures track 96 | exampleREST:## R11) Get all species under a given taxonomy clade 97 | exampleREST:## R12) transfer coordinates across genome alignments between species 98 | 99 | exampleVEP.sh:## V1) Download, install and update VEP 100 | exampleVEP.sh:## V2) Unpack downloaded cache file & check SIFT support 101 | exampleVEP.sh:## V3) Predict effect of variants 102 | exampleVEP.sh:## V4) Predict effect of variants for species not in Ensembl 103 | ``` 104 | 105 | ### Dependencies 106 | 107 | Some of the recipes and scripts depend on additional software packages; see below to learn how to install them. 108 | Note that only *make install* and the optional *make install_wfmash* require **sudo**; you might need help from your sysadmin for those tasks. 109 | 110 | #### FTP 111 | 112 | The examples for bulk downloads from the FTP site require the software [wget](https://www.gnu.org/software/wget/), 113 | which is usually installed on most Linux distributions. For macOS it is available on [Homebrew](https://brew.sh). 114 | For Windows it ships with [MobaXterm](https://mobaxterm.mobatek.net). On Debian/Ubuntu systems you can also install 115 | it with (requires sudo): 116 | 117 | make install 118 | 119 | #### MySQL 120 | 121 | The examples for SQL queries to Ensembl Genomes database servers require the [MySQL](https://www.mysql.com) client. 122 | Depending on your Linux flavour this package can be named *mysql-client* or simply *mysql*. On Debian/Ubuntu systems 123 | you can also install it with (requires sudo): 124 | 125 | make install 126 | 127 | #### Perl 128 | 129 | As listed in [cpanfileREST](./lib/cpanfileREST), several modules are required for the REST examples: 130 | [JSON](https://metacpan.org/pod/JSON), [JSON::XS](https://metacpan.org/pod/JSON::XS) and 131 | [HTTP::Tiny](https://metacpan.org/pod/HTTP::Tiny). 
132 | Provided [cpanm](https://metacpan.org/pod/App::cpanminus) is available on your system (for instance after make install), 133 | these modules can be installed with: 134 | 135 | #make install 136 | make install_REST 137 | 138 | Similarly, the dependencies for the Ensembl VEP ([DBI](https://metacpan.org/pod/DBI), [DBD::mysql](https://metacpan.org/pod/DBD::mysql) 139 | and [Archive::Zip](https://metacpan.org/pod/Archive::Zip)), together with those used by recipes using the Ensembl Perl API, 140 | can be installed with: 141 | 142 | #make install 143 | make install_ensembl 144 | 145 | Ensembl API installation instructions can be found [here](http://plants.ensembl.org/info/docs/api/api_installation.html), 146 | or if you use git [here](http://plants.ensembl.org/info/docs/api/api_git.html). There is also a debugging 147 | [guide](http://plants.ensembl.org/info/docs/api/debug_installation_guide.html), which lists some extra dependencies that you might not have, 148 | such as modules [DBI](https://metacpan.org/pod/DBI) and [DBD::mysql](https://metacpan.org/pod/DBD::mysql). 149 | Note that your local Ensembl API should match the version of the current Ensembl release. 150 | 151 | #### Python 152 | 153 | The REST recipes written in Python require the library [requests](https://pypi.org/project/requests). 154 | Provided pip3 is available on your system (for instance after make install), it can be installed with: 155 | 156 | #make install 157 | make install_REST 158 | 159 | #### R 160 | 161 | For the BioMart recipes you will need BioConductor package 162 | [biomaRt](http://www.bioconductor.org/packages/release/bioc/html/biomaRt.html) 163 | (read more [here](http://plants.ensembl.org/info/data/biomart/biomart_r_package.html)). 164 | For the REST recipes two core packages are required: [httr](https://cran.r-project.org/web/packages/httr) and 165 | [jsonlite](https://cran.r-project.org/web/packages/jsonlite). 
All these can be installed with: 166 | 167 | Rscript install_R_deps.R 168 | 169 | ## Repeat masking and annotation 170 | 171 | See examples and documentation in folder [repeats](./repeats/). 172 | 173 | If you want to annotate repeats you must first run: 174 | 175 | #make install # install required bedtools 176 | make install_repeats # requires gcc & g++ compilers 177 | 178 | ## Pangenes 179 | 180 | See examples and documentation in folder [pangenes](./pangenes/). 181 | We recommend checking out the 182 | [Runmodes and HPC configuration](https://github.com/Ensembl/plant-scripts/tree/master/pangenes#runmodes-and-hpc-configuration) docs. 183 | 184 | Install it the [bioconda](https://anaconda.org/bioconda/get_pangenes) way: 185 | 186 | conda activate bioconda 187 | conda create -n get_pangenes -c conda-forge -c bioconda get_pangenes 188 | conda activate get_pangenes 189 | # or simply 190 | conda install bioconda::get_pangenes 191 | 192 | Install it the compilation way: 193 | 194 | #make install # install required bedtools 195 | make install_pangenes # requires gcc & g++ compilers 196 | 197 | # optionally you might also want to try: 198 | make install_gsalign 199 | make install_pangenes_quality 200 | 201 | 202 | ## Phylogenomics 203 | 204 | See examples and documentation in folder [phylogenomics](./phylogenomics/). 205 | 206 | If you want to run any of those scripts you must first run: 207 | 208 | #make install 209 | make install_REST 210 | 211 | ## Species tree 212 | 213 | ![Plant species tree](./files/EnsemblPlants47.png) 214 | 215 | *Fig. 1. Species tree of Ensembl Plants release 47 obtained with recipe F12. Figure generated with [iTOL](https://itol.embl.de)* 216 | 217 | ## Citation 218 | 219 | Contreras-Moreira B, Naamati G, Rosello M, Allen JE, Hunt SE, Muffato M, Gall A, Flicek P (2022) 220 | Scripting Analyses of Genomes in Ensembl Plants. In: Edwards D. (eds) Plant Bioinformatics. 221 | Methods in Molecular Biology, vol 2443. Humana, New York, NY. 
[10.1007/978-1-0716-2067-0_2](https://link.springer.com/protocol/10.1007%2F978-1-0716-2067-0_2) 222 | 223 | 224 | 225 | ### pangenes 226 | 227 | For the pangene protocols the primary citation is: 228 | 229 | Contreras-Moreira B, Saraf S, Naamati G, Casas AM, Amberkar SS, Flicek P, Jones AR & Dyer S (2023) 230 | GET_PANGENES: calling pangenes from plant genome alignments confirms presence-absence variation. 231 | Genome Biol 24, 223. https://doi.org/10.1186/s13059-023-03071-z 232 | 233 | Check all the references you need to cite in each script by running: 234 | 235 | perl get_pangenes.pl -v 236 | perl check_evidence.pl -c 237 | perl check_quality.pl -c 238 | perl match_cluster.pl -c 239 | 240 | 241 | ### repeats 242 | 243 | For the scripts and data in the [repeats](./repeats/) folder please cite: 244 | 245 | Contreras-Moreira B, Filippi CV, Naamati G, García Girón C, Allen JE, Flicek P (2021) 246 | Efficient masking of plant genomes by combining kmer counting and curated repeats Genomics. 247 | Plant Genome https://doi.org/10.1002/tpg2.20143 248 | (preprint https://www.biorxiv.org/content/10.1101/2021.03.22.436504v1) 249 | 250 | Girgis HZ (2015) Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. 251 | BMC Bioinformatics 16:227. https://doi.org/10.1186/s12859-015-0654-5 252 | 253 | Li H (2018) Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics 34(18):3094–3100. 
use strict;
use warnings;
use Test::More;

# Smoke tests for the recipes: every check runs a shell command and passes
# when the captured output matches a pattern.
# The optional first argument selects which checks are run:
#   travis  -> only the MySQL recipe
#   API     -> default checks plus exampleAPI.pl
#   biomart -> default checks plus exampleBiomart.R

my $mode = $ARGV[0] || '';

# each check is [ shell command, pattern expected in its output, test name ]
my @checks = (
  [ 'recipes/exampleMySQL.sh test', qr/_core_/, 'exampleMySQL.sh' ]
);

unless($mode eq 'travis'){

  # FTP/REST/API tests might timeout from Travis
  push(@checks,
    [ 'recipes/exampleFTP.sh --spider test 2>&1', qr/Brachypodium_distachyon/, 'exampleFTP.sh' ],
    [ 'python recipes/exampleREST.py test', qr/hordeum_vulgare/, 'exampleREST.py' ],
    [ 'perl recipes/exampleREST.pl test', qr/hordeum_vulgare/, 'exampleREST.pl' ],
    [ 'Rscript recipes/exampleREST.R test', qr/hordeum_vulgare/, 'exampleREST.R' ],
    [ 'perl recipes/exampleCRAM.pl test', qr/subgroup/, 'exampleCRAM.pl' ]
  );
}

# requires perl API to be installed ie 'make install_ensembl'
if($mode eq 'API'){
  push(@checks, [ 'perl recipes/exampleAPI.pl test', qr/xref/, 'exampleAPI.pl' ]);
}

# requires BiomaRt R library ie 'make install_biomart_r'
if($mode eq 'biomart'){
  push(@checks, [ 'Rscript recipes/exampleBiomart.R test', qr/IWGSC/, 'exampleBiomaRt.R' ]);
}

# run the selected checks in the order they were queued
foreach my $check (@checks){
  my ($command, $expected, $label) = @$check;
  ok( eval{ `$command` } =~ $expected , $label );
}

# the plan is the number of checks actually queued
done_testing( scalar(@checks) );
/files/EnsemblPlants47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/files/EnsemblPlants47.png -------------------------------------------------------------------------------- /files/runtime_ram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/files/runtime_ram.png -------------------------------------------------------------------------------- /files/test_transcripts.fna: -------------------------------------------------------------------------------- 1 | >TR1 2 | AAGAATTACAAACCTCAACAACTCTCGACTGTTACAGGCATCGCTGCCATAATATGAGTA 3 | GAGTACATGGGCATCACCTAAATTCTGTGAGCGCCATCCATCAAACTATGTCTGACTGAC 4 | TTGACTGTCTACTACAAGATATACTCATTCTCAATTACATGACGCTTTGACATAATGAGT 5 | GCAAGAAAAATCTACTGATACATCATCACCAAACACACCAACCTAGATTTCCAAGAATCG 6 | TCAATTCAAAACAAGAAACGCTAGAAACAACCTAATCAGATGCAATTATTATTCTTCCTC 7 | TTGTTATGTCGTGAAGAAAGACCATAGCGTTTTCCCAATCGATTTGTTTCGCCTCTCTGA 8 | ATCATCATCATCATCATCCTCGTCGTCATCTTCAACGCCCCCACGCTCACTTCCTGATGG 9 | TGTCATGGGCATCGCATCTTCACCTTGTTCACCATCCACAACAATATCACCATACTCCTC 10 | ACCGGCAGTAGCATGGGCATCAACAGTTTCAGCCTGGGAGAACTCTTGCAACTGGGATGC 11 | ACCATCAGCTGATTCAGAGGGGCCCGTCCTCTTGCTATCAGCAGCTTTAGAGGTTCCTTC 12 | CGTGTCATCAGCGGTAGTTTCTACCGTCTGCTTTTTTCCTCTCCTCACAGCTTTCTTCTT 13 | CTCCCGTGGAGGGGCTGCTGCTGTGGCATTTGCAACTGTAGTGCGTCTCAAATTGTAGCG 14 | TTTTTCACCTGTGGTTTGCATCAACGGGCCAGATGGCTGCCTCCTCTTACGAGGCTGCCC 15 | GCCAACTGAAACACTTTCAGATTGTGCTTCACTAACTTCTTCATCTTCCTCACTGATCTC 16 | TGGTCCAGCAAAGCGCCTCTTCCGTGTACCACCAACAGTAACTGAATCTTCCTGATCATC 17 | ATTCTTTTCATCAAAATTCCCACCAAGTACTACTTTAGCATCTTCAAGCACTGCACGAAC 18 | CGACTTTGTCCTTCTTGGTTTACCCTTCCGTTTCGGAGGCTTTGATGTTTCAGGTTGCAA 19 | TTCAACTTCAGGGGCAGCCGTTGAGTCCTTTCCATTTTGGTCCACAGAGGCAATTGTTGT 20 | GTCAACTGCCATGTCATTAGCACCAGCAAATGGCTGAGTTCCATGAACATCTATACAATT 21 | 
ATCTGCAACTCCAACAGAAGACTCGATTTGGATATCATCAGCTATATCAAGCCTTTCGAA 22 | TTCTACATTCCCCCTTGTTCCACTGTCAGAAGGGAGACCCTCAGCATCAAAAGAATTATT 23 | TGCTGCCTGATAGACAGGAGTTGGCTCATAATCTGCTTCACTCGGTGATGCTTCCTCTAA 24 | TCTCGCACCAAAAGGAATGTTAATTTCTGCTTCCTGGTCCGATGACTGCTCGGCCTTTGT 25 | CCTAGGAGAGAACTTGAAAAGCCTTGAACACTTCTTCAACAGTGAGAAACGACCAGCTGA 26 | ATTAACAAGTGTTCCTGTGTCTTGAGCCAAATTATCAGTGTTCGGGGAACGACCATCTCC 27 | CTCAACAGCCAAACTTGGGTGGTCAATATCAGCATCATCCTTGAGAGATAGATCACCAAA 28 | GCCCTCAAAAACAGAAACCCCACAGCTTTTGCACACCTTGTACTTTTCAAACATATCAAT 29 | GAGGTTATTCCTGTCCCTGTTGTATGCTTCTCGGCGATCCTTCAGACTCTTGCTGAGTGC 30 | ATGAAGACTATCAATGTCTCTTCTAATGTCTGCCTTGTCAGTTTCCAATTTTTGCTTCTC 31 | CTCCTCTAGAACTTCTTTTTCCCTCAGCAGCTGTTCTTTCTCCAATGTTATCTTTTGAAT 32 | CTTGGACTCGTTCAAATCAATAGCATGCCTTAGTTCATTTTCTACAAAATCCCTTCTCTT 33 | ATTTAACTCATCCTCCTTCTGTTCCAGTTCCTTCTGCTTAATTGCTAGTTTATTCGCCAT 34 | CTCCATCTCCAGTTCATGTCTATGGAGCTGCAAATTACGCTGAAGATCAGCACGTTCTCT 35 | CTTCAGAAATTCATCAATCTCCTCACGCTGGTGTTTTATATCATCCAATAAAGCTTTTTC 36 | CTTGAGTGCAAGACTATCCTGCTGCTCCTTGTATTTTGCATCCATTTCATCCTGTGTATC 37 | CTTGAATCTCTTCTCTTCATTGTGACGCCACTTTCCTAGATTTGTTTTATCATTCTTGAG 38 | CACCTTTGCCTCCTCTTCTAAATGAGCTCTTTTTTCATCAAGCTGCTCCCACTCTTCTTC 39 | GAATTTCTGCCGCTGCTTCCTCAAATCTTCAGTTTCCTCCGAGAGGGAATTGCTTCGCAT 40 | TCTATACTCATCAATCTCCTTCTTCAGTTGTGCTGTCAGCATGCTGTGCTCTTGTCTCTC 41 | TTCCTCTGTAAGTTTCAAATTGTTTTGTTCTTCTAAAATTTTCTTCTTTTCAGCTTCAAT 42 | TGTGGCTTTCAGCCTTTCAATATCTGATCTGTACATCTCAGCCTGTTTTCGTTCATTGTC 43 | CACTTGCAGCTTTTCTTCGGATAAGCTATCCTTCTCAGTCTTCAGTGACTCCTCCCACTT 44 | CTTCAGAGCTTTCGACTTAGTATCATGGTCAATCTGCAAATCTTCCAGCCTTTTCATGCT 45 | TTCATTTAATGCTTGCTCCCTTTTAGATATTATGTTCTCACGAGAACTGAGATCTTTTTC 46 | CTTCTTTACCAAATCAGCTTCCCTATGTTTCAGCTTTTCCTCAAAGGATTTCCTCTCACT 47 | CTCCAGCTCTAATTCAAAATCTCGTCTTTTTGCTTCAAGCTTCACCTCGTGATCCTTGAT 48 | AAGCTTCTGAAGCCCCATTTTTTCTCTTGTACTAGCTTTTTCTTCCCTCTCAGATAACTT 49 | TTTCTCCCTTTTTTCCAAAGTCTTGTGCTTTGAGTCAGCATCCTTTTCTTGTGAGCGTAA 50 | TTCATTCAGCCTTTTGGCAATATCCTCCTCCTTTGTTTTCAAAATGATTTTAGTTGACTC 51 | 
CACTGTCTTTTTTGCCTCCTCCAATTCTTCCTGTTTTATCTTACAAAGTTTGTCATTCTC 52 | ATTTGCTCTCTCCTCTCTGTCATTTACAGATCTCTGCAAATCAACGAGTCTGTTCTGACT 53 | TTCTTTAAGCTTCTTCTCCCAGTCCTGCAGAGATTCCTCCTGCTCCTTGAGTTGCTTCTC 54 | TCGGGCCTTCCTCTCGGTCTCAAAATATAGCTTCTCCTTCTCCAACCTACGTTGCCGAGC 55 | TTCTGCCTCCTCTAAATCTCTGTCAGCTTGTGACTTCTTACGGTTTGCCTCCGCAAGCTT 56 | CGCATCTGCGGCATGAAGCTTCCCCTCGATCTCCAAAGATTTTTCCTCCAAATTAGCCTC 57 | AAGAGATTGAGCATCAGCTACCTTTTTCTGAGACGTGAACTTAACCTCAGCTATCTCTGA 58 | TCGAATTTCACGTAGTGCCTTTTCAAGATCAGCTACACATTGTTTCTCAACACCCAGTGA 59 | TTTCCTCATGTTTTCTTCTCGCCTTTCATATTCTGAAATAGCATTCAAATGTGCAGCTTG 60 | TTCCCGCTTTAAGATCTCCTCTTTTTGTGTCAGCACTTGGGTGACTTCGTCAAATTTAGC 61 | TGCCCACTCTTTTTTCTCAATCAAGAGCAGACCCATATTGTACTGATATTCATGTAACTC 62 | CTGCTCGAGCTCGGCAGCTCTCTGTGAAGTCCCTTTGGCCTTGCCAAGGGGCGCAGATGC 63 | GGGCGGAGCGCCACCACCGGCCCGCTGGTTTCCTGGAGACGGCGTGGACCAGCCGTTCCA 64 | TCCTTTCCCCTGCGGAGTAAACATCACGGCTTCTTGCTCCCAATTCCTTCACGGCGCATC 65 | AAAGAGACGAGGCGCCGCCCCCTACCGGGAGGTGCCCGGCGGCCACGG 66 | >TR2 67 | GTAACTTGGAGACAACGTGCAAAACTTTTTAGCTTGCAGTTTTTCAGCCAAAAGACTTTT 68 | CACAAAATGGACCCATGATTATTAATTTTTCAACTTACATTGCACAGTTTTTGTAATTAA 69 | TGTATCTCGCATGGCTCTTGTTTGTTCTCCTTCTTCCCTGGACAAAATTCAGCTAAGCTT 70 | TTGTGTAACAATTGCTGGTGCAGCCTTGCTTCATACATCACAG 71 | >TR3 72 | TCCTTGAATCCTGGCTGTCTGCGGCGTCTCAGCTACTCGCCCGCCTGAACAAACGGATCG 73 | AAGCCAAGGACTGGGAGGCGGCGGCGAGCGACTGCTGGATCCTGGAGCGGATCTGGAAGC 74 | TGCTTGCCGACATCGAGGACCTGCACCTGCTCATGGACCCGGACGATTTCCTGCGGCTCA 75 | AGAGCCAGCTCGCGATACGGTCGGCGCCGGACGGCACCGACGCGTCCTTCTGCTTCCGGA 76 | CCAGAGCGCTGCTGCACGCCGCCAACGCCACGAGGGACATCAAGAAGCTGGTGCCGTGGG 77 | TGATCGGCGTCGAGGCGGACCCCAATGGCGGGCCGAGGGTGCAGGAGGCGGCCATGAGGC 78 | TGTACCACGGCCGGAGGCGCGGCGAGGGCGAGGACGCTGGCAAGATCGAGCTGCTGCAGG 79 | CTTTCCAGGCCGTGGAGGCGGCCGTGCGGAGGTTCTTCTTCGCGTACCGGCAGGTCGTGG 80 | CGGCGGTGTGTGGCACGGCGGAGGCGTCGGGCAACCGGGCGCTGTTCGTGCCGGCGGAGG 81 | GGATGGACCCGCTCTCGCAGATGTTCCTCGAGCCGCCCTACTTCCCCAGCCTCGACGCCG 82 | CCAAGACGTTCTTGGCCGACTACTGGGTTCAGCACATGGCCGCCGCCTCTGTTCCGTCAG 83 | 
GGCGGAGCTGAAGGTTTCGAACGGCCAAAAACCGCGGCGATCGGTAATTTTGCAGGCTAG 84 | AAGTTACCTATGATCCCCAGCCTGCAATCCTATAGTGATTCATCTCAGTAGCGATACATG 85 | AGTACAGTAGATACTCCTAGATGCGTGTGTTGTGACTGTGATGCCATCTGTTCTAGTGTT 86 | CTAGTATCACAGAGGAAGTATTTAACCGTGAGACATTCAATTAAATCAAG 87 | >TR4 88 | GCAGTGACACCTCCAAATCTAACATTTCGCGGTTGCATTACCATCTCTTGCCTCTTGGGCTCTGCCAAGA 89 | ATAGCCAAAGCATATGTAGCCTTCCTGCCTCTCGTGTTCACTCGTTCGGTCCTCTTCCTCCGTTCTCCTC 90 | TTCCCCTTGCCCTCCTCCAGATCGACCATCACTTGCATGCATGCGCAGGCACGATCGAACGCAGTAGATG 91 | CATTGGCTGCCAGCTCGATCCGCACCGACGATACTCCGGCGAGGCAAAGCGCGGCGTAAGGAGGAGGAGG 92 | AAAAGTGGCCGCGACCCGCGGGATGGGCCGTCGACGGAGCGCGCGTTCGAGGGG 93 | CAGCCCGTCCCGCCGTGGACGGAGCAGGTGACGCTGCGCGCCGTGGTGGCGAGCGTGGCGCTGGGCGTGG 94 | CCCTGAGCAGCGTGATGATGAACCTGGTGTTCACGTCGGGGATCATCCCGTCGCTCAACATCTCCGCCGG 95 | CCTCCTCGGCTTCTTCCTCCTCAAGGCCTGGACGCGCCTGCTCGACCAGCTCGGCTCGCCGGGCCGCCCC 96 | TTCACCCGCCAGGAGAACGCCGTCGTCCAGACCTGCGTCGTCGCCTGCGCCAGCATGACATACAGCGGTG 97 | GGTTTGGATCGTATCTGCTGGCCATGGATCGGAAGACGGCGGAGAAGACGAGCACCGGGGACGACTCCAG 98 | CGCGAGCGTCAGTGAGCCGGAGTTCGGTCGGATGATGGCCTTCTTCTTCCTCGTTAGCTTCGTCGGTCTC 99 | CTCGCCATTGTCCCCATGAGGAAGACAATGATCATCCGCCACCGGCTGACGTTCCCAAGCGGCTCGGCGA 100 | CGGCTCACCTCATCAACAGCTTCCATACCCCTCACGGCGCTAGACAAGCGAAGAGGCAAGTCTCGCTCGT 101 | TCTCCGGTCGTCGTTGGCGAGCTTGTTCTGGTCCATCTTCCAGTGGTTCTACACCGGAGGTCCAAACTGC 102 | GGCTTCACTTCCTTCCCAACGTTTGGGCTCAGCGCCTTCAATCGCGGTTTCTACATCAGTTTGAACGGAA 103 | CTTATGTGGGAATGCTCTTCGGGTCCATCATCTC 104 | CTGGGGGATCATGCGGCCGTACATCCGGAGCAAAAGAGGAATCTGGTACGACGCCGATCTCCAGGAGACG 105 | AACTTGAAGAGCTTCAGTGGATACAAGGTGTTTTGCGCCATAGCAATGATCCTCGGCGACGGCATCTTCC 106 | AGCTCGTCGCGATCTCGCTGAGGACGATACACACGGTGCGCCACCACCAGGTAGCGGCGGAGACGCTCAG 107 | GTCCTTCTCCGACGTCGACGCCATGCCGCGGCCGGTGATGAGCTTCGACGACCGCCGCAGGACGCAGGTG 108 | TTCCTCAGGGAGCACATCCCGAGCACCTTCGCCATCAGCGGGTACGTCGTCCTGGCCACCGTCTCCACCG 109 | TCGTCATCCCGCTCATGTACGGCCAGGTGAGGTACTACCACGTCGCCGCCGCGTACGCGTTCGCGCCCGT 110 | CCTGGCCTTCTGCAACGCCTACGGCACGGGCGTCGCGGAGACCAACTTCTCGGCGCAGTACAACAAGCTG 111 | 
#!/usr/bin/env Rscript

# Installs missing R dependencies in the local library folder ./lib/R/
#
# Packages installed:
#  * BiocManager (only if it cannot be loaded already)
#  * CRAN packages dplyr, stringi, knitr, httr and jsonlite
#  * Bioconductor package biomaRt

local_lib = "./lib/R/"

# .libPaths() silently drops folders that do not exist,
# so make sure the local library is there before registering it
if(!dir.exists(local_lib))
    dir.create(local_lib, recursive=TRUE, showWarnings=FALSE)

.libPaths( c( .libPaths(), local_lib) )

if(!requireNamespace("BiocManager", quietly=TRUE))
    install.packages("BiocManager", dependencies=TRUE, lib=local_lib)

# package names must be passed as ONE character vector; as separate positional
# arguments only the first would be taken as a package and the others would be
# bound to unrelated parameters of install.packages (repos, contriburl, ...)
install.packages( c("dplyr", "stringi", "knitr", "httr", "jsonlite"),
    dependencies=TRUE, lib=local_lib)

BiocManager::install("biomaRt", lib=local_lib, dependencies=TRUE)

sessionInfo()
/pangenes/CHANGES.txt: -------------------------------------------------------------------------------- 1 | 21122021: _collinear_genes.pl tested with -s 2 | 27122021: get_pangenes.pl integrates _cut_sequences.pl & _collinear_genes.pl 3 | 04012022: checks regex matches chr names 4 | 04012022: tested with -s, made test_rice 5 | 10012022: wfmash only needed with -w 6 | 25012022: gene BED files checked as they might be interrupted 7 | 28012022: added -H, trying out diff MINMASKLEN values in barley 8 | 03022022: while parsing GFF files, chr names and extracted sequences are checked 9 | 11022022: genomic segments are added as segment_collinear features in _collinear_genes.pl 10 | 15022022: genomic segments are used to produce .gdna.fna clusters in get_pangenes.pl 11 | 02032022: added & tested _collinear_genes.pl -sg 12 | 04032022: added & tested get_pangenes.pl -s 13 | 04032022: get_pangenes.pl -s prints ANI matrix from GSAlign estimates 14 | 15032022: collinearity TSV files sorted 15 | 15032022: pangenome matrices are chr-sorted with -s 16 | 16032022: _cluster_analysis.pl unclusters non-neighbors more than $MAXDISTNEIGHBORS away 17 | 16032022: BED-like pangenome matrices produced with -s 18 | 17032022: updated documentation in README 19 | 24032022: added get_pangenes.pl -N (and _cluster_analysis.pl -m) 20 | 04042022: tested wfmash v0.8.1-25-g1344b9e on rice chr1 testset (-p 80 -s 1000) 21 | 20042022: added sub split_genome_sequences_per_chr_bedtools to _collinear_genes (uses faster bedtools) 22 | 20042022: adopted minimap2-2.24 and updated Makefile 23 | 22042022: save compressed copy of $merged_tsv_file as evidence of clusters 24 | 22042022: check_evidence.pl prints basic stats for a cluster 25 | 05052022: check_evidence.pl -f prints GFF fixes for long gene models 26 | 06052022: check_evidence.pl -f prints sorted- non-overlapping GFF fixes for long gene models 27 | 11052022: check_evidence.pl -f tested for split and missing gene models, premature stop codons tracked 28 
| 11052022: GFF patches, maybe created with check_evidence.pl, can now be used with get_pangenes.pl -p, existing WGAs reused 29 | 20052022: prints WGA stats summary 30 | 27052022: check_evidence.pl -s -r oryza_sativa_RAPDB,oryza_sativa_MSU appends to file mode isoform sequence, preferring refs 31 | 02052022: check_evidence.pl locates internal stop codons in CDS sequences, those cannot be used to fix gene models 32 | 07062022: check_evidence.pl skip long segments with candidate long/split genes, uses $MAXSEGMENTSIZE 33 | 09062022: check_evidence.pl sub liftover_gmap calls validates lifted CDS sequences with sub no_premature_stops 34 | 21062022: check_evidence.pl split models only fixed if mapped gene overlaps >= MINFIXOVERLAP=0.75 35 | 21062022: check_evidence.pl long models only fixed if mapped gene pair overlaps >= MINFIXOVERLAP=0.75 36 | 29062022: added _dotplot.pl to produce dotplot figures with R package pafr 37 | 14072022: added _chunk_chr.pl to break long chromosomes in chunks separated by geneless stretches (testing only) 38 | 30082022: tested get_pangenes.pl with rice, barley and wheat 39 | 12092022: check_evidence.pl -f -v prints out GMAP alignments, -p allows for partial CDS lift-over 40 | 13092022: check_evidence.pl -f does not use long/short isoforms for lift-over 41 | 09022023: fixed segments with flipped species while producing PAF in _dotplot.pl 42 | 20022023: get_pangenes.pl stops if 0 genes parsed from GFF 43 | 07032023: _collinear_genes.pl prints out all unmapped genes and the underlying cause 44 | 09032023: _collinear_genes.pl now maps genes in WGAs in both strands, added optional -n 45 | 17032023: _cluster_analysis.pl now merges disjoint clusters (diff species, 75% supporting edges) caused often by split gene models 46 | 30032023: check_evidence.pl -P prints python code to plot genomic context of pangene cluster, requires pyGenomeViz 47 | 03042023: discard partially mapped genes at the ends of chrs (query2ref) in _collinear_genes.pl 48 | 
26042023: updated documentation and added Example 2: pangene and Presence-Absence Variation (PAV) analysis 49 | 04052023: added check_quality.pl and updated documentation 50 | 30082023: completed target test_pangenes in Makefile 51 | 30082023: Makefile target install_pangenes now system-installs pangenes/cpanfile; these are core modules, only DB_file seems to be lacking in Travis 52 | 25092023: added match_cluster.pl and updated documentation; uses .cdna.fna clusters by default, -C seems slower 53 | 24102023: added sub select_GFF_valid_genes to leave out non-coding genes; see $GFFACCEPTEDFEATS & $GFFVALIDGENEFEAT 54 | 24102023: get_pangenes.pl now prints number of valid & non-valid genes in each input GFF file (details saved in .gff.log files) 55 | 25102023: identical gene ids are now supported accross annotations, thanks for reporting the bug Pimmy! 56 | 10112023: improved calculation of overlap coordinates from WGA segments in different strands 57 | 14112023: genes lacking cDNA are skipped in _cluster_analysis.pl 58 | 15112023: removed bug from 25102023 that impaired removal of non-local genes, this duplicated a few clusters 59 | 23112023: match_cluster.pl now takes a complete pangene results folder ie Oryza_nivara_v1chr1_alltaxa_5neigh_algMmap_ 60 | 24112023: added rename_pangenes.pl to assing pangene IDs to previously computed clusters 61 | 10012024: fixed bug in handling - strand coords in sub query2ref_coords 62 | 11012024: sub _parseCIGARfeature handles correctly 1bp CS-type SNPs when computing overlap with optional query coord 63 | 28022024: removed bug from 25102023 that misordered non-reference clusters in matrices 64 | 28022024: added -n to avoid intervining non-reference pangenes in _cluster_analysis.pl 65 | 28022024: added -f to get_pangenes.pl, which calls _cluster_analysis -v to make blocks of ref genes 66 | 24052024: check_quality.pl -h prints header 67 | 01082024: added -S to rename_pangenes.pl 68 | 25092024: added section 'Example 6: estimation of 
haplotype diversity' 69 | 03102024: get_pangenes.pl expects min 95% sequence identity for WGA-based gene alignments, as in GET_HOMOLOGUES-EST, to help avoid diverged tandem copies 70 | 04102024: get_pangenes.pl now set MAXDISTNEIGHBORS=2, neighbor genes in a cluster cannot be more than 2 genes away 71 | 09102024: rename_pangenes.pl -r creates all expected outfiles, tested with rice data 72 | 15012025: prepare for conda package 73 | 06032025: get_pangenes.pl: sort & concat alignment results using tempfile with filenames to sort to avoid "Argument list too long" 74 | 24032025: BED matrix produced by _cluster_analysis.pl is 0-based 75 | 25032025: match_cluster.pl was added -i to control sequence identity of matches 76 | 25032025: match_cluster.pl was added -F to produce a FASTA file with sequence index that can be exported as gene-based pangenome for mapping, 77 | 25032025: with estimated from reference genome 78 | 25032025: updated Makefiles and documentation 79 | 08042025: match_cluster.pl TSV output updated, tested with barley 80 | 08042025: add pangenome coords example to documentation 81 | 14052025: added POCS to troubleshooting to explain small cores 82 | 19052025: check_quality.pl does not assume gff files are available 83 | 27052025: _cluster_analysis.pl -t now affects pangene set growth simulation 84 | -------------------------------------------------------------------------------- /pangenes/HPC.conf.sample: -------------------------------------------------------------------------------- 1 | # cluster/farm configuration file, edit as needed (use spaces or tabs) 2 | # PATH might be empty or set to a path/ ending with '/' 3 | PATH /path/to/lsf/bin/ 4 | TYPE lsf 5 | SUBEXE bsub 6 | CHKEXE bjobs 7 | DELEXE bkill 8 | ERROR EXIT 9 | QARGS -q production -M 20G 10 | -------------------------------------------------------------------------------- /pangenes/HPC.conf.sample.slurm: -------------------------------------------------------------------------------- 1 | # 
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);

# Takes a GFF & FASTA pair of files and produces a new pair of files with
# the original chromosomes/contigs split in chunks of contiguous genes.
# A new chunk is created when the next gene on the current chr is further
# than MAXGENEDIST bp. A chunk is a genomic block containing at least one gene,
# usually also ending with a gene.
#
# Three output files are written, named after the species and max distance:
#  <sp>.chunk<dist>.fna : sequences of chunks, cut with bedtools getfasta
#  <sp>.chunk<dist>.gff : gene models with coordinates relative to their chunk
#  <sp>.chunk<dist>.bed : 0-based coordinates of chunks in the original FASTA
#
# Not used anymore, legacy only.

# Copyright [2022-24]
# EMBL-European Bioinformatics Institute & Estacion Experimental de Aula Dei-CSIC

# perl _chunk_chr.pl -sp oryza_sativa -fa Oryza_sativa.IRGSP-1.0.dna.toplevel.fa \
#   -gf Oryza_sativa.IRGSP-1.0.51.gff3

my $BEDTOOLSEXE = 'bedtools';

# default max distance (bp) between consecutive genes in the same chunk
my $MAXGENEDIST = 500_000;

# GFF feature types that start/extend chunks
my %main_gff_feats = (
  'gene' => 1,
  'ncRNA_gene' => 1
);

# GFF feature types skipped by chunk_GFF
my %skip_gff_feats = (
  'chromosome' => 1,
  'scaffold' => 1
);

my ( $help, $sp1, $fasta1, $bedtools_path, $cmd, $bed) = (0, 0);
my ( $maxdist, $gff1, $outpath ) = ($MAXGENEDIST, '', '');

GetOptions(
  "help|?"        => \$help,
  "sp|species=s"  => \$sp1,
  "fa|fasta=s"    => \$fasta1,
  "gf|gff=s"      => \$gff1,
  "d|maxdist=i"   => \$maxdist,
  "o|outpath=s"   => \$outpath,
  "B|bedtools=s"  => \$bedtools_path
) || help_message();

# Prints usage to stdout, does not exit
sub help_message {
  print "\nusage: $0 [options]\n\n"
    . "-sp binomial/trinomial species name (required, example: -sp oryza_sativa, used to name outfiles)\n"
    . "-fa genome FASTA filename (required, example: -fa oryza_sativa.fna)\n"
    . "-gf GFF filename (required, example: -gf oryza_sativa.RAPDB.gff)\n"
    . "-d max distance (bp) tp next gene (optional, example: -d $MAXGENEDIST)\n"
    . "-o path to output folder (optional, default current folder)\n"
    . "-B path to bedtools binary (optional, default: -B bedtools)\n\n"
}

if($help || (!$sp1 || !$fasta1 || !$gff1)){
  help_message();
  exit(0);
}

if(!-s $fasta1 || !-s $gff1){
  print "# ERROR: please make sure all input files exist and have content\n";
  exit(-1);
}

if($maxdist < 1){
  print "# ERROR: distance must be positive\n";
  exit(-1);
}

# check binaries
if(!$bedtools_path) {
  $bedtools_path = $BEDTOOLSEXE
}
if(`$bedtools_path` !~ 'sage') {
  print "# ERROR: cannot find binary file $bedtools_path , exit\n";
  exit(-1)
}

print "\n# $0 -sp $sp1 -fa $fasta1 -gf $gff1 -d $maxdist -o $outpath -B $bedtools_path\n\n";



# set output filenames
my $chunkfnafile = "$sp1.chunk$maxdist.fna";
my $chunkgfffile = "$sp1.chunk$maxdist.gff";
my $chunkbedfile = "$sp1.chunk$maxdist.bed";
if(-e $outpath) {
  $chunkfnafile = "$outpath/$chunkfnafile";
  $chunkgfffile = "$outpath/$chunkgfffile";
  $chunkbedfile = "$outpath/$chunkbedfile";
}

my ($ref_chrs, $ref_chunk_genes, $ref_chunks) =
  chunk_GFF($gff1, $maxdist, \%main_gff_feats, \%skip_gff_feats);

if(scalar(keys(%$ref_chunks)) == 0) {
  die "# ERROR: cannot chunk GFF file ($gff1)\n";
}

open(GFFCHUNK,">",$chunkgfffile) ||
  die "# ERROR: cannot open chunk GFF file ($chunkgfffile)\n";

open(FNACHUNK,">",$chunkfnafile) ||
  die "# ERROR: cannot open chunk FASTA file ($chunkfnafile)\n";

open(BEDCHUNK,">",$chunkbedfile) ||
  die "# ERROR: cannot open chunk BED file ($chunkbedfile)\n";

my $total_chunks = 0;
foreach my $chr (@$ref_chrs) {
  foreach my $chunk (sort {$a<=>$b} keys(%{$ref_chunks->{$chr}})) {

    # print transformed gene models to chunked GFF file
    print GFFCHUNK $ref_chunk_genes->{$chr}{$chunk};

    # print sequence to chunked FASTA file
    $bed = sprintf("%s\t%d\t%d",
      $chr,
      $ref_chunks->{$chr}{$chunk}{'start'}-1, #0-based
      $ref_chunks->{$chr}{$chunk}{'end'});

    $cmd = "echo '$bed' | $bedtools_path getfasta -fi $fasta1 -bed stdin";
    open(BEDTOOLS,"$cmd |") ||
      die "# ERROR: cannot run bedtools ($cmd)\n";

    # read bedtools output line by line;
    # the FASTA header is replaced with the chunk name
    while(<BEDTOOLS>) {
      if(/^>/) {
        print FNACHUNK ">$chr\.chunk$chunk\n";
      } else {
        print FNACHUNK;
      }
    }

    # a failing pipeline is only noticed when the pipe is closed
    close(BEDTOOLS) ||
      warn "# WARNING: bedtools reported a problem while cutting $chr\.chunk$chunk ($cmd)\n";

    # log
    print BEDCHUNK "$bed\t$chr\.chunk$chunk\n";

    $total_chunks++
  }
}

close(BEDCHUNK);
close(FNACHUNK);
close(GFFCHUNK);

print "# chunked GFF file: $chunkgfffile\n";
print "# chunked FASTA file: $chunkfnafile\n";
print "# chunked BED file: $chunkbedfile\n";


printf("\n# total chr/contigs=%d total chunks=%d\n",
  scalar(@$ref_chrs),
  $total_chunks);


###############################

# Parses GFF file and finds chunks of contiguous genes.
# Returns a list of three references, in this order:
# i) ref to list with chr names, in the order their first main feature appears
# ii) ref to hash mapping chr & chunk ID to translated gene models (GFF lines) in chunk
# iii) ref to hash mapping chr & chunk ID to 'start' and 'end' (1-based coordinates
#      in original FASTA) and 'offset' (subtracted from original coordinates)
#
# Params:
# $gff_file     : path to GFF file
# $maxdist      : a new chunk is started when the next main feature starts
#                 more than this many bp after the end of the previous one
# $ref_main_gff : ref to hash with GFF feature types (column 3) that define chunks
# $ref_skip_gff : ref to hash with GFF feature types that are skipped
#
# Chunk IDs are integers starting at 1 and are not restarted with every chr.
# Dies if the GFF file cannot be read.
sub chunk_GFF {

  my ($gff_file, $maxdist, $ref_main_gff, $ref_skip_gff) = @_;

  my ($chr, $start, $end, $gff_line);
  my ($num_chunk, $dist, $prev_end) = (1, 0, 0);
  my (%chunk_genes, %chunk, %seen_chr, @chrs);

  open(my $gff_fh,"<",$gff_file) ||
    die "# ERROR(chunk_GFF): cannot read $gff_file\n";

  while(<$gff_fh>){

    next if(/^#/ || /^$/);

    my @gff = split(/\t/,$_);

    # lines without coordinates are not features
    next if(scalar(@gff) < 5);

    ($chr, $start, $end) = @gff[0,3,4];

    next if($ref_skip_gff->{ $gff[2] });

    if($ref_main_gff->{ $gff[2] }) {

      if(!$seen_chr{$chr}) {

        # new chunk with new chr; the very first chr keeps chunk 1.
        # Exact hash lookup instead of a regex, chr names might
        # contain metacharacters such as '.' or '|'
        $num_chunk++ if(@chrs);
        $seen_chr{$chr} = 1;
        push(@chrs, $chr);
        $prev_end = 0;

      } elsif($prev_end > 0) { # new chunk if previous gene too far

        $dist = $start-$prev_end;
        if($dist > $maxdist) {
          $num_chunk++;
        }
      }

      # set chunk start and offset,
      # used to cut sequence & to transform gene coords
      if(!defined($chunk{$chr}{$num_chunk}{'start'})) {
        $chunk{$chr}{$num_chunk}{'start'} = $start;
        $chunk{$chr}{$num_chunk}{'offset'} = $start-1;
      }

      # update chunk last coord with every new gene
      $chunk{$chr}{$num_chunk}{'end'} = $end;

      # save end coord for next iteration
      $prev_end = $end;

    } elsif(!exists($chunk{$chr}) || !exists($chunk{$chr}{$num_chunk})) {

      # feature found before the first main feature of this chr:
      # there is no chunk (nor offset) to place it in, skip it
      # instead of creating an empty chunk
      next;
    }

    # transform coords relative to current chunk;
    # last column keeps its newline, so lines can be simply concatenated
    $gff[0] = "$chr.chunk$num_chunk";
    $gff[3] -= $chunk{$chr}{$num_chunk}{'offset'};
    $gff[4] -= $chunk{$chr}{$num_chunk}{'offset'};
    $gff_line = join("\t",@gff);

    $chunk_genes{$chr}{$num_chunk} .= $gff_line;
  }
  close($gff_fh);

  return (\@chrs, \%chunk_genes, \%chunk);
}
"-l min length (bp) of features (optional, example: -l 100)\n" 46 | . "-nr remove redundancy in seq names (optional, ie 'gene:ONIVA01G00100')\n" 47 | . "-p path to gffread binary (optional, default: $GFFREADEXE)\n" 48 | . "-o path to output folder (optional, default current folder)\n\n" 49 | } 50 | 51 | if($help || (!$sp1 || !$fasta1 || !$gff1)){ 52 | help_message(); 53 | exit(0); 54 | } 55 | 56 | if(!-s $fasta1 || !-s $gff1){ 57 | print "# ERROR: please make sure all input files exist and have content\n"; 58 | exit(-1); 59 | } 60 | 61 | if($patchgff1 && !-e $patchgff1){ 62 | print "# ERROR: please make sure patch GFF file exists\n"; 63 | exit(-2); 64 | } 65 | 66 | if(!$gffreadpath){ 67 | $gffreadpath = $GFFREADEXE; 68 | } 69 | 70 | if($minlen < 1){ 71 | $minlen = 0 72 | } 73 | 74 | print "\n# $0 -sp $sp1 -fa $fasta1 -gf $gff1 -pt $patchgff1 " . 75 | "-l $minlen -nr $nored -path $gffreadpath\n\n"; 76 | 77 | # set output filenames 78 | my $cdnafile = "$sp1.cdna.fna"; 79 | my $cdsfile = "$sp1.cds.fna"; 80 | my $pepfile = "$sp1.cds.faa"; 81 | if($patchgff1){ 82 | $cdnafile = "$sp1.patch.cdna.fna"; 83 | $cdsfile = "$sp1.patch.cds.fna"; 84 | $pepfile = "$sp1.patch.cds.faa"; 85 | $patched_gff_filename = $gff1; 86 | $patched_gff_filename =~ s/\.gff$/.patched.gff/; 87 | } 88 | if($outpath) { 89 | $cdnafile = "$outpath/$cdnafile"; 90 | $cdsfile = "$outpath/$cdsfile"; 91 | $pepfile = "$outpath/$pepfile"; 92 | } 93 | 94 | # only patch if required 95 | if(-s $patchgff1) { 96 | my $num_patches = patch_gff($gff1, $patchgff1, $patched_gff_filename); 97 | $gff1 = $patched_gff_filename; 98 | 99 | } else { 100 | # otherwise just make symb link 101 | symlink($gff1, $patched_gff_filename); 102 | } 103 | 104 | my ($ref_names, $ref_coords) = parse_genes($gff1); 105 | 106 | my $num_cdna = parse_gffread($gffreadpath,$fasta1,$gff1,$cdnafile, 107 | 'cdna',$minlen,$nored,$ref_names,$ref_coords); 108 | my $num_cds = parse_gffread($gffreadpath,$fasta1,$gff1,$cdsfile, 109 | 
'cds',$minlen,$nored,$ref_names,$ref_coords); 110 | my $num_pep = parse_gffread($gffreadpath,$fasta1,$gff1,$pepfile, 111 | 'pep',$minlen,$nored,$ref_names,$ref_coords); 112 | 113 | if(scalar(keys(%$ref_names)) == 0) { 114 | die "# ERROR: cannot parse Parent IDs of mRNA/transcripts, please check GFF format ($gff1)\n"; 115 | } 116 | 117 | if($num_cdna) { 118 | print "# $cdnafile n=$num_cdna\n"; 119 | } else { 120 | die "# ERROR: cannot extract cDNA sequences, please check GFF format and/or chr names ($gff1)\n"; 121 | } 122 | 123 | if($num_cds) { 124 | print "# $cdsfile n=$num_cds\n" 125 | } else { 126 | die "# ERROR: cannot extract CDS sequences, please check GFF format and/or chr names ($gff1)\n"; 127 | } 128 | 129 | print "# $pepfile n=$num_pep\n"; 130 | 131 | 132 | ############################### 133 | 134 | # Runs gffread, parses its stdout and saves output in FASTA file. 135 | # Returns number of sequences printed out. 136 | sub parse_gffread { 137 | 138 | my ($gffreadexe,$fasta_file,$gff_file,$outfile, 139 | $seqtype,$minlen,$remove_red,$ref_tr2gene,$ref_tr2coords) = @_; 140 | 141 | my ($params,$mrnaid,$geneid,$coords); 142 | 143 | if($seqtype eq 'cds'){ 144 | $params = '-x - '; 145 | } elsif($seqtype eq 'pep'){ 146 | $params = '-y - '; 147 | } else { 148 | $params = '-w - '; # cDNA, default 149 | } 150 | 151 | if($minlen > 0) { 152 | $params .= " -l $minlen "; 153 | } 154 | 155 | my $num_seqs = 0; 156 | open(OUT,">",$outfile) || 157 | die "# ERROR(parse_gffread): cannot create $outfile\n"; 158 | 159 | open(GFFREAD,"$gffreadexe $params -g $fasta_file $gff_file |") || 160 | die "# ERROR(parse_gffread): cannot run $gffreadexe\n"; 161 | 162 | while(){ 163 | if(/^>(\S+)/){ 164 | $mrnaid = $1; 165 | $geneid = $ref_tr2gene->{$mrnaid} || ''; 166 | $coords = $ref_tr2coords->{$mrnaid} || ''; 167 | 168 | # remove redundant bits 169 | if($remove_red){ 170 | $mrnaid =~ s/transcript://; 171 | $geneid =~ s/gene://; 172 | } 173 | 174 | print OUT ">$mrnaid $geneid $coords 
[$sp1]\n"; 175 | $num_seqs++; 176 | } else { 177 | print OUT; 178 | } 179 | } 180 | close(GFFREAD); 181 | 182 | close(OUT); 183 | 184 | # do not remove index (used later) 185 | # unlink($fasta_file.'.fai'); 186 | 187 | return $num_seqs 188 | } 189 | 190 | # Reads in GFF file to parse gene names as parent IDs of transcripts. 191 | # Returns: 192 | # i) ref to hash mapping transcript ID -> gene ID 193 | # ii) ref to hash mapping transcript ID -> gene coords 194 | sub parse_genes { 195 | 196 | my ($gff_file) = @_; 197 | 198 | my ($mrnaid,$geneid,$coord,%names,%coords); 199 | 200 | open(GFF,"<",$gff_file) || die "# ERROR(parse_genenames): cannot read $gff_file\n"; 201 | while(){ 202 | my @F = split(/\t/,$_); 203 | 204 | next if(scalar(@F)<9 || ($F[2] ne "mRNA" && $F[2] ne "transcript")); 205 | 206 | # take only genes where ID can be parsed 207 | if($F[8] =~ /ID=([^;]+).*?Parent=([^;]+)/){ 208 | 209 | $mrnaid = $1; 210 | $geneid = $2; 211 | chomp $geneid; 212 | 213 | $coord = "$F[0]:$F[3]-$F[4]($F[6])"; 214 | $names{$mrnaid} = $geneid; 215 | $coords{$mrnaid} = $coord; 216 | } 217 | } 218 | close(GFF); 219 | 220 | return (\%names,\%coords); 221 | } 222 | 223 | # Takes two GFF files (original and patch) and produces a new GFF file 224 | # that includes patched gene models. Patched models match the original 225 | # ones by means of 'old_locus_tag' tags in gene features. 226 | # Patched GFF files might be concatenated; this means latter models 227 | # can replaced models declared earlier in the same file. 
# Three params:
# i) GFF filename
# ii) GFF filename with selected gene model patches
# iii) output patched GFF filename, must differ from i)
# Returns integer with number of patched gene models
sub patch_gff {

  my ($gff_file, $patchfile, $patched_gff_filename) = @_;

  my ($gene_id,$old_gene_id,$orig_gene_id);
  my ($chr,$start,$line,$attrs);
  my $patch = '';
  my (%depr_gene_id, %new2old, %patched_model, %coords);

  # output is opened for writing before input is read,
  # sharing the name would wipe out the original GFF
  if($patched_gff_filename eq $gff_file) {
    die "# ERROR(patch_gff): output must differ from input $gff_file\n";
  }

  # read in GFF patches, possible concatenated
  open(PATCH,'<',$patchfile) ||
    die "# ERROR(patch_gff): cannot read $patchfile\n";
  while($line = <PATCH>) {

    # skip empty lines and comments
    next if($line =~ /^$/ || $line =~ /^#/);

    chomp($line);
    my @gffdata = split(/\t/,$line);

    #chr01 gmap gene 411 776 . - . ID=Os121164;..old_locus_tag=missing;
    #chr01 gmap gene 411 776 . - . ID=gene:Os127564;..old_locus_tag=Os121164;
    #chr01 gmap gene 411 800 . - . ID=Os22222;..old_locus_tag=Os127564;

    if($gffdata[2] && $gffdata[2] eq 'gene') {

      $chr = $gffdata[0];
      $start = $gffdata[3];
      $attrs = defined($gffdata[8]) ? $gffdata[8] : '';

      if($attrs =~ m/ID=([^;]+)/) {
        $gene_id = $1;
      } else {
        # without ID this model cannot be tracked; forget current gene
        # so that its child features are not added to the previous model
        print "# skip patched gene without ID: $line\n";
        $gene_id = undef;
        next;
      }

      while($attrs =~ m/old_locus_tag=([^;]+)/g) {
        $old_gene_id = $1;

        # if gene replaces a previous patched model,
        # correct old_locus_tag to point to original
        if($new2old{ $old_gene_id }) {

          $orig_gene_id = $new2old{ $old_gene_id };

          # add original old locus tag (literal match, IDs contain dots)
          if(index($line, $orig_gene_id) == -1) {
            $line .= "old_locus_tag=$orig_gene_id;";
          }

          $depr_gene_id{ $old_gene_id } = $gene_id;
          $old_gene_id = $orig_gene_id;
        }

        $new2old{ $gene_id } = $old_gene_id;
        $depr_gene_id{ $old_gene_id } = $gene_id;
      }

      $patched_model{ $gene_id } = "$line\n";
      $coords{ $chr }{ $start } = $gene_id;

    } elsif(defined($gene_id)) {
      # child feature (mRNA, exon, CDS...) of current gene
      $patched_model{ $gene_id } .= "$line\n"
    }
  }
  close(PATCH);

  # find non-redundant patches, sort and concat them
  my $total_patched = 0;
  foreach $chr (sort {$a cmp $b} keys(%coords)) {
    foreach $start (sort {$a <=> $b} keys(%{ $coords{$chr} })) {

      $gene_id = $coords{ $chr }{ $start };

      if($depr_gene_id{ $gene_id }) {
        print "# skip old patched $gene_id\n";
        next;
      }

      $patch .= $patched_model{ $gene_id };
      $total_patched++;
    }
  }

  printf("# total patched: %d deprecated: %d\n\n",
    $total_patched,scalar(keys(%depr_gene_id)));

  # read in original GFF, apply patches and save output
  open(PATCHED,">",$patched_gff_filename) ||
    die "# ERROR(patch_gff): cannot create $patched_gff_filename\n";

  # $geneOK tells whether the current gene, and the features that follow
  # it, are kept (1) or dropped as replaced by a patch (0)
  my $geneOK = 1;
  open(GFF,'<',$gff_file) ||
    die "# ERROR(patch_gff): cannot read $gff_file\n";
  while(<GFF>) {

    my @gffdata = split(/\t/,$_);
    if($gffdata[2] && $gffdata[2] eq 'gene') {

      $geneOK = 1;
      if(defined($gffdata[8]) && $gffdata[8] =~ m/ID=([^;]+)/) {
        $gene_id = $1;
        $geneOK = 0 if($depr_gene_id{ $gene_id });
      }
    }

    print PATCHED if($geneOK);
  }
  close(GFF);

  # apply patches
  print PATCHED $patch;
  close(PATCHED);

  return $total_patched
}

--------------------------------------------------------------------------------
/pangenes/_dotplot.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl
use strict;
use warnings;
use File::Basename qw( dirname );
use FindBin '$Bin';
use lib "$Bin/lib";
use pangeneTools qw( read_FAI_regex2hash );

$|=1;

# Parses pairwise collinear TSV files made with _collinear_genes.pl to
# produce PAF files that
# can be used to produce a dotplot of matched gene
# models with R package pafr [https://cran.r-project.org/package=pafr]
# Note: contigs < $MINCONTIGSIZE are ignored for clarity

# Copyright [2022-23]
# EMBL-European Bioinformatics Institute & Estacion Experimental de Aula Dei-CSIC

# perl _dotplot.pl _Oryza_nivara_v1.Oryza_sativa.IRGSP-1.0.algMmap.overlap0.5.tsv

my $MINCONTIGSIZE = 100_000;
my $DUMMYQUAL = 60;
my $VERBOSE = 0;

# NOTE(review): the argument placeholder was lost in extraction,
# '<collinear TSV file>' is a reconstruction
my $TSVfile = $ARGV[0] || die "# usage: $0 <collinear TSV file>\n";

my $resultsDIR = dirname($TSVfile);

my ($sp1filename, $sp2filename) = ('', '');
my ($faifile, $sp1, $sp2, $species, $chr, $len) = ('','','');
my (%file,%size);

## locate FASTA index files (.fai), usually created in _cut_sequences.pl
opendir(DIR,$resultsDIR) || die "# ERROR: cannot list $resultsDIR\n";
my @faifiles = grep {/\.fai$/} readdir(DIR);
closedir(DIR);

foreach $faifile (@faifiles) {

  $species = $faifile;
  $species =~ s/^_//;
  $species =~ s/\.fna\.fai$//;

  # species names are matched literally (they contain dots), the first
  # species follows '_' in the TSV filename, the second one follows '.'
  if($TSVfile =~ m/_\Q$species\E\./) {
    $sp1 = $species;
    $sp1filename = $sp1;
    $file{ $sp1 } = "$resultsDIR/$faifile";
  } elsif($TSVfile =~ m/\.\Q$species\E\./) {
    $sp2 = $species;
    $sp2filename = $sp2;
    $file{ $sp2 } = "$resultsDIR/$faifile";
  }
}

## parse contig sizes from FASTA index files
if($sp1 eq '' || $sp2 eq '') {
  die "# ERROR: cannot find FASTA indexes (.fai) for $TSVfile ($sp1, $sp2)\n";
} else {
  for $species ($sp1, $sp2) {
    my $ref_bed = read_FAI_regex2hash( $file{$species} );
    foreach $chr (keys(%$ref_bed)) {
      $len = (split(/\t/,$ref_bed->{$chr}))[2];
      $size{$species}{$chr} = $len;
      print "# $species $chr $len\n" if($VERBOSE);
    }
  }
}

## parse TSV file and convert it to PAF format so that pafr can take it,
## see https://dwinter.github.io/pafr/articles/Introduction_to_pafr.html

my $outPAFfile = $TSVfile;
$outPAFfile =~ s/\.tsv$/.genes.paf/;

open(PAF,">",$outPAFfile) || die "# ERROR: cannot create $outPAFfile\n";

my (@data,$chr1,$start1,$end1,$chr2,$start2,$end2);
my ($strand1,$strand2,$relstrand);
open(TSV,"<",$TSVfile) || die "# ERROR: cannot read $TSVfile\n";
while(<TSV>) {

  @data = split(/\t/,$_);

  # OsMH63_01G000010 OsMH63_01G000010 oryza_sativa_mh63 .. ortholog_collinear .. oryza_sativa .. 1:7538-15379(+);1:2902-10817(+)
  # Note that after
  # https://github.com/Ensembl/plant-scripts/blob/f9c9e4e71fbb8e46f84b0609a6dfc1dd5930bacf/pangenes/_collinear_genes.pl#L1774
  # in segments species order might change
  # Os01g0100466 Os01g0100466 oryza_sativa .. segment_collinear .. oryza_sativa_mh63:1:17370-18541(-) .. 1:12807-13978(-);1:17370-18541(-)

  # short or header lines have no coordinates column
  next if(!defined($data[14]));

  if($data[14] =~ m/(\S+)?:(\d+)-(\d+)\(([+-])\);(\S+)?:(\d+)-(\d+)\(([+-])\)/) {

    $sp1 = $data[2];
    $sp2 = $data[7];
    ($chr1,$start1,$end1,$strand1,$chr2,$start2,$end2,$strand2) = ($1,$2,$3,$4,$5,$6,$7,$8);

    # skip unknown contigs ie unplaced_chrUn, might occur with -s
    next if(!$size{$sp1}{$chr1} || !$size{$sp2}{$chr2});

    next if($size{$sp1}{$chr1} < $MINCONTIGSIZE ||
      $size{$sp2}{$chr2} < $MINCONTIGSIZE);

    if($strand1 eq $strand2) {
      $relstrand = '+';
    } else {
      $relstrand = '-';
    }

    # name, length, 0-based start and end of both models; PAF query is
    # always the first species in the TSV filename
    my @model1 = ($chr1,$size{$sp1}{$chr1},$start1-1,$end1);
    my @model2 = ($chr2,$size{$sp2}{$chr2},$start2-1,$end2);
    my ($query,$target) = ($sp1 eq $sp1filename) ?
      (\@model1,\@model2) : (\@model2,\@model1);

    printf( PAF "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n",
      @$query,
      $relstrand,
      @$target,
      $data[3], # overlap instead of matching bases in the mapping
      $data[3], # overlap instead of bases, including gaps, in the mapping
      $DUMMYQUAL);
  }
}
close(TSV);

close(PAF);

print "\n# \$MINCONTIGSIZE = $MINCONTIGSIZE\n";
print "\n# PAF file: $outPAFfile\n\n";

print "# Make a dotplot of aligned models coords with the following R script:\n";

# NOTE(review): the text between the here-document operator below and the
# middle of asciinema.txt (R script, file separator and first lines of the
# demo) was lost in extraction; the damaged line is kept as found and must
# be restored from the original files
print< LSF , HPC.conf.sample.slurm -> slurm
ls HPC*

# suppose you want to run GET_PANGENES in a slurm setting
cp HPC.conf.sample.slurm HPC.conf
cat HPC.conf
# you should manually edit file HPC.conf to match your settings
# for instance, the provided sample file assumes a queue named 'production' and max 70GB RAM per job,
# enough in our benchmarks up to wheat and maize using minimap2, you might want to change that

sleep 3

## examples

# local analysis of test_rice data, make it HPC/parallel with: perl get_pangenes.pl -d ../files/test_rice -m cluster
perl get_pangenes.pl -d ../files/test_rice

# results folder is: /home/contrera/plant-scripts/pangenes/test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_
ls /home/contrera/plant-scripts/pangenes/test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_

sleep 3

# now restrict whole-genome alignments (WGA) to homologous chromosomes,
# for this to work chr names in input FASTA files must be consistent so that regular expression will match them all, let's check:
zgrep "^>" ../files/test_rice/*fa.gz

# in the test_rice example the main chromosomes are named with integer numbers, this should work:
perl get_pangenes.pl -d ../files/test_rice -s '^\d+'

# now the results are in
/home/contrera/plant-scripts/pangenes/test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_split_ 49 | # you can see that 'split_' is now added to the folder name to indicate that input genomes were split by chr matching the regular expression 50 | 51 | sleep 3 52 | 53 | # now we will mask geneless regions >= 1Mb, this is required by minimap2 with large genomes 54 | perl get_pangenes.pl -d ../files/test_rice -H 55 | # in this case, output is saved to Oryza_nivara_v1chr1_highrep_alltaxa_2neigh_algMmap_ 56 | # note the 'highrep_' tag 57 | 58 | sleep 3 59 | 60 | # let's check how much disk the output folder takes, most are temporary files 61 | # that might be re-used in future jobs, but can be removed if needed 62 | du -hs test_rice_pangenes/ 63 | du -hs test_rice_pangenes/tmp 64 | 65 | sleep 3 66 | 67 | # we will now extract the WGA evidence supporting an example pangene cluster, 68 | # see also options -f -v 69 | perl check_evidence.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -i gene:ONIVA01G50800.cdna.fna 70 | 71 | sleep 3 72 | 73 | # match arbitrary sequences to computed pangene clusters 74 | perl match_cluster.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -s ../files/test_transcripts.fna -o test_transcripts.gmap.tsv 75 | cat test_transcripts.gmap.tsv 76 | 77 | sleep 3 78 | 79 | # clean up 80 | rm -rf test_rice_pangenes 81 | 82 | exit 83 | -------------------------------------------------------------------------------- /pangenes/bin/README.md: -------------------------------------------------------------------------------- 1 | This is where external binaries needed to analyze a pan-gene set are to be installed. 2 | They can be installed with: 3 | 4 | cd ../.. 
make install_pangenes

--------------------------------------------------------------------------------
/pangenes/check_quality.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl

# This script takes a cDNA/CDS cluster produced by get_pangenes.pl and
# produces a quality control report

# Copyright [2023-25]
# EMBL-European Bioinformatics Institute & Estacion Experimental Aula Dei-CSIC

$|=1;

use strict;
use warnings;
use Getopt::Std;
use File::Temp qw/ tempfile /;
use FindBin '$Bin';
use lib "$Bin/lib";
use pangeneTools qw( check_installed_features feature_is_installed
  parse_sequence_FASTA_file calc_stdev calc_mode );

my @FEATURES2CHECK = (
  'EXE_CLUSTALO', 'EXE_ALISTAT', 'EXE_GREP'
);

my ($INP_dir, $INP_clusterfile, $INP_first_isof, $INP_noheader, $INP_outdir) = ('','',0,0,'');
my ($isCDS, $ispep, $seq, $n_isof, $occup, $SE_len, $SE_exons) = ( 0, 0 );
my ($updir, $n_exons, $gff_file, $SE_dist, $max_dist, $c);
my ($cluster_list_file,$cluster_folder, $gene_id, $isof_id);
my ($msa_filename, $dist_filename, $fhmsa, $fhdist, $cmd);
my ($sites, $Ca, $Cr_max, $Cr_min, $Cc_max, $Cc_min, $Cij_max, $Cij_min);
my (%opts, %isof_len, %isof_seq, %isof_header, %isof_order);
my (%taxa, @len, @exons, @dist, @mode_len, @mode_exons, @mode_dist);

getopts('hnIco:d:i:', \%opts);

if(($opts{'h'})||(scalar(keys(%opts))==0))
{
  print "\nusage: $0 [options]\n\n";
  print "-h this message\n";
  print "-c print credits and checks installation\n";
  print "-d directory produced by get_pangenes.pl (example: -d /path/data_pangenes/..._algMmap_,\n";
  print " genomic and gff files usually one folder up)\n";
  print "-i cdna/cds .fna/.faa file as in .cluster_list (example: -i gene:ONIVA01G52180.cdna.fna)\n";
  print "-I take 1st isoform only (optional, by default takes all)\n";
  print "-o folder to write output files (optional, MSA files removed by default)\n";
  print "-n do not print header in text report (optional, by default the following header is added:\n";
  print " file 1stisof occup seqs mode_len SE_len mode_exons\n";
  print " SE_exons mode_dist max_dist SE_dist sites Ca Cr_max\n";
  print " Cr_min Cc_max Cc_min Cij_max Cij_min)\n";
  exit(0);
}

if(defined($opts{'c'})) {
  print "\nPrimary citation:\n https://doi.org/10.1186/s13059-023-03071-z\n";
  print "\nThis software uses external algorithms, please cite them accordingly:\n";
  print " clustal-omega https://doi.org/10.1002%2Fpro.3290\n";
  print " AliStat https://doi.org/10.1093/nargab/lqaa024\n";

  # check all binaries needed by this program and print diagnostic info
  print check_installed_features(@FEATURES2CHECK);
  exit(0);
}

if(defined($opts{'d'})) {
  $INP_dir = $opts{'d'};
  $updir = $INP_dir . '/..';
}
else{ die "# EXIT : need a -d directory\n" }

if(defined($opts{'i'})){
  $INP_clusterfile = $opts{'i'};
  if($INP_clusterfile !~ /\.cdna\.fna$/ &&
    $INP_clusterfile !~ /\.cds\.fna$/ &&
    $INP_clusterfile !~ /\.cds\.faa$/) {
    die "# EXIT : need a .fna/.faa cluster filename with parameter -i\n"

  } else {
    if($INP_clusterfile =~ /\.cds\.f/) {
      $isCDS = 1
    }

    if($INP_clusterfile =~ /\.cds\.faa/) {
      $ispep = 1
    }
  }
}
else{ die "# EXIT : need parameter -i\n" }

if(defined($opts{'I'})){
  $INP_first_isof = 1
}

if(defined($opts{'o'})){
  $INP_outdir = $opts{'o'};
  if(!-e $INP_outdir) {
    mkdir($INP_outdir) ||
      die "# ERROR: cannot create output folder $INP_outdir\n";
  }
}

if(defined($opts{'n'})){
  $INP_noheader = 1
}

# 1) locate .cluster_list file to check clusterfile is there
opendir(INPDIR,$INP_dir) ||
  die "# ERROR: cannot list $INP_dir , please check -d argument is a valid folder\n";
my @files = grep {/\.cluster_list/} readdir(INPDIR);
closedir(INPDIR);

if(@files) {
  $cluster_list_file = $files[0];
  $cluster_folder = (split(/\.cluster_list/,$cluster_list_file))[0]
} else {
  die "# ERROR: cannot find .cluster_list file in $INP_dir\n";
}

my $clusternameOK = 0;
open(LIST,"<","$INP_dir/$cluster_list_file") ||
  die "# ERROR: cannot read $INP_dir/$cluster_list_file, ".
    "please check -d argument is a valid folder\n";

while(<LIST>) {
  # literal match, cluster filenames contain dots
  if(/\Q$INP_clusterfile\E/) {
    $clusternameOK = 1;
    last;
  }
}
close(LIST);

if($clusternameOK == 0) {
  die "# ERROR: cannot find $INP_clusterfile in $INP_dir/$cluster_list_file, please correct\n";
}

# 2) parse FASTA file, extract gene names and sequence lengths
my ( $ref_geneid, $ref_fasta, $ref_isof_coords, $ref_taxon ) =
  parse_sequence_FASTA_file( "$INP_dir/$cluster_folder/$INP_clusterfile" , 1);

foreach $gene_id (@$ref_geneid) {

  $n_isof = 0;
  foreach $seq (split(/\n/,$ref_fasta->{$gene_id})) {

    if($seq =~ /^>(\S+)/) {
      $n_isof++;
      $isof_id = $1;
      $isof_header{$gene_id}{$isof_id} = $seq;
      $isof_order{$gene_id}{$isof_id} = $n_isof;
      next;
    }
    $isof_len{$gene_id}{$isof_id} += length($seq);
    $isof_seq{$gene_id}{$isof_id} .= $seq;
  }
}

# 3) print selected isoform sequence(s) to temp file and work out basic stats
my ($fh, $filename) = tempfile( 'tempfasXXXXX', UNLINK => 1);

foreach $gene_id (@$ref_geneid) {
  foreach $isof_id (keys(%{$isof_len{$gene_id}})) {

    next if($INP_first_isof == 1 && $isof_order{$gene_id}{$isof_id} != 1);

    $taxa{ $ref_taxon->{$gene_id} }++;
    push(@len, $isof_len{$gene_id}{$isof_id});

    # find GFF file & get number of exons
    $n_exons = 0;
    $gff_file = $updir . "/_$ref_taxon->{$gene_id}.gff";

    # not always there, perhaps only results folder present;
    # -s is true only for existing, non-empty files
    if(-s $gff_file) {
      open(GREP, "$ENV{'EXE_GREP'} '$isof_id;' $gff_file |");
      while(<GREP>) {
        #1 NAM exon 2575663 2575953 ...
        my @data = split(/\t/,$_);
        next if(!defined($data[2]));

        if($isCDS == 0 && $data[2] eq 'exon') {
          $n_exons++

        } elsif($isCDS == 1 && $data[2] eq 'CDS') {
          $n_exons++
        }
      }
      close(GREP);
    }

    push(@exons, $n_exons);

    # actually print to temp file
    print $fh "$isof_header{$gene_id}{$isof_id}\n$isof_seq{$gene_id}{$isof_id}\n";
  }
}

# make sure all sequences are on disk before the aligner reads them,
# the file itself is removed at exit (UNLINK)
close($fh);

$occup = scalar(keys(%taxa));
$n_isof = scalar(@len); # recompute in case only 1st isoform taken

# standard errors below divide by (the square root of) $n_isof
if($n_isof == 0) {
  die "# ERROR: cannot parse sequences in $INP_dir/$cluster_folder/$INP_clusterfile\n";
}

$SE_len = sprintf("%1.1f", calc_stdev( \@len ) / sqrt($n_isof));
@mode_len = calc_mode( \@len );
$SE_exons = sprintf("%1.1f", calc_stdev( \@exons ) / sqrt($n_isof));
@mode_exons = calc_mode( \@exons );


# 4) compute multiple sequence alignment (MSA), distance matrix & MSA report

if($INP_outdir ne '') {
  $msa_filename = "$INP_outdir/$INP_clusterfile";
  $msa_filename =~ s/\.(f[na]a)$/.aln.$1/;
  $dist_filename = "$INP_outdir/$INP_clusterfile";
  $dist_filename =~ s/\.(f[na]a)$/.dist.$1/;
} else {
  ($fhmsa, $msa_filename) = tempfile( 'tempmsaXXXXX', UNLINK => 1);
  ($fhdist, $dist_filename) = tempfile( 'tempdistXXXXX', UNLINK => 1);
}

$cmd = "$ENV{'EXE_CLUSTALO'} --force --full -i $filename -o $msa_filename --distmat-out=$dist_filename 2>&1";
system($cmd);
if ( $? != 0 ) {
  die "# ERROR: failed running clustal-omega ($cmd)\n";
} elsif ( !-s $msa_filename ) {
  die "# ERROR: failed generating $msa_filename file ($cmd)\n";
}

# parse MSA distances, first column of each row is skipped
$max_dist = -1;
open(DIST,"<",$dist_filename) ||
  die "# ERROR: cannot read $dist_filename\n";
while(<DIST>) {
  chomp;
  my @data = split(/\s+/,$_);
  next if($#data < 1);
  foreach $c (1 .. $#data) {
    push(@dist, $data[$c]);
    if($data[$c] > $max_dist){ $max_dist = $data[$c] }
  }
}
close(DIST);

# NOTE(review): divides by $n_isof, not sqrt($n_isof) as above; presumably
# because @dist holds a full n x n matrix, so sqrt(n*n) == n -- confirm
$SE_dist = sprintf("%1.1f", calc_stdev( \@dist ) / $n_isof);
@mode_dist = calc_mode( \@dist );

# MSA report
$cmd = "$ENV{'EXE_ALISTAT'} $msa_filename 1 -b";
if($ispep) {
  $cmd = "$ENV{'EXE_ALISTAT'} $msa_filename 6 -b";
}

open(ALISTAT,"$cmd |") ||
  die "# ERROR: cannot run $cmd\n";
while(<ALISTAT>) {
  #sequences, #sites, Ca, Cr_max, Cr_min, Cc_max, Cc_min, Cij_max, Cij_min
  if(/^\Q$msa_filename\E/) {
    chomp;
    my @data = split(/,\s+/,$_);
    ($sites, $Ca, $Cr_max, $Cr_min, $Cc_max, $Cc_min, $Cij_max, $Cij_min) = @data[2 .. $#data];
  }
}
close(ALISTAT);


# 5) finally print summary in one line
if($INP_noheader == 0) {
  print "file\t1stisof\toccup\tseqs\tmode_len\tSE_len\tmode_exons\tSE_exons\t" .
    "mode_dist\tmax_dist\tSE_dist\tsites\tCa\tCr_max\tCr_min\tCc_max\tCc_min\tCij_max\tCij_min\n";
}

printf(
  "%s\t%d\t%d\t%d\t%d\t%1.1f\t" .
  "%d\t%1.1f\t%1.6f\t%1.6f\t%1.6f\t%d\t" .
  "%1.6f\t%1.6f\t%1.6f\t%1.6f\t%1.6f\t%1.6f\t%1.6f\n",

  $INP_clusterfile,
  $INP_first_isof,
  $occup,
  $n_isof,
  $mode_len[0],
  $SE_len,

  $mode_exons[0],
  $SE_exons,
  $mode_dist[0],
  $max_dist,
  $SE_dist,
  $sites,

  $Ca,
  $Cr_max,
  $Cr_min,
  $Cc_max,
  $Cc_min,
  $Cij_max,
  $Cij_min);

--------------------------------------------------------------------------------
/pangenes/cpanfile:
--------------------------------------------------------------------------------
requires 'FindBin';
requires 'Compress::Zlib';
requires 'Cwd';
requires 'Fcntl';
requires 'File::Basename';
requires 'File::Copy';
requires 'File::Temp';
requires 'Getopt::Long';
requires 'Getopt::Std';
requires 'DB_File';

--------------------------------------------------------------------------------
/pangenes/lib/HPCluster.pm:
--------------------------------------------------------------------------------
package HPCluster;

# Package to manage cluster jobs from get_pangenes.pl

# Currently supports SGE, SLURM and LSF clusters,
# but it should not be too difficult to add support for other systems (help welcome)

# taken from https://github.com/eead-csic-compbio

use strict;
require Exporter;

our @ISA = qw( Exporter );
our @EXPORT = qw(
  read_cluster_config print_cluster_config cluster_is_available
  submit_cluster_job check_cluster_jobs
);

# key cluster management binaries
my @CLBINS = ('SUBEXE','CHKEXE','DELEXE');

# Default SGE cluster configuration options
# can be overridden by custom config file
my %CLUSTER_CONF;
$CLUSTER_CONF{'PATH'} = ''; # should end with /
$CLUSTER_CONF{'TYPE'} = 'sge';
$CLUSTER_CONF{'SUBEXE'} = 'qsub';
$CLUSTER_CONF{'CHKEXE'} = 'qstat';
$CLUSTER_CONF{'DELEXE'} = 'qdel';
$CLUSTER_CONF{'ERROR'} = 'Eqw'; # state of failed jobs
$CLUSTER_CONF{'QARGS'} = ''; # queue name, resources, etc
$CLUSTER_CONF{'STIME'} = 1; # interval in seconds between sub commands
$CLUSTER_CONF{'CTIME'} = 30; # interval in seconds between stat commands

# check HPC.conf.sample* files for suggested LSF & slurm parameters

# Checks whether cluster config file exists and parses it.
# Lines are 'KEY value' pairs, lines starting with # are skipped;
# parsed values override the defaults in %CLUSTER_CONF
# input:
# 1 (string) full path to optional config file
sub read_cluster_config {
  my ($config_file) = @_;
  if(open(CONF,"<",$config_file)) {
    while(<CONF>) {
      next if(/^#/);
      chomp;
      if(/^(\S+)\s+([^\n]+)/){ $CLUSTER_CONF{$1} = $2 }
    }
    close(CONF);
  } else {
    print "# INFO: no cluster config file\n\n";
  }
}

# Prints to stdout current cluster configuration
sub print_cluster_config {
  foreach my $conf (sort keys(%CLUSTER_CONF)) {
    print "# $conf\t$CLUSTER_CONF{$conf}\n";
  }
  print "\n";
}

# Checks whether cluster management binaries can be used,
# returns 1 if all of them can, 0 otherwise
# Uses system 'which'
sub cluster_is_available {
  my ($path,$output);
  for my $bin (@CLBINS) {
    # concat path and binary and do system call
    $path = `which $CLUSTER_CONF{"PATH"}$CLUSTER_CONF{$bin}`;
    chomp($path);
    if($path eq '' || $path =~ /no $CLUSTER_CONF{$bin} in/) {
      print "# ERROR: cannot find cluster binary $bin\n\n";
      return 0;
    } else {

      # a valid binary is expected to print some usage message
      $output = `$CLUSTER_CONF{"PATH"}$CLUSTER_CONF{$bin} -help 2>&1`;
      if(!$output ||
        ($output !~ /usage:/i && $output !~ /use/i && $output !~ /invalid option/)) {
        print "# ERROR: wrong cluster binary $bin\n";
        print "$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{$bin} -help\n";
        return 0;
      }
    }
  }

  return 1;
}

# Private helper: extracts job id from the output of the submission binary,
# supports SGE, LSF & slurm messages. Returns the output unchanged if no
# job id can be parsed.
sub _parse_job_id {
  my ($output) = @_;

  if($output =~ /^(\d+)\./ || $output =~ /^Your job (\d+)/ ||
    $output =~ /^Job <(\d+)> is submitted/ ||
    $output =~ /^Submitted batch job (\d+)/ ){ return $1 }

  return $output;
}

# submits a cluster job, stores the assigned process id and waits STIME
# input:
# 1 (string) job name
# 2 (string) command to be run
# 3 (string) name of output file
# 4 (string) name of work directory
# 5 reference to cluster job hash
sub submit_cluster_job {
  my ($jobname,$command,$outfile,$dir,$ref_cluster_PIDs) = @_;

  my ($qPID,$qsubcommand) = ('','');

  if($CLUSTER_CONF{'TYPE'} eq 'lsf') {
    $qsubcommand = " -J n$jobname -o $outfile ";
  } elsif($CLUSTER_CONF{'TYPE'} eq 'slurm') {
    $qsubcommand = " --job-name=n$jobname -o $outfile ";
  }

  # other cluster management types could be added here with elsif
  else { # default SGE
    $qsubcommand = " -N n$jobname -j y -o $outfile -S /bin/bash";
  }

  # NOTE(review): the body of this here-document was lost in extraction and
  # has been reconstructed (shebang, cd to work directory, command); confirm
  # against the original file
  $qPID = `$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'SUBEXE'} $CLUSTER_CONF{'QARGS'} $qsubcommand <<EOF
#!/bin/bash
cd $dir
$command
EOF`;

  $qPID = _parse_job_id($qPID);

  # save job details associated to process id
  $ref_cluster_PIDs->{$qPID}{'command'} = $qsubcommand;
  $ref_cluster_PIDs->{$qPID}{'executable'} = $command;
  $ref_cluster_PIDs->{$qPID}{'status'} = 'sent';

  # sleep to avoid overloading
  sleep($CLUSTER_CONF{'STIME'});
}

# Checks status of cluster jobs and prints messages to stdout.
# Returns when no job is listed by the status binary anymore; jobs in
# ERROR state are deleted and resubmitted.
# input:
# 1 (string) name of work directory
# 2 reference to cluster job hash
sub check_cluster_jobs {
  my ($dir,$ref_PIDs) = @_;

  my ($waiting,$qPID,$newqPID,$qout) = (1);

  while($waiting) {
    $waiting=0;
    foreach $qPID (sort {$a<=>$b} (keys(%$ref_PIDs))) {

      next if($ref_PIDs->{$qPID}{'status'} eq 'deleted');

      # get status of this job; the id is matched as a whole number
      # so that job 12 is not confused with job 123
      $qout = join('', grep { /(?:^|\D)\Q$qPID\E(?:\D|$)/ }
        `$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'CHKEXE'}`);
      if($qout){
        if($qout =~ /\s+$CLUSTER_CONF{'ERROR'}\s+/) {
          # resubmit failed jobs
          # NOTE(review): here-document reconstructed as in
          # submit_cluster_job, confirm against the original file
          $newqPID = `$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'SUBEXE'} $CLUSTER_CONF{'QARGS'} $ref_PIDs->{$qPID}{'command'} <<EOF
#!/bin/bash
cd $dir
$ref_PIDs->{$qPID}{'executable'}
EOF`;

          # same id parsing as first submission, also for LSF & slurm
          $newqPID = _parse_job_id($newqPID);
          $ref_PIDs->{$newqPID}{'command'} = $ref_PIDs->{$qPID}{'command'};
          $ref_PIDs->{$newqPID}{'executable'} = $ref_PIDs->{$qPID}{'executable'};
          $ref_PIDs->{$newqPID}{'status'} = 'sent';
          sleep($CLUSTER_CONF{'STIME'});

          # remove failed job
          system("$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'DELEXE'} $qPID");
          $ref_PIDs->{$qPID}{'status'} = 'deleted';
          print "# check_cluster_jobs: deleted job $qPID , resubmitted as $newqPID\n";
          $waiting++;
        }
        else{ $waiting++; last; } # job still listed, no need to check the rest
      }
    }
    if($waiting)
    {
      print "# check_cluster_jobs: waiting ...\n";
      sleep($CLUSTER_CONF{'CTIME'});
    }
  }
}

1;

--------------------------------------------------------------------------------
/pangenes/pics/collinear_pangenes_minimap2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/collinear_pangenes_minimap2.png
--------------------------------------------------------------------------------
/pangenes/pics/fixing_genemodels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/fixing_genemodels.png
--------------------------------------------------------------------------------
/pangenes/pics/flow-check-evidence.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-check-evidence.dia
--------------------------------------------------------------------------------
/pangenes/pics/flow-check-evidence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-check-evidence.png -------------------------------------------------------------------------------- /pangenes/pics/flow-get-pangenes.dia: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-get-pangenes.dia -------------------------------------------------------------------------------- /pangenes/pics/flow-get-pangenes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-get-pangenes.png -------------------------------------------------------------------------------- /pangenes/pics/long_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/long_model.png -------------------------------------------------------------------------------- /pangenes/pics/pairs2clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/pairs2clusters.png -------------------------------------------------------------------------------- /pangenes/pics/pangene_set_nomenclature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/pangene_set_nomenclature.png -------------------------------------------------------------------------------- /pangenes/pics/pangenesPAG2023.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/pangenesPAG2023.pdf -------------------------------------------------------------------------------- /pangenes/pics/wgaoverlap.dia: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/wgaoverlap.dia -------------------------------------------------------------------------------- /pangenes/pics/wgaoverlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/wgaoverlap.png -------------------------------------------------------------------------------- /pangenes/plots/core_gene.tab_core_both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/core_gene.tab_core_both.png -------------------------------------------------------------------------------- /pangenes/plots/dotplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/dotplot.png -------------------------------------------------------------------------------- /pangenes/plots/haplotypes.trimmed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/haplotypes.trimmed.png -------------------------------------------------------------------------------- /pangenes/plots/pan_gene.tab_pan.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/pan_gene.tab_pan.png -------------------------------------------------------------------------------- /pangenes/plots/pangene_context.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/pangene_context.png -------------------------------------------------------------------------------- /pangenes/plots/pangene_matrix__shell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/pangene_matrix__shell.png -------------------------------------------------------------------------------- /phylogenomics/Oryza.log: -------------------------------------------------------------------------------- 1 | 2 | # WARNING : folder 'Oryza' exists, files might be overwritten 3 | 4 | # pangene_analysis.pl -d Plants -c Oryza -r oryza_sativa -o -f Oryza -t protein -G 0 -W 0 -L 0 -S 0 5 | 6 | # supported species in NCBI taxon Oryza : 11 7 | 8 | # total selected species : 11 9 | 10 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_sativa.tsv.gz 11 | 12 | # re-using Oryza_sativa.IRGSP-1.0.pep.all.fa.gz 13 | 14 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_indica.tsv.gz 15 | 16 | # re-using Oryza_indica.ASM465v1.pep.all.fa.gz 17 | 18 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_brachyantha.tsv.gz 19 | 20 | # re-using Oryza_brachyantha.Oryza_brachyantha.v1.4b.pep.all.fa.gz 21 | 22 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_meridionalis.tsv.gz 23 | 24 | # re-using Oryza_meridionalis.Oryza_meridionalis_v1.3.pep.all.fa.gz 25 | 26 | # 
re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_glaberrima.tsv.gz 27 | 28 | # re-using Oryza_glaberrima.Oryza_glaberrima_V1.pep.all.fa.gz 29 | 30 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_punctata.tsv.gz 31 | 32 | # re-using Oryza_punctata.Oryza_punctata_v1.2.pep.all.fa.gz 33 | 34 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_longistaminata.tsv.gz 35 | 36 | # re-using Oryza_longistaminata.O_longistaminata_v1.0.pep.all.fa.gz 37 | 38 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_barthii.tsv.gz 39 | 40 | # re-using Oryza_barthii.O.barthii_v1.pep.all.fa.gz 41 | 42 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_nivara.tsv.gz 43 | 44 | # re-using Oryza_nivara.Oryza_nivara_v1.0.pep.all.fa.gz 45 | 46 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_rufipogon.tsv.gz 47 | 48 | # re-using Oryza_rufipogon.OR_W1943.pep.all.fa.gz 49 | 50 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_glumipatula.tsv.gz 51 | 52 | # re-using Oryza_glumipatula.Oryza_glumaepatula_v1.5.pep.all.fa.gz 53 | 54 | # oryza_sativa : sequences = 35775 clusters = 35419 (singletons = 8371) 55 | # oryza_indica : sequences = 40745 clusters = 39590 (singletons = 7645) 56 | # oryza_brachyantha : sequences = 32037 clusters = 31856 (singletons = 10421) 57 | # oryza_meridionalis : sequences = 29308 clusters = 28659 (singletons = 4605) 58 | # oryza_glaberrima : sequences = 33164 clusters = 31710 (singletons = 4051) 59 | # oryza_punctata : sequences = 31762 clusters = 30673 (singletons = 6246) 60 | # oryza_longistaminata : sequences = 31686 clusters = 30034 (singletons = 7460) 61 | # oryza_barthii : sequences = 34575 clusters = 33903 (singletons = 3263) 62 | # oryza_nivara : sequences = 36313 clusters = 35918 (singletons = 3387) 63 | # 
oryza_rufipogon : sequences = 37071 clusters = 36716 (singletons = 3837) 64 | # oryza_glumipatula : sequences = 35735 clusters = 35231 (singletons = 4249) 65 | 66 | # total sequences = 378171 67 | 68 | # number_of_clusters = 110289 (core = 8052) 69 | 70 | # cluster_list = Oryza/oryzasativa_Oryza_algEnsemblCompara.cluster_list 71 | # cluster_directory = Oryza/oryzasativa_Oryza_algEnsemblCompara 72 | 73 | # percent_conserved_proteins_file = Oryza/POCP.matrix.tab 74 | 75 | # pangenome_file = Oryza/pangenome_matrix.tab tranposed = Oryza/pangenome_matrix.tr.tab 76 | # pangenome_genes = Oryza/pangenome_matrix_genes.tab transposed = Oryza/pangenome_matrix_genes.tr.tab 77 | # pangenome_FASTA_file = Oryza/pangenome_matrix.fasta 78 | 79 | # genome composition report (samples=10,seed=12345) 80 | ## sample 0 (oryza_sativa | 0,1,2,3,4,5,6,7,8,9,10,) 81 | ## sample 1 (oryza_meridionalis | 3,2,4,8,10,0,6,9,1,7,5,) 82 | ## sample 2 (oryza_meridionalis | 3,0,1,9,4,2,6,8,7,5,10,) 83 | ## sample 3 (oryza_barthii | 7,10,2,8,6,5,3,9,1,0,4,) 84 | ## sample 4 (oryza_indica | 1,4,2,6,7,3,0,10,8,9,5,) 85 | ## sample 5 (oryza_sativa | 0,6,3,5,10,4,8,7,2,1,9,) 86 | ## sample 6 (oryza_barthii | 7,8,4,1,9,5,0,10,2,6,3,) 87 | ## sample 7 (oryza_glaberrima | 4,10,3,9,8,1,7,6,2,0,5,) 88 | ## sample 8 (oryza_indica | 1,0,2,3,6,8,10,9,5,4,7,) 89 | ## sample 9 (oryza_glaberrima | 4,6,1,5,0,2,10,9,8,7,3,) 90 | 91 | # pan-gene (number of clusters) = Oryza/pan_gene.tab 92 | # core-gene (number of clusters) = Oryza/core_gene.tab 93 | 94 | # runtime: 146 wallclock secs (79.95 usr 4.76 sys + 29.57 cusr 1.52 csys = 115.80 CPU) 95 | -------------------------------------------------------------------------------- /phylogenomics/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Plant phylogenomics scripts 3 | 4 | These scripts interrogate Ensembl Plants through [REST endpoints](https://rest.ensembl.org) 5 | and the FTP site to export data that might be 
useful for phylogenomic and pan-gene set studies. 6 | 7 | These scripts were tested at the 8 | [CABANA workshop: Analysis of crop genomics data](http://training.ensembl.org/events/2021/2021-03-01-CABANA). 9 | 10 | ## Documentation and examples 11 | 12 | Run any of the scripts with argument -h to get instructions and examples. 13 | 14 | ## Dependencies 15 | 16 | The following dependencies can be installed in the parent folder with: 17 | 18 | make install_REST 19 | 20 | The scripts require the following non-core Perl modules: 21 | * [HTTP::Tiny](https://metacpan.org/release/HTTP-Tiny) 22 | * [JSON](https://metacpan.org/release/JSON) 23 | * [DBI](https://metacpan.org/pod/DBI) 24 | * [DBD::mysql](https://metacpan.org/pod/DBD::mysql) 25 | 26 | which can be installed with: 27 | ``` 28 | # install cpanminus installer, check more options at https://metacpan.org/pod/App::cpanminus 29 | sudo cpan -i App::cpanminus 30 | 31 | # actually install modules 32 | sudo apt-get install -y mysql-client libmysqlclient-dev 33 | cpanm JSON JSON::XS HTTP::Tiny DBI DBD::mysql 34 | ``` 35 | 36 | In addition, the scripts import the module [PlantCompUtils.pm](./PlantCompUtils.pm), 37 | which is included in this folder. 38 | 39 | 40 | ### ens_single-copy_core_genes.pl 41 | 42 | This script can be used to obtain single-copy core genes present within a clade. 43 | Example calls include: 44 | 45 | ``` 46 | perl ens_single-copy_core_genes.pl -c Brassicaceae -f Brassicaceae 47 | perl ens_single-copy_core_genes.pl -c Brassicaceae -f Brassicaceae -t cdna -o beta_vulgaris 48 | perl ens_single-copy_core_genes.pl -f poaceae -c 4479 -r oryza_sativa -WGA 75 49 | perl ens_single-copy_core_genes.pl -f all -c 33090 -m all -r physcomitrium_patens 50 | ``` 51 | 52 | Note that option -f produces FASTA files of aligned peptide sequences, one per cluster. 53 | Such a task usually takes over an hour over the Ensembl REST API.
54 | 55 | 56 | ### ens_syntelogs.pl 57 | 58 | This script is related to [ens_single-copy_core_genes.pl](ens_single-copy_core_genes.pl) but explicitly considers only orthogroups with Gene Order Conservation (GOC) score >= 75 by default. The output matrix also contains the genomic coordinates of genes of the reference genome: 59 | 60 | ``` 61 | perl ens_syntelogs.pl -c Brassicaceae -f Brassicaceae 62 | 63 | ``` 64 | 65 | A sample output matrix is available in [Brassicaceae.syntelogs.GOC75.tsv](./bench/Brassicaceae.syntelogs.GOC75.tsv). 66 | A benchmark is described in . 67 | 68 | Note that option -f produces FASTA files of aligned peptide sequences, one per cluster. 69 | Such a task usually takes over an hour over the Ensembl REST API. 70 | 71 | WARNING: not all species are included in the Compara gene-tree analysis. You can exclude them with -i. 72 | 73 | ### ens_sequences.pl 74 | 75 | Produces a FASTA file with the canonical cds/pep sequences of species in a clade in Ensembl Plants: 76 | ``` 77 | perl ens_sequences.pl -c Brassicaceae -f Brassicaceae.fna 78 | 79 | ``` 80 | 81 | 82 | ### ens_pangene_analysis.pl 83 | 84 | This was a prototype which was eventually replaced by the scripts at 85 | [pangenes](https://github.com/Ensembl/plant-scripts/tree/master/pangenes). 86 | 87 | -------------------------------------------------------------------------------- /phylogenomics/TODO.txt: -------------------------------------------------------------------------------- 1 | 2 | 1) work out stable id from cluster content, so it is reasonably stable 3 | 4 | 1.1) IDEA 1 5 | 1.1.1) produce a list of ordered core markers/genes along chrs -> how to be sure they will be core in the future?
6 | 1.1.2) for each cluster, find the closest marker both sides and define interval 7 | 1.1.3) number clusters per interval 8 | 9 | 1.2) IDEA 2 10 | 1.2.1) compute N most frequent 21-mers in cluster 11 | 1.2.2) work out cluster id from kmer composition 12 | 13 | 1.3) IDEA 3 14 | 1.3.1) Use previous clusters to compare to new and reuse stable ids as much as possible 15 | -------------------------------------------------------------------------------- /phylogenomics/downloads/README.txt: -------------------------------------------------------------------------------- 1 | 2 | This folder is where Compara TSV and fasta compressed files are stored. 3 | -------------------------------------------------------------------------------- /phylogenomics/ens_sequences.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | 5 | use Getopt::Long qw(:config no_ignore_case); 6 | use Benchmark; 7 | use HTTP::Tiny; 8 | use JSON qw(decode_json); 9 | use FindBin '$Bin'; 10 | use lib $Bin; 11 | use PlantCompUtils qw( 12 | list_ensembl_mysql_dbs get_canonical_transcript_ids 13 | download_FASTA_file parse_isoform_FASTA_file perform_rest_action 14 | $REQUEST_COUNT $FASTADIR @DIVISIONS 15 | ); 16 | 17 | # Downloads cds/pep sequences of species in a clade from Ensembl Plants. 18 | # Uses canonical transcripts, used in the gene tree analysis, 19 | # which usually are the longest translation with no stop codons 20 | # 21 | # Copyright [2019-2021] EMBL-European Bioinformatics Institute 22 | 23 | # Ensembl Genomes 24 | my $RESTURL = 'http://rest.ensembl.org'; 25 | my $INFOPOINT = $RESTURL . '/info/genomes/division/'; 26 | my $TAXOPOINT = $RESTURL . '/info/genomes/taxonomy/'; 27 | 28 | my $downloadir = $Bin . 
'/downloads'; 29 | my $division = 'Plants'; 30 | my $seqtype = 'protein'; 31 | my $taxonid = ''; # NCBI Taxonomy id Brassicaceae=3700 32 | # Asterids=71274, Poaceae=4479 33 | 34 | my ( $fastadir, $outfile, $out_genome ) = ( '', '', '' ); 35 | 36 | my ( $help, $sp, $line, $id, $show_supported, $request, $response ); 37 | my ( $filename, $dnafile, $pepfile, $seqfolder, $ext ); 38 | my ( @ignore_species, %ignore, %division_supported ); 39 | 40 | GetOptions( 41 | "help|?" => \$help, 42 | "supported|l" => \$show_supported, 43 | "division|d=s" => \$division, 44 | "clade|c=s" => \$taxonid, 45 | "outgroup|o=s" => \$out_genome, 46 | "ignore|i=s" => \@ignore_species, 47 | "type|t=s" => \$seqtype, 48 | "outfile|f=s" => \$outfile 49 | ) || help_message(); 50 | 51 | sub help_message { 52 | print "\nusage: $0 [options]\n\n" 53 | . "-c NCBI Taxonomy clade of interest (required, example: -c Brassicaceae or -c 3700)\n" 54 | . "-f output FASTA file (required, example: -f myfile.fasta)\n" 55 | . "-l list supported species_names (optional, example: -l)\n" 56 | . "-d Ensembl division (optional, default: -d $division)\n" 57 | . "-o outgroup species_name (optional, example: -o brachypodium_distachyon)\n" 58 | . "-i ignore species_name(s) (optional, example: -i selaginella_moellendorffii -i ...)\n" 59 | . "-t sequence type [protein|cdna] (optional, default: -t protein)\n\n"; 60 | 61 | print "Example calls:\n\n" 62 | . " perl $0 -c Liliopsida -o arabidopsis_thaliana -f Liliopsoda.Atha.EG44.faa\n"; 63 | exit(0); 64 | } 65 | 66 | if ($help) { help_message() } 67 | 68 | if ($division) { 69 | if ( !grep( /^$division$/, @PlantCompUtils::DIVISIONS ) ) { 70 | die "# ERROR: accepted values for division are: " 71 | . join( ',', @PlantCompUtils::DIVISIONS ) . 
"\n"; 72 | } 73 | else { 74 | my $lcdiv = lc($division); 75 | $fastadir = $PlantCompUtils::FASTADIR; 76 | $fastadir =~ s/xxx/$lcdiv/; 77 | } 78 | } 79 | 80 | if ($show_supported) { 81 | print "# $0 -d $division -l \n\n"; 82 | } 83 | else { 84 | 85 | if ( $taxonid eq '' ) { 86 | print "# ERROR: need a valid NCBI Taxonomy clade, ". 87 | "such as -c Brassicaceae or -c 3700\n\n"; 88 | print "# Check https://www.ncbi.nlm.nih.gov/taxonomy\n"; 89 | exit; 90 | } 91 | else { 92 | $taxonid =~ s/\s+/%20/g; 93 | } 94 | 95 | if (@ignore_species) { 96 | foreach my $sp (@ignore_species) { 97 | $ignore{$sp} = 1; 98 | } 99 | printf( "\n# ignored species : %d\n\n", scalar( keys(%ignore) ) ); 100 | } 101 | 102 | if ( $seqtype ne 'protein' && $seqtype ne 'cdna' ) { 103 | die "# ERROR: accepted values for seqtype are: protein|cdna\n"; 104 | } 105 | else { 106 | if ( $seqtype eq 'protein' ) { 107 | $ext = '.faa'; 108 | $seqfolder = 'pep'; 109 | } 110 | else { 111 | $ext = '.fna'; 112 | $seqfolder = 'cdna'; 113 | } 114 | } 115 | 116 | if ( !$outfile ) { 117 | print "# ERROR: need a valid output file, such as -f Brassicaceae.fasta\n\n"; 118 | exit; 119 | } 120 | 121 | print "# $0 -d $division -c $taxonid -o $out_genome -f $outfile -t $seqtype\n\n"; 122 | } 123 | 124 | my $start_time = new Benchmark(); 125 | 126 | # new object and params for REST requests 127 | my $http = HTTP::Tiny->new(); 128 | my $global_headers = { 'Content-Type' => 'application/json' }; 129 | $PlantCompUtils::REQUEST_COUNT = 0; 130 | 131 | ## 0) check supported species in division 132 | 133 | $request = $INFOPOINT . 
"Ensembl$division?"; 134 | 135 | $response = perform_rest_action( $http, $request, $global_headers ); 136 | my $infodump = decode_json($response); 137 | 138 | foreach $sp ( @{$infodump} ) { 139 | if ( $sp->{'has_peptide_compara'} ) { 140 | $division_supported{ $sp->{'name'} } = 1; 141 | } 142 | } 143 | 144 | # list supported species and exit 145 | if ($show_supported) { 146 | 147 | foreach $sp ( sort( keys(%division_supported) ) ) { 148 | print "$sp\n"; 149 | } 150 | exit; 151 | } 152 | 153 | # check outgroup is supported 154 | if ( $out_genome && !$division_supported{$out_genome} ) { 155 | die "# ERROR: genome $out_genome is not supported\n"; 156 | } 157 | 158 | ## 1) check species in clade 159 | 160 | my ( $n_of_species, $n_of_sequences ) = ( 0, 0 ); 161 | my ( @supported_species, %supported ); 162 | 163 | $request = $TAXOPOINT . "$taxonid?"; 164 | 165 | $response = perform_rest_action( $http, $request, $global_headers ); 166 | $infodump = decode_json($response); 167 | 168 | foreach $sp ( @{$infodump} ) { 169 | if ( $sp->{'name'} && $division_supported{ $sp->{'name'} } ) { 170 | 171 | next if ( $ignore{ $sp->{'name'} } ); 172 | 173 | # add sorted clade species 174 | $supported{ $sp->{'name'} } = 1; 175 | push( @supported_species, $sp->{'name'} ); 176 | } 177 | } 178 | 179 | printf( "# supported species in NCBI taxon %s : %d\n\n", 180 | $taxonid, scalar(@supported_species) ); 181 | 182 | # add outgroup if required 183 | if ($out_genome) { 184 | push( @supported_species, $out_genome ); 185 | $supported{$out_genome} = 1; 186 | print "# outgenome: $out_genome\n"; 187 | } 188 | 189 | $n_of_species = scalar(@supported_species); 190 | print "# total selected species : $n_of_species\n\n"; 191 | 192 | ## 2) connect to public Ensembl server and 193 | ## find latest database schema for each species 194 | 195 | my $ref_dbs = list_ensembl_mysql_dbs(); 196 | 197 | my %species2db; 198 | foreach $sp (@supported_species) { 199 | foreach my $db (@$ref_dbs) { 200 | if($db =~ 
/$sp\_core_\d+/) { 201 | $species2db{$sp} = $db; 202 | } 203 | } 204 | } 205 | 206 | ## 3) get sequences for selected (plant) species 207 | 208 | open( OUTFILE, ">", $outfile ) || die "# ERROR: cannot create $outfile\n"; 209 | 210 | # iteratively get and parse FASTA files 211 | foreach $sp (@supported_species) { 212 | 213 | # get list of canonical transcripts for this species 214 | my $ref_canon_isofs = get_canonical_transcript_ids($species2db{$sp}); 215 | 216 | printf("# %s canonical isoforms=%d\n", $sp, 217 | scalar(keys(%$ref_canon_isofs))); 218 | 219 | # now get FASTA file and parse it, selected/longest isoforms are read 220 | my $stored_sequence_file = 221 | download_FASTA_file( $fastadir, "$sp/$seqfolder", $downloadir ); 222 | 223 | my ( $ref_sequence, $ref_header ) = 224 | parse_isoform_FASTA_file($stored_sequence_file, $ref_canon_isofs); 225 | 226 | foreach $id ( keys(%$ref_sequence) ) { 227 | print OUTFILE ">$id $ref_header->{$id} [$sp]\n$ref_sequence->{$id}\n"; 228 | $n_of_sequences++; 229 | } 230 | } 231 | 232 | close(OUTFILE); 233 | 234 | print "# created $outfile with $n_of_sequences sequences\n"; 235 | -------------------------------------------------------------------------------- /phylogenomics/phylo_test.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use Test::More tests => 1; 4 | 5 | ok( eval{ `perl ens_single-copy_core_genes.pl -c Oryza -r oryza_sativa` } =~ /# total single-copy core clusters/ , 'ens_single-copy_core_genes.pl' ); 6 | -------------------------------------------------------------------------------- /recipes/exampleAPI.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Examples of queries to Ensembl Plants using the native Perl API 4 | # Check the tutorials for more examples: 5 | # https://m.ensembl.org/info/docs/api/core/core_tutorial.html 6 | # 
https://www.ensembl.org/info/docs/api/compara/compara_tutorial.html 7 | # 8 | # Copyright [2017-21] EMBL-European Bioinformatics Institute 9 | 10 | 11 | # Install the Ensembl Perl API and updated env as explained in 12 | # http://www.ensembl.org/info/docs/api/api_installation.html 13 | # http://www.ensembl.org/info/docs/api/api_git.html 14 | # https://m.ensembl.org/info/docs/api/debug_installation_guide.html 15 | # Note you might need some dependencies, such as libmysqlclient-dev 16 | # and Perl modules DBI & DBD::mysql 17 | 18 | 19 | ## A1) Load the Registry object with details of genomes available 20 | 21 | use warnings; 22 | use strict; 23 | use List::Util qw(all); 24 | use FindBin; 25 | use lib "$FindBin::Bin/../lib/bioperl-live/"; 26 | use lib "$FindBin::Bin/../lib/ensembl/modules/"; 27 | use lib "$FindBin::Bin/../lib/ensembl-compara/modules/"; 28 | use lib "$FindBin::Bin/../lib/ensembl-variation/modules/"; 29 | use lib "$FindBin::Bin/../lib/ensembl-funcgen/modules/"; 30 | use lib "$FindBin::Bin/../lib/ensembl-metadata/modules/"; 31 | 32 | use Bio::EnsEMBL::Registry; 33 | 34 | Bio::EnsEMBL::Registry->load_registry_from_db( 35 | -USER => 'anonymous', 36 | -HOST => 'mysql-eg-publicsql.ebi.ac.uk', 37 | -PORT => '4157', 38 | #-VERBOSE => 1 # uncomment to see dbs loaded 39 | ); 40 | 41 | ## A2) Check which analyses are available for a species 42 | 43 | # Note: logic_names are printed for each analysis 44 | 45 | my $division = 'plants'; 46 | my $species = 'arabidopsis_thaliana'; 47 | 48 | my $analysis_adaptor = Bio::EnsEMBL::Registry-> 49 | get_adaptor( $species, "core", "analysis" ); 50 | 51 | foreach my $analysis (sort 52 | {$a->logic_name() cmp $b->logic_name()} 53 | @{ $analysis_adaptor->fetch_all() }){ 54 | print $analysis->logic_name(), "\n"; 55 | } 56 | 57 | # stop here if only test 58 | if($ARGV[0] eq "test"){ 59 | exit(0); 60 | } 61 | 62 | 63 | ## A3) Get soft masked sequences from Arabidopsis thaliana 64 | 65 | my $slice_adaptor = Bio::EnsEMBL::Registry-> 
66 | get_adaptor($species, 'core', 'Slice'); 67 | 68 | my ($total,$masked, $softseq) = (0,0); 69 | foreach my $slice (@{ $slice_adaptor->fetch_all('toplevel') }){ 70 | 71 | # for brevity consider only the plastome 72 | next if($slice->seq_region_name() ne 'Pt'); 73 | 74 | # note Ensembl 1-based inclusive coordinates 75 | printf(">%s %s %d-%d\n", 76 | $slice->seq_region_name(), 77 | $slice->coord_system_name(), 78 | $slice->start(), 79 | $slice->end()); 80 | 81 | # By default repeatmask* analyses, see recipe A2 to list others 82 | # Repeat analyses include 'repeatmask_redat', 83 | # 'repeatmask_nrplants' or 'repeatdetector_curated' 84 | # $slice->get_repeatmasked_seq( ['repeatmask_redat'], 1 ) 85 | # only print a 50b segment for brevity 86 | print substr($slice->get_repeatmasked_seq( undef, 1 )->seq(),80,50), "\n"; 87 | } 88 | 89 | ## A4) Get BED file with repeats in chr4 90 | 91 | my $chrname = 'chr4'; 92 | 93 | my $slice = $slice_adaptor-> 94 | fetch_by_region( 'toplevel', $chrname ); 95 | 96 | my @repeats = @{ $slice->get_all_RepeatFeatures() }; 97 | my $total_repeats = 0; 98 | 99 | foreach my $repeat (@repeats) { 100 | 101 | # for brevity 102 | last if($total_repeats++ > 10); 103 | 104 | printf("%s\t%d\t%d\t%s\t%s\t%s\n", 105 | $chrname, 106 | $repeat->start()-1, 107 | $repeat->end(), 108 | $repeat->analysis()->logic_name(), 109 | $repeat->repeat_consensus()->repeat_class(), 110 | $repeat->repeat_consensus()->repeat_type() ); 111 | } 112 | 113 | ## A5) Find the DEAR3 gene 114 | 115 | # gene of interest and species 116 | my $gene_name = 'DEAR3'; 117 | 118 | # get a gene adaptor to work with genes from 119 | # the species 120 | my $gene_adaptor = Bio::EnsEMBL::Registry-> 121 | get_adaptor($species, 'core', 'gene'); 122 | 123 | # find the gene with the specified name using 124 | # the adaptor 125 | my ($gene_obj) = @{$gene_adaptor-> 126 | fetch_all_by_external_name($gene_name)}; 127 | 128 | ## A6) Get the transcript used in Compara analyses 129 | 130 | # The 
canonical transcript is used in the gene tree analysis, 131 | # which usually is the longest translation with no stop codons 132 | 133 | printf(">DEAR3 %s\n%s\n", 134 | $gene_obj->canonical_transcript()->stable_id(), 135 | $gene_obj->canonical_transcript()->spliced_seq() ); 136 | 137 | printf(">DEAR3 %s CDS\n%s\n", 138 | $gene_obj->canonical_transcript()->stable_id(), 139 | $gene_obj->canonical_transcript()->translateable_seq() ); 140 | 141 | printf(">DEAR3 %s\n%s\n\n", 142 | $gene_obj->canonical_transcript()->translation->stable_id(), 143 | $gene_obj->canonical_transcript()->translate->seq() ); 144 | 145 | ## A7) Find all orthologues of a gene 146 | 147 | # get an adaptor to work with genes from compara 148 | my $gene_member_adaptor = Bio::EnsEMBL::Registry-> 149 | get_adaptor($division, 'compara', 'GeneMember'); 150 | 151 | # find the corresponding gene in compara 152 | my $gene_member = $gene_member_adaptor-> 153 | fetch_by_stable_id($gene_obj->stable_id()); 154 | 155 | # get an adaptor to work with homologues in compara 156 | my $homology_adaptor = Bio::EnsEMBL::Registry-> 157 | get_adaptor($division, 'compara', 'Homology'); 158 | 159 | # find all homologues of the gene 160 | my @homologies = @{$homology_adaptor-> 161 | fetch_all_by_Member($gene_member)}; 162 | 163 | # filter out homologues based on type 164 | @homologies = grep { 165 | $_->description =~ m/ortholog/ 166 | } @homologies; 167 | 168 | foreach my $homology (@homologies) { 169 | 170 | # get the protein from the target 171 | my $target = $homology->get_all_Members->[1]; 172 | 173 | printf("%s\t%s\t%s\t%s\n", 174 | $gene_obj->stable_id(), 175 | $species, 176 | $target->stable_id(), 177 | $target->genome_db->name() ); 178 | } 179 | 180 | ## A8) Get markers mapped on chr1D of bread wheat 181 | 182 | # Note: only a few plants have markers 183 | # As of release EG47/100: 184 | # triticum_aestivum, oryza_indica, brassica_rapa 185 | # 186 | # Coordinates are returned in BED format 187 | 188 | $species = 
'triticum_aestivum'; 189 | $chrname = '1D'; 190 | 191 | $slice_adaptor = Bio::EnsEMBL::Registry-> 192 | get_adaptor( $species, 'Core', 'Slice' ); 193 | 194 | $slice = $slice_adaptor-> 195 | fetch_by_region( 'chromosome', $chrname ); 196 | 197 | my $total_markers = 0; 198 | foreach my $mf (@{ $slice->get_all_MarkerFeatures() }) { 199 | 200 | last if($total_markers++ > 10); #for brevity 201 | 202 | my $marker = $mf->marker(); 203 | 204 | printf("%s\t%d\t%d\t%s\t%s\t%s\t%d\n", 205 | $mf->seq_region_name(), 206 | $mf->start()-1, 207 | $mf->end(), 208 | $mf->display_id(), 209 | $marker->left_primer(), 210 | $marker->right_primer(), 211 | $marker->max_primer_dist() ); 212 | } 213 | 214 | 215 | ## A9) Find all syntelogues among rices 216 | 217 | # Note: GOC=Gene Order Conservation score 218 | # Read more at 219 | # https://www.ensembl.org/info/genome/compara/Ortholog_qc_manual.html 220 | 221 | # get an adaptor to work with comparative sets from compara 222 | my $mlss_adaptor = Bio::EnsEMBL::Registry-> 223 | get_adaptor($division, 'compara', 'MethodLinkSpeciesSet'); 224 | 225 | # find the mlss that describes orthologies between these 226 | # two rice species 227 | my $mlss = $mlss_adaptor->fetch_by_method_link_type_registry_aliases( 228 | 'ENSEMBL_ORTHOLOGUES', ['oryza_sativa', 'oryza_indica']); 229 | 230 | # find all homologues between these two rice species 231 | @homologies = @{$homology_adaptor-> 232 | fetch_all_by_MethodLinkSpeciesSet($mlss)}; 233 | 234 | # filter out homologues based on local gene order conservation 235 | @homologies = grep { 236 | $_->goc_score && $_->goc_score >= 75 237 | } @homologies; 238 | 239 | my $count = 1; 240 | foreach my $homology (@homologies) { 241 | 242 | # get one orthologue 243 | my $prot = $homology->get_all_Members->[1]; 244 | 245 | # find all orthologues in rice 246 | my @rice_homologies = @{$homology_adaptor-> 247 | fetch_all_by_Member($prot, 248 | -METHOD_LINK_TYPE => 'ENSEMBL_ORTHOLOGUES', 249 | -TARGET_TAXON => 'Oryza')}; 250 | 
251 | if (all {$_->goc_score && $_->goc_score >= 75} 252 | @rice_homologies) { 253 | foreach my $rh (@rice_homologies) { 254 | printf("%s\t%s\t%s\t%s\t%d\n", 255 | $rh->get_all_Members->[0]->genome_db->name, 256 | $rh->get_all_Members->[0]->stable_id, 257 | $rh->get_all_Members->[1]->genome_db->name, 258 | $rh->get_all_Members->[1]->stable_id, 259 | $rh->goc_score); 260 | } 261 | print "\n"; 262 | 263 | # Only print the first 10 groups 264 | last if $count++ == 10; 265 | } 266 | } 267 | 268 | ## A10) Print all translations for otherfeatures genes 269 | 270 | # Note: otherfeatures dbs are Ensembl databases that 271 | # usually contain additional annotation tracks 272 | 273 | $count = 0; 274 | $species = 'triticum_aestivum'; 275 | 276 | $gene_adaptor = Bio::EnsEMBL::Registry-> 277 | get_adaptor($species, "otherfeatures", "gene"); 278 | my $genes = $gene_adaptor->fetch_all_by_biotype('protein_coding'); 279 | 280 | for my $gene (@$genes){ 281 | my $transcripts = $gene->get_all_Transcripts; 282 | for my $t (@$transcripts){ 283 | if ($t->biotype ne 'protein_coding'){ 284 | next; 285 | } 286 | $count++; 287 | print ">",$gene->stable_id,"\n"; 288 | 289 | my $translation = $t->translation; 290 | my $sequence = $translation->seq; 291 | print $translation->seq, "\n"; 292 | } 293 | 294 | # Print only first 10, comment for real use 295 | last if ($count == 10); 296 | } 297 | 298 | -------------------------------------------------------------------------------- /recipes/exampleBiomart.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Examples of R queries to Ensembl Plants Biomart 4 | # Your usage of the data returned by the Biomart service is 5 | # subject to same conditions as laid out on the Ensembl website. 
6 | # 7 | # See documentation at 8 | # https://www.ensembl.org/info/data/biomart/how_to_use_biomart.html 9 | # https://www.ensembl.org/info/data/biomart/biomart_r_package.html 10 | # 11 | # Copyright [2020-21] EMBL-European Bioinformatics Institute 12 | 13 | # To install R package biomaRt run: Rscript install_R_deps.R 14 | 15 | local_lib = "./lib/R/" 16 | .libPaths( c( .libPaths(), local_lib) ) 17 | 18 | library("biomaRt") 19 | 20 | args = commandArgs(trailingOnly=TRUE) 21 | 22 | ## B1) Check plant marts and select dataset 23 | 24 | listMarts( host="plants.ensembl.org" ) 25 | 26 | EPgenes = useEnsembl( biomart="plants_mart", 27 | host="plants.ensembl.org") 28 | 29 | dsets = listDatasets(EPgenes) 30 | 31 | dsets[grep("Triticum aestivum", dsets$description),] 32 | # dataset description version 33 | # 69 taestivum_eg_gene Triticum aestivum genes (IWGSC) IWGSC 34 | 35 | # take a note of the dataset name 'taestivum_eg_gene' 36 | 37 | 38 | ## B2) Check available filters and attributes 39 | 40 | EPgenes = useMart( 41 | biomart="plants_mart", 42 | host="plants.ensembl.org", 43 | dataset="taestivum_eg_gene") 44 | 45 | head( listFilters(EPgenes) ) 46 | 47 | head( listAttributes(EPgenes) ) 48 | 49 | # stop here if just a test 50 | if(length(args)==1 && args[1]=="test"){ 51 | q("no",1) 52 | } 53 | 54 | 55 | ## B3) Download GO terms associated to genes 56 | 57 | # Note genes might appear in several rows 58 | 59 | go = getBM( 60 | attributes=c("ensembl_gene_id", "go_id"), 61 | mart=EPgenes) 62 | 63 | head(go) 64 | 65 | ## B4) Get Pfam domains annotated in genes 66 | 67 | EPgenes = useMart( 68 | biomart="plants_mart", 69 | host="plants.ensembl.org", 70 | dataset="hannuus_eg_gene") 71 | 72 | pfam = getBM( 73 | attributes=c("ensembl_gene_id", "pfam"), 74 | mart=EPgenes) 75 | 76 | head(pfam) 77 | 78 | ## B5) Get SNP consequences from a selected variation source 79 | 80 | # Note this requires connecting to a different mart (snp) 81 | # Note this query takes a few minutes to run 82 | 
83 | EPvar = useMart( biomart="plants_variations", 84 | host="plants.ensembl.org", 85 | dataset="taestivum_eg_snp") 86 | 87 | snp_source = c("EMS-induced mutation") 88 | 89 | chrs = listFilterValues(mart=EPvar, 90 | filter="chr_name") 91 | 92 | attribs = c( 93 | "refsnp_id", 94 | "refsnp_source", 95 | "ensembl_gene_stable_id", 96 | "consequence_type_tv", 97 | "sift_prediction", 98 | "sift_score") 99 | 100 | filts = c( 101 | "variation_source", 102 | "chr_name", 103 | "sift_prediction") 104 | 105 | preds = c( 106 | "tolerated", # comment if unwanted 107 | "deleterious") 108 | 109 | snps <- NULL 110 | for(chr in chrs){ 111 | print(chr) # show progress 112 | 113 | for(pred in preds){ 114 | print(pred) # show progress 115 | 116 | tmp_s <- getBM( 117 | attributes=attribs, 118 | filters=filts, 119 | values=list( 120 | variation_source=snp_source, 121 | chr_name=chr, 122 | sift_prediction=c(pred)), 123 | mart=EPvar) 124 | 125 | # append SNP batches to object snps 126 | if(is.null(snps)){ 127 | snps<-tmp_s 128 | }else{ 129 | snps<-rbind(snps,tmp_s) 130 | } 131 | } 132 | } 133 | 134 | head(snps) 135 | -------------------------------------------------------------------------------- /recipes/exampleCRAM.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Example of Perl client to browse RNA-seq CRAM files from FTP server 4 | 5 | # Sequence reads from RNA-seq studies at the European Nucleotide Archive 6 | # are regularly mapped to genome assemblies in Ensembl Plants. For each study 7 | # CRAM files are created with the https://www.ebi.ac.uk/fg/rnaseq/api pipeline 8 | # and published on FTP site ftp://ftp.ensemblgenomes.org/pub/misc_data/Track_Hubs 9 | # 10 | # Each study contains a separate folder for each assembly used for mapping. 
# For instance, study SRP133995 was mapped to tomato assembly SL3.0 and the 12 | # trackDb.txt file therein indicates the full path to the relevant CRAM file 13 | # next to its metadata. 14 | 15 | # Copyright [2020-24] EMBL-European Bioinformatics Institute 16 | 17 | use strict; 18 | use warnings; 19 | use Net::FTP; 20 | 21 | ## C1) Find RNA-seq CRAM files for a genome assembly 22 | 23 | # Note: assembly name is 'assembly_default' in recipe R2 24 | # Note: can take a few minutes 25 | 26 | my $FTPURL = 'ftp.ensemblgenomes.org'; 27 | my $FTPDIR = '/pub/misc_data/Track_Hubs'; 28 | 29 | my $assembly_name = ''; 30 | if($ARGV[0]){ $assembly_name = $ARGV[0] } 31 | else{ die "# usage: $0 <assembly name>\n" } 32 | 33 | my ($study,$file,$descr,$cramfile,$subgroup); 34 | 35 | if( my $ftp = Net::FTP->new( $FTPURL, Passive=>1, Debug=>0, Timeout=>60) ){ 36 | 37 | $ftp->login( "anonymous", '-anonymous@' )|| 38 | die "# ERROR: cannot login " . $ftp->message(); 39 | $ftp->cwd($FTPDIR) || 40 | die "# ERROR: cannot change working directory to $FTPDIR " . 
41 | $ftp->message(); 42 | 43 | # print header 44 | print "cramfile\tstudy\tassembly\tsubgroup\tdescription\n"; 45 | 46 | # stop if test only 47 | if($assembly_name eq 'test'){ 48 | exit(0); 49 | } 50 | 51 | # list all ENA studies 52 | foreach $study ( $ftp->ls() ) { 53 | 54 | $ftp->cwd($study); 55 | 56 | my @contents = $ftp->ls(); 57 | 58 | # skip other assemblies (exact string match, so '.' in names such as SL3.0 is literal) 59 | if(!grep { $_ eq $assembly_name } @contents){ 60 | $ftp->cdup(); 61 | next; 62 | } 63 | 64 | # get description from hub.txt 65 | $descr = 'NA'; 66 | $ftp->get("hub.txt"); 67 | if(open(HUB,"<","hub.txt")){ 68 | while(<HUB>){ 69 | if(/^longLabel ([^;]*)/){ 70 | $descr = $1; 71 | } 72 | } 73 | close(HUB); 74 | unlink('hub.txt'); 75 | } 76 | else { warn "# WARN: cannot get $study/hub.txt\n" } 77 | 78 | # look for CRAM files 79 | foreach $file (@contents) { 80 | if($file eq $assembly_name) { 81 | 82 | $ftp->cwd($file); 83 | 84 | # get and parse trackDb.txt 85 | $ftp->get("trackDb.txt"); 86 | if(open(TRACKDB,"<","trackDb.txt")){ 87 | 88 | while(<TRACKDB>){ 89 | if(/track/){ $subgroup = 'NA' } 90 | elsif(/subGroups (.*)$/){ $subgroup = $1 } 91 | elsif(/bigDataUrl (\S+\.cram)/){ 92 | $cramfile = $1; 93 | 94 | # print this CRAM file 95 | print "$cramfile\t$study\t$file\t$subgroup\t$descr\n"; 96 | } 97 | } 98 | close(TRACKDB); 99 | unlink('trackDb.txt'); 100 | } 101 | else { warn "# WARN: cannot get $study/$file/trackDb.txt\n" } 102 | 103 | $ftp->cdup(); 104 | } 105 | } 106 | 107 | # up to study level 108 | $ftp->cdup(); 109 | } 110 | 111 | $ftp->close() 112 | } 113 | -------------------------------------------------------------------------------- /recipes/exampleFTP.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright [2020-21] EMBL-European Bioinformatics Institute 4 | 5 | # The recipes below use wget to download files 6 | # Please change these env variables to use other tools ie curl 7 | EXE="wget" 8 | ARGSDEF=" -c " 9 | ARGSTOFILE=" -O " 10 | ARGSTDOUT=" 
--quiet $ARGSTOFILE - " 11 | 12 | # set servers & division 13 | SERVER="ftp://ftp.ensemblgenomes.org/pub" 14 | DIV=plants 15 | BIOMARTSERVICE="http://plants.ensembl.org/biomart/martservice" 16 | 17 | # get Ensembl Plants current release number 18 | SUMFILE="${SERVER}/${DIV}/current/summary.txt" 19 | RELEASE=$($EXE $ARGSTDOUT $SUMFILE | \ 20 | perl -lne 'if(/Release (\d+) of Ensembl/){ print $1 }') 21 | 22 | # work out Ensembl Genomes release 23 | EGRELEASE=$((RELEASE - 53)); 24 | 25 | # alternatively set a different Ensembl Genomes (EG) release 26 | # EGRELEASE= 27 | 28 | # optional arguments, if any 29 | OPTARG=$1 30 | 31 | echo "EGRELEASE=${EGRELEASE} OPTARG=${OPTARG}" 32 | echo 33 | 34 | # set example species 35 | SPECIES=Brachypodium_distachyon 36 | 37 | ## F1) Download peptide sequences in FASTA format 38 | 39 | FASTAPEP="${SPECIES}*pep.all.fa.gz" 40 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/pep/${FASTAPEP}" 41 | echo "# downloading $URL" 42 | $EXE $OPTARG $ARGSDEF $URL 43 | 44 | # stop here if just a test 45 | if [[ $# -ge 2 ]] && [[ $2 = "test" ]]; then 46 | exit 0 47 | fi 48 | 49 | 50 | 51 | ## F2) Download CDS nucleotide sequences in FASTA format 52 | 53 | FASTACDS="${SPECIES}*cds.all.fa.gz" 54 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/cds/${FASTACDS}" 55 | echo "# downloading $URL" 56 | $EXE $OPTARG $ARGSDEF $URL 57 | 58 | ## F3) Download transcripts (cDNA) in FASTA format 59 | 60 | FASTACDNA="${SPECIES}*cdna.all.fa.gz" 61 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/cdna/${FASTACDNA}" 62 | echo "# downloading $URL" 63 | $EXE $OPTARG $ARGSDEF $URL 64 | 65 | ## F4) Download soft-masked genomic sequences 66 | 67 | FASTASM="${SPECIES}*.dna_sm.toplevel.fa.gz" 68 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/dna/${FASTASM}" 69 | echo "# downloading $URL" 70 | $EXE $OPTARG $ARGSDEF $URL 71 | 72 | ## F5) Upstream/downstream sequences 73 | 74 | # Note: this is actually a precompiled 
BioMart query. 75 | # You can construct your queries at http://plants.ensembl.org/biomart/martview 76 | # and export them as XML 77 | 78 | MARTSPECIES=bdistachyon_eg_gene 79 | BIOMARTQUERY=$(cat <<-XMLQUERY 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | XMLQUERY 91 | ) 92 | 93 | FASTAUP="${SPECIES}.upstream_flank100.chr5.fa" 94 | URL="${BIOMARTSERVICE}?query=$BIOMARTQUERY" 95 | echo "# downloading $FASTAUP" 96 | if [[ $OPTARG == "--spider" ]]; then 97 | echo "# skip this recipe in test" 98 | echo 99 | else 100 | $EXE $OPTARG $ARGSDEF "$URL" $ARGSTOFILE $FASTAUP 101 | fi 102 | 103 | ## F6) Get mappings to UniProt proteins 104 | 105 | UNIPTSV="${SPECIES}*.uniprot.tsv.gz" 106 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/tsv/${SPECIES,,}/$UNIPTSV" 107 | echo "# downloading $URL" 108 | $EXE $OPTARG $ARGSDEF $URL 109 | 110 | ## F7) Get indexed, bgzipped VCF file with variants mapped 111 | 112 | # Note: this file contains all variants known to Ensembl Plants, 113 | # individual genotypes are not necessarily conserved 114 | 115 | VCF="${SPECIES,,}.vcf.gz*" 116 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/variation/vcf/${SPECIES,,}/${VCF}" 117 | echo "# downloading $URL" 118 | $EXE $OPTARG $ARGSDEF $URL 119 | 120 | # wheat is an exception, as you can tell from the VCF file which EMS lines 121 | # share a certain mutation, as in this excerpt: 122 | #CHROM POS ID REF ALT QUAL FILTER INFO 123 | #1A 238016 Cadenza0202.chr1A.238016 G A . . EMS-induced mutation;TSA=SNV 124 | #1A 238016 Cadenza0230.chr1A.238016 G A . . EMS-induced mutation;TSA=SNV 125 | #1A 238016 Cadenza1874.chr1A.238016 G A . . EMS-induced mutation;TSA=SNV 126 | #1A 406098 Cadenza0148.chr1A.406098 T C . . EMS-induced mutation;TSA=SNV 127 | #1A 406098 Cadenza0877.chr1A.406098 T C . . EMS-induced mutation;TSA=SNV 128 | #1A 406098 Cadenza1340.chr1A.406098 T C . . 
EMS-induced mutation;TSA=SNV 129 | 130 | ## F8) Get precomputed VEP cache files 131 | 132 | SPECIES=arabidopsis_thaliana 133 | VEPCACHE="${SPECIES,,}*.tar.gz*" 134 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/variation/vep/${VEPCACHE}" 135 | echo "# downloading $URL" 136 | $EXE $OPTARG $ARGSDEF $URL 137 | 138 | # Note: you can get indexed cached files instead from 139 | # URL=${SERVER}/${DIV}/release-${EGRELEASE}/variation/indexed_vep_cache/${VEPCACHE} 140 | 141 | ## F9) Download all homologies in a single TSV file, several GBs 142 | 143 | TSVFILE="Compara.${RELEASE}.protein_default.homologies.tsv.gz" 144 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/tsv/ensembl-compara/homologies/${TSVFILE}" 145 | echo "# downloading $URL" 146 | $EXE $OPTARG $ARGSDEF $URL 147 | 148 | # Note: you can extract homologies from this file by parsing it 149 | # in the command line. Example: 150 | # zcat $TSVFILE | grep triticum_aestivum | grep oryza_sativa | grep ortholog 151 | 152 | # Note: homologies of each species can be retrieved from a more specific file 153 | # SPECIES=Triticum_aestivum 154 | #URL="${SERVER}/${DIV}/release-${EGRELEASE}/tsv/ensembl-compara/homologies/${SPECIES,,}${TSVFILE}" 155 | #wget -c "$URL" 156 | #zcat "$TSVFILE" | grep oryza_sativa | grep ortholog 157 | 158 | # Note: Alternatively a smaller file in OrthoXML format can be obtained 159 | # OXMLFILE="Compara.${RELEASE}.protein_default.allhomologies.orthoxml.xml.gz" 160 | # URL="${SERVER}/${DIV}/release-${EGRELEASE}/xml/ensembl-compara/homologies/${OXMLFILE}" 161 | 162 | ## F10) download UniProt report of Ensembl Plants, 163 | # summarized how many protein sequences from each species 164 | # have been annotated in SwissProt & TrEMBL 165 | 166 | UNIPFILE="uniprot_report_EnsemblPlants.txt" 167 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/$UNIPFILE" 168 | echo "# downloading $URL" 169 | $EXE $OPTARG $ARGSDEF $URL 170 | 171 | ## F11) Retrieve list of new species in current release 172 | 173 | NEWLIST="new_genomes.txt" 
174 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/$NEWLIST" 175 | echo "# downloading $URL" 176 | $EXE $OPTARG $ARGSDEF $URL 177 | 178 | ## F12) Get current plant species tree (cladogram) 179 | 180 | TREEFILE="plants_protein-trees_default.nh" 181 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/compara/species_trees/$TREEFILE" 182 | echo "# downloading $URL" 183 | $EXE $OPTARG $ARGSDEF $URL 184 | 185 | -------------------------------------------------------------------------------- /recipes/exampleMySQL.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Your usage of the data returned by the public MySQL server is 4 | # subject to same conditions as laid out on the Ensembl website. 5 | # 6 | # Copyright [2020-21] EMBL-European Bioinformatics Institute 7 | 8 | # documentation about Ensembl schemas can be found at 9 | # http://www.ensembl.org/info/docs/api/index.html 10 | 11 | # set server details 12 | SERVER=mysql-eg-publicsql.ebi.ac.uk 13 | USER=anonymous 14 | PORT=4157 15 | 16 | # get Ensembl Plants current release number from FTP server 17 | # Note: wget is used, this can be modified to use alternatives ie curl 18 | FTPSERVER="ftp://ftp.ensemblgenomes.org/pub" 19 | DIV=plants 20 | SUMFILE="${FTPSERVER}/${DIV}/current/summary.txt" 21 | RELEASE=$(wget --quiet -O - $SUMFILE | \ 22 | perl -lne 'if(/Release (\d+) of Ensembl/){ print $1 }') 23 | 24 | # work out Ensembl Genomes release 25 | EGRELEASE=$((RELEASE - 53)); 26 | 27 | # alternatively set other EG release number 28 | # EGRELEASE= 29 | 30 | echo "EGRELEASE=${EGRELEASE}" 31 | echo 32 | 33 | # stop here if just a test 34 | if [[ $# -ge 0 ]] && [[ $1 = "test" ]]; then 35 | mysql --host $SERVER --user $USER --port $PORT -e "show databases" 36 | exit 0 37 | fi 38 | 39 | ## S1) Check currently supported Ensembl Genomes (EG)/non-vertebrates core schemas, 40 | 41 | # Note: includes non-plants as well 42 | 43 | mysql --host $SERVER --user $USER --port $PORT \ 44 | 
-e "show databases" | grep "core_${EGRELEASE}_${RELEASE}" 45 | 46 | # The following API script can also be used: 47 | # https://github.com/Ensembl/ensembl-metadata/blob/master/misc_scripts/get_list_databases_for_division.pl 48 | 49 | ## S2) Count protein-coding genes of a particular species 50 | 51 | SPECIES=arabidopsis_thaliana 52 | SPECIESCORE=$(mysql --host $SERVER --user $USER --port $PORT \ 53 | -e "show databases" | grep "${SPECIES}_core_${EGRELEASE}_${RELEASE}") 54 | 55 | mysql --host $SERVER --user $USER --port $PORT \ 56 | $SPECIESCORE -e "SELECT COUNT(*) FROM gene WHERE biotype='protein_coding'" 57 | 58 | ## S3) Get stable_ids of transcripts used in Compara analyses 59 | 60 | # Canonical transcripts are used in the gene tree analysis, 61 | # which usually are the longest translations with no stop codons. 62 | # This file can be combined to that obtained in recipe F3 to 63 | # obtain the sequences 64 | 65 | mysql --host $SERVER --user $USER --port $PORT \ 66 | "ensembl_compara_plants_${EGRELEASE}_${RELEASE}" \ 67 | -e "SELECT sm.stable_id \ 68 | FROM seq_member sm, gene_member gm, genome_db gdb \ 69 | WHERE sm.seq_member_id = gm.canonical_member_id \ 70 | AND sm.genome_db_id = gdb.genome_db_id \ 71 | AND gdb.name = '$SPECIES' \ 72 | LIMIT 10" 73 | 74 | ## S4) Get variants significantly associated to phenotypes 75 | 76 | # Variation schema documented at 77 | # http://www.ensembl.org/info/docs/api/variation/variation_schema.html 78 | 79 | SPECIESVAR=$(mysql --host $SERVER --user $USER --port $PORT \ 80 | -e "show databases" | grep "${SPECIES}_variation_${EGRELEASE}_${RELEASE}") 81 | 82 | mysql --host $SERVER --user $USER --port $PORT \ 83 | $SPECIESVAR< ${MINLEN}" | \ 204 | sort -u -k1,1 -k2,2n > $BEDFILE 205 | 206 | # similar to recipe F4 207 | URL="${FTPSERVER}/${DIV}/current/fasta/${SPECIES}/dna/${FASTANAME}" 208 | wget -c $URL -O- | gunzip > $FASTA 209 | bedtools getfasta -name -fi $FASTA -bed $BEDFILE > $REPFASTA 210 | 211 | ## S9) Get GFF of repeated 
sequences from selected species 212 | 213 | # This recipe first interrogates the MySQL server and produces a GFF file 214 | # with repeat sequences. Uses MINLEN to skip short repeats. 215 | # Note: requires wget, sort and perl 216 | 217 | mysql --host $SERVER --user $USER --port $PORT $SPECIESCORE -Nb -e \ 218 | "SELECT sr.name,rc.repeat_class,'Repeat',r.seq_region_start, \ 219 | r.seq_region_end,r.score,r.seq_region_strand,0,rc.repeat_name \ 220 | FROM repeat_feature r JOIN seq_region sr JOIN repeat_consensus rc \ 221 | WHERE r.seq_region_id=sr.seq_region_id \ 222 | AND r.repeat_consensus_id=rc.repeat_consensus_id \ 223 | AND (r.seq_region_end-r.seq_region_start+1) > 90" | sort -k1,1 -k4,4n | \ 224 | perl -lane 'if($F[5] eq "NULL"){ $F[5]="."}; if($F[6]==1){ $F[6]="+" } else {$F[6]="-" }; $F[8]="Note=\"$F[8]\";"; print join("\t",@F)' 225 | 226 | : 227 | -------------------------------------------------------------------------------- /recipes/exampleVEP.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright [2020-21] EMBL-European Bioinformatics Institute 4 | 5 | # documentation about Ensembl VEP can be found at 6 | # http://www.ensembl.org/info/docs/tools/vep/index.html 7 | 8 | # set Ensembl Plants release number, check it 9 | # at the bottom of http://plants.ensembl.org 10 | # EG stands for Ensembl Genomes 11 | 12 | # example call: VEPATH=/path/to/ensembl-vep EGRELEASE=50 ./exampleVEP.sh 13 | 14 | if [ -z "${EGRELEASE}" ]; then 15 | EGRELEASE=49 16 | fi 17 | 18 | # edit if needed to point to ensembl-vep 19 | if [ -z "${VEPATH}" ]; then 20 | VEPATH=./ 21 | fi 22 | 23 | # check VEP 24 | if [ ! 
-f "${VEPATH}/ensembl-vep/vep" ]; then 25 | echo "# ERROR: Cannot find ${VEPATH}/ensembl-vep/vep not found, please set VEPATH accordingly" 26 | exit 1 27 | fi 28 | 29 | 30 | # work out Ensembl release, do not change 31 | RELEASE=$((EGRELEASE + 53)); 32 | 33 | echo "EGRELEASE=${EGRELEASE}" 34 | echo 35 | 36 | ## V1) Download, install and update VEP 37 | 38 | # Fresh install 39 | #git clone https://github.com/Ensembl/ensembl-vep.git 40 | #cd ensembl-vep 41 | #perl INSTALL.pl 42 | 43 | # To update from a previous version: 44 | #cd ensembl-vep 45 | #git pull 46 | #git checkout release/$RELEASE 47 | #perl INSTALL.pl 48 | 49 | ## V2) Unpack downloaded cache file & check SIFT support 50 | 51 | # Note: cache downloaded in recipe F8 52 | # Note: look for "sift b" 53 | 54 | SPECIES=arabidopsis_thaliana 55 | VEPCACHE="${SPECIES}*.tar.gz*" 56 | 57 | if [ ! -f ${VEPCACHE} ]; then 58 | echo "# ERROR: Cache file ${VEPCACHE} not found, get it with recipe F8" 59 | exit 1 60 | else 61 | tar xfz $VEPCACHE 62 | pattern="${SPECIES}/${EGRELEASE}_*/info.txt" 63 | files=( $pattern ) 64 | INFOFILE="${files[0]}" 65 | if [ -f "${INFOFILE}" ]; then 66 | grep sift "${INFOFILE}" 67 | echo "${INFOFILE}" 68 | else 69 | echo "# ERROR: Cannot find file ${INFOFILE}, please correct/set variable EGRELEASE" 70 | exit 1 71 | fi 72 | fi 73 | 74 | ## V3) Predict effect of variants 75 | 76 | # See more options and examples at 77 | # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html 78 | # http://www.ensembl.org/info/docs/tools/vep/script/vep_example.html 79 | 80 | VCFILE="${VEPATH}/ensembl-vep/examples/arabidopsis_thaliana.TAIR10.vcf" 81 | OUTFILE='arabidopsis_thaliana.vep.output' 82 | 83 | VEPOPTIONS=( 84 | --genomes # Ensembl Genomes, for Plants 85 | --species $SPECIES 86 | --cache # use local cache file, opposed to --database 87 | --dir_cache ./ # location of unpacked cache $SPECIES folder 88 | --cache_version $EGRELEASE 89 | --check_existing # co-located known variants 90 | --distance 
5000 # max dist between variant and transcript 91 | --biotype # show biotype of neighbor transcript 92 | --input_file $VCFILE 93 | --output_file $OUTFILE 94 | ) 95 | 96 | # --sift b # only some species have SIFT precomputed 97 | 98 | ${VEPATH}/ensembl-vep/vep "${VEPOPTIONS[@]}" 99 | 100 | ## V4) Predict effect of variants for species not in Ensembl 101 | 102 | # GFF file must be sorted and indexed with BGZIP and TABIX, see 103 | # http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#gff 104 | 105 | FASTAGZFILE= # GZIP-compressed file of genome FASTA file 106 | GFFILE= # gene models matching sequences in FASTAGZFILE 107 | GZGFFILE=$GFFILE.sorted.gz 108 | 109 | if [[ -f $GFFILE && -f $FASTAGZFILE ]]; then 110 | # sort and index 111 | grep -v "#" $GFFILE | sort -k1,1 -k4,4n -k5,5n -t$'\t' | bgzip -c > $GZGFFILE 112 | tabix -p gff $GZGFFILE 113 | 114 | # actually call vep 115 | ${VEPATH}/ensembl-vep/vep -i $VCFILE -gff $GZGFFILE -fasta $FASTAGZFILE 116 | fi 117 | -------------------------------------------------------------------------------- /repeats/README.md: -------------------------------------------------------------------------------- 1 | 2 | These scripts can be used to: 3 | + i) mask repeated sequences in plant genomes with the 4 | [Repeat detector](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0654-5) (Red) 5 | + ii) annotate repeats with https://github.com/lh3/minimap2 and 6 | * the curated library of repeats 7 | [nrTEplants](https://github.com/Ensembl/plant-scripts/releases/download/v0.3/nrTEplantsJune2020.fna.bz2), 8 | described in full detail [here](https://github.com/Ensembl/plant_tools/tree/master/bench/repeat_libs) 9 | * annotated repeats from species in Ensembl Plants, obtained with *get_repeats_ensembl.sh* 10 | 11 | Optionally, repeats and annotated repeats can be loaded into an [Ensembl core db](https://www.ensembl.org/info/docs/api/core/core_schema.html) 12 | as new analyses with default logic names are 
'repeatdetector' and 'repeatdetector_annotated'. 13 | 14 | ## Dependencies 15 | 16 | The following dependencies can be installed in the parent folder with: 17 | 18 | make install install_repeats 19 | 20 | There are two required binaries, which are built with version 10 of the GNU C++ compiler (actually any g++ >= 8 should work; please edit the [Makefile](../Makefile) accordingly): 21 | 22 | * A clone of Red from https://github.com/EnsemblGenomes/Red (the original repo is [here](https://github.com/BioinformaticsToolsmith/Red)) 23 | * A clone of minimap2 from https://github.com/lh3/minimap2 24 | 25 | Plus: 26 | 27 | * A copy of the [nrTEplants library](https://github.com/Ensembl/plant-scripts/releases/download/v0.3/nrTEplantsJune2020.fna.bz2) 28 | 29 | And three Python3 modules: 30 | 31 | * [sqlalchemy](https://pypi.org/project/SQLAlchemy) 32 | * [sqlalchemy_utils](https://pypi.org/project/SQLAlchemy-Utils) 33 | * [pymysql](https://pypi.org/project/PyMySQL) 34 | 35 | 36 | 37 | Note that script [get_repeats_ensembl.sh](./get_repeats_ensembl.sh) has some more dependencies listed in its header. 38 | 39 | ## Argument lists 40 | 41 | If you run 42 | 43 | ./Red2Ensembl.py -h 44 | 45 | you'll get the list of supported arguments and what they're for: 46 | 47 | ``` 48 | ./Red2Ensembl.py -h 49 | usage: Red2Ensembl.py [-h] [--exe EXE] [--cor COR] [--msk_file MSK_FILE] 50 | [--bed_file BED_FILE] [--host HOST] [--user USER] 51 | [--pw PW] [--port PORT] [--db DB] 52 | [--logic_name LOGIC_NAME] [--description DESCRIPTION] 53 | [--displaylabel DISPLAYLABEL] 54 | fasta_file outdir 55 | 56 | Script to run RepeatDetector (a fork of Red v2) to mask repeats, 57 | and optionally feed results into an Ensembl core database. 
58 | 59 | positional arguments: 60 | fasta_file path to FASTA file with top-level genomic sequences 61 | outdir path to directory to store Red temp results 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | --exe EXE path to Red executable, default: ./../lib/Red/bin/Red 66 | --cor COR number of cores for Red, default: 1 67 | --msk_file MSK_FILE name of output FASTA file with soft-masked sequences 68 | --bed_file BED_FILE name of output BED file with repeated ranges, uses 69 | original sequence names 70 | --host HOST name of the database host, required to store repeats 71 | in Ensembl core 72 | --user USER host user, required to store repeats in Ensembl core 73 | --pw PW host password, required to store repeats in Ensembl 74 | core 75 | --port PORT host port, required to store repeats in Ensembl core 76 | --db DB name of the core database, required to store repeats 77 | in Ensembl core 78 | --logic_name LOGIC_NAME 79 | logic name of Ensembl analysis, default: 80 | repeatdetector 81 | --description DESCRIPTION 82 | quoted string with Ensembl analysis description, 83 | default: Repeats detected using Red (REPeatDetector) 86 | --displaylabel DISPLAYLABEL 87 | string with Ensembl analysis display label, default: 88 | Repeats:Red 89 | 90 | Citation: 91 | Contreras-Moreira et al (2021) https://doi.org/10.1002/tpg2.20143 92 | Girgis HZ (2015) BMC Bioinformatics 16:227. doi: 10.1186/s12859-015-0654-5 93 | ``` 94 | 95 | Similarly, if you run 96 | 97 | ./AnnotRedRepeats.py -h 98 | 99 | you'll get: 100 | 101 | ``` 102 | usage: AnnotRedRepeats.py [-h] [--exe EXE] [--cor COR] [--minlen MINLEN] 103 | [--bed_file BED_FILE] [--host HOST] [--user USER] 104 | [--pw PW] [--port PORT] [--db DB] 105 | [--logic_name LOGIC_NAME] 106 | [--description DESCRIPTION] 107 | [--displaylabel DISPLAYLABEL] 108 | repeat_fasta_file outdir 109 | 110 | Script to annotate Red repeats and optionally 111 | feed the new consensus_repeats into an Ensembl core database. 
112 | 113 | positional arguments: 114 | repeat_fasta_file path to FASTA file with repeat sequences in RepBase 115 | format 116 | outdir path to directory with stored Red results 117 | 118 | optional arguments: 119 | -h, --help show this help message and exit 120 | --exe EXE path to minimap2 executable, default: 121 | ./../lib/minimap2/minimap2 122 | --cor COR number of cores for minimap2, default: 1 123 | --minlen MINLEN min length of repeats to be annotated, default: 90bp 124 | --bed_file BED_FILE name of output BED file with annotated repeats 125 | --host HOST name of the database host 126 | --user USER host user 127 | --pw PW host password 128 | --port PORT host port 129 | --db DB name of the core database 130 | --logic_name LOGIC_NAME 131 | logic name of Ensembl analysis, default: 132 | repeatdetector_annotated 133 | --description DESCRIPTION 134 | quoted string with Ensembl analysis description, 135 | default: Repeats detected using Red (REPeatDetector) and annotated by 138 | alignment to a repeat library. 139 | --displaylabel DISPLAYLABEL 140 | string with Ensembl analysis display label, default: 141 | 'Repeats:Red (annotated)' 142 | 143 | Citation: 144 | Contreras-Moreira et al (2021) https://doi.org/10.1002/tpg2.20143 145 | Girgis HZ (2015) BMC Bioinformatics 16:227. doi: 10.1186/s12859-015-0654-5 146 | Li H (2018) Bioinformatics 34(18):3094–3100. doi: 10.1093/bioinformatics/bty191 147 | ``` 148 | 149 | ## Examples 150 | 151 | Note that the input FASTA file can be GZIP/BZIP2 compressed. 152 | The script *Red2Ensembl.py* will attempt to estimate the GB RAM needed for the input genome. 153 | 154 | ### i) Masking 155 | 156 | ``` 157 | ## test run, saves results in folder 'test_Atha_chr4' 158 | ./Red2Ensembl.py ../files/Arabidopsis_thaliana.fna.gz test_Atha_chr4 --msk_file Atha.sm.fna --bed_file Atha.bed 159 | 160 | # parsing FASTA file 161 | # genome length = 18585056 bp 162 | ... 
163 | 164 | 165 | ## real example, with several chromosomes, taking 4 CPU cores 166 | ./Red2Ensembl.py Brachypodium_distachyon_v3.0.dna.toplevel.fa Brachypodium_distachyon --cor 4 167 | 168 | ## local run & loading repeats in core Ensembl db (will re-use previous Red results) 169 | ./Red2Ensembl.py Brachypodium_distachyon_v3.0.dna.toplevel.fa Brachypodium_distachyon \ 170 | --host pl1 --user xyz --pw XYZ --port 123 --db brachypodium_distachyon_core_49_102 171 | ``` 172 | 173 | ### ii) Annotating masked repeated sequences 174 | 175 | The repeats called by Red can be optionally annotated by similarity to sequences in an external FASTA file, 176 | such as the library **nrTEplants**. The script does not load the resulting annotations in a core db just yet: 177 | ``` 178 | ## test run, re-uses folder 'test_Atha_chr4' 179 | ./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna test_Atha_chr4 --bed_file test.nrTEplants.bed 180 | 181 | ## consider only repeats with length >= 200 bp 182 | ./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna Brachypodium_distachyon --cor 4 \ 183 | --minlen 200 184 | 185 | ## add annotated repeats to Ensembl core db and use a different minimap2 binary 186 | ./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna Brachypodium_distachyon --exe /path/to/minimap2 --cor 4 \ 187 | --host pl1 --user xyz --pw XYZ \ 188 | --port 123 --db brachypodium_distachyon_core_49_102 189 | ``` 190 | 191 | Note that any FASTA file can be used to annotate the repeats. 
For instance, repeats annotated 192 | in current species in Ensembl can be retrieved and used as well: 193 | ``` 194 | ./get_repeats_ensembl.sh arabidopsis_thaliana 195 | 196 | # This will produce file: arabidopsis_thaliana.repeats.nondeg.fasta 197 | 198 | # Note this file can be highly redundant; redundancy can be eliminated with linclust, 199 | # see https://github.com/soedinglab/MMseqs2 200 | 201 | ./AnnotRedRepeats.py arabidopsis_thaliana.repeats.nondeg.fasta test_Atha_chr4 --bed_file test.ensembl.bed 202 | ``` 203 | 204 | ## Annotation summary 205 | 206 | If a library such as nrTEplants or any other RepBase-formatted file is used, 207 | an annotation report like this is produced. These are valid examples of FASTA headers: 208 | 209 | >TEdenovo-B-R2315-Map11:repetDB.Mar2020#TIR @Brassica_rapa [S:] 210 | >AT1TE94285:TAIR10_TE#DNA/MuDR @Arabidopsis_thaliana [S:] 211 | 212 | The repeat classification is then parsed to produce a report like this: 213 | 214 | ``` 215 | # Genome length: 18585056 Repeated content: 6837303 36.8% Annotated: 2748796 14.8% 216 | 217 | class bp 218 | DIRS 1212 219 | DNA 32110 220 | DNA/En-Spm 50044 221 | DNA/HAT 33911 222 | DNA/Harbinger 13879 223 | DNA/Mariner 3935 224 | DNA/MuDR 283157 225 | DNA/Pogo 21954 226 | DNA/Tc1 3467 227 | Helitron 20670 228 | LARD 83725 229 | LINE 2384 230 | LINE/L1 9898 231 | LINE? 1235 232 | LTR 113511 233 | LTR/Copia 88739 234 | LTR/Gypsy 900679 235 | MITE 2502 236 | Other 42920 237 | Other/Simple 1596 238 | RC/Helitron 766803 239 | RathE1_cons 1188 240 | RathE2_cons 245 241 | RathE3_cons 196 242 | SINE 9192 243 | Satellite 132 244 | TIR 79579 245 | TIR/Mutator 364 246 | TRIM 53086 247 | Unclassified 126483 248 | ``` 249 | 250 | ## Runtime and RAM requirements 251 | 252 | These data were measured on a CentOS7.9 computer using 4 cores of a Xeon E5-2620 v4 (2.10GHz) CPU. 
253 | 254 | ![Runtime and RAM requirements](../files/runtime_ram.png) 255 | 256 | 257 | ## Error messages 258 | 259 | + ERROR: cannot run Red -9: This means the Red process was killed by the operating system (signal 9, SIGKILL), usually for taking too much RAM. You will need more RAM to run this job. 260 | 261 | -------------------------------------------------------------------------------- /repeats/bench/list.Red: -------------------------------------------------------------------------------- 1 | Aegilops_tauschii 2 | Arabidopsis_halleri 3 | Arabidopsis_thaliana 4 | Arabis_alpina 5 | Brachypodium_distachyon 6 | Brassica_rapa 7 | Camelina_sativa 8 | Citrullus_lanatus 9 | Cucumis_melo 10 | Helianthus_annuus 11 | Malus_domestica_golden 12 | Olea_europaea_sylvestris 13 | Oryza_sativa 14 | Prunus_dulcis 15 | Rosa_chinensis 16 | Setaria_viridis 17 | Trifolium_pratense 18 | Triticum_turgidum 19 | Vitis_vinifera 20 | Zea_mays 21 | -------------------------------------------------------------------------------- /repeats/bench/list.cores: -------------------------------------------------------------------------------- 1 | aegilops_tauschii_core_48_101_3 2 | arabidopsis_halleri_core_48_101_1 3 | arabidopsis_thaliana_core_48_101_11 4 | arabis_alpina_core_48_101_1 5 | brachypodium_distachyon_core_48_101_4 6 | brassica_rapa_core_48_101_1 7 | camelina_sativa_core_48_101_1 8 | citrullus_lanatus_core_48_101_1 9 | cucumis_melo_core_48_101_1 10 | helianthus_annuus_core_48_101_1 11 | malus_domestica_golden_core_48_101_1 12 | olea_europaea_sylvestris_core_48_101_1 13 | oryza_sativa_core_48_101_7 14 | prunus_dulcis_core_48_101_1 15 | rosa_chinensis_core_48_101_1 16 | setaria_viridis_core_48_101_1 17 | trifolium_pratense_core_48_101_1 18 | triticum_turgidum_core_48_101_1 19 | vitis_vinifera_core_48_101_3 20 | zea_mays_core_48_101_7 21 | -------------------------------------------------------------------------------- /repeats/bench/list.cores.sp: -------------------------------------------------------------------------------- 1 | 
aegilops_tauschii_core_48_101_3 aegilops_tauschii 2 | arabidopsis_halleri_core_48_101_1 arabidopsis_halleri 3 | arabidopsis_thaliana_core_48_101_11 arabidopsis_thaliana 4 | arabis_alpina_core_48_101_1 arabis_alpina 5 | brachypodium_distachyon_core_48_101_4 brachypodium_distachyon 6 | brassica_rapa_core_48_101_1 brassica_rapa 7 | camelina_sativa_core_48_101_1 camelina_sativa 8 | citrullus_lanatus_core_48_101_1 citrullus_lanatus 9 | cucumis_melo_core_48_101_1 cucumis_melo 10 | helianthus_annuus_core_48_101_1 helianthus_annuus 11 | malus_domestica_golden_core_48_101_1 malus_domestica_golden 12 | olea_europaea_sylvestris_core_48_101_1 olea_europaea_sylvestris 13 | oryza_sativa_core_48_101_7 oryza_sativa 14 | prunus_dulcis_core_48_101_1 prunus_dulcis 15 | rosa_chinensis_core_48_101_1 rosa_chinensis 16 | setaria_viridis_core_48_101_1 setaria_viridis 17 | trifolium_pratense_core_48_101_1 trifolium_pratense 18 | triticum_turgidum_core_48_101_1 triticum_turgidum 19 | vitis_vinifera_core_48_101_3 vitis_vinifera 20 | zea_mays_core_48_101_7 zea_mays 21 | -------------------------------------------------------------------------------- /repeats/bench/list.cores.sp.toplevel: -------------------------------------------------------------------------------- 1 | aegilops_tauschii_core_48_101_3 aegilops_tauschii Aegilops_tauschii.Aet_v4.0.dna.toplevel.fa 2 | arabidopsis_halleri_core_48_101_1 arabidopsis_halleri Arabidopsis_halleri.Ahal2.2.dna.toplevel.fa 3 | arabidopsis_thaliana_core_48_101_11 arabidopsis_thaliana Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 4 | arabis_alpina_core_48_101_1 arabis_alpina Arabis_alpina.A_alpina_V4.dna.toplevel.fa 5 | brachypodium_distachyon_core_48_101_4 brachypodium_distachyon Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa 6 | brassica_rapa_core_48_101_1 brassica_rapa Brassica_rapa.Brapa_1.0.dna.toplevel.fa 7 | camelina_sativa_core_48_101_1 camelina_sativa Camelina_sativa.Cs.dna.toplevel.fa 8 | citrullus_lanatus_core_48_101_1 
citrullus_lanatus Citrullus_lanatus.Cla97_v1.dna.toplevel.fa 9 | cucumis_melo_core_48_101_1 cucumis_melo Cucumis_melo.Melonv4.dna.toplevel.fa 10 | helianthus_annuus_core_48_101_1 helianthus_annuus Helianthus_annuus.HanXRQr1.0.dna.toplevel.fa 11 | malus_domestica_golden_core_48_101_1 malus_domestica_golden Malus_domestica_golden.ASM211411v1.dna.toplevel.fa 12 | olea_europaea_sylvestris_core_48_101_1 olea_europaea_sylvestris Olea_europaea_sylvestris.O_europaea_v1.dna.toplevel.fa 13 | oryza_sativa_core_48_101_7 oryza_sativa Oryza_sativa.IRGSP-1.0.dna.toplevel.fa 14 | prunus_dulcis_core_48_101_1 prunus_dulcis Prunus_dulcis.ALMONDv2.dna.toplevel.fa 15 | rosa_chinensis_core_48_101_1 rosa_chinensis Rosa_chinensis.RchiOBHm-V2.dna.toplevel.fa 16 | setaria_viridis_core_48_101_1 setaria_viridis Setaria_viridis.Setaria_viridis_v2.0.dna.toplevel.fa 17 | trifolium_pratense_core_48_101_1 trifolium_pratense Trifolium_pratense.Trpr.dna.toplevel.fa 18 | triticum_turgidum_core_48_101_1 triticum_turgidum Triticum_turgidum.Svevo.v1.dna.toplevel.fa 19 | vitis_vinifera_core_48_101_3 vitis_vinifera Vitis_vinifera.12X.dna.toplevel.fa 20 | zea_mays_core_48_101_7 zea_mays Zea_mays.B73_RefGen_v4.dna.toplevel.fa 21 | -------------------------------------------------------------------------------- /repeats/bench/list.cores.wheat: -------------------------------------------------------------------------------- 1 | triticum_aestivum_arinalrfor_core_51_104_1 2 | triticum_aestivum_jagger_core_51_104_1 3 | triticum_aestivum_julius_core_51_104_1 4 | triticum_aestivum_lancer_core_51_104_1 5 | triticum_aestivum_landmark_core_51_104_1 6 | triticum_aestivum_mace_core_51_104_1 7 | triticum_aestivum_mattis_core_51_104_1 8 | triticum_aestivum_norin61_core_51_104_1 9 | triticum_aestivum_stanley_core_51_104_1 10 | -------------------------------------------------------------------------------- /repeats/bench/list.toplevel: -------------------------------------------------------------------------------- 1 | 
Aegilops_tauschii.Aet_v4.0.dna.toplevel.fa 2 | Arabidopsis_halleri.Ahal2.2.dna.toplevel.fa 3 | Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 4 | Arabis_alpina.A_alpina_V4.dna.toplevel.fa 5 | Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa 6 | Brassica_rapa.Brapa_1.0.dna.toplevel.fa 7 | Camelina_sativa.Cs.dna.toplevel.fa 8 | Citrullus_lanatus.Cla97_v1.dna.toplevel.fa 9 | Cucumis_melo.Melonv4.dna.toplevel.fa 10 | Helianthus_annuus.HanXRQr1.0.dna.toplevel.fa 11 | Malus_domestica_golden.ASM211411v1.dna.toplevel.fa 12 | Olea_europaea_sylvestris.O_europaea_v1.dna.toplevel.fa 13 | Oryza_sativa.IRGSP-1.0.dna.toplevel.fa 14 | Prunus_dulcis.ALMONDv2.dna.toplevel.fa 15 | Rosa_chinensis.RchiOBHm-V2.dna.toplevel.fa 16 | Setaria_viridis.Setaria_viridis_v2.0.dna.toplevel.fa 17 | Trifolium_pratense.Trpr.dna.toplevel.fa 18 | Triticum_turgidum.Svevo.v1.dna.toplevel.fa 19 | Vitis_vinifera.12X.dna.toplevel.fa 20 | Zea_mays.B73_RefGen_v4.dna.toplevel.fa 21 | -------------------------------------------------------------------------------- /repeats/bench/log.Rgenes.50: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 916 3144217 130459 178814 225364 2 | arabidopsis_halleri 209 635687 55807 425348 119975 3 | arabidopsis_thaliana 69 201660 22619 151854 54427 4 | arabis_alpina 364 1201405 91329 787666 192520 5 | brachypodium_distachyon 344 1249602 154904 208189 177288 6 | brassica_rapa 219 729119 55369 485467 158167 7 | camelina_sativa 573 1808571 108683 1032089 352086 8 | citrullus_lanatus 43 164993 15906 72027 9153 9 | cucumis_melo 89 294239 38958 189021 44912 10 | helianthus_annuus 604 2055877 66811 497800 798084 11 | malus_domestica_golden 637 2535932 343622 1861255 944792 12 | olea_europaea_sylvestris 402 1095646 80911 151255 121251 13 | oryza_sativa 45 169051 9415 34747 10603 14 | prunus_dulcis 387 1354547 223204 1055234 442933 15 | rosa_chinensis 963 3347977 398581 2255192 1287944 16 | setaria_viridis 453 
1682607 109789 161966 393371 17 | trifolium_pratense 553 1781566 518826 810044 570870 18 | triticum_turgidum 2459 8351371 349698 465873 570011 19 | vitis_vinifera 739 2723777 377354 1645288 1065500 20 | zea_mays 158 487237 19550 39041 94402 21 | -------------------------------------------------------------------------------- /repeats/bench/log.exons: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 167903 68899123 1093757 1476448 3150918 2 | arabidopsis_halleri 168961 48889148 4713750 10589208 8184041 3 | arabidopsis_thaliana 145966 46554976 2639673 7998681 8355123 4 | arabis_alpina 108362 30098225 1690816 6446837 2726170 5 | brachypodium_distachyon 152258 68995392 3749292 6022129 5544077 6 | brassica_rapa 206588 48263240 2740496 9859653 3779043 7 | camelina_sativa 481073 132504580 6268348 20505918 36061050 8 | citrullus_lanatus 118249 26303029 1094402 4528151 292055 9 | cucumis_melo 137797 41060653 1150542 6423622 2905048 10 | helianthus_annuus 231826 82980403 2162522 3054869 9975943 11 | malus_domestica_golden 221431 62527585 2256187 8083094 3939718 12 | olea_europaea_sylvestris 235142 65931783 2550352 6425574 12189818 13 | oryza_sativa 156630 65380182 2349371 5368395 4300234 14 | prunus_dulcis 134744 47752477 2716214 10176450 4580463 15 | rosa_chinensis 174074 74446167 3081688 12161789 12510233 16 | setaria_viridis 161524 74246042 2306301 4559099 6563336 17 | trifolium_pratense 181312 55716319 4583322 10160516 5755230 18 | triticum_turgidum 308306 82770985 554686 1220411 2480209 19 | vitis_vinifera 147613 39848131 1841978 5904363 4073035 20 | zea_mays 206593 63390920 488030 1046187 8058936 21 | -------------------------------------------------------------------------------- /repeats/bench/log.gc: -------------------------------------------------------------------------------- 1 | ../Red_minimap2//Aegilops_tauschii.Aet_v4.0.dna.toplevel.fa 46.3 2 | ../Red_minimap2//Arabidopsis_halleri.Ahal2.2.dna.toplevel.fa 
36.0 3 | ../Red_minimap2//Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 36.1 4 | ../Red_minimap2//Arabis_alpina.A_alpina_V4.dna.toplevel.fa 36.8 5 | ../Red_minimap2//Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa 46.4 6 | ../Red_minimap2//Brassica_rapa.Brapa_1.0.dna.toplevel.fa 35.3 7 | ../Red_minimap2//Camelina_sativa.Cs.dna.toplevel.fa 36.6 8 | ../Red_minimap2//Citrullus_lanatus.Cla97_v1.dna.toplevel.fa 33.6 9 | ../Red_minimap2//Cucumis_melo.Melonv4.dna.toplevel.fa 33.5 10 | ../Red_minimap2//Helianthus_annuus.HanXRQr1.0.dna.toplevel.fa 38.5 11 | ../Red_minimap2//Malus_domestica_golden.ASM211411v1.dna.toplevel.fa 38.0 12 | ../Red_minimap2//Olea_europaea_sylvestris.O_europaea_v1.dna.toplevel.fa 35.4 13 | ../Red_minimap2//Oryza_sativa.IRGSP-1.0.dna.toplevel.fa 43.6 14 | ../Red_minimap2//Prunus_dulcis.ALMONDv2.dna.toplevel.fa 37.6 15 | ../Red_minimap2//Rosa_chinensis.RchiOBHm-V2.dna.toplevel.fa 38.8 16 | ../Red_minimap2//Setaria_viridis.Setaria_viridis_v2.0.dna.toplevel.fa 46.2 17 | ../Red_minimap2//Trifolium_pratense.Trpr.dna.toplevel.fa 32.4 18 | ../Red_minimap2//Triticum_turgidum.Svevo.v1.dna.toplevel.fa 46.0 19 | ../Red_minimap2//Vitis_vinifera.12X.dna.toplevel.fa 34.5 20 | ../Red_minimap2//Zea_mays.B73_RefGen_v4.dna.toplevel.fa 46.9 21 | -------------------------------------------------------------------------------- /repeats/bench/log.genes: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 39614 348459650 107216460 115488145 136485083 2 | arabidopsis_halleri 32158 77672904 7028692 14970718 14329155 3 | arabidopsis_thaliana 27628 67504196 3255170 9734724 13753315 4 | arabis_alpina 21609 45801712 2194862 8499977 4069886 5 | brachypodium_distachyon 34310 122507803 7423797 9928120 10466133 6 | brassica_rapa 41018 83240510 4332091 15559458 11155807 7 | camelina_sativa 89275 215022170 8523587 26276159 49343237 8 | citrullus_lanatus 22541 81241809 2904724 10076936 6915666 9 | cucumis_melo 28299 
99772108 3949118 17800648 11892758 10 | helianthus_annuus 52191 199869937 7343679 16325186 58538874 11 | malus_domestica_golden 40624 145824479 8244741 29968698 25457319 12 | olea_europaea_sylvestris 50678 153512663 6720213 11820126 29235344 13 | oryza_sativa 35775 130112020 8042778 13051885 11449782 14 | prunus_dulcis 27963 94240163 4142093 17684800 11380531 15 | rosa_chinensis 45464 117342283 3905528 14950018 18102851 16 | setaria_viridis 38334 123295550 4481990 7094063 10854583 17 | trifolium_pratense 39917 133489771 12955199 17236097 24083603 18 | triticum_turgidum 66545 475684837 148018114 156621545 185695090 19 | vitis_vinifera 29927 153564951 10261857 40330281 30458129 20 | zea_mays 39583 168056455 14098099 16263247 41720838 21 | -------------------------------------------------------------------------------- /repeats/bench/log.nrplants.bed: -------------------------------------------------------------------------------- 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species aegilops_tauschii -logic_name repeatmask_nrplants 2 | aegilops_tauschii genome_length 4224915394 3 | aegilops_tauschii repeatmask_nrplants 2954386572 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_halleri -logic_name repeatmask_nrplants 5 | arabidopsis_halleri genome_length 196243198 6 | arabidopsis_halleri repeatmask_nrplants 54248175 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_thaliana -logic_name repeatmask_nrplants 8 | arabidopsis_thaliana genome_length 119667750 9 | arabidopsis_thaliana repeatmask_nrplants 34493047 10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabis_alpina -logic_name repeatmask_nrplants 11 | arabis_alpina genome_length 12 | arabis_alpina repeatmask_nrplants 138763577 13 | perl 
~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brachypodium_distachyon -logic_name repeatmask_nrplants 14 | brachypodium_distachyon genome_length 271163419 15 | brachypodium_distachyon repeatmask_nrplants 76177501 16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brassica_rapa -logic_name repeatmask_nrplants 17 | brassica_rapa genome_length 283822783 18 | brassica_rapa repeatmask_nrplants 85105371 19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species camelina_sativa -logic_name repeatmask_nrplants 20 | camelina_sativa genome_length 641356059 21 | camelina_sativa repeatmask_nrplants 158537047 22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species citrullus_lanatus -logic_name repeatmask_nrplants 23 | citrullus_lanatus genome_length 365450462 24 | citrullus_lanatus repeatmask_nrplants 64792392 25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species cucumis_melo -logic_name repeatmask_nrplants 26 | cucumis_melo genome_length 357857370 27 | cucumis_melo repeatmask_nrplants 159149620 28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species helianthus_annuus -logic_name repeatmask_nrplants 29 | helianthus_annuus genome_length 3027844945 30 | helianthus_annuus repeatmask_nrplants 788477175 31 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species malus_domestica_golden -logic_name repeatmask_nrplants 32 | malus_domestica_golden genome_length 702961352 33 | malus_domestica_golden repeatmask_nrplants 313026586 34 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species olea_europaea_sylvestris -logic_name repeatmask_nrplants 35 | olea_europaea_sylvestris genome_length 1140989389 36 | 
olea_europaea_sylvestris repeatmask_nrplants 227766390 37 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species oryza_sativa -logic_name repeatmask_nrplants 38 | oryza_sativa genome_length 375049285 39 | oryza_sativa repeatmask_nrplants 146748772 40 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species prunus_dulcis -logic_name repeatmask_nrplants 41 | prunus_dulcis genome_length 227498357 42 | prunus_dulcis repeatmask_nrplants 86267619 43 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species rosa_chinensis -logic_name repeatmask_nrplants 44 | rosa_chinensis genome_length 515588973 45 | rosa_chinensis repeatmask_nrplants 116760801 46 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species setaria_viridis -logic_name repeatmask_nrplants 47 | setaria_viridis genome_length 48 | setaria_viridis repeatmask_nrplants 80092623 49 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species trifolium_pratense -logic_name repeatmask_nrplants 50 | trifolium_pratense genome_length 304842038 51 | trifolium_pratense repeatmask_nrplants 33750565 52 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species triticum_turgidum -logic_name repeatmask_nrplants 53 | triticum_turgidum genome_length 10463058104 54 | triticum_turgidum repeatmask_nrplants 7583835351 55 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species vitis_vinifera -logic_name repeatmask_nrplants 56 | vitis_vinifera genome_length 486265422 57 | vitis_vinifera repeatmask_nrplants 216500288 58 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species zea_mays -logic_name repeatmask_nrplants 59 | zea_mays genome_length 2135083061 60 | zea_mays repeatmask_nrplants 1337627276 61 
| -------------------------------------------------------------------------------- /repeats/bench/log.redat.bed: -------------------------------------------------------------------------------- 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species aegilops_tauschii -logic_name repeatmask_redat 2 | aegilops_tauschii genome_length 4224915394 3 | aegilops_tauschii repeatmask_redat 2904094129 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_halleri -logic_name repeatmask_redat 5 | arabidopsis_halleri genome_length 196243198 6 | arabidopsis_halleri repeatmask_redat 30398422 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_thaliana -logic_name repeatmask_redat 8 | arabidopsis_thaliana genome_length 119667750 9 | arabidopsis_thaliana repeatmask_redat 16998029 10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabis_alpina -logic_name repeatmask_redat 11 | arabis_alpina genome_length 12 | arabis_alpina repeatmask_redat 46572112 13 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brachypodium_distachyon -logic_name repeatmask_redat 14 | brachypodium_distachyon genome_length 271163419 15 | brachypodium_distachyon repeatmask_redat 74226118 16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brassica_rapa -logic_name repeatmask_redat 17 | brassica_rapa genome_length 283822783 18 | brassica_rapa repeatmask_redat 24349441 19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species camelina_sativa -logic_name repeatmask_redat 20 | camelina_sativa genome_length 641356059 21 | camelina_sativa repeatmask_redat 101243109 22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg 
-species citrullus_lanatus -logic_name repeatmask_redat 23 | citrullus_lanatus genome_length 365450462 24 | citrullus_lanatus repeatmask_redat 24695282 25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species cucumis_melo -logic_name repeatmask_redat 26 | cucumis_melo genome_length 357857370 27 | cucumis_melo repeatmask_redat 29282530 28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species helianthus_annuus -logic_name repeatmask_redat 29 | helianthus_annuus genome_length 3027844945 30 | helianthus_annuus repeatmask_redat 302618657 31 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species malus_domestica_golden -logic_name repeatmask_redat 32 | malus_domestica_golden genome_length 702961352 33 | malus_domestica_golden repeatmask_redat 63711081 34 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species olea_europaea_sylvestris -logic_name repeatmask_redat 35 | olea_europaea_sylvestris genome_length 1140989389 36 | olea_europaea_sylvestris repeatmask_redat 205755459 37 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species oryza_sativa -logic_name repeatmask_redat 38 | oryza_sativa genome_length 375049285 39 | oryza_sativa repeatmask_redat 121253121 40 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species prunus_dulcis -logic_name repeatmask_redat 41 | prunus_dulcis genome_length 227498357 42 | prunus_dulcis repeatmask_redat 14847578 43 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species rosa_chinensis -logic_name repeatmask_redat 44 | rosa_chinensis genome_length 515588973 45 | rosa_chinensis repeatmask_redat 43127746 46 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species setaria_viridis -logic_name 
repeatmask_redat 47 | setaria_viridis genome_length 48 | setaria_viridis repeatmask_redat 73169312 49 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species trifolium_pratense -logic_name repeatmask_redat 50 | trifolium_pratense genome_length 304842038 51 | trifolium_pratense repeatmask_redat 33405939 52 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species triticum_turgidum -logic_name repeatmask_redat 53 | triticum_turgidum genome_length 10463058104 54 | triticum_turgidum repeatmask_redat 7498456644 55 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species vitis_vinifera -logic_name repeatmask_redat 56 | vitis_vinifera genome_length 486265422 57 | vitis_vinifera repeatmask_redat 43968916 58 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species zea_mays -logic_name repeatmask_redat 59 | zea_mays genome_length 2135083061 60 | zea_mays repeatmask_redat 1270268146 61 | -------------------------------------------------------------------------------- /repeats/bench/log.repeat.N50: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 10133 9973 9431 2 | arabidopsis_halleri 554 1380 1431 3 | arabidopsis_thaliana 445 1779 2211 4 | arabis_alpina 1040 2245 1050 5 | brachypodium_distachyon 4986 6260 6665 6 | brassica_rapa 642 1046 777 7 | camelina_sativa 878 1272 1176 8 | citrullus_lanatus 2596 1020 1103 9 | cucumis_melo 1939 3141 1338 10 | helianthus_annuus 5018 8716 1317 11 | malus_domestica_golden 2416 4729 1268 12 | olea_europaea_sylvestris 3153 1956 1218 13 | oryza_sativa 2931 4479 6077 14 | prunus_dulcis 1627 2528 1025 15 | rosa_chinensis 2125 1479 950 16 | setaria_viridis 3124 1727 1722 17 | trifolium_pratense 555 326 265 18 | triticum_turgidum 9066 9947 10124 19 | vitis_vinifera 1753 3369 1550 20 | zea_mays 13137 11806 11419 21 | 
-------------------------------------------------------------------------------- /repeats/bench/log.repeat.length: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 1506690 404 777962 1173 847592 963 2 | arabidopsis_halleri 226080 124 81857 308 57901 209 3 | arabidopsis_thaliana 172935 123 48144 267 28797 181 4 | arabis_alpina 279129 169 146057 415 98017 222 5 | brachypodium_distachyon 150191 121 74215 193 67632 183 6 | brassica_rapa 348258 110 160157 259 69345 163 7 | camelina_sativa 709160 122 267290 281 201059 223 8 | citrullus_lanatus 323894 112 151980 190 52941 183 9 | cucumis_melo 305083 118 148925 329 51833 213 10 | helianthus_annuus 2387122 198 355890 388 479400 352 11 | malus_domestica_golden 531496 148 211929 478 126487 201 12 | olea_europaea_sylvestris 901519 139 291445 254 375614 238 13 | oryza_sativa 278406 147 160371 255 129121 239 14 | prunus_dulcis 190357 109 105546 287 36891 148 15 | rosa_chinensis 463880 150 189086 268 93715 207 16 | setaria_viridis 247732 165 116459 271 105088 274 17 | trifolium_pratense 277811 206 139254 153 155808 147 18 | triticum_turgidum 4291533 312 1914776 1270 1784719 1456 19 | vitis_vinifera 423876 132 185204 395 69315 247 20 | zea_mays 847205 211 365978 696 372467 669 21 | -------------------------------------------------------------------------------- /repeats/bench/log.repeat.overlap: -------------------------------------------------------------------------------- 1 | aegilops_tauschiiaegilops_tauschii 3454963624 2833915889 2871180926 76588631 161578261 201668761 183619439 2 | arabidopsis_halleriarabidopsis_halleri 61005460 21270154 34296626 8063578 3574480 39244394 4508394 3 | arabidopsis_thalianaarabidopsis_thaliana 43943711 11477631 20623470 5229908 2896188 6146057 3361091 4 | arabis_alpinaarabis_alpina 116069065 34974204 101561844 7713526 7886525 41787455 10551564 5 | brachypodium_distachyonbrachypodium_distachyon 84292420 63878923 63001822 2649893 3750379 
6329521 6241890 6 | brassica_rapabrassica_rapa 93148976 12137250 53471309 18246330 8884888 33405301 10922095 7 | camelina_sativacamelina_sativa 230739085 72221094 100798661 20302963 16816272 83987132 23257791 8 | citrullus_lanatuscitrullus_lanatus 149139515 17684233 45494701 13773277 9858124 37754666 14663873 9 | cucumis_melocucumis_melo 142860741 21541420 122388576 16345201 15266153 33077764 19515094 10 | helianthus_annuushelianthus_annuus 2227207957 286811294 729901538 35524719 47044137 97126226 58855498 11 | malus_domestica_goldenmalus_domestica_golden 292775696 51043115 256385563 15837106 26741124 109164886 34292176 12 | olea_europaea_sylvestrisolea_europaea_sylvestris 516233180 175103570 184876693 25250984 197048464 159769209 210634322 13 | oryza_sativaoryza_sativa 138926649 102882577 116355536 10160029 11040079 19487736 15617002 14 | prunus_dulcisprunus_dulcis 75942705 9724644 62228480 6821347 7159674 16557179 9697037 15 | rosa_chinensisrosa_chinensis 247911427 35147667 89040006 11590565 14996352 22275228 21000797 16 | setaria_viridissetaria_viridis 161622886 62499090 65656462 5299406 8470076 11109097 12159449 17 | trifolium_pratensetrifolium_pratense 91430112 21074343 16935969 14305569 5963288 57474544 8446622 18 | triticum_turgidumtriticum_turgidum 8599065456 7281888447 7341688163 171184854 283316382 400291848 330564764 19 | vitis_viniferavitis_vinifera 194397642 30845098 160405750 28366556 19834542 63337140 26244040 20 | zea_mayszea_mays 1687073942 1238165526 1300760021 32339911 68910571 55601139 77759381 21 | -------------------------------------------------------------------------------- /repeats/bench/log.updown500: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 37802544 2033719 2556035 6319681 2 | arabidopsis_halleri 32329295 3209393 5607682 10045427 3 | arabidopsis_thaliana 25181016 1142935 3541364 8878106 4 | arabis_alpina 20949737 804950 3986554 3468875 5 | brachypodium_distachyon 33409828 
2753591 2591857 4262505 6 | brassica_rapa 45594270 902632 4380362 8410470 7 | camelina_sativa 92778852 3128183 6080024 13135142 8 | citrullus_lanatus 22596226 244442 849017 1732409 9 | cucumis_melo 29465346 471854 5587397 5542277 10 | helianthus_annuus 51826802 888649 3018001 15501880 11 | malus_domestica_golden 40858762 759814 6404045 7165639 12 | olea_europaea_sylvestris 51470230 1565047 2186588 13800220 13 | oryza_sativa 37562150 3910303 5347525 5571857 14 | prunus_dulcis 28912163 1042283 5338746 5180817 15 | rosa_chinensis 44161039 856677 3423518 10039144 16 | setaria_viridis 36616623 1995173 2161278 5533510 17 | trifolium_pratense 37853815 3500940 2647709 11237759 18 | triticum_turgidum 65803596 1556728 2294898 6261722 19 | vitis_vinifera 30402773 483070 4132002 5409249 20 | zea_mays 39795578 1491136 1729071 8777432 21 | -------------------------------------------------------------------------------- /repeats/bench/log.updown500.16mer: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 1111069 1325298 1902833 1363 1454080 1714053 2391503 1061 2298895 3100696 5795036 914 2878 2597 13888 0 0 0 2 | arabidopsis_halleri 1247101 1635827 2932850 1005 2548460 3253636 5179384 677 4261046 5692656 9016601 2288 9731 11588 15605 0 0 0 3 | arabidopsis_thaliana 624454 776496 1039275 2829 2153983 2610131 3318313 2921 4436392 5884689 8026943 5198 654 1568 3030 0 0 0 4 | arabis_alpina 474031 544822 723210 297 2231580 2632987 3715198 242 1516412 1930758 3152918 1728 746 5890 6915 0 0 0 5 | brachypodium_distachyon 1565921 1853918 2524763 779 1524916 1788360 2376687 795 2037498 2561620 3834269 969 2402 1992 4583 0 0 0 6 | brassica_rapa 530834 601740 791040 6796 2478355 2954115 4018130 8008 3914096 5034248 7423005 8274 873 4025 7990 0 0 0 7 | camelina_sativa 1556418 1895552 2815016 13834 3122315 3875028 5548080 14461 4375242 6407004 11598425 14856 5437 6442 20796 0 0 0 8 | citrullus_lanatus 150203 163711 211526 3834 560286 620903 
747301 3909 857201 1011371 1455931 17616 139 287 1867 0 0 0 9 | cucumis_melo 246274 290930 420430 11398 2640761 3328074 5270065 13395 2094704 2839587 5081380 9189 400 9734 11503 0 0 0 10 | helianthus_annuus 356715 443445 809633 14526 1810759 2109613 2815682 3631 4695577 6575658 14166127 15288 3124 2829 61862 0 0 0 11 | malus_domestica_golden 348818 419027 682749 4301 3040908 3772748 5961699 4538 2501077 3353107 6455560 2690 1800 14598 22558 0 0 0 12 | olea_europaea_sylvestris 773402 940950 1445114 1710 1105399 1338148 2019176 1943 4818543 6524789 12771861 4430 2629 3603 46070 0 0 0 13 | oryza_sativa 1789624 2254108 3594856 7281 2593470 3225411 4939191 7934 1942108 2661402 5024991 6158 8489 10229 15572 0 0 0 14 | prunus_dulcis 421171 565558 959320 9250 2373012 3076906 4992783 12826 1804183 2539079 4695670 9597 1662 10073 11838 0 0 0 15 | rosa_chinensis 406613 515667 772244 9538 1681148 2115001 3143329 9248 3324915 4755938 9230439 10040 977 3890 24917 0 0 0 16 | setaria_viridis 970273 1147999 1835321 1459 1096754 1290404 1990566 1453 2356486 3018484 5076815 2184 4599 4395 10603 0 0 0 17 | trifolium_pratense 1869551 2225603 3094900 4938 1455142 1718906 2333199 4835 5427189 6828986 10372840 3601 4469 2921 17234 0 0 0 18 | triticum_turgidum 867856 1013476 1439819 1737 1264377 1485940 2131356 1414 2137683 2856801 5623225 648 1277 3410 16226 0 0 0 19 | vitis_vinifera 324268 360245 438647 6161 2352661 2806772 3859436 6274 2208371 2961505 4890287 5692 99 3723 7834 0 0 0 20 | zea_mays 782560 932579 1385724 2365 900580 1073884 1606348 2449 2862016 4091994 8069996 1097 2816 3364 24826 0 0 0 21 | -------------------------------------------------------------------------------- /repeats/bench/log.updown500.21mer: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 1223322 1408334 1860185 2290 1593018 1815191 2337877 1770 2746261 3473619 5625037 791 1863 1384 8776 0 0 0 2 | arabidopsis_halleri 1408835 1778359 2846660 936 2857251 
3498578 5047256 553 5188324 6366149 8692000 1189 7033 8047 9579 0 0 0 3 | arabidopsis_thaliana 663200 797431 1005778 4011 2344450 2722424 3246191 3770 5159100 6335478 7753347 5167 356 1006 1331 0 0 0 4 | arabis_alpina 504995 564886 697566 289 2502190 2836538 3633829 200 1801615 2153476 3054388 1594 279 3698 4345 0 0 0 5 | brachypodium_distachyon 1734256 1978460 2450401 1251 1670711 1892553 2306867 1270 2381022 2819157 3695748 962 1179 1004 2184 0 0 0 6 | brassica_rapa 549634 609285 755899 5706 2735926 3121976 3904293 7147 4986641 5760273 7101468 7353 510 2326 3444 0 0 0 7 | camelina_sativa 1733276 2022031 2715179 16131 3506426 4135102 5380274 12695 5574783 7355635 11118445 13544 3739 4105 10383 0 0 0 8 | citrullus_lanatus 152292 163914 201039 5808 592707 636177 715949 5844 1059247 1149593 1366495 12031 81 111 762 0 0 0 9 | cucumis_melo 257229 297295 403849 18072 3109207 3710697 5166500 20554 2634651 3289721 4931468 14357 238 5135 5841 0 0 0 10 | helianthus_annuus 380532 458931 784156 24526 1982543 2229863 2749547 5684 6029547 7788907 13732666 24555 2933 1468 41612 0 0 0 11 | malus_domestica_golden 366050 430759 658087 6484 3545516 4172896 5817868 6421 3220361 3978248 6225186 3619 1592 9565 13733 0 0 0 12 | olea_europaea_sylvestris 871190 1018641 1406500 2449 1235991 1440761 1966152 2781 6411399 7911176 12443983 2554 1264 1861 26460 0 0 0 13 | oryza_sativa 2116101 2538384 3491871 12073 3020749 3585578 4806162 13162 2481478 3148448 4847778 10081 4277 5249 8160 0 0 0 14 | prunus_dulcis 446225 583912 932789 7176 2673607 3308626 4881290 9921 2173555 2836039 4539038 7582 1351 7325 7954 0 0 0 15 | rosa_chinensis 429337 528817 745298 14518 1826447 2215157 3054660 13866 4025118 5349942 8969485 8487 768 2633 16615 0 0 0 16 | setaria_viridis 1081627 1243340 1783202 2271 1208904 1384165 1935141 2202 2777054 3357458 4928682 1495 3674 3381 7616 0 0 0 17 | trifolium_pratense 2123482 2403634 2968335 7571 1602235 1815838 2237466 7451 7249275 8204857 10097794 4552 2099 1476 6772 0 0 
0 18 | triticum_turgidum 950978 1072533 1402071 2916 1387830 1576218 2078341 2354 2575423 3226596 5421604 745 698 2191 10238 0 0 0 19 | vitis_vinifera 330290 360874 424517 9858 2766085 3111395 3771552 9755 2948732 3554803 4723595 8621 73 1371 1874 0 0 0 20 | zea_mays 864987 996709 1351487 4284 1000625 1151863 1566536 4379 3428345 4582250 7840336 969 1874 2245 16700 0 0 0 21 | -------------------------------------------------------------------------------- /repeats/bench/log.updown500.31mer: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 1327296 1470530 1776128 1559 1714700 1884340 2232188 1220 3252707 3824185 5292434 593 923 439 4187 0 0 0 2 | arabidopsis_halleri 1570475 1900695 2679047 154 3094086 3655001 4789805 131 5321226 6333885 8063564 1054 3654 4023 4790 0 0 0 3 | arabidopsis_thaliana 672608 785618 940364 1740 2352451 2683600 3104925 1520 4981634 6024166 7221741 1858 125 538 710 0 0 0 4 | arabis_alpina 524160 567793 647642 89 2739457 2987187 3473786 120 2066221 2324126 2864631 1521 59 1518 1884 0 0 0 5 | brachypodium_distachyon 1845094 2029045 2304231 850 1760042 1926626 2169866 866 2611273 2927279 3426905 685 238 194 472 0 0 0 6 | brassica_rapa 538950 585848 687479 4471 2817901 3124200 3683004 6220 4912675 5518795 6472154 6468 186 913 1616 0 0 0 7 | camelina_sativa 1842346 2066011 2520913 7881 3716167 4206627 5054915 10127 6202631 7624495 10197074 11795 2071 2189 4698 0 0 0 8 | citrullus_lanatus 148192 157470 180678 2945 576000 607310 655784 2921 1066160 1110006 1192984 5528 56 57 118 0 0 0 9 | cucumis_melo 259787 293561 371604 10555 3436927 3947971 4962679 11628 2981354 3526419 4639412 8997 97 1846 1970 0 0 0 10 | helianthus_annuus 398948 465783 734515 16886 2090683 2278943 2619561 3448 7508292 8939808 12883915 16468 2697 484 21530 0 0 0 11 | malus_domestica_golden 376778 432810 610052 3411 3935807 4429259 5535300 3166 3867927 4423059 5775524 1968 1297 5594 7141 0 0 0 12 | olea_europaea_sylvestris 
980001 1091723 1331015 1033 1377986 1534403 1862652 1173 7938185 9078737 11806171 993 349 553 9723 0 0 0 13 | oryza_sativa 2349954 2699566 3289073 8470 3311311 3777267 4544331 9227 2940533 3479084 4502520 7125 1188 1482 2404 0 0 0 14 | prunus_dulcis 469297 595896 880989 3616 2892536 3443497 4662741 5042 2398511 2958175 4235468 4259 819 5091 5300 0 0 0 15 | rosa_chinensis 447598 531933 693111 7075 1947075 2268535 2882830 6659 4780031 5877362 8460802 6331 483 1619 9679 0 0 0 16 | setaria_viridis 1178837 1318310 1680782 1441 1303496 1453873 1826274 1441 3141321 3602375 4639356 832 2032 1903 4326 0 0 0 17 | trifolium_pratense 2193420 2395040 2720261 4109 1642125 1797835 2050302 4118 7826017 8494360 9563697 2113 730 582 2498 0 0 0 18 | triticum_turgidum 1020712 1109210 1327878 1930 1499577 1641559 1974249 1545 3050874 3558442 5031631 496 223 865 4691 0 0 0 19 | vitis_vinifera 328208 352166 397052 5808 2944335 3192184 3599005 5656 3224506 3666047 4398918 5065 47 340 397 0 0 0 20 | zea_mays 941465 1046159 1284341 3507 1092771 1211705 1488377 3544 4102855 5070164 7390280 751 858 1055 7903 0 0 0 21 | -------------------------------------------------------------------------------- /repeats/bench/log.wheat.Red.bed: -------------------------------------------------------------------------------- 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_arinalrfor -logic_name repeatdetector 2 | triticum_aestivum_arinalrfor genome_length 14659055505 3 | triticum_aestivum_arinalrfor repeatdetector 11909171698 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_jagger -logic_name repeatdetector 5 | triticum_aestivum_jagger genome_length 14552150998 6 | triticum_aestivum_jagger repeatdetector 11870523645 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_julius -logic_name 
repeatdetector 8 | triticum_aestivum_julius genome_length 14394882126 9 | triticum_aestivum_julius repeatdetector 11868610789 10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_lancer -logic_name repeatdetector 11 | triticum_aestivum_lancer genome_length 14293273119 12 | triticum_aestivum_lancer repeatdetector 11601105199 13 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_landmark -logic_name repeatdetector 14 | triticum_aestivum_landmark genome_length 14443802583 15 | triticum_aestivum_landmark repeatdetector 11849684520 16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mace -logic_name repeatdetector 17 | triticum_aestivum_mace genome_length 14362806306 18 | triticum_aestivum_mace repeatdetector 11692169342 19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mattis -logic_name repeatdetector 20 | triticum_aestivum_mattis genome_length 14955365424 21 | triticum_aestivum_mattis repeatdetector 11748313140 22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_norin61 -logic_name repeatdetector 23 | triticum_aestivum_norin61 genome_length 14157632112 24 | triticum_aestivum_norin61 repeatdetector 11766521326 25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_stanley -logic_name repeatdetector 26 | triticum_aestivum_stanley genome_length 14207638630 27 | triticum_aestivum_stanley repeatdetector 11923454722 28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_spelta -logic_name repeatdetector 29 | triticum_spelta genome_length 14453523434 30 | triticum_spelta repeatdetector 11727050595 31 | 
-------------------------------------------------------------------------------- /repeats/bench/log.wheat.redat.bed: -------------------------------------------------------------------------------- 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_arinalrfor -logic_name repeatmask_redat 2 | triticum_aestivum_arinalrfor genome_length 14659055505 3 | triticum_aestivum_arinalrfor repeatmask_redat 10173498584 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_jagger -logic_name repeatmask_redat 5 | triticum_aestivum_jagger genome_length 14552150998 6 | triticum_aestivum_jagger repeatmask_redat 10067215340 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_julius -logic_name repeatmask_redat 8 | triticum_aestivum_julius genome_length 14394882126 9 | triticum_aestivum_julius repeatmask_redat 10120102277 10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_lancer -logic_name repeatmask_redat 11 | triticum_aestivum_lancer genome_length 14293273119 12 | triticum_aestivum_lancer repeatmask_redat 9923751623 13 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_landmark -logic_name repeatmask_redat 14 | triticum_aestivum_landmark genome_length 14443802583 15 | triticum_aestivum_landmark repeatmask_redat 10047469943 16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mace -logic_name repeatmask_redat 17 | triticum_aestivum_mace genome_length 14362806306 18 | triticum_aestivum_mace repeatmask_redat 9997367079 19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mattis -logic_name repeatmask_redat 20 | 
triticum_aestivum_mattis genome_length 14955365424 21 | triticum_aestivum_mattis repeatmask_redat 10037233281 22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_norin61 -logic_name repeatmask_redat 23 | triticum_aestivum_norin61 genome_length 14157632112 24 | triticum_aestivum_norin61 repeatmask_redat 10067342383 25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_stanley -logic_name repeatmask_redat 26 | triticum_aestivum_stanley genome_length 14207638630 27 | triticum_aestivum_stanley repeatmask_redat 10114891908 28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_spelta -logic_name repeatmask_redat 29 | triticum_spelta genome_length 14453523434 30 | triticum_spelta repeatmask_redat 10021321896 31 |
--------------------------------------------------------------------------------
/repeats/bench/pfam/enrich.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# based on
# https://github.com/eead-csic-compbio/get_homologues/blob/master/pfam_enrich.pl
#
# Fisher exact test for Pfam domain enrichment, with multiple-test adjustment.
#
# Usage: Rscript enrich.R <query_file> <control_file> <out_file>
#   query_file   : TSV without header; column 1 = domain id, column 2 = count
#                  in the experiment set
#   control_file : TSV with the same layout holding the control counts
#   out_file     : one row per query domain is APPENDED to this file:
#                  id, raw p-value, adjusted p-value
# NOTE: out_file is opened in append mode and never truncated, so remove it
# before re-running with the same name to avoid duplicated rows.

args = commandArgs(trailingOnly=TRUE)

if(length(args)<3) {
  stop("# Usage: filenames", call.=FALSE)
}

# globals
direction="greater" # or "two.sided"; passed as 'alternative' to fisher.test
multitest="fdr"     # method passed to p.adjust
verbose=FALSE       # TRUE prints intermediate values to stderr

##query_file="aegilops_tauschii.Red.genes.ovl50.tsv"
#control_file="aegilops_tauschii.tsv.count.tsv"
#out_file="kk"

query_file=args[1]
control_file=args[2]
out_file=args[3]

# parse data
que_data=read.csv(query_file, sep="\t", header=FALSE);
que_rows=nrow(que_data);
ref_data=read.csv(control_file, sep="\t", header=FALSE);
ref_rows=nrow(ref_data);

# Runs one Fisher exact test for a single row of que_data.
#   x : one row as handed over by apply(); x[1] is the id, x[2] its count
# Returns c(id, p-value); apply() turns these into a 2-row matrix.
# Uses globals ref_data, que_total, ref_total, direction, verbose.
enrich_test <- function(x){
  que_id=x[1];
  que_value=as.numeric(x[2]);
  ref_value=as.numeric(ref_data[ref_data$V1==que_id,2]);
  # ids absent from the control file count as zero occurrences
  if (length(ref_value)==0){
    ref_value=0;
  }
  if(verbose){
    cat(paste(que_id, "\n"), file=stderr());
  }
  # 2x2 table filled column-wise: first column = counts of this id,
  # second column = grand totals; rows = experiment, control
  values=c(que_value, ref_value, que_total, ref_total);
  input_matrix=matrix(values, nrow = 2,
    dimnames=list(c("exp", "control"), c("Pfam", "total")));
  if(verbose){
    cat(paste(input_matrix, "\n"), file=stderr());
  }
  fisher_htest=fisher.test(input_matrix, alternative=direction);
  if(verbose){
    cat(paste(fisher_htest, "\n"), file=stderr());
  }
  ret_value=c( que_id, fisher_htest$p.value );
  return(ret_value);
}

# Appends one result row (id, p-value, adjusted p-value) to out_file.
# paste() joins its arguments with a single space, so each tab in the
# output is surrounded by spaces; the perl -lane consumers in the HOWTO
# split on any whitespace and are not affected.
print_pvalues <- function(x) {
  cat(paste(x[1],"\t",x[2],"\t",x[3],"\n"),file=out_file,append=TRUE);
}

que_total=sum(que_data[,2]);
ref_total=sum(ref_data[,2]);
pvalues=apply(que_data, 1, enrich_test);

## Multiple test adjustment
num_pvalues=as.numeric(pvalues[2,]);
adj_pvalues=p.adjust(num_pvalues, method=multitest);
result_pvalues=rbind(pvalues, adj_pvalues);
result_pvalues=t(result_pvalues); # rows to columns
apply(result_pvalues, 1, print_pvalues);
--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/HOWTO.txt:
--------------------------------------------------------------------------------

## gene/exon overlap stats

# All loops read list.cores.sp, two columns per line:
#   col1 = core db name (prefix of the BED files under ../bed)
#   col2 = species name; ${col2^} upper-cases its first letter to build the
#          RepeatModeller BED file name
# With 'bedtools intersect -wo' the last output field is the number of
# overlapping bases, hence the different $F[] indices below depending on how
# many columns the -a and -b files have.

# 1st genes

# MINOVER must be exported: the perl one-liners read it through %ENV
export MINOVER=50

while read -r col1 col2; do
    rm=$(bedtools intersect -a ../bed/${col1}.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo |
        perl -lane '$over+=$F[7]; END{print $over}')
    genes=$(bedtools intersect -a ../bed/${col1}.bed -b ../bed/${col1}.bed -sorted -wo |
        perl -lane '$over+=$F[8]; END{print $over}')
    read tgenes filename <<< $(wc -l ../bed/${col1}.bed)
    # species, number of genes, self-overlap of genes, overlap with repeats
    printf "%s\t%d\t%d\t%d\n" $col2 $tgenes $genes $rm

    # save overlapped genes
    bedtools intersect -a ../bed/${col1}.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo > bed/${col2}.repeatmodeller.genes.bed

    # MINOVER is required
    # It is read from the environment: inside the single quotes the shell
    # does not expand ${MINOVER} and perl would see an undefined variable,
    # turning the threshold into 0.
    perl -lane 'print $F[3] if($F[7] > $ENV{"MINOVER"})' bed/${col2}.repeatmodeller.genes.bed |
        uniq > bed/${col2}.repeatmodeller.genes.ovl${MINOVER}.list

    # tally Pfam domain occurrences
    if [ ! -f "pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.enrich.tsv" ];
    then
        while read -r ln; do grep -F "$ln" ../pfam/${col2}.tsv; done < bed/${col2}.repeatmodeller.genes.ovl${MINOVER}.list |
            cut -f 2 | sort | uniq -c |
            perl -lane 'if(/PF/){ print "$F[1]\t$F[0]" }' > pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.tsv

        # compute enrichment
        Rscript ../pfam/enrich.R pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.tsv ../pfam/${col2}.tsv.count.tsv pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.all.tsv
        # keep rows whose 3rd field (adjusted p-value) is below 0.05
        perl -lane 'print if($F[2] < 0.05)' pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.all.tsv > pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.enrich.tsv
    else
        continue
    fi

done < list.cores.sp > log.genes


# now exons
while read -r col1 col2; do
    rm=$(bedtools intersect -a ../bed/${col1}.exon.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo |
        perl -lane '$over+=$F[7]; END{print $over}')
    exons=$(bedtools intersect -a ../bed/${col1}.exon.bed -b ../bed/${col1}.exon.bed -sorted -wo |
        perl -lane '$over+=$F[8]; END{print $over}')
    read texons filename <<< $(wc -l ../bed/${col1}.exon.bed)
    printf "%s\t%d\t%d\t%d\n" $col2 $texons $exons $rm

    # save overlapped exons
    bedtools intersect -a ../bed/${col1}.exon.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo > bed/${col2}.repeatmodeller.exons.bed

    # same MINOVER filter as for genes, threshold taken from the environment
    # NOTE(review): this list is named "exons.${MINOVER}list", unlike the
    # "genes.ovl${MINOVER}.list" pattern used above; name kept as it was
    perl -lane 'print $F[3] if($F[7] > $ENV{"MINOVER"})' bed/${col2}.repeatmodeller.exons.bed |
        uniq > bed/${col2}.repeatmodeller.exons.${MINOVER}list

done < list.cores.sp > log.exons

# now up & downstream
while read -r col1 col2; do
    rm=$(bedtools intersect -a ../bed/${col1}.updown500.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    updown=$(bedtools intersect -a ../bed/${col1}.updown500.bed -b ../bed/${col1}.updown500.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    printf "%s\t%d\t%d\n" $col2 $updown $rm
done < list.cores.sp > log.updown500

# -> gene_exon_updown_plot


## check #copies, length of repeats and overlap among methods

# repeat length stats
# each output row: species, number of repeats ("Read N items" reported by
# scan) and the integer part of the median length printed by R
while read -r col1 col2; do
    printf "%s" "$col2"
    perl -lane 'print $F[2]-$F[1]' ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed |
        Rscript -e 'median(scan(file="stdin"))' 2>&1 |
        perl -ne 'if(/Read (\d+) items/){ print "\t$1" } elsif(/\[1\] (\d+)/){ print "\t$1\n" }'

done < list.cores.sp > log.repeat.length

# repeat length stats (N50)
# lengths are visited from longest to shortest; the length at which the
# running sum first exceeds half of the total is printed
while read -r col1 col2; do
    printf "%s" "$col2"
    perl -ane '$l=$F[2]-$F[1]; $TL+=$l; $R{$.}=$l; END{ foreach $s (sort {$R{$b}<=>$R{$a}} keys(%R)){ $t+=$R{$s}; if($t>$TL/2){ print "\t$R{$s}\n"; exit }}}' ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed

done < list.cores.sp > log.repeat.N50

# check Pfam enrichment of gene overlap

cut -f 1 pfam/*repeatmodeller*enrich.tsv | sort | uniq -c | sort -nr |
    perl -lane 'if(/PF/){ print "$F[1]\t$F[0]" }' > pfam/enrich.repeatmodeller.tsv

# Red repeat overlap vs others
# The species name is written once, by the final printf; printing it here as
# well produced rows starting with the name twice and no separator.
while read -r col1 col2; do
    red=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.Red.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    rm=$(bedtools intersect -b ../bed/${col2}.Red.bed -a ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    nrplants=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.repeatmask_nrplants.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    redat=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.repeatmask_redat.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    dust=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.dust.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    trf=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.trf.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    totdust=$(bedtools intersect -a ../bed/${col2}.dust.bed -b ../bed/${col2}.dust.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')
    tottrf=$(bedtools intersect -a ../bed/${col2}.trf.bed -b ../bed/${col2}.trf.bed -sorted -wo |
        perl -lane '$over+=$F[6]; END{print $over}')

    # column order: species red rm redat nrplants dust trf totdust tottrf
    printf "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" $col2 $red $rm $redat $nrplants $dust $trf $totdust $tottrf

done < list.cores.sp > log.repeat.overlap


## check overlap with denovo called Rgenes (NLR-annotator)

while read -r col1 col2; do
    rm=$(bedtools intersect -a ../denovoRgenes/${col2}.nlr.bed.sorted -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo |
        perl -lane '$over+=$F[9]; END{print $over}')
    genes=$(bedtools intersect -a ../denovoRgenes/${col2}.nlr.bed.sorted -b ../denovoRgenes/${col2}.nlr.bed.sorted -sorted -wo |
        perl -lane '$over+=$F[12]; END{print $over}')
    read tgenes filename <<< $(wc -l ../denovoRgenes/${col2}.nlr.bed.sorted)
    printf "%s\t%d\t%d\t%d\n" $col2 $tgenes $genes $rm

done < list.cores.sp > log.Rgenes

# same as above, but only overlaps longer than MINOVER are summed
while read -r col1 col2; do
    rm=$(bedtools intersect -a ../denovoRgenes/${col2}.nlr.bed.sorted -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo |
        perl -lane 'next if($F[9] <= $ENV{"MINOVER"}); $over+=$F[9]; END{print $over}')
    genes=$(bedtools intersect -a ../denovoRgenes/${col2}.nlr.bed.sorted -b ../denovoRgenes/${col2}.nlr.bed.sorted -sorted -wo |
        perl -lane '$over+=$F[12]; END{print $over}')
    read tgenes filename <<< $(wc -l ../denovoRgenes/${col2}.nlr.bed.sorted)
    printf "%s\t%d\t%d\t%d\n" $col2 $tgenes $genes $rm

done < list.cores.sp > log.Rgenes.$MINOVER
-------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/list.cores.sp: -------------------------------------------------------------------------------- 1 | aegilops_tauschii_core_48_101_3 aegilops_tauschii 2 | arabidopsis_thaliana_core_48_101_11 arabidopsis_thaliana 3 | brachypodium_distachyon_core_48_101_4 brachypodium_distachyon 4 | brassica_rapa_core_48_101_1 brassica_rapa 5 | camelina_sativa_core_48_101_1 camelina_sativa 6 | cucumis_melo_core_48_101_1 cucumis_melo 7 | prunus_dulcis_core_48_101_1 prunus_dulcis 8 | rosa_chinensis_core_48_101_1 rosa_chinensis 9 | vitis_vinifera_core_48_101_3 vitis_vinifera 10 | zea_mays_core_48_101_7 zea_mays 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.Rgenes: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 916 3144217 425022 2 | arabidopsis_thaliana 69 201660 66316 3 | brachypodium_distachyon 344 1249602 266009 4 | brassica_rapa 219 729119 431797 5 | camelina_sativa 573 1808571 906291 6 | cucumis_melo 89 294239 107550 7 | prunus_dulcis 387 1354547 700937 8 | rosa_chinensis 963 3347977 2294566 9 | vitis_vinifera 739 2723777 1902623 10 | zea_mays 158 487237 76561 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.Rgenes.50: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 916 3144217 416193 2 | arabidopsis_thaliana 69 201660 65934 3 | brachypodium_distachyon 344 1249602 262733 4 | brassica_rapa
219 729119 430033 5 | camelina_sativa 573 1808571 901520 6 | cucumis_melo 89 294239 106155 7 | prunus_dulcis 387 1354547 695038 8 | rosa_chinensis 963 3347977 2284956 9 | vitis_vinifera 739 2723777 1893319 10 | zea_mays 158 487237 75567 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.exons: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 167903 68899123 4177668 2 | arabidopsis_thaliana 145966 46554976 2523410 3 | brachypodium_distachyon 152258 68995392 8240434 4 | brassica_rapa 206588 48263240 4524180 5 | camelina_sativa 481073 132504580 17860756 6 | cucumis_melo 137797 41060653 4349419 7 | prunus_dulcis 134744 47752477 6017017 8 | rosa_chinensis 174074 74446167 15845592 9 | vitis_vinifera 147613 39848131 5441784 10 | zea_mays 206593 63390920 5075327 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.genes: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 39614 348459650 148594336 2 | arabidopsis_thaliana 27628 67504196 3271128 3 | brachypodium_distachyon 34310 122507803 15431148 4 | brassica_rapa 41018 83240510 11035129 5 | camelina_sativa 89275 215022170 27495248 6 | cucumis_melo 28299 99772108 21734761 7 | prunus_dulcis 27963 94240163 14257008 8 | rosa_chinensis 45464 117342283 22983889 9 | vitis_vinifera 29927 153564951 50510060 10 | zea_mays 39583 168056455 42598368 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.repeat.N50: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 7894 2 | arabidopsis_thaliana 1178 3 | brachypodium_distachyon 2125 4 | brassica_rapa 628 5 | camelina_sativa 1105 6 | cucumis_melo 1819 7 | prunus_dulcis 1422 8 | rosa_chinensis 1958 9 | vitis_vinifera 1604 10 | zea_mays 
11380 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.repeat.length: -------------------------------------------------------------------------------- 1 | aegilops_tauschii 1758407 475 2 | arabidopsis_thaliana 72138 57 3 | brachypodium_distachyon 222710 135 4 | brassica_rapa 303119 147 5 | camelina_sativa 611700 140 6 | cucumis_melo 407579 136 7 | prunus_dulcis 243499 108 8 | rosa_chinensis 499475 188 9 | vitis_vinifera 496352 175 10 | zea_mays 853432 270 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.repeat.overlap: -------------------------------------------------------------------------------- 1 | aegilops_tauschiiaegilops_tauschii 3454963624 3343466712 2833915889 2871180926 76588631 161578261 201668761 183619439 2 | arabidopsis_thalianaarabidopsis_thaliana 43943711 14661371 11477631 20623470 5229908 2896188 6146057 3361091 3 | brachypodium_distachyonbrachypodium_distachyon 84292420 74811547 63878923 63001822 2649893 3750379 6329521 6241890 4 | brassica_rapabrassica_rapa 93148976 63192385 12137250 53471309 18246330 8884888 33405301 10922095 5 | camelina_sativacamelina_sativa 230739085 165779082 72221094 100798661 20302963 16816272 83987132 23257791 6 | cucumis_melocucumis_melo 142860741 134727620 21541420 122388576 16345201 15266153 33077764 19515094 7 | prunus_dulcisprunus_dulcis 75942705 66309477 9724644 62228480 6821347 7159674 16557179 9697037 8 | rosa_chinensisrosa_chinensis 247911427 223060560 35147667 89040006 11590565 14996352 22275228 21000797 9 | vitis_viniferavitis_vinifera 194397642 175266445 30845098 160405750 28366556 19834542 63337140 26244040 10 | zea_mayszea_mays 1687073942 1622632631 1238165526 1300760021 32339911 68910571 55601139 77759381 11 | -------------------------------------------------------------------------------- /repeats/bench/repeatmodeller/log.updown500: 
-------------------------------------------------------------------------------- 1 | aegilops_tauschii 37802544 10572939 2 | arabidopsis_thaliana 25181016 1793529 3 | brachypodium_distachyon 33409828 6752027 4 | brassica_rapa 45594270 6041729 5 | camelina_sativa 92778852 13652759 6 | cucumis_melo 29465346 9226572 7 | prunus_dulcis 28912163 6906338 8 | rosa_chinensis 44161039 13176712 9 | vitis_vinifera 30402773 7477692 10 | zea_mays 39795578 10459637 11 |
--------------------------------------------------------------------------------
/repeats/get_repeats_ensembl.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Script to download annotated repeated elements
# of a selected species in Ensembl Plants

# Required binaries: wget, sort, perl, mysql, bedtools

# Copyright [2020-21] EMBL-European Bioinformatics Institute

# documentation about Ensembl schemas can be found at
# http://www.ensembl.org/info/docs/api/index.html

# Output: ${SPECIES}.repeats.nondeg.fasta in the current directory
# Exit codes: 0 success (also when called without arguments),
#             1 release or species core db not found,
#             2 no repeats left after removing genes,
#             3 genome download failed

# a pipeline fails if any of its stages fails; needed to detect a failed
# download in step 6 (no 'set -e', so other pipelines are unaffected)
set -o pipefail

if [[ $# -eq 0 ]] ; then
	echo "# example usage: $0 arabidopsis_thaliana"
	exit 0
else
	SPECIES=$1
fi

# PARAMS
MINLEN=90         # repeats must be longer than this (bp)
MAXDEGENPERC=10   # max % of N/n allowed in a repeat sequence
MAXIDFRAC=0.95    # not used anywhere below
DEBUG=0           # set to non-zero to keep temp files

# SERVER DETAILS
FTPSERVER="ftp://ftp.ensemblgenomes.org/pub"
DIV=plants
SERVER=mysql-eg-publicsql.ebi.ac.uk
# named DBUSER so that the login variable USER is not overwritten
DBUSER=anonymous
PORT=4157

## 1) get Ensembl Plants current release number from FTP server
# Note: wget is used, this can be modified to use alternatives ie curl
SUMFILE="${FTPSERVER}/${DIV}/current/summary.txt"
RELEASE=$(wget --quiet -O - "$SUMFILE" | \
	perl -lne 'if(/Release (\d+) of Ensembl/){ print $1 }')

# stop here if the release could not be parsed, otherwise the arithmetic
# below would silently produce a negative Ensembl Genomes release
if ! [[ "$RELEASE" =~ ^[0-9]+$ ]]; then
	echo "# ERROR: cannot get current release from $SUMFILE"
	exit 1
fi

# work out Ensembl Genomes release
EGRELEASE=$(( RELEASE - 53 ));

## 2) select core db matching selected species
# anchored at line start and limited to the first hit, so that a single
# database name is passed to the mysql calls below
SPECIESCORE=$(mysql --host $SERVER --user $DBUSER --port $PORT \
	-e "show databases" | grep -m 1 "^${SPECIES}_core_${EGRELEASE}_${RELEASE}")

if [ -z "$SPECIESCORE" ]; then
	echo "# ERROR: cannot find species $SPECIES"
	exit 1
else
	echo "# Ensembl core db: $SPECIESCORE";
fi

## 3) retrieve 1-based coords of repeats

# note these might be redundant/overlapping
#1 3 106 trf
#1 4 91 trf

# NOTE(review): 'sort -u' together with -k compares only the key fields, so
# of several repeats sharing sequence name and start a single one is kept
mysql --host $SERVER --user $DBUSER --port $PORT "$SPECIESCORE" -Nb -e \
	"SELECT sr.name,r.seq_region_start,r.seq_region_end,rc.repeat_class \
	FROM repeat_feature r JOIN seq_region sr JOIN repeat_consensus rc \
	WHERE r.seq_region_id=sr.seq_region_id \
	AND r.repeat_consensus_id=rc.repeat_consensus_id \
	AND (rc.repeat_class <> 'Unspecified' AND rc.repeat_class <> \
	'repeatdetector' AND rc.repeat_class <> 'tallymer') \
	AND (r.seq_region_end-r.seq_region_start+1) > $MINLEN" | \
	sort -u -k1,1 -k2,2n > "_${SPECIES}.repeats1.bed"

## 4) retrieve 1-based coords of genes
mysql --host $SERVER --user $DBUSER --port $PORT "$SPECIESCORE" -Nb -e \
	"SELECT sr.name,g.seq_region_start,g.seq_region_end,g.stable_id \
	FROM gene g JOIN seq_region sr \
	WHERE g.seq_region_id=sr.seq_region_id" | \
	sort -k1,1 -k2,2n > "_${SPECIES}.genes1.bed"

## 5) convert to 0-based BED format and curate repeats by
## subtracting annotated genes
# Both inputs are converted BEFORE the subtraction: bedtools reads intervals
# as 0-based half-open, so subtracting 1-based coordinates and shifting the
# start afterwards leaves one gene base at every boundary created by a gene.
# Shifting all starts by one does not change the sort order.
perl -lane '$F[1]-=1; print join("\t",@F)' "_${SPECIES}.repeats1.bed" \
	> "_${SPECIES}.repeats0.bed"
perl -lane '$F[1]-=1; print join("\t",@F)' "_${SPECIES}.genes1.bed" \
	> "_${SPECIES}.genes0.bed"

bedtools subtract -sorted \
	-a "_${SPECIES}.repeats0.bed" -b "_${SPECIES}.genes0.bed" > \
	"_${SPECIES}.repeats.bed"

if [ ! -s "_${SPECIES}.repeats.bed" ]; then
	echo "# no repeats found"
	exit 2
fi

## 6) download and uncompress genomic sequence
# the file name contains wildcards, URL is quoted so that they reach wget
FASTA="*${SPECIES^}*.dna.toplevel.fa.gz"
URL="${FTPSERVER}/${DIV}/current/fasta/${SPECIES}/dna/${FASTA}"
if [ ! -s "_${SPECIES}.toplevel.fasta" ]; then
	echo "# downloading $URL"
	if ! wget -c "$URL" -O- | gunzip > "_${SPECIES}.toplevel.fasta"; then
		# do not leave a partial file that a later run would re-use
		echo "# ERROR: cannot download $URL"
		rm -f "_${SPECIES}.toplevel.fasta"
		exit 3
	fi
else
	echo "# re-using _${SPECIES}.toplevel.fasta"
fi

## 7) extract repeat sequences
bedtools getfasta -name -fi "_${SPECIES}.toplevel.fasta" -bed "_${SPECIES}.repeats.bed" > \
	"_${SPECIES}.repeats.fasta"

## 8) eliminate degenerate (MAXDEGENPERC) repeat sequences
# sequences are grouped by header, N/n are counted with tr, and a sequence is
# printed only if its percentage of N/n is <= maxdeg; empty sequences are
# skipped to avoid a division by zero
cat "_${SPECIES}.repeats.fasta" | \
	perl -slne 'if(/^(>.*)/){$h=$1} else {$fa{$h}.=$_} END{ foreach $h (keys(%fa)){ $l=length($fa{$h}); $dg=($fa{$h}=~tr/Nn//); print "$h\n$fa{$h}" if($l>0 && 100*$dg/$l<=$maxdeg) }}' \
	-- -maxdeg=$MAXDEGENPERC > "${SPECIES}.repeats.nondeg.fasta"

## 9) clean temp files
# this also removes the downloaded genome, which is therefore only re-used
# on later runs when DEBUG is set to a non-zero value
if [ -z "$DEBUG" ] || [ "$DEBUG" -eq "0" ]; then
	echo
	echo "# removing temp files";
	rm _${SPECIES}.*.bed _${SPECIES}.*.fasta _${SPECIES}.*.fai
fi

exit 0
--------------------------------------------------------------------------------