├── .github
    └── workflows
    │   ├── ci.yml
    │   └── notice.yml
├── LICENSE
├── Makefile
├── NOTICE
├── README.md
├── demo_test.t
├── files
    ├── Arabidopsis_thaliana.fna.gz
    ├── EnsemblPlants47.png
    ├── runtime_ram.png
    └── test_transcripts.fna
├── install_R_deps.R
├── lib
    ├── R
    │   └── README.md
    ├── cpanfileEnsembl
    ├── cpanfileREST
    └── requirements.txt
├── pangenes
    ├── CHANGES.txt
    ├── HPC.conf.sample
    ├── HPC.conf.sample.slurm
    ├── README.md
    ├── _chunk_chr.pl
    ├── _cluster_analysis.pl
    ├── _collinear_genes.pl
    ├── _cut_sequences.pl
    ├── _dotplot.pl
    ├── asciinema.txt
    ├── bin
    │   └── README.md
    ├── check_evidence.pl
    ├── check_quality.pl
    ├── cpanfile
    ├── get_pangenes.pl
    ├── lib
    │   ├── HPCluster.pm
    │   └── pangeneTools.pm
    ├── match_cluster.pl
    ├── pics
    │   ├── collinear_pangenes_minimap2.png
    │   ├── fixing_genemodels.png
    │   ├── flow-check-evidence.dia
    │   ├── flow-check-evidence.png
    │   ├── flow-get-pangenes.dia
    │   ├── flow-get-pangenes.png
    │   ├── long_model.png
    │   ├── pairs2clusters.png
    │   ├── pangene_set_nomenclature.png
    │   ├── pangenesPAG2023.pdf
    │   ├── wgaoverlap.dia
    │   └── wgaoverlap.png
    ├── plots
    │   ├── core_gene.tab_core_both.png
    │   ├── dotplot.png
    │   ├── haplotypes.trimmed.png
    │   ├── pan_gene.tab_pan.png
    │   ├── pangene_context.png
    │   └── pangene_matrix__shell.png
    └── rename_pangenes.pl
├── phylogenomics
    ├── Oryza.log
    ├── PlantCompUtils.pm
    ├── README.md
    ├── TODO.txt
    ├── downloads
    │   └── README.txt
    ├── ens_sequences.pl
    ├── ens_single-copy_core_genes.pl
    ├── ens_syntelogs.pl
    └── phylo_test.t
├── recipes
    ├── exampleAPI.pl
    ├── exampleBiomart.R
    ├── exampleCRAM.pl
    ├── exampleFTP.sh
    ├── exampleMySQL.sh
    ├── exampleREST.R
    ├── exampleREST.pl
    ├── exampleREST.py
    └── exampleVEP.sh
└── repeats
    ├── AnnotRedRepeats.py
    ├── README.md
    ├── Red2Ensembl.py
    ├── bench
        ├── README.md
        ├── list.Red
        ├── list.cores
        ├── list.cores.sp
        ├── list.cores.sp.toplevel
        ├── list.cores.wheat
        ├── list.toplevel
        ├── log.Rgenes.50
        ├── log.exons
        ├── log.gc
        ├── log.genes
        ├── log.nrplants.bed
        ├── log.redat.bed
        ├── log.repeat.N50
        ├── log.repeat.length
        ├── log.repeat.overlap
        ├── log.updown500
        ├── log.updown500.16mer
        ├── log.updown500.21mer
        ├── log.updown500.31mer
        ├── log.wheat.Red.bed
        ├── log.wheat.redat.bed
        ├── pfam
        │   └── enrich.R
        └── repeatmodeller
        │   ├── HOWTO.txt
        │   ├── list.cores.sp
        │   ├── log.Rgenes
        │   ├── log.Rgenes.50
        │   ├── log.exons
        │   ├── log.genes
        │   ├── log.repeat.N50
        │   ├── log.repeat.length
        │   ├── log.repeat.overlap
        │   └── log.updown500
    └── get_repeats_ensembl.sh


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | # See the NOTICE file distributed with this work for additional information
 2 | # regarding copyright ownership.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | name: "CI"
17 | 
18 | on:
19 |   push:
20 |     branches:
21 |       - master
22 |   pull_request:
23 | 
24 | defaults:
25 |   run:
26 |     working-directory: ./
27 | 
28 | jobs:
29 |   tests:
30 |     runs-on: ubuntu-latest
31 | 
32 |     steps:
33 |       - uses: actions/checkout@v4
34 | 
35 |       - uses: shogo82148/actions-setup-perl@v1
36 |         with:
37 |           perl-version: "5.28"
38 | 
39 |       - name: Install dependencies
40 |         run: |
41 |           sudo apt-get update
42 |           sudo apt-get install -y wget python3 python3-pip python3-setuptools mysql-client libmysqlclient-dev libdb-dev g++-10 bedtools r-base
43 |           make install_REST
44 |           make install_ensembl
45 |           make install_repeats
46 |           make install_pangenes
47 |           # A plain `export` is lost when this step's shell exits; GITHUB_ENV carries PERL5LIB into later steps.
48 |           echo "PERL5LIB=$PWD/lib${PERL5LIB:+:$PERL5LIB}" >> "$GITHUB_ENV"
49 |         shell: bash
50 | 
51 |       - name: Run tests
52 |         run: |
53 |           make test_travis
54 |           make test_repeats_travis
55 |           make test_pangenes
56 | 


--------------------------------------------------------------------------------
/.github/workflows/notice.yml:
--------------------------------------------------------------------------------
 1 | # See the NOTICE file distributed with this work for additional information
 2 | # regarding copyright ownership.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | name: "Update NOTICE copyright year at the start of every year"
17 | 
18 | on:
19 |   schedule:
20 |     - cron:  '15 15 1 1 *'
21 | 
22 | jobs:
23 |   notice_update:
24 |     name: Update NOTICE copyright year
25 |     runs-on: ubuntu-latest
26 |     # The last step commits and pushes NOTICE, so GITHUB_TOKEN needs write
27 |     # access to repository contents; declare it instead of relying on the
28 |     # repository-level default, which may be read-only.
29 |     permissions:
30 |       contents: write
31 | 
32 |     steps:
33 |       - uses: actions/checkout@v4
34 | 
35 |       # Replace the first occurrence of last year's number on each NOTICE line
36 |       # with the current year (e.g. the closing year of a "[2016-YYYY]" range).
37 |       - name: Update NOTICE file
38 |         run: |
39 |           sed -i "s/$(date +%Y --date='1 year ago')/$(date +%Y)/" NOTICE
40 | 
41 |       # Commit only the NOTICE file, using the message below.
42 |       - uses: EndBug/add-and-commit@v9
43 |         with:
44 |           add: 'NOTICE'
45 |           message: 'Update NOTICE copyright year'
46 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | 
  2 | minimap2release = 2.24
  3 | gffreadrelease  = 0.12.7
  4 | gmaprelease     = 2021-12-17
  5 | clustalorelease = 1.2.4
  6 | alistatrelease  = 1.14
  7 | 
  8 | test:
  9 | 	perl demo_test.t
 10 | 
 11 | test_travis:
 12 | 	perl demo_test.t travis
 13 | 
 14 | clean:
 15 | 	rm -f *rachypodium* && rm -f Compara*gz 
 16 | 	rm -f new_genomes.txt && rm -f uniprot_report_EnsemblPlants.txt
 17 | 	rm -f arabidopsis_thaliana*.tar.gz
 18 | 	rm -f plants_species-tree*.nh
 19 | 	rm -f oryza_sativa*
 20 | 
 21 | install:
 22 | 	-sudo apt install -y wget mysql-client libmysqlclient-dev libdb-dev bedtools pip cpanminus
 23 | 
 24 | install_REST:
 25 | 	cpanm --local-lib lib --installdeps --notest --cpanfile lib/cpanfileREST .
 26 | 	pip3 install --user requests
 27 | 
 28 | install_biomart_r:
 29 | 	Rscript install_R_deps.R
 30 | 
 31 | install_ensembl:
 32 | 	cpanm --local-lib lib --installdeps --notest --cpanfile lib/cpanfileEnsembl .
 33 | 	cd lib && git clone https://github.com/Ensembl/ensembl.git
 34 | 	cd lib && git clone https://github.com/Ensembl/ensembl-variation.git
 35 | 	cd lib && git clone https://github.com/Ensembl/ensembl-funcgen.git
 36 | 	cd lib && git clone https://github.com/Ensembl/ensembl-compara.git
 37 | 	cd lib && git clone https://github.com/Ensembl/ensembl-metadata.git
 38 | 	cd lib && git clone -b release-1-6-924 --depth 1 https://github.com/bioperl/bioperl-live.git
 39 | 
 40 | install_minimap2:
 41 | 	if [ ! -d "lib/minimap2" ]; then \
 42 | 		cd lib && wget https://github.com/lh3/minimap2/releases/download/v${minimap2release}/minimap2-${minimap2release}.tar.bz2 && \
 43 | 			tar xfj minimap2-${minimap2release}.tar.bz2 && cd minimap2-${minimap2release} && make && cd .. && \
 44 | 			rm -f minimap2-${minimap2release}.tar.bz2 && ln -fs minimap2-${minimap2release} minimap2; \
 45 | 	fi
 46 | 
 47 | install_Red:
 48 | 	cd lib && git clone https://github.com/EnsemblGenomes/Red.git && cd Red/src_2.0 && make bin && make
 49 | 	#in case you need to use an alternative g++ compiler
 50 |         #cd lib && git clone https://github.com/EnsemblGenomes/Red.git && cd Red/src_2.0 && make bin && make CXX=g++-10
 51 | 
 52 | install_repeats: install_minimap2 install_Red
 53 | 	pip3 install --user -r lib/requirements.txt
 54 | 	cd files && wget -c https://github.com/Ensembl/plant-scripts/releases/download/v0.3/nrTEplantsJune2020.fna.bz2 && bunzip2 nrTEplantsJune2020.fna.bz2
 55 | 
 56 | install_redat:
 57 | 	cd files && wget -c ftp://ftpmips.helmholtz-muenchen.de/plants/REdat/mipsREdat_9.3p_ALL.fasta.gz && gunzip mipsREdat_9.3p_ALL.fasta.gz
 58 | 
 59 | test_repeats_travis:
 60 | 	cd repeats && ./Red2Ensembl.py ../files/Arabidopsis_thaliana.fna.gz test_Atha_chr4 --msk_file Atha.sm.fna
 61 | 
 62 | test_repeats:
 63 | 	cd repeats && ./Red2Ensembl.py ../files/Arabidopsis_thaliana.fna.gz test_Atha_chr4 --msk_file Atha.sm.fna && \
 64 | 		./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna test_Atha_chr4 --bed_file test.nrTEplants.bed
 65 | 
 66 | uninstall_repeats:
 67 | 	cd files && rm -rf nrTEplantsJune2020.fna*
 68 | 	cd lib && rm -rf Red minimap2-${minimap2release} minimap2
 69 | 
 70 | clean_repeats:
 71 | 	cd repeats && rm -rf test_Atha_chr4 Atha.sm.fna test.nrTEplants.bed
 72 | 
 73 | # gmap takes several minutes to compile
 74 | install_gmap: 
 75 | 	cd pangenes/bin && wget http://research-pub.gene.com/gmap/src/gmap-gsnap-${gmaprelease}.tar.gz && tar xfz gmap-gsnap-${gmaprelease}.tar.gz && \
 76 | 		cd gmap-${gmaprelease} && ./configure --prefix=${PWD}/pangenes/bin/gmap-${gmaprelease}/exe && \
 77 | 		make && make install && cd .. && rm -rf gmap-gsnap-${gmaprelease}.tar.gz && ln -fs gmap-${gmaprelease} gmap
 78 | 	
 79 | install_gffread:
 80 | 	cd pangenes/bin && wget https://github.com/gpertea/gffread/releases/download/v${gffreadrelease}/gffread-${gffreadrelease}.tar.gz && \
 81 | 		tar xfz gffread-${gffreadrelease}.tar.gz && cd gffread-${gffreadrelease} && make && cd .. && \
 82 | 		rm -f gffread-${gffreadrelease}.tar.gz && ln -fs gffread-${gffreadrelease} gffread
 83 | 
 84 | install_pangenes: install_minimap2 install_gffread install_gmap
 85 |         # core perl modules, DB_File not installed in Travis
 86 | 	cpanm -v --installdeps --notest --cpanfile pangenes/cpanfile .
 87 | 	cd files && wget -c https://github.com/Ensembl/plant-scripts/releases/download/v0.4/test_rice.tgz && tar xfz test_rice.tgz && rm -f test_rice.tgz
 88 | 
 89 | # clones and builds wfmash in pangenes/bin (apt prerequisites need sudo, -y avoids the prompt); see https://github.com/ekg/wfmash for other options
 90 | install_wfmash:
 91 | 	-sudo apt install -y cmake libjemalloc-dev zlib1g-dev libgsl-dev libhts-dev
 92 | 	cd pangenes/bin && git clone https://github.com/ekg/wfmash && cd wfmash && cmake -H. -Bbuild && cmake --build build -- -j 3
 93 | 
 94 | install_gsalign:
 95 | 	cd pangenes/bin && git clone https://github.com/hsinnan75/GSAlign.git && cd GSAlign && make
 96 | 
 97 | install_pangenes_quality:
 98 | 	cd pangenes/bin && wget http://www.clustal.org/omega/clustalo-${clustalorelease}-Ubuntu-x86_64 && \
 99 | 	chmod +x clustalo-${clustalorelease}-Ubuntu-x86_64 && \
100 | 	ln -fs clustalo-${clustalorelease}-Ubuntu-x86_64 clustalo && \
101 | 	wget https://github.com/thomaskf/AliStat/archive/refs/tags/v${alistatrelease}.tar.gz && \
102 | 	tar xfz v${alistatrelease}.tar.gz && cd AliStat-${alistatrelease} && make && cd .. && \
103 | 	rm -f v${alistatrelease}.tar.gz && ln -fs AliStat-${alistatrelease} AliStat
104 | 
105 | uninstall_pangenes:
106 | 	cd pangenes/bin && rm -rf gffread-${gffreadrelease} gmap-${gmaprelease} gffread wfmash GSAlign gmap \
107 | 		clustalo-${clustalorelease}-Ubuntu-x86_64 clustalo AliStat-${alistatrelease} AliStat
108 | 	cd lib && rm -rf minimap2-${minimap2release} minimap2
109 | 	cd files && rm -rf test_rice
110 | 
111 | test_pangenes:
112 | 	cd pangenes && perl get_pangenes.pl -d ../files/test_rice && \
113 | 	perl get_pangenes.pl -d ../files/test_rice -t 0 -s '^\d+$$' &&\
114 | 	perl get_pangenes.pl -d ../files/test_rice -H && \
115 | 	perl check_evidence.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -i gene:ONIVA01G50800.cdna.fna -f -v && \
116 | 	perl match_cluster.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ \
117 | 		-s ../files/test_transcripts.fna -o test_transcripts.gmap.tsv && \
118 | 	perl rename_pangenes.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -o clade.consortium.1
119 | 
120 | # removes the outputs written by test_pangenes; -f/-rf throughout so a missing file does not abort the && chain
121 | clean_pangenes:
122 | 	cd pangenes && rm -rf test_rice_pangenes && rm -f test_transcripts.gmap.tsv && rm -rf clade.consortium.1
123 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | Ensembl
 2 | Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
 3 | Copyright [2016-2025] EMBL-European Bioinformatics Institute
 4 | Copyright [2021-2025] Estacion Experimental Aula Dei-CSIC
 5 | 
 6 | This product includes software developed at:
 7 | - EMBL-European Bioinformatics Institute
 8 | - Wellcome Trust Sanger Institute
 9 | - Estacion Experimental Aula Dei-CSIC
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Scripting analyses of genomes in Ensembl Plants
  3 | 
  4 | This repo contains code examples for interrogating 
  5 | [Ensembl Plants](https://plants.ensembl.org/index.html) 
  6 | from your own scripts and for masking & annotating 
  7 | [repeats](#repeat-masking-and-annotation) and 
  8 | [calling pangenes](#pangenes) in plant genomes.
  9 | 
 10 | [![Build Status](https://github.com/Ensembl/plant-scripts/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/Ensembl/plant-scripts/actions/workflows/ci.yml)
 11 | 
 12 | - [List of recipes](#list-of-recipes)
 13 | - [Dependencies of recipes](#dependencies)
 14 |     - [FTP](#ftp)
 15 |     - [MySQL](#mysql)
 16 |     - [Perl](#perl)
 17 |     - [Python](#python)
 18 |     - [R](#r)
 19 | - [Repeat masking and annotation](#repeat-masking-and-annotation)
 20 | - [Pangene analysis](#pangenes)
 21 | - [Phylogenomics](#phylogenomics)
 22 | - [Species tree](#species-tree)
 23 | - [Citation](#citation)
 24 | 
 25 | 
 26 | ## List of recipes
 27 | 
 28 | The code for the recipes in this section can be found in folder [recipes](./recipes/).
 29 | They are grouped by type (API, BioMart, CRAM, FTP, MySQL, REST & VEP) and their dependencies 
 30 | are explained below. To create your own recipes please read the appropriate documentation:
 31 | 
 32 | | type | URLs |
 33 | |---|---|
 34 | | API | http://plants.ensembl.org/info/data/api.html |
 35 | | BioMart | http://plants.ensembl.org/info/data/biomart/index.html |
 36 | | FTP | http://plants.ensembl.org/info/data/ftp |
 37 | | MySQL | http://plants.ensembl.org/info/data/mysql.html |
 38 | | REST | http://plants.ensembl.org/info/data/rest.html |
 39 | | VEP | http://plants.ensembl.org/info/docs/tools/vep/index.html |
 40 | 
 41 | These are the script recipes, obtained with `grep -P "^## \w\d+" recipes/example*`:
 42 | 
 43 | ```
 44 | exampleAPI.pl:## A1) Load the Registry object with details of genomes available
 45 | exampleAPI.pl:## A2) Check which analyses are available for a species
 46 | exampleAPI.pl:## A3) Get soft masked sequences from Arabidopsis thaliana
 47 | exampleAPI.pl:## A4) Get BED file with repeats in chr4
 48 | exampleAPI.pl:## A5) Find the DEAR3 gene
 49 | exampleAPI.pl:## A6) Get the transcript used in Compara analyses
 50 | exampleAPI.pl:## A7) Find all orthologues of a gene
 51 | exampleAPI.pl:## A8) Get markers mapped on chr1D of bread wheat
 52 | exampleAPI.pl:## A9) Find all syntelogues among rices
 53 | exampleAPI.pl:## A10) Print all translations for otherfeatures genes
 54 | 
 55 | exampleBiomart.R:## B1) Check plant marts and select dataset
 56 | exampleBiomart.R:## B2) Check available filters and attributes
 57 | exampleBiomart.R:## B3) Download GO terms associated to genes
 58 | exampleBiomart.R:## B4) Get Pfam domains annotated in genes
 59 | exampleBiomart.R:## B5) Get SNP consequences from a selected variation source
 60 | 
 61 | exampleCRAM.pl:## C1) Find RNA-seq CRAM files for a genome assembly
 62 | 
 63 | exampleFTP.sh:## F1) Download peptide sequences in FASTA format
 64 | exampleFTP.sh:## F2) Download CDS nucleotide sequences in FASTA format
 65 | exampleFTP.sh:## F3) Download transcripts (cDNA) in FASTA format
 66 | exampleFTP.sh:## F4) Download soft-masked genomic sequences
 67 | exampleFTP.sh:## F5) Upstream/downstream sequences
 68 | exampleFTP.sh:## F6) Get mappings to UniProt proteins
 69 | exampleFTP.sh:## F7) Get indexed, bgzipped VCF file with variants mapped
 70 | exampleFTP.sh:## F8) Get precomputed VEP cache files
 71 | exampleFTP.sh:## F9) Download all homologies in a single TSV file, several GBs
 72 | exampleFTP.sh:## F10) Download UniProt report of Ensembl Plants, 
 73 | exampleFTP.sh:## F11) Retrieve list of new species in current release
 74 | exampleFTP.sh:## F12) Get current plant species tree (cladogram)
 75 | 
 76 | exampleMySQL.sh:## S1) Check currently supported Ensembl Genomes (EG) core schemas,
 77 | exampleMySQL.sh:## S2) Count protein-coding genes of a particular species
 78 | exampleMySQL.sh:## S3) Get stable_ids of transcripts used in Compara analyses 
 79 | exampleMySQL.sh:## S4) Get variants significantly associated to phenotypes
 80 | exampleMySQL.sh:## S5) Get Triticum aestivum homeologous genes across A,B & D subgenomes
 81 | exampleMySQL.sh:## S6) Count the number of whole-genome alignments of all genomes 
 82 | exampleMySQL.sh:## S7) Extract all the mutations and consequences for a selected wheat line
 83 | exampleMySQL.sh:## S8) Get FASTA of repeated sequences from selected species
 84 | exampleMySQL.sh:## S9) Get GFF of repeated sequences from selected species
 85 | 
 86 | exampleREST:## R1) Create a HTTP client and a helper functions 
 87 | exampleREST:## R2) Get metadata for all plant species 
 88 | exampleREST:## R3) Find features overlapping genomic region
 89 | exampleREST:## R4) Fetch phenotypes overlapping genomic region
 90 | exampleREST:## R5) Find homologues of selected gene
 91 | exampleREST:## R6) Get annotation of orthologous genes/proteins
 92 | exampleREST:## R7) Fetch variant consequences for multiple variant ids
 93 | exampleREST:## R8) Check consequences of SNP within CDS sequence
 94 | exampleREST:## R9) Retrieve variation sources of a species
 95 | exampleREST:## R10) Get soft-masked upstream sequence of gene in otherfeatures track
 96 | exampleREST:## R11) Get all species under a given taxonomy clade
 97 | exampleREST:## R12) transfer coordinates across genome alignments between species
 98 | 
 99 | exampleVEP.sh:## V1) Download, install and update VEP
100 | exampleVEP.sh:## V2) Unpack downloaded cache file & check SIFT support 
101 | exampleVEP.sh:## V3) Predict effect of variants 
102 | exampleVEP.sh:## V4) Predict effect of variants for species not in Ensembl
103 | ```
104 | 
105 | ### Dependencies
106 | 
107 | Some of the recipes and scripts depend on additional software packages, see below to learn how to install them.
108 | Note that only *make install* requires **sudo**, you might need help from your sysadmin for that task.
109 | 
110 | #### FTP
111 | 
112 | The examples for bulk downloads from the FTP site require the software [wget](https://www.gnu.org/software/wget/), 
113 | which is usually installed on most Linux distributions. For macOS it is available on [Homebrew](https://brew.sh). 
114 | For Windows it ships with [MobaXterm](https://mobaxterm.mobatek.net). On Debian/Ubuntu systems you can also install 
115 | it with (requires sudo):
116 | 
117 |     make install 
118 | 
119 | #### MySQL
120 | 
121 | The examples for SQL queries to Ensembl Genomes database servers require the [MySQL](https://www.mysql.com) client. 
122 | Depending on your Linux flavour this package can be named *mysql-client* or simply *mysql*. On Debian/Ubuntu systems 
123 | you can also install it with (requires sudo):
124 | 
125 |     make install
126 | 
127 | #### Perl
128 | 
129 | As listed in [cpanfileREST](./lib/cpanfileREST), several modules are required for the REST examples: 
130 | [JSON](https://metacpan.org/pod/JSON), [JSON::XS](https://metacpan.org/pod/JSON::XS) and 
131 | [HTTP::Tiny](https://metacpan.org/pod/HTTP::Tiny). 
132 | Provided [cpanm](https://metacpan.org/pod/App::cpanminus) is available in your system (for instance after make install), 
133 | these modules can be installed with:
134 | 
135 |     #make install 
136 |     make install_REST
137 | 
138 | Similarly, the dependencies for the ensembl VEP ([DBI](https://metacpan.org/pod/DBI), [DBD::mysql](https://metacpan.org/pod/DBD::mysql) 
139 | and [Archive::Zip](https://metacpan.org/pod/Archive::Zip)), together with those used by recipes using the Ensembl Perl API, 
140 | can be installed with:
141 | 
142 |     #make install
143 |     make install_ensembl
144 | 
145 | Ensembl API installation instructions can be found [here](http://plants.ensembl.org/info/docs/api/api_installation.html), 
146 | or if you use git [here](http://plants.ensembl.org/info/docs/api/api_git.html). There is also a debugging 
147 | [guide](http://plants.ensembl.org/info/docs/api/debug_installation_guide.html), which lists some extra dependencies that you might not have, 
148 | such as modules [DBI](https://metacpan.org/pod/DBI) and [DBD::mysql](https://metacpan.org/pod/DBD::mysql). 
149 | Note that your local Ensembl API should match the version of the current Ensembl release.
150 | 
151 | #### Python
152 | 
153 | The REST recipes written in python require library [requests](https://pypi.org/project/requests).
154 | Provided pip3 is available in your system (for instance after make install), it can be installed with:
155 | 
156 |     #make install
157 |     make install_REST
158 | 
159 | #### R
160 | 
161 | For the BioMart recipes you will need BioConductor package 
162 | [biomaRt](http://www.bioconductor.org/packages/release/bioc/html/biomaRt.html) 
163 | (read more [here](http://plants.ensembl.org/info/data/biomart/biomart_r_package.html)). 
164 | For the REST recipes two core packages are required: [httr](https://cran.r-project.org/web/packages/httr) and 
165 | [jsonlite](https://cran.r-project.org/web/packages/jsonlite). All these can be installed with:
166 | 
167 |     Rscript install_R_deps.R
168 | 
169 | ## Repeat masking and annotation
170 | 
171 | See examples and documentation in folder [repeats](./repeats/).
172 | 
173 | If you want to annotate repeats you must first run:
174 | 
175 |     #make install # install required bedtools
176 |     make install_repeats # requires gcc & g++ compilers 
177 | 
178 | ## Pangenes
179 | 
180 | See examples and documentation in folder [pangenes](./pangenes/).
181 | We recommend checking out the 
182 | [Runmodes and HPC configuration](https://github.com/Ensembl/plant-scripts/tree/master/pangenes#runmodes-and-hpc-configuration) docs.
183 | 
184 | Install it the [bioconda](https://anaconda.org/bioconda/get_pangenes) way:
185 | 
186 |     conda activate bioconda
187 |     conda create -n get_pangenes -c conda-forge -c bioconda get_pangenes
188 |     conda activate get_pangenes
189 |     # or simply
190 |     conda install bioconda::get_pangenes    
191 | 
192 | Install it the compilation way:
193 | 
194 |     #make install # install required bedtools
195 |     make install_pangenes # requires gcc & g++ compilers
196 | 
197 |     # optionally you might also want to try:
198 |     make install_gsalign
199 |     make install_pangenes_quality
200 | 
201 | 
202 | ## Phylogenomics
203 | 
204 | See examples and documentation in folder [phylogenomics](./phylogenomics/).
205 | 
206 | If you want to run any of those scripts you must first run:
207 | 
208 |     #make install 
209 |     make install_REST
210 | 
211 | ## Species tree
212 | 
213 | ![Plant species tree](./files/EnsemblPlants47.png)
214 | 
215 | *Fig. 1. Species tree of Ensembl Plants release 47 obtained with recipe F12. Figure generated with [iTOL](https://itol.embl.de)*
216 | 
217 | ## Citation
218 | 
219 | Contreras-Moreira B, Naamati G, Rosello M, Allen JE, Hunt SE, Muffato M, Gall A, Flicek P (2022) 
220 | Scripting Analyses of Genomes in Ensembl Plants. In: Edwards D. (eds) Plant Bioinformatics. 
221 | Methods in Molecular Biology, vol 2443. Humana, New York, NY. [10.1007/978-1-0716-2067-0_2](https://link.springer.com/protocol/10.1007%2F978-1-0716-2067-0_2)
222 | 
223 | <!--[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4121769.svg)](https://doi.org/10.5281/zenodo.4121769)-->
224 | 
225 | ### pangenes
226 | 
227 | For the pangene protocols the primary citation is:
228 | 
229 | Contreras-Moreira B, Saraf S, Naamati G, Casas AM, Amberkar SS, Flicek P, Jones AR & Dyer S (2023)
230 | GET_PANGENES: calling pangenes from plant genome alignments confirms presence-absence variation. 
231 | Genome Biol 24, 223. https://doi.org/10.1186/s13059-023-03071-z
232 | 
233 | Check all the references you need to cite in each script by running:
234 | 
235 |     perl get_pangenes.pl -v
236 |     perl check_evidence.pl -c
237 |     perl check_quality.pl -c
238 |     perl match_cluster.pl -c
239 | 
240 | 
241 | ### repeats
242 | 
243 | For the scripts and data in the [repeats](./repeats/) folder please cite:
244 | 
245 | Contreras-Moreira B, Filippi CV, Naamati G, García Girón C, Allen JE, Flicek P (2021) 
246 | Efficient masking of plant genomes by combining kmer counting and curated repeats Genomics. 
247 | Plant Genome https://doi.org/10.1002/tpg2.20143 
248 | (preprint https://www.biorxiv.org/content/10.1101/2021.03.22.436504v1)
249 | 
250 | Girgis HZ (2015) Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. 
251 | BMC Bioinformatics 16:227. https://doi.org/10.1186/s12859-015-0654-5
252 | 
253 | Li H (2018) Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics 34(18):3094–3100.
254 | https://doi.org/10.1093/bioinformatics/bty191
255 | 
256 | 


--------------------------------------------------------------------------------
/demo_test.t:
--------------------------------------------------------------------------------
 1 | use strict;
 2 | use warnings;
 3 | use Test::More;
 4 | # Smoke tests for the recipes: each one is run with argument 'test' and its output is matched against a regex
 5 | my $number_of_tests = 1;
 6 | # the MySQL recipe is tested regardless of the command-line arguments
 7 | ok( eval{ `recipes/exampleMySQL.sh test` } =~ /_core_/ , 'exampleMySQL.sh' );
 8 | # the next five tests are skipped when the first argument is 'travis'
 9 | if(!$ARGV[0] || $ARGV[0] ne 'travis'){
10 | 
11 | 	# FTP/REST/API tests might timeout from Travis
12 | 	
13 | 	$number_of_tests += 5;
14 | 	    
15 | 	ok( eval{ `recipes/exampleFTP.sh --spider test 2>&1` } =~ /Brachypodium_distachyon/ ,
16 | 		'exampleFTP.sh' );
17 | 
18 | 	ok( eval{ `python recipes/exampleREST.py test` } =~ /hordeum_vulgare/ , 'exampleREST.py' );
19 | 
20 | 	ok( eval{ `perl recipes/exampleREST.pl test` } =~ /hordeum_vulgare/ , 'exampleREST.pl' );
21 | 
22 | 	ok( eval{ `Rscript recipes/exampleREST.R test` } =~ /hordeum_vulgare/ , 'exampleREST.R' );
23 | 
24 | 	ok( eval{ `perl recipes/exampleCRAM.pl test` } =~ /subgroup/ , 'exampleCRAM.pl' );
25 | }
26 | 
27 | # optional test, only run when the first argument is 'API'; requires perl API to be installed ie 'make install_ensembl'
28 | if($ARGV[0] && $ARGV[0] eq 'API'){
29 | 	ok( eval{ `perl recipes/exampleAPI.pl test` } =~ /xref/ , 'exampleAPI.pl' );
30 |     $number_of_tests++;
31 | }
32 | 
33 | # optional test, only run when the first argument is 'biomart'; requires BiomaRt R library ie 'make install_biomart_r'
34 | if($ARGV[0] && $ARGV[0] eq 'biomart'){
35 | 	ok( eval{ `Rscript recipes/exampleBiomart.R test` } =~ /IWGSC/ , 'exampleBiomaRt.R' );
36 | 	$number_of_tests++;
37 | }
38 | # declare the plan at the end, with the number of tests expected to have run
39 | done_testing( $number_of_tests );
40 | 


--------------------------------------------------------------------------------
/files/Arabidopsis_thaliana.fna.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/files/Arabidopsis_thaliana.fna.gz


--------------------------------------------------------------------------------
/files/EnsemblPlants47.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/files/EnsemblPlants47.png


--------------------------------------------------------------------------------
/files/runtime_ram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/files/runtime_ram.png


--------------------------------------------------------------------------------
/files/test_transcripts.fna:
--------------------------------------------------------------------------------
  1 | >TR1
  2 | AAGAATTACAAACCTCAACAACTCTCGACTGTTACAGGCATCGCTGCCATAATATGAGTA
  3 | GAGTACATGGGCATCACCTAAATTCTGTGAGCGCCATCCATCAAACTATGTCTGACTGAC
  4 | TTGACTGTCTACTACAAGATATACTCATTCTCAATTACATGACGCTTTGACATAATGAGT
  5 | GCAAGAAAAATCTACTGATACATCATCACCAAACACACCAACCTAGATTTCCAAGAATCG
  6 | TCAATTCAAAACAAGAAACGCTAGAAACAACCTAATCAGATGCAATTATTATTCTTCCTC
  7 | TTGTTATGTCGTGAAGAAAGACCATAGCGTTTTCCCAATCGATTTGTTTCGCCTCTCTGA
  8 | ATCATCATCATCATCATCCTCGTCGTCATCTTCAACGCCCCCACGCTCACTTCCTGATGG
  9 | TGTCATGGGCATCGCATCTTCACCTTGTTCACCATCCACAACAATATCACCATACTCCTC
 10 | ACCGGCAGTAGCATGGGCATCAACAGTTTCAGCCTGGGAGAACTCTTGCAACTGGGATGC
 11 | ACCATCAGCTGATTCAGAGGGGCCCGTCCTCTTGCTATCAGCAGCTTTAGAGGTTCCTTC
 12 | CGTGTCATCAGCGGTAGTTTCTACCGTCTGCTTTTTTCCTCTCCTCACAGCTTTCTTCTT
 13 | CTCCCGTGGAGGGGCTGCTGCTGTGGCATTTGCAACTGTAGTGCGTCTCAAATTGTAGCG
 14 | TTTTTCACCTGTGGTTTGCATCAACGGGCCAGATGGCTGCCTCCTCTTACGAGGCTGCCC
 15 | GCCAACTGAAACACTTTCAGATTGTGCTTCACTAACTTCTTCATCTTCCTCACTGATCTC
 16 | TGGTCCAGCAAAGCGCCTCTTCCGTGTACCACCAACAGTAACTGAATCTTCCTGATCATC
 17 | ATTCTTTTCATCAAAATTCCCACCAAGTACTACTTTAGCATCTTCAAGCACTGCACGAAC
 18 | CGACTTTGTCCTTCTTGGTTTACCCTTCCGTTTCGGAGGCTTTGATGTTTCAGGTTGCAA
 19 | TTCAACTTCAGGGGCAGCCGTTGAGTCCTTTCCATTTTGGTCCACAGAGGCAATTGTTGT
 20 | GTCAACTGCCATGTCATTAGCACCAGCAAATGGCTGAGTTCCATGAACATCTATACAATT
 21 | ATCTGCAACTCCAACAGAAGACTCGATTTGGATATCATCAGCTATATCAAGCCTTTCGAA
 22 | TTCTACATTCCCCCTTGTTCCACTGTCAGAAGGGAGACCCTCAGCATCAAAAGAATTATT
 23 | TGCTGCCTGATAGACAGGAGTTGGCTCATAATCTGCTTCACTCGGTGATGCTTCCTCTAA
 24 | TCTCGCACCAAAAGGAATGTTAATTTCTGCTTCCTGGTCCGATGACTGCTCGGCCTTTGT
 25 | CCTAGGAGAGAACTTGAAAAGCCTTGAACACTTCTTCAACAGTGAGAAACGACCAGCTGA
 26 | ATTAACAAGTGTTCCTGTGTCTTGAGCCAAATTATCAGTGTTCGGGGAACGACCATCTCC
 27 | CTCAACAGCCAAACTTGGGTGGTCAATATCAGCATCATCCTTGAGAGATAGATCACCAAA
 28 | GCCCTCAAAAACAGAAACCCCACAGCTTTTGCACACCTTGTACTTTTCAAACATATCAAT
 29 | GAGGTTATTCCTGTCCCTGTTGTATGCTTCTCGGCGATCCTTCAGACTCTTGCTGAGTGC
 30 | ATGAAGACTATCAATGTCTCTTCTAATGTCTGCCTTGTCAGTTTCCAATTTTTGCTTCTC
 31 | CTCCTCTAGAACTTCTTTTTCCCTCAGCAGCTGTTCTTTCTCCAATGTTATCTTTTGAAT
 32 | CTTGGACTCGTTCAAATCAATAGCATGCCTTAGTTCATTTTCTACAAAATCCCTTCTCTT
 33 | ATTTAACTCATCCTCCTTCTGTTCCAGTTCCTTCTGCTTAATTGCTAGTTTATTCGCCAT
 34 | CTCCATCTCCAGTTCATGTCTATGGAGCTGCAAATTACGCTGAAGATCAGCACGTTCTCT
 35 | CTTCAGAAATTCATCAATCTCCTCACGCTGGTGTTTTATATCATCCAATAAAGCTTTTTC
 36 | CTTGAGTGCAAGACTATCCTGCTGCTCCTTGTATTTTGCATCCATTTCATCCTGTGTATC
 37 | CTTGAATCTCTTCTCTTCATTGTGACGCCACTTTCCTAGATTTGTTTTATCATTCTTGAG
 38 | CACCTTTGCCTCCTCTTCTAAATGAGCTCTTTTTTCATCAAGCTGCTCCCACTCTTCTTC
 39 | GAATTTCTGCCGCTGCTTCCTCAAATCTTCAGTTTCCTCCGAGAGGGAATTGCTTCGCAT
 40 | TCTATACTCATCAATCTCCTTCTTCAGTTGTGCTGTCAGCATGCTGTGCTCTTGTCTCTC
 41 | TTCCTCTGTAAGTTTCAAATTGTTTTGTTCTTCTAAAATTTTCTTCTTTTCAGCTTCAAT
 42 | TGTGGCTTTCAGCCTTTCAATATCTGATCTGTACATCTCAGCCTGTTTTCGTTCATTGTC
 43 | CACTTGCAGCTTTTCTTCGGATAAGCTATCCTTCTCAGTCTTCAGTGACTCCTCCCACTT
 44 | CTTCAGAGCTTTCGACTTAGTATCATGGTCAATCTGCAAATCTTCCAGCCTTTTCATGCT
 45 | TTCATTTAATGCTTGCTCCCTTTTAGATATTATGTTCTCACGAGAACTGAGATCTTTTTC
 46 | CTTCTTTACCAAATCAGCTTCCCTATGTTTCAGCTTTTCCTCAAAGGATTTCCTCTCACT
 47 | CTCCAGCTCTAATTCAAAATCTCGTCTTTTTGCTTCAAGCTTCACCTCGTGATCCTTGAT
 48 | AAGCTTCTGAAGCCCCATTTTTTCTCTTGTACTAGCTTTTTCTTCCCTCTCAGATAACTT
 49 | TTTCTCCCTTTTTTCCAAAGTCTTGTGCTTTGAGTCAGCATCCTTTTCTTGTGAGCGTAA
 50 | TTCATTCAGCCTTTTGGCAATATCCTCCTCCTTTGTTTTCAAAATGATTTTAGTTGACTC
 51 | CACTGTCTTTTTTGCCTCCTCCAATTCTTCCTGTTTTATCTTACAAAGTTTGTCATTCTC
 52 | ATTTGCTCTCTCCTCTCTGTCATTTACAGATCTCTGCAAATCAACGAGTCTGTTCTGACT
 53 | TTCTTTAAGCTTCTTCTCCCAGTCCTGCAGAGATTCCTCCTGCTCCTTGAGTTGCTTCTC
 54 | TCGGGCCTTCCTCTCGGTCTCAAAATATAGCTTCTCCTTCTCCAACCTACGTTGCCGAGC
 55 | TTCTGCCTCCTCTAAATCTCTGTCAGCTTGTGACTTCTTACGGTTTGCCTCCGCAAGCTT
 56 | CGCATCTGCGGCATGAAGCTTCCCCTCGATCTCCAAAGATTTTTCCTCCAAATTAGCCTC
 57 | AAGAGATTGAGCATCAGCTACCTTTTTCTGAGACGTGAACTTAACCTCAGCTATCTCTGA
 58 | TCGAATTTCACGTAGTGCCTTTTCAAGATCAGCTACACATTGTTTCTCAACACCCAGTGA
 59 | TTTCCTCATGTTTTCTTCTCGCCTTTCATATTCTGAAATAGCATTCAAATGTGCAGCTTG
 60 | TTCCCGCTTTAAGATCTCCTCTTTTTGTGTCAGCACTTGGGTGACTTCGTCAAATTTAGC
 61 | TGCCCACTCTTTTTTCTCAATCAAGAGCAGACCCATATTGTACTGATATTCATGTAACTC
 62 | CTGCTCGAGCTCGGCAGCTCTCTGTGAAGTCCCTTTGGCCTTGCCAAGGGGCGCAGATGC
 63 | GGGCGGAGCGCCACCACCGGCCCGCTGGTTTCCTGGAGACGGCGTGGACCAGCCGTTCCA
 64 | TCCTTTCCCCTGCGGAGTAAACATCACGGCTTCTTGCTCCCAATTCCTTCACGGCGCATC
 65 | AAAGAGACGAGGCGCCGCCCCCTACCGGGAGGTGCCCGGCGGCCACGG
 66 | >TR2
 67 | GTAACTTGGAGACAACGTGCAAAACTTTTTAGCTTGCAGTTTTTCAGCCAAAAGACTTTT
 68 | CACAAAATGGACCCATGATTATTAATTTTTCAACTTACATTGCACAGTTTTTGTAATTAA
 69 | TGTATCTCGCATGGCTCTTGTTTGTTCTCCTTCTTCCCTGGACAAAATTCAGCTAAGCTT
 70 | TTGTGTAACAATTGCTGGTGCAGCCTTGCTTCATACATCACAG
 71 | >TR3
 72 | TCCTTGAATCCTGGCTGTCTGCGGCGTCTCAGCTACTCGCCCGCCTGAACAAACGGATCG
 73 | AAGCCAAGGACTGGGAGGCGGCGGCGAGCGACTGCTGGATCCTGGAGCGGATCTGGAAGC
 74 | TGCTTGCCGACATCGAGGACCTGCACCTGCTCATGGACCCGGACGATTTCCTGCGGCTCA
 75 | AGAGCCAGCTCGCGATACGGTCGGCGCCGGACGGCACCGACGCGTCCTTCTGCTTCCGGA
 76 | CCAGAGCGCTGCTGCACGCCGCCAACGCCACGAGGGACATCAAGAAGCTGGTGCCGTGGG
 77 | TGATCGGCGTCGAGGCGGACCCCAATGGCGGGCCGAGGGTGCAGGAGGCGGCCATGAGGC
 78 | TGTACCACGGCCGGAGGCGCGGCGAGGGCGAGGACGCTGGCAAGATCGAGCTGCTGCAGG
 79 | CTTTCCAGGCCGTGGAGGCGGCCGTGCGGAGGTTCTTCTTCGCGTACCGGCAGGTCGTGG
 80 | CGGCGGTGTGTGGCACGGCGGAGGCGTCGGGCAACCGGGCGCTGTTCGTGCCGGCGGAGG
 81 | GGATGGACCCGCTCTCGCAGATGTTCCTCGAGCCGCCCTACTTCCCCAGCCTCGACGCCG
 82 | CCAAGACGTTCTTGGCCGACTACTGGGTTCAGCACATGGCCGCCGCCTCTGTTCCGTCAG
 83 | GGCGGAGCTGAAGGTTTCGAACGGCCAAAAACCGCGGCGATCGGTAATTTTGCAGGCTAG
 84 | AAGTTACCTATGATCCCCAGCCTGCAATCCTATAGTGATTCATCTCAGTAGCGATACATG
 85 | AGTACAGTAGATACTCCTAGATGCGTGTGTTGTGACTGTGATGCCATCTGTTCTAGTGTT
 86 | CTAGTATCACAGAGGAAGTATTTAACCGTGAGACATTCAATTAAATCAAG
 87 | >TR4
 88 | GCAGTGACACCTCCAAATCTAACATTTCGCGGTTGCATTACCATCTCTTGCCTCTTGGGCTCTGCCAAGA
 89 | ATAGCCAAAGCATATGTAGCCTTCCTGCCTCTCGTGTTCACTCGTTCGGTCCTCTTCCTCCGTTCTCCTC
 90 | TTCCCCTTGCCCTCCTCCAGATCGACCATCACTTGCATGCATGCGCAGGCACGATCGAACGCAGTAGATG
 91 | CATTGGCTGCCAGCTCGATCCGCACCGACGATACTCCGGCGAGGCAAAGCGCGGCGTAAGGAGGAGGAGG
 92 | AAAAGTGGCCGCGACCCGCGGGATGGGCCGTCGACGGAGCGCGCGTTCGAGGGG
 93 | CAGCCCGTCCCGCCGTGGACGGAGCAGGTGACGCTGCGCGCCGTGGTGGCGAGCGTGGCGCTGGGCGTGG
 94 | CCCTGAGCAGCGTGATGATGAACCTGGTGTTCACGTCGGGGATCATCCCGTCGCTCAACATCTCCGCCGG
 95 | CCTCCTCGGCTTCTTCCTCCTCAAGGCCTGGACGCGCCTGCTCGACCAGCTCGGCTCGCCGGGCCGCCCC
 96 | TTCACCCGCCAGGAGAACGCCGTCGTCCAGACCTGCGTCGTCGCCTGCGCCAGCATGACATACAGCGGTG
 97 | GGTTTGGATCGTATCTGCTGGCCATGGATCGGAAGACGGCGGAGAAGACGAGCACCGGGGACGACTCCAG
 98 | CGCGAGCGTCAGTGAGCCGGAGTTCGGTCGGATGATGGCCTTCTTCTTCCTCGTTAGCTTCGTCGGTCTC
 99 | CTCGCCATTGTCCCCATGAGGAAGACAATGATCATCCGCCACCGGCTGACGTTCCCAAGCGGCTCGGCGA
100 | CGGCTCACCTCATCAACAGCTTCCATACCCCTCACGGCGCTAGACAAGCGAAGAGGCAAGTCTCGCTCGT
101 | TCTCCGGTCGTCGTTGGCGAGCTTGTTCTGGTCCATCTTCCAGTGGTTCTACACCGGAGGTCCAAACTGC
102 | GGCTTCACTTCCTTCCCAACGTTTGGGCTCAGCGCCTTCAATCGCGGTTTCTACATCAGTTTGAACGGAA
103 | CTTATGTGGGAATGCTCTTCGGGTCCATCATCTC
104 | CTGGGGGATCATGCGGCCGTACATCCGGAGCAAAAGAGGAATCTGGTACGACGCCGATCTCCAGGAGACG
105 | AACTTGAAGAGCTTCAGTGGATACAAGGTGTTTTGCGCCATAGCAATGATCCTCGGCGACGGCATCTTCC
106 | AGCTCGTCGCGATCTCGCTGAGGACGATACACACGGTGCGCCACCACCAGGTAGCGGCGGAGACGCTCAG
107 | GTCCTTCTCCGACGTCGACGCCATGCCGCGGCCGGTGATGAGCTTCGACGACCGCCGCAGGACGCAGGTG
108 | TTCCTCAGGGAGCACATCCCGAGCACCTTCGCCATCAGCGGGTACGTCGTCCTGGCCACCGTCTCCACCG
109 | TCGTCATCCCGCTCATGTACGGCCAGGTGAGGTACTACCACGTCGCCGCCGCGTACGCGTTCGCGCCCGT
110 | CCTGGCCTTCTGCAACGCCTACGGCACGGGCGTCGCGGAGACCAACTTCTCGGCGCAGTACAACAAGCTG
111 | GTGATCCTCATGTTCGCGTCGTGGATCGGGATCAAGAACGGCGGGATCGTCGGGAGCCTCGTCATCTGCG
112 | GCATCGTGTCGTCCATCGTCTCCACCGCCTCCGACTTCATGTCGGACTTCAAGACGAGCTACCTGACGCT
113 | CACCTCGCCGCGGGCCACGCTGGTGAGCCAGGTGATCGGCACGGCGATGGGGTGCGTCGTCAACCCGGCC
114 | 


--------------------------------------------------------------------------------
/install_R_deps.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Installs missing R dependencies into the repo-local library ./lib/R/
 4 | 
 5 | local_lib = "./lib/R/"
 6 | .libPaths( c( .libPaths(), local_lib) )
 7 | # BiocManager is only installed if it cannot be loaded already
 8 | if(!requireNamespace("BiocManager", quietly=TRUE))
 9 | 	install.packages("BiocManager", dependencies=TRUE, lib=local_lib)
10 | # package names must go in a single character vector, extra positional strings would be taken as repos, contriburl, etc
11 | install.packages(c("dplyr", "stringi", "knitr", "httr", "jsonlite"), dependencies=TRUE, lib=local_lib)
12 | 
13 | BiocManager::install("biomaRt", lib=local_lib, dependencies=TRUE)
14 | # print R version and loaded packages for the record
15 | sessionInfo()
16 | 


--------------------------------------------------------------------------------
/lib/R/README.md:
--------------------------------------------------------------------------------
1 | This folder will be used as local lib by install_R_deps.R
2 | 


--------------------------------------------------------------------------------
/lib/cpanfileEnsembl:
--------------------------------------------------------------------------------
1 | requires 'DBI';
2 | requires 'DBD::mysql';
3 | requires 'Archive::Zip';
4 | 


--------------------------------------------------------------------------------
/lib/cpanfileREST:
--------------------------------------------------------------------------------
1 | requires 'JSON';
2 | requires 'JSON::XS';
3 | requires 'HTTP::Tiny';
4 | 


--------------------------------------------------------------------------------
/lib/requirements.txt:
--------------------------------------------------------------------------------
1 | sqlalchemy==1.3.23 
2 | sqlalchemy_utils 
3 | PyMySQL
4 | 


--------------------------------------------------------------------------------
/pangenes/CHANGES.txt:
--------------------------------------------------------------------------------
 1 | 21122021: _collinear_genes.pl tested with -s
 2 | 27122021: get_pangenes.pl integrates _cut_sequences.pl & _collinear_genes.pl
 3 | 04012022: checks regex matches chr names
 4 | 04012022: tested with -s, made test_rice
 5 | 10012022: wfmash only needed with -w
 6 | 25012022: gene BED files checked as they might be interrupted
 7 | 28012022: added -H, trying out diff MINMASKLEN values in barley
 8 | 03022022: while parsing GFF files, chr names and extracted sequences are checked
 9 | 11022022: genomic segments are added as segment_collinear features in _collinear_genes.pl
10 | 15022022: genomic segments are used to produce .gdna.fna clusters in get_pangenes.pl
11 | 02032022: added & tested _collinear_genes.pl -sg
12 | 04032022: added & tested get_pangenes.pl -s
13 | 04032022: get_pangenes.pl -s prints ANI matrix from GSAlign estimates
14 | 15032022: collinearity TSV files sorted
15 | 15032022: pangenome matrices are chr-sorted with -s
16 | 16032022: _cluster_analysis.pl unclusters non-neighbors more than $MAXDISTNEIGHBORS away
17 | 16032022: BED-like pangenome matrices produced with -s
18 | 17032022: updated documentation in README
19 | 24032022: added get_pangenes.pl -N (and _cluster_analysis.pl -m)
20 | 04042022: tested wfmash v0.8.1-25-g1344b9e on rice chr1 testset (-p 80 -s 1000)
21 | 20042022: added sub split_genome_sequences_per_chr_bedtools to _collinear_genes (uses faster bedtools)
22 | 20042022: adopted minimap2-2.24 and updated Makefile
23 | 22042022: save compressed copy of $merged_tsv_file as evidence of clusters
24 | 22042022: check_evidence.pl prints basic stats for a cluster
25 | 05052022: check_evidence.pl -f prints GFF fixes for long gene models
26 | 06052022: check_evidence.pl -f prints sorted- non-overlapping GFF fixes for long gene models
27 | 11052022: check_evidence.pl -f tested for split and missing gene models, premature stop codons tracked
28 | 11052022: GFF patches, maybe created with check_evidence.pl, can now be used with get_pangenes.pl -p, existing WGAs reused
29 | 20052022: prints WGA stats summary
30 | 27052022: check_evidence.pl -s -r oryza_sativa_RAPDB,oryza_sativa_MSU appends to file mode isoform sequence, preferring refs
31 | 02052022: check_evidence.pl locates internal stop codons in CDS sequences, those cannot be used to fix gene models
32 | 07062022: check_evidence.pl skip long segments with candidate long/split genes, uses $MAXSEGMENTSIZE
33 | 09062022: check_evidence.pl sub liftover_gmap calls validates lifted CDS sequences with sub no_premature_stops
34 | 21062022: check_evidence.pl split models only fixed if mapped gene overlaps >= MINFIXOVERLAP=0.75
35 | 21062022: check_evidence.pl long models only fixed if mapped gene pair overlaps >= MINFIXOVERLAP=0.75
36 | 29062022: added _dotplot.pl to produce dotplot figures with R package pafr
37 | 14072022: added _chunk_chr.pl to break long chromosomes in chunks separated by geneless stretches (testing only)
38 | 30082022: tested get_pangenes.pl with rice, barley and wheat
39 | 12092022: check_evidence.pl -f -v prints out GMAP alignments, -p allows for partial CDS lift-over
40 | 13092022: check_evidence.pl -f does not use long/short isoforms for lift-over
41 | 09022023: fixed segments with flipped species while producing PAF in _dotplot.pl
42 | 20022023: get_pangenes.pl stops if 0 genes parsed from GFF
43 | 07032023: _collinear_genes.pl prints out all unmapped genes and the underlying cause
44 | 09032023: _collinear_genes.pl now maps genes in WGAs in both strands, added optional -n
45 | 17032023: _cluster_analysis.pl now merges disjoint clusters (diff species, 75% supporting edges) caused often by split gene models
46 | 30032023: check_evidence.pl -P prints python code to plot genomic context of pangene cluster, requires pyGenomeViz
47 | 03042023: discard partially mapped genes at the ends of chrs (query2ref) in _collinear_genes.pl
48 | 26042023: updated documentation and added Example 2: pangene and Presence-Absence Variation (PAV) analysis
49 | 04052023: added check_quality.pl and updated documentation
50 | 30082023: completed target test_pangenes in Makefile
51 | 30082023: Makefile target install_pangenes now system-installs pangenes/cpanfile; these are core modules, only DB_file seems to be lacking in Travis
52 | 25092023: added match_cluster.pl and updated documentation; uses .cdna.fna clusters by default, -C seems slower 
53 | 24102023: added sub select_GFF_valid_genes to leave out non-coding genes; see $GFFACCEPTEDFEATS & $GFFVALIDGENEFEAT
54 | 24102023: get_pangenes.pl now prints number of valid & non-valid genes in each input GFF file (details saved in .gff.log files)
55 | 25102023: identical gene ids are now supported across annotations, thanks for reporting the bug Pimmy!
56 | 10112023: improved calculation of overlap coordinates from WGA segments in different strands
57 | 14112023: genes lacking cDNA are skipped in _cluster_analysis.pl
58 | 15112023: removed bug from 25102023 that impaired removal of non-local genes, this duplicated a few clusters
59 | 23112023: match_cluster.pl now takes a complete pangene results folder ie Oryza_nivara_v1chr1_alltaxa_5neigh_algMmap_
60 | 24112023: added rename_pangenes.pl to assign pangene IDs to previously computed clusters
61 | 10012024: fixed bug in handling - strand coords in sub query2ref_coords
62 | 11012024: sub _parseCIGARfeature handles correctly 1bp CS-type SNPs when computing overlap with optional query coord
63 | 28022024: removed bug from 25102023 that misordered non-reference clusters in matrices
64 | 28022024: added -n to avoid intervening non-reference pangenes in _cluster_analysis.pl
65 | 28022024: added -f to get_pangenes.pl, which calls _cluster_analysis -v to make blocks of ref genes 
66 | 24052024: check_quality.pl -h prints header
67 | 01082024: added -S to rename_pangenes.pl
68 | 25092024: added section 'Example 6: estimation of haplotype diversity'
69 | 03102024: get_pangenes.pl expects min 95% sequence identity for WGA-based gene alignments, as in GET_HOMOLOGUES-EST, to help avoid diverged tandem copies
70 | 04102024: get_pangenes.pl now set MAXDISTNEIGHBORS=2, neighbor genes in a cluster cannot be more than 2 genes away
71 | 09102024: rename_pangenes.pl -r creates all expected outfiles, tested with rice data
72 | 15012025: prepare for conda package
73 | 06032025: get_pangenes.pl: sort & concat alignment results using tempfile with filenames to sort to avoid "Argument list too long"
74 | 24032025: BED matrix produced by _cluster_analysis.pl is 0-based 
75 | 25032025: match_cluster.pl was added -i to control sequence identity of matches
76 | 25032025: match_cluster.pl was added -F to produce a FASTA file with sequence index that can be exported as gene-based pangenome for mapping, 
77 | 25032025: with <global pangenome positions> estimated from reference genome
78 | 25032025: updated Makefiles and documentation
79 | 08042025: match_cluster.pl TSV output updated, tested with barley
80 | 08042025: add pangenome coords example to documentation
81 | 14052025: added POCS to troubleshooting to explain small cores
82 | 19052025: check_quality.pl does not assume gff files are available
83 | 27052025: _cluster_analysis.pl -t now affects pangene set growth simulation
84 | 


--------------------------------------------------------------------------------
/pangenes/HPC.conf.sample:
--------------------------------------------------------------------------------
 1 | # cluster/farm configuration file, edit as needed (use spaces or tabs)
 2 | # PATH might be empty or set to a path/ ending with '/'
 3 | PATH    /path/to/lsf/bin/
 4 | TYPE    lsf
 5 | SUBEXE  bsub
 6 | CHKEXE  bjobs
 7 | DELEXE  bkill
 8 | ERROR   EXIT
 9 | QARGS   -q production -M 20G 
10 | 


--------------------------------------------------------------------------------
/pangenes/HPC.conf.sample.slurm:
--------------------------------------------------------------------------------
 1 | # cluster/farm configuration file, edit as needed (use spaces or tabs)
 2 | # PATH might be empty or set to a path/ ending with '/'
 3 | TYPE    slurm
 4 | SUBEXE  sbatch
 5 | CHKEXE  squeue
 6 | DELEXE  scancel
 7 | ERROR   F
 8 | # 70GB was enough for chr-split wheat analysis with minimap2
 9 | QARGS   -p production --time=24:00:00 --mem 70G
10 | 


--------------------------------------------------------------------------------
/pangenes/_chunk_chr.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | use strict;
  3 | use warnings;
  4 | use Getopt::Long qw(:config no_ignore_case);
  5 | 
  6 | # Takes a GFF & FASTA pair of files and produces a new pair of files with 
  7 | # the original chromosomes/contigs split in chunks of contiguous genes.
  8 | # A new chunk is created when the next gene on the current chr is further 
  9 | # than MAXGENEDIST bp. A chunk is a genomic block containing at least one gene,
 10 | # usually also ending with a gene.
 11 | #
 12 | # Not used anymore, legacy only.
 13 | 
 14 | # Copyright [2022-24] 
 15 | # EMBL-European Bioinformatics Institute & Estacion Experimental de Aula Dei-CSIC
 16 | 
 17 | # perl _chunk_chr.pl -sp oryza_sativa -fa Oryza_sativa.IRGSP-1.0.dna.toplevel.fa \
 18 | #   -gf Oryza_sativa.IRGSP-1.0.51.gff3 
 19 | 
 20 | my $BEDTOOLSEXE = 'bedtools'; # default bedtools binary, used when -B is not set
 21 | 
 22 | my $MAXGENEDIST = 500_000; # default for -d, max distance (bp) to next gene
 23 | # GFF feature types handed to chunk_GFF as its main-feature set (presumably those treated as genes; confirm in chunk_GFF)
 24 | my %main_gff_feats = (
 25 |   'gene' => 1,
 26 |   'ncRNA_gene' => 1
 27 | );
 28 | # GFF feature types handed to chunk_GFF as its skip set (presumably ignored while parsing; confirm in chunk_GFF)
 29 | my %skip_gff_feats = (
 30 |   'chromosome' => 1,
 31 |   'scaffold' => 1
 32 | );
 33 | # option variables; $help and $sp1 start as 0, the remaining four of this list start undefined
 34 | my ( $help, $sp1, $fasta1, $bedtools_path, $cmd, $bed) = (0, 0);
 35 | my ( $maxdist, $gff1, $outpath ) = ($MAXGENEDIST, '', '');
 36 | # parse command-line options
 37 | GetOptions(
 38 |   "help|?"       => \$help,
 39 |   "sp|species=s" => \$sp1,
 40 |   "fa|fasta=s"   => \$fasta1,
 41 |   "gf|gff=s"     => \$gff1,
 42 |   "d|maxdist=i"  => \$maxdist,
 43 |   "o|outpath=s"  => \$outpath,
 44 |   "B|bedtools=s"  => \$bedtools_path
 45 | ) || help_message(); # NOTE(review): if parsing fails help is printed but there is no exit here, execution continues
 46 | # Prints usage and the list of supported options to STDOUT; takes no arguments and does not exit
 47 | sub help_message {
 48 |   print "\nusage: $0 [options]\n\n"
 49 |     . "-sp binomial/trinomial species name (required, example: -sp oryza_sativa, used to name outfiles)\n"
 50 |     . "-fa genome FASTA filename           (required, example: -fa oryza_sativa.fna)\n"
 51 |     . "-gf GFF filename                    (required, example: -gf oryza_sativa.RAPDB.gff)\n"
 52 |     . "-d  max distance (bp) to next gene  (optional, example: -d $MAXGENEDIST)\n"
 53 |     . "-o  path to output folder           (optional, default current folder)\n"
 54 |     . "-B  path to bedtools binary          (optional, default: -B bedtools)\n\n"
 55 | }
 56 | 
 57 | if($help || (!$sp1 || !$fasta1 || !$gff1)){ # -sp, -fa and -gf are required
 58 |   help_message();
 59 |   exit(0);
 60 | }  
 61 | # both input files must exist and have content
 62 | if(!-s $fasta1 || !-s $gff1){
 63 |   print "# ERROR: please make sure all input files exist and have content\n";
 64 |   exit(-1);
 65 | } 
 66 | # max distance must be at least 1
 67 | if($maxdist < 1){ 
 68 |   print "# ERROR: distance must be positive\n";
 69 |   exit(-1);
 70 | }
 71 | 
 72 | # check binaries: fall back to default bedtools, then run it and look for 'sage' in its output
 73 | if(!$bedtools_path) {
 74 |   $bedtools_path = $BEDTOOLSEXE
 75 | }
 76 | if(`$bedtools_path` !~ 'sage') {
 77 |   print "# ERROR: cannot find binary file $bedtools_path , exit\n";
 78 |   exit(-1)
 79 | }
 80 | # echo the effective parameters
 81 | print "\n# $0 -sp $sp1 -fa $fasta1 -gf $gff1 -d $maxdist -o $outpath -B $bedtools_path\n\n";
 82 | 
 83 | 
 84 | 
 85 | # set output filenames; $outpath is only prepended if it exists, otherwise files go to the current folder
 86 | my $chunkfnafile = "$sp1.chunk$maxdist.fna";
 87 | my $chunkgfffile = "$sp1.chunk$maxdist.gff";
 88 | my $chunkbedfile = "$sp1.chunk$maxdist.bed";
 89 | if(-e $outpath) {
 90 |   $chunkfnafile = "$outpath/$chunkfnafile";
 91 |   $chunkgfffile = "$outpath/$chunkgfffile";
 92 |   $chunkbedfile = "$outpath/$chunkbedfile";
 93 | }
 94 | 
 95 | my ($ref_chrs, $ref_chunk_genes, $ref_chunks) = 
 96 |   chunk_GFF($gff1, $maxdist, \%main_gff_feats, \%skip_gff_feats);
 97 | # stop if no chunks were obtained from the GFF file
 98 | if(scalar(keys(%$ref_chunks)) == 0) {
 99 |   die "# ERROR: cannot chunk GFF file ($gff1)\n";
100 | }
101 | # open the three output files: chunked GFF, chunked FASTA and BED file listing the chunks
102 | open(GFFCHUNK,">",$chunkgfffile) ||
103 |   die "# ERROR: cannot open chunk GFF file ($chunkgfffile)\n";
104 | 
105 | open(FNACHUNK,">",$chunkfnafile) ||
106 |   die "# ERROR: cannot open chunk FASTA file ($chunkfnafile)\n";
107 | 
108 | open(BEDCHUNK,">",$chunkbedfile) ||
109 |   die "# ERROR: cannot open chunk BED file ($chunkbedfile)\n";
110 | # loop over chrs in the order of @$ref_chrs and over their chunks in numeric order
111 | my $total_chunks = 0;
112 | foreach my $chr (@$ref_chrs) {
113 |   foreach my $chunk (sort {$a<=>$b} keys(%{$ref_chunks->{$chr}})) {
114 | 
115 |     # print transformed gene models to chunked GFF file
116 |     print GFFCHUNK $ref_chunk_genes->{$chr}{$chunk};
117 | 
118 |     # print sequence to chunked FASTA file; the interval is written in BED format (start is made 0-based)
119 |     $bed = sprintf("%s\t%d\t%d", 
120 |       $chr,
121 |       $ref_chunks->{$chr}{$chunk}{'start'}-1, #0-based
122 |       $ref_chunks->{$chr}{$chunk}{'end'});
123 |     # bedtools getfasta reads the interval from stdin; its FASTA header is replaced with the chunk name
124 |     $cmd = "echo '$bed' | $bedtools_path getfasta -fi $fasta1 -bed stdin";
125 |     open(BEDTOOLS,"$cmd |") ||
126 |       die "# ERROR: cannot run bedtools ($cmd)\n";
127 |     while(<BEDTOOLS>) {
128 |       if(/^>/) { 
129 |         print FNACHUNK ">$chr\.chunk$chunk\n";
130 |       } else {
131 |         print FNACHUNK;
132 |       }
133 |     }
134 |     close(BEDTOOLS);
135 | 
136 |     # log chunk coordinates and chunk name in BED file
137 |     print BEDCHUNK "$bed\t$chr\.chunk$chunk\n";
138 | 
139 |     $total_chunks++ 
140 |   }
141 | }
142 | 
143 | close(BEDCHUNK);
144 | close(FNACHUNK);
145 | close(GFFCHUNK);
146 | # report output files and totals
147 | print "# chunked GFF file: $chunkgfffile\n";
148 | print "# chunked FASTA file: $chunkfnafile\n";
149 | print "# chunked BED file: $chunkbedfile\n";
150 | 
151 | 
152 | printf("\n# total chr/contigs=%d total chunks=%d\n",
153 |   scalar(@$ref_chrs), 
154 |   $total_chunks);
155 | 
156 | 
157 | ###############################
158 | 
159 | # Parses GFF file and finds chunks of contiguous genes; a chunk ends with its
160 | # chr/contig or when the next gene is more than $maxdist bp away. Returns: 
161 | # i)   ref to list with chr names in same order as input
162 | # ii)  ref to hash mapping chr & chunk ID to gene models with chunk-relative coords
163 | # iii) ref to hash mapping chr & chunk ID to chunk 1-based coords in original FASTA
164 | sub chunk_GFF {
165 | 
166 |   my ($gff_file, $maxdist, $ref_main_gff, $ref_skip_gff) = @_;
167 | 
168 |   my ($chr, $start, $end);
169 |   my ($num_chunk, $dist, $prev_end) = (1, 0, 0);
170 |   my (%chunk_genes, %chunk, %seen_chr, @chrs);
171 | 
172 |   open(GFF,"<",$gff_file) || 
173 |     die "# ERROR(chunk_GFF): cannot read $gff_file\n";
174 |   while(<GFF>){
175 | 
176 |     next if(/^#/ || /^$/);
177 | 
178 |     my @gff = split(/\t/,$_);
179 |     ($chr, $start, $end) = @gff[0,3,4];
180 | 
181 |     next if($ref_skip_gff->{ $gff[2] });
182 | 
183 |     if($ref_main_gff->{ $gff[2] }) { 
184 |       # every chr is recorded once, by exact name (no regex match);
185 |       # a new chr opens a new chunk, except the very first one (chunk 1)
186 |       if(!$seen_chr{$chr}) {
187 |         $seen_chr{$chr} = 1;
188 |         $num_chunk++ if(@chrs);
189 |         push(@chrs, $chr);
190 |         $prev_end = 0;
191 | 
192 |       } elsif($prev_end > 0) { # new chunk if previous gene too far
193 |         $dist = $start-$prev_end;
194 |         $num_chunk++ if($dist > $maxdist);
195 |       }
196 | 
197 |       # set chunk start and offset, 
198 |       # used to cut sequence & to transform gene coords
199 |       if(!defined($chunk{$chr}{$num_chunk}{'start'})) { 
200 |         $chunk{$chr}{$num_chunk}{'start'} = $start;
201 |         $chunk{$chr}{$num_chunk}{'offset'} = $start-1;
202 |         $chunk{$chr}{$num_chunk}{'end'} = $end;
203 |       }
204 | 
205 |       # chunk end is the largest end so far, nested genes must not shrink it
206 |       $chunk{$chr}{$num_chunk}{'end'} = $end 
207 |         if($end > $chunk{$chr}{$num_chunk}{'end'});
208 | 
209 |       # save end coord for next iteration
210 |       $prev_end = $chunk{$chr}{$num_chunk}{'end'};
211 |     } 
212 | 
213 |     # features found before the first main feature of a chr cannot be placed
214 |     next if(!$seen_chr{$chr} || !defined($chunk{$chr}{$num_chunk}));
215 | 
216 |     # transform coords relative to current chunk
217 |     $gff[0] = "$chr.chunk$num_chunk"; 
218 |     $gff[3] -= $chunk{$chr}{$num_chunk}{'offset'};
219 |     $gff[4] -= $chunk{$chr}{$num_chunk}{'offset'};
220 |     $chunk_genes{$chr}{$num_chunk} .= join("\t",@gff);
221 |   }
222 |   close(GFF);
223 | 
224 |   return (\@chrs, \%chunk_genes, \%chunk);
225 | }
225 | 
226 | 


--------------------------------------------------------------------------------
/pangenes/_cut_sequences.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | use strict;
  3 | use warnings;
  4 | use Getopt::Long qw(:config no_ignore_case);
  5 | use File::Copy qw(cp);
  6 | 
  7 | # Takes a GFF & FASTA pair of files and produces FASTA files with 
  8 | # cDNA, CDS nucl & pep sequences. Optionally it can take also a 
  9 | # GFF patch that modifies the input GFF.
 10 | # Note: also creates a FASTA index file (.fai)
 11 | #
 12 | # Uses external software: gffread [https://f1000research.com/articles/9-304/v2]
 13 | 
 14 | # Copyright [2021-23] 
 15 | # EMBL-European Bioinformatics Institute & Estacion Experimental de Aula Dei-CSIC
 16 | 
 17 | # perl _cut_sequences.pl -sp oryza_sativa -fa Oryza_sativa.IRGSP-1.0.dna.toplevel.fa \
 18 | #   -gf Oryza_sativa.IRGSP-1.0.51.gff3
 19 | 
 20 | my $GFFREADEXE = 'gffread'; # v0.12.7
 21 | 
 22 | my ( $help, $nored, $gffreadpath, $sp1, $fasta1) = (0, 0);
 23 | my ( $minlen, $gff1, $patchgff1, $tname, $outpath ) = (0, '', '');
 24 | my $patched_gff_filename = '';
 25 | 
 26 | GetOptions(
 27 |   "help|?"       => \$help,
 28 |   "sp|species=s" => \$sp1,
 29 |   "fa|fasta=s"   => \$fasta1,
 30 |   "gf|gff=s"     => \$gff1,
 31 |   "pt|patch=s"   => \$patchgff1,
 32 |   "l|minlen=i"   => \$minlen,
 33 |   "nr|n"         => \$nored,
 34 |   "p|path=s"     => \$gffreadpath,
 35 |   "o|outpath=s"  => \$outpath
 36 | ) || help_message();
 37 | 
 38 | sub help_message {
 39 |   print "\nusage: $0 [options]\n\n"
 40 |     . "-sp binomial/trinomial species name (required, example: -sp oryza_sativa, used to name outfiles)\n"
 41 |     . "-fa genome FASTA filename           (required, example: -fa oryza_sativa.fna)\n"
 42 |     . "-gf GFF filename                    (required, example: -gf oryza_sativa.RAPDB.gff)\n"
 43 |     . "-pt patch GFF filename              (optional, example: -pt oryza_sativa.RAPDB.patch.gff,\n"
 44 |     . "                                     creates oryza_sativa.RAPDB.patched.gff)\n"
 45 |     . "-l  min length (bp) of features     (optional, example: -l 100)\n"
 46 |     . "-nr remove redundancy in seq names  (optional, ie 'gene:ONIVA01G00100')\n"
 47 |     . "-p  path to gffread binary          (optional, default: $GFFREADEXE)\n"
 48 |     . "-o  path to output folder           (optional, default current folder)\n\n"
 49 | }
 50 | 
 51 | if($help || (!$sp1 || !$fasta1 || !$gff1)){ 
 52 |   help_message();
 53 |   exit(0);
 54 | }  
 55 | 
 56 | if(!-s $fasta1 || !-s $gff1){
 57 |   print "# ERROR: please make sure all input files exist and have content\n";
 58 |   exit(-1);
 59 | } 
 60 | 
 61 | if($patchgff1 && !-e $patchgff1){
 62 |   print "# ERROR: please make sure patch GFF file exists\n";
 63 |   exit(-2);
 64 | }
 65 | 
 66 | if(!$gffreadpath){
 67 |   $gffreadpath = $GFFREADEXE;
 68 | }
 69 | 
 70 | if($minlen < 1){ 
 71 |   $minlen = 0 
 72 | }
 73 | 
 74 | print "\n# $0 -sp $sp1 -fa $fasta1 -gf $gff1 -pt $patchgff1 " .
 75 |   "-l $minlen -nr $nored -path $gffreadpath\n\n";
 76 | 
 77 | # set output filenames
 78 | my $cdnafile = "$sp1.cdna.fna";
 79 | my $cdsfile  = "$sp1.cds.fna";
 80 | my $pepfile  = "$sp1.cds.faa";
 81 | if($patchgff1){
 82 |   $cdnafile = "$sp1.patch.cdna.fna";
 83 |   $cdsfile  = "$sp1.patch.cds.fna";
 84 |   $pepfile  = "$sp1.patch.cds.faa";
 85 |   ($patched_gff_filename = $gff1) =~ s/\.gff$//; # other suffixes (.gff3) kept,
 86 |   $patched_gff_filename .= '.patched.gff';        # so input is never overwritten
 87 | }
 88 | if($outpath) {
 89 |   $cdnafile = "$outpath/$cdnafile";
 90 |   $cdsfile  = "$outpath/$cdsfile";
 91 |   $pepfile  = "$outpath/$pepfile";
 92 | }
 93 | 
 94 | # only patch if required
 95 | if(-s $patchgff1) {
 96 |   my $num_patches = patch_gff($gff1, $patchgff1, $patched_gff_filename);
 97 |   $gff1 = $patched_gff_filename;
 98 | 
 99 | } else {
100 |   # otherwise just make symb link
101 |   symlink($gff1, $patched_gff_filename);
102 | }
103 | 
104 | my ($ref_names, $ref_coords) = parse_genes($gff1);
105 | 
106 | my $num_cdna = parse_gffread($gffreadpath,$fasta1,$gff1,$cdnafile,
107 |                  'cdna',$minlen,$nored,$ref_names,$ref_coords);
108 | my $num_cds  = parse_gffread($gffreadpath,$fasta1,$gff1,$cdsfile,
109 |                  'cds',$minlen,$nored,$ref_names,$ref_coords);
110 | my $num_pep  = parse_gffread($gffreadpath,$fasta1,$gff1,$pepfile,
111 |                  'pep',$minlen,$nored,$ref_names,$ref_coords);
112 | 
113 | if(scalar(keys(%$ref_names)) == 0) {
114 |   die "# ERROR: cannot parse Parent IDs of mRNA/transcripts, please check GFF format ($gff1)\n";
115 | }
116 | 
117 | if($num_cdna) {
118 |   print "# $cdnafile n=$num_cdna\n";
119 | } else {
120 |   die "# ERROR: cannot extract cDNA sequences, please check GFF format and/or chr names ($gff1)\n";
121 | }
122 | 
123 | if($num_cds) {
124 |   print "# $cdsfile n=$num_cds\n"
125 | } else {
126 |   die "# ERROR: cannot extract CDS sequences, please check GFF format and/or chr names ($gff1)\n";
127 | }
128 | 
129 | print "# $pepfile n=$num_pep\n";
130 | 
131 | 
132 | ###############################
133 | 
134 | # Runs gffread, parses its stdout and saves output in FASTA file.
135 | # Returns number of sequences printed out.
136 | sub parse_gffread {
137 | 
138 |   my ($gffreadexe,$fasta_file,$gff_file,$outfile,
139 |     $seqtype,$minlen,$remove_red,$ref_tr2gene,$ref_tr2coords) = @_;
140 | 
141 |   my ($params,$mrnaid,$geneid,$coords);
142 | 
143 |   if($seqtype eq 'cds'){
144 |     $params = '-x - ';
145 |   } elsif($seqtype eq 'pep'){
146 |     $params = '-y - ';
147 |   } else {
148 |     $params = '-w - '; # cDNA, default
149 |   }
150 | 
151 |   if($minlen > 0) {
152 |     $params .= " -l $minlen ";
153 |   }
154 | 
155 |   my $num_seqs = 0;
156 |   open(OUT,">",$outfile) || 
157 |     die "# ERROR(parse_gffread): cannot create $outfile\n";
158 | 
159 |   open(GFFREAD,"$gffreadexe $params -g $fasta_file $gff_file |") ||
160 |     die "# ERROR(parse_gffread): cannot run $gffreadexe\n"; 
161 | 
162 |   while(<GFFREAD>){
163 |     if(/^>(\S+)/){
164 |       $mrnaid = $1;
165 |       $geneid = $ref_tr2gene->{$mrnaid} || '';
166 |       $coords = $ref_tr2coords->{$mrnaid} || '';
167 | 
168 |       # remove redundant bits
169 |       if($remove_red){
170 |         $mrnaid =~ s/transcript://;
171 |         $geneid =~ s/gene://;
172 |       }
173 | 
174 |       print OUT ">$mrnaid $geneid $coords [$sp1]\n";
175 |       $num_seqs++;
176 |     } else {
177 |       print OUT;
178 |     }
179 |   }
180 |   close(GFFREAD);
181 | 
182 |   close(OUT);
183 | 
184 |   # do not remove index (used later)
185 |   # unlink($fasta_file.'.fai'); 
186 | 
187 |   return $num_seqs
188 | }
189 | 
190 | # Reads in GFF file to parse gene names as parent IDs of transcripts;
191 | # ID and Parent attributes are taken in any order. Returns: 
192 | # i) ref to hash mapping transcript ID -> gene ID
193 | # ii) ref to hash mapping transcript ID -> transcript coords as chr:start-end(strand)
194 | sub parse_genes {
195 | 
196 |   my ($gff_file) = @_;
197 | 
198 |   my ($mrnaid,$geneid,%names,%coords);
199 | 
200 |   open(GFF,"<",$gff_file) || die "# ERROR(parse_genes): cannot read $gff_file\n";
201 |   while(<GFF>){
202 |     my @F = split(/\t/,$_);
203 | 
204 |     next if(scalar(@F)<9 || ($F[2] ne "mRNA" && $F[2] ne "transcript"));
205 | 
206 |     # drop line ending (LF or CRLF), otherwise kept in last attribute
207 |     $F[8] =~ s/[\r\n]+$//;
208 |     # take only transcripts where both ID and Parent can be parsed,
209 |     # GFF3 does not require ID to precede Parent in column 9
210 |     next if($F[8] !~ /(?:^|;)\s*ID=([^;]+)/);
211 |     $mrnaid = $1;
212 |     next if($F[8] !~ /(?:^|;)\s*Parent=([^;]+)/);
213 |     $geneid = $1;
214 | 
215 |     $names{$mrnaid} = $geneid;
216 |     $coords{$mrnaid} = "$F[0]:$F[3]-$F[4]($F[6])";
217 |   }
218 |   close(GFF);
219 | 
220 |   return (\%names,\%coords);
221 | }
222 | 
223 | # Takes two GFF files (original and patch) and produces a new GFF file
224 | # that includes patched gene models. Patched models match the original
225 | # ones by means of 'old_locus_tag' tags in gene features. 
226 | # Patched GFF files might be concatenated; this means latter models
227 | # can replaced models declared earlier in the same file. Two params:
228 | # i)   GFF filename
229 | # ii)  GFF filename with selected gene model patches
230 | # iii) output patched GFF filename 
231 | # Returns integer with number of patched gene models
232 | sub patch_gff {
233 | 
234 |   my ($gff_file, $patchfile, $patched_gff_filename) = @_;
235 | 
236 |   my ($patch,$gene_id,$old_gene_id,$comment);
237 |   my ($chr,$start,$line);
238 |   my (%depr_gene_id, %new2old, %patched_model, %coords);
239 | 
240 |   # read in GFF patches, possible concatenated
241 |   open(PATCH,'<',$patchfile) ||
242 |     die "# ERROR(patch_gff): cannot read $patchfile\n";
243 |   while($line = <PATCH>) {
244 | 
245 |     if($line =~ /^$/){ 
246 |       next 
247 |     } elsif($line =~ /^#/) {
248 |       $comment = $line;
249 |     } else {
250 |       chomp($line);
251 |       my @gffdata = split(/\t/,$line);
252 | 
253 |       #chr01 gmap gene 411 776 . - . ID=Os121164;..old_locus_tag=missing;
254 |       #chr01 gmap gene 411 776 . - . ID=gene:Os127564;..old_locus_tag=Os121164;
255 |       #chr01 gmap gene 411 800 . - . ID=Os22222;..old_locus_tag=Os127564;
256 | 
257 |       if($gffdata[2] && $gffdata[2] eq 'gene') {
258 | 
259 |         $chr = $gffdata[0];
260 |         $start = $gffdata[3];
261 | 
262 |         if($gffdata[8] =~ m/ID=([^;]+)/) {
263 |           $gene_id = $1;
264 |         }
265 | 
266 |         while($gffdata[8] =~ m/old_locus_tag=([^;]+)/g) {
267 |           $old_gene_id = $1;
268 | 
269 |           # if gene replaces a previous patched model,
270 |           # correct old_locus_tag to point to original
271 |           if($new2old{ $old_gene_id }) {
272 | 
273 |             # add original old locus tag
274 |             if($line !~ /$new2old{$old_gene_id}/) {
275 |               $line .= "old_locus_tag=$new2old{$old_gene_id};";
276 |             }
277 | 
278 |             $depr_gene_id{ $old_gene_id } = $gene_id;
279 |             $old_gene_id = $new2old{ $old_gene_id };
280 |           } 
281 | 
282 |           $new2old{ $gene_id } = $old_gene_id;
283 |           $depr_gene_id{ $old_gene_id } = $gene_id;
284 |         }
285 | 
286 |         $patched_model{ $gene_id } = "$line\n";
287 |         $coords{ $chr }{ $start } = $gene_id;
288 | 
289 |       } else {
290 |         $patched_model{ $gene_id } .= "$line\n"
291 |       }
292 |     }
293 |   }
294 |   close(PATCH);
295 | 
296 |   # find non-redundant patches, sort and concat them
297 |   my $total_patched = 0;
298 |   foreach $chr (sort {$a cmp $b} keys(%coords)) {
299 |     foreach $start (sort {$a <=> $b} keys(%{ $coords{$chr} })) {
300 | 
301 |       $gene_id = $coords{ $chr }{ $start };
302 | 
303 |       if($depr_gene_id{ $gene_id }) {
304 |         print "# skip old patched $gene_id\n";
305 |         next;
306 |       }
307 | 
308 |       #if($gene_id eq 'gene:Os06g0705350.path1') { print "mira\n" }
309 |       $patch .= $patched_model{ $gene_id };
310 |       $total_patched++;
311 |     } 
312 |   }
313 |   
314 |   printf("# total patched: %d deprecated: %d\n\n",
315 |     $total_patched,scalar(keys(%depr_gene_id)));
316 | 
317 |   # read in original GFF, apply patches and save output
318 |   open(PATCHED,">",$patched_gff_filename) ||
319 |     die "# ERROR(patch_gff): cannot create $patched_gff_filename\n";
320 | 
321 |   my $geneOK = 1;
322 |   open(GFF,'<',$gff_file) ||
323 |     die "# ERROR(patch_gff): cannot read $gff_file\n";
324 |   while(<GFF>) {
325 | 
326 |     my @gffdata = split(/\t/,$_);
327 |     if($gffdata[2] && $gffdata[2] eq 'gene') {
328 |       if($gffdata[8] =~ m/ID=([^;]+)/) {
329 |         $gene_id = $1; 
330 |         if($depr_gene_id{ $gene_id }){
331 |           $geneOK = 0; 
332 |         } else { $geneOK = 1 }
333 |       } else { $geneOK = 1 }
334 | 
335 |       print PATCHED if($geneOK);
336 | 
337 |     } else {
338 |       print PATCHED if($geneOK)
339 |     }
340 |   }
341 |   close(GFF);
342 | 
343 |   # apply patches 
344 |   print PATCHED $patch;
345 |   close(PATCHED);
346 | 
347 |   return $total_patched
348 | }
349 | 


--------------------------------------------------------------------------------
/pangenes/_dotplot.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | use strict;
  3 | use warnings;
  4 | use File::Basename qw( dirname );
  5 | use FindBin '$Bin';
  6 | use lib "$Bin/lib";
  7 | use pangeneTools qw( read_FAI_regex2hash );
  8 | 
  9 | $|=1;
 10 | 
 11 | # Parses pairwise collinear TSV files made with _collinear_genes.pl to 
 12 | # produce PAF files that can be used to produce a dotplot of matched gene
 13 | # models with R package pafr [https://cran.r-project.org/package=pafr]
 14 | # Note: contigs < $MINCONTIGSIZE are ignored for clarity
 15 | 
 16 | # Copyright [2022-23] 
 17 | # EMBL-European Bioinformatics Institute & Estacion Experimental de Aula Dei-CSIC
 18 | 
 19 | # perl _dotplot.pl _Oryza_nivara_v1.Oryza_sativa.IRGSP-1.0.algMmap.overlap0.5.tsv 
 20 | 
 21 | my $MINCONTIGSIZE = 100_000;
 22 | my $DUMMYQUAL = 60;
 23 | my $VERBOSE = 0;
 24 | 
 25 | my $TSVfile = $ARGV[0] || die "# usage: $0 <TSVfile>\n";
 26 | 
 27 | my $resultsDIR = dirname($TSVfile);
 28 | 
 29 | my ($sp1filename, $sp2filename) = ('', '');
 30 | my ($faifile, $sp1, $sp2, $species, $chr, $len) = ('','','');
 31 | my (%file,%size);
 32 | 
 33 | ## locate FASTA index files (.fai), usually created in _cut_sequences.pl
 34 | opendir(DIR,$resultsDIR) || die "# ERROR: cannot list $resultsDIR\n";
 35 | my @faifiles = grep {/\.fai$/} readdir(DIR);
 36 | closedir(DIR);
 37 | 
 38 | foreach $faifile (@faifiles) {
 39 | 
 40 |   $species = $faifile; 
 41 |   $species =~ s/^_//;
 42 |   $species =~ s/\.fna.fai$//;
 43 | 
 44 |   if($TSVfile =~ m/_$species\./) {
 45 |     $sp1 = $species;
 46 |     $sp1filename = $sp1;
 47 |     $file{ $sp1 } = "$resultsDIR/$faifile";
 48 |   } elsif($TSVfile =~ m/\.$species\./) {
 49 |     $sp2 = $species;
 50 |     $sp2filename = $sp2;
 51 |     $file{ $sp2 } = "$resultsDIR/$faifile";
 52 |   }
 53 | } 
 54 | 
 55 | ## parse contig sizes from FASTA index files
 56 | if($sp1 eq '' || $sp2 eq '') {
 57 |   die "# ERROR: cannot find FASTA indexes (.fai) for $TSVfile ($sp1, $sp2)\n";
 58 | } else {
 59 |   for $species ($sp1, $sp2) {
 60 |     my $ref_bed = read_FAI_regex2hash( $file{$species}  );
 61 |     foreach $chr (keys(%$ref_bed)) {
 62 |       $len = (split(/\t/,$ref_bed->{$chr}))[2];
 63 |       $size{$species}{$chr} = $len;
 64 | 	  print "# $species $chr $len\n" if($VERBOSE);
 65 |     }
 66 |   }
 67 | }
 68 | 
 69 | ## parse TSV file and convert it to PAF format so that pafr can take it,
 70 | ## see https://dwinter.github.io/pafr/articles/Introduction_to_pafr.html
 71 | 
 72 | my $outPAFfile = $TSVfile;
 73 | $outPAFfile =~ s/\.tsv$/.genes.paf/;
 74 | 
 75 | open(PAF,">",$outPAFfile) || die "# ERROR: cannot create $outPAFfile\n";
 76 | 
 77 | my (@data,$chr1,$start1,$end1,$chr2,$start2,$end2);
 78 | my ($strand1,$strand2,$relstrand);
 79 | open(TSV,"<",$TSVfile) || die "# ERROR: cannot read $TSVfile\n";
 80 | while(<TSV>) {
 81 |  
 82 |   @data = split(/\t/,$_);
 83 | 
 84 |   # OsMH63_01G000010 OsMH63_01G000010 oryza_sativa_mh63 .. ortholog_collinear .. oryza_sativa .. 1:7538-15379(+);1:2902-10817(+)
 85 |   # Note that after
 86 |   # https://github.com/Ensembl/plant-scripts/blob/f9c9e4e71fbb8e46f84b0609a6dfc1dd5930bacf/pangenes/_collinear_genes.pl#L1774
 87 |   # in segments species order might change
 88 |   # Os01g0100466 Os01g0100466 oryza_sativa .. segment_collinear .. oryza_sativa_mh63:1:17370-18541(-) .. 1:12807-13978(-);1:17370-18541(-)
 89 | 
 90 |   if($data[14] =~ m/(\S+)?:(\d+)-(\d+)\(([+-])\);(\S+)?:(\d+)-(\d+)\(([+-])\)/) {
 91 |    
 92 |     $sp1 = $data[2];
 93 |     $sp2 = $data[7];
 94 |     ($chr1,$start1,$end1,$strand1,$chr2,$start2,$end2,$strand2) = ($1,$2,$3,$4,$5,$6,$7,$8);
 95 | 
 96 |     # skip unknown contigs ie unplaced_chrUn, might occur with -s
 97 |     next if(!$size{$sp1}{$chr1} || !$size{$sp2}{$chr2});
 98 | 
 99 |     next if($size{$sp1}{$chr1} < $MINCONTIGSIZE || 
100 |       $size{$sp2}{$chr2} < $MINCONTIGSIZE);
101 | 
102 |     if($strand1 eq $strand2) {
103 |       $relstrand = '+';
104 |     } else {
105 |       $relstrand = '-';
106 |     }
107 | 
108 |     if($sp1 eq $sp1filename) {
109 |       printf( PAF "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n",
110 |         $chr1,$size{$sp1}{$chr1},$start1-1,$end1,
111 |         $relstrand,
112 |         $chr2,$size{$sp2}{$chr2},$start2-1,$end2,
113 |         $data[3], # overlap instead of matching bases in the mapping
114 |         $data[3], # overlap instead of bases, including gaps, in the mapping
115 |         $DUMMYQUAL);
116 |     } else {
117 |       printf( PAF "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n",
118 |         $chr2,$size{$sp2}{$chr2},$start2-1,$end2,
119 |         $relstrand,	
120 |         $chr1,$size{$sp1}{$chr1},$start1-1,$end1,
121 |         $data[3], # overlap instead of matching bases in the mapping
122 |         $data[3], # overlap instead of bases, including gaps, in the mapping
123 |         $DUMMYQUAL);
124 |     }
125 |   }
126 | }
127 | close(TSV);
128 | 
129 | close(PAF);
130 | 
131 | print "\n# \$MINCONTIGSIZE = $MINCONTIGSIZE\n";
132 | print "\n# PAF file: $outPAFfile\n\n";
133 | 
134 | print "# Make a dotplot of aligned models coords with the following R script:\n";
135 | # axes are labelled after filename species, $sp1/$sp2 are overwritten while parsing TSV
136 | print<<EOF;
137 | 
138 |   #https://dwinter.github.io/pafr/articles/Introduction_to_pafr.html
139 |   #install.packages("devtools")
140 |   #devtools::install_github("dwinter/pafr")
141 | 
142 |   library(pafr, quietly=TRUE)
143 | 
144 |   pafile = "$outPAFfile"
145 |   ali <- read_paf(pafile)
146 | 
147 |   dotplot(ali, label_seqs = TRUE, xlab='$sp1filename', ylab='$sp2filename')
148 | 
149 |   #if chr/contig coverage wanted instead
150 |   #plot_coverage(ali) + scale_fill_brewer()
151 | 
152 | EOF
153 | 
154 | 


--------------------------------------------------------------------------------
/pangenes/asciinema.txt:
--------------------------------------------------------------------------------
 1 | # download code
 2 | git clone https://github.com/Ensembl/plant-scripts.git
 3 | cd plant-scripts/
 4 | 
 5 | # if you already have the software you can update it like this
 6 | git pull
 7 | 
 8 | sleep 3
 9 | 
10 | # install dependencies and download dataset test_rice, takes some time
11 | make install_pangenes
12 | ls lib pangenes/bin
13 | 
14 | sleep 3
15 | 
16 | # configure for your HPC cluster (optional, recommended)
17 | cd pangenes/
18 | 
19 | # there are two sample config files: HPC.conf.sample -> LSF , HPC.conf.sample.slurm -> slurm
20 | ls HPC*
21 | 
22 | # suppose you want to run GET_PANGENES in a slurm setting
23 | cp HPC.conf.sample.slurm HPC.conf
24 | cat HPC.conf
25 | # you should manually edit file HPC.conf to match your settings
26 | # for instance, the provided sample file assumes a queue named 'production' and max 70GB RAM per job, 
27 | # enough in our benchmarks up to wheat and maize using minimap2, you might want to change that
28 | 
29 | sleep 3
30 | 
31 | ## examples
32 | 
33 | # local analysis of test_rice data, make it HPC/parallel with: perl get_pangenes.pl -d ../files/test_rice -m cluster
34 | perl get_pangenes.pl -d ../files/test_rice
35 | 
36 | # results folder is: /home/contrera/plant-scripts/pangenes/test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_
37 | ls /home/contrera/plant-scripts/pangenes/test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_
38 | 
39 | sleep 3
40 | 
41 | # now restrict whole-genome alignments (WGA) to homologous chromosomes,
42 | # for this to work chr names in input FASTA files must be consistent so that regular expression will match them all, let's check:
43 | zgrep "^>" ../files/test_rice/*fa.gz
44 | 
45 | # in the test_rice example the main chromosomes are named with integer numbers, this should work:
46 | perl get_pangenes.pl -d ../files/test_rice -s '^\d+'
47 | 
48 | # now the results are in /home/contrera/plant-scripts/pangenes/test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_split_
49 | # note that 'split_' is now appended to the folder name to indicate that input genomes were split by chr matching the regular expression
50 | 
51 | sleep 3
52 | 
53 | # now we will mask geneless regions >= 1Mb, this is required by minimap2 with large genomes 
54 | perl get_pangenes.pl -d ../files/test_rice -H
55 | # in this case, output is saved to Oryza_nivara_v1chr1_highrep_alltaxa_2neigh_algMmap_
56 | # note the 'highrep_' tag
57 | 
58 | sleep 3
59 | 
60 | # let's check how much disk the output folder takes, most are temporary files
61 | # that might be re-used in future jobs, but can be removed if needed
62 | du -hs test_rice_pangenes/
63 | du -hs test_rice_pangenes/tmp
64 | 
65 | sleep 3
66 | 
67 | # we will now extract the WGA evidence supporting an example pangene cluster,
68 | # see also options -f -v
69 | perl check_evidence.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -i gene:ONIVA01G50800.cdna.fna
70 | 
71 | sleep 3
72 | 
73 | # match arbitrary sequences to computed pangene clusters
74 | perl match_cluster.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -s ../files/test_transcripts.fna -o test_transcripts.gmap.tsv 
75 | cat test_transcripts.gmap.tsv
76 | 
77 | sleep 3
78 | 
79 | # clean up
80 | rm -rf test_rice_pangenes
81 | 
82 | exit
83 | 


--------------------------------------------------------------------------------
/pangenes/bin/README.md:
--------------------------------------------------------------------------------
1 | This is where external binaries needed to analyze a pan-gene set are to be installed.
2 | They can be installed with:
3 | 
4 |     cd ../..
5 |     make install_pangenes
6 | 
7 | 


--------------------------------------------------------------------------------
/pangenes/check_quality.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | # This script takes a cDNA/CDS cluster produced by get_pangenes.pl and 
  4 | # produces a quality control report 
  5 | 
  6 | # Copyright [2023-25]
  7 | # EMBL-European Bioinformatics Institute & Estacion Experimental Aula Dei-CSIC
  8 | 
  9 | $|=1;
 10 | 
 11 | use strict;
 12 | use warnings;
 13 | use Getopt::Std;
 14 | use File::Temp qw/ tempfile /;
 15 | use FindBin '$Bin';
 16 | use lib "$Bin/lib";
 17 | use pangeneTools qw( check_installed_features feature_is_installed 
 18 |                      parse_sequence_FASTA_file calc_stdev calc_mode );
 19 | 
 20 | my @FEATURES2CHECK = (
 21 |   'EXE_CLUSTALO', 'EXE_ALISTAT', 'EXE_GREP'
 22 | );
 23 | 
 24 | my ($INP_dir, $INP_clusterfile, $INP_first_isof, $INP_noheader, $INP_outdir) = ('','',0,0,'');
 25 | my ($isCDS, $ispep, $seq, $n_isof, $occup, $SE_len, $SE_exons) = ( 0, 0 );
 26 | my ($updir, $n_exons, $gff_file, $SE_dist, $max_dist, $c);
 27 | my ($cluster_list_file,$cluster_folder, $gene_id, $isof_id);
 28 | my ($msa_filename, $dist_filename, $fhmsa, $fhdist, $cmd);
 29 | my ($sites, $Ca, $Cr_max, $Cr_min, $Cc_max, $Cc_min, $Cij_max, $Cij_min);
 30 | my (%opts, %isof_len, %isof_seq, %isof_header, %isof_order);
 31 | my (%taxa, @len, @exons, @dist, @mode_len, @mode_exons, @mode_dist);
 32 | 
 33 | getopts('hnIco:d:i:', \%opts);
 34 | 
 35 | if(($opts{'h'})||(scalar(keys(%opts))==0))
 36 | {
 37 |   print "\nusage: $0 [options]\n\n";
 38 |   print "-h this message\n";
 39 |   print "-c print credits and checks installation\n";
 40 |   print "-d directory produced by get_pangenes.pl        (example: -d /path/data_pangenes/..._algMmap_,\n";
 41 |   print "                                                 genomic and gff files usually one folder up)\n";
 42 |   print "-i cdna/cds .fna/.faa file as in .cluster_list  (example: -i gene:ONIVA01G52180.cdna.fna)\n";
 43 |   print "-I take 1st isoform only                        (optional, by default takes all)\n";
 44 |   print "-o folder to write output files                 (optional, MSA files removed by default)\n";
 45 |   print "-n do not print header in text report           (optional, by default the following header is added:\n";
 46 |   print "                                                 file 1stisof occup seqs mode_len SE_len mode_exons\n";
 47 |   print "                                                 SE_exons mode_dist max_dist SE_dist sites Ca Cr_max\n";
 48 |   print "                                                 Cr_min Cc_max Cc_min Cij_max Cij_min)\n";
 49 |   exit(0);
 50 | }
 51 | 
 52 | if(defined($opts{'c'})) {
 53 |   print "\nPrimary citation:\n https://doi.org/10.1186/s13059-023-03071-z\n";
 54 |   print "\nThis software uses external algorithms, please cite them accordingly:\n";
 55 |   print " clustal-omega https://doi.org/10.1002%2Fpro.3290\n";
 56 |   print " AliStat https://doi.org/10.1093/nargab/lqaa024\n";
 57 | 
 58 |   # check all binaries needed by this program and print diagnostic info
 59 |   print check_installed_features(@FEATURES2CHECK);
 60 |   exit(0);
 61 | }
 62 | 
 63 | if(defined($opts{'d'})) { 
 64 |   $INP_dir = $opts{'d'};
 65 |   $updir = $INP_dir . '/..'; 
 66 | }
 67 | else{ die "# EXIT : need a -d directory\n" }
 68 | 
 69 | if(defined($opts{'i'})){  
 70 |   $INP_clusterfile = $opts{'i'};
 71 |   if($INP_clusterfile !~ /\.cdna\.fna$/ && 
 72 |     $INP_clusterfile !~ /\.cds\.fna$/ &&
 73 |     $INP_clusterfile !~ /\.cds\.faa$/) {
 74 |     die "# EXIT : need a .fna/.faa cluster filename with parameter -i\n"
 75 | 
 76 |   } else {
 77 |     if($INP_clusterfile =~ /\.cds\.f/) {
 78 |       $isCDS = 1
 79 |     }
 80 | 
 81 |     if($INP_clusterfile =~ /\.cds\.faa/) { 
 82 |       $ispep = 1
 83 |     }
 84 |   }
 85 | }
 86 | else{ die "# EXIT : need parameter -i\n" }
 87 | 
 88 | if(defined($opts{'I'})){ 
 89 |   $INP_first_isof = 1 
 90 | }
 91 | 
 92 | if(defined($opts{'o'})){
 93 |   $INP_outdir = $opts{'o'};
 94 |   if(!-e $INP_outdir) {
 95 |     mkdir($INP_outdir);
 96 |   }
 97 | }
 98 | 
 99 | if(defined($opts{'n'})){
100 |   $INP_noheader = 1
101 | }
102 | 
103 | # 1) locate .cluster_list file to check clusterfile is there
104 | opendir(INPDIR,$INP_dir) || 
105 |   die "# ERROR: cannot list $INP_dir , please check -d argument is a valid folder\n";
106 | my @files = grep {/\.cluster_list/} readdir(INPDIR);
107 | closedir(INPDIR);
108 | 
109 | if(@files) {
110 |   $cluster_list_file = $files[0];
111 |   $cluster_folder = (split(/\.cluster_list/,$cluster_list_file))[0]
112 | } else {
113 |   die "# ERROR: cannot find .cluster_list file in $INP_dir\n";
114 | }
115 | 
116 | my $clusternameOK = 0;
117 | open(LIST,"<","$INP_dir/$cluster_list_file") ||
118 |   die "# ERROR: cannot read $INP_dir/$cluster_list_file, ".
119 |     "please check -d argument is a valid folder\n";
120 | # file name is matched literally, as it contains regex metacharacters ('.')
121 | while(<LIST>) {
122 |   if(/\Q$INP_clusterfile\E/) {
123 |     $clusternameOK = 1;
124 |   }
125 | }
126 | close(LIST);
127 | 
128 | if($clusternameOK == 0) {
129 |   die "# ERROR: cannot find $INP_clusterfile in $INP_dir/$cluster_list_file, please correct\n";
130 | }
131 | 
132 | # 2) parse FASTA file, extract gene names and sequence lengths
133 | my ( $ref_geneid, $ref_fasta, $ref_isof_coords, $ref_taxon ) = 
134 |   parse_sequence_FASTA_file( "$INP_dir/$cluster_folder/$INP_clusterfile" , 1);
135 | 
136 | foreach $gene_id (@$ref_geneid) {
137 | 
138 |   $n_isof = 0;
139 |   foreach $seq (split(/\n/,$ref_fasta->{$gene_id})) {
140 | 
141 |     if($seq =~ /^>(\S+)/) {
142 |       $n_isof++;
143 |       $isof_id = $1;
144 |       $isof_header{$gene_id}{$isof_id} = $seq;
145 |       $isof_order{$gene_id}{$isof_id} = $n_isof;
146 |       next;
147 |     }
148 |     $isof_len{$gene_id}{$isof_id} += length($seq);
149 |     $isof_seq{$gene_id}{$isof_id} .= $seq;
150 |   }
151 | }
152 | 
153 | # 3) print selected isoform sequence(s) to temp file and work out basic stats 
154 | my ($fh, $filename) = tempfile( 'tempfasXXXXX', UNLINK => 1);
155 | 
156 | foreach $gene_id (@$ref_geneid) {
157 |   foreach $isof_id (keys(%{$isof_len{$gene_id}})) {
158 | 
159 |     next if($INP_first_isof == 1 && $isof_order{$gene_id}{$isof_id} != 1);
160 |     
161 |     $taxa{ $ref_taxon->{$gene_id} }++;
162 |     push(@len, $isof_len{$gene_id}{$isof_id});
163 | 
164 |     # find GFF file & get number of exons
165 |     $n_exons = 0;
166 |     $gff_file = $updir . "/_$ref_taxon->{$gene_id}.gff";
167 | 
168 |     # not always there (perhaps only results folder present), must have content
169 |     if(-s $gff_file) {
170 |       open(GREP, "$ENV{'EXE_GREP'} '$isof_id;' $gff_file |") ||
171 |         die "# ERROR: cannot run $ENV{'EXE_GREP'} on $gff_file\n";
172 |       while(<GREP>) {
173 |         #1	NAM	exon	2575663	2575953 ...
174 |         my @data = split(/\t/,$_);
175 |         if($isCDS == 0 && $data[2] eq 'exon') {
176 |           $n_exons++
177 | 
178 |         } elsif($isCDS == 1 && $data[2] eq 'CDS') {
179 |           $n_exons++
180 |         }
181 |       }
182 |       close(GREP);  
183 |     }
184 | 
185 |     push(@exons, $n_exons);
186 | 
187 |     # actually print to temp file
188 |     print $fh "$isof_header{$gene_id}{$isof_id}\n$isof_seq{$gene_id}{$isof_id}\n";
189 |   }
190 | }
191 | 
192 | $occup = scalar(keys(%taxa)); # number of distinct taxa among selected isoforms
193 | $n_isof = scalar(@len); # recompute in case only 1st isoform taken
194 | # SE_* values are computed as stdev / sqrt(number of selected isoforms)
195 | $SE_len = sprintf("%1.1f", calc_stdev( \@len ) / sqrt($n_isof));
196 | @mode_len = calc_mode( \@len ); 
197 | $SE_exons = sprintf("%1.1f", calc_stdev( \@exons ) / sqrt($n_isof));
198 | @mode_exons = calc_mode( \@exons );
199 | 
200 | 
201 | # 4) compute multiple sequence alignment (MSA), distance matrix & MSA report
202 | # outputs are named after the cluster file in $INP_outdir, else are temp files
203 | if($INP_outdir ne '') {
204 |   $msa_filename = "$INP_outdir/$INP_clusterfile";
205 |   $msa_filename =~ s/\.(f[na]a)$/.aln.$1/; # x.fna -> x.aln.fna , x.faa -> x.aln.faa
206 |   $dist_filename = "$INP_outdir/$INP_clusterfile";
207 |   $dist_filename =~ s/\.(f[na]a)$/.dist.$1/; # x.fna -> x.dist.fna
208 | } else {
209 |   ($fhmsa, $msa_filename) = tempfile( 'tempmsaXXXXX', UNLINK => 1);
210 |   ($fhdist, $dist_filename) = tempfile( 'tempdistXXXXX', UNLINK => 1);
211 | }
212 | # aligner reads $filename and writes both the MSA and the distance matrix
213 | $cmd = "$ENV{'EXE_CLUSTALO'} --force --full -i $filename -o $msa_filename --distmat-out=$dist_filename 2>&1";
214 | system($cmd); 
215 | if ( $? != 0 ) {
216 |   die "# ERROR: failed running clustal-omega ($cmd)\n";
217 | } elsif ( !-s $msa_filename ) {
218 |   die "# ERROR: failed generating $msa_filename file ($cmd)\n";
219 | }
220 | 
221 | # parse MSA distances
222 | $max_dist = -1;
223 | open(DIST,"<",$dist_filename) ||
224 |   die "# ERROR: cannot read $dist_filename\n";
225 | while(<DIST>) {
226 |   chomp;
227 |   my @data = split(/\s+/,$_);
228 |   next if($#data < 1); # skip lines without distance columns
229 |   foreach $c (1 .. $#data) { # column 0 is skipped, the rest are distances
230 |     push(@dist, $data[$c]);
231 |     if($data[$c] > $max_dist){ $max_dist = $data[$c] } 
232 |   }  
233 | }
234 | close(DIST);
235 | # keep the 6 decimals the summary prints for SE_dist instead of rounding to 1
236 | $SE_dist = sprintf("%1.6f", calc_stdev( \@dist ) / $n_isof);
237 | @mode_dist = calc_mode( \@dist ); 
238 | 
239 | # MSA report
240 | $cmd = "$ENV{'EXE_ALISTAT'} $msa_filename 1 -b";
241 | if($ispep) {
242 |   $cmd = "$ENV{'EXE_ALISTAT'} $msa_filename 6 -b";
243 | }
244 | # only the output line starting with the MSA file name carries the stats
245 | open(ALISTAT,"$cmd |") ||
246 |   die "# ERROR: cannot run $cmd\n";
247 | while(<ALISTAT>) {
248 |   #sequences, #sites, Ca, Cr_max, Cr_min, Cc_max, Cc_min, Cij_max, Cij_min
249 |   if(/^\Q$msa_filename\E/) { # literal match, path may hold regex metacharacters
250 |     chomp;
251 |     my @data = split(/,\s+/,$_);
252 |     ($sites, $Ca, $Cr_max, $Cr_min, $Cc_max, $Cc_min, $Cij_max, $Cij_min) = @data[2 .. $#data];
253 |   }
254 | }
255 | close(ALISTAT);
256 | 
257 | 
258 | # 5) finally print summary in one line
259 | if($INP_noheader == 0) { # header is skipped when option -n was passed
260 |   print "file\t1stisof\toccup\tseqs\tmode_len\tSE_len\tmode_exons\tSE_exons\t" .
261 |     "mode_dist\tmax_dist\tSE_dist\tsites\tCa\tCr_max\tCr_min\tCc_max\tCc_min\tCij_max\tCij_min\n";
262 | }
263 | # values below follow the column order of the header; only the first mode is printed
264 | printf(
265 |   "%s\t%d\t%d\t%d\t%d\t%1.1f\t" .
266 |     "%d\t%1.1f\t%1.6f\t%1.6f\t%1.6f\t%d\t" .
267 |     "%1.6f\t%1.6f\t%1.6f\t%1.6f\t%1.6f\t%1.6f\t%1.6f\n",
268 | 
269 |   $INP_clusterfile,
270 |   $INP_first_isof,  
271 |   $occup,
272 |   $n_isof,
273 |   $mode_len[0],
274 |   $SE_len,
275 | 
276 |   $mode_exons[0],
277 |   $SE_exons,
278 |   $mode_dist[0],
279 |   $max_dist,
280 |   $SE_dist,
281 |   $sites, 
282 | 
283 |   $Ca, 
284 |   $Cr_max, 
285 |   $Cr_min, 
286 |   $Cc_max, 
287 |   $Cc_min, 
288 |   $Cij_max, 
289 |   $Cij_min);
290 | 


--------------------------------------------------------------------------------
/pangenes/cpanfile:
--------------------------------------------------------------------------------
 1 | requires 'FindBin';
 2 | requires 'Compress::Zlib';
 3 | requires 'Cwd';
 4 | requires 'Fcntl';
 5 | requires 'File::Basename';
 6 | requires 'File::Copy';
 7 | requires 'File::Temp';
 8 | requires 'Getopt::Long';
 9 | requires 'Getopt::Std';
10 | requires 'DB_File';
11 | 


--------------------------------------------------------------------------------
/pangenes/lib/HPCluster.pm:
--------------------------------------------------------------------------------
  1 | package HPCluster;
  2 | 
  3 | # Package to manage cluster jobs from get_pangenes.pl
  4 | 
  5 | # Currently supports SGE, SLURM and LSF clusters, 
  6 | # but it should not be too difficult to add support for other systems (help welcome)
  7 | 
  8 | # taken from https://github.com/eead-csic-compbio
  9 | 
 10 | use strict;
 11 | require Exporter;
 12 | 
 13 | our @ISA = qw( Exporter );
 14 | our @EXPORT = qw( 
 15 |   read_cluster_config print_cluster_config cluster_is_available 
 16 |   submit_cluster_job check_cluster_jobs
 17 | );
 18 | 
 19 | # key cluster management binaries
 20 | my @CLBINS = ('SUBEXE','CHKEXE','DELEXE');
 21 | 
 22 | # Default SGE cluster configuration options
 23 | # can be overridden by custom config file 
 24 | my %CLUSTER_CONF;
 25 | $CLUSTER_CONF{'PATH'}   = '';     # should end with /
 26 | $CLUSTER_CONF{'TYPE'}   = 'sge';
 27 | $CLUSTER_CONF{'SUBEXE'} = 'qsub';
 28 | $CLUSTER_CONF{'CHKEXE'} = 'qstat';
 29 | $CLUSTER_CONF{'DELEXE'} = 'qdel';
 30 | $CLUSTER_CONF{'ERROR'}  = 'Eqw';  # state of failed jobs 
 31 | $CLUSTER_CONF{'QARGS'}  = '';     # queue name, resources, etc
 32 | $CLUSTER_CONF{'STIME'}  = 1;      # interval in seconds between sub commands
 33 | $CLUSTER_CONF{'CTIME'}  = 30;     # interval in seconds between stat commands
 34 | 
 35 | # check the HPC.conf.sample* files for suggested LSF & slurm parameters
 36 | 
 37 | # Loads an optional cluster config file into %CLUSTER_CONF ('KEY value'
 38 | # lines, '#' lines skipped); prints an INFO message if it cannot be opened.
 39 | # input: 1 (string) full path to optional config file
 40 | sub read_cluster_config {
 41 |   my ($config_file) = @_;
 42 |   if(!open(CONF,"<",$config_file)) {
 43 |     return print "# INFO: no cluster config file\n\n";
 44 |   }
 45 |   # every non-comment 'KEY value' line overrides the default for KEY
 46 |   while(<CONF>) {
 47 |     next if(/^#/);
 48 |     chomp;
 49 |     $CLUSTER_CONF{$1} = $2 if(/^(\S+)\s+([^\n]+)/);
 50 |   }
 51 |   return close(CONF);
 52 | }
 53 | 
 54 | # Dumps the active cluster settings to stdout, one "# KEY<tab>value" line
 55 | # per key in sorted order, followed by an empty line
 56 | sub print_cluster_config {
 57 |   my @param_names = sort keys(%CLUSTER_CONF);
 58 |   print "# $_\t$CLUSTER_CONF{$_}\n" for @param_names;
 59 |   print "\n";
 60 | }
 61 | 
 62 | # Returns 1 when every cluster management binary listed in @CLBINS is
 63 | # found by system 'which' and answers '-help' with usage-like text;
 64 | # otherwise prints an error to stdout and returns 0
 65 | sub cluster_is_available {
 66 |   foreach my $bin (@CLBINS) {
 67 |     # full command is the configured PATH prefix plus the binary name
 68 |     my $exe = $CLUSTER_CONF{"PATH"} . $CLUSTER_CONF{$bin};
 69 |     # step 1: the binary must be located by 'which'
 70 |     my $located = `which $exe`;
 71 |     chomp($located);
 72 |     if($located eq '' || $located =~ /no $CLUSTER_CONF{$bin} in/) {
 73 |       print "# ERROR: cannot find cluster binary $bin\n\n";
 74 |       return 0;
 75 |     }
 76 |     # step 2: its -help output must look like a usage message
 77 |     my $help = `$exe -help 2>&1`;
 78 |     next if($help &&
 79 |       ($help =~ /usage:/i || $help =~ /use/i || $help =~ /invalid option/));
 80 |     print "# ERROR: wrong cluster binary $bin\n";
 81 |     print "$exe -help\n";
 82 |     return 0;
 83 |   }
 84 | 
 85 |   return 1;
 86 | }
 87 | 
 88 | # Submits a cluster job, stores the assigned job id in the job hash and waits STIME seconds
 89 | # input:
 90 | # 1 (string) job name
 91 | # 2 (string) command to be run
 92 | # 3 (string) name of output file
 93 | # 4 (string) name of work directory
 94 | # 5 reference to cluster job hash
 95 | sub submit_cluster_job {
 96 |   my ($jobname,$command,$outfile,$dir,$ref_cluster_PIDs) = @_;
 97 |   # submission output / parsed job id, and scheduler-specific options
 98 |   my ($qPID,$qsubcommand) = ('','');
 99 | 
100 |   if($CLUSTER_CONF{'TYPE'} eq 'lsf') {
101 |     $qsubcommand = " -J n$jobname -o $outfile ";
102 |   } elsif($CLUSTER_CONF{'TYPE'} eq 'slurm') {
103 |     $qsubcommand = " --job-name=n$jobname -o $outfile ";
104 |   }
105 | 
106 |   # other cluster management types could be added here with elsif
107 |   else { # default SGE 
108 |     $qsubcommand = " -N n$jobname -j y -o $outfile -S /bin/bash";
109 |   }
110 |   # the job script is fed to the submit command as a here-document
111 |   $qPID = `$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'SUBEXE'} $CLUSTER_CONF{'QARGS'} $qsubcommand <<EOF
112 | #!/bin/env bash
113 | cd $dir
114 | $command
115 | EOF`;
116 | 
117 |   # parse job id from the submission message; the raw output is kept if no format matches
118 |   if($qPID =~ /^(\d+)\./ || 
119 |     $qPID =~ /^Your job (\d+)/ ||
120 |     $qPID =~ /^Job <(\d+)> is submitted/ ||
121 |     $qPID =~ /^Submitted batch job (\d+)/ ){ $qPID = $1 } 
122 | 
123 |   # save job details associated to process id
124 |   $ref_cluster_PIDs->{$qPID}{'command'} = $qsubcommand; # options only, reused when resubmitting
125 |   $ref_cluster_PIDs->{$qPID}{'executable'} = $command;
126 |   $ref_cluster_PIDs->{$qPID}{'status'} = 'sent';
127 | 
128 |   # sleep to avoid overloading
129 |   sleep($CLUSTER_CONF{'STIME'});
130 | }
131 | 
132 | # Checks status of cluster jobs until none is listed by CHKEXE, printing
133 | # messages to stdout; jobs in ERROR state are resubmitted and then deleted
134 | # input: 1 (string) name of work directory
135 | #        2 reference to cluster job hash
136 | sub check_cluster_jobs {
137 |   my ($dir,$ref_PIDs) = @_;
138 |   my ($waiting,$qPID,$newqPID,$qout) = (1);
139 | 
140 |   while($waiting) {
141 |     $waiting=0;
142 |     foreach $qPID (sort {$a<=>$b} (keys(%$ref_PIDs))) {
143 |       # jobs already replaced by a resubmission are not polled again
144 |       next if($ref_PIDs->{$qPID}{'status'} eq 'deleted');
145 | 
146 |       # get status of this job
147 |       $qout = `$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'CHKEXE'} | grep $qPID`; 
148 |       if($qout){
149 |         if($qout =~ /\s+$CLUSTER_CONF{'ERROR'}\s+/) {
150 |           # resubmit failed jobs
151 |           $newqPID = `$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'SUBEXE'} $CLUSTER_CONF{'QARGS'} $ref_PIDs->{$qPID}{'command'} <<EOF
152 |           cd $dir
153 |           $ref_PIDs->{$qPID}{'executable'}
154 | EOF`; 
155 |           # parse the new job id, accepting the same formats as submit_cluster_job
156 |           if($newqPID =~ /^(\d+)\./ || $newqPID =~ /^Your job (\d+)/ ||
157 |             $newqPID =~ /^Job <(\d+)> is submitted/ ||
158 |             $newqPID =~ /^Submitted batch job (\d+)/){ $newqPID = $1 }
159 |           $ref_PIDs->{$newqPID}{'command'} = $ref_PIDs->{$qPID}{'command'};
160 |           $ref_PIDs->{$newqPID}{'executable'} = $ref_PIDs->{$qPID}{'executable'};
161 |           $ref_PIDs->{$newqPID}{'status'} = 'sent';
162 |           sleep($CLUSTER_CONF{'STIME'});
163 | 
164 |           # remove failed job
165 |           system("$CLUSTER_CONF{'PATH'}$CLUSTER_CONF{'DELEXE'} $qPID");
166 |           $ref_PIDs->{$qPID}{'status'} = 'deleted';
167 |           print "# check_cluster_jobs: deleted job $qPID , resubmitted as $newqPID\n";
168 |           $waiting++;
169 |         }
170 |         else{ $waiting++; last; }
171 |       }
172 |     }
173 |     if($waiting) {
174 |       print "# check_cluster_jobs: waiting ...\n";
175 |       sleep($CLUSTER_CONF{'CTIME'});
176 |     }
177 |   }
178 | }
179 | 
180 | 1;
181 | 


--------------------------------------------------------------------------------
/pangenes/pics/collinear_pangenes_minimap2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/collinear_pangenes_minimap2.png


--------------------------------------------------------------------------------
/pangenes/pics/fixing_genemodels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/fixing_genemodels.png


--------------------------------------------------------------------------------
/pangenes/pics/flow-check-evidence.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-check-evidence.dia


--------------------------------------------------------------------------------
/pangenes/pics/flow-check-evidence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-check-evidence.png


--------------------------------------------------------------------------------
/pangenes/pics/flow-get-pangenes.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-get-pangenes.dia


--------------------------------------------------------------------------------
/pangenes/pics/flow-get-pangenes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/flow-get-pangenes.png


--------------------------------------------------------------------------------
/pangenes/pics/long_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/long_model.png


--------------------------------------------------------------------------------
/pangenes/pics/pairs2clusters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/pairs2clusters.png


--------------------------------------------------------------------------------
/pangenes/pics/pangene_set_nomenclature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/pangene_set_nomenclature.png


--------------------------------------------------------------------------------
/pangenes/pics/pangenesPAG2023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/pangenesPAG2023.pdf


--------------------------------------------------------------------------------
/pangenes/pics/wgaoverlap.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/wgaoverlap.dia


--------------------------------------------------------------------------------
/pangenes/pics/wgaoverlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/pics/wgaoverlap.png


--------------------------------------------------------------------------------
/pangenes/plots/core_gene.tab_core_both.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/core_gene.tab_core_both.png


--------------------------------------------------------------------------------
/pangenes/plots/dotplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/dotplot.png


--------------------------------------------------------------------------------
/pangenes/plots/haplotypes.trimmed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/haplotypes.trimmed.png


--------------------------------------------------------------------------------
/pangenes/plots/pan_gene.tab_pan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/pan_gene.tab_pan.png


--------------------------------------------------------------------------------
/pangenes/plots/pangene_context.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/pangene_context.png


--------------------------------------------------------------------------------
/pangenes/plots/pangene_matrix__shell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/plant-scripts/852b5cd5c54bc1d0611cd9afcd8864af9d28bec7/pangenes/plots/pangene_matrix__shell.png


--------------------------------------------------------------------------------
/phylogenomics/Oryza.log:
--------------------------------------------------------------------------------
 1 | 
 2 | # WARNING : folder 'Oryza' exists, files might be overwritten
 3 | 
 4 | # pangene_analysis.pl -d Plants -c Oryza -r oryza_sativa -o  -f Oryza -t protein -G 0 -W 0 -L 0 -S 0
 5 | 
 6 | # supported species in NCBI taxon Oryza : 11
 7 | 
 8 | # total selected species : 11
 9 | 
10 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_sativa.tsv.gz
11 | 
12 | # re-using Oryza_sativa.IRGSP-1.0.pep.all.fa.gz
13 | 
14 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_indica.tsv.gz
15 | 
16 | # re-using Oryza_indica.ASM465v1.pep.all.fa.gz
17 | 
18 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_brachyantha.tsv.gz
19 | 
20 | # re-using Oryza_brachyantha.Oryza_brachyantha.v1.4b.pep.all.fa.gz
21 | 
22 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_meridionalis.tsv.gz
23 | 
24 | # re-using Oryza_meridionalis.Oryza_meridionalis_v1.3.pep.all.fa.gz
25 | 
26 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_glaberrima.tsv.gz
27 | 
28 | # re-using Oryza_glaberrima.Oryza_glaberrima_V1.pep.all.fa.gz
29 | 
30 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_punctata.tsv.gz
31 | 
32 | # re-using Oryza_punctata.Oryza_punctata_v1.2.pep.all.fa.gz
33 | 
34 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_longistaminata.tsv.gz
35 | 
36 | # re-using Oryza_longistaminata.O_longistaminata_v1.0.pep.all.fa.gz
37 | 
38 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_barthii.tsv.gz
39 | 
40 | # re-using Oryza_barthii.O.barthii_v1.pep.all.fa.gz
41 | 
42 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_nivara.tsv.gz
43 | 
44 | # re-using Oryza_nivara.Oryza_nivara_v1.0.pep.all.fa.gz
45 | 
46 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_rufipogon.tsv.gz
47 | 
48 | # re-using Oryza_rufipogon.OR_W1943.pep.all.fa.gz
49 | 
50 | # re-using plant_tools/compara/downloads/Compara.98.protein_default.homologies.oryza_glumipatula.tsv.gz
51 | 
52 | # re-using Oryza_glumipatula.Oryza_glumaepatula_v1.5.pep.all.fa.gz
53 | 
54 | # oryza_sativa : sequences = 35775 clusters = 35419 (singletons = 8371)
55 | # oryza_indica : sequences = 40745 clusters = 39590 (singletons = 7645)
56 | # oryza_brachyantha : sequences = 32037 clusters = 31856 (singletons = 10421)
57 | # oryza_meridionalis : sequences = 29308 clusters = 28659 (singletons = 4605)
58 | # oryza_glaberrima : sequences = 33164 clusters = 31710 (singletons = 4051)
59 | # oryza_punctata : sequences = 31762 clusters = 30673 (singletons = 6246)
60 | # oryza_longistaminata : sequences = 31686 clusters = 30034 (singletons = 7460)
61 | # oryza_barthii : sequences = 34575 clusters = 33903 (singletons = 3263)
62 | # oryza_nivara : sequences = 36313 clusters = 35918 (singletons = 3387)
63 | # oryza_rufipogon : sequences = 37071 clusters = 36716 (singletons = 3837)
64 | # oryza_glumipatula : sequences = 35735 clusters = 35231 (singletons = 4249)
65 | 
66 | # total sequences = 378171
67 | 
68 | # number_of_clusters = 110289 (core = 8052)
69 | 
70 | # cluster_list = Oryza/oryzasativa_Oryza_algEnsemblCompara.cluster_list
71 | # cluster_directory = Oryza/oryzasativa_Oryza_algEnsemblCompara
72 | 
73 | # percent_conserved_proteins_file = Oryza/POCP.matrix.tab
74 | 
75 | # pangenome_file = Oryza/pangenome_matrix.tab tranposed = Oryza/pangenome_matrix.tr.tab
76 | # pangenome_genes = Oryza/pangenome_matrix_genes.tab transposed = Oryza/pangenome_matrix_genes.tr.tab
77 | # pangenome_FASTA_file = Oryza/pangenome_matrix.fasta
78 | 
79 | # genome composition report (samples=10,seed=12345)
80 | ## sample 0 (oryza_sativa | 0,1,2,3,4,5,6,7,8,9,10,)
81 | ## sample 1 (oryza_meridionalis | 3,2,4,8,10,0,6,9,1,7,5,)
82 | ## sample 2 (oryza_meridionalis | 3,0,1,9,4,2,6,8,7,5,10,)
83 | ## sample 3 (oryza_barthii | 7,10,2,8,6,5,3,9,1,0,4,)
84 | ## sample 4 (oryza_indica | 1,4,2,6,7,3,0,10,8,9,5,)
85 | ## sample 5 (oryza_sativa | 0,6,3,5,10,4,8,7,2,1,9,)
86 | ## sample 6 (oryza_barthii | 7,8,4,1,9,5,0,10,2,6,3,)
87 | ## sample 7 (oryza_glaberrima | 4,10,3,9,8,1,7,6,2,0,5,)
88 | ## sample 8 (oryza_indica | 1,0,2,3,6,8,10,9,5,4,7,)
89 | ## sample 9 (oryza_glaberrima | 4,6,1,5,0,2,10,9,8,7,3,)
90 | 
91 | # pan-gene (number of clusters) = Oryza/pan_gene.tab
92 | # core-gene (number of clusters) = Oryza/core_gene.tab
93 | 
94 | # runtime: 146 wallclock secs (79.95 usr  4.76 sys + 29.57 cusr  1.52 csys = 115.80 CPU)
95 | 


--------------------------------------------------------------------------------
/phylogenomics/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Plant phylogenomics scripts
 3 | 
 4 | These scripts interrogate Ensembl Plants through [REST endpoints](https://rest.ensembl.org) 
 5 | and the FTP site to export data that might be useful for phylogenomic and pan-gene set studies.
 6 | 
 7 | These scripts were tested at the 
 8 | [CABANA workshop: Analysis of crop genomics data ](http://training.ensembl.org/events/2021/2021-03-01-CABANA).
 9 | 
10 | ## Documentation and examples
11 | 
12 | Run any of the scripts with argument -h to get instructions and examples.
13 | 
14 | ## Dependencies
15 | 
16 | The following dependencies can be installed in the parent folder with:
17 | 
18 |     make install_REST
19 | 
20 | The scripts require the following non-core Perl modules:
21 | * [HTTP::Tiny](https://metacpan.org/release/HTTP-Tiny)
22 | * [JSON](https://metacpan.org/release/JSON)
23 | * [DBI](https://metacpan.org/pod/DBI)
24 | * [DBD::mysql](https://metacpan.org/pod/DBD::mysql)
25 | 
26 | which can be installed with: 
27 | ```
28 | # install cpanminus installer, check more options at https://metacpan.org/pod/App::cpanminus
29 | sudo cpan -i App::cpanminus  
30 | 
31 | # actually install modules
32 | sudo apt-get install -y mysql-client libmysqlclient-dev
33 | cpanm JSON JSON::XS HTTP::Tiny DBI DBD::mysql
34 | ```
35 | 
36 | In addition the scripts import module [PlantCompUtils.pm](./PlantCompUtils.pm), 
37 | which is included in this folder.
38 | 
39 | 
40 | ### ens_single-copy_core_genes.pl
41 | 
42 | This script can be used to obtain single-copy core genes present within a clade.
43 | Example calls include:
44 | 
45 | ```
46 | perl ens_single-copy_core_genes.pl -c Brassicaceae -f Brassicaceae
47 | perl ens_single-copy_core_genes.pl -c Brassicaceae -f Brassicaceae -t cdna -o beta_vulgaris
48 | perl ens_single-copy_core_genes.pl -f poaceae -c 4479 -r oryza_sativa -WGA 75
49 | perl ens_single-copy_core_genes.pl -f all -c 33090 -m all -r physcomitrium_patens
50 | ```
51 | 
52 | Note option -f produces FASTA files of aligned peptide sequences, one per cluster. 
53 | Such a task usually takes over an hour over the Ensembl REST API.
54 | 
55 | 
56 | ### ens_syntelogs.pl
57 | 
58 | This script is related to [ens_single-copy_core_genes.pl](ens_single-copy_core_genes.pl) but explicitly considers only orthogroups with Gene Order Conservation (GOC) score >= 75 by default. The output matrix also contains the genomic coordinates of genes of the reference genome:
59 | 
60 | ```
61 | perl ens_syntelogs.pl -c Brassicaceae -f Brassicaceae
62 | 
63 | ```
64 | 
65 | A sample output matrix is available in [Brassicaceae.syntelogs.GOC75.tsv](./bench/Brassicaceae.syntelogs.GOC75.tsv). 
66 | A benchmark is described in <https://github.com/Ensembl/plant_tools/tree/master/bench/synthelogs>.
67 | 
68 | Note option -f produces FASTA files of aligned peptide sequences, one per cluster. 
69 | Such a task usually takes over an hour over the Ensembl REST API.
70 | 
71 | WARNING: not all species are included in the Compara gene-tree analysis. You can exclude them with -i.
72 | 
73 | ### ens_sequences.pl
74 | 
75 | Produces a FASTA file with the canonical cds/pep sequences of species in a clade in Ensembl Plants:
76 | ```
77 | perl ens_sequences.pl -c Brassicaceae -f Brassicaceae.fna
78 | 
79 | ```
80 | 
81 | 
82 | ### ens_pangene_analysis.pl
83 | 
84 | This was a prototype which was eventually replaced by the scripts at 
85 | [pangenes](https://github.com/Ensembl/plant-scripts/tree/master/pangenes).
86 | 
87 | 


--------------------------------------------------------------------------------
/phylogenomics/TODO.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | 1) work out a stable id from cluster content, so that it is reasonably stable
 3 | 
 4 | 1.1) IDEA 1
 5 | 1.1.1) produce a list of ordered core markers/genes along chrs -> how to be sure they will be core in the future?
 6 | 1.1.2) for each cluster, find the closest marker both sides and define interval
 7 | 1.1.3) number clusters per interval
 8 | 
 9 | 1.2) IDEA 2
10 | 1.2.1) compute N most frequent 21-mers in cluster
11 | 1.2.2) work out cluster id from kmer composition
12 | 
13 | 1.3) IDEA 3
14 | 1.3.1) Use previous clusters to compare to new and reuse stable ids as much as possible
15 | 


--------------------------------------------------------------------------------
/phylogenomics/downloads/README.txt:
--------------------------------------------------------------------------------
1 | 
2 | This folder is where Compara TSV and fasta compressed files are stored.
3 | 


--------------------------------------------------------------------------------
/phylogenomics/ens_sequences.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | use strict;
  3 | use warnings;
  4 | 
  5 | use Getopt::Long qw(:config no_ignore_case);
  6 | use Benchmark;
  7 | use HTTP::Tiny;
  8 | use JSON qw(decode_json);
  9 | use FindBin '$Bin';
 10 | use lib $Bin;
 11 | use PlantCompUtils qw(
 12 |   list_ensembl_mysql_dbs get_canonical_transcript_ids
 13 |   download_FASTA_file parse_isoform_FASTA_file perform_rest_action
 14 |   $REQUEST_COUNT $FASTADIR @DIVISIONS
 15 | );
 16 | 
 17 | # Downloads cds/pep sequences of species in a clade from Ensembl Plants.
 18 | # Uses canonical transcripts, used in the gene tree analysis,
 19 | # which usually are the longest translation with no stop codons
 20 | #
 21 | # Copyright [2019-2021] EMBL-European Bioinformatics Institute
 22 | 
 23 | # Ensembl Genomes
 24 | my $RESTURL   = 'http://rest.ensembl.org';
 25 | my $INFOPOINT = $RESTURL . '/info/genomes/division/';
 26 | my $TAXOPOINT = $RESTURL . '/info/genomes/taxonomy/';
 27 | 
 28 | my $downloadir = $Bin . '/downloads';
 29 | my $division   = 'Plants';
 30 | my $seqtype    = 'protein';
 31 | my $taxonid    = '';    # NCBI Taxonomy id Brassicaceae=3700
 32 |                         # Asterids=71274, Poaceae=4479
 33 | 
 34 | my ( $fastadir, $outfile, $out_genome ) = ( '', '', '' );
 35 | 
 36 | my ( $help, $sp, $line, $id, $show_supported, $request, $response );
 37 | my ( $filename, $dnafile, $pepfile, $seqfolder, $ext );
 38 | my ( @ignore_species, %ignore, %division_supported );
 39 | 
 40 | GetOptions(
 41 |     "help|?"       => \$help,
 42 |     "supported|l"  => \$show_supported,
 43 |     "division|d=s" => \$division,
 44 |     "clade|c=s"    => \$taxonid,
 45 |     "outgroup|o=s" => \$out_genome,
 46 |     "ignore|i=s"   => \@ignore_species,
 47 |     "type|t=s"     => \$seqtype,
 48 |     "outfile|f=s"  => \$outfile
 49 | ) || help_message();
 50 | 
 51 | sub help_message {    # prints usage, option list and an example call, then exits with status 0
 52 |     my @help_text = ( "\nusage: $0 [options]\n\n",
 53 |         "-c NCBI Taxonomy clade of interest (required, example: -c Brassicaceae or -c 3700)\n",
 54 |         "-f output FASTA file               (required, example: -f myfile.fasta)\n",
 55 |         "-l list supported species_names    (optional, example: -l)\n",
 56 |         "-d Ensembl division                (optional, default: -d $division)\n",
 57 |         "-o outgroup species_name           (optional, example: -o brachypodium_distachyon)\n",
 58 |         "-i ignore species_name(s)          (optional, example: -i selaginella_moellendorffii -i ...)\n",
 59 |         "-t sequence type [protein|cdna]    (optional, default: -t protein)\n\n",
 60 |         "Example calls:\n\n",
 61 |         " perl $0 -c Liliopsida -o arabidopsis_thaliana -f Liliopsoda.Atha.EG44.faa\n" );
 62 |     print join( '', @help_text );
 63 |     exit(0);
 64 | }
 65 | 
 66 | if ($help) { help_message() }
 67 | 
 68 | if ($division) {
 69 |     if ( !grep( /^$division$/, @PlantCompUtils::DIVISIONS ) ) {
 70 |         die "# ERROR: accepted values for division are: "
 71 |           . join( ',', @PlantCompUtils::DIVISIONS ) . "\n";
 72 |     }
 73 |     else {
 74 |         my $lcdiv = lc($division);
 75 |         $fastadir = $PlantCompUtils::FASTADIR;
 76 |         $fastadir =~ s/xxx/$lcdiv/;
 77 |     }
 78 | }
 79 | # validate command-line arguments and echo the effective call; failed checks end with a non-zero exit status
 80 | if ($show_supported) {
 81 |     print "# $0 -d $division -l \n\n";
 82 | }
 83 | else {
 84 | 
 85 |     if ( $taxonid eq '' ) {
 86 |         print "# ERROR: need a valid NCBI Taxonomy clade, ".
 87 |                "such as -c Brassicaceae or -c 3700\n\n";
 88 |         print "# Check https://www.ncbi.nlm.nih.gov/taxonomy\n";
 89 |         exit(1);    # an error must not look like success to the caller
 90 |     }
 91 |     else {
 92 |         $taxonid =~ s/\s+/%20/g;    # encode whitespace, the clade is appended to a REST URL
 93 |     }
 94 | 
 95 |     if (@ignore_species) {
 96 |         foreach my $sp (@ignore_species) {
 97 |             $ignore{$sp} = 1;
 98 |         }
 99 |         printf( "\n# ignored species : %d\n\n", scalar( keys(%ignore) ) );
100 |     }
101 | 
102 |     # the sequence type sets both $ext and $seqfolder, anything else is rejected
103 |     if ( $seqtype eq 'protein' ) {
104 |         $ext       = '.faa';
105 |         $seqfolder = 'pep';
106 |     }
107 |     elsif ( $seqtype eq 'cdna' ) {
108 |         $ext       = '.fna';
109 |         $seqfolder = 'cdna';
110 |     }
111 |     else {
112 |         die "# ERROR: accepted values for seqtype are: protein|cdna\n";
113 |     }
114 | 
115 |     # an output file name is mandatory
116 |     if ( !$outfile ) {
117 |         print "# ERROR: need a valid output file, such as -f Brassicaceae.fasta\n\n";
118 |         exit(1);
119 |     }
120 | 
121 |     print "# $0 -d $division -c $taxonid -o $out_genome -f $outfile -t $seqtype\n\n";
122 | }
123 | 
124 | my $start_time = new Benchmark();
125 | 
126 | # new object and params for REST requests
127 | my $http = HTTP::Tiny->new();
128 | my $global_headers = { 'Content-Type' => 'application/json' };
129 | $PlantCompUtils::REQUEST_COUNT = 0;
130 | 
131 | ## 0) check supported species in division 
132 | 
133 | $request = $INFOPOINT . "Ensembl$division?";
134 | 
135 | $response = perform_rest_action( $http, $request, $global_headers );
136 | my $infodump = decode_json($response);
137 | 
138 | foreach $sp ( @{$infodump} ) {
139 |     if ( $sp->{'has_peptide_compara'} ) {
140 |         $division_supported{ $sp->{'name'} } = 1;
141 |     }
142 | }
143 | 
144 | # list supported species and exit
145 | if ($show_supported) {
146 | 
147 |     foreach $sp ( sort( keys(%division_supported) ) ) {
148 |         print "$sp\n";
149 |     }
150 |     exit;
151 | }
152 | 
153 | # check outgroup is supported
154 | if ( $out_genome && !$division_supported{$out_genome} ) {
155 |     die "# ERROR: genome $out_genome is not supported\n";
156 | }
157 | 
158 | ## 1) check species in clade
159 | 
160 | my ( $n_of_species, $n_of_sequences ) = ( 0, 0 );
161 | my ( @supported_species, %supported );
162 | 
163 | $request = $TAXOPOINT . "$taxonid?";
164 | 
165 | $response = perform_rest_action( $http, $request, $global_headers );
166 | $infodump = decode_json($response);
167 | 
168 | foreach $sp ( @{$infodump} ) {
169 |     if ( $sp->{'name'} && $division_supported{ $sp->{'name'} } ) {
170 | 
171 |         next if ( $ignore{ $sp->{'name'} } );
172 | 
173 |         # add sorted clade species
174 |         $supported{ $sp->{'name'} } = 1;
175 |         push( @supported_species, $sp->{'name'} );
176 |     }
177 | }
178 | 
179 | printf( "# supported species in NCBI taxon %s : %d\n\n",
180 |     $taxonid, scalar(@supported_species) );
181 | 
182 | # add outgroup if required
183 | if ($out_genome) {
184 |     push( @supported_species, $out_genome );
185 |     $supported{$out_genome} = 1;
186 |     print "# outgenome: $out_genome\n";
187 | }
188 | 
189 | $n_of_species = scalar(@supported_species);
190 | print "# total selected species : $n_of_species\n\n";
191 | 
192 | ## 2) ask the public Ensembl server for its databases and
193 | ##    map each selected species to a matching core schema
194 | 
195 | my $ref_dbs = list_ensembl_mysql_dbs();
196 | my %species2db;
197 | 
198 | foreach $sp (@supported_species) {
199 | 
200 |     # when several schemas match, the last one in the list is kept
201 |     foreach my $db ( grep { /$sp\_core_\d+/ } @{$ref_dbs} ) {
202 |         $species2db{$sp} = $db;
203 |     }
204 | }
205 | 
206 | ## 3) get sequences for selected (plant) species
207 | 
208 | open( my $outfh, ">", $outfile ) || die "# ERROR: cannot create $outfile: $!\n";
209 | 
210 | # iteratively get and parse FASTA files
211 | foreach $sp (@supported_species) {
212 | 
213 |     # get list of canonical transcripts for this species
214 |     my $ref_canon_isofs = get_canonical_transcript_ids($species2db{$sp});
215 | 
216 |     printf("# %s canonical isoforms=%d\n", $sp,
217 |       scalar(keys(%$ref_canon_isofs)));
218 | 
219 |     # now get FASTA file and parse it, selected/longest isoforms are read
220 |     my $stored_sequence_file =
221 |       download_FASTA_file( $fastadir, "$sp/$seqfolder", $downloadir );
222 | 
223 |     my ( $ref_sequence, $ref_header ) =
224 |       parse_isoform_FASTA_file($stored_sequence_file, $ref_canon_isofs);
225 | 
226 |     foreach $id ( sort( keys(%$ref_sequence) ) ) {    # sorted, so the order is the same on every run
227 |         print $outfh ">$id $ref_header->{$id} [$sp]\n$ref_sequence->{$id}\n";
228 |         $n_of_sequences++;
229 |     }
230 | }
231 | 
232 | # buffered write errors (ie a full disk) only surface when closing
233 | close($outfh) || die "# ERROR: cannot close $outfile: $!\n";
234 | print "# created $outfile with $n_of_sequences sequences\n";
235 | 


--------------------------------------------------------------------------------
/phylogenomics/phylo_test.t:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | use Test::More tests => 1;
4 | my $stdout = eval{ `perl ens_single-copy_core_genes.pl -c Oryza -r oryza_sativa` }; # captured output of the script
5 | ok( $stdout =~ /# total single-copy core clusters/ , 'ens_single-copy_core_genes.pl' );
6 | 


--------------------------------------------------------------------------------
/recipes/exampleAPI.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | # Examples of queries to Ensembl Plants using the native Perl API
  4 | # Check the tutorials for more examples:
  5 | # https://m.ensembl.org/info/docs/api/core/core_tutorial.html
  6 | # https://www.ensembl.org/info/docs/api/compara/compara_tutorial.html
  7 | #
  8 | # Copyright [2017-21] EMBL-European Bioinformatics Institute
  9 | 
 10 | 
 11 | # Install the Ensembl Perl API and updated env as explained in
 12 | # http://www.ensembl.org/info/docs/api/api_installation.html
 13 | # http://www.ensembl.org/info/docs/api/api_git.html
 14 | # https://m.ensembl.org/info/docs/api/debug_installation_guide.html
 15 | # Note you might need some dependencies, such as libmysqlclient-dev
 16 | # and Perl modules DBI & DBD::mysql
 17 | 
 18 | 
 19 | ## A1) Load the Registry object with details of genomes available
 20 | 
 21 | use warnings;
 22 | use strict;
 23 | use List::Util qw(all);
 24 | use FindBin;
 25 | use lib "$FindBin::Bin/../lib/bioperl-live/";
 26 | use lib "$FindBin::Bin/../lib/ensembl/modules/";
 27 | use lib "$FindBin::Bin/../lib/ensembl-compara/modules/";
 28 | use lib "$FindBin::Bin/../lib/ensembl-variation/modules/";
 29 | use lib "$FindBin::Bin/../lib/ensembl-funcgen/modules/";
 30 | use lib "$FindBin::Bin/../lib/ensembl-metadata/modules/";
 31 | 
 32 | use Bio::EnsEMBL::Registry;
 33 | 
 34 | Bio::EnsEMBL::Registry->load_registry_from_db(
 35 | 	-USER => 'anonymous',
 36 | 	-HOST => 'mysql-eg-publicsql.ebi.ac.uk',
 37 | 	-PORT => '4157',
 38 | 	#-VERBOSE => 1 # uncomment to see dbs loaded
 39 | );
 40 | 
 41 | ## A2) Check which analyses are available for a species
 42 | 
 43 | # Note: logic_names are printed for each analysis
 44 | 
 45 | my $division = 'plants';
 46 | my $species = 'arabidopsis_thaliana';
 47 | 
 48 | my $analysis_adaptor = Bio::EnsEMBL::Registry->
 49 | 	get_adaptor( $species, "core", "analysis" );
 50 | 
 51 | foreach my $analysis (sort 
 52 | 	{$a->logic_name() cmp $b->logic_name()} 
 53 | 		@{ $analysis_adaptor->fetch_all() }){
 54 | 		print $analysis->logic_name(), "\n";
 55 | } 
 56 | 
 57 | # stop here if only test; defined() avoids an uninitialized-value warning when no argument is given
 58 | if(defined($ARGV[0]) && $ARGV[0] eq "test"){
 59 | 	exit(0);
 60 | }
 61 | 
 62 | 
 63 | ## A3) Get soft masked sequences from Arabidopsis thaliana
 64 | 
 65 | my $slice_adaptor = Bio::EnsEMBL::Registry->
 66 |     get_adaptor($species, 'core', 'Slice');
 67 | 
 68 | my ($total,$masked, $softseq) = (0,0);
 69 | foreach my $slice (@{ $slice_adaptor->fetch_all('toplevel') }){
 70 | 
 71 | 	# for brevity consider only the plastome
 72 | 	next if($slice->seq_region_name() ne 'Pt'); 
 73 | 
 74 |     # note Ensembl 1-based inclusive coordinates
 75 | 	printf(">%s %s %d-%d\n",
 76 | 		$slice->seq_region_name(),
 77 | 		$slice->coord_system_name(),
 78 | 		$slice->start(),
 79 | 		$slice->end());
 80 | 	
 81 | 	# By default repeatmask* analyses, see recipe A2 to list others
 82 | 	# Repeat analyses include 'repeatmask_redat', 
 83 | 	# 'repeatmask_nrplants' or 'repeatdetector_curated'
 84 | 	# $slice->get_repeatmasked_seq( ['repeatmask_redat'], 1 )
 85 | 	# only print a 50b segment for brevity
 86 | 	print substr($slice->get_repeatmasked_seq( undef, 1 )->seq(),80,50), "\n";
 87 | }
 88 | 
 89 | ## A4) Get BED file with repeats in chr4
 90 | 
 91 | my $chrname = 'chr4';
 92 | 
 93 | my $slice = $slice_adaptor->
 94 | 	fetch_by_region( 'toplevel', $chrname );
 95 | 
 96 | 	my @repeats = @{ $slice->get_all_RepeatFeatures() };
 97 | 	my $total_repeats = 0;
 98 | 
 99 | foreach my $repeat (@repeats) {
100 | 
101 | 	# for brevity
102 | 	last if($total_repeats++ > 10);
103 | 
104 | 	printf("%s\t%d\t%d\t%s\t%s\t%s\n",
105 | 		$chrname,
106 | 		$repeat->start()-1,
107 | 		$repeat->end(),
108 | 		$repeat->analysis()->logic_name(),
109 | 		$repeat->repeat_consensus()->repeat_class(),
110 | 		$repeat->repeat_consensus()->repeat_type() );
111 | }
112 | 
113 | ## A5) Find the DEAR3 gene
114 | 
115 | # gene of interest and species
116 | my $gene_name = 'DEAR3';
117 | 
118 | # get a gene adaptor to work with genes from
119 | # the species
120 | my $gene_adaptor = Bio::EnsEMBL::Registry->
121 | 	get_adaptor($species, 'core', 'gene');
122 | 
123 | # find the gene with the specified name using
124 | # the adaptor
125 | my ($gene_obj) = @{$gene_adaptor->
126 |    fetch_all_by_external_name($gene_name)};
127 | 
128 | ## A6) Get the transcript used in Compara analyses
129 | 
130 | # The canonical transcript is used in the gene tree analysis,
131 | # which usually is the longest translation with no stop codons
132 | 
133 | printf(">DEAR3 %s\n%s\n",
134 | 	$gene_obj->canonical_transcript()->stable_id(),
135 | 	$gene_obj->canonical_transcript()->spliced_seq() );
136 | 
137 | printf(">DEAR3 %s CDS\n%s\n",
138 |     $gene_obj->canonical_transcript()->stable_id(),
139 | 	$gene_obj->canonical_transcript()->translateable_seq() );
140 | 
141 | printf(">DEAR3 %s\n%s\n\n",
142 |     $gene_obj->canonical_transcript()->translation->stable_id(),
143 |     $gene_obj->canonical_transcript()->translate->seq() );
144 | 
145 | ## A7) Find all orthologues of a gene
146 | 
147 | # get an adaptor to work with genes from compara
148 | my $gene_member_adaptor = Bio::EnsEMBL::Registry->
149 | 	get_adaptor($division, 'compara', 'GeneMember');
150 | 
151 | # find the corresponding gene in compara
152 | my $gene_member = $gene_member_adaptor->
153 | 	fetch_by_stable_id($gene_obj->stable_id());
154 | 
155 | # get an adaptor to work with homologues in compara
156 | my $homology_adaptor = Bio::EnsEMBL::Registry->
157 | 	get_adaptor($division, 'compara', 'Homology');
158 | 
159 | # find all homologues of the gene
160 | my @homologies = @{$homology_adaptor->
161 | 	fetch_all_by_Member($gene_member)};
162 | 
163 | # filter out homologues based on type
164 | @homologies = grep {
165 | 	$_->description =~ m/ortholog/
166 | } @homologies;
167 | 
168 | foreach my $homology (@homologies) {
169 | 
170 | 	# get the protein from the target
171 | 	my $target = $homology->get_all_Members->[1];	
172 | 	
173 | 	printf("%s\t%s\t%s\t%s\n",
174 | 		$gene_obj->stable_id(), 
175 | 		$species, 
176 | 		$target->stable_id(),
177 | 		$target->genome_db->name() );
178 | }
179 | 
180 | ## A8) Get markers mapped on chr1D of bread wheat
181 | 
182 | # Note: only a few plants have markers
183 | # As of release EG47/100:
184 | # triticum_aestivum, oryza_indica, brassica_rapa
185 | #
186 | # Coordinates are returned in BED format
187 | 
188 | $species = 'triticum_aestivum';
189 | $chrname = '1D';
190 | 
191 | $slice_adaptor = Bio::EnsEMBL::Registry->
192 | 	get_adaptor( $species, 'Core', 'Slice' );
193 | 
194 | $slice = $slice_adaptor->
195 | 	fetch_by_region( 'chromosome', $chrname );
196 | 
197 | my $total_markers = 0;
198 | foreach my $mf (@{ $slice->get_all_MarkerFeatures() }) {
199 | 
200 | 	last if($total_markers++ > 10); #for brevity
201 | 
202 | 	my $marker = $mf->marker(); 
203 | 
204 | 	printf("%s\t%d\t%d\t%s\t%s\t%s\t%d\n",
205 | 		$mf->seq_region_name(),
206 | 		$mf->start()-1,      
207 | 		$mf->end(), 
208 | 		$mf->display_id(),
209 | 		$marker->left_primer(),
210 | 		$marker->right_primer(),
211 | 		$marker->max_primer_dist() );
212 | }
213 | 
214 | 
215 | ## A9) Find all syntelogues among rices
216 | 
217 | # Note: GOC=Gene Order Conservation score
218 | # Read more at 
219 | # https://www.ensembl.org/info/genome/compara/Ortholog_qc_manual.html
220 | 
221 | # get an adaptor to work with comparative sets from compara
222 | my $mlss_adaptor = Bio::EnsEMBL::Registry->
223 | 	get_adaptor($division, 'compara', 'MethodLinkSpeciesSet');
224 | 
225 | # find the mlss that describes orthologies between these
226 | # two rice species
227 | my $mlss = $mlss_adaptor->fetch_by_method_link_type_registry_aliases(
228 |     'ENSEMBL_ORTHOLOGUES', ['oryza_sativa', 'oryza_indica']);
229 | 
230 | # find all homologues between these two rice species
231 | @homologies = @{$homology_adaptor->
232 |     fetch_all_by_MethodLinkSpeciesSet($mlss)};
233 | 
234 | # filter out homologues based on local gene order conservation
235 | @homologies = grep {
236 | 	$_->goc_score && $_->goc_score >= 75
237 | } @homologies;
238 | 
239 | my $count = 1;
240 | foreach my $homology (@homologies) {
241 | 
242 | 	# get one orthologue
243 | 	my $prot = $homology->get_all_Members->[1];
244 | 
245 |     # find all orthologues in rice
246 |     my @rice_homologies = @{$homology_adaptor->
247 |         fetch_all_by_Member($prot,
248 |             -METHOD_LINK_TYPE => 'ENSEMBL_ORTHOLOGUES',
249 |             -TARGET_TAXON => 'Oryza')};
250 | 
251 |     if (all {$_->goc_score && $_->goc_score >= 75}
252 |                 @rice_homologies) {
253 |         foreach my $rh (@rice_homologies) {
254 |             printf("%s\t%s\t%s\t%s\t%d\n",
255 |                 $rh->get_all_Members->[0]->genome_db->name,
256 |                 $rh->get_all_Members->[0]->stable_id,
257 |                 $rh->get_all_Members->[1]->genome_db->name,
258 |                 $rh->get_all_Members->[1]->stable_id,
259 |                 $rh->goc_score);
260 |         }
261 |         print "\n";
262 | 
263 |         # Only print the first 10 groups
264 |         last if $count++ == 10;
265 |     }
266 | }
267 | 
268 | ## A10) Print all translations for otherfeatures genes
269 | 
270 | # Note: otherfeatures dbs are Ensembl databases that
271 | # usually contain additional annotation tracks
272 | 
273 | $count = 0;
274 | $species = 'triticum_aestivum';
275 | 
276 | $gene_adaptor = Bio::EnsEMBL::Registry->
277 | 	get_adaptor($species, "otherfeatures", "gene");
278 | my $genes = $gene_adaptor->fetch_all_by_biotype('protein_coding');
279 | 
280 | GENE: for my $gene (@$genes){
281 |     for my $t (@{ $gene->get_all_Transcripts }){
282 |         next if ($t->biotype ne 'protein_coding');
283 | 
284 |         # skip transcripts without a translation instead of dying on undef
285 |         my $translation = $t->translation;
286 |         next if (!defined($translation));
287 | 
288 |         print ">",$gene->stable_id,"\n";
289 |         print $translation->seq, "\n";
290 |         $count++;
291 | 
292 |         # Print only first 10, comment for real use; tested per transcript
293 |         # with >= so genes with several isoforms cannot step over the limit
294 |         last GENE if ($count >= 10);
295 |     }
296 | }
297 | 
298 | 


--------------------------------------------------------------------------------
/recipes/exampleBiomart.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript 
  2 | 
  3 | # Examples of R queries to Ensembl Plants Biomart
  4 | # Your usage of the data returned by the Biomart service is
  5 | # subject to same conditions as laid out on the Ensembl website.
  6 | #
  7 | # See documentation at 
  8 | # https://www.ensembl.org/info/data/biomart/how_to_use_biomart.html
  9 | # https://www.ensembl.org/info/data/biomart/biomart_r_package.html
 10 | #
 11 | # Copyright [2020-21] EMBL-European Bioinformatics Institute
 12 | 
 13 | # To install R package biomaRt run: Rscript install_R_deps.R
 14 | 
 15 | local_lib = "./lib/R/"
 16 | .libPaths( c( .libPaths(), local_lib) )
 17 | 
 18 | library("biomaRt")
 19 | 
 20 | args = commandArgs(trailingOnly=TRUE)
 21 | 
 22 | ## B1) Check plant marts and select dataset
 23 | 
 24 | listMarts( host="plants.ensembl.org" )
 25 | 
 26 | EPgenes = useEnsembl( biomart="plants_mart", 
 27 | 			host="plants.ensembl.org")
 28 | 
 29 | dsets = listDatasets(EPgenes)
 30 | 
 31 | dsets[grep("Triticum aestivum", dsets$description),]
 32 | #              dataset                     description version
 33 | # 69 taestivum_eg_gene Triticum aestivum genes (IWGSC)   IWGSC
 34 | 
 35 | # take a note of the dataset name 'taestivum_eg_gene'
 36 | 
 37 | 
 38 | ## B2) Check available filters and attributes
 39 | 
 40 | EPgenes = useMart( 
 41 | 	biomart="plants_mart",
 42 |   	host="plants.ensembl.org",
 43 | 	dataset="taestivum_eg_gene")
 44 | 
 45 | head( listFilters(EPgenes) )
 46 | 
 47 | head( listAttributes(EPgenes) )
 48 | 
 49 | # stop here if just a test; exit status 0 so that callers do not see a failure
 50 | if(length(args)==1 && args[1]=="test"){
 51 | 	q(save="no", status=0)
 52 | }
 53 | 
 54 | 
 55 | ## B3) Download GO terms associated to genes
 56 | 
 57 | # Note genes might appear in several rows
 58 | 
 59 | go = getBM( 
 60 | 		attributes=c("ensembl_gene_id", "go_id"), 
 61 | 		mart=EPgenes) 
 62 | 
 63 | head(go)
 64 | 
 65 | ## B4) Get Pfam domains annotated in genes
 66 | 
 67 | EPgenes = useMart(  
 68 | 	biomart="plants_mart",
 69 | 	host="plants.ensembl.org",
 70 | 	dataset="hannuus_eg_gene")
 71 | 
 72 | pfam = getBM(
 73 | 		attributes=c("ensembl_gene_id", "pfam"),
 74 | 		mart=EPgenes)
 75 | 
 76 | head(pfam)
 77 | 
 78 | ## B5) Get SNP consequences from a selected variation source
 79 | 
 80 | # Note this requires connecting to a different mart (snp)
 81 | # Note this query takes a few minutes to run
 82 | 
 83 | EPvar = useMart( biomart="plants_variations",
 84 |         	host="plants.ensembl.org", 
 85 | 			dataset="taestivum_eg_snp")
 86 | 
 87 | snp_source = c("EMS-induced mutation")
 88 | 
 89 | chrs = listFilterValues(mart=EPvar,
 90 | 		filter="chr_name")
 91 | 
 92 | attribs = c(
 93 | 	"refsnp_id", 
 94 | 	"refsnp_source",
 95 | 	"ensembl_gene_stable_id",
 96 | 	"consequence_type_tv",
 97 | 	"sift_prediction",
 98 | 	"sift_score")
 99 | 
100 | filts = c( 
101 | 	"variation_source", 
102 | 	"chr_name",
103 |     "sift_prediction")
104 | 
105 | preds = c(
106 | 	"tolerated", # comment if unwanted
107 | 	"deleterious")
108 | 
109 | snps <- NULL
110 | for(chr in chrs){
111 | 	print(chr) # show progress 
112 | 
113 | 	for(pred in preds){
114 | 		print(pred) # show progress
115 | 
116 | 		tmp_s <- getBM(
117 | 			attributes=attribs,
118 | 			filters=filts,
119 | 			values=list(
120 | 				variation_source=snp_source, 
121 | 				chr_name=chr, 
122 | 				sift_prediction=c(pred)),
123 | 			mart=EPvar)
124 | 		
125 | 		# append SNP batches to object snps
126 | 		if(is.null(snps)){
127 | 			snps<-tmp_s
128 | 		}else{
129 | 		    snps<-rbind(snps,tmp_s)
130 | 		}																			
131 | 	}
132 | }
133 | 
134 | head(snps)
135 | 


--------------------------------------------------------------------------------
/recipes/exampleCRAM.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | # Example of Perl client to browse RNA-seq CRAM files from FTP server
  4 | 
  5 | # Sequence reads from RNA-seq studies at the European Nucleotide Archive
  6 | # are regularly mapped to genome assemblies in Ensembl Plants. For each study 
  7 | # CRAM files are created with the https://www.ebi.ac.uk/fg/rnaseq/api pipeline 
  8 | # and published on FTP site ftp://ftp.ensemblgenomes.org/pub/misc_data/Track_Hubs
  9 | #
 10 | # Each study contains a separate folder for each assembly used for mapping. 
 11 | # For instance, study SRP133995 was mapped to tomato assembly SL3.0 and the 
 12 | # trackDb.txt file therein indicates the full path to the relevant CRAM file 
 13 | # next to its metadata. 
 14 | 
 15 | # Copyright [2020-24] EMBL-European Bioinformatics Institute
 16 | 
 17 | use strict;
 18 | use warnings;
 19 | use Net::FTP;
 20 | 
 21 | ## C1) Find RNA-seq CRAM files for a genome assembly
 22 | 
 23 | # Note: assembly name is 'assembly_default' in recipe R2
 24 | # Note: can take a few minutes
 25 | 
 26 | my $FTPURL = 'ftp.ensemblgenomes.org';
 27 | my $FTPDIR = '/pub/misc_data/Track_Hubs';
 28 | 
 29 | my $assembly_name = '';
 30 | if($ARGV[0]){ $assembly_name = $ARGV[0] }
 31 | else{ die "# usage: $0 <assembly name, ie SL3.0>\n" }
 32 | 
 33 | my ($study,$file,$descr,$cramfile,$subgroup);
 34 | # connect to the FTP server and print one TSV line per CRAM file mapped to $assembly_name
 35 | if( my $ftp = Net::FTP->new( $FTPURL, Passive=>1, Debug=>0, Timeout=>60) ){
 36 | 
 37 | 	$ftp->login( "anonymous", '-anonymous@' )|| 
 38 | 		die "# ERROR: cannot login " . $ftp->message();
 39 | 	$ftp->cwd($FTPDIR) || 
 40 | 		die "# ERROR: cannot change working directory to $FTPDIR " .
 41 | 		$ftp->message();
 42 | 
 43 | 	# print header
 44 | 	print "cramfile\tstudy\tassembly\tsubgroup\tdescription\n";
 45 | 
 46 | 	# stop if test only
 47 | 	if($assembly_name eq 'test'){
 48 | 		exit(0);
 49 | 	}
 50 | 
 51 | 	# list all ENA studies
 52 | 	foreach $study ( $ftp->ls() ) {
 53 | 		# skip entries that cannot be entered, a failed cwd followed by cdup would leave $FTPDIR
 54 | 		$ftp->cwd($study) || next;
 55 | 
 56 | 		my @contents = $ftp->ls();
 57 | 
 58 | 		# skip other assemblies; exact match, as names contain regex metacharacters (ie SL3.0)
 59 | 		unless(grep { $_ eq $assembly_name } @contents){
 60 | 			$ftp->cdup();
 61 | 			next;
 62 | 		}
 63 | 
 64 | 		# get description from hub.txt: longLabel text up to ';' or end of line
 65 | 		$descr = 'NA';
 66 | 		$ftp->get("hub.txt");
 67 | 		if(open(HUB,"<","hub.txt")){
 68 | 			while(<HUB>){
 69 | 				if(/^longLabel ([^;\r\n]*)/){
 70 | 					$descr = $1;
 71 | 				}
 72 | 			}
 73 | 			close(HUB);
 74 | 			unlink('hub.txt');
 75 | 		}
 76 | 		else { warn "# WARN: cannot get $study/hub.txt\n" }
 77 | 
 78 | 		# look for CRAM files
 79 | 		foreach $file (@contents) {
 80 | 			if($file eq $assembly_name) {
 81 | 
 82 | 				$ftp->cwd($file) || next; # still at study level if this fails
 83 | 
 84 | 				# get and parse trackDb.txt
 85 | 				$ftp->get("trackDb.txt");
 86 | 				if(open(TRACKDB,"<","trackDb.txt")){
 87 | 					$subgroup = 'NA'; # never undefined nor inherited from a previous file
 88 | 					while(<TRACKDB>){
 89 | 						if(/track/){ $subgroup = 'NA' }
 90 | 						elsif(/subGroups (.*)$/){ $subgroup = $1 }
 91 | 						elsif(/bigDataUrl (\S+\.cram)/){
 92 | 							$cramfile = $1;
 93 | 
 94 | 							# print this CRAM file
 95 | 							print "$cramfile\t$study\t$file\t$subgroup\t$descr\n";
 96 | 						}
 97 | 					}
 98 | 					close(TRACKDB);
 99 | 					unlink('trackDb.txt');
100 | 				}
101 | 				else { warn "# WARN: cannot get $study/$file/trackDb.txt\n" }
102 | 
103 | 				$ftp->cdup();
104 | 			}
105 | 		}
106 | 
107 | 		# up to study level
108 | 		$ftp->cdup();
109 | 	}
110 | 
111 | 	$ftp->close();
112 | }
113 | else { die "# ERROR: cannot connect to $FTPURL: $@\n" }


--------------------------------------------------------------------------------
/recipes/exampleFTP.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Copyright [2020-21] EMBL-European Bioinformatics Institute
  4 | 
  5 | # The recipes below use wget to download files 
  6 | # Please change these env variables to use other tools ie curl
  7 | EXE="wget"
  8 | ARGSDEF=" -c " 
  9 | ARGSTOFILE=" -O "
 10 | ARGSTDOUT=" --quiet $ARGSTOFILE - "
 11 | 
 12 | # set servers & division
 13 | SERVER="ftp://ftp.ensemblgenomes.org/pub"
 14 | DIV=plants
 15 | BIOMARTSERVICE="http://plants.ensembl.org/biomart/martservice"
 16 | 
 17 | # get Ensembl Plants current release number
 18 | SUMFILE="${SERVER}/${DIV}/current/summary.txt"
 19 | RELEASE=$($EXE $ARGSTDOUT $SUMFILE | \
 20 | 	perl -lne 'if(/Release (\d+) of Ensembl/){ print $1 }')
 21 | 
 22 | # work out Ensembl Genomes release
 23 | EGRELEASE=$((RELEASE - 53));
 24 | 
 25 | # alternatively set a different Ensembl Genomes (EG) release
 26 | # EGRELEASE=
 27 | 
 28 | # optional arguments, if any
 29 | OPTARG=$1
 30 | 
 31 | echo "EGRELEASE=${EGRELEASE} OPTARG=${OPTARG}"
 32 | echo
 33 | 
 34 | # set example species
 35 | SPECIES=Brachypodium_distachyon
 36 | 
 37 | ## F1) Download peptide sequences in FASTA format
 38 | 
 39 | FASTAPEP="${SPECIES}*pep.all.fa.gz"
 40 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/pep/${FASTAPEP}"
 41 | echo "# downloading $URL"
 42 | $EXE $OPTARG $ARGSDEF $URL
 43 | 
 44 | # stop here if just a test
 45 | if [[ $# -ge 2 ]] && [[ $2 = "test" ]]; then
 46 | 	exit 0
 47 | fi
 48 | 
 49 | 
 50 | 
 51 | ## F2) Download CDS nucleotide sequences in FASTA format
 52 | 
 53 | FASTACDS="${SPECIES}*cds.all.fa.gz"
 54 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/cds/${FASTACDS}"
 55 | echo "# downloading $URL"
 56 | $EXE $OPTARG $ARGSDEF $URL 
 57 | 
 58 | ## F3) Download transcripts (cDNA) in FASTA format
 59 | 
 60 | FASTACDNA="${SPECIES}*cdna.all.fa.gz"
 61 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/cdna/${FASTACDNA}"
 62 | echo "# downloading $URL"
 63 | $EXE $OPTARG $ARGSDEF $URL
 64 | 
 65 | ## F4) Download soft-masked genomic sequences
 66 | 
 67 | FASTASM="${SPECIES}*.dna_sm.toplevel.fa.gz"
 68 | URL="${SERVER}/release-${EGRELEASE}/${DIV}/fasta/${SPECIES,,}/dna/${FASTASM}"
 69 | echo "# downloading $URL"
 70 | $EXE $OPTARG $ARGSDEF $URL
 71 | 
 72 | ## F5) Upstream/downstream sequences
 73 | 
 74 | # Note: this is actually a precompiled BioMart query.
 75 | # You can construct your queries at http://plants.ensembl.org/biomart/martview
 76 | # and export them as XML
 77 | 
 78 | MARTSPECIES=bdistachyon_eg_gene
 79 | BIOMARTQUERY=$(cat <<-XMLQUERY
 80 | <?xml version="1.0" encoding="UTF-8"?>
 81 | <!DOCTYPE Query>
 82 | <Query  virtualSchemaName = "plants_mart" formatter = "FASTA" header = "0" uniqueRows = "0" count = "0" datasetConfigVersion = "0.6" >
 83 | 	<Dataset name = "$MARTSPECIES" interface = "default" >
 84 | 		<Filter name = "chromosome_name" value = "5"/>        
 85 | 		<Filter name = "upstream_flank" value = "100"/>
 86 | 		<Attribute name = "ensembl_gene_id" />
 87 | 		<Attribute name = "5utr" />
 88 | 	</Dataset>
 89 | </Query>
 90 | XMLQUERY
 91 | )
 92 | 
 93 | FASTAUP="${SPECIES}.upstream_flank100.chr5.fa"
 94 | URL="${BIOMARTSERVICE}?query=$BIOMARTQUERY"
 95 | echo "# downloading $FASTAUP"
 96 | if [[ $OPTARG == "--spider" ]]; then
 97 | 	echo "# skip this recipe in test"
 98 | 	echo
 99 | else
100 | 	$EXE $OPTARG $ARGSDEF "$URL" $ARGSTOFILE $FASTAUP
101 | fi
102 | 
103 | ## F6) Get mappings to UniProt proteins
104 | 
105 | UNIPTSV="${SPECIES}*.uniprot.tsv.gz"
106 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/tsv/${SPECIES,,}/$UNIPTSV"
107 | echo "# downloading $URL"
108 | $EXE $OPTARG $ARGSDEF $URL
109 | 
110 | ## F7) Get indexed, bgzipped VCF file with variants mapped
111 | 
112 | # Note: this file contains all variants known to Ensembl Plants,
113 | # individual genotypes are not necessarily conserved
114 | 
115 | VCF="${SPECIES,,}.vcf.gz*"
116 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/variation/vcf/${SPECIES,,}/${VCF}"
117 | echo "# downloading $URL"
118 | $EXE $OPTARG $ARGSDEF $URL
119 | 
120 | # wheat is an exception, as you can tell from the VCF file which EMS lines
121 | # share a certain mutation, as in this excerpt:
122 | #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
123 | #1A      238016  Cadenza0202.chr1A.238016        G       A       .       .       EMS-induced mutation;TSA=SNV
124 | #1A      238016  Cadenza0230.chr1A.238016        G       A       .       .       EMS-induced mutation;TSA=SNV
125 | #1A      238016  Cadenza1874.chr1A.238016        G       A       .       .       EMS-induced mutation;TSA=SNV
126 | #1A      406098  Cadenza0148.chr1A.406098        T       C       .       .       EMS-induced mutation;TSA=SNV
127 | #1A      406098  Cadenza0877.chr1A.406098        T       C       .       .       EMS-induced mutation;TSA=SNV
128 | #1A      406098  Cadenza1340.chr1A.406098        T       C       .       .       EMS-induced mutation;TSA=SNV
129 | 
130 | ## F8) Get precomputed VEP cache files
131 | 
132 | SPECIES=arabidopsis_thaliana
133 | VEPCACHE="${SPECIES,,}*.tar.gz*"
134 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/variation/vep/${VEPCACHE}"
135 | echo "# downloading $URL"
136 | $EXE $OPTARG $ARGSDEF $URL
137 | 
138 | # Note: you can get indexed cached files instead from 
139 | # URL=${SERVER}/${DIV}/release-${EGRELEASE}/variation/indexed_vep_cache/${VEPCACHE}
140 | 
141 | ## F9) Download all homologies in a single TSV file, several GBs
142 | 
143 | TSVFILE="Compara.${RELEASE}.protein_default.homologies.tsv.gz"
144 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/tsv/ensembl-compara/homologies/${TSVFILE}"
145 | echo "# downloading $URL"
146 | $EXE $OPTARG $ARGSDEF $URL
147 | 
148 | # Note: you can extract homologies from this file by parsing it
149 | # in the command line. Example:
150 | # zcat $TSVFILE | grep triticum_aestivum | grep oryza_sativa | grep ortholog 
151 | 
152 | # Note: homologies of each species can be retrieved from a more specific file
153 | # SPECIES=Triticum_aestivum
154 | #URL="${SERVER}/${DIV}/release-${EGRELEASE}/tsv/ensembl-compara/homologies/${SPECIES,,}${TSVFILE}"
155 | #wget -c "$URL"
156 | #zcat "$TSVFILE" | grep oryza_sativa | grep ortholog
157 | 
158 | # Note: Alternatively a smaller file in OrthoXML format can be obtained
159 | # OXMLFILE="Compara.${RELEASE}.protein_default.allhomologies.orthoxml.xml.gz"
160 | # URL="${SERVER}/${DIV}/release-${EGRELEASE}/xml/ensembl-compara/homologies/${OXMLFILE}"
161 | 
162 | ## F10) Download UniProt report of Ensembl Plants,
163 | # which summarizes how many protein sequences from each species
164 | # have been annotated in SwissProt & TrEMBL
165 | 
166 | UNIPFILE="uniprot_report_EnsemblPlants.txt"
167 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/$UNIPFILE"
168 | echo "# downloading $URL"
169 | $EXE $OPTARG $ARGSDEF $URL
170 | 
171 | ## F11) Retrieve list of new species in current release
172 | 
173 | NEWLIST="new_genomes.txt"
174 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/$NEWLIST"
175 | echo "# downloading $URL"
176 | $EXE $OPTARG $ARGSDEF $URL
177 | 
178 | ## F12) Get current plant species tree (cladogram)
179 | 
180 | TREEFILE="plants_protein-trees_default.nh"
181 | URL="${SERVER}/${DIV}/release-${EGRELEASE}/compara/species_trees/$TREEFILE"
182 | echo "# downloading $URL"
183 | $EXE $OPTARG $ARGSDEF $URL
184 | 
185 | 


--------------------------------------------------------------------------------
/recipes/exampleMySQL.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Your usage of the data returned by the public MySQL server is
  4 | # subject to same conditions as laid out on the Ensembl website.
  5 | #
  6 | # Copyright [2020-21] EMBL-European Bioinformatics Institute
  7 | 
  8 | # documentation about Ensembl schemas can be found at 
  9 | # http://www.ensembl.org/info/docs/api/index.html
 10 | 
 11 | # set server details
 12 | SERVER=mysql-eg-publicsql.ebi.ac.uk
 13 | USER=anonymous
 14 | PORT=4157
 15 | 
 16 | # get Ensembl Plants current release number from FTP server
 17 | # Note: wget is used, this can be modified to use alternatives ie curl
 18 | FTPSERVER="ftp://ftp.ensemblgenomes.org/pub"
 19 | DIV=plants
 20 | SUMFILE="${FTPSERVER}/${DIV}/current/summary.txt"
 21 | RELEASE=$(wget --quiet -O - $SUMFILE | \
 22 | 	perl -lne 'if(/Release (\d+) of Ensembl/){ print $1 }')
 23 | 
 24 | # work out Ensembl Genomes release
 25 | EGRELEASE=$((RELEASE - 53));
 26 | 
 27 | # alternatively set other EG release number
 28 | # EGRELEASE=
 29 | 
 30 | echo "EGRELEASE=${EGRELEASE}"
 31 | echo
 32 | 
 33 | # stop here if just a test (first argument is "test"): only list the databases
 34 | if [[ $# -ge 1 ]] && [[ $1 = "test" ]]; then
 35 | 	mysql --host "$SERVER" --user "$USER" --port "$PORT" -e "show databases"
 36 | 	exit 0
 37 | fi
 38 | 
 39 | ## S1) Check currently supported Ensembl Genomes (EG)/non-vertebrates core schemas,
 40 | 
 41 | # Note: includes non-plants as well
 42 | 
 43 | mysql --host $SERVER --user $USER --port $PORT \
 44 | 	-e "show databases" | grep "core_${EGRELEASE}_${RELEASE}"
 45 | 
 46 | # The following API script can also be used:
 47 | # https://github.com/Ensembl/ensembl-metadata/blob/master/misc_scripts/get_list_databases_for_division.pl
 48 | 
 49 | ## S2) Count protein-coding genes of a particular species
 50 | 
 51 | SPECIES=arabidopsis_thaliana
 52 | SPECIESCORE=$(mysql --host $SERVER --user $USER --port $PORT \
 53 | 	-e "show databases" | grep "${SPECIES}_core_${EGRELEASE}_${RELEASE}")
 54 | 
 55 | mysql --host $SERVER --user $USER --port $PORT \
 56 | 	$SPECIESCORE -e "SELECT COUNT(*) FROM gene WHERE biotype='protein_coding'"
 57 | 
 58 | ## S3) Get stable_ids of transcripts used in Compara analyses 
 59 | 
 60 | # Canonical transcripts are used in the gene tree analysis,
 61 | # which usually are the longest translations with no stop codons.
 62 | # This list can be combined with the file obtained in recipe F3 to
 63 | # obtain the sequences
 64 | 
 65 | mysql --host $SERVER --user $USER --port $PORT \
 66 | 	"ensembl_compara_plants_${EGRELEASE}_${RELEASE}" \
 67 |     -e "SELECT sm.stable_id \
 68 | 		FROM seq_member sm, gene_member gm, genome_db gdb \
 69 | 		WHERE sm.seq_member_id = gm.canonical_member_id \
 70 | 		AND sm.genome_db_id = gdb.genome_db_id \
 71 | 		AND gdb.name = '$SPECIES' \
 72 | 		LIMIT 10"
 73 | 
 74 | ## S4) Get variants significantly associated to phenotypes
 75 | 
 76 | # Variation schema documented at 
 77 | # http://www.ensembl.org/info/docs/api/variation/variation_schema.html
 78 | # Note: phenotype_feature is joined to seq_region by seq_region_id, as in recipe S7
 79 | SPECIESVAR=$(mysql --host $SERVER --user $USER --port $PORT \
 80 | 	-e "show databases" | grep "${SPECIES}_variation_${EGRELEASE}_${RELEASE}")
 81 | 
 82 | mysql --host $SERVER --user $USER --port $PORT \
 83 | 	$SPECIESVAR <<SQL
 84 | SELECT f.object_id, s.name, f.seq_region_start,
 85 | 	f.seq_region_end, p.description
 86 | FROM phenotype p
 87 | 	JOIN phenotype_feature f ON p.phenotype_id = f.phenotype_id
 88 | 	JOIN seq_region s ON f.seq_region_id = s.seq_region_id
 89 | WHERE f.type = 'Variation' AND f.is_significant = 1 LIMIT 10
 90 | SQL
 91 | 
 92 | 
 93 | ## S5) Get Triticum aestivum homeologous genes across A,B & D subgenomes
 94 | 
 95 | # Compara schema is described at 
 96 | # https://m.ensembl.org/info/docs/api/compara/compara_schema.html
 97 | 
 98 | # find out the correct method_link_species_set_id in this release
 99 | MLSSID=$(mysql --host $SERVER --user $USER --port $PORT \
100 | 	ensembl_compara_plants_${EGRELEASE}_$RELEASE -Nb -e \
101 | 	"SELECT method_link_species_set_id \
102 | 	FROM method_link_species_set \
103 | 	WHERE name LIKE 'T%aes%homoeologues'")
104 | 
105 | # actually retrieve the homeologues using the MLSSID retrieved above
106 | # remove LIMIT 10 if you want the complete set
107 | mysql --host $SERVER --user $USER --port $PORT \
108 |     ensembl_compara_plants_${EGRELEASE}_$RELEASE <<SQL
109 | SELECT 
110 | 	homology_member.homology_id, cigar_line, perc_cov, perc_id, 
111 | 	perc_pos, gene_member.stable_id as genes, gene_member.genome_db_id
112 | FROM
113 | 	homology_member 
114 | 	INNER JOIN homology USING (homology_id)
115 | 	INNER JOIN gene_member USING (gene_member_id)
116 | WHERE method_link_species_set_id = ${MLSSID}
117 | LIMIT 10
118 | SQL
119 | 
120 | ## S6) Count the number of whole-genome alignments of all genomes
121 | 
122 | # Compara schema is described at 
123 | # https://m.ensembl.org/info/docs/api/compara/compara_schema.html
124 | 
125 | mysql --host $SERVER --user $USER --port $PORT \
126 |     ensembl_compara_plants_${EGRELEASE}_$RELEASE <<SQL
127 | SELECT
128 |     genome_db.name,
129 |     SUM(type = "LASTZ_NET") AS n_lastz,
130 |     SUM(type = "SYNTENY") AS n_syntenies,
131 |     SUM(type IN ("EPO", "EPO_LOW_COVERAGE")) AS n_multiple
132 | FROM
133 |     genome_db
134 |     JOIN species_set USING (genome_db_id)
135 |     JOIN method_link_species_set USING (species_set_id)
136 |     JOIN method_link USING (method_link_id)
137 | WHERE
138 |     genome_component IS NULL
139 |     AND genome_db.name != "ancestral_sequences"
140 | GROUP BY genome_db_id
141 | ORDER BY genome_db.name
142 | SQL
143 | 
144 | ## S7) Extract all the mutations and consequences for a selected wheat line
145 | 
146 | # The variation name contains the name of the line where the mutation is present. 
147 | # Variation schema documented at 
148 | # http://www.ensembl.org/info/docs/api/variation/variation_schema.html
149 | 
150 | SPECIES=triticum_aestivum
151 | LINE="Cadenza1441"
152 | SPECIESVAR=$(mysql --host $SERVER --user $USER --port $PORT \
153 | 	-e "show databases" | grep "${SPECIES}_variation_${EGRELEASE}_${RELEASE}")
154 | mysql --host $SERVER --user $USER --port $PORT \
155 |     $SPECIESVAR <<SQL
156 | SELECT 
157 |     seq_region.name as CHR, 
158 |     variation_feature.seq_region_start as POS, 
159 |     variation_feature.allele_string,  
160 |     variation.name as ID, 
161 |     transcript_variation.feature_stable_id, 
162 |     transcript_variation.consequence_types, 
163 |     transcript_variation.sift_prediction, 
164 |     transcript_variation.sift_score,
165 |     variation_set.name as variation_set
166 | FROM variation 
167 | JOIN variation_set_variation
168 |     ON variation.variation_id =variation_set_variation.variation_id
169 | JOIN variation_set 
170 |     ON variation_set.variation_set_id = variation_set_variation.variation_set_id 
171 | JOIN variation_feature 
172 |     ON variation_feature.variation_id = variation.variation_id
173 | JOIN transcript_variation 
174 |     ON transcript_variation.variation_feature_id = variation_feature.variation_feature_id
175 | JOIN seq_region
176 |     ON variation_feature.seq_region_id = seq_region.seq_region_id
177 | WHERE 
178 |     variation.name LIKE "${LINE}%" 
179 | SQL
180 | 
181 | ## S8) Get FASTA of repeated sequences from selected species
182 | 
183 | # This recipe first interrogates the MySQL server (BED file) and 
184 | # then the FTP server (genome FASTA file) and finally produces a 
185 | # FASTA file with repeated sequences. Uses MINLEN to skip short repeats
186 | # and skips repeats starting on the same coordinate.
187 | # Note: requires wget, sort, gzip and bedtools
188 | 
189 | MINLEN=90
190 | SPECIES=oryza_sativa
191 | SPECIESCORE=$(mysql --host $SERVER --user $USER --port $PORT \
192 | 	-e "show databases" | grep "${SPECIES}_core_${EGRELEASE}_${RELEASE}") 
193 | BEDFILE=${SPECIES}.repeats.bed
194 | FASTANAME="*${SPECIES^}*.dna.toplevel.fa.gz"
195 | FASTA=${SPECIES}.toplevel.fasta
196 | REPFASTA=${SPECIES}.repeats.fasta
197 | 
198 | mysql --host $SERVER --user $USER --port $PORT $SPECIESCORE -Nb -e \
199 | 	"SELECT sr.name,(r.seq_region_start-1),r.seq_region_end,rc.repeat_class \
200 | 	FROM repeat_feature r JOIN seq_region sr JOIN repeat_consensus rc \
201 | 	WHERE r.seq_region_id=sr.seq_region_id \
202 | 	AND r.repeat_consensus_id=rc.repeat_consensus_id \
203 | 	AND (r.seq_region_end-r.seq_region_start+1) > ${MINLEN}" | \
204 | 	sort -u -k1,1 -k2,2n > $BEDFILE
205 | 
206 | # similar to recipe F4
207 | URL="${FTPSERVER}/${DIV}/current/fasta/${SPECIES}/dna/${FASTANAME}"
208 | wget -c $URL -O- | gunzip > $FASTA
209 | bedtools getfasta -name -fi $FASTA -bed $BEDFILE > $REPFASTA
210 | 
211 | ## S9) Get GFF of repeated sequences from selected species
212 | 
213 | # This recipe interrogates the MySQL server and prints repeats as tab-separated
214 | # GFF-like lines. Uses MINLEN to skip short repeats; MINLEN and SPECIESCORE are set in S8.
215 | # Note: requires sort and perl
216 | 
217 | mysql --host $SERVER --user $USER --port $PORT $SPECIESCORE -Nb -e \
218 | 	"SELECT sr.name,rc.repeat_class,'Repeat',r.seq_region_start, \
219 | 	r.seq_region_end,r.score,r.seq_region_strand,0,rc.repeat_name \
220 | 	FROM repeat_feature r JOIN seq_region sr JOIN repeat_consensus rc \
221 | 	WHERE r.seq_region_id=sr.seq_region_id \
222 | 	AND r.repeat_consensus_id=rc.repeat_consensus_id \
223 | 	AND (r.seq_region_end-r.seq_region_start+1) > ${MINLEN}" | sort -k1,1 -k4,4n | \
224 | 	perl -lane 'if($F[5] eq "NULL"){ $F[5]="."}; if($F[6]==1){ $F[6]="+" } else {$F[6]="-" }; $F[8]="Note=\"$F[8]\";"; print join("\t",@F)'
225 | 
226 | :
227 | 


--------------------------------------------------------------------------------
/recipes/exampleVEP.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Copyright [2020-21] EMBL-European Bioinformatics Institute
  4 | 
  5 | # documentation about Ensembl VEP can be found at 
  6 | # http://www.ensembl.org/info/docs/tools/vep/index.html
  7 | 
  8 | # set Ensembl Plants release number, check it
  9 | # at the bottom of http://plants.ensembl.org
 10 | # EG stands for Ensembl Genomes
 11 | 
 12 | # example call: VEPATH=/path/to/ensembl-vep EGRELEASE=50 ./exampleVEP.sh
 13 | 
 14 | if [ -z "${EGRELEASE}" ]; then
 15 | 	EGRELEASE=49
 16 | fi
 17 | 
 18 | # edit if needed to point to the folder that contains ensembl-vep
 19 | if [ -z "${VEPATH}" ]; then
 20 | 	VEPATH=./
 21 | fi
 22 | 
 23 | # check that the vep executable can be found under VEPATH
 24 | if [ ! -f "${VEPATH}/ensembl-vep/vep" ]; then
 25 | 	echo "# ERROR: Cannot find ${VEPATH}/ensembl-vep/vep, please set VEPATH accordingly"
 26 | 	exit 1
 27 | fi
 28 | 
 29 | 
 30 | # work out Ensembl release (EG release + 53), do not change
 31 | RELEASE=$((EGRELEASE + 53));
 32 | 
 33 | echo "EGRELEASE=${EGRELEASE}"
 34 | echo
 35 | 
 36 | ## V1) Download, install and update VEP
 37 | 
 38 | # Fresh install
 39 | #git clone https://github.com/Ensembl/ensembl-vep.git
 40 | #cd ensembl-vep
 41 | #perl INSTALL.pl
 42 | 
 43 | # To update from a previous version:
 44 | #cd ensembl-vep
 45 | #git pull
 46 | #git checkout release/$RELEASE
 47 | #perl INSTALL.pl
 48 | 
 49 | ## V2) Unpack downloaded cache file & check SIFT support 
 50 | 
 51 | # Note: cache downloaded in recipe F8
 52 | # Note: look for "sift  b"
 53 | 
 54 | SPECIES=arabidopsis_thaliana
 55 | VEPCACHE="${SPECIES}*.tar.gz*"
 56 | caches=( $VEPCACHE ) # expand the glob once, as several files might match
 57 | if [ ! -f "${caches[0]}" ]; then
 58 | 	echo "# ERROR: Cache file ${VEPCACHE} not found, get it with recipe F8"
 59 | 	exit 1
 60 | else
 61 | 	tar xfz "${caches[0]}" # unpack only the first matching cache file
 62 | 	pattern="${SPECIES}/${EGRELEASE}_*/info.txt"
 63 | 	files=( $pattern )
 64 | 	INFOFILE="${files[0]}"
 65 | 	if [ -f "${INFOFILE}" ]; then
 66 | 		grep sift "${INFOFILE}"
 67 | 		echo "${INFOFILE}"
 68 | 	else
 69 | 		echo "# ERROR: Cannot find file ${INFOFILE}, please correct/set variable EGRELEASE"
 70 | 		exit 1
 71 | 	fi
 72 | fi
 73 | 
 74 | ## V3) Predict effect of variants 
 75 | 
 76 | # See more options and examples at 
 77 | # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html
 78 | # http://www.ensembl.org/info/docs/tools/vep/script/vep_example.html 
 79 | 
 80 | VCFILE="${VEPATH}/ensembl-vep/examples/arabidopsis_thaliana.TAIR10.vcf"
 81 | OUTFILE='arabidopsis_thaliana.vep.output'
 82 | # options are kept in an array so that each one can carry its own comment
 83 | VEPOPTIONS=(
 84 | 	--genomes              # Ensembl Genomes, for Plants
 85 | 	--species "$SPECIES"
 86 | 	--cache                # use local cache file, opposed to --database
 87 | 	--dir_cache ./         # location of unpacked cache $SPECIES folder
 88 | 	--cache_version "$EGRELEASE"
 89 | 	--check_existing       # co-located known variants
 90 | 	--distance 5000        # max dist between variant and transcript
 91 | 	--biotype              # show biotype of neighbor transcript
 92 | 	--input_file "$VCFILE"
 93 | 	--output_file "$OUTFILE"
 94 | )
 95 | 
 96 | #   --sift b               # only some species have SIFT precomputed
 97 | # paths are quoted so that a VEPATH containing spaces is not split into words
 98 | "${VEPATH}/ensembl-vep/vep" "${VEPOPTIONS[@]}"
 99 | 
100 | ## V4) Predict effect of variants for species not in Ensembl
101 | 
102 | # GFF file must be sorted and indexed with BGZIP and TABIX, see 
103 | # http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#gff
104 | # Note: nothing is run until GFFILE and FASTAGZFILE are set; VCFILE comes from V3
105 | FASTAGZFILE=      # GZIP-compressed file of genome FASTA file
106 | GFFILE=           # gene models matching sequences in FASTAGZFILE
107 | GZGFFILE="${GFFILE}.sorted.gz"
108 | 
109 | if [[ -f "$GFFILE" && -f "$FASTAGZFILE" ]]; then
110 | 	# sort and index
111 | 	grep -v "#" "$GFFILE" | sort -k1,1 -k4,4n -k5,5n -t$'\t' | bgzip -c > "$GZGFFILE"
112 | 	tabix -p gff "$GZGFFILE"
113 | 	
114 | 	# actually call vep
115 | 	"${VEPATH}/ensembl-vep/vep" -i "$VCFILE" -gff "$GZGFFILE" -fasta "$FASTAGZFILE"
116 | fi
117 | 


--------------------------------------------------------------------------------
/repeats/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | These scripts can be used to:
  3 | + i) mask repeated sequences in plant genomes with the 
  4 | [Repeat detector](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0654-5) (Red)
  5 | + ii) annotate repeats with https://github.com/lh3/minimap2 and  
  6 | 	* the curated library of repeats 
  7 | [nrTEplants](https://github.com/Ensembl/plant-scripts/releases/download/v0.3/nrTEplantsJune2020.fna.bz2),
  8 | described in full detail [here](https://github.com/Ensembl/plant_tools/tree/master/bench/repeat_libs)
  9 | 	* annotated repeats from species in Ensembl Plants, obtained with *get_repeats_ensembl.sh*
 10 | 
 11 | Optionally, repeats and annotated repeats can be loaded into an [Ensembl core db](https://www.ensembl.org/info/docs/api/core/core_schema.html) 
 12 | as new analyses, whose default logic names are 'repeatdetector' and 'repeatdetector_annotated'. 
 13 | 
 14 | ## Dependencies
 15 | 
 16 | The following dependencies can be installed in the parent folder with:
 17 | 
 18 |     make install install_repeats
 19 | 
 20 | There are two required binaries, for which version 10 of the GNU C++ compiler is needed (actually any g++ >= 8 should work, please edit the [Makefile](../Makefile) accordingly):
 21 | 
 22 | * A clone of Red from https://github.com/EnsemblGenomes/Red (the original repo is [here](https://github.com/BioinformaticsToolsmith/Red))
 23 | * A clone of minimap2 from https://github.com/lh3/minimap2
 24 | 
 25 | Plus:
 26 | 
 27 | * A copy of the [nrTEplants library](https://github.com/Ensembl/plant-scripts/releases/download/v0.3/nrTEplantsJune2020.fna.bz2)
 28 | 
 29 | And three Python3 modules:
 30 | 
 31 | * [sqlalchemy](https://pypi.org/project/SQLAlchemy)
 32 | * [sqlalchemy_utils](https://pypi.org/project/SQLAlchemy-Utils)
 33 | * [pymysql](https://pypi.org/project/PyMySQL)
 34 | 
 35 | 
 36 | 
 37 | Note that script [get_repeats_ensembl.sh](./get_repeats_ensembl.sh) has some more dependencies listed at its header.
 38 | 
 39 | ## Argument lists
 40 | 
 41 | If you run 
 42 | 
 43 |     ./Red2Ensembl.py -h
 44 | 
 45 | you'll get the list of supported arguments and what they're for:
 46 | 
 47 | ```
 48 | ./Red2Ensembl.py -h
 49 | usage: Red2Ensembl.py [-h] [--exe EXE] [--cor COR] [--msk_file MSK_FILE]
 50 |                       [--bed_file BED_FILE] [--host HOST] [--user USER]
 51 |                       [--pw PW] [--port PORT] [--db DB]
 52 |                       [--logic_name LOGIC_NAME] [--description DESCRIPTION]
 53 |                       [--displaylabel DISPLAYLABEL]
 54 |                       fasta_file outdir
 55 | 
 56 | Script to run RepeatDetector (a fork of Red v2) to mask repeats,
 57 | and optionally feed results into an Ensembl core database.
 58 | 
 59 | positional arguments:
 60 |   fasta_file            path to FASTA file with top-level genomic sequences
 61 |   outdir                path to directory to store Red temp results
 62 | 
 63 | optional arguments:
 64 |   -h, --help            show this help message and exit
 65 |   --exe EXE             path to Red executable, default: ./../lib/Red/bin/Red
 66 |   --cor COR             number of cores for Red, default: 1
 67 |   --msk_file MSK_FILE   name of output FASTA file with soft-masked sequences
 68 |   --bed_file BED_FILE   name of output BED file with repeated ranges, uses
 69 |                         original sequence names
 70 |   --host HOST           name of the database host, required to store repeats
 71 |                         in Ensembl core
 72 |   --user USER           host user, required to store repeats in Ensembl core
 73 |   --pw PW               host password, required to store repeats in Ensembl
 74 |                         core
 75 |   --port PORT           host port, required to store repeats in Ensembl core
 76 |   --db DB               name of the core database, required to store repeats
 77 |                         in Ensembl core
 78 |   --logic_name LOGIC_NAME
 79 |                         logic name of Ensembl analysis, default:
 80 |                         repeatdetector
 81 |   --description DESCRIPTION
 82 |                         quoted string with Ensembl analysis description,
 83 |                         default: Repeats detected using <a href="https://bmcbi
 84 |                         oinformatics.biomedcentral.com/articles/10.1186/s12859
 85 |                         -015-0654-5">Red (REPeatDetector)</a>
 86 |   --displaylabel DISPLAYLABEL
 87 |                         string with Ensembl analysis display label, default:
 88 |                         Repeats:Red
 89 | 
 90 | Citation:
 91 | Contreras-Moreira et al (2021) https://doi.org/10.1002/tpg2.20143
 92 | Girgis HZ (2015) BMC Bioinformatics 16:227. doi: 10.1186/s12859-015-0654-5
 93 | ```
 94 | 
 95 | Similarly, if you run
 96 | 
 97 |     ./AnnotRedRepeats.py -h
 98 | 
 99 | you'll get:
100 | 
101 | ```
102 | usage: AnnotRedRepeats.py [-h] [--exe EXE] [--cor COR] [--minlen MINLEN]
103 |                           [--bed_file BED_FILE] [--host HOST] [--user USER]
104 |                           [--pw PW] [--port PORT] [--db DB]
105 |                           [--logic_name LOGIC_NAME]
106 |                           [--description DESCRIPTION]
107 |                           [--displaylabel DISPLAYLABEL]
108 |                           repeat_fasta_file outdir
109 | 
110 | Script to annotate Red repeats and optionally
111 | feed the new consensus_repeats into an Ensembl core database.
112 | 
113 | positional arguments:
114 |   repeat_fasta_file     path to FASTA file with repeat sequences in RepBase
115 |                         format
116 |   outdir                path to directory with stored Red results
117 | 
118 | optional arguments:
119 |   -h, --help            show this help message and exit
120 |   --exe EXE             path to minimap2 executable, default:
121 |                         ./../lib/minimap2/minimap2
122 |   --cor COR             number of cores for minimap2, default: 1
123 |   --minlen MINLEN       min length of repeats to be annotated, default: 90bp
124 |   --bed_file BED_FILE   name of output BED file with annotated repeats
125 |   --host HOST           name of the database host
126 |   --user USER           host user
127 |   --pw PW               host password
128 |   --port PORT           host port
129 |   --db DB               name of the core database
130 |   --logic_name LOGIC_NAME
131 |                         logic name of Ensembl analysis, default:
132 |                         repeatdetector_annotated
133 |   --description DESCRIPTION
134 |                         quoted string with Ensembl analysis description,
135 |                         default: Repeats detected using <a href="https://bmcbi
136 |                         oinformatics.biomedcentral.com/articles/10.1186/s12859
137 |                         -015-0654-5">Red (REPeatDetector)</a> and annotated by
138 |                         alignment to a repeat library.
139 |   --displaylabel DISPLAYLABEL
140 |                         string with Ensembl analysis display label, default:
141 |                         'Repeats:Red (annotated)'
142 | 
143 | Citation:
144 | Contreras-Moreira et al (2021) https://doi.org/10.1002/tpg2.20143
145 | Girgis HZ (2015) BMC Bioinformatics 16:227. doi: 10.1186/s12859-015-0654-5
146 | Li H (2018) Bioinformatics 34(18):3094–3100. doi: 10.1093/bioinformatics/bty191
147 | ```
148 | 
149 | ## Examples
150 | 
151 | Note that the input FASTA file can be GZIP/BZIP2 compressed.
152 | The script *Red2Ensembl.py* will attempt to estimate the GB RAM needed for the input genome.
153 | 
154 | ### i) Masking
155 | 
156 | ```
157 | ## test run, saves results in folder 'test_Atha_chr4' 
158 | ./Red2Ensembl.py ../files/Arabidopsis_thaliana.fna.gz test_Atha_chr4 --msk_file Atha.sm.fna --bed_file Atha.bed
159 | 
160 | # parsing FASTA file
161 | # genome length = 18585056 bp
162 | ...
163 | 
164 | 
165 | ## real example, with several chromosomes, taking 4 CPU cores 
166 | ./Red2Ensembl.py Brachypodium_distachyon_v3.0.dna.toplevel.fa Brachypodium_distachyon --cor 4 
167 | 
168 | ## local run & loading repeats in core Ensembl db (will re-use previous Red results)
169 | ./Red2Ensembl.py Brachypodium_distachyon_v3.0.dna.toplevel.fa Brachypodium_distachyon \
170 | 	--host pl1 --user xyz --pw XYZ --port 123 --db brachypodium_distachyon_core_49_102
171 | ```
172 | 
173 | ### ii) Annotating masked repeated sequences
174 | 
175 | The repeats called by Red can be optionally annotated by similarity to sequences in an external FASTA file, 
176 | such as the library **nrTEplants**. The script does not load the resulting annotations in a core db just yet:
177 | ```
178 | ## test run, re-uses folder 'test_Atha_chr4'
179 | ./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna test_Atha_chr4 --bed_file test.nrTEplants.bed
180 | 
181 | ## consider only repeats with length >= 200 bp
182 | ./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna Brachypodium_distachyon --cor 4 \
183 | 	--minlen 200
184 | 
185 | ## add annotated repeats to Ensembl core db and use a different minimap2 binary
186 | ./AnnotRedRepeats.py ../files/nrTEplantsJune2020.fna Brachypodium_distachyon --exe /path/to/minimap2 --cor 4 \
187 |     --host pl1 --user xyz --pw XYZ \
188 |     --port 123 --db brachypodium_distachyon_core_49_102
189 | ```
190 | 
191 | Note that any FASTA file can be used to annotate the repeats. For instance, repeats annotated
192 | in current species in Ensembl can be retrieved and used as well:
193 | ```
194 | ./get_repeats_ensembl.sh arabidopsis_thaliana
195 | 
196 | # This will produce file: arabidopsis_thaliana.repeats.nondeg.fasta
197 | 
198 | # Note this file can be highly redundant; redundancy can be eliminated with linclust,
199 | # see https://github.com/soedinglab/MMseqs2
200 | 
201 | ./AnnotRedRepeats.py arabidopsis_thaliana.repeats.nondeg.fasta test_Atha_chr4 --bed_file test.ensembl.bed
202 | ```
203 | 
204 | ## Annotation summary 
205 | 
206 | If a library such as nrTEplants or any other RepBase-formatted file is used, 
207 | an annotation report is produced. These are valid examples of FASTA headers:
208 | 
209 |     >TEdenovo-B-R2315-Map11:repetDB.Mar2020#TIR @Brassica_rapa [S:]
210 | 	>AT1TE94285:TAIR10_TE#DNA/MuDR @Arabidopsis_thaliana [S:]
211 | 
212 | The repeat classification is then parsed to produce a report like this:
213 | 
214 | ```
215 | # Genome length: 18585056 Repeated content: 6837303 36.8% Annotated: 2748796 14.8%
216 | 
217 | class	bp
218 | DIRS	1212
219 | DNA	32110
220 | DNA/En-Spm	50044
221 | DNA/HAT	33911
222 | DNA/Harbinger	13879
223 | DNA/Mariner	3935
224 | DNA/MuDR	283157
225 | DNA/Pogo	21954
226 | DNA/Tc1	3467
227 | Helitron	20670
228 | LARD	83725
229 | LINE	2384
230 | LINE/L1	9898
231 | LINE?	1235
232 | LTR	113511
233 | LTR/Copia	88739
234 | LTR/Gypsy	900679
235 | MITE	2502
236 | Other	42920
237 | Other/Simple	1596
238 | RC/Helitron	766803
239 | RathE1_cons	1188
240 | RathE2_cons	245
241 | RathE3_cons	196
242 | SINE	9192
243 | Satellite	132
244 | TIR	79579
245 | TIR/Mutator	364
246 | TRIM	53086
247 | Unclassified	126483
248 | ```
249 | 
250 | ## Runtime and RAM requirements
251 | 
252 | These data were measured on a CentOS7.9 computer using 4 cores of a Xeon E5-2620 v4 (2.10GHz) CPU.
253 | 
254 | ![](../files/runtime_ram.png)
255 | 
256 | 
257 | ## Error messages
258 | 
259 | + ERROR: cannot run Red -9: This means the Red process was killed by the operating system, usually for taking too much RAM. You will need more RAM to run this job.
260 | 
261 | 


--------------------------------------------------------------------------------
/repeats/bench/list.Red:
--------------------------------------------------------------------------------
 1 | Aegilops_tauschii
 2 | Arabidopsis_halleri
 3 | Arabidopsis_thaliana
 4 | Arabis_alpina
 5 | Brachypodium_distachyon
 6 | Brassica_rapa
 7 | Camelina_sativa
 8 | Citrullus_lanatus
 9 | Cucumis_melo
10 | Helianthus_annuus
11 | Malus_domestica_golden
12 | Olea_europaea_sylvestris
13 | Oryza_sativa
14 | Prunus_dulcis
15 | Rosa_chinensis
16 | Setaria_viridis
17 | Trifolium_pratense
18 | Triticum_turgidum
19 | Vitis_vinifera
20 | Zea_mays
21 | 


--------------------------------------------------------------------------------
/repeats/bench/list.cores:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii_core_48_101_3
 2 | arabidopsis_halleri_core_48_101_1
 3 | arabidopsis_thaliana_core_48_101_11
 4 | arabis_alpina_core_48_101_1
 5 | brachypodium_distachyon_core_48_101_4
 6 | brassica_rapa_core_48_101_1
 7 | camelina_sativa_core_48_101_1
 8 | citrullus_lanatus_core_48_101_1
 9 | cucumis_melo_core_48_101_1
10 | helianthus_annuus_core_48_101_1
11 | malus_domestica_golden_core_48_101_1
12 | olea_europaea_sylvestris_core_48_101_1
13 | oryza_sativa_core_48_101_7
14 | prunus_dulcis_core_48_101_1
15 | rosa_chinensis_core_48_101_1
16 | setaria_viridis_core_48_101_1
17 | trifolium_pratense_core_48_101_1
18 | triticum_turgidum_core_48_101_1
19 | vitis_vinifera_core_48_101_3
20 | zea_mays_core_48_101_7
21 | 


--------------------------------------------------------------------------------
/repeats/bench/list.cores.sp:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii_core_48_101_3	aegilops_tauschii
 2 | arabidopsis_halleri_core_48_101_1	arabidopsis_halleri
 3 | arabidopsis_thaliana_core_48_101_11	arabidopsis_thaliana
 4 | arabis_alpina_core_48_101_1	arabis_alpina
 5 | brachypodium_distachyon_core_48_101_4	brachypodium_distachyon
 6 | brassica_rapa_core_48_101_1	brassica_rapa
 7 | camelina_sativa_core_48_101_1	camelina_sativa
 8 | citrullus_lanatus_core_48_101_1	citrullus_lanatus
 9 | cucumis_melo_core_48_101_1	cucumis_melo
10 | helianthus_annuus_core_48_101_1	helianthus_annuus
11 | malus_domestica_golden_core_48_101_1	malus_domestica_golden
12 | olea_europaea_sylvestris_core_48_101_1	olea_europaea_sylvestris
13 | oryza_sativa_core_48_101_7	oryza_sativa
14 | prunus_dulcis_core_48_101_1	prunus_dulcis
15 | rosa_chinensis_core_48_101_1	rosa_chinensis
16 | setaria_viridis_core_48_101_1	setaria_viridis
17 | trifolium_pratense_core_48_101_1	trifolium_pratense
18 | triticum_turgidum_core_48_101_1	triticum_turgidum
19 | vitis_vinifera_core_48_101_3	vitis_vinifera
20 | zea_mays_core_48_101_7	zea_mays
21 | 


--------------------------------------------------------------------------------
/repeats/bench/list.cores.sp.toplevel:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii_core_48_101_3	aegilops_tauschii	Aegilops_tauschii.Aet_v4.0.dna.toplevel.fa
 2 | arabidopsis_halleri_core_48_101_1	arabidopsis_halleri	Arabidopsis_halleri.Ahal2.2.dna.toplevel.fa
 3 | arabidopsis_thaliana_core_48_101_11	arabidopsis_thaliana	Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
 4 | arabis_alpina_core_48_101_1	arabis_alpina	Arabis_alpina.A_alpina_V4.dna.toplevel.fa
 5 | brachypodium_distachyon_core_48_101_4	brachypodium_distachyon	Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa
 6 | brassica_rapa_core_48_101_1	brassica_rapa	Brassica_rapa.Brapa_1.0.dna.toplevel.fa
 7 | camelina_sativa_core_48_101_1	camelina_sativa	Camelina_sativa.Cs.dna.toplevel.fa
 8 | citrullus_lanatus_core_48_101_1	citrullus_lanatus	Citrullus_lanatus.Cla97_v1.dna.toplevel.fa
 9 | cucumis_melo_core_48_101_1	cucumis_melo	Cucumis_melo.Melonv4.dna.toplevel.fa
10 | helianthus_annuus_core_48_101_1	helianthus_annuus	Helianthus_annuus.HanXRQr1.0.dna.toplevel.fa
11 | malus_domestica_golden_core_48_101_1	malus_domestica_golden	Malus_domestica_golden.ASM211411v1.dna.toplevel.fa
12 | olea_europaea_sylvestris_core_48_101_1	olea_europaea_sylvestris	Olea_europaea_sylvestris.O_europaea_v1.dna.toplevel.fa
13 | oryza_sativa_core_48_101_7	oryza_sativa	Oryza_sativa.IRGSP-1.0.dna.toplevel.fa
14 | prunus_dulcis_core_48_101_1	prunus_dulcis	Prunus_dulcis.ALMONDv2.dna.toplevel.fa
15 | rosa_chinensis_core_48_101_1	rosa_chinensis	Rosa_chinensis.RchiOBHm-V2.dna.toplevel.fa
16 | setaria_viridis_core_48_101_1	setaria_viridis	Setaria_viridis.Setaria_viridis_v2.0.dna.toplevel.fa
17 | trifolium_pratense_core_48_101_1	trifolium_pratense	Trifolium_pratense.Trpr.dna.toplevel.fa
18 | triticum_turgidum_core_48_101_1	triticum_turgidum	Triticum_turgidum.Svevo.v1.dna.toplevel.fa
19 | vitis_vinifera_core_48_101_3	vitis_vinifera	Vitis_vinifera.12X.dna.toplevel.fa
20 | zea_mays_core_48_101_7	zea_mays	Zea_mays.B73_RefGen_v4.dna.toplevel.fa
21 | 


--------------------------------------------------------------------------------
/repeats/bench/list.cores.wheat:
--------------------------------------------------------------------------------
 1 | triticum_aestivum_arinalrfor_core_51_104_1
 2 | triticum_aestivum_jagger_core_51_104_1
 3 | triticum_aestivum_julius_core_51_104_1
 4 | triticum_aestivum_lancer_core_51_104_1
 5 | triticum_aestivum_landmark_core_51_104_1
 6 | triticum_aestivum_mace_core_51_104_1
 7 | triticum_aestivum_mattis_core_51_104_1
 8 | triticum_aestivum_norin61_core_51_104_1
 9 | triticum_aestivum_stanley_core_51_104_1
10 | 


--------------------------------------------------------------------------------
/repeats/bench/list.toplevel:
--------------------------------------------------------------------------------
 1 | Aegilops_tauschii.Aet_v4.0.dna.toplevel.fa
 2 | Arabidopsis_halleri.Ahal2.2.dna.toplevel.fa
 3 | Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
 4 | Arabis_alpina.A_alpina_V4.dna.toplevel.fa
 5 | Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa
 6 | Brassica_rapa.Brapa_1.0.dna.toplevel.fa
 7 | Camelina_sativa.Cs.dna.toplevel.fa
 8 | Citrullus_lanatus.Cla97_v1.dna.toplevel.fa
 9 | Cucumis_melo.Melonv4.dna.toplevel.fa
10 | Helianthus_annuus.HanXRQr1.0.dna.toplevel.fa
11 | Malus_domestica_golden.ASM211411v1.dna.toplevel.fa
12 | Olea_europaea_sylvestris.O_europaea_v1.dna.toplevel.fa
13 | Oryza_sativa.IRGSP-1.0.dna.toplevel.fa
14 | Prunus_dulcis.ALMONDv2.dna.toplevel.fa
15 | Rosa_chinensis.RchiOBHm-V2.dna.toplevel.fa
16 | Setaria_viridis.Setaria_viridis_v2.0.dna.toplevel.fa
17 | Trifolium_pratense.Trpr.dna.toplevel.fa
18 | Triticum_turgidum.Svevo.v1.dna.toplevel.fa
19 | Vitis_vinifera.12X.dna.toplevel.fa
20 | Zea_mays.B73_RefGen_v4.dna.toplevel.fa
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.Rgenes.50:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	916	3144217	130459	178814	225364
 2 | arabidopsis_halleri	209	635687	55807	425348	119975
 3 | arabidopsis_thaliana	69	201660	22619	151854	54427
 4 | arabis_alpina	364	1201405	91329	787666	192520
 5 | brachypodium_distachyon	344	1249602	154904	208189	177288
 6 | brassica_rapa	219	729119	55369	485467	158167
 7 | camelina_sativa	573	1808571	108683	1032089	352086
 8 | citrullus_lanatus	43	164993	15906	72027	9153
 9 | cucumis_melo	89	294239	38958	189021	44912
10 | helianthus_annuus	604	2055877	66811	497800	798084
11 | malus_domestica_golden	637	2535932	343622	1861255	944792
12 | olea_europaea_sylvestris	402	1095646	80911	151255	121251
13 | oryza_sativa	45	169051	9415	34747	10603
14 | prunus_dulcis	387	1354547	223204	1055234	442933
15 | rosa_chinensis	963	3347977	398581	2255192	1287944
16 | setaria_viridis	453	1682607	109789	161966	393371
17 | trifolium_pratense	553	1781566	518826	810044	570870
18 | triticum_turgidum	2459	8351371	349698	465873	570011
19 | vitis_vinifera	739	2723777	377354	1645288	1065500
20 | zea_mays	158	487237	19550	39041	94402
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.exons:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	167903	68899123	1093757	1476448	3150918
 2 | arabidopsis_halleri	168961	48889148	4713750	10589208	8184041
 3 | arabidopsis_thaliana	145966	46554976	2639673	7998681	8355123
 4 | arabis_alpina	108362	30098225	1690816	6446837	2726170
 5 | brachypodium_distachyon	152258	68995392	3749292	6022129	5544077
 6 | brassica_rapa	206588	48263240	2740496	9859653	3779043
 7 | camelina_sativa	481073	132504580	6268348	20505918	36061050
 8 | citrullus_lanatus	118249	26303029	1094402	4528151	292055
 9 | cucumis_melo	137797	41060653	1150542	6423622	2905048
10 | helianthus_annuus	231826	82980403	2162522	3054869	9975943
11 | malus_domestica_golden	221431	62527585	2256187	8083094	3939718
12 | olea_europaea_sylvestris	235142	65931783	2550352	6425574	12189818
13 | oryza_sativa	156630	65380182	2349371	5368395	4300234
14 | prunus_dulcis	134744	47752477	2716214	10176450	4580463
15 | rosa_chinensis	174074	74446167	3081688	12161789	12510233
16 | setaria_viridis	161524	74246042	2306301	4559099	6563336
17 | trifolium_pratense	181312	55716319	4583322	10160516	5755230
18 | triticum_turgidum	308306	82770985	554686	1220411	2480209
19 | vitis_vinifera	147613	39848131	1841978	5904363	4073035
20 | zea_mays	206593	63390920	488030	1046187	8058936
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.gc:
--------------------------------------------------------------------------------
 1 | ../Red_minimap2//Aegilops_tauschii.Aet_v4.0.dna.toplevel.fa	46.3
 2 | ../Red_minimap2//Arabidopsis_halleri.Ahal2.2.dna.toplevel.fa	36.0
 3 | ../Red_minimap2//Arabidopsis_thaliana.TAIR10.dna.toplevel.fa	36.1
 4 | ../Red_minimap2//Arabis_alpina.A_alpina_V4.dna.toplevel.fa	36.8
 5 | ../Red_minimap2//Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa	46.4
 6 | ../Red_minimap2//Brassica_rapa.Brapa_1.0.dna.toplevel.fa	35.3
 7 | ../Red_minimap2//Camelina_sativa.Cs.dna.toplevel.fa	36.6
 8 | ../Red_minimap2//Citrullus_lanatus.Cla97_v1.dna.toplevel.fa	33.6
 9 | ../Red_minimap2//Cucumis_melo.Melonv4.dna.toplevel.fa	33.5
10 | ../Red_minimap2//Helianthus_annuus.HanXRQr1.0.dna.toplevel.fa	38.5
11 | ../Red_minimap2//Malus_domestica_golden.ASM211411v1.dna.toplevel.fa	38.0
12 | ../Red_minimap2//Olea_europaea_sylvestris.O_europaea_v1.dna.toplevel.fa	35.4
13 | ../Red_minimap2//Oryza_sativa.IRGSP-1.0.dna.toplevel.fa	43.6
14 | ../Red_minimap2//Prunus_dulcis.ALMONDv2.dna.toplevel.fa	37.6
15 | ../Red_minimap2//Rosa_chinensis.RchiOBHm-V2.dna.toplevel.fa	38.8
16 | ../Red_minimap2//Setaria_viridis.Setaria_viridis_v2.0.dna.toplevel.fa	46.2
17 | ../Red_minimap2//Trifolium_pratense.Trpr.dna.toplevel.fa	32.4
18 | ../Red_minimap2//Triticum_turgidum.Svevo.v1.dna.toplevel.fa	46.0
19 | ../Red_minimap2//Vitis_vinifera.12X.dna.toplevel.fa	34.5
20 | ../Red_minimap2//Zea_mays.B73_RefGen_v4.dna.toplevel.fa	46.9
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.genes:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	39614	348459650	107216460	115488145	136485083
 2 | arabidopsis_halleri	32158	77672904	7028692	14970718	14329155
 3 | arabidopsis_thaliana	27628	67504196	3255170	9734724	13753315
 4 | arabis_alpina	21609	45801712	2194862	8499977	4069886
 5 | brachypodium_distachyon	34310	122507803	7423797	9928120	10466133
 6 | brassica_rapa	41018	83240510	4332091	15559458	11155807
 7 | camelina_sativa	89275	215022170	8523587	26276159	49343237
 8 | citrullus_lanatus	22541	81241809	2904724	10076936	6915666
 9 | cucumis_melo	28299	99772108	3949118	17800648	11892758
10 | helianthus_annuus	52191	199869937	7343679	16325186	58538874
11 | malus_domestica_golden	40624	145824479	8244741	29968698	25457319
12 | olea_europaea_sylvestris	50678	153512663	6720213	11820126	29235344
13 | oryza_sativa	35775	130112020	8042778	13051885	11449782
14 | prunus_dulcis	27963	94240163	4142093	17684800	11380531
15 | rosa_chinensis	45464	117342283	3905528	14950018	18102851
16 | setaria_viridis	38334	123295550	4481990	7094063	10854583
17 | trifolium_pratense	39917	133489771	12955199	17236097	24083603
18 | triticum_turgidum	66545	475684837	148018114	156621545	185695090
19 | vitis_vinifera	29927	153564951	10261857	40330281	30458129
20 | zea_mays	39583	168056455	14098099	16263247	41720838
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.nrplants.bed:
--------------------------------------------------------------------------------
 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species aegilops_tauschii -logic_name repeatmask_nrplants
 2 | aegilops_tauschii	genome_length	4224915394
 3 | aegilops_tauschii	repeatmask_nrplants	2954386572
 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_halleri -logic_name repeatmask_nrplants
 5 | arabidopsis_halleri	genome_length	196243198
 6 | arabidopsis_halleri	repeatmask_nrplants	54248175
 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_thaliana -logic_name repeatmask_nrplants
 8 | arabidopsis_thaliana	genome_length	119667750
 9 | arabidopsis_thaliana	repeatmask_nrplants	34493047
10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabis_alpina -logic_name repeatmask_nrplants
11 | arabis_alpina	genome_length	
12 | arabis_alpina	repeatmask_nrplants	138763577
13 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brachypodium_distachyon -logic_name repeatmask_nrplants
14 | brachypodium_distachyon	genome_length	271163419
15 | brachypodium_distachyon	repeatmask_nrplants	76177501
16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brassica_rapa -logic_name repeatmask_nrplants
17 | brassica_rapa	genome_length	283822783
18 | brassica_rapa	repeatmask_nrplants	85105371
19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species camelina_sativa -logic_name repeatmask_nrplants
20 | camelina_sativa	genome_length	641356059
21 | camelina_sativa	repeatmask_nrplants	158537047
22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species citrullus_lanatus -logic_name repeatmask_nrplants
23 | citrullus_lanatus	genome_length	365450462
24 | citrullus_lanatus	repeatmask_nrplants	64792392
25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species cucumis_melo -logic_name repeatmask_nrplants
26 | cucumis_melo	genome_length	357857370
27 | cucumis_melo	repeatmask_nrplants	159149620
28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species helianthus_annuus -logic_name repeatmask_nrplants
29 | helianthus_annuus	genome_length	3027844945
30 | helianthus_annuus	repeatmask_nrplants	788477175
31 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species malus_domestica_golden -logic_name repeatmask_nrplants
32 | malus_domestica_golden	genome_length	702961352
33 | malus_domestica_golden	repeatmask_nrplants	313026586
34 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species olea_europaea_sylvestris -logic_name repeatmask_nrplants
35 | olea_europaea_sylvestris	genome_length	1140989389
36 | olea_europaea_sylvestris	repeatmask_nrplants	227766390
37 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species oryza_sativa -logic_name repeatmask_nrplants
38 | oryza_sativa	genome_length	375049285
39 | oryza_sativa	repeatmask_nrplants	146748772
40 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species prunus_dulcis -logic_name repeatmask_nrplants
41 | prunus_dulcis	genome_length	227498357
42 | prunus_dulcis	repeatmask_nrplants	86267619
43 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species rosa_chinensis -logic_name repeatmask_nrplants
44 | rosa_chinensis	genome_length	515588973
45 | rosa_chinensis	repeatmask_nrplants	116760801
46 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species setaria_viridis -logic_name repeatmask_nrplants
47 | setaria_viridis	genome_length	
48 | setaria_viridis	repeatmask_nrplants	80092623
49 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species trifolium_pratense -logic_name repeatmask_nrplants
50 | trifolium_pratense	genome_length	304842038
51 | trifolium_pratense	repeatmask_nrplants	33750565
52 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species triticum_turgidum -logic_name repeatmask_nrplants
53 | triticum_turgidum	genome_length	10463058104
54 | triticum_turgidum	repeatmask_nrplants	7583835351
55 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species vitis_vinifera -logic_name repeatmask_nrplants
56 | vitis_vinifera	genome_length	486265422
57 | vitis_vinifera	repeatmask_nrplants	216500288
58 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species zea_mays -logic_name repeatmask_nrplants
59 | zea_mays	genome_length	2135083061
60 | zea_mays	repeatmask_nrplants	1337627276
61 | 


--------------------------------------------------------------------------------
/repeats/bench/log.redat.bed:
--------------------------------------------------------------------------------
 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species aegilops_tauschii -logic_name repeatmask_redat
 2 | aegilops_tauschii	genome_length	4224915394
 3 | aegilops_tauschii	repeatmask_redat	2904094129
 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_halleri -logic_name repeatmask_redat
 5 | arabidopsis_halleri	genome_length	196243198
 6 | arabidopsis_halleri	repeatmask_redat	30398422
 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabidopsis_thaliana -logic_name repeatmask_redat
 8 | arabidopsis_thaliana	genome_length	119667750
 9 | arabidopsis_thaliana	repeatmask_redat	16998029
10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species arabis_alpina -logic_name repeatmask_redat
11 | arabis_alpina	genome_length	
12 | arabis_alpina	repeatmask_redat	46572112
13 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brachypodium_distachyon -logic_name repeatmask_redat
14 | brachypodium_distachyon	genome_length	271163419
15 | brachypodium_distachyon	repeatmask_redat	74226118
16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species brassica_rapa -logic_name repeatmask_redat
17 | brassica_rapa	genome_length	283822783
18 | brassica_rapa	repeatmask_redat	24349441
19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species camelina_sativa -logic_name repeatmask_redat
20 | camelina_sativa	genome_length	641356059
21 | camelina_sativa	repeatmask_redat	101243109
22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species citrullus_lanatus -logic_name repeatmask_redat
23 | citrullus_lanatus	genome_length	365450462
24 | citrullus_lanatus	repeatmask_redat	24695282
25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species cucumis_melo -logic_name repeatmask_redat
26 | cucumis_melo	genome_length	357857370
27 | cucumis_melo	repeatmask_redat	29282530
28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species helianthus_annuus -logic_name repeatmask_redat
29 | helianthus_annuus	genome_length	3027844945
30 | helianthus_annuus	repeatmask_redat	302618657
31 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species malus_domestica_golden -logic_name repeatmask_redat
32 | malus_domestica_golden	genome_length	702961352
33 | malus_domestica_golden	repeatmask_redat	63711081
34 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species olea_europaea_sylvestris -logic_name repeatmask_redat
35 | olea_europaea_sylvestris	genome_length	1140989389
36 | olea_europaea_sylvestris	repeatmask_redat	205755459
37 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species oryza_sativa -logic_name repeatmask_redat
38 | oryza_sativa	genome_length	375049285
39 | oryza_sativa	repeatmask_redat	121253121
40 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species prunus_dulcis -logic_name repeatmask_redat
41 | prunus_dulcis	genome_length	227498357
42 | prunus_dulcis	repeatmask_redat	14847578
43 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species rosa_chinensis -logic_name repeatmask_redat
44 | rosa_chinensis	genome_length	515588973
45 | rosa_chinensis	repeatmask_redat	43127746
46 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species setaria_viridis -logic_name repeatmask_redat
47 | setaria_viridis	genome_length	
48 | setaria_viridis	repeatmask_redat	73169312
49 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species trifolium_pratense -logic_name repeatmask_redat
50 | trifolium_pratense	genome_length	304842038
51 | trifolium_pratense	repeatmask_redat	33405939
52 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species triticum_turgidum -logic_name repeatmask_redat
53 | triticum_turgidum	genome_length	10463058104
54 | triticum_turgidum	repeatmask_redat	7498456644
55 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species vitis_vinifera -logic_name repeatmask_redat
56 | vitis_vinifera	genome_length	486265422
57 | vitis_vinifera	repeatmask_redat	43968916
58 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $p1panreg -species zea_mays -logic_name repeatmask_redat
59 | zea_mays	genome_length	2135083061
60 | zea_mays	repeatmask_redat	1270268146
61 | 


--------------------------------------------------------------------------------
/repeats/bench/log.repeat.N50:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	10133	9973	9431
 2 | arabidopsis_halleri	554	1380	1431
 3 | arabidopsis_thaliana	445	1779	2211
 4 | arabis_alpina	1040	2245	1050
 5 | brachypodium_distachyon	4986	6260	6665
 6 | brassica_rapa	642	1046	777
 7 | camelina_sativa	878	1272	1176
 8 | citrullus_lanatus	2596	1020	1103
 9 | cucumis_melo	1939	3141	1338
10 | helianthus_annuus	5018	8716	1317
11 | malus_domestica_golden	2416	4729	1268
12 | olea_europaea_sylvestris	3153	1956	1218
13 | oryza_sativa	2931	4479	6077
14 | prunus_dulcis	1627	2528	1025
15 | rosa_chinensis	2125	1479	950
16 | setaria_viridis	3124	1727	1722
17 | trifolium_pratense	555	326	265
18 | triticum_turgidum	9066	9947	10124
19 | vitis_vinifera	1753	3369	1550
20 | zea_mays	13137	11806	11419
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.repeat.length:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	1506690	404	777962	1173	847592	963
 2 | arabidopsis_halleri	226080	124	81857	308	57901	209
 3 | arabidopsis_thaliana	172935	123	48144	267	28797	181
 4 | arabis_alpina	279129	169	146057	415	98017	222
 5 | brachypodium_distachyon	150191	121	74215	193	67632	183
 6 | brassica_rapa	348258	110	160157	259	69345	163
 7 | camelina_sativa	709160	122	267290	281	201059	223
 8 | citrullus_lanatus	323894	112	151980	190	52941	183
 9 | cucumis_melo	305083	118	148925	329	51833	213
10 | helianthus_annuus	2387122	198	355890	388	479400	352
11 | malus_domestica_golden	531496	148	211929	478	126487	201
12 | olea_europaea_sylvestris	901519	139	291445	254	375614	238
13 | oryza_sativa	278406	147	160371	255	129121	239
14 | prunus_dulcis	190357	109	105546	287	36891	148
15 | rosa_chinensis	463880	150	189086	268	93715	207
16 | setaria_viridis	247732	165	116459	271	105088	274
17 | trifolium_pratense	277811	206	139254	153	155808	147
18 | triticum_turgidum	4291533	312	1914776	1270	1784719	1456
19 | vitis_vinifera	423876	132	185204	395	69315	247
20 | zea_mays	847205	211	365978	696	372467	669
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.repeat.overlap:
--------------------------------------------------------------------------------
 1 | aegilops_tauschiiaegilops_tauschii	3454963624	2833915889	2871180926	76588631	161578261	201668761	183619439
 2 | arabidopsis_halleriarabidopsis_halleri	61005460	21270154	34296626	8063578	3574480	39244394	4508394
 3 | arabidopsis_thalianaarabidopsis_thaliana	43943711	11477631	20623470	5229908	2896188	6146057	3361091
 4 | arabis_alpinaarabis_alpina	116069065	34974204	101561844	7713526	7886525	41787455	10551564
 5 | brachypodium_distachyonbrachypodium_distachyon	84292420	63878923	63001822	2649893	3750379	6329521	6241890
 6 | brassica_rapabrassica_rapa	93148976	12137250	53471309	18246330	8884888	33405301	10922095
 7 | camelina_sativacamelina_sativa	230739085	72221094	100798661	20302963	16816272	83987132	23257791
 8 | citrullus_lanatuscitrullus_lanatus	149139515	17684233	45494701	13773277	9858124	37754666	14663873
 9 | cucumis_melocucumis_melo	142860741	21541420	122388576	16345201	15266153	33077764	19515094
10 | helianthus_annuushelianthus_annuus	2227207957	286811294	729901538	35524719	47044137	97126226	58855498
11 | malus_domestica_goldenmalus_domestica_golden	292775696	51043115	256385563	15837106	26741124	109164886	34292176
12 | olea_europaea_sylvestrisolea_europaea_sylvestris	516233180	175103570	184876693	25250984	197048464	159769209	210634322
13 | oryza_sativaoryza_sativa	138926649	102882577	116355536	10160029	11040079	19487736	15617002
14 | prunus_dulcisprunus_dulcis	75942705	9724644	62228480	6821347	7159674	16557179	9697037
15 | rosa_chinensisrosa_chinensis	247911427	35147667	89040006	11590565	14996352	22275228	21000797
16 | setaria_viridissetaria_viridis	161622886	62499090	65656462	5299406	8470076	11109097	12159449
17 | trifolium_pratensetrifolium_pratense	91430112	21074343	16935969	14305569	5963288	57474544	8446622
18 | triticum_turgidumtriticum_turgidum	8599065456	7281888447	7341688163	171184854	283316382	400291848	330564764
19 | vitis_viniferavitis_vinifera	194397642	30845098	160405750	28366556	19834542	63337140	26244040
20 | zea_mayszea_mays	1687073942	1238165526	1300760021	32339911	68910571	55601139	77759381
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.updown500:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	37802544	2033719	2556035	6319681
 2 | arabidopsis_halleri	32329295	3209393	5607682	10045427
 3 | arabidopsis_thaliana	25181016	1142935	3541364	8878106
 4 | arabis_alpina	20949737	804950	3986554	3468875
 5 | brachypodium_distachyon	33409828	2753591	2591857	4262505
 6 | brassica_rapa	45594270	902632	4380362	8410470
 7 | camelina_sativa	92778852	3128183	6080024	13135142
 8 | citrullus_lanatus	22596226	244442	849017	1732409
 9 | cucumis_melo	29465346	471854	5587397	5542277
10 | helianthus_annuus	51826802	888649	3018001	15501880
11 | malus_domestica_golden	40858762	759814	6404045	7165639
12 | olea_europaea_sylvestris	51470230	1565047	2186588	13800220
13 | oryza_sativa	37562150	3910303	5347525	5571857
14 | prunus_dulcis	28912163	1042283	5338746	5180817
15 | rosa_chinensis	44161039	856677	3423518	10039144
16 | setaria_viridis	36616623	1995173	2161278	5533510
17 | trifolium_pratense	37853815	3500940	2647709	11237759
18 | triticum_turgidum	65803596	1556728	2294898	6261722
19 | vitis_vinifera	30402773	483070	4132002	5409249
20 | zea_mays	39795578	1491136	1729071	8777432
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.updown500.16mer:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	1111069	1325298	1902833	1363	1454080	1714053	2391503	1061	2298895	3100696	5795036	914	2878	2597	13888			0	0	0
 2 | arabidopsis_halleri	1247101	1635827	2932850	1005	2548460	3253636	5179384	677	4261046	5692656	9016601	2288	9731	11588	15605			0	0	0
 3 | arabidopsis_thaliana	624454	776496	1039275	2829	2153983	2610131	3318313	2921	4436392	5884689	8026943	5198	654	1568	3030			0	0	0
 4 | arabis_alpina	474031	544822	723210	297	2231580	2632987	3715198	242	1516412	1930758	3152918	1728	746	5890	6915			0	0	0
 5 | brachypodium_distachyon	1565921	1853918	2524763	779	1524916	1788360	2376687	795	2037498	2561620	3834269	969	2402	1992	4583			0	0	0
 6 | brassica_rapa	530834	601740	791040	6796	2478355	2954115	4018130	8008	3914096	5034248	7423005	8274	873	4025	7990			0	0	0
 7 | camelina_sativa	1556418	1895552	2815016	13834	3122315	3875028	5548080	14461	4375242	6407004	11598425	14856	5437	6442	20796			0	0	0
 8 | citrullus_lanatus	150203	163711	211526	3834	560286	620903	747301	3909	857201	1011371	1455931	17616	139	287	1867			0	0	0
 9 | cucumis_melo	246274	290930	420430	11398	2640761	3328074	5270065	13395	2094704	2839587	5081380	9189	400	9734	11503			0	0	0
10 | helianthus_annuus	356715	443445	809633	14526	1810759	2109613	2815682	3631	4695577	6575658	14166127	15288	3124	2829	61862			0	0	0
11 | malus_domestica_golden	348818	419027	682749	4301	3040908	3772748	5961699	4538	2501077	3353107	6455560	2690	1800	14598	22558			0	0	0
12 | olea_europaea_sylvestris	773402	940950	1445114	1710	1105399	1338148	2019176	1943	4818543	6524789	12771861	4430	2629	3603	46070			0	0	0
13 | oryza_sativa	1789624	2254108	3594856	7281	2593470	3225411	4939191	7934	1942108	2661402	5024991	6158	8489	10229	15572			0	0	0
14 | prunus_dulcis	421171	565558	959320	9250	2373012	3076906	4992783	12826	1804183	2539079	4695670	9597	1662	10073	11838			0	0	0
15 | rosa_chinensis	406613	515667	772244	9538	1681148	2115001	3143329	9248	3324915	4755938	9230439	10040	977	3890	24917			0	0	0
16 | setaria_viridis	970273	1147999	1835321	1459	1096754	1290404	1990566	1453	2356486	3018484	5076815	2184	4599	4395	10603			0	0	0
17 | trifolium_pratense	1869551	2225603	3094900	4938	1455142	1718906	2333199	4835	5427189	6828986	10372840	3601	4469	2921	17234			0	0	0
18 | triticum_turgidum	867856	1013476	1439819	1737	1264377	1485940	2131356	1414	2137683	2856801	5623225	648	1277	3410	16226			0	0	0
19 | vitis_vinifera	324268	360245	438647	6161	2352661	2806772	3859436	6274	2208371	2961505	4890287	5692	99	3723	7834			0	0	0
20 | zea_mays	782560	932579	1385724	2365	900580	1073884	1606348	2449	2862016	4091994	8069996	1097	2816	3364	24826			0	0	0
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.updown500.21mer:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	1223322	1408334	1860185	2290	1593018	1815191	2337877	1770	2746261	3473619	5625037	791	1863	1384	8776			0	0	0
 2 | arabidopsis_halleri	1408835	1778359	2846660	936	2857251	3498578	5047256	553	5188324	6366149	8692000	1189	7033	8047	9579			0	0	0
 3 | arabidopsis_thaliana	663200	797431	1005778	4011	2344450	2722424	3246191	3770	5159100	6335478	7753347	5167	356	1006	1331			0	0	0
 4 | arabis_alpina	504995	564886	697566	289	2502190	2836538	3633829	200	1801615	2153476	3054388	1594	279	3698	4345			0	0	0
 5 | brachypodium_distachyon	1734256	1978460	2450401	1251	1670711	1892553	2306867	1270	2381022	2819157	3695748	962	1179	1004	2184			0	0	0
 6 | brassica_rapa	549634	609285	755899	5706	2735926	3121976	3904293	7147	4986641	5760273	7101468	7353	510	2326	3444			0	0	0
 7 | camelina_sativa	1733276	2022031	2715179	16131	3506426	4135102	5380274	12695	5574783	7355635	11118445	13544	3739	4105	10383			0	0	0
 8 | citrullus_lanatus	152292	163914	201039	5808	592707	636177	715949	5844	1059247	1149593	1366495	12031	81	111	762			0	0	0
 9 | cucumis_melo	257229	297295	403849	18072	3109207	3710697	5166500	20554	2634651	3289721	4931468	14357	238	5135	5841			0	0	0
10 | helianthus_annuus	380532	458931	784156	24526	1982543	2229863	2749547	5684	6029547	7788907	13732666	24555	2933	1468	41612			0	0	0
11 | malus_domestica_golden	366050	430759	658087	6484	3545516	4172896	5817868	6421	3220361	3978248	6225186	3619	1592	9565	13733			0	0	0
12 | olea_europaea_sylvestris	871190	1018641	1406500	2449	1235991	1440761	1966152	2781	6411399	7911176	12443983	2554	1264	1861	26460			0	0	0
13 | oryza_sativa	2116101	2538384	3491871	12073	3020749	3585578	4806162	13162	2481478	3148448	4847778	10081	4277	5249	8160			0	0	0
14 | prunus_dulcis	446225	583912	932789	7176	2673607	3308626	4881290	9921	2173555	2836039	4539038	7582	1351	7325	7954			0	0	0
15 | rosa_chinensis	429337	528817	745298	14518	1826447	2215157	3054660	13866	4025118	5349942	8969485	8487	768	2633	16615			0	0	0
16 | setaria_viridis	1081627	1243340	1783202	2271	1208904	1384165	1935141	2202	2777054	3357458	4928682	1495	3674	3381	7616			0	0	0
17 | trifolium_pratense	2123482	2403634	2968335	7571	1602235	1815838	2237466	7451	7249275	8204857	10097794	4552	2099 1476	6772			0	0	0
18 | triticum_turgidum	950978	1072533	1402071	2916	1387830	1576218	2078341	2354	2575423	3226596	5421604	745	698 2191	10238			0	0	0
19 | vitis_vinifera	330290	360874	424517	9858	2766085	3111395	3771552	9755	2948732	3554803	4723595	8621	73	1371	1874			0	0	0
20 | zea_mays	864987	996709	1351487	4284	1000625	1151863 1566536	4379	3428345	4582250	7840336	969	1874	2245	16700			0	0	0
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.updown500.31mer:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	1327296	1470530	1776128	1559	1714700	1884340	2232188	1220	3252707	3824185	5292434	593	923	439	4187			0	0	0
 2 | arabidopsis_halleri	1570475	1900695	2679047	154	3094086	3655001	4789805	131	5321226	6333885	8063564	1054	3654	4023	4790			0	0	0
 3 | arabidopsis_thaliana	672608	785618	940364	1740	2352451	2683600	3104925	1520	4981634	6024166	7221741	1858	125	538	710			0	0	0
 4 | arabis_alpina	524160	567793	647642	89	2739457	2987187	3473786	120	2066221	2324126	2864631	1521	59	1518	1884			0	0	0
 5 | brachypodium_distachyon	1845094	2029045	2304231	850	1760042	1926626	2169866	866	2611273	2927279	3426905	685	238	194	472			0	0	0
 6 | brassica_rapa	538950	585848	687479	4471	2817901	3124200	3683004	6220	4912675	5518795	6472154	6468	186	913	1616			0	0	0
 7 | camelina_sativa	1842346	2066011	2520913	7881	3716167	4206627	5054915	10127	6202631	7624495	10197074	11795	2071	2189	4698			0	0	0
 8 | citrullus_lanatus	148192	157470	180678	2945	576000	607310	655784	2921	1066160	1110006	1192984	5528	56	57	118			0	0	0
 9 | cucumis_melo	259787	293561	371604	10555	3436927	3947971	4962679	11628	2981354	3526419	4639412	8997	97	1846	1970			0	0	0
10 | helianthus_annuus	398948	465783	734515	16886	2090683	2278943	2619561	3448	7508292	8939808	12883915	16468	2697	484	21530			0	0	0
11 | malus_domestica_golden	376778	432810	610052	3411	3935807	4429259	5535300	3166	3867927	4423059	5775524	1968	1297	5594	7141			0	0	0
12 | olea_europaea_sylvestris	980001	1091723	1331015	1033	1377986	1534403	1862652	1173	7938185	9078737	11806171	993	349	553	9723			0	0	0
13 | oryza_sativa	2349954	2699566	3289073	8470	3311311	3777267	4544331	9227	2940533	3479084	4502520	7125	1188	1482	2404			0	0	0
14 | prunus_dulcis	469297	595896	880989	3616	2892536	3443497	4662741	5042	2398511	2958175	4235468	4259	819	5091	5300			0	0	0
15 | rosa_chinensis	447598	531933	693111	7075	1947075	2268535	2882830	6659	4780031	5877362	8460802	6331	483	1619	9679			0	0	0
16 | setaria_viridis	1178837	1318310	1680782	1441	1303496	1453873	1826274	1441	3141321	3602375	4639356	832	2032	1903	4326			0	0	0
17 | trifolium_pratense	2193420	2395040	2720261	4109	1642125	1797835	2050302	4118	7826017	8494360	9563697	2113	730	582	2498			0	0	0
18 | triticum_turgidum	1020712	1109210	1327878	1930	1499577	1641559	1974249	1545	3050874	3558442	5031631	496	223	865	4691			0	0	0
19 | vitis_vinifera	328208	352166	397052	5808	2944335	3192184	3599005	5656	3224506	3666047	4398918	5065	47	340	397			0	0	0
20 | zea_mays	941465	1046159	1284341	3507	1092771	1211705	1488377	3544	4102855	5070164	7390280	751	858	1055	7903			0	0	0
21 | 


--------------------------------------------------------------------------------
/repeats/bench/log.wheat.Red.bed:
--------------------------------------------------------------------------------
 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_arinalrfor -logic_name repeatdetector
 2 | triticum_aestivum_arinalrfor	genome_length	14659055505
 3 | triticum_aestivum_arinalrfor	repeatdetector	11909171698
 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_jagger -logic_name repeatdetector
 5 | triticum_aestivum_jagger	genome_length	14552150998
 6 | triticum_aestivum_jagger	repeatdetector	11870523645
 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_julius -logic_name repeatdetector
 8 | triticum_aestivum_julius	genome_length	14394882126
 9 | triticum_aestivum_julius	repeatdetector	11868610789
10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_lancer -logic_name repeatdetector
11 | triticum_aestivum_lancer	genome_length	14293273119
12 | triticum_aestivum_lancer	repeatdetector	11601105199
13 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_landmark -logic_name repeatdetector
14 | triticum_aestivum_landmark	genome_length	14443802583
15 | triticum_aestivum_landmark	repeatdetector	11849684520
16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mace -logic_name repeatdetector
17 | triticum_aestivum_mace	genome_length	14362806306
18 | triticum_aestivum_mace	repeatdetector	11692169342
19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mattis -logic_name repeatdetector
20 | triticum_aestivum_mattis	genome_length	14955365424
21 | triticum_aestivum_mattis	repeatdetector	11748313140
22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_norin61 -logic_name repeatdetector
23 | triticum_aestivum_norin61	genome_length	14157632112
24 | triticum_aestivum_norin61	repeatdetector	11766521326
25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_stanley -logic_name repeatdetector
26 | triticum_aestivum_stanley	genome_length	14207638630
27 | triticum_aestivum_stanley	repeatdetector	11923454722
28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_spelta -logic_name repeatdetector
29 | triticum_spelta	genome_length	14453523434
30 | triticum_spelta	repeatdetector	11727050595
31 | 


--------------------------------------------------------------------------------
/repeats/bench/log.wheat.redat.bed:
--------------------------------------------------------------------------------
 1 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_arinalrfor -logic_name repeatmask_redat
 2 | triticum_aestivum_arinalrfor	genome_length	14659055505
 3 | triticum_aestivum_arinalrfor	repeatmask_redat	10173498584
 4 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_jagger -logic_name repeatmask_redat
 5 | triticum_aestivum_jagger	genome_length	14552150998
 6 | triticum_aestivum_jagger	repeatmask_redat	10067215340
 7 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_julius -logic_name repeatmask_redat
 8 | triticum_aestivum_julius	genome_length	14394882126
 9 | triticum_aestivum_julius	repeatmask_redat	10120102277
10 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_lancer -logic_name repeatmask_redat
11 | triticum_aestivum_lancer	genome_length	14293273119
12 | triticum_aestivum_lancer	repeatmask_redat	9923751623
13 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_landmark -logic_name repeatmask_redat
14 | triticum_aestivum_landmark	genome_length	14443802583
15 | triticum_aestivum_landmark	repeatmask_redat	10047469943
16 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mace -logic_name repeatmask_redat
17 | triticum_aestivum_mace	genome_length	14362806306
18 | triticum_aestivum_mace	repeatmask_redat	9997367079
19 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_mattis -logic_name repeatmask_redat
20 | triticum_aestivum_mattis	genome_length	14955365424
21 | triticum_aestivum_mattis	repeatmask_redat	10037233281
22 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_norin61 -logic_name repeatmask_redat
23 | triticum_aestivum_norin61	genome_length	14157632112
24 | triticum_aestivum_norin61	repeatmask_redat	10067342383
25 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_aestivum_stanley -logic_name repeatmask_redat
26 | triticum_aestivum_stanley	genome_length	14207638630
27 | triticum_aestivum_stanley	repeatmask_redat	10114891908
28 | perl ~/plant_tools/production/misc_scripts/repeat_feature_summary.pl -reg_file $s3panreg -species triticum_spelta -logic_name repeatmask_redat
29 | triticum_spelta	genome_length	14453523434
30 | triticum_spelta	repeatmask_redat	10021321896
31 | 


--------------------------------------------------------------------------------
/repeats/bench/pfam/enrich.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Pfam enrichment: Fisher's exact test per domain plus multiple-test correction,
 3 | # based on 
 4 | # https://github.com/eead-csic-compbio/get_homologues/blob/master/pfam_enrich.pl
 5 | 
 6 | args = commandArgs(trailingOnly=TRUE)
 7 | 
 8 | if(length(args)<3) {
 9 |   stop("# Usage: <exp> <control> <out> filenames", call.=FALSE)
10 | }
11 | 
12 | # globals
13 | direction="greater" # fisher.test 'alternative'; "two.sided" gives a two-tailed test
14 | multitest="fdr"     # p.adjust 'method'
15 | verbose=F           # T traces every test on stderr
16 | 
17 | # example values, handy for interactive runs:
18 | #query_file="aegilops_tauschii.Red.genes.ovl50.tsv"; control_file="aegilops_tauschii.tsv.count.tsv"
19 | #out_file="kk"
20 | 
21 | query_file=args[1]
22 | control_file=args[2]
23 | out_file=args[3]
24 | 
25 | # parse data: headerless TSV tables, column 1 = Pfam id, column 2 = count
26 | que_data=read.csv(query_file, sep="\t", header=FALSE);
27 | que_rows=nrow(que_data);
28 | ref_data=read.csv(control_file, sep="\t", header=FALSE);
29 | ref_rows=nrow(ref_data); 
30 | 
31 | # Fisher's exact test for one query row; reads globals ref_data, que_total, ref_total
32 | enrich_test <- function(x){
33 |   # x: one row of the query table (id, count) as handed over by apply()
34 |   pfam_id <- x[1]
35 |   exp_count <- as.numeric(x[2])
36 |   ctrl_count <- as.numeric(ref_data[ref_data$V1==pfam_id,2])
37 |   # ids missing from the control table count as zero occurrences
38 |   if (length(ctrl_count)==0) ctrl_count <- 0
39 |   # stderr tracing, active only when the global 'verbose' is set
40 |   say <- function(msg) {
41 |     if(verbose==T) cat(paste(msg, "\n"), file=stderr())
42 |   }
43 |   say(pfam_id)
44 |   # 2x2 table filled column-wise: counts under "Pfam", totals under "total"
45 |   input_matrix <- matrix(c(exp_count, ctrl_count, que_total, ref_total),
46 |                          nrow = 2,
47 |                          dimnames=list(c("exp", "control"), c("Pfam", "total")))
48 |   say(input_matrix)
49 |   # the tail of the test is taken from the global 'direction'
50 |   htest <- fisher.test(input_matrix, alternative=direction)
51 |   say(htest)
52 |   # character vector: the id followed by the raw (unadjusted) p-value
53 |   c( pfam_id, htest$p.value )
54 | }
55 | 
56 | print_pvalues <- function(x) { # x: c(id, pvalue, adj_pvalue); appends one line to global out_file
57 |   cat(paste(x[1], x[2], x[3], sep="\t"), "\n", sep="", file=out_file, append=T); # explicit sep: no blanks around the tabs
58 | }
59 | 
60 | que_total=sum(que_data[,2]); # sum of column 2 (counts) of the query table
61 | ref_total=sum(ref_data[,2]); # same for the control table
62 | pvalues=apply(que_data, 1, enrich_test); # character matrix: row 1 ids, row 2 raw p-values
63 | 
64 | ## Multiple test adjustment
65 | num_pvalues=as.numeric(pvalues[2,]);
66 | adj_pvalues=p.adjust(num_pvalues, method=multitest);
67 | result_pvalues=t(rbind(pvalues, adj_pvalues)); # rows to columns: id, p-value, adjusted p-value
68 | unlink(out_file); # print_pvalues appends, so drop output left over from a previous run
69 | invisible(apply(result_pvalues, 1, print_pvalues)); # invisible: Rscript would auto-print "NULL" on stdout
70 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/HOWTO.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | ## gene/exon overlap stats
  3 | 
  4 | # 1st genes
  5 | 
  6 | export MINOVER=50
  7 | # log.genes columns: species, #lines in gene BED, summed gene self-overlap, summed gene/repeat overlap
  8 | while read -r col1 col2; do
  9 | 	rm=$(bedtools intersect -a ../bed/${col1}.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo | perl -lane '$over+=$F[7]; END{print $over}')
 10 | 	genes=$(bedtools intersect -a ../bed/${col1}.bed -b ../bed/${col1}.bed -sorted -wo | perl -lane '$over+=$F[8]; END{print $over}')
 11 | 	read tgenes filename <<< $(wc -l ../bed/${col1}.bed)
 12 | 	printf "%s\t%d\t%d\t%d\n" $col2 $tgenes $genes $rm
 13 | 
 14 | 	# save overlapped genes
 15 | 	bedtools intersect -a ../bed/${col1}.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo > bed/${col2}.repeatmodeller.genes.bed
 16 | 
 17 | 	# MINOVER must be exported: perl reads it from %ENV (a single-quoted ${MINOVER} is an unset perl variable, i.e. 0)
 18 | 	perl -lane 'print $F[3] if($F[7] > $ENV{MINOVER})' bed/${col2}.repeatmodeller.genes.bed | uniq > bed/${col2}.repeatmodeller.genes.ovl${MINOVER}.list
 19 | 
 20 | 	# tally Pfam domain occurrences (skipped when the enrichment table already exists)
 21 | 	if [ ! -f "pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.enrich.tsv" ];
 22 | 	then
 23 | 		while read ln; do fgrep "$ln" ../pfam/${col2}.tsv; done < bed/${col2}.repeatmodeller.genes.ovl${MINOVER}.list | cut -f 2 | sort | uniq -c | perl -lane 'if(/PF/){ print "$F[1]\t$F[0]" }' > pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.tsv
 24 | 
 25 | 		# compute enrichment, then keep rows whose 3rd column (adjusted p-value) is < 0.05
 26 | 		Rscript ../pfam/enrich.R pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.tsv ../pfam/${col2}.tsv.count.tsv pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.all.tsv
 27 | 		perl -lane 'print if($F[2] < 0.05)' pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.all.tsv > pfam/${col2}.repeatmodeller.genes.ovl${MINOVER}.enrich.tsv
 28 | 	else
 29 | 		continue
 30 | 	fi
 31 | 
 32 | done < list.cores.sp > log.genes
 33 | 
 34 | 
 35 | # now exons; log.exons columns: species, #lines in exon BED, summed exon self-overlap, summed exon/repeat overlap
 36 | while read -r col1 col2; do
 37 | 	rm=$(bedtools intersect -a ../bed/${col1}.exon.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo | perl -lane '$over+=$F[7]; END{print $over}')
 38 | 	exons=$(bedtools intersect -a ../bed/${col1}.exon.bed -b ../bed/${col1}.exon.bed -sorted -wo | perl -lane '$over+=$F[8]; END{print $over}')
 39 | 	read texons filename <<< $(wc -l ../bed/${col1}.exon.bed)
 40 | 	printf "%s\t%d\t%d\t%d\n" $col2 $texons $exons $rm
 41 | 
 42 | 	# save overlapped exons
 43 | 	bedtools intersect -a ../bed/${col1}.exon.bed -b ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo > bed/${col2}.repeatmodeller.exons.bed
 44 | 	# exported MINOVER is read via %ENV; NOTE(review): output name lacks the "ovl" prefix and the dot used for the genes list
 45 | 	perl -lane 'print $F[3] if($F[7] > $ENV{MINOVER})' bed/${col2}.repeatmodeller.exons.bed | uniq > bed/${col2}.repeatmodeller.exons.${MINOVER}list
 46 | 
 47 | done < list.cores.sp > log.exons
 48 | 
 49 | # now up & downstream (updown500 BED): species, summed self-overlap, summed overlap with repeats
 50 | while read -r core species; do
 51 | 	flank_bp=$(bedtools intersect -a ../bed/${core}.updown500.bed -b ../bed/${core}.updown500.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 52 | 	repeat_bp=$(bedtools intersect -a ../bed/${core}.updown500.bed -b ../../RepeatModeller/${species^}.repeatmodeller.sort.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 53 | 	printf "%s\t%d\t%d\n" $species $flank_bp $repeat_bp
 54 | done < list.cores.sp > log.updown500
 55 | 
 56 | # -> gene_exon_updown_plot
 57 | 
 58 |  
 59 | ## check #copies, length of repeats and overlap among methods
 60 | 
 61 | # repeat length stats: species, number of repeats (scan's "Read N items"), median length (integer part)
 62 | while read -r col1 col2; do
 63 | 	printf "%s" "$col2"	# species name passed as data, never as the format string
 64 | 	perl -lane 'print $F[2]-$F[1]' ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed | Rscript -e 'median(scan(file="stdin"))' 2>&1 | perl -ne 'if(/Read (\d+) items/){ print "\t$1" } elsif(/\[1\] (\d+)/){ print "\t$1\n" }'
 65 | 
 66 | done < list.cores.sp > log.repeat.length
 67 | 
 68 | # repeat length stats (N50): length at which the longest-first cumulative sum passes half of the total
 69 | while read -r col1 col2; do
 70 | 	printf "%s" "$col2"	# species name passed as data, never as the format string
 71 | 	perl -ane '$l=$F[2]-$F[1]; $TL+=$l; $R{$.}=$l; END{ foreach $s (sort {$R{$b}<=>$R{$a}} keys(%R)){ $t+=$R{$s}; if($t>$TL/2){ print "\t$R{$s}\n"; exit }}}' ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed
 72 | 
 73 | done < list.cores.sp > log.repeat.N50
 74 | 
 75 | # check Pfam enrichment of gene overlap: pool column 1 of every per-species enrich.tsv,
 76 | # count how often each id occurs and write "id<TAB>count", most frequent first
 77 | cut -f 1 pfam/*repeatmodeller*enrich.tsv | sort | uniq -c | sort -nr | perl -lane 'if(/PF/){ print "$F[1]\t$F[0]" }' > pfam/enrich.repeatmodeller.tsv
 78 | 
 79 | # Red repeat overlap vs others
 80 | while read -r col1 col2; do
 81 | 	# log columns: species red rm redat nrplants dust trf totdust tottrf (species is printed once, by the final printf)
 82 | 	red=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.Red.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 83 | 	rm=$(bedtools intersect -b  ../bed/${col2}.Red.bed -a ../../RepeatModeller/${col2^}.repeatmodeller.sort.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 84 | 	nrplants=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.repeatmask_nrplants.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 85 | 	redat=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.repeatmask_redat.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 86 | 	dust=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.dust.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 87 | 	trf=$(bedtools intersect -a ../bed/${col2}.Red.bed -b ../bed/${col2}.trf.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 88 | 	totdust=$(bedtools intersect -a ../bed/${col2}.dust.bed -b ../bed/${col2}.dust.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 89 | 	tottrf=$(bedtools intersect -a ../bed/${col2}.trf.bed -b ../bed/${col2}.trf.bed -sorted -wo | perl -lane '$over+=$F[6]; END{print $over}')
 90 | 
 91 | 	printf "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" $col2 $red $rm $redat $nrplants $dust $trf $totdust $tottrf
 92 | 
 93 | done < list.cores.sp > log.repeat.overlap
 94 | 
 95 | 
 96 | ## check overlap with denovo called Rgenes (NLR-annotator)
 97 | # log.Rgenes columns: species, #lines in NLR BED, summed NLR self-overlap, summed NLR/repeat overlap
 98 | while read -r core species; do
 99 | 	nlr=../denovoRgenes/${species}.nlr.bed.sorted
100 | 	self_bp=$(bedtools intersect -a $nlr -b $nlr -sorted -wo | perl -lane '$over+=$F[12]; END{print $over}')
101 | 	repeat_bp=$(bedtools intersect -a $nlr -b ../../RepeatModeller/${species^}.repeatmodeller.sort.bed -sorted -wo | perl -lane '$over+=$F[9]; END{print $over}')
102 | 	read n_loci _ <<< $(wc -l $nlr)
103 | 	printf "%s\t%d\t%d\t%d\n" $species $n_loci $self_bp $repeat_bp
104 | done < list.cores.sp > log.Rgenes
105 | # same table, but repeat overlaps of MINOVER bp or less are not added to the last column
106 | while read -r core species; do
107 | 	nlr=../denovoRgenes/${species}.nlr.bed.sorted
108 | 	self_bp=$(bedtools intersect -a $nlr -b $nlr -sorted -wo | perl -lane '$over+=$F[12]; END{print $over}')
109 | 	repeat_bp=$(bedtools intersect -a $nlr -b ../../RepeatModeller/${species^}.repeatmodeller.sort.bed -sorted -wo | perl -lane 'next if($F[9] <= $ENV{"MINOVER"}); $over+=$F[9]; END{print $over}')
110 | 	read n_loci _ <<< $(wc -l $nlr)
111 | 	printf "%s\t%d\t%d\t%d\n" $species $n_loci $self_bp $repeat_bp
112 | done < list.cores.sp > log.Rgenes.$MINOVER
113 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/list.cores.sp:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii_core_48_101_3	aegilops_tauschii
 2 | arabidopsis_thaliana_core_48_101_11	arabidopsis_thaliana
 3 | brachypodium_distachyon_core_48_101_4	brachypodium_distachyon
 4 | brassica_rapa_core_48_101_1	brassica_rapa
 5 | camelina_sativa_core_48_101_1	camelina_sativa
 6 | cucumis_melo_core_48_101_1	cucumis_melo
 7 | prunus_dulcis_core_48_101_1	prunus_dulcis
 8 | rosa_chinensis_core_48_101_1	rosa_chinensis
 9 | vitis_vinifera_core_48_101_3	vitis_vinifera
10 | zea_mays_core_48_101_7	zea_mays
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.Rgenes:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	916	3144217	425022
 2 | arabidopsis_thaliana	69	201660	66316
 3 | brachypodium_distachyon	344	1249602	266009
 4 | brassica_rapa	219	729119	431797
 5 | camelina_sativa	573	1808571	906291
 6 | cucumis_melo	89	294239	107550
 7 | prunus_dulcis	387	1354547	700937
 8 | rosa_chinensis	963	3347977	2294566
 9 | vitis_vinifera	739	2723777	1902623
10 | zea_mays	158	487237	76561
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.Rgenes.50:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	916	3144217	416193
 2 | arabidopsis_thaliana	69	201660	65934
 3 | brachypodium_distachyon	344	1249602	262733
 4 | brassica_rapa	219	729119	430033
 5 | camelina_sativa	573	1808571	901520
 6 | cucumis_melo	89	294239	106155
 7 | prunus_dulcis	387	1354547	695038
 8 | rosa_chinensis	963	3347977	2284956
 9 | vitis_vinifera	739	2723777	1893319
10 | zea_mays	158	487237	75567
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.exons:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	167903	68899123	4177668
 2 | arabidopsis_thaliana	145966	46554976	2523410
 3 | brachypodium_distachyon	152258	68995392	8240434
 4 | brassica_rapa	206588	48263240	4524180
 5 | camelina_sativa	481073	132504580	17860756
 6 | cucumis_melo	137797	41060653	4349419
 7 | prunus_dulcis	134744	47752477	6017017
 8 | rosa_chinensis	174074	74446167	15845592
 9 | vitis_vinifera	147613	39848131	5441784
10 | zea_mays	206593	63390920	5075327
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.genes:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	39614	348459650	148594336
 2 | arabidopsis_thaliana	27628	67504196	3271128
 3 | brachypodium_distachyon	34310	122507803	15431148
 4 | brassica_rapa	41018	83240510	11035129
 5 | camelina_sativa	89275	215022170	27495248
 6 | cucumis_melo	28299	99772108	21734761
 7 | prunus_dulcis	27963	94240163	14257008
 8 | rosa_chinensis	45464	117342283	22983889
 9 | vitis_vinifera	29927	153564951	50510060
10 | zea_mays	39583	168056455	42598368
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.repeat.N50:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	7894
 2 | arabidopsis_thaliana	1178
 3 | brachypodium_distachyon	2125
 4 | brassica_rapa	628
 5 | camelina_sativa	1105
 6 | cucumis_melo	1819
 7 | prunus_dulcis	1422
 8 | rosa_chinensis	1958
 9 | vitis_vinifera	1604
10 | zea_mays	11380
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.repeat.length:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	1758407	475
 2 | arabidopsis_thaliana	72138	57
 3 | brachypodium_distachyon	222710	135
 4 | brassica_rapa	303119	147
 5 | camelina_sativa	611700	140
 6 | cucumis_melo	407579	136
 7 | prunus_dulcis	243499	108
 8 | rosa_chinensis	499475	188
 9 | vitis_vinifera	496352	175
10 | zea_mays	853432	270
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.repeat.overlap:
--------------------------------------------------------------------------------
 1 | aegilops_tauschiiaegilops_tauschii	3454963624	3343466712	2833915889	2871180926	76588631	161578261	201668761	183619439
 2 | arabidopsis_thalianaarabidopsis_thaliana	43943711	14661371	11477631	20623470	5229908	2896188	6146057	3361091
 3 | brachypodium_distachyonbrachypodium_distachyon	84292420	74811547	63878923	63001822	2649893	3750379	6329521	6241890
 4 | brassica_rapabrassica_rapa	93148976	63192385	12137250	53471309	18246330	8884888	33405301	10922095
 5 | camelina_sativacamelina_sativa	230739085	165779082	72221094	100798661	20302963	16816272	83987132	23257791
 6 | cucumis_melocucumis_melo	142860741	134727620	21541420	122388576	16345201	15266153	33077764	19515094
 7 | prunus_dulcisprunus_dulcis	75942705	66309477	9724644	62228480	6821347	7159674	16557179	9697037
 8 | rosa_chinensisrosa_chinensis	247911427	223060560	35147667	89040006	11590565	14996352	22275228	21000797
 9 | vitis_viniferavitis_vinifera	194397642	175266445	30845098	160405750	28366556	19834542	63337140	26244040
10 | zea_mayszea_mays	1687073942	1622632631	1238165526	1300760021	32339911	68910571	55601139	77759381
11 | 


--------------------------------------------------------------------------------
/repeats/bench/repeatmodeller/log.updown500:
--------------------------------------------------------------------------------
 1 | aegilops_tauschii	37802544	10572939
 2 | arabidopsis_thaliana	25181016	1793529
 3 | brachypodium_distachyon	33409828	6752027
 4 | brassica_rapa	45594270	6041729
 5 | camelina_sativa	92778852	13652759
 6 | cucumis_melo	29465346	9226572
 7 | prunus_dulcis	28912163	6906338
 8 | rosa_chinensis	44161039	13176712
 9 | vitis_vinifera	30402773	7477692
10 | zea_mays	39795578	10459637
11 | 


--------------------------------------------------------------------------------
/repeats/get_repeats_ensembl.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # Script to download annotated repeated elements 
  4 | # of a selected species in Ensembl Plants
  5 | 
  6 | # Required binaries: wget, sort, perl, mysql, bedtools
  7 | 
  8 | # Copyright [2020-21] EMBL-European Bioinformatics Institute
  9 | 
 10 | # documentation about Ensembl schemas can be found at 
 11 | # http://www.ensembl.org/info/docs/api/index.html
 12 | 
 13 | if [[ $# -eq 0 ]] ; then
 14 | 	echo "# example usage: $0 arabidopsis_thaliana"
 15 | 	exit 0
 16 | else
 17 | 	SPECIES=$1
 18 | fi
 19 | 
 20 | # PARAMS
 21 | MINLEN=90
 22 | MAXDEGENPERC=10
 23 | MAXIDFRAC=0.95
 24 | DEBUG=0
 25 | 
 26 | # SERVER DETAILS
 27 | FTPSERVER="ftp://ftp.ensemblgenomes.org/pub"
 28 | DIV=plants
 29 | SERVER=mysql-eg-publicsql.ebi.ac.uk
 30 | USER=anonymous
 31 | PORT=4157
 32 | 
 33 | ## 1) get Ensembl Plants current release number from FTP server
 34 | # Note: wget is used, this can be modified to use alternatives ie curl
 35 | SUMFILE="${FTPSERVER}/${DIV}/current/summary.txt"
 36 | RELEASE=$(wget --quiet -O - "$SUMFILE" | \
 37 | 	perl -lne 'if(/Release (\d+) of Ensembl/){ print $1 }')
 38 | if [ -z "$RELEASE" ]; then echo "# ERROR: cannot get release number from $SUMFILE"; exit 1; fi
 39 | # work out Ensembl Genomes release (fixed offset of 53 from the Ensembl release)
 40 | EGRELEASE=$(( RELEASE - 53));
 41 | 
 42 | ## 2) select core db matching selected species (name pattern: <species>_core_<EG release>_<release>)
 43 | SPECIESCORE=$(mysql --host $SERVER --user $USER --port $PORT \
 44 | 	-e "show databases" | grep "${SPECIES}_core_${EGRELEASE}_${RELEASE}")
 45 | 
 46 | if [ -z "$SPECIESCORE" ]; then
 47 | 	echo "# ERROR: cannot find species $SPECIES"
 48 | 	exit 1
 49 | else
 50 | 	echo "# Ensembl core db: $SPECIESCORE";
 51 | fi
 52 | 
 53 | ## 3) retrieve 1-based coords of repeats
 54 | # (classes Unspecified, repeatdetector and tallymer, and repeats not longer than MINLEN, are left out)
 55 | # note these might be redundant/overlapping
 56 | #1       3       106     trf
 57 | #1       4       91      trf
 58 | # output columns: seq_region name, start, end, repeat_class; unique rows sorted by name, then start
 59 | mysql --host $SERVER --user $USER --port $PORT $SPECIESCORE -Nb -e \
 60 | 	"SELECT sr.name,r.seq_region_start,r.seq_region_end,rc.repeat_class \
 61 | 	FROM repeat_feature r JOIN seq_region sr JOIN repeat_consensus rc \
 62 | 	WHERE r.seq_region_id=sr.seq_region_id \
 63 | 	AND r.repeat_consensus_id=rc.repeat_consensus_id \
 64 | 	AND (rc.repeat_class <> 'Unspecified' AND rc.repeat_class <> \
 65 | 		'repeatdetector' AND rc.repeat_class <> 'tallymer') \
 66 | 	AND (r.seq_region_end-r.seq_region_start+1) > $MINLEN" | \
 67 | 	sort -u -k1,1 -k2,2n > _${SPECIES}.repeats1.bed
 68 | 
 69 | ## 4) retrieve 1-based coords of genes (seq_region name, start, end, stable_id)
 70 | mysql --host $SERVER --user $USER --port $PORT $SPECIESCORE -Nb -e \
 71 | 	"SELECT sr.name,g.seq_region_start,g.seq_region_end,g.stable_id \
 72 | 	FROM gene g JOIN seq_region sr \
 73 | 	WHERE g.seq_region_id=sr.seq_region_id" | \
 74 | 	sort -k1,1 -k2,2n > _${SPECIES}.genes1.bed
 75 | 
 76 | ## 5) curate repeats by subtracting annotated genes and
 77 | ##    convert to 0-based BED format (the start column is decremented by 1)
 78 | bedtools subtract -sorted \
 79 | 	-a _${SPECIES}.repeats1.bed -b _${SPECIES}.genes1.bed | \
 80 | 	perl -lane '$F[1]-=1; print join("\t",@F)' >\
 81 | 	_${SPECIES}.repeats.bed
 82 | # stop with exit code 2 when nothing is left after the subtraction
 83 | if [ ! -s  _${SPECIES}.repeats.bed ]; then
 84 | 	echo "# no repeats found"
 85 | 	exit 2
 86 | fi
 87 | 
 88 | ## 6) download and uncompress genomic sequence (skipped when a non-empty local copy exists)
 89 | FASTA="*${SPECIES^}*.dna.toplevel.fa.gz"
 90 | URL="${FTPSERVER}/${DIV}/current/fasta/${SPECIES}/dna/${FASTA}"
 91 | if [ ! -s  _${SPECIES}.toplevel.fasta ]; then
 92 | 	echo "# downloading $URL"
 93 | 	wget -c $URL -O- | gunzip > _${SPECIES}.toplevel.fasta
 94 | else
 95 | 	echo "# re-using _${SPECIES}.toplevel.fasta"
 96 | fi
 97 | if [ ! -s _${SPECIES}.toplevel.fasta ]; then echo "# ERROR: cannot download $URL"; exit 3; fi
 98 | ## 7) extract repeat sequences; -name labels each record with the 4th BED column (repeat_class)
 99 | bedtools getfasta -name -fi _${SPECIES}.toplevel.fasta -bed _${SPECIES}.repeats.bed >\
100 | 	_${SPECIES}.repeats.fasta
101 | 
102 | ## 8) eliminate degenerate repeat sequences: keep those with at most MAXDEGENPERC % of N/n
103 | cat _${SPECIES}.repeats.fasta | \
104 | 	perl -slne 'if(/^(>.*)/){$h=$1} else {$fa{$h}.=$_} END{ foreach $h (keys(%fa)){ $l=length($fa{$h}); $dg=($fa{$h}=~tr/Nn//); print "$h\n$fa{$h}" if(100*$dg/$l<=$maxdeg) }}' \
105 | 	-- -maxdeg=$MAXDEGENPERC > ${SPECIES}.repeats.nondeg.fasta
106 | # note: sequences are keyed by header, so records sharing a header are concatenated
107 | ## 9) clean temp files (all _${SPECIES}.* intermediates) unless DEBUG is set to a non-zero value
108 | if [ -z "$DEBUG" ] || [ "$DEBUG" -eq "0" ]; then
109 | 	echo
110 | 	echo "# removing temp files"; 
111 | 	rm _${SPECIES}.*.bed _${SPECIES}.*.fasta _${SPECIES}.*.fai
112 | fi
113 | 
114 | exit 0
115 | 


--------------------------------------------------------------------------------