├── scripts ├── Merge │ ├── README-vega-check.txt │ ├── merge-refseq.conf │ └── merge.conf ├── protein │ ├── THESE_SCRIPTS_ARE_USED_TO_RUN_THE_PROTEIN_ANNOTATION_PIPELINE │ └── chunk_protein_file.pl ├── assembly_patches │ ├── remove_patch_karyotype.sql │ └── remove_patch_raw_compute.sql ├── imgt │ └── kill_list.txt ├── cdna_update │ └── find_N.pl ├── genebuild │ ├── parse_embl_cds2uniprotkb.pl │ ├── sncrna │ │ ├── filter_cm.pl │ │ ├── repeats_dump.pl │ │ └── dump_prefilter_features.pl │ ├── find_seq_in_fasta.pl │ ├── convert_genome_dump.pl │ └── slice_coding_gene_cnt.pl ├── databases │ └── process_uniprot_isoforms.pl ├── chunk_fasta_file.pl ├── markers │ ├── map_weight.pl │ └── marker_match.pl └── delete_big_dir.pl ├── requirements.txt ├── modules ├── t │ ├── test-genome-DBs │ │ └── pararge_aegeria │ │ │ └── core │ │ │ ├── seq_region_attrib.txt │ │ │ ├── meta_coord.txt │ │ │ ├── seq_region_synonym.txt │ │ │ ├── coord_system.txt │ │ │ ├── analysis.txt │ │ │ ├── seq_region.txt │ │ │ ├── assembly.txt │ │ │ ├── external_db.txt │ │ │ ├── attrib_type.txt │ │ │ ├── analysis_description.txt │ │ │ └── meta.txt │ ├── MultiTestDB.conf.default │ ├── repeatcoverage.t │ ├── hiveassemblycomponents_rb.t │ ├── hiveloadgenomesequences_rb.t │ ├── hiveprocessassemblyreport_rb.t │ ├── prepare_local_tests.sh │ ├── hivecreatedirectories_rb.t │ ├── hiverepeatcoverage_rb.t │ └── filter_t.t └── Bio │ └── EnsEMBL │ └── Analysis │ ├── RunnableDB │ ├── Bam2Genes.pm │ ├── Bam2Introns.pm │ ├── Solexa2Genes.pm │ ├── BlastRNASeqPep.pm │ ├── ExonerateSolexa.pm │ ├── RefineSolexaGenes.pm │ ├── Solexa2GenesLiteNew.pm │ ├── ExonerateSolexaTranscript.pm │ ├── ExonerateSolexaLocalAlignment.pm │ ├── ProteinAnnotation │ │ ├── PrositePattern.pm.retired │ │ ├── Hamap_wormbase.pm │ │ ├── PrositeProfile.pm │ │ ├── PrositeProfile_wormbase.pm │ │ ├── Prints.pm │ │ ├── Prints_wormbase.pm │ │ ├── PrositePattern.pm │ │ ├── PrositePattern_wormbase.pm │ │ ├── Coil.pm │ │ ├── Signalp.pm │ │ ├── Hmmpfam.pm │ │ ├── 
PIRSF.pm │ │ ├── Superfamily.pm │ │ ├── IPRScan.pm │ │ ├── Pfam_wormbase.pm │ │ ├── Tmhmm.pm │ │ ├── Superfamily_wormbase.pm │ │ ├── Seg.pm │ │ └── Panther.pm │ ├── Finished │ │ ├── EPCR.pm │ │ └── RepeatMasker.pm │ ├── Accumulator.pm │ ├── Snap.pm │ ├── Fgenesh.pm │ └── Funcgen │ │ ├── ACME.pm │ │ └── Chipotle.pm │ ├── Config │ ├── GeneBuild │ │ ├── Bam2Genes.pm.example │ │ ├── Solexa2Genes.pm.example │ │ ├── Solexa2GenesLiteNew.pm │ │ ├── RefineSolexaGenes.pm.example │ │ ├── BlastRNASeqPep.pm.example │ │ ├── OrthologueEvaluatorExonerate.pm.example │ │ ├── Sam2Bam.pm.example │ │ ├── BuildChecks.pm.example │ │ ├── IgSegBuilder.pm.example │ │ ├── ProjectedTranscriptEvidence.pm.example │ │ ├── ExonerateSolexaLocalAlignment.pm.example │ │ └── Gsnap.pm.example │ ├── AddStableIds.pm.example │ ├── S3Config.pm.example │ ├── CloneEndsLinking.pm.example │ └── CollapseAffyProbes.pm.example │ ├── Hive │ ├── Config │ │ ├── genome_annotation.ini │ │ ├── sample_genes_registry_conf.pl │ │ └── BamMergeStatic.pm │ └── RunnableDB │ │ ├── HiveRunExternalCmd.pm │ │ ├── HiveCreateFastqDownloadJobs.pm │ │ ├── HiveStoreUnmappedcDNAs.pm │ │ ├── HiveLoadProteins.pm │ │ ├── HiveSequencesToFiles.pm │ │ ├── HiveLoadmRNAs.pm │ │ ├── HiveDBSeqFiles.pm │ │ ├── HiveLoadcDNAs.pm │ │ ├── HivecDNAManyHits.pm │ │ └── HiveIndexGenome.pm │ ├── Tools │ ├── BlastDBTracking │ │ └── Entry.pm │ ├── IMGT │ │ └── Seq │ │ │ └── RichSeqIMGT.pm │ ├── SoftwareConfigLoad.pm │ ├── Stashes.pm │ ├── GenomeOverlapFilter.pm │ ├── PacBioTranscriptFilter.pm │ ├── GeneBuildUtils │ │ └── HomologyUtils.pm │ ├── AllExonOverlapFilter.pm │ └── CodingExonOverlapFilter.pm │ └── Runnable │ ├── DustMasker.pm │ ├── ProteinAnnotation │ └── PrositeProfile.pm │ └── SamtoolsMerge.pm ├── cpanfile ├── travisci └── MultiTestDB.conf.mysql ├── requirements_p36_ncrna.txt ├── pull_request_template.md ├── .gitignore └── sql └── repeat_db_tables.sql /scripts/Merge/README-vega-check.txt: 
-------------------------------------------------------------------------------- 1 | TBC 2 | -------------------------------------------------------------------------------- /scripts/protein/THESE_SCRIPTS_ARE_USED_TO_RUN_THE_PROTEIN_ANNOTATION_PIPELINE: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | deeptools ~= 3.4.1 2 | gspread 3 | oauth2client 4 | google-auth 5 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/seq_region_attrib.txt: -------------------------------------------------------------------------------- 1 | 1 367 1 2 | 1 6 1 3 | -------------------------------------------------------------------------------- /cpanfile: -------------------------------------------------------------------------------- 1 | requires 'Bio::DB::HTS'; 2 | requires 'Proc::ProcessTable'; 3 | requires 'Bio::DB::EUtilities'; 4 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/meta_coord.txt: -------------------------------------------------------------------------------- 1 | gene 1 43270 2 | exon 1 3426 3 | transcript 1 6190 4 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/seq_region_synonym.txt: -------------------------------------------------------------------------------- 1 | 1 1 LR990895.1 50710 2 | 2 1 NC_053180.1 1830 3 | -------------------------------------------------------------------------------- /travisci/MultiTestDB.conf.mysql: -------------------------------------------------------------------------------- 1 | { 2 | 'port' => '3306', 3 | 'driver' => 'mysql', 4 | 'user' => 'root', 5 | 'host' => '127.0.0.1', 
6 | } 7 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Genes.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Genes.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Introns.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Introns.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2Genes.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2Genes.pm -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/coord_system.txt: -------------------------------------------------------------------------------- 1 | 1 1 primary_assembly ilParAegt1.1 1 default_version 2 | 2 1 contig \N 2 default_version,sequence_level 3 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/BlastRNASeqPep.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/BlastRNASeqPep.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexa.pm: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexa.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/RefineSolexaGenes.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/RefineSolexaGenes.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2GenesLiteNew.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2GenesLiteNew.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Bam2Genes.pm.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Bam2Genes.pm.example -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/analysis.txt: -------------------------------------------------------------------------------- 1 | 1 2021-05-08 10:07:46 ensembl \N \N \N \N \N \N \N \N \N \N \N 2 | 2 2021-05-08 10:08:27 ncrna \N \N \N \N \N \N \N \N \N \N \N 3 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2Genes.pm.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2Genes.pm.example -------------------------------------------------------------------------------- 
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2GenesLiteNew.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2GenesLiteNew.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaTranscript.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaTranscript.pm -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/RefineSolexaGenes.pm.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/RefineSolexaGenes.pm.example -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaLocalAlignment.pm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaLocalAlignment.pm -------------------------------------------------------------------------------- /requirements_p36_ncrna.txt: -------------------------------------------------------------------------------- 1 | # We can only use python <= 3.6 because it needs scipy 0.18.1 2 | # We need to reinstall numpy as pandas will update to the latest version possible 3 | pandas 4 | scipy==0.18.1 5 | scikit-learn==0.18.1 6 | numpy==1.17.5 7 | -------------------------------------------------------------------------------- /scripts/assembly_patches/remove_patch_karyotype.sql: 
-------------------------------------------------------------------------------- 1 | delete karyotype from attrib_type, seq_region_attrib, karyotype where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = karyotype.seq_region_id; 2 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/Config/genome_annotation.ini: -------------------------------------------------------------------------------- 1 | assembly_accessions=[] 2 | output_path= 3 | release_number= 4 | email_address= 5 | genebuilder_id= 6 | pipeline_name= 7 | user_r= 8 | user_w= 9 | password= 10 | pipe_db_host= 11 | databases_host= 12 | dna_db_host= 13 | pipe_db_port= 14 | databases_port= 15 | dna_db_port= 16 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/seq_region.txt: -------------------------------------------------------------------------------- 1 | 1 1 1 21295481 2 | 2 contig_1 2 42060 3 | 3 contig_2 2 43299 4 | 4 contig_3 2 13106 5 | 5 contig_4 2 20932 6 | 6 contig_5 2 10928 7 | 7 contig_6 2 17392 8 | 8 contig_7 2 7483 9 | 9 contig_8 2 26761 10 | 10 contig_9 2 149864 11 | 11 contig_10 2 16265 12 | 12 contig_11 2 22067 13 | 13 contig_12 2 18568 14 | 14 contig_13 2 72768 15 | -------------------------------------------------------------------------------- /modules/t/MultiTestDB.conf.default: -------------------------------------------------------------------------------- 1 | # This provides a default set of configurations which is merged with your 2 | # provided MultiTestDB.conf. 
3 | # 4 | # You should provide a MultiTestDB.conf which defines the databases 5 | 6 | { 7 | 'databases' => { 8 | 'homo_sapiens' => { 9 | 'core' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', 10 | }, 11 | 'pararge_aegeria' => { 12 | 'core' => 'Bio::EnsEMBL::DBSQL::DBAdaptor', 13 | }, 14 | }, 15 | } 16 | -------------------------------------------------------------------------------- /pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Requirements 2 | When creating your Pull request, please fill out the template below: 3 | 4 | # PR details 5 | _Is this a fix/ update/ new feature?_ 6 | 7 | _Include a short description_ 8 | 9 | _Include links to JIRA tickets_ 10 | 11 | # Testing 12 | _Have you tested it?_ 13 | 14 | # Assign to the weekly GitHub reviewer 15 | _If you are a member of Ensembl, please check the Genebuild weekly Rotas and assign this week's GitHub reviewer to the PR_ 16 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/assembly.txt: -------------------------------------------------------------------------------- 1 | 1 8 1190846 1198328 1 7483 1 2 | 1 9 1254816 1281576 1 26761 1 3 | 1 10 1551975 1701838 1 149864 1 4 | 1 11 2139403 2155667 1 16265 1 5 | 1 12 2914577 2936643 1 22067 1 6 | 1 2 3594096 3636155 1 42060 1 7 | 1 3 7512206 7555504 1 43299 1 8 | 1 4 7747315 7760420 1 13106 1 9 | 1 5 9996681 10017612 1 20932 1 10 | 1 6 13645738 13656665 1 10928 1 11 | 1 7 15616608 15633999 1 17392 1 12 | 1 13 13399324 13417891 1 18568 1 13 | 1 14 14122744 14195511 1 72768 1 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | blib/ 2 | .build/ 3 | _build/ 4 | cover_db/ 5 | inc/ 6 | Build 7 | !Build/ 8 | Build.bat 9 | .last_cover_stats 10 | MANIFEST.bak 11 | META.yml 12 | MYMETA.yml 13 | nytprof.out 14 | 
pm_to_blib 15 | .DS_Store 16 | Thumbs.db 17 | *.swp 18 | *.swo 19 | *~ 20 | \#*\# 21 | /.emacs.desktop 22 | /.emacs.desktop.lock 23 | .elc 24 | auto-save-list 25 | tramp 26 | .\#* 27 | # Org-mode 28 | .org-id-locations 29 | *_archive 30 | a.out 31 | *.o 32 | *.obj 33 | *.class 34 | modules/t/MultiTestDB.conf 35 | modules/Bio/EnsEMBL/Analysis/Config/General.pm 36 | -------------------------------------------------------------------------------- /scripts/imgt/kill_list.txt: -------------------------------------------------------------------------------- 1 | AF062232 human; heavy chain orphon (chr15) not annotated as such 2 | AF062120 human; heavy chain orphon (chr15) not annotated as such 3 | HSIGHZF human; heavy chain orphon (chr15) not annotated as such 4 | HSIGVH441 human; heavy chain orphon (chr15) not annotated as such 5 | HSIGHXX27 human; heavy chain orphon (chr15) not annotated as such 6 | HSIGV79 human; heavy chain orphon (chr15) not annotated as such 7 | HSIGLC16 human; light chain orphon (chr22, distal). Probable pseudo 8 | MM07554 mouse; C-REGION contains J-REGION 9 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/external_db.txt: -------------------------------------------------------------------------------- 1 | 1830 RefSeq_genomic \N KNOWN 193 RefSeq Genomic MISC \N \N This external_db_id can be used in the seq_region_synonym table. For species such as human, cow, dog we store chromosome names (1-22, X,Y) in the name column of the seq_region table. The RefSeq_genomic is stored as a synonym. eg. NC_000011.10, NT_187365.1 2 | 50710 INSDC \N KNOWNXREF 5 International Nucleotide Sequence Database Collaboration MISC INSDC \N This external_db_id was initially made for use in the seq_region_synonym table. For species such as human, cow, dog we store chromosome names (1-22, X,Y) in the name column of the seq_region table. The INSDC accession is stored as a synonym. 
3 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Tools/BlastDBTracking/Entry.pm: -------------------------------------------------------------------------------- 1 | package Bio::EnsEMBL::Analysis::Tools::BlastDBTracking::Entry; 2 | 3 | use warnings ; 4 | use strict ; 5 | use namespace::autoclean; 6 | use Moose; 7 | 8 | has filename => ( is => 'ro', isa => 'Str', required => 1 ); 9 | has version => ( is => 'ro', isa => 'Str', required => 1 ); 10 | has sanger_version => ( is => 'ro', isa => 'Int', ); 11 | has installation => ( is => 'ro', isa => 'Int', ); 12 | has count => ( is => 'ro', isa => 'Int', ); 13 | has checksum => ( is => 'ro', isa => 'Str', ); 14 | has from_file => ( is => 'ro', isa => 'Bool', ); 15 | 16 | 1; 17 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/attrib_type.txt: -------------------------------------------------------------------------------- 1 | 6 toplevel Top Level Top Level Non-Redundant Sequence Region 2 | 367 karyotype_rank Rank in the karyotype For a given seq_region, if it is part of the species karyotype, will indicate its rank 3 | 554 is_canonical Ensembl Canonical This transcript is the chosen canonical for its gene. For protein-coding genes, this is the MANE_Select transcript if there is one. If not, the canonical transcript is chosen by a pipeline that takes into account several criteria including transcript support (TSL), functional importance (APPRIS), representation in RefSeq and UniProt databases, length and coverage of pathogenic variants, where available. For non protein-coding genes, it is usually the longest transcript with the same biotype as its parent gene. 
4 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveRunExternalCmd.pm: -------------------------------------------------------------------------------- 1 | package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveRunExternalCmd; 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use base ('Bio::EnsEMBL::Hive::Process'); 7 | use Bio::EnsEMBL::Compara::Utils::RunCommand; 8 | 9 | sub param_defaults { 10 | return { 11 | 'cmd' => undef, # command to run 12 | }; 13 | } 14 | 15 | sub run { 16 | my $self = shift; 17 | 18 | my $cmd = $self->param_required('cmd'); 19 | 20 | my $rc = Bio::EnsEMBL::Compara::Utils::RunCommand 21 | ->new_and_exec($cmd, { die_on_failure => 1 }); 22 | 23 | # Save stdout into a hive param so it can be used in flow_into 24 | my $stdout = $rc->out; 25 | chomp $stdout; 26 | $self->param('stdout', $stdout); 27 | $self->param('stderr', $rc->err); 28 | } 29 | 30 | sub write_output { 31 | my $self = shift; 32 | 33 | # Flow stdout as #stdout# 34 | $self->dataflow_output_id({ stdout => $self->param('stdout') }, 1); 35 | } 36 | 37 | 1; 38 | -------------------------------------------------------------------------------- /modules/t/repeatcoverage.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | use strict; 17 | use warnings; 18 | 19 | use Test::More; 20 | 21 | use_ok('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveRepeatCoverage'); 22 | 23 | TODO: { 24 | local $TODO = 'Proper tests needed'; 25 | note($TODO); 26 | } 27 | 28 | done_testing(); 29 | -------------------------------------------------------------------------------- /modules/t/hiveassemblycomponents_rb.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | use strict; 17 | use warnings; 18 | 19 | use Test::More; 20 | 21 | use_ok('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveLoadAssemblyComponents'); 22 | 23 | TODO: { 24 | local $TODO = 'Proper tests needed'; 25 | note($TODO); 26 | } 27 | 28 | done_testing(); 29 | -------------------------------------------------------------------------------- /modules/t/hiveloadgenomesequences_rb.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | use strict; 17 | use warnings; 18 | 19 | use Test::More; 20 | 21 | use_ok('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveLoadGenomeSequences'); 22 | 23 | TODO: { 24 | local $TODO = 'Proper tests needed'; 25 | note($TODO); 26 | } 27 | 28 | done_testing(); 29 | -------------------------------------------------------------------------------- /modules/t/hiveprocessassemblyreport_rb.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | use strict; 17 | use warnings; 18 | 19 | use Test::More; 20 | 21 | use_ok('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveProcessAssemblyReport'); 22 | 23 | TODO: { 24 | local $TODO = 'Proper tests needed'; 25 | note($TODO); 26 | } 27 | 28 | done_testing(); 29 | -------------------------------------------------------------------------------- /modules/t/test-genome-DBs/pararge_aegeria/core/analysis_description.txt: -------------------------------------------------------------------------------- 1 | 1 Annotation produced by the Ensembl genebuild. 
Ensembl 1 {"caption": "Genes (Ensembl)", "colour_key": "[biotype]", "default": {"MultiBottom": "collapsed_label", "MultiTop": "gene_label", "alignsliceviewbottom": "as_collapsed_label", "contigviewbottom": "transcript_label", "contigviewtop": "gene_label", "cytoview": "gene_label"}, "key": "ensembl", "label_key": "[biotype]", "multi_name": "Ensembl genes"} 2 | 2 Non-coding RNAs (ncRNAs) predicted using sequences from RFAM and miRBase. See article. ncRNAs 1 {"caption": "Genes (Ensembl)", "colour_key": "[biotype]", "default": {"MultiBottom": "collapsed_label", "MultiTop": "gene_label", "alignsliceviewbottom": "as_collapsed_label", "contigviewbottom": "transcript_label", "contigviewtop": "gene_label", "cytoview": "gene_label"}, "key": "ensembl", "label_key": "[biotype]", "multi_name": "Ensembl genes"} 3 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositePattern.pm.retired: -------------------------------------------------------------------------------- 1 | 2 | =pod 3 | 4 | =head1 NAME 5 | 6 | Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern 7 | 8 | =head1 SYNOPSIS 9 | 10 | my $tmhmm = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern-> 11 | new ( 12 | -db => $db, 13 | -input_id => $input_id, 14 | -analysis => $analysis 15 | ); 16 | $tmhmm->fetch_input; # gets sequence from DB 17 | $tmhmm->run; 18 | $tmhmm->write_output; # writes features to DB 19 | 20 | =head1 DESCRIPTION 21 | 22 | =cut 23 | 24 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern; 25 | 26 | use strict; 27 | use vars qw(@ISA); 28 | 29 | 30 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern; 31 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 32 | 33 | 34 | @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation); 35 | 36 | 37 | sub fetch_input { 38 | my ($self, @args) = @_; 39 | 40 | 
$self->SUPER::fetch_input(@args); 41 | 42 | my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern-> 43 | new(-query => $self->query, 44 | -analysis => $self->analysis, 45 | %{$self->parameters_hash} 46 | ); 47 | $self->runnable($run); 48 | } 49 | 50 | 1; 51 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Tools/IMGT/Seq/RichSeqIMGT.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | package Bio::EnsEMBL::Analysis::Tools::IMGT::Seq::RichSeqIMGT; 17 | use warnings ; 18 | use strict; 19 | 20 | use base qw(Bio::Seq::RichSeq); 21 | 22 | 23 | sub new { 24 | # standard new call.. 
25 | my($caller,@args) = @_; 26 | my $self = $caller->SUPER::new(@args); 27 | 28 | my ($data_class) = $self->_rearrange([qw(DATA_CLASS 29 | )], 30 | @args); 31 | 32 | defined $data_class and $self->data_class($data_class); 33 | 34 | return $self; 35 | } 36 | 37 | 38 | sub data_class { 39 | my $obj = shift; 40 | if( @_ ) { 41 | my $value = shift; 42 | $obj->{'_data_class'} = $value; 43 | } 44 | return $obj->{'_data_class'}; 45 | 46 | } 47 | 48 | 49 | 1; 50 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Hamap_wormbase.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Hamap_wormbase; 18 | use warnings ; 19 | use vars qw(@ISA); 20 | 21 | use strict; 22 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 23 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Hamap_wormbase; 24 | 25 | @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation); 26 | 27 | 28 | sub fetch_input { 29 | my ($self, @args) = @_; 30 | 31 | $self->SUPER::fetch_input(@args); 32 | 33 | my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Hamap_wormbase-> 34 | new(-query => $self->query, 35 | -analysis => $self->analysis); 36 | $self->runnable($run); 37 | } 38 | 39 | 40 | 1; 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /sql/repeat_db_tables.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE assembly ( 3 | 4 | assembly_id INT(10) UNSIGNED NOT NULL AUTO_INCREMENT, 5 | gca VARCHAR(14) NOT NULL, 6 | species_id INT(10) NOT NULL, 7 | 8 | PRIMARY KEY (assembly_id) 9 | 10 | ) COLLATE=latin1_swedish_ci ENGINE=MyISAM; 11 | 12 | 13 | CREATE TABLE species ( 14 | 15 | species_id INT(10) NOT NULL AUTO_INCREMENT, 16 | taxon_id INT(10) UNSIGNED NOT NULL, 17 | common_name VARCHAR(40) NOT NULL, 18 | group_name VARCHAR(40) NOT NULL, 19 | 20 | PRIMARY KEY (species_id) 21 | 22 | ) COLLATE=latin1_swedish_ci ENGINE=MyISAM; 23 | 24 | 25 | CREATE TABLE repeat_sequence ( 26 | 27 | repeat_sequence_id INT(10) NOT NULL AUTO_INCREMENT, 28 | repeat_class_id INT(10) NOT NULL, 29 | species_id INT(10) UNSIGNED NOT NULL, 30 | assembly_id INT(10) UNSIGNED NOT NULL, 31 | 32 | PRIMARY KEY (repeat_sequence_id) 33 | 34 | ) COLLATE=latin1_swedish_ci ENGINE=MyISAM; 35 | 36 | 37 | CREATE TABLE repeat_class ( 38 | 39 | repeat_class_id INT(10) NOT NULL AUTO_INCREMENT, 40 | repeat_name VARCHAR(255) NOT NULL, 41 | repeat_class VARCHAR(100) NOT NULL, 42 | repeat_type 
VARCHAR(40) NOT NULL, 43 | repeat_sequence LONGTEXT NOT NULL, 44 | 45 | PRIMARY KEY (repeat_class_id), 46 | KEY name (repeat_name), 47 | KEY class (repeat_class), 48 | KEY type (repeat_type) 49 | 50 | ) COLLATE=latin1_swedish_ci ENGINE=MyISAM; 51 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositeProfile.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | =head1 NAME 17 | 18 | =head1 SYNOPSIS 19 | 20 | =head1 DESCRIPTION 21 | 22 | =cut 23 | 24 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositeProfile; 25 | use warnings ; 26 | use vars qw(@ISA); 27 | 28 | use strict; 29 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 30 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile; 31 | 32 | @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation); 33 | 34 | 35 | sub fetch_input { 36 | my ($self, @args) = @_; 37 | 38 | $self->SUPER::fetch_input(@args); 39 | 40 | my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile-> 41 | new(-query => $self->query, 42 | -analysis => $self->analysis); 43 | $self->runnable($run); 44 | } 45 | 46 | 47 | 1; 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveCreateFastqDownloadJobs.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 
18 | package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveCreateFastqDownloadJobs;
19 | 
20 | use strict;
21 | use warnings;
22 | use feature 'say';
23 | 
24 | use base ('Bio::EnsEMBL::Hive::RunnableDB::JobFactory');
25 | 
26 | =head2 write_output
27 | 
28 |  Arg [1]    : None
29 |  Description: Reads the tab-separated file named by the 'inputfile' parameter and
30 |               flows one {iid => value} output id per non-empty fourth column on
31 |               the branch given by the 'fan_branch_code' parameter
32 |  Returntype : None
33 |  Exceptions : Dies if 'inputfile' cannot be opened or closed
34 | 
35 | =cut
36 | 
37 | 
38 | sub write_output {
39 |   my $self = shift;
40 |   my $inputfile = $self->param('inputfile');
41 | 
42 |   # The file is parsed here instead of shelling out to cut(1): the former
43 |   # "cut -d$'\t'" relied on bash-only quoting, passed the unquoted file name
44 |   # through the shell and hid any failure of the command.
45 |   open(my $fh, '<', $inputfile) or die "Could not open $inputfile: $!";
46 |   my @output_ids;
47 |   while (my $line = <$fh>) {
48 |     chomp $line;
49 |     my @fields = split(/\t/, $line, -1);
50 |     # Same selection as "cut -f4": a line without any tab is kept whole,
51 |     # a line with fewer than four columns contributes nothing.
52 |     my $fastq = @fields > 1 ? $fields[3] : $line;
53 |     if (defined $fastq and $fastq ne ""){
54 |       push(@output_ids, {iid => $fastq})
55 |     }
56 |   }
57 |   close($fh) or die "Could not close $inputfile: $!";
58 |   $self->dataflow_output_id(\@output_ids, $self->param('fan_branch_code'));
59 | }
60 | 
61 | 1;
62 | 
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositeProfile_wormbase.pm:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #      http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | =head1 NAME
17 | 
18 | =head1 SYNOPSIS
19 | 
20 | =head1 DESCRIPTION
21 | 
22 | =cut
23 | 
24 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositeProfile_wormbase;
25 | use warnings ;
26 | use vars qw(@ISA);
27 | 
28 | use strict;
29 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
30 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile_wormbase;
31 | 
32 | @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);
33 | 
34 | 
35 | sub fetch_input {
36 |   my ($self, @args) = @_;
37 | 
38 |   $self->SUPER::fetch_input(@args);
39 | 
40 |   my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile_wormbase->
41 |     new(-query => $self->query,
42 |         -analysis => $self->analysis);
43 |   $self->runnable($run);
44 | }
45 | 
46 | 
47 | 1;
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
--------------------------------------------------------------------------------
/modules/t/prepare_local_tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # Print a reminder with a template when MultiTestDB.conf is missing next to this script.
17 | CURRENT_DIR=`dirname $0`
18 | if [ ! -e "$CURRENT_DIR/MultiTestDB.conf" ];then
19 |   echo "You should create a file $CURRENT_DIR/MultiTestDB.conf containing connection details to a database:"
20 |   cat <<EOF
21 | {
22 |   port => 3306,
23 |   user => EHIVE_USER,
24 |   pass => EHIVE_PASS,
25 |   host => HOST,
26 |   driver => 'mysql',
27 | }
28 | EOF
29 |   echo
30 | fi
31 | # Find the source directory: ../ensembl first, otherwise the last PERL5LIB entry containing "ensembl/modules".
32 | BASEDIR="modules/t/test-genome-DBs/homo_sapiens/core"
33 | ENSEMBLDIR="../ensembl/$BASEDIR"
34 | if [ ! -e "$ENSEMBLDIR" ];then
35 |   if [ -n "$PERL5LIB" ];then
36 |     for D in `echo $PERL5LIB | sed 's/:/\n/g'`; do
37 |       if [ "$D" != "${D/ensembl\/modules}" ]; then
38 |         ENSEMBLDIR=$D
39 |       fi
40 |     done
41 |   else
42 |     printf "\033[31mPERL5LIB is not set\033[0m\n"
43 |     exit 1
44 |   fi
45 | fi
46 | # Copy everything found there into the local test tree, creating it first if needed.
47 | if [ ! -e "$BASEDIR" ]; then
48 |   mkdir -p "$BASEDIR"
49 | fi
50 | 
51 | for F in ${ENSEMBLDIR}/*; do
52 | # We also want the SQLite table in case we start testing it too
53 |   cp -r "$F" "$BASEDIR"
54 | done
55 | 
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Tools/SoftwareConfigLoad.pm:
--------------------------------------------------------------------------------
1 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #      http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | package Bio::EnsEMBL::Analysis::Tools::SoftwareConfigLoad;
17 | 
18 | use strict;
19 | use warnings;
20 | use JSON;
21 | use File::Basename;
22 | use File::Spec;
23 | use Exporter 'import';
24 | our @EXPORT_OK = qw(get_software_path);
25 | 
26 | my $config_file = File::Spec->catfile(dirname(__FILE__), 'SoftwareConfig.json'); # Find suitable location
27 | 
28 | # get_software_path($software_type, $tool)
29 | #
30 | # Returns the entry stored under software_paths -> $software_type -> $tool in
31 | # SoftwareConfig.json, which is looked up in the directory of this module.
32 | # The file is read and decoded on every call.
33 | # Dies if the file cannot be opened, or if either key is undefined or absent;
34 | # the message lists the keys that are available.
35 | sub get_software_path {
36 |   my ($software_type, $tool) = @_;
37 | 
38 |   open my $fh, '<', $config_file or die "Could not open config file: $!";
39 |   my $json_text = do { local $/; <$fh> };
40 |   close $fh;
41 | 
42 |   my $config = decode_json($json_text);
43 | 
44 |   # Undefined arguments are reported as empty names rather than triggering
45 |   # "uninitialized value" warnings while the error message is built.
46 |   my $type_name = defined $software_type ? $software_type : '';
47 |   my $tool_name = defined $tool ? $tool : '';
48 | 
49 |   # Validate inputs
50 |   unless ($software_type && exists $config->{software_paths}{$software_type}) {
51 |     die "Software type '$type_name' not found in config. Available types: "
52 |       . join(", ", keys %{ $config->{software_paths} }) . "\n";
53 |   }
54 | 
55 |   unless (defined $tool && exists $config->{software_paths}{$software_type}{$tool}) {
56 |     die "Tool '$tool_name' not found for software type '$software_type'. Available tools: "
57 |       . join(", ", keys %{ $config->{software_paths}{$software_type} }) . "\n";
58 |   }
59 | 
60 |   return $config->{software_paths}{$software_type}{$tool};
61 | 
62 | }
63 | 
64 | # A module has to finish with a true value for require/use to succeed.
65 | 1;
66 | 
--------------------------------------------------------------------------------
/scripts/cdna_update/find_N.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings ;
3 | use strict;
4 | 
5 | 
6 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
7 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | #      http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | 
21 | 
22 | #script to parse a fasta file and identify sequences with large strings of 'N's
23 | 
24 | #perl find_N.pl missing_fasta.out >many_n.out
25 | 
26 | my $percent = 2; #percentage of sequence which must be consecutive Ns
27 | my $total_percent = 5 * $percent; #total Ns
28 | 
29 | my $data = $ARGV[0];
30 | my $a_count = 0;
31 | 
32 | local $/ = "\n>";
33 | 
34 | open(DATA, "<$data") or die ("Can't read $data $! \n");
35 | 
36 | while(<DATA>){
37 |   #have a sequence:
38 | 
39 |   s/>//g;
40 | 
41 |   my $len = length $_;
42 |   my $max_n = sprintf "%.0f", (($len / 100) * $percent); #threshold number of Ns which we want to flag
43 |   my $percent_n = 0;
44 | 
45 |   my ($name, $seq);
46 |   if ($_=~/^([\w\.]+)\s+([\w\s]+)/){
47 |     $name = $1;
48 |     my @tmp = $2;
49 | 
50 |     for my $s (@tmp){
51 |       $s =~s/\s//g;
52 |     }
53 |     $seq = join "", @tmp;
54 |   }
55 |   # NOTE(review): $percent_n accumulates a count of Ns while $total_percent is a percentage - confirm the comparison below
56 |   while ($_=~/(N+)/g){ #will match greedily
57 |     $percent_n += length $1;
58 |     if (length $1 >= $max_n && $percent_n >= $total_percent){
59 | 
60 |       print "$name\n"; #print the seq id
61 |       last;
62 |     }
63 |   }
64 | }
65 | 
66 | 
67 | 
--------------------------------------------------------------------------------
/scripts/genebuild/parse_embl_cds2uniprotkb.pl:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #      http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | #!/usr/bin/env perl
17 | 
18 | use strict;
19 | use warnings;
20 | 
21 | use Getopt::Long;
22 | 
23 | my $fasta_file;
24 | my $map_file;
25 | my $edited_fasta_file;
26 | 
27 | &GetOptions(
28 |   'fasta_file:s' => \$fasta_file,
29 |   'map_file:s' => \$map_file,
30 |   'edited_fasta_file:s' => \$edited_fasta_file
31 | );
32 | 
33 | my %map_ids;
34 | 
35 | open (EFF,">".$edited_fasta_file) || die "Could not open edited_fasta_file for writing\n";
36 | open (MAP, "$map_file") or die "Can't open ".$map_file."\n";
37 | # Build the lookup from column 3 to column 1 for every mapping row whose column 2 is EMBL-CDS.
38 | while(<MAP>){
39 |   chomp;
40 |   my @values = split(/\t/,$_);
41 |   # rows with fewer than three columns carry no mapping and are skipped
42 |   if (defined $values[2] && $values[1] eq "EMBL-CDS"){
43 |     $map_ids{$values[2]} = $values[0];
44 |   }
45 | }
46 | 
47 | close MAP;
48 | 
49 | open (FASTA, "$fasta_file") or die "Can't open ".$fasta_file."\n";
50 | # Copy the fasta file, replacing the second header token by its mapped id when one is known.
51 | while (<FASTA>){
52 |   chomp;
53 | 
54 |   if ($_ =~/^>/){
55 |     my @accessions = split(/\s+/,$_);
56 |     if(defined $accessions[1] && $map_ids{$accessions[1]}){
57 |       print EFF $accessions[0]." ".$map_ids{$accessions[1]}."\n";
58 |     }else{
59 |       print EFF $_."\n";
60 |     }
61 |   }else{
62 |     print EFF $_."\n";
63 |   }
64 | }
65 | close EFF;
66 | close FASTA;
67 | 
--------------------------------------------------------------------------------
/scripts/genebuild/sncrna/filter_cm.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | use strict;
18 | use warnings;
19 | 
20 | use Getopt::Long;
21 | use Path::Tiny qw(path);
22 | use Data::Dumper;
23 | # filter_rfam($cm, $rfam_acc): keep the "//"-terminated records of $cm whose first RF<digits> accession is listed in @$rfam_acc.
24 | sub filter_rfam {
25 |   my ($cm, $rfam_acc) = @_;
26 | 
27 |   my @filtered;
28 |   my @cm_models = split(/\/\/\n/, $cm);
29 | 
30 |   my %rfam_acc = map { $_ => 1 } @$rfam_acc;
31 |   foreach my $cm_model (@cm_models) {
32 |     my ($rfam) = $cm_model =~ m/(RF\d+)/;
33 |     next unless defined $rfam; # no accession in this record: nothing to look up, so it is dropped
34 |     print $rfam . "\t";
35 |     if (exists($rfam_acc{$rfam})) {
36 |       push @filtered, $cm_model;
37 |     } else {
38 |       # print("Rfam model $rfam removed by filtering.\n");
39 |     }
40 |   }
41 | 
42 |   return join("//\n", @filtered)."//\n";
43 | }
44 | 
45 | my $rfam_cm_file = $ARGV[0]; #$self->param_required('rfam_cm_file');
46 | my $rfam_accessions = $ARGV[1]; #$self->param_required('rfam_accessions');
47 | 
48 | my $cm_path = path($rfam_cm_file);
49 | my $ra_path = path($rfam_accessions);
50 | my $output = path($ARGV[2]);#path($working_dir . "/Rfam.cm");
51 | 
52 | my $cm = $cm_path->slurp;
53 | my $ra = $ra_path->slurp;
54 | 
55 | my @accessions = split(/\n/, $ra);
56 | 
57 | $cm = filter_rfam($cm, \@accessions);
58 | $output->spew($cm);
59 | 
60 | 
61 | 
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Tools/Stashes.pm:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #      http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | package Bio::EnsEMBL::Analysis::Tools::Stashes; 17 | 18 | use warnings; 19 | use strict; 20 | no strict "refs"; 21 | 22 | use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning); 23 | use Exporter; 24 | use vars qw (@ISA @EXPORT %stash $alias @alias %alias); 25 | 26 | @ISA = qw(Exporter); 27 | @EXPORT = qw( package_stash ); 28 | 29 | sub package_stash { 30 | my ($packageName) = @_; 31 | 32 | my %result; 33 | 34 | local (*alias); 35 | *stash = *{"${packageName}::"}; 36 | 37 | while ( my ( $varName, $globValue ) = each %stash ) { 38 | # only return the config hash 39 | next if $varName =~ m/BEGIN/; 40 | next if $varName =~ m/import/; 41 | 42 | *alias = $globValue; 43 | $result{$varName} = $alias if ( defined($alias) ); 44 | $result{$varName} = \@alias if ( *alias{ARRAY} ); 45 | $result{$varName} = \%alias if ( *alias{HASH} ); 46 | } 47 | 48 | if ( scalar( keys %result > 1 ) ) { 49 | throw( "Have more than one item exported from " . 50 | "$packageName - you'll run into trouble\n" ); 51 | } 52 | my $hash_name = ( keys %result )[0]; 53 | return [ $result{$hash_name}, $hash_name ]; 54 | } ## end sub package_stash 55 | 56 | 1; 57 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveStoreUnmappedcDNAs.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | #Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveStoreUnmappedcDNAs; 19 | 20 | use strict; 21 | use warnings; 22 | 23 | 24 | use Bio::EnsEMBL::UnmappedObject; 25 | 26 | use parent ('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveBaseRunnableDB'); 27 | 28 | 29 | 30 | sub fetch_input { 31 | my $self = shift; 32 | 33 | $self->create_analysis; 34 | my $db = $self->get_database_by_name('target_db'); 35 | $self->hrdb_set_con($db, 'target_db'); 36 | return 1; 37 | } 38 | 39 | sub run { 40 | my ($self) = shift; 41 | 42 | return 1; 43 | } 44 | 45 | sub write_output { 46 | my $self = shift; 47 | 48 | my $unmapped_adaptor = $self->hrdb_get_con('target_db')->get_UnmappedObjectAdaptor; 49 | foreach my $iid (@{$self->param('iid')}) { 50 | $unmapped_adaptor->store(Bio::EnsEMBL::UnmappedObject->new( 51 | -type => 'cDNA', 52 | -identifier => $iid, 53 | -summary => 'No output from Exonerate', 54 | -full_desc => 'Exonerate returned no hits using standard parameters plus options --maxintron 400000 and --softmasktarget FALSE', 55 | -analysis => $self->analysis, 56 | )); 57 | } 58 | } 59 | 60 | 61 | 1; 62 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Prints.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under 
the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | =head1 NAME 17 | 18 | Prints.pm - DESCRIPTION of Object 19 | 20 | =head1 SYNOPSIS 21 | 22 | my $rsb = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Prints->new( 23 | -db => $db 24 | -input_id => $id 25 | -analysis => $analysis); 26 | 27 | 28 | =head1 DESCRIPTION 29 | 30 | 31 | =cut 32 | 33 | 34 | # Let the code begin... 35 | 36 | 37 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Prints; 38 | use warnings ; 39 | use vars qw(@ISA); 40 | use strict; 41 | 42 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 43 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints; 44 | 45 | 46 | @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation); 47 | 48 | 49 | sub fetch_input { 50 | my ($self, @args) = @_; 51 | 52 | $self->SUPER::fetch_input(@args); 53 | 54 | my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints->new(-query => $self->query, 55 | -analysis => $self->analysis); 56 | $self->runnable($run); 57 | } 58 | 59 | 60 | 61 | 62 | 1; 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Prints_wormbase.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # 
Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | =head1 NAME 17 | 18 | Prints.pm - DESCRIPTION of Object 19 | 20 | =head1 SYNOPSIS 21 | 22 | my $rsb = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Prints_wormbase->new( 23 | -db => $db 24 | -input_id => $id 25 | -analysis => $analysis); 26 | 27 | 28 | =head1 DESCRIPTION 29 | 30 | 31 | =cut 32 | 33 | 34 | # Let the code begin... 
35 | 36 | 37 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Prints_wormbase; 38 | use warnings ; 39 | use vars qw(@ISA); 40 | use strict; 41 | 42 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 43 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints_wormbase; 44 | 45 | 46 | @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation); 47 | 48 | 49 | sub fetch_input { 50 | my ($self, @args) = @_; 51 | 52 | $self->SUPER::fetch_input(@args); 53 | 54 | my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints_wormbase->new(-query => $self->query, 55 | -analysis => $self->analysis); 56 | $self->runnable($run); 57 | } 58 | 59 | 60 | 61 | 62 | 1; 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /scripts/genebuild/sncrna/repeats_dump.pl: -------------------------------------------------------------------------------- 1 | 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 
17 | use strict;
18 | use warnings;
19 | 
20 | use Bio::EnsEMBL::DBSQL::DBAdaptor;
21 | 
22 | # Usage: repeats_dump.pl <dbname> <dbhost> <dbport> <dbuser> <working_dir> [logic_name]
23 | #
24 | # Writes <working_dir>/repeats.bed holding one tab-separated line
25 | # (seq_region name, start, end, strand) per repeat feature of every toplevel slice.
26 | # NOTE(review): the sixth argument is read but never used - the analyses are taken
27 | # from the 'repeat.analysis' meta key instead; confirm whether it should be honoured.
28 | my ($dbname, $dbhost, $dbport, $dbuser, $working_dir, $logic_name) = @ARGV;
29 | die "Usage: $0 dbname dbhost dbport dbuser working_dir [logic_name]\n"
30 |   unless defined $working_dir;
31 | 
32 | my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
33 |   -DBNAME => $dbname,
34 |   -HOST => $dbhost,
35 |   -PORT => $dbport,
36 |   -USER => $dbuser,
37 |   -DRIVER => 'mysql',
38 | );
39 | 
40 | # dump repeat features
41 | my $rfa = $db->get_RepeatFeatureAdaptor();
42 | my $fn = $working_dir . "/repeats.bed";
43 | open(my $bed_fh, '>', $fn) or die "Could not write to $fn: $!";
44 | 
45 | my $sa = $db->get_SliceAdaptor();
46 | 
47 | my $logic_names = $db->get_MetaContainer->list_value_by_key('repeat.analysis');
48 | if (!@$logic_names) {
49 |   # nothing recorded in meta: query once with an empty logic name
50 |   push(@$logic_names, '');
51 | }
52 | foreach my $slice (@{ $sa->fetch_all( 'toplevel') }){
53 |   my $slice_name = $slice->seq_region_name();
54 |   foreach my $logic_name (@$logic_names) {
55 |     foreach my $repeat (@{ $rfa->fetch_all_by_Slice($slice, $logic_name) }){
56 |       print {$bed_fh} $slice_name, "\t",
57 |         $repeat->seq_region_start(), "\t",
58 |         $repeat->seq_region_end(), "\t",
59 |         ($repeat->strand() == 1 ? '+' : '-'), "\n";
60 |     }
61 |   }
62 | }
63 | 
64 | # buffered write errors only surface when the handle is closed
65 | close($bed_fh) or die "Could not close $fn: $!";
66 | 
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositePattern.pm:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | =pod 17 | 18 | =head1 NAME 19 | 20 | Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern 21 | 22 | =head1 SYNOPSIS 23 | 24 | my $tmhmm = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern-> 25 | new ( 26 | -db => $db, 27 | -input_id => $input_id, 28 | -analysis => $analysis) 29 | ); 30 | $tmhmm->fetch_input; # gets sequence from DB 31 | $tmhmm->run; 32 | $tmhmm->write_output; # writes features to to DB 33 | 34 | =head1 DESCRIPTION 35 | 36 | =cut 37 | 38 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern; 39 | 40 | use warnings ; 41 | use strict; 42 | use vars qw(@ISA); 43 | 44 | 45 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern; 46 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 47 | 48 | 49 | @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation); 50 | 51 | 52 | sub fetch_input { 53 | my ($self, @args) = @_; 54 | 55 | $self->SUPER::fetch_input(@args); 56 | 57 | my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern-> 58 | new(-query => $self->query, 59 | -analysis => $self->analysis, 60 | %{$self->parameters_hash} 61 | ); 62 | $self->runnable($run); 63 | } 64 | 65 | 1; 66 | -------------------------------------------------------------------------------- /modules/t/hivecreatedirectories_rb.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European 
Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | use strict; 17 | use warnings; 18 | 19 | use File::Path qw(remove_tree); 20 | use File::Spec::Functions qw(catdir); 21 | 22 | use Test::More; 23 | 24 | use Bio::EnsEMBL::Hive::Utils::Test qw(standaloneJob); 25 | 26 | use_ok('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveCreateDirectories'); 27 | 28 | my $directory = catdir($ENV{PWD}, 'test_directory'); 29 | standaloneJob( 30 | 'Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveCreateDirectories', # module 31 | { # input param hash 32 | paths => [$directory], 33 | }, 34 | ); 35 | 36 | my @stat = stat($directory); 37 | cmp_ok(sprintf("%04o", $stat[2] & 07777), 'eq', 2775, 'Checking permissions for default production directory'); 38 | remove_tree($directory); 39 | 40 | my $directory_2 = catdir($directory, 'directory_test'); 41 | standaloneJob( 42 | 'Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveCreateDirectories', # module 43 | { # input param hash 44 | paths => [{ 45 | path => $directory_2, 46 | mode => 0755, 47 | }], 48 | }, 49 | ); 50 | 51 | @stat = stat($directory); 52 | cmp_ok(sprintf("%04o", $stat[2] & 07777), 'eq', '0755', 'Checking permissions for directory'); 53 | @stat = stat($directory_2); 54 | cmp_ok(sprintf("%04o", $stat[2] & 07777), 'eq', '0755', 'Checking permissions for subdirectory'); 55 | remove_tree($directory); 56 | 57 | 
done_testing(); 58 | -------------------------------------------------------------------------------- /scripts/protein/chunk_protein_file.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 

# Split a FASTA peptide file into numbered chunk files
# (<chunkdir>/chunk.1, <chunkdir>/chunk.2, ...) holding at most
# -chunksize sequences each.
#
# Options:
#   -pepfile    existing FASTA file to split (required)
#   -chunkdir   existing directory that receives the chunk files (required)
#   -chunksize  positive integer; falls back to 20 when missing or invalid

use warnings ;
use strict;
use Getopt::Long;
use Bio::SeqIO;

my ($input_pep_file,
    $output_chunk_dir,
    $chunk_size);

GetOptions('pepfile=s'   => \$input_pep_file,
           'chunkdir=s'  => \$output_chunk_dir,
           'chunksize=s' => \$chunk_size);


die "You must supply a valid input peptide file\n"
    if not defined $input_pep_file or not -e $input_pep_file;
die "You must supply a valid output chunk directory\n"
    if not defined $output_chunk_dir or not -d $output_chunk_dir;

# Only a strictly positive integer is a usable chunk size. The former
# "< 0" test let 0 and non-numeric strings through, which silently wrote
# one sequence per chunk file.
if (not defined $chunk_size or $chunk_size !~ /^\d+$/ or $chunk_size < 1) {
  warn "No/invalid chunk size given; defaulting to 20";
  $chunk_size = 20;
}

my $seqio = Bio::SeqIO->new(-format => 'fasta',
                            -file   => $input_pep_file);

my $count = 0;
my $chunk_num = 1;
my $outseqio;

while (my $seq = $seqio->next_seq) {
  # Open the next chunk file lazily so no empty chunk is ever created
  if (not defined $outseqio) {
    $outseqio = Bio::SeqIO->new(-format => 'fasta',
                                -file   => ">$output_chunk_dir/chunk." . $chunk_num++);
  }

  $outseqio->write_seq($seq);
  $count++;

  if ($count >= $chunk_size) {
    $outseqio->close;
    $outseqio = undef;
    $count = 0;
  }
}

# Close the last, partially filled chunk explicitly
$outseqio->close if defined $outseqio;
$seqio->close;
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Tools/GenomeOverlapFilter.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.


package Bio::EnsEMBL::Analysis::Tools::GenomeOverlapFilter;

use strict;
use warnings;

use Bio::EnsEMBL::Root;
use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );


# Constructor. Takes no arguments; throws if any are supplied.
sub new {
  my ($class, @args) = @_;

  throw("GenomeOverlapFilter should have no args in new") if scalar(@args);

  return bless {}, $class;
}

#####################################
# filter($these, $others)
#
#   $these  - arrayref of objects providing strand, start and end
#   $others - arrayref of objects providing strand, start and end;
#             the early exit below assumes it is sorted by start
#
# Returns an arrayref with the members of $these that do not overlap, at
# the genomic level and on the same strand, any member of $others.
sub filter {
  my ($self, $these, $others) = @_;

  my @kept;

  CANDIDATE:
  foreach my $candidate (@$these) {
    foreach my $other (@$others) {
      # features on the opposite strand never interfere
      next if $other->strand != $candidate->strand;

      # $other lies entirely before the candidate: keep scanning
      next if $other->end < $candidate->start;

      # $other starts after the candidate ends; with $others sorted by
      # start nothing further along can overlap either
      last if $other->start > $candidate->end;

      # genomic overlap on the same strand: drop the candidate
      next CANDIDATE;
    }

    push @kept, $candidate;
  }

  return \@kept;
}

1;
-------------------------------------------------------------------------------- /scripts/assembly_patches/remove_patch_raw_compute.sql: -------------------------------------------------------------------------------- 1 | delete repeat_feature from attrib_type, seq_region_attrib,
repeat_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = repeat_feature.seq_region_id; 2 | 3 | delete prediction_exon from attrib_type, seq_region_attrib, prediction_exon where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = prediction_exon.seq_region_id; 4 | 5 | delete prediction_transcript from attrib_type, seq_region_attrib, prediction_transcript where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = prediction_transcript.seq_region_id; 6 | 7 | delete simple_feature from attrib_type, seq_region_attrib, simple_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = simple_feature.seq_region_id; 8 | 9 | delete dna_align_feature from attrib_type, seq_region_attrib, dna_align_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = dna_align_feature.seq_region_id and dna_align_feature_id not in (select feature_id from transcript_supporting_feature where feature_type = 'dna_align_feature') and dna_align_feature_id not in (select feature_id from supporting_feature where feature_type = 'dna_align_feature'); 10 | 11 | delete protein_align_feature from attrib_type, seq_region_attrib, protein_align_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = protein_align_feature.seq_region_id and protein_align_feature_id not in (select feature_id from transcript_supporting_feature where feature_type = 'protein_align_feature') and 
protein_align_feature_id not in (select feature_id from supporting_feature where feature_type = 'protein_align_feature'); 12 | 13 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositePattern_wormbase.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 

=pod

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern_wormbase

=head1 SYNOPSIS

  my $prosite = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern_wormbase->
    new ( -db       => $db,
          -input_id => $input_id,
          -analysis => $analysis,
        );
  $prosite->fetch_input;   # gets sequence from DB
  $prosite->run;
  $prosite->write_output;  # writes features to DB

=head1 DESCRIPTION

RunnableDB for the PrositePattern_wormbase analysis. It inherits from
Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation and only overrides
fetch_input, which registers a
Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern_wormbase
runnable.

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern_wormbase;

use warnings ;
use strict;

use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern_wormbase;
use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;

our @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);


# Runs the parent fetch_input with the same arguments, then builds the
# runnable from the query, the analysis and every entry of
# parameters_hash, and registers it through runnable().
sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input(@_);

  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
    %{ $self->parameters_hash },
  );

  return $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern_wormbase->new(@runnable_args)
  );
}

1;
-------------------------------------------------------------------------------- /scripts/genebuild/find_seq_in_fasta.pl: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.

#!/usr/bin/env perl

# Write to STDOUT, in FASTA format, every sequence of -fasta_file whose id
# is listed in -id_file (one id per line), equals -id, or matches -prefix
# at the start of the id. -id is only consulted when -id_file is not given.

use strict;
use warnings;
use Bio::SeqIO;
use Getopt::Long;

my ($fasta_file,$id_file);
my $id;
my $prefix;

GetOptions(
            'id:s'         => \$id,
            'fasta_file:s' => \$fasta_file,
            'id_file:s'    => \$id_file,
            'prefix:s'     => \$prefix,
           );

if (!defined($fasta_file) || (!defined($id) && !defined($prefix) && !defined($id_file))) {
  die "ERROR: Must at least set file (-fasta_file) and full id (-id)\n" .
      " or prefix (-prefix)\n";
}

my %ids ;
if ( $id_file ) {
  # Lexical handle and three-argument open: the file name can no longer be
  # interpreted as an open mode.
  open(my $id_fh, '<', $id_file) || die ( "Cant read file : $id_file\n") ;
  while (my $line = <$id_fh>) {
    chomp($line);
    $ids{$line} = 1;
  }
  close($id_fh);
} elsif ( $id ) {
  $ids{$id} = 1;
}


my $inputer = Bio::SeqIO->new(-file => "<" . $fasta_file , '-format' => 'Fasta') ;
my $outputer = Bio::SeqIO->new(-file => ">-" , '-format' => 'Fasta') ;

while (my $seq = $inputer->next_seq) {
  my $seq_id = $seq->id;

  # A sequence selected by both the id list and the prefix is written once.
  # NOTE(review): -prefix is interpolated as a regular expression, so
  # characters such as '|' or '.' are not matched literally.
  my $wanted = exists $ids{$seq_id}
            || (defined($prefix) && $seq_id =~ /^$prefix/);

  $outputer->write_seq($seq) if $wanted;
}
$inputer->close;
$outputer->close;
-------------------------------------------------------------------------------- /scripts/databases/process_uniprot_isoforms.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.

# Copy from -isofile to -outfile the isoform sequences whose parent
# accession has a ">accession.version" header in -infile.
#
# Options:
#   -i|-infile    FASTA file whose headers provide accession.version
#   -v|-isofile   FASTA file of isoforms (ids like "sp|P12345-2|...")
#   -o|-outfile   FASTA file to write
#   -version      write ids as accession.version (version taken from -infile)
#   -desc         keep the description line (dropped by default)

use strict;
use warnings;

use Getopt::Long;
use Bio::SeqIO;

my $infile;
my $outfile;
my $isofile;
my $use_version = 0;
my $use_description = 0;

GetOptions (
        'i|infile=s'  => \$infile,
        'v|isofile=s' => \$isofile,
        'o|outfile=s' => \$outfile,
        'version!'    => \$use_version,
        'desc!'       => \$use_description,
        );

die("You must supply -infile, -isofile and -outfile\n")
  unless (defined $infile and defined $isofile and defined $outfile);

# Remember the version of every accession found in the reference file
my %accessions;
open(my $in_fh, '<', $infile) || die("Could not open $infile\n");
while (my $line = <$in_fh>) {
  if ($line =~ /^>(\w+)\.(\d+)/) {
    $accessions{$1} = $2;
  }
}
close($in_fh) || die("Could not close $infile\n");


my $sequences = Bio::SeqIO->new(-format => 'fasta', -file => $isofile);
my $writer = Bio::SeqIO->new(-format => 'fasta', -file => '>'.$outfile);
$writer->preferred_id_type('accession');
$writer->preferred_id_type('accession.version') if ($use_version);
while (my $seq = $sequences->next_seq()) {
  my ($accession, $isoform_id) = $seq->id =~ /[sptr]{2}\|(\w+)(-\d+)/;

  # Ids that do not look like "xx|ACCESSION-N" cannot be isoforms of a
  # known accession; skip them instead of using undef as a hash key.
  next unless defined $accession;

  if (exists $accessions{$accession}) {
    $seq->accession_number($accession.$isoform_id);
    $seq->version($accessions{$accession}) if ($use_version);
    $seq->desc('') unless ($use_description);
    $writer->write_seq($seq);
  }
}
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/Config/sample_genes_registry_conf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.

use strict;
use warnings;
use Bio::EnsEMBL::Utils::ConfigRegistry;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyDBAdaptor;

# Release number used in the database names and in the registry URL below
my $curr_release = $ENV{ENSEMBL_RELEASE};

# ---------------------- CURRENT COMPARA DATABASE ---------------------------------

# registry alias => [ server, database name ]
my %compara_location_for = (
    'compara_curr' => [ 'mysql-ens-sta-1', 'ensembl_compara_'.$curr_release ],
);

for my $species_alias ( keys %compara_location_for ) {
    my ( $server, $database ) = @{ $compara_location_for{$species_alias} };

    # Read-only connection with an empty password
    Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(
        -host    => $server,
        -user    => 'ensro',
        -pass    => '',
        -port    => get_port($server),
        -species => $species_alias,
        -dbname  => $database,
    );
}

# ---------------------- CURRENT CORE DATABASES ---------------------------------

# The majority of core databases live on staging servers:
Bio::EnsEMBL::Registry->load_registry_from_url(
  "mysql://ensro\@mysql-ens-sta-1.ebi.ac.uk:4519/$curr_release");

# Return the output of the shell command "<host> port", without its
# trailing newline.
# NOTE(review): presumably <host> is a site-provided server alias command
# that prints the port number - confirm.
sub get_port {
    my ($host) = @_;
    chomp(my $port = `$host port`);
    return $port;
}

1;
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Coil.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.

=pod

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Coil

=head1 SYNOPSIS

  my $ncoils = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Coil->new (
    -db       => $db,
    -input_id => $input_id,
    -analysis => $analysis,
  );
  $ncoils->fetch_input;   # gets sequence from DB
  $ncoils->run;
  $ncoils->write_output;  # writes features to DB

=head1 DESCRIPTION

RunnableDB for the Coil analysis. It inherits from
Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation and only overrides
fetch_input, which registers a
Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Coil runnable.

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Coil;

use warnings ;
use strict;

use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Coil;

our @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);


# Runs the parent fetch_input with the same arguments, then registers a
# Coil runnable built from the query and the analysis.
sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input(@_);

  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
  );

  return $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Coil->new(@runnable_args)
  );
}


1;
-------------------------------------------------------------------------------- /modules/t/hiverepeatcoverage_rb.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | #
Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
# Test for the HiveRepeatCoverage runnable: run it as a standalone job
# against the 'core' test database and check the events it emits.
use strict;
use warnings;

use Test::More;

use Bio::EnsEMBL::Test::TestUtils;
use Bio::EnsEMBL::Test::MultiTestDB;

use Bio::EnsEMBL::Hive::Utils::Test qw(standaloneJob);

use_ok('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveRepeatCoverage');

my $multi = Bio::EnsEMBL::Test::MultiTestDB->new();

# Connection details of the test core database, passed to the job as source_db
my $db = $multi->get_DBAdaptor('core');
my %target_db = (
  -dbname => $db->dbc->dbname,
  -host => $db->dbc->host,
  -port => $db->dbc->port,
  -user => $db->dbc->user,
  -pass => $db->dbc->pass,
  -driver => $db->dbc->driver,
);

standaloneJob(
  'Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveRepeatCoverage', # module
  { # input param hash
    source_db => \%target_db,
    repeat_logic_names => ['RepeatMask'],
    coord_system_version => 'NCBI33',
  },
  [ # list of events to test for (two events: a WARNING, then a DATAFLOW)
    [ # start event
      'WARNING', # event to test for
      $db->dbc->dbname . "\nAnalyses: RepeatMask\nTotal bases = 62842997\nTotal masked = 504576\t( 0.80% masked)\n", # expected warning message
    ], # end event
    [ # start event
      'DATAFLOW', # event to test for
      {repeat_mask_coverage => 0.802915239704433574}, # expected data flowed out
      2 # dataflow branch
    ], # end event
  ]
);

done_testing();
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Signalp.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
=pod

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Signalp

=head1 SYNOPSIS

  my $signalp = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Signalp->new (
    -db       => $db,
    -input_id => $input_id,
    -analysis => $analysis,
  );
  $signalp->fetch_input;   # gets sequence from DB
  $signalp->run;
  $signalp->write_output;  # writes features to DB

=head1 DESCRIPTION

RunnableDB for the Signalp analysis. It inherits from
Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation and only overrides
fetch_input, which registers a
Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Signalp runnable.

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Signalp;

use warnings ;
use strict;

use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Signalp;

our @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);

# Runs the parent fetch_input with the same arguments, then registers a
# Signalp runnable built from the query and the analysis.
sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input(@_);

  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
  );

  return $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Signalp->new(@runnable_args)
  );
}


1;
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveLoadProteins.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.

package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveLoadProteins;

use strict;
use warnings;
use feature 'say';
use Bio::EnsEMBL::IO::Parser::Fasta;

use parent ('Bio::EnsEMBL::Hive::RunnableDB::JobFactory');

# Defaults added on top of the parent's:
#   column_names        - ['iid']
#   sequence_table_name - 'protein_sequences'
#   load_biotype        - 0; when true, fetch_input also stores a biotype
sub param_defaults {
  my ($self) = @_;

  my %defaults = (
    %{$self->SUPER::param_defaults()},
    column_names => ['iid'],
    sequence_table_name => 'protein_sequences',
    load_biotype => 0,
  );

  return \%defaults;
}

# Reads the FASTA file given by 'protein_file', stores one row per record
# (accession, seq and optionally biotype) in the 'sequence_table_name'
# table, and sets 'inputlist' to the accessions that were read.
sub fetch_input {
  my ($self) = @_;

  my $fasta = Bio::EnsEMBL::IO::Parser::Fasta->open($self->param_required('protein_file'));

  my $sequence_table = $self->db->get_NakedTableAdaptor();
  $sequence_table->table_name($self->param_required('sequence_table_name'));

  my @accessions;
  while ($fasta->next()) {
    # The accession is the first whitespace-delimited word of the header
    my ($accession) = $fasta->getHeader =~ /^(\S+)/;

    my %row = (
      'accession' => $accession,
      'seq' => $fasta->getSequence,
    );

    if ($self->param('load_biotype')) {
      # The biotype is the second whitespace-delimited word of the header
      if ($fasta->getHeader =~ /\S+\s+(\S+)/) {
        $row{biotype} = $1;
      }
      else {
        $self->warning('Could not find biotype for '.$accession);
      }
    }

    $sequence_table->store([\%row]);
    push(@accessions, $accession);
  }

  $self->param('inputlist', \@accessions);
}

1;
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveSequencesToFiles.pm:
-------------------------------------------------------------------------------- 1 | =head1 LICENSE 2 | 3 | Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | =head1 CONTACT 19 | 20 | Please email comments or questions to the public Ensembl 21 | developers list at <http://lists.ensembl.org/mailman/listinfo/dev>. 22 | 23 | Questions may also be sent to the Ensembl help desk at 24 | <http://www.ensembl.org/Help/Contact>.

=head1 NAME

Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveSequencesToFiles

=head1 SYNOPSIS


=head1 DESCRIPTION

JobFactory subclass whose fetch_input writes every (accession, seq) row of
the table named by 'sequence_table_name' to the file named by 'filename',
using the Bio::SeqIO format given by 'format'.

=cut

package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveSequencesToFiles;

use strict;
use warnings;

use Bio::SeqIO;
use Bio::Seq;

use parent ('Bio::EnsEMBL::Hive::RunnableDB::JobFactory');

# Defaults added on top of the parent's:
#   format       - 'fasta'
#   chunk        - 0
#   chunk_size   - 10
#   column_names - ['filename']
#   inputlist    - ['#filename#']
sub param_defaults {
  my ($self) = @_;

  my %defaults = (
    %{$self->SUPER::param_defaults},
    format => 'fasta',
    chunk => 0,
    chunk_size => 10,
    column_names => ['filename'],
    inputlist => ['#filename#'],
  );

  return \%defaults;
}


# Dump all sequences of the sequence table into a single file.
sub fetch_input {
  my ($self) = @_;

  my $sth = $self->db->dbc->prepare('SELECT accession, seq FROM '.$self->param_required('sequence_table_name'));
  $sth->execute();

  # '>' makes Bio::SeqIO open the target file for writing
  my $writer = Bio::SeqIO->new(-format => $self->param('format'), -file => '>'.$self->param_required('filename'));

  while (my $row = $sth->fetchrow_arrayref) {
    my ($accession, $sequence) = ($row->[0], $row->[1]);
    $writer->write_seq(Bio::Seq->new(-id => $accession, -seq => $sequence));
  }
}

1;
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Hmmpfam.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.

#
#
#
=pod

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Hmmpfam

=head1 SYNOPSIS

  my $hmmpfam = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Hmmpfam
    ->new ( -db       => $db,
            -input_id => $input_id,
            -analysis => $analysis,
          );
  $hmmpfam->fetch_input;   # gets sequence from DB
  $hmmpfam->run;
  $hmmpfam->output;
  $hmmpfam->write_output;  # writes features to DB

=head1 DESCRIPTION

This object wraps Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Hmmpfam
to add functionality to read from and write to databases.

=head1 CONTACT

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Hmmpfam;

use warnings ;
use strict;

use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Hmmpfam;

our @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);


#
# overridden methods
#

# Runs the parent fetch_input, then registers a Hmmpfam runnable built from
# the query, the analysis, the analysis' db_file (passed as -database) and
# every entry of parameters_hash.
sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input();

  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
    -database => $self->analysis->db_file,
    %{ $self->parameters_hash },
  );

  return $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Hmmpfam->new(@runnable_args)
  );
}


1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PIRSF.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # 17 | # 18 | =pod 19 | 20 | =head1 NAME 21 | 22 | Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PIRSF 23 | 24 | =head1 SYNOPSIS 25 | 26 | my $pirsf = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PIRSF-> 27 | new ( -db => $db, 28 | -input_id => $input_id, 29 | -analysis => $analysis, 30 | ); 31 | $pirsf->fetch_input; # gets sequence from DB 32 | $pirsf->run; 33 | $pirsf->write_output; # writes features to DB 34 | 35 | =head1 DESCRIPTION 36 | 37 | This object wraps Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PIRSF 38 | to add functionality to read and write to databases in 39 | a PIRSF-specific way.

=head1 CONTACT

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PIRSF;

use warnings ;
use strict;
use vars qw(@ISA);

use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PIRSF;

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);


# Runs the parent fetch_input, then registers a PIRSF runnable built from
# the query, the analysis, the analysis' db_file (passed as -database) and
# every entry of parameters_hash.
sub fetch_input {
  my ($self) = @_;

  $self->SUPER::fetch_input;

  my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PIRSF->
      new(-query    => $self->query,
          -analysis => $self->analysis,
          -database => $self->analysis->db_file,
          %{$self->parameters_hash}
          );
  $self->runnable($run);
}


# Explicit true value for require/use. Without it the module only loaded
# because the @ISA assignment happened to be the last statement executed.
1;
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Superfamily.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
#
#
#
=pod

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Superfamily

=head1 SYNOPSIS

  my $superfamily = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Superfamily
    ->new ( -db       => $db,
            -input_id => $input_id,
            -analysis => $analysis,
          );
  $superfamily->fetch_input;   # gets sequence from DB
  $superfamily->run;
  $superfamily->output;
  $superfamily->write_output;  # writes features to DB

=head1 DESCRIPTION

This object wraps Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Superfamily
to add functionality to read from and write to databases.

=head1 CONTACT

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Superfamily;

use warnings ;
use strict;

use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Superfamily;

our @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);


#
# overridden methods
#

# Runs the parent fetch_input, then registers a Superfamily runnable built
# from the query, the analysis, the analysis' db_file (passed as -database)
# and every entry of parameters_hash.
sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input();

  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
    -database => $self->analysis->db_file,
    %{ $self->parameters_hash },
  );

  return $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Superfamily->new(@runnable_args)
  );
}


1;
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/BlastRNASeqPep.pm.example: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not
package Bio::EnsEMBL::Analysis::Config::GeneBuild::BlastRNASeqPep;

use strict;
use vars qw( %Config );

# Configuration values, keyed by the name of the variable that import()
# makes available in the calling package.
%Config = (

  BLASTRNASEQPEP_CONFIG_BY_LOGIC =>
  {
    DEFAULT => {
      # databases are defined as hash keys from Bio::EnsEMBL::Analysis::Config::Databases
      OUTPUT_DB => '',
      MODEL_DB  => '',

      # If left blank all refined genes will be fetched
      LOGICNAME => '',

      # path to index to fetch the sequence of the blast hit to calculate % coverage
      INDEX => '/path/to/indexed/sequences/from/the/blastdb/index',
    },
  }

);

=head2 import

 Arg [1]    : String, name of this package (supplied by "use"/"->import")
 Arg [2..n] : (optional) names of the %Config keys to export; when none
              are given every key of %Config is exported
 Description: Declares a package scalar of the same name in the calling
              package for each requested key and aliases it to the
              corresponding %Config value
 Returntype : None meaningful
 Exceptions : dies if the variable declaration fails or if a requested
              key has no defined value in %Config

=cut

sub import {
  my ($target_pkg) = caller(0);    # package that asked for the variables
  my ($this_pkg, @wanted) = @_;

  # No explicit list means "everything in %Config"
  @wanted = keys(%Config) unless @wanted;
  return unless @wanted;

  # Predeclare the scalars in the calling package so that it can use them
  # under "use strict"
  my $declarations = join(' ', map { '$'.$_ } @wanted);
  eval "package $target_pkg; use vars qw(" . $declarations . ")";
  die $@ if $@;

  for my $name (@wanted) {
    die "Error: Config: $name not known\n" unless defined $Config{$name};

    # Alias the caller's scalar to the config entry, much like Exporter
    # does for function names
    no strict 'refs';
    *{"${target_pkg}::$name"} = \$Config{$name};
  }
}

1;
=head2 fetch_input

 Arg [1]    : None
 Description: Runs the parent class fetch_input, prints a progress line to
              STDOUT, then creates a
              Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::IPRScan
              runnable from the query, the analysis, the analysis
              program_file (passed as -program) and every key/value pair
              found in parameters_hash, and registers it through runnable()
 Returntype : Whatever $self->runnable returns
 Exceptions : None raised directly by this method

=cut

sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input;
  print "FETCHING INPUT\n";

  # Explicit arguments first, entries from parameters_hash after them
  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
    -program  => $self->analysis->program_file,
    %{ $self->parameters_hash },
  );

  $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::IPRScan->new(@runnable_args)
  );
}
License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # 17 | # 18 | =pod 19 | 20 | =head1 NAME 21 | 22 | Bio::EnsEMBL::Pipeline::RunnableDB::ProteinAnnotation::Pfam_wormbase 23 | 24 | =head1 SYNOPSIS 25 | 26 | my $seg = Bio::EnsEMBL::Pipeline::RunnableDB::ProteinAnnotation::Pfam_wormbase-> 27 | new ( -db => $db, 28 | -input_id => $input_id, 29 | -analysis => $analysis, 30 | ); 31 | $seg->fetch_input; # gets sequence from DB 32 | $seg->run; 33 | $seg->write_output; # writes features to to DB 34 | 35 | =head1 DESCRIPTION 36 | 37 | This object wraps Bio::EnsEMBL::Pipeline::Runnable::Hmmpfam 38 | to add functionality to read and write to databases in 39 | a Pfam-specific way. 
=head1 CONTACT

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Pfam_wormbase;

use warnings ;
use strict;
use vars qw(@ISA);

use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Pfam_wormbase;

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);

=head2 fetch_input

 Arg [1]    : None
 Description: Runs the parent class fetch_input, then creates a
              Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Pfam_wormbase
              runnable from the query, the analysis, the analysis db_file
              (passed as -database) and every key/value pair found in
              parameters_hash, and registers it through runnable()
 Returntype : Whatever $self->runnable returns
 Exceptions : None raised directly by this method

=cut

sub fetch_input {
  my ($self) = @_;

  $self->SUPER::fetch_input;

  my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Pfam_wormbase->
    new(-query     => $self->query,
        -analysis  => $self->analysis,
        -database  => $self->analysis->db_file,
        %{$self->parameters_hash}
       );
  $self->runnable($run);
}

# A module must finish with a true value for "use"/"require" to succeed.
# Without this line the file only loaded because the last statement executed
# at run time (the @ISA assignment) happened to evaluate to a true value.
1;
=head2 fetch_input

 Arg [1..n] : Any arguments, forwarded unchanged to the parent class
              fetch_input
 Description: Runs the parent class fetch_input, then creates a
              Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Tmhmm
              runnable from the query, the analysis and every key/value
              pair found in parameters_hash, and registers it through
              runnable()
 Returntype : Whatever $self->runnable returns
 Exceptions : None raised directly by this method

=cut

sub fetch_input {
  my $self = shift;

  # Whatever is left in @_ is handed to the parent implementation
  $self->SUPER::fetch_input(@_);

  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
    %{ $self->parameters_hash },
  );

  $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Tmhmm->new(@runnable_args)
  );
}
=head2 filter_results

 Arg [1]    : Arrayref of transcript objects
 Description: For every transcript, works out the strand implied by its
              supporting evidence. When that differs from the transcript
              strand, the exons are removed from the transcript, given the
              evidence strand and added back, and the transcript is marked
              with the '_gb_flag' key. All transcripts, modified or not,
              are then passed to the parent class filter_results
 Returntype : Whatever the parent class filter_results returns
 Exceptions : dies if the evidence strand of a transcript cannot be
              determined, see _get_transcript_evidence_strand

=cut

sub filter_results {
  my ($self, $transcripts) = @_;

  my @modified_transcripts;
  foreach my $transcript (@$transcripts) {
    my $real_strand = $self->_get_transcript_evidence_strand($transcript);
    if ($transcript->strand != $real_strand) {
      # Take a handle on the exons before flushing them, then re-add each
      # one with the corrected strand
      my $exons = $transcript->get_all_Exons;
      $transcript->flush_Exons();
      foreach my $exon (@$exons) {
        $exon->strand($real_strand);
        $transcript->add_Exon($exon);
      }
      $transcript->{_gb_flag} = 1;
    }
    push(@modified_transcripts, $transcript);
  }
  return $self->SUPER::filter_results(\@modified_transcripts);
}


=head2 _get_transcript_evidence_strand

 Arg [1]    : Transcript object providing get_all_supporting_features and
              get_all_Exons
 Description: Takes the first supporting feature of the transcript or, when
              the transcript has none, the first supporting feature of its
              first exon, and multiplies the feature strand by its hit
              strand
 Returntype : Number, strand * hstrand of the chosen supporting feature
 Exceptions : dies with an explicit message when neither the transcript
              nor its first exon carries a supporting feature

=cut

sub _get_transcript_evidence_strand {
  my ($self, $tran) = @_;

  my ($sf) = @{$tran->get_all_supporting_features};

  if (!$sf) {
    # Fall back on the first exon; a transcript without exons simply
    # leaves $sf undefined and is reported below
    my $first_exon = $tran->get_all_Exons->[0];
    if ($first_exon) {
      ($sf) = @{$first_exon->get_all_supporting_features};
    }
  }

  if (!$sf) {
    die 'No supporting feature found on the transcript or on its first exon, cannot determine the evidence strand';
  }

  return $sf->strand*$sf->hstrand;
}
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Superfamily_wormbase.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # 17 | # 18 | =pod 19 | 20 | =head1 NAME 21 | 22 | Bio::EnsEMBL::Pipeline::RunnableDB::ProteinAnnotation::Superfamily_wormbase 23 | 24 | =head1 SYNOPSIS 25 | 26 | my $seg = Bio::EnsEMBL::Pipeline::RunnableDB::ProteinAnnotation::Superfamily_wormase 27 | ->new ( -db => $db, 28 | -input_id => $input_id, 29 | -analysis => $analysis, 30 | ); 31 | $seg->fetch_input; # gets sequence from DB 32 | $seg->run; 33 | $seg->output; 34 | $seg->write_output; # writes features to to DB 35 | 36 | =head1 DESCRIPTION 37 | 38 | This object wraps Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Superfamily_wormbase 39 | to add functionality to read and write to databases in 40 | 41 | 42 | =head1 CONTACT 43 | 44 | =cut 45 | 46 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Superfamily_wormbase; 47 | 48 | use warnings ; 49 | use strict; 50 | use vars qw(@ISA); 51 | 52 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 53 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Superfamily_wormbase; 54 | 55 | @ISA = 
=head2 fetch_input

 Arg [1]    : None
 Description: Runs the parent class fetch_input, then creates a
              Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Superfamily_wormbase
              runnable from the query, the analysis, the analysis db_file
              (passed as -database) and every key/value pair found in
              parameters_hash, and registers it through runnable()
 Returntype : Whatever $self->runnable returns
 Exceptions : None raised directly by this method

=cut

sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input;

  # Explicit arguments first, entries from parameters_hash after them
  my @runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
    -database => $self->analysis->db_file,
    %{ $self->parameters_hash },
  );

  $self->runnable(
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Superfamily_wormbase->new(@runnable_args)
  );
}
#the usage is
# chunk_fasta_file.pl fasta_file output_dir chunk_size

use warnings ;
use strict;


my $filename = shift;
my $output_dir = shift;
my $chunk_size = shift;

if(!$filename || !$output_dir || !$chunk_size){
  print "usage chunk_fasta_file.pl fasta_file output_dir chunk_size";
  exit;
}

if($filename eq '-h' || $filename eq '-help'){
  print "usage chunk_fasta_file.pl fasta_file output_dir chunk_size";
  exit;
}

chunk_pepfile($filename, $output_dir, $chunk_size);

# Split a FASTA file into files holding at most $size entries each.
#
# Arg [1]    : String, path of the FASTA file to read
# Arg [2]    : String, directory the chunk files are written to
# Arg [3]    : Int, maximum number of entries per chunk file
# Description: Reads the input one '>'-terminated record at a time and
#              writes the records to "<output dir>/<fasta file>_chunk.<n>",
#              starting a new file, numbered from 1, every $size records.
#              The fasta file name is used verbatim in the chunk path.
# Returntype : None
# Exceptions : dies if the input file or a chunk file cannot be opened, or
#              if a chunk file cannot be closed cleanly
sub chunk_pepfile {
  my ($pepfile, $scratchdir, $size) = @_;

  #Chunk the peptide file
  open(my $pep_fh, '<', $pepfile) or die "couldn't open $pepfile $!";
  my $count = 0;
  my $chunk = 1;
  my $chunk_fh;

  # Read '>'-terminated records; local restores the normal line separator
  # when the sub returns, even if it dies
  local $/ = '>';

  while (my $record = <$pep_fh>) {
    # A file starting with '>' yields a first record made of the separator
    # alone, there is nothing to write for it
    next if ($record eq '>');

    if ($count == 0) {
      my $chunk_file = $scratchdir."/".$pepfile."_chunk.$chunk";
      open($chunk_fh, '>', $chunk_file)
        or die "couldn't open ".$chunk_file." $!";
    }

    # Drop the separator belonging to the next entry and put back the one
    # consumed in front of this entry
    $record =~ s/\>$//;
    print $chunk_fh ">$record";
    $count++;

    if ($count == $size) {
      # Buffered write errors only show up when the handle is closed
      close($chunk_fh) or die "couldn't close chunk $chunk of $pepfile $!";
      $count = 0;
      $chunk++;
    }
  }

  # The last chunk is still open when it holds fewer than $size entries
  if ($count) {
    close($chunk_fh) or die "couldn't close chunk $chunk of $pepfile $!";
  }
  close($pep_fh);

  return;
}
=head1 NAME

EPCR.pm

=head1 SYNOPSIS

  my $runnabledb = Bio::EnsEMBL::Analysis::RunnableDB::Finished::EPCR->
    new(
        -input_id => 'contig::AL805961.22.1.166258:1:166258:1',
        -db       => $db,
        -analysis => $analysis,
       );
  $runnabledb->fetch_input;
  $runnabledb->run;
  $runnabledb->write_output;


=head1 DESCRIPTION

The Finished version of EPCR.

=head1 CONTACT

anacode@sanger.ac.uk

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::Finished::EPCR;

use strict;
use warnings;

# throw() is called in fetch_input and functions are not inherited from the
# parent class, so it has to be imported into this package explicitly
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Analysis::RunnableDB::EPCR;
use Bio::EnsEMBL::Analysis::Runnable::Finished::EPCR;
use vars qw(@ISA);

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::EPCR);

=head2 fetch_input

 Arg [1]    : None
 Description: Works out where the STS data come from and creates the
              Bio::EnsEMBL::Analysis::Runnable::Finished::EPCR runnable.
              The '-STS_FILE' entry of parameters_hash is used when set,
              otherwise the analysis db_file when set. When neither gives
              a file, all markers are fetched from the database and passed
              as '-STS_FEATURES'. The slice returned by fetch_sequence is
              stored as the query and given to the runnable together with
              the analysis program_file
 Returntype : Whatever $self->runnable returns
 Exceptions : throws if no STS file is configured and the database holds
              no markers

=cut

sub fetch_input{
  my ($self) = @_;
  my %parameters = %{$self->parameters_hash};

  # An explicit -STS_FILE parameter wins over the analysis db_file
  if($self->analysis->db_file){
    $parameters{'-STS_FILE'} = $self->analysis->db_file
      unless($parameters{'-STS_FILE'});
  }

  # No file at all: fall back on the markers stored in the database
  if(!$parameters{'-STS_FILE'}){
    my $sts = $self->db->get_MarkerAdaptor->fetch_all;
    throw("No markers in ".$self->db->dbname) unless(@$sts);
    $parameters{'-STS_FEATURES'} = $sts;
  }

  my $slice = $self->fetch_sequence;
  $self->query($slice);
  my $runnable = Bio::EnsEMBL::Analysis::Runnable::Finished::EPCR->new
    (
     -query    => $slice,
     -program  => $self->analysis->program_file,
     -analysis => $self->analysis,
     %parameters
    );
  $self->runnable($runnable);
}

1;
=head1 NAME

Bio::EnsEMBL::Analysis::Runnable::DustMasker

=head1 SYNOPSIS


=head1 DESCRIPTION


=cut

package Bio::EnsEMBL::Analysis::Runnable::DustMasker;

use strict;
use warnings;

# throw() is called in run_analysis and functions are not inherited from the
# parent class, so it has to be imported into this package explicitly
use Bio::EnsEMBL::Utils::Exception qw(throw);

use parent qw(Bio::EnsEMBL::Analysis::Runnable::Dust);

=head2 run_analysis

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable::Dust
  Arg [2]   : string, program name, defaults to $self->program
  Function  : constructs a command line from the program, the level, the
              window size, the options, the query file and the results
              file, then runs it through the shell. The generic method in
              Runnable is not used as Dust does not fit this module
  Returntype: none
  Exceptions: throws if the program is not executable or if system does
              not return 0
  Example   :

=cut

sub run_analysis{
  my ($self, $program) = @_;
  if(!$program){
    $program = $self->program;
  }
  throw($program." is not executable Dust::run_analysis ")
    unless($program && -x $program);

  # Run the program that was just validated: the command used to be built
  # from $self->program, which silently ignored the argument
  my $command = $program;
  $command .= " -level ".$self->level if($self->level);
  $command .= " -window ".$self->window_size if($self->window_size);
  $command .= ' '.$self->options if ($self->options);
  $command .= " -in ".$self->queryfile." > ".$self->resultsfile;
  print "Running analysis ".$command."\n";
  system($command) == 0 or throw("FAILED to run ".$command);
}

1;
=head2 _master_config

 Arg [1]    : String $key, name of the wanted configuration section; the
              sections defined here are 'default', 'picard' and 'samtools'
 Description: Builds the static configuration and returns the section
              stored under $key
 Returntype : Hashref, undef when $key is not one of the sections
 Exceptions : None

=cut

sub _master_config {
  my ($self, $key) = @_;

  my %default_section = (
    # If 0, do not use multithreading, faster but can use more memory.
    # If > 0, tells how many cpu to use for samtools or just to use multiple cpus for picard
    use_threading => '#use_threads#',
  );

  my %picard_section = (
    java         => 'java',
    java_options => '-Xmx2g',
    # Path to MergeSamFiles.jar
    picard_lib   => '#picard_lib_jar#',
    # Use this default options for Picard: 'MAX_RECORDS_IN_RAM=20000000 CREATE_INDEX=true SORT_ORDER=coordinate ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT'
    # You will need to change the options if you want to use samtools for merging
    options      => 'MAX_RECORDS_IN_RAM=20000000 CREATE_INDEX=true SORT_ORDER=coordinate ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT',
  );

  my %samtools_section = (
    options => '',
  );

  my %section_for = (
    default  => \%default_section,
    picard   => \%picard_section,
    samtools => \%samtools_section,
  );

  return $section_for{$key};
}
package Bio::EnsEMBL::Analysis::Config::GeneBuild::OrthologueEvaluatorExonerate;

use strict;
use vars qw(%Config);

# Configuration values, keyed by the name of the variable that import()
# makes available in the calling package.
%Config = (

  EXONERATE_PROTEIN_CONF => {

    QUERYTYPE => 'protein',
    QUERYSEQS => "QUERYSEQ" ,
    IIDREGEXP => '(\d+):(\d+)',
    #
    # either you set this variable or you fill out
    # Bio::EnsEMBL::Analysis::Config::Databases
    #
    OUTDB => {
      -dbname => '',
      -host   => '',
      -port   => '3306',
      -user   => 'ensadmin',
      -pass   => '****',
    },
    COVERAGE_BY_ALIGNED => 0,
    OPTIONS => "--model protein2genome --forwardcoordinates FALSE ".
               "--softmasktarget TRUE --exhaustive FALSE --bestn 1",
  },

);


############################################################

=head2 import

 Arg [1]    : String, name of this package (supplied by "use"/"->import")
 Arg [2..n] : (optional) names of the %Config keys to export; when none
              are given every key of %Config is exported
 Description: Declares a package scalar of the same name in the calling
              package for each requested key and aliases it to the
              corresponding %Config value
 Returntype : None meaningful
 Exceptions : dies if the variable declaration fails or if a requested
              key has no defined value in %Config

=cut

sub import {
  my ($target_pkg) = caller(0);    # package that asked for the variables
  my ($this_pkg, @wanted) = @_;

  # No explicit list means "everything in %Config"
  @wanted = keys(%Config) unless @wanted;
  return unless @wanted;

  # Predeclare the scalars in the calling package so that it can use them
  # under "use strict"
  my $declarations = join(' ', map { '$'.$_ } @wanted);
  eval "package $target_pkg; use vars qw(" . $declarations . ")";
  die $@ if $@;

  for my $name (@wanted) {
    die "Error: Config: $name not known\n" unless defined $Config{$name};

    # Alias the caller's scalar to the config entry
    no strict 'refs';
    *{"${target_pkg}::$name"} = \$Config{$name};
  }
}

1;
=head2 create_row_data

 Arg [1]    : Parser object providing getHeader and getSequence
 Description: Takes the first run of non-blank characters of the header as
              the accession and builds the row to store for the sequence.
              The source is 'RefSeq' when the accession starts with 'NM'
              and 'INSDC' otherwise, the biotype is always 'mRNA' and the
              date is the current local date formatted as YYYY/MM/DD
 Returntype : Arrayref holding one hashref with the keys accession, seq,
              source_db, biotype and date
 Exceptions : None

=cut

sub create_row_data {
  my ($self, $parser) = @_;

  # Only the accession is kept from the header line
  my ($accession) = $parser->getHeader =~ /^\s*(\S+)/;

  my %row = (
    accession => $accession,
    seq       => $parser->getSequence,
    source_db => ($accession =~ /^NM/ ? 'RefSeq' : 'INSDC'),
    biotype   => 'mRNA',
    date      => strftime("%Y/%m/%d", localtime),
  );

  return [\%row];
}
| 1; 73 | 74 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Tools/GeneBuildUtils/HomologyUtils.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | =head1 NAME 17 | 18 | Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::HomologyUtils - utilities for gene objects 19 | 20 | =head1 SYNOPSIS 21 | 22 | use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::HomologyUtils qw(clone_Gene); 23 | 24 | or 25 | 26 | use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::HomologyUtils 27 | 28 | to get all methods 29 | 30 | =head1 DESCRIPTION 31 | 32 | All methods in this class should take a Bio::EnsEMBL::Compara::Homology 33 | object as their first argument. 
=head2 get_gene_obj_out_of_compara_homology_object

 Arg [1]    : Homology object providing gene_list
 Arg [2]    : String, genome db name of the species of interest
 Description: Goes through the members returned by gene_list and fetches
              the gene of every member whose genome_db name equals the
              species name. When several members match, the gene of the
              last one is returned
 Returntype : The object returned by get_Gene, undef when no homology is
              given or when no member matches
 Exceptions : None

=cut

sub get_gene_obj_out_of_compara_homology_object {
  my ($homology, $species) = @_;

  my $found_gene;
  return $found_gene unless $homology;

  foreach my $member (@{ $homology->gene_list }) {
    next unless $member->genome_db->name eq $species;
    # No early exit: a later matching member replaces an earlier one
    $found_gene = $member->get_Gene;
  }

  return $found_gene;
}
14 | 15 | ensembl_analysis_base="${HOME}/ensembl-src/ensembl-analysis" 16 | 17 | # Comment out to get a randomly generated output directory name 18 | # ("output.XXXX") in the current directory. 19 | output_dir='merge-refseq_ensembl-output' 20 | 21 | njobs='75' 22 | concurrent='20' 23 | 24 | rouser='ensro' 25 | ropassword='' 26 | 27 | rwuser='' 28 | rwpassword='' 29 | 30 | # host_secondary='genebuild8' 31 | # database_secondary='cgg_homo_sapiens_ensembl_74' 32 | host_secondary='' 33 | database_secondary='' 34 | 35 | # host_primary='ens-livemirror' 36 | # database_primary='homo_sapiens_otherfeatures_74_37' 37 | host_primary='' 38 | database_primary='' 39 | 40 | # Target database needs to exist but should be empty. 41 | # host_output='genebuild8' 42 | # database_output='ak4_refseq_ensembl_74' 43 | host_output='' 44 | database_output='' 45 | 46 | # Just comment out or leave empty if not applicable. 47 | # host_ccds='ens-livemirror' 48 | # database_ccds='ccds_human_74' 49 | host_ccds='' 50 | database_ccds='' 51 | 52 | # Filter options. You may specify either the X_include or the X_exclude 53 | # option, but not both. These filter on gene analysis logic names. 54 | # Any gene filtered out will not be found in the output database (i.e. 55 | # they will also be skipped by the post-processing script that copies 56 | # all unprocessed Secondary genes to the output database). 57 | 58 | secondary_include='' 59 | secondary_exclude='' 60 | 61 | primary_include='refseq_human_import' 62 | primary_exclude='' 63 | 64 | 65 | # Tagging: Will be used as suffix for logic names ("_tag") and for 66 | # source. 
67 | 68 | secondary_tag='ensembl' 69 | primary_tag='refseq' 70 | 71 | # Xrefs: The format is a comma-separated list of 72 | # "db_name,db_display_name,type" 73 | 74 | primary_gene_xref='RefSeq_import,RefSeq_import,MISC' 75 | primary_transcript_xref='RefSeq_mRNA,RefSeq mRNA,MISC' 76 | primary_translation_xref='RefSeq_peptide,RefSeq peptide,MISC' 77 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDBSeqFiles.pm: -------------------------------------------------------------------------------- 1 | =head1 LICENSE 2 | 3 | Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | =head1 CONTACT 19 | 20 | Please email comments or questions to the public Ensembl 21 | developers list at <http://lists.ensembl.org/mailman/listinfo/dev>. 22 | 23 | Questions may also be sent to the Ensembl help desk at 24 | <http://www.ensembl.org/Help/Contact>. 
=head1 NAME

Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveDBSeqFiles

=head1 SYNOPSIS

  my $query_seqs = $self->get_query_seqs(\@accessions);
  my $biotypes   = $self->get_biotype;

=head1 DESCRIPTION

Helper methods that read sequences from the table named by the
'sequence_table_name' parameter and turn them into Bio::Seq objects,
remembering the biotype stored alongside each accession.

=cut

package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveDBSeqFiles;

use strict;
use warnings;

use parent ('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveBaseRunnableDB');


=head2 get_query_seqs

 Arg [1]    : Arrayref of accessions; each one is passed to fetch_by_dbID
              on the naked table adaptor
 Description: Fetches the row of every accession, builds a Bio::Seq from its
              'seq' column and records its 'biotype' column. The collected
              accession => biotype hash is stored through get_biotype.
 Returntype : Arrayref of Bio::Seq, in the order of the accessions
 Exceptions : Throws if an accession has no row in the table

=cut

sub get_query_seqs {
  my ($self, $accession_array) = @_;

  my $adaptor = $self->db->get_NakedTableAdaptor();
  $adaptor->table_name($self->param('sequence_table_name'));

  my %biotype_of;
  my @sequences;
  for my $accession (@$accession_array) {
    my $row = $adaptor->fetch_by_dbID($accession);
    if (!$row) {
      $self->throw('Did not find an entry in the '.$self->param('sequence_table_name')." table matching the accession. Accession:\n".$accession);
    }

    $biotype_of{$accession} = $row->{'biotype'};
    push(@sequences, Bio::Seq->new(-display_id => $accession, -seq => $row->{'seq'}));
  }

  # Keep the biotypes around so later steps can look them up by accession
  $self->get_biotype(\%biotype_of);

  return \@sequences;
}


=head2 get_biotype

 Arg [1]    : (optional) Hashref, accession => biotype
 Description: Getter/setter for the biotype hash kept in the '_biotype_hash'
              parameter. A true argument replaces the stored value.
 Returntype : The value of the '_biotype_hash' parameter
 Exceptions : None raised here

=cut

sub get_biotype {
  my ($self, $biotype_hash) = @_;

  $self->param('_biotype_hash', $biotype_hash) if ($biotype_hash);

  return($self->param('_biotype_hash'));
}


1;
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use warnings;
use strict;
use feature 'say';
use Getopt::Long;
use Bio::EnsEMBL::Utils::Exception qw( throw warning verbose);

# Command line:
#   -input_file       FASTA file to read
#   -output_file      FASTA file to write
#   -conversion_type  only "slice_name_to_seq_region_name" is supported
#   -remove_masking   upper-case the sequence lines (negatable flag)
my $input_file;
my $output_file;
my $conversion_type;
my $remove_masking = 0;
GetOptions( 'input_file:s' => \$input_file,
            'output_file:s' => \$output_file,
            'conversion_type:s' => \$conversion_type,
            'remove_masking!' => \$remove_masking);

unless($input_file && $output_file && $conversion_type) {
  throw("You must specify both an input file, an output file and a conversion type");
}

unless(-e $input_file) {
  throw("The input file specified does not exist. Input file: ".$input_file);
}

if($conversion_type eq "slice_name_to_seq_region_name") {
  slice_name_to_seq_region_name($input_file,$output_file,$remove_masking);
} else {
  throw("The conversion type you selected is not supported. Conversion type selected: ".$conversion_type);
}

exit;

# Copy a FASTA file, rewriting every header line to ">" followed by the
# third colon-separated field of the original header (the seq region name
# of a slice name). Sequence lines are copied unchanged, or upper-cased
# when $remove_masking is true.
#
#   $input_file      path of the FASTA file to read
#   $output_file     path of the FASTA file to write (overwritten)
#   $remove_masking  true to upper-case sequence lines
#
# Throws if either file cannot be opened, if a header does not contain a
# third colon-separated field followed by a colon, or if the output cannot
# be flushed on close.
sub slice_name_to_seq_region_name {
  my ($input_file,$output_file,$remove_masking) = @_;

  open(my $in_fh, '<', $input_file)
    or throw("Could not open input file for reading. Input file: ".$input_file." (".$!.")");
  open(my $out_fh, '>', $output_file)
    or throw("Could not open output file for writing. Output file: ".$output_file." (".$!.")");

  while(my $line = <$in_fh>) {
    if($line =~ /^>/) {
      unless($line =~ /[^\:]+\:[^\:]+\:([^\:]+)\:/) {
        throw("Failed to parse the header line. Expected to find a seq region name after the second colon in header. Header used: ".$line);
      }
      say {$out_fh} ">".$1;
    } else {
      if($remove_masking) {
        $line = uc($line);
      }
      print {$out_fh} $line;
    }
  }

  # Buffered write errors only surface at close, so this one must be checked
  close($out_fh)
    or throw("Could not close output file. Output file: ".$output_file." (".$!.")");
  close($in_fh);
}
Recreates the marker_feature table with weights set 18 | 19 | use warnings ; 20 | use strict; 21 | use DBI; 22 | 23 | use Getopt::Long qw(:config no_ignore_case); 24 | 25 | my ( $host, $user, $pass, $port, $dbname ); 26 | my $verbose = 0; 27 | $port = 3306; 28 | 29 | GetOptions( "host|dbhost|h=s", \$host, 30 | "user|dbuser|u=s", \$user, 31 | "pass|dbpass|p=s", \$pass, 32 | "port|dbport|P=i", \$port, 33 | "dbname|db|D=s", \$dbname, 34 | "verbose", \$verbose 35 | ); 36 | 37 | if( !$host ) { 38 | usage(); 39 | } 40 | 41 | 42 | 43 | my $dsn = "DBI:mysql:host=$host;dbname=$dbname"; 44 | if( $port ) { 45 | $dsn .= ";port=$port"; 46 | } 47 | 48 | my $db = DBI->connect( $dsn, $user, $pass ); 49 | 50 | 51 | $db->do( " 52 | CREATE TABLE tmp_m_weight 53 | SELECT marker_id, count(*) as count 54 | FROM marker_feature 55 | GROUP BY marker_id 56 | " ); 57 | 58 | $db->do( " 59 | CREATE TABLE new_marker_feature 60 | SELECT mf.marker_feature_id, mf.marker_id, mf.seq_region_id, mf.seq_region_start, 61 | mf.seq_region_end, mf.analysis_id, tmw.count 62 | FROM marker_feature mf, tmp_m_weight tmw 63 | WHERE mf.marker_id = tmw.marker_id 64 | " ); 65 | 66 | $db->do( "delete from marker_feature" ); 67 | $db->do( "insert into marker_feature select * from new_marker_feature" ); 68 | $db->do( "drop table tmp_m_weight" ); 69 | $db->do( "drop table new_marker_feature" ); 70 | 71 | sub usage { 72 | print < { 23 | DEFAULT => { 24 | # directory containg the sam file(s) 25 | SAM_DIR => '/path/to/directory', 26 | 27 | # path to the bam file to produce as output 28 | BAMFILE => '/path/to/my/SAM/file/introns.sam', 29 | 30 | # regex to identify which SAM files to merge 31 | REGEX => '.sam', 32 | 33 | # file containing all the readgroup headers used in the alignments (optional) 34 | HEADERFILE => '/path/to/my/header/file/headers.txt', 35 | 36 | # path to dumped genome file used for the alignment 37 | # it will make an index for it if one does not already exist 38 | GENOMEFILE => 
'/path/to/my/genome/file.fa', 39 | }, 40 | } 41 | ); 42 | 43 | sub import { 44 | my ($callpack) = caller(0); # Name of the calling package 45 | my $pack = shift; # Need to move package off @_ 46 | 47 | # Get list of variables supplied, or else everything 48 | my @vars = @_ ? @_ : keys( %Config ); 49 | return unless @vars; 50 | 51 | # Predeclare global variables in calling package 52 | eval "package $callpack; use vars qw(" 53 | . join(' ', map { '$'.$_ } @vars) . ")"; 54 | die $@ if $@; 55 | 56 | 57 | foreach (@vars) { 58 | if ( defined $Config{$_} ) { 59 | no strict 'refs'; 60 | # Exporter does a similar job to the following 61 | # statement, but for function names, not 62 | # scalar variables: 63 | *{"${callpack}::$_"} = \$Config{ $_ }; 64 | } else { 65 | die "Error: Config: $_ not known\n"; 66 | } 67 | } 68 | } 69 | 70 | 1; 71 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Config/AddStableIds.pm.example: -------------------------------------------------------------------------------- 1 | =head1 LICENSE 2 | 3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
=head1 NAME

Bio::EnsEMBL::Analysis::Config::AddStableIds

=head1 SYNOPSIS

    use Bio::EnsEMBL::Analysis::Config::AddStableIds;

=head1 DESCRIPTION

Configuration module. Using it makes every top-level key of %Config
available in the calling package as a scalar of the same name (here
$ADD_STABLEIDS_BY_LOGIC). A list of key names may be given on the use
line to import only those.

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.

  Questions may also be sent to the Ensembl help desk at
  <http://www.ensembl.org/Help/Contact>.

=cut


package Bio::EnsEMBL::Analysis::Config::AddStableIds;

use strict;
use vars qw( %Config );

# Hash containing config info
%Config = (
  ADD_STABLEIDS_BY_LOGIC => {
    DEFAULT => {

      GENES_DB => 'ROUGHDB',

      PREFIX => undef,
      LOGIC_NAME => undef,
    },

  }
);

# Called by "use": aliases the requested %Config entries (all of them when
# no names are given) to package scalars in the caller. Dies on a name that
# has no defined entry in %Config.
sub import {
  my ($callpack) = caller(0);    # package that said "use"
  my ( $pack, @requested ) = @_; # first argument is this package's name

  my @vars = @requested ? @requested : keys(%Config);
  return unless @vars;

  # Predeclare the scalars in the caller so they pass "use strict" there
  my $sigiled = join( ' ', map { '$' . $_ } @vars );
  eval "package $callpack; use vars qw(" . $sigiled . ")";
  die $@ if $@;

  for (@vars) {
    die "Error: Config: $_ not known\n" unless defined $Config{$_};

    no strict 'refs';
    # Glob assignment of a scalar ref: the caller's $NAME becomes an
    # alias of $Config{NAME}, not a copy
    *{"${callpack}::$_"} = \$Config{$_};
  }
}

1;
species.classification Papilionoidea 34 | 41 1 species.classification Obtectomera 35 | 42 1 species.classification Ditrysia 36 | 43 1 species.classification Heteroneura 37 | 44 1 species.classification Neolepidoptera 38 | 45 1 species.classification Glossata 39 | 46 1 species.classification Lepidoptera 40 | 47 1 species.classification Amphiesmenoptera 41 | 48 1 species.classification Holometabola 42 | 49 1 species.classification Neoptera 43 | 50 1 species.classification Pterygota 44 | 51 1 species.classification Dicondylia 45 | 52 1 species.classification Insecta 46 | 53 1 species.classification Hexapoda 47 | 54 1 species.classification Pancrustacea 48 | 55 1 species.classification Mandibulata 49 | 56 1 species.classification Arthropoda 50 | 57 1 species.classification Panarthropoda 51 | 58 1 species.classification Ecdysozoa 52 | 59 1 species.classification Protostomia 53 | 60 1 species.classification Bilateria 54 | 61 1 species.classification Eumetazoa 55 | 62 1 species.classification Metazoa 56 | 63 1 species.classification Opisthokonta 57 | 64 1 species.classification Eukaryota 58 | 65 1 genebuild.level toplevel 59 | 66 1 transcriptbuild.level toplevel 60 | 67 1 exonbuild.level toplevel 61 | 68 1 assembly.mapping primary_assembly:ilParAegt1.1|contig 62 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Runnable/ProteinAnnotation/PrositeProfile.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


package Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile;
use warnings ;
use vars qw(@ISA);
use strict;

# Object preamble - inherits from
# Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation


use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation;

@ISA = qw(Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation);


# Always returns 0.
# NOTE(review): presumably tells the parent class that the program is run on
# one protein at a time - confirm against Runnable::ProteinAnnotation.
sub multiprotein{
  my ($self) = @_;
  return 0;
}


# Runs "<program> -f <queryfile> <database> > <resultsfile>" through the
# shell (the redirection needs one) and throws if it does not exit with 0.
sub run_analysis {
  my ($self) = @_;

  my $command = $self->program . ' -f ' . $self->queryfile. ' ' .
                $self->database . ' > ' .$self->resultsfile;

  my $status = system($command);
  return if ($status == 0);

  # $! is only meaningful when the command could not be started at all
  # (system returned -1); otherwise the exit status is in the high byte of $?
  if ($status == -1) {
    throw("Failed during Profile run, could not execute '$command': $!\n");
  }
  throw("Failed during Profile run, '$command' exited with status ".($? >> 8)."\n");
}


# Parses the results file and stores one protein feature per hit line via
# $self->output.
#
#   $seqid  identifier passed through to create_protein_feature
#
# An existing but empty results file returns without touching the output.
# A missing results file produces a warning and an empty output list.
# Throws if the results file exists but cannot be opened.
sub parse_results {
  my ($self,$seqid) = @_;

  my $resfile = $self->resultsfile;
  my (@pfs);

  if (-e $resfile) {
    if (-z $resfile) {
      return;
    }

    open (my $fh, '<', $resfile) or throw("Error opening $resfile: $!");

    while (my $line = <$fh>) {
      # Captures, in order: score, raw score, start, end, accession
      if ($line =~ /^\s*(\S+)\s+(\d+)\s*pos\.\s+(\d+)\s+\-\s+(\d+)\s+(\w+)\|/) {
        my ($sc, $st, $en, $acc) = ($1, $3, $4, $5);
        my $fp = $self->create_protein_feature($st,
                                               $en,
                                               $sc,
                                               $seqid,
                                               0, 0,
                                               $acc,
                                               $self->analysis,
                                               0, 0);
        push @pfs, $fp;
      }
    }

    close ($fh);
  } else {
    warning("Results file $resfile does not exist, no features parsed");
  }

  $self->output(\@pfs);
}


1;
=head1 NAME

Bio::EnsEMBL::Analysis::Runnable::SamtoolsMerge - merge BAM files with samtools

=head1 SYNOPSIS


=head1 DESCRIPTION

Merge BAM files using samtools

=head1 APPENDIX

The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _

=cut

package Bio::EnsEMBL::Analysis::Runnable::SamtoolsMerge;

use warnings;
use strict;

use parent ('Bio::EnsEMBL::Analysis::Runnable::BaseBamMerge');

# throw() is called as a plain function in new(), so it has to be imported
# here; "use parent" only sets up inheritance, it does not import functions
use Bio::EnsEMBL::Utils::Exception qw(throw);


=head2 new

 Arg [1]    : Constructor arguments, passed unchanged to the parent class
 Description: Builds the object, hands the configured program to the
              samtools accessor and, when the options contain "-b <file>",
              checks that <file> exists
 Returntype : Bio::EnsEMBL::Analysis::Runnable::SamtoolsMerge
 Exceptions : Throws if the file given with -b does not exist

=cut

sub new {
  my ($class,@args) = @_;
  my $self = $class->SUPER::new(@args);

  $self->samtools($self->program);

  my $options = $self->options;
  if (defined($options) and $options =~ /-b\s+(\S+)/) {
    # Copy the capture straight away, $1 does not survive the next match
    my $bam_list_file = $1;
    throw('Could not access file containing BAM files '.$bam_list_file) unless (-e $bam_list_file);
  }

  return $self;
}



############################################################
#
# Analysis methods
#
############################################################

=head2 run

 Arg [1]    : None
 Description: Merge the input BAM files into the output file using samtools,
              then run check_output_file. A single input file is passed to
              merge as a plain value instead of an array reference.
 Returntype : Integer, 1
 Exceptions : None raised here

=cut

sub run {
  my ($self) = @_;

  my $input_files = $self->input_files;
  $input_files = $input_files->[0] if (scalar(@{$input_files}) == 1);

  $self->samtools->merge($self->options, $self->output_file, $input_files);
  $self->check_output_file;

  return 1;
}

1;
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

delete_big_dir.pl

=head1 DESCRIPTION

This script takes a single option, -dir, the path to a dir to remove, and then removes all
files and subdirs before finally removing the dir itself. This is designed to remove very
complex subdir structures or dirs with a very large number of files in them. Perl is much
faster at this kind of task than rm or rsync to an empty dir

=cut

use Cwd;
use File::Find;
use File::Spec;
use warnings;
use strict;
use feature 'say';
use Getopt::Long;

my $full_master_dir_path;
GetOptions('dir:s' => \$full_master_dir_path);

unless($full_master_dir_path) {
  die "No agruments entered. You need to pass in the name of the dir in the current directory to delete";
}

$full_master_dir_path = File::Spec->rel2abs($full_master_dir_path);

unless(-d $full_master_dir_path) {
  die "The argument you entered is not a dir. Argument entered: ".$full_master_dir_path;
}

say "The full path for the dir to be deleted is:\n".$full_master_dir_path;

say "Getting subdir list...";
# Collect the master dir and every real directory below it. File::Find is
# used instead of an external "find" so the path never goes through a shell.
# Symlinks to directories are not listed and not followed.
my @subdirs;
find({ no_chdir => 1,
       wanted   => sub {
         my $path = $File::Find::name;
         push(@subdirs, $path) if (-d $path && !-l $path);
       },
     }, $full_master_dir_path);

say "Found ".(scalar(@subdirs) - 1)." subdirs";

# Give the user a last chance to abort
for (my $i=5; $i>0; $i--) {
  say "Beginning file deletion in ".$i."...";
  sleep(1);
}

print "\n";

foreach my $dir (@subdirs) {
  say "Removing files from:\n".$dir;
  opendir(my $dh, $dir) or die "Could not open dir for reading: ".$dir." (".$!.")";
  while(defined(my $entry = readdir($dh))) {
    next if ($entry eq '.' || $entry eq '..');
    my $file = $dir.'/'.$entry;

    # Plain prefix comparison: the path may contain regex metacharacters
    unless(index($file, $full_master_dir_path) == 0) {
      die "Potential issue with file path, path didn't match to the master dir path. Path found:\n".$file;
    }

    # Real subdirs are emptied in their own iteration and removed at the end
    next if (-d $file && !-l $file);

    # Best effort: whatever cannot be unlinked here is left for the final rm
    unlink($file) or warn "Could not remove file: ".$file." (".$!.")\n";
  }
  closedir($dh);
}

say "\nFinished removing files. Now removing empty dirs...";
# List form, so the path is passed to rm as one argument without a shell
my $result = system('rm', '-r', '--', $full_master_dir_path);
if($result) {
  die "Could not remove the master dir, something potentially went wrong with the deletion!";
}

exit;
use strict;
use warnings;

use Test::More;

use Bio::EnsEMBL::Analysis::Tools::Filter;

# Constructor arguments used for the last group of checks
my %params = (
  -coverage => 50,
  -percent_id => 87,
  -reject_processed_pseudos => 1,
  -best_in_genome => 1,
  -verbosity => 2,
);

# Accessor name => value written through the accessor
my @set_by_accessor = (
  [min_coverage => 90],
  [min_percent => 97],
  [reject_processed_pseudos => 1],
  [best_in_genome => 1],
  [verbosity => 1],
);

# Accessor name => value expected after constructing with %params
my @set_by_constructor = (
  [min_coverage => 50],
  [min_percent => 87],
  [reject_processed_pseudos => 1],
  [best_in_genome => 1],
  [verbosity => 2],
);

# Defaults of a filter built without arguments
my $filter = new_ok('Bio::EnsEMBL::Analysis::Tools::Filter');
ok(!defined($filter->min_coverage), 'Checking default min_coverage');
ok(!defined($filter->min_percent), 'Checking default min_percent');
foreach my $method (qw(reject_processed_pseudos best_in_genome verbosity)) {
  # cmp_ok keeps the numeric comparison and reports got/expected on failure
  cmp_ok($filter->$method, '==', 0, 'Checking default '.$method);
}

# Set everything first, then read everything back
foreach my $pair (@set_by_accessor) {
  my ($method, $value) = @$pair;
  $filter->$method($value);
}
foreach my $pair (@set_by_accessor) {
  my ($method, $value) = @$pair;
  cmp_ok($filter->$method, '==', $value, 'Checking '.$method.' set through the accessor');
}

# Same accessors, values coming from the constructor
$filter = Bio::EnsEMBL::Analysis::Tools::Filter->new(%params);
foreach my $pair (@set_by_constructor) {
  my ($method, $value) = @$pair;
  cmp_ok($filter->$method, '==', $value, 'Checking '.$method.' set through the constructor');
}

# filter_results must reject a missing argument and must be overridden
eval{
  $filter->filter_results;
};
like($@, qr/You should give an arrayref of objects/, 'Checking fails on empty');
eval{
  $filter->filter_results([]);
};
like($@, qr/You should implement the filter_results method/, 'Checking fails on not implemented');
done_testing();
-------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/BuildChecks.pm.example: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
package Bio::EnsEMBL::Analysis::Config::GeneBuild::BuildChecks;

use strict;
use vars qw( %GeneBuilder );

# Hash containing config info
%GeneBuilder = (
  # introns smaller than this could be real due to frameshifts
  MINSHORTINTRONLEN => 7,

  # introns smaller than this are considered too short
  MAXSHORTINTRONLEN => 10,

  # introns longer than this are too long
  MINLONGINTRONLEN => 100000,

  # exons smaller than this could be real due to frameshifts
  MINSHORTEXONLEN => 3,

  # exons shorter than this are too short
  MAXSHORTEXONLEN => 10,

  # exons longer than this are probably too long
  MINLONGEXONLEN => 5000,

  MINTRANSLATIONLEN => 10,

  MAX_EXONSTRANSCRIPT => 150,

  MAXTRANSCRIPTS => 10,
  MAXGENELEN => 2_000_000,

  IGNOREWARNINGS => 1,

);

# Called by "use": makes the requested %GeneBuilder entries (or all of them
# when the use line names none) visible in the calling package as scalars of
# the same name. Dies when asked for a name without a defined entry.
sub import {
  my $pack = shift;            # our own package name, not needed further
  my ($callpack) = caller(0);  # the package that is importing

  my @vars = @_;
  @vars = keys( %GeneBuilder ) unless @vars;
  return unless @vars;

  # The caller runs under "use strict", so declare the scalars there first
  my $predeclared = join(' ', map { '$'.$_ } @vars);
  eval "package $callpack; use vars qw(" . $predeclared . ")";
  die $@ if $@;

  foreach (@vars) {
    unless ( defined $GeneBuilder{ $_ } ) {
      die "Error: GeneBuilder: $_ not known\n";
    }

    no strict 'refs';
    # Assigning a scalar ref to the glob aliases the caller's $NAME to the
    # hash entry itself, so both sides see later changes
    *{"${callpack}::$_"} = \$GeneBuilder{ $_ };
  }
}

1;
41 | # (The "essential tables" need to be populated) 42 | # host_output='genebuild8' 43 | # database_output='ak4_havana_ensembl_74' 44 | host_output='' 45 | database_output='' 46 | 47 | # Just comment out or leave empty if not applicable. 48 | # host_ccds='ens-livemirror' 49 | # database_ccds='ccds_human_74' 50 | host_ccds='' 51 | database_ccds='' 52 | 53 | # Filter options. You may specify either the X_include or the X_exclude 54 | # option, but not both. These filter on gene analysis logic names. 55 | # Any gene filtered out will not be found in the output database (i.e. 56 | # they will also be skipped by the post-processing script that copies 57 | # all unprocessed Secondary genes to the output database). 58 | 59 | ## These are the defaults: 60 | # secondary_include='' 61 | # secondary_exclude='' 62 | # 63 | # primary_include='' 64 | # primary_exclude='' 65 | 66 | # Tagging: Will be used as suffix for logic names ("_tag") and for 67 | # source. With the default settings, merged genes and transcripts will 68 | # get the source "secondary_primary". 
69 | 70 | ## These are the defaults: 71 | # secondary_tag='ensembl' 72 | # primary_tag='havana' 73 | 74 | # Xrefs: The format is a comma-separated list of 75 | # "db_name,db_display_name,type" 76 | 77 | ## These are the defaults: 78 | # primary_gene_xref='OTTG,Havana gene,ALT_GENE' 79 | # primary_transcript_xref='OTTT,Havana transcript,ALT_TRANS' 80 | # primary_translation_xref='OTTP,Havana translation,MISC' 81 | -------------------------------------------------------------------------------- /scripts/genebuild/sncrna/dump_prefilter_features.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 

use strict;
use warnings;

use File::Spec::Functions qw(catfile);
use Bio::EnsEMBL::DBSQL::DBAdaptor;

# Dump two tab-separated, BED-like files into <working_dir>:
#   <logic_name>_dafs.bed  - one line per DNA align feature fetched by logic name
#   identified_mirnas.bed  - one line per gene fetched with biotype 'miRNA'
#
# Usage:
#   dump_prefilter_features.pl <dbname> <dbhost> <dbport> <dbuser> <working_dir> <logic_name>

# Fail early with a usage message; with fewer arguments the script would
# only fail later on undefined values.
die "Usage: $0 <dbname> <dbhost> <dbport> <dbuser> <working_dir> <logic_name>\n"
  unless @ARGV >= 6;

my ($dbname, $dbhost, $dbport, $dbuser, $working_dir, $logic_name) = @ARGV;

my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
  -DBNAME => $dbname,
  -HOST   => $dbhost,
  -PORT   => $dbport,
  -USER   => $dbuser,
  -DRIVER => 'mysql',
);

my $daf_adaptor = $db->get_DnaAlignFeatureAdaptor();

my $fn = catfile($working_dir, $logic_name.'_dafs.bed');

# Lexical handle instead of the bareword FH; $! says why the open failed.
open(my $daf_fh, '>', $fn) or die "Could not write to $fn: $!";

foreach my $daf (@{$daf_adaptor->fetch_all_by_logic_name($logic_name)}) {
  my $strand = $daf->strand() > 0 ? "+" : "-";

  # Columns: region, start, end, "region:start-end", score, strand,
  # hit name, p-value, percent identity, cigar string
  print {$daf_fh} $daf->seq_region_name(), "\t",
    $daf->seq_region_start(), "\t",
    $daf->seq_region_end(), "\t",
    $daf->seq_region_name(), ":",
    $daf->seq_region_start(), "-",
    $daf->seq_region_end(), "\t",
    $daf->score(), "\t",
    $strand, "\t",
    $daf->hseqname(), "\t",
    $daf->p_value(), "\t",
    $daf->percent_id(), "\t",
    $daf->cigar_string(), "\n";
}

# Buffered write errors only surface at close, so keep checking it.
close($daf_fh) or die("Could not close $fn: $!");

# dump putative stem-loops
my $gene_adaptor = $db->get_GeneAdaptor();

$fn = catfile($working_dir, 'identified_mirnas.bed');

open(my $mirna_fh, '>', $fn) or die "Could not write to $fn: $!";

foreach my $gene (@{$gene_adaptor->fetch_all_by_biotype('miRNA')}) {
  my $strand = $gene->strand() > 0 ? "+" : "-";

  # Columns: region, start, end, "region:start-end", fixed score 0,
  # strand, gene dbID
  print {$mirna_fh} $gene->seq_region_name(), "\t",
    $gene->seq_region_start(), "\t",
    $gene->seq_region_end(), "\t",
    $gene->seq_region_name(), ":",
    $gene->seq_region_start(), "-",
    $gene->seq_region_end(), "\t0\t",
    $strand, "\t",
    $gene->dbID(), "\n";
}

close($mirna_fh) or die("Could not close $fn: $!");

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Accumulator.pm:
--------------------------------------------------------------------------------
=head1 LICENSE

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at .

  Questions may also be sent to the Ensembl help desk at
  .

=cut

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::Accumulator - placeholder RunnableDB that performs no analysis

=head1 SYNOPSIS

  my $accumulator = Bio::EnsEMBL::Analysis::RunnableDB::Accumulator->
  new(
      -input_id => 'ACCUMULATOR',
      -db => $db,
      -analysis => $analysis,
     );
  $accumulator->fetch_input;
  $accumulator->run;
  $accumulator->write_output;

=head1 DESCRIPTION

This is a simple placeholder module that lets an accumulator analysis wait
for all stages in the pipeline to finish. It does no work of its own: it only
checks that an input id is present and prints a message from run and
write_output.

=head1 METHODS

=cut


package Bio::EnsEMBL::Analysis::RunnableDB::Accumulator;

use warnings ;
use strict;

use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning);

use vars qw(@ISA);

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB);

=head2 fetch_input

  Title   : fetch_input
  Usage   : $self->fetch_input
  Function: Dummy method to comply with the interface; only checks
            that an input id has been set
  Returns : 1
  Args    : none
  Throws  : if the input id is undefined

=cut

sub fetch_input {
  my $self = shift;

  if (!defined($self->input_id)) {
    throw("No input id");
  }

  return 1;
}

=head2 run

  Title   : run
  Usage   : $self->run
  Function: Dummy method; prints a message and runs nothing
  Returns : the result of the print call
  Args    : none

=cut

sub run {
  my $self = shift;

  # print stays the last statement: its result is what the sub returns
  print "Dummy RunnableDB - no runnable to run\n";
}

=head2 write_output

  Title   : write_output
  Usage   : $self->write_output
  Function: Dummy method; prints a message and writes nothing
  Returns : 1
  Args    : none

=cut

sub write_output {
  my $self = shift;

  print "Dummy RunnableDB - no output to write\n";

  return 1;
}

1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Tools/AllExonOverlapFilter.pm:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
package Bio::EnsEMBL::Analysis::Tools::AllExonOverlapFilter;

use strict;
use warnings;

use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );



# Constructor. Takes no arguments and throws if any are supplied.
sub new {
  my $class = shift;

  throw("AllExonOverlapFilter should have no args in new") if @_;

  return bless {}, $class;
}

#####################################
# filter($these, $others)
#
# Returns an array ref with the members of @$these that have no exon
# overlapping, on the same strand, an exon of any member of @$others.
#
# Interference is judged at exon level, but exons are only compared for
# pairs of objects whose overall start/end ranges overlap.
# The scan assumes @$others is sorted by start. The resume index also looks
# like it relies on @$these being in ascending start order - TODO confirm
# with callers.
sub filter {
  my ($self, $these, $others) = @_;

  my @kept;
  my $resume_at = 0;

  foreach my $candidate (@$these) {
    my @span_hits;
    my $first_reaching;

    foreach my $idx ($resume_at .. $#{$others}) {
      my $other = $others->[$idx];

      # lies entirely before the candidate
      next if $other->end < $candidate->start;

      # first entry that reaches the candidate: the next scan resumes here
      $first_reaching = $idx unless defined $first_reaching;

      # starts beyond the candidate: stop scanning
      last if $other->start > $candidate->end;

      push @span_hits, $other;
    }

    $resume_at = $first_reaching if defined $first_reaching;

    my $clashes = 0;
    if (@span_hits) {
      my @own_exons = @{$candidate->get_all_Exons};
      HIT: foreach my $hit (@span_hits) {
        foreach my $hit_exon (@{$hit->get_all_Exons}) {
          foreach my $own_exon (@own_exons) {
            next if $hit_exon->strand != $own_exon->strand;
            next if $hit_exon->end < $own_exon->start;
            next if $hit_exon->start > $own_exon->end;

            # one same-strand exon overlap is enough to reject the candidate
            $clashes = 1;
            last HIT;
          }
        }
      }
    }

    push @kept, $candidate unless $clashes;
  }

  return \@kept;
}
1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/S3Config.pm.example:
--------------------------------------------------------------------------------

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

Bio::EnsEMBL::Analysis::Config::General

=head1 SYNOPSIS

    use Bio::EnsEMBL::Analysis::Config::General;
    use Bio::EnsEMBL::Analysis::Config::General qw();

=head1 DESCRIPTION

General analysis configuration.

It imports and sets a number of standard global variables into the
calling package. Without arguments all the standard variables are set,
and with a list, only those variables whose names are provided are set.
The module will die if a variable which doesn't appear in its
C<%Config> hash is asked to be set.

The variables can also be references to arrays or hashes.

Edit C<%Config> to add or alter variables.

All the variables are in capitals, so that they resemble environment
variables.

=head1 CONTACT

B

=cut

package Bio::EnsEMBL::Analysis::Config::S3Config;

use strict;
use vars qw(%Config);

# The single setting is read from the S3_CONFIG_FILE environment variable
# at the time this module is loaded.
%Config = (

  S3_CONFIG_FILE => "$ENV{S3_CONFIG_FILE}",

);



# import: called by "use Bio::EnsEMBL::Analysis::Config::S3Config [LIST]".
# Creates a package scalar in the calling package for every requested
# %Config key (all keys when no list is given), aliased to the hash value.
# Dies if a requested name has no defined value in %Config.
sub import {
  my ($callpack) = caller(0);  # Name of the calling package
  my $pack = shift;            # Need to move package off @_

  # Use the supplied list of variables, or else all of them
  my @wanted = @_;
  @wanted = keys(%Config) unless @wanted;
  return unless @wanted;

  # Predeclare global variables in calling package
  my $declaration = join(' ', map { '$'.$_ } @wanted);
  eval "package $callpack; use vars qw($declaration)";
  die $@ if $@;

  foreach my $name (@wanted) {
    die "Error: Config: $name not known\n" unless defined $Config{ $name };

    no strict 'refs';
    # Exporter does a similar job to the following
    # statement, but for function names, not
    # scalar variables:
    *{"${callpack}::$name"} = \$Config{ $name };
  }
}

1;
--------------------------------------------------------------------------------
/scripts/markers/marker_match.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# get a complete but non-redundant set of marker definitions
#
# Usage: marker_match.pl <marker_file_1> <marker_file_2>
#
# Both inputs are tab-separated with the columns
#   display_id, left primer, right primer, distance, name(s), junk, accession(s), species
# Lines are grouped by the first column. For each id one line is printed to
# STDOUT with the names and accessions of all its lines merged into
# ';'-separated unique lists; the other columns come from the last line
# read for that id.

use warnings ;
use strict;

my $infile1 = shift;
my $infile2 = shift;

die "Usage: $0 <marker_file_1> <marker_file_2>\n"
  unless defined $infile1 and defined $infile2;

my %marker = ();

# store file 1, then add file 2
foreach my $infile ($infile1, $infile2) {
  # 3-arg open on a lexical handle; the message names the file that
  # actually failed (the second open used to report $infile1)
  open(my $in, '<', $infile) or die "cant open file $infile: $!\n";

  while (my $line = <$in>) {
    chomp $line;
    next if $line =~ /^\s*$/;    # blank lines carry no marker

    my ($id) = split("\t", $line);
    push @{$marker{$id}}, $line;
  }

  close($in);
}

# combine them
foreach my $id (sort keys %marker) {
  my ( %names, %accs );
  my ($display_id, $lprim, $rprim, $dist, $name, $junk, $acc, $species);

  for my $l (@{$marker{$id}}) {
    ($display_id, $lprim, $rprim, $dist, $name, $junk, $acc, $species) = split /\t/, $l;

    # getting name unique
    # NOTE(review): any value containing '-' is skipped, not only the bare
    # '-' placeholder - confirm that hyphenated names should be dropped.
    unless ($name =~ m/-/) {
      if ($name =~ m/;/) {
        my @na = split /\;/, $name;
        @names{@na} = ();
      }
      else {
        $names{$name} = ();
      }
    }

    # getting acc unique
    unless ($acc =~ m/-/) {
      if ($acc =~ m/;/) {
        my @ac = split /\;/, $acc;
        @accs{@ac} = ();
      }
      else {
        $accs{$acc} = ();
      }
    }
  }

  # An empty set is written as a single '-' so the line keeps its eight
  # columns (the old "\t-\t" added two extra empty columns). Keys are
  # sorted so repeated runs give identical output.
  my $name_col = %names ? join(";", sort keys %names) : "-";
  my $acc_col  = %accs  ? join(";", sort keys %accs)  : "-";

  print join("\t", $display_id, $lprim, $rprim, $dist,
                   $name_col, $junk, $acc_col, $species), "\n";
}


__END__

87 AAAAACACAAGTTTCATACATCACA AATGTAACTGTACCCTTCTGCATG - D9S1986 - G07334;Z39132 Mus musculus
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Seg.pm:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=pod

=head1 NAME

  Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Seg

=head1 SYNOPSIS

  my $seg = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Seg->new ( -db => $db,
            -input_id => $input_id,
            -analysis => $analysis,
                                                                      );
  $seg->fetch_input;  # gets sequence from DB
  $seg->run;
  $seg->output;
  $seg->write_output; # writes features to to DB

 NB: The input_id can either be a peptide id or the location for a protein file.

=head1 DESCRIPTION

  This object wraps Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Seg
  to add functionality to read and write to databases.
  The query sequence is provided through the input_id.
40 | The appropriate Bio::EnsEMBL::Analysis object 41 | must be passed for extraction of parameters. 42 | 43 | =head1 CONTACT 44 | 45 | Marc Sohrmann: ms2@sanger.ac.uk 46 | 47 | =head1 APPENDIX 48 | 49 | The rest of the documentation details each of the object methods. 50 | Internal methods are usually preceded with a _. 51 | 52 | =cut 53 | 54 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Seg; 55 | 56 | use warnings ; 57 | use strict; 58 | use vars qw(@ISA); 59 | 60 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation; 61 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Seg; 62 | 63 | @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation); 64 | 65 | sub fetch_input { 66 | my ($self, @args) = @_; 67 | 68 | $self->SUPER::fetch_input(@args); 69 | 70 | my $run = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Seg->new(-query => $self->query, 71 | -analysis => $self->analysis); 72 | $self->runnable($run); 73 | } 74 | 75 | 76 | 1; 77 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/RunnableDB/Snap.pm: -------------------------------------------------------------------------------- 1 | =head1 LICENSE 2 | 3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.


=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at .

  Questions may also be sent to the Ensembl help desk at
  .

=cut

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::Snap - RunnableDB wrapper for the Snap runnable

=head1 SYNOPSIS

  my $runnabledb = Bio::EnsEMBL::Analysis::RunnableDB::Snap->
  new(
      -input_id => 'contig::AL805961.22.1.166258:1:166258:1',
      -db => $db,
      -analysis => $analysis,
     );
  $runnabledb->fetch_input;
  $runnabledb->run;
  $runnabledb->write_output;


=head1 DESCRIPTION

Fetches sequence data from the database, then instantiates and runs the
Snap runnable. This inherits from the Genscan RunnableDB and as such doesn't
implement much itself.

=head1 METHODS

=cut


package Bio::EnsEMBL::Analysis::RunnableDB::Snap;

use strict;
use warnings;

use Bio::EnsEMBL::Analysis::RunnableDB::Genscan;
use Bio::EnsEMBL::Analysis::Runnable::Snap;
use Bio::EnsEMBL::Analysis::Config::General;
use vars qw(@ISA);

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::Genscan);



=head2 runnable_path

  Arg [1]   : Bio::EnsEMBL::Analysis::RunnableDB::Snap
  Function  : return the class name of the Runnable to instantiate
  Returntype: string
  Exceptions: none
  Example   : my $runnable = $self->runnable_path->new
                               (
                                -query => $self->query,
                                -program => $self->analysis->program_file,
                                -analysis => $self->analysis,
                                %parameters,
                               );

=cut


sub runnable_path{
  # Unpack the invocant properly; the bare "my ($self);" never read @_.
  # The returned class name does not depend on it.
  my ($self) = @_;
  return "Bio::EnsEMBL::Analysis::Runnable::Snap";
}

1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Fgenesh.pm:
-------------------------------------------------------------------------------- 1 | =head1 LICENSE 2 | 3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 4 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | =head1 CONTACT 19 | 20 | Please email comments or questions to the public Ensembl 21 | developers list at . 22 | 23 | Questions may also be sent to the Ensembl help desk at 24 | . 

=cut

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::Fgenesh - RunnableDB wrapper for the Fgenesh runnable

=head1 SYNOPSIS

  my $runnabledb = Bio::EnsEMBL::Analysis::RunnableDB::Fgenesh->
  new(
      -input_id => 'contig::AL805961.22.1.166258:1:166258:1',
      -db => $db,
      -analysis => $analysis,
     );
  $runnabledb->fetch_input;
  $runnabledb->run;
  $runnabledb->write_output;


=head1 DESCRIPTION

Fetches sequence data from the database, then instantiates and runs the
Fgenesh runnable. This inherits from the Genscan RunnableDB and as such
doesn't implement much itself.

=head1 METHODS

=cut


package Bio::EnsEMBL::Analysis::RunnableDB::Fgenesh;

use strict;
use warnings;

use Bio::EnsEMBL::Analysis::RunnableDB::Genscan;
use Bio::EnsEMBL::Analysis::Runnable::Fgenesh;
use Bio::EnsEMBL::Analysis::Config::General;
use vars qw(@ISA);

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::Genscan);



=head2 runnable_path

  Arg [1]   : Bio::EnsEMBL::Analysis::RunnableDB::Fgenesh
  Function  : return the class name of the Runnable to instantiate
  Returntype: string
  Exceptions: none
  Example   : my $runnable = $self->runnable_path->new
                               (
                                -query => $self->query,
                                -program => $self->analysis->program_file,
                                -analysis => $self->analysis,
                                %parameters,
                               );

=cut


sub runnable_path{
  # Unpack the invocant properly; the bare "my ($self);" never read @_.
  # The returned class name does not depend on it.
  my ($self) = @_;
  return "Bio::EnsEMBL::Analysis::Runnable::Fgenesh";
}

1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Finished/RepeatMasker.pm:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache
License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | =head1 NAME 18 | 19 | Bio::EnsEMBL::Analysis::RunnableDB::Finished::RepeatMasker 20 | 21 | =head1 SYNOPSIS 22 | 23 | my $repeat_masker = Bio::EnsEMBL::Analysis::RunnableDB::Finished::RepeatMasker-> 24 | new( 25 | -input_id => 'contig::AL805961.22.1.166258:1:166258:1', 26 | -db => $db, 27 | -analysis => $analysis, 28 | ); 29 | $repeat_masker->fetch_input; 30 | $repeat_masker->run; 31 | $repeat_masker->write_output; 32 | 33 | =head1 DESCRIPTION 34 | 35 | This module provides an interface between the ensembl database and 36 | the Runnable RepeatMasker which wraps the program RepeatMasker 37 | 38 | This module can fetch appropriate input from the database 39 | pass it to the runnable then write the results back to the database 40 | in the repeat_feature and repeat_consensus tables 41 | 42 | =head1 CONTACT 43 | 44 | Post questions to : anacode@sanger.ac.uk 45 | 46 | =cut 47 | 48 | package Bio::EnsEMBL::Analysis::RunnableDB::Finished::RepeatMasker; 49 | 50 | use strict; 51 | use warnings; 52 | use Bio::EnsEMBL::Analysis::RunnableDB::RepeatMasker; 53 | use Bio::EnsEMBL::Analysis::Runnable::Finished::RepeatMasker; 54 | 55 | use vars qw(@ISA); 56 | 57 | @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::RepeatMasker); 58 | 59 | 60 | 61 | =head2 fetch_input 62 | 63 | Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::Finished::RepeatMasker 64 | Function : fetch data out of database and create runnable 65 | Returntype: 1 66 
| Exceptions: none 67 | Example : 68 | 69 | =cut 70 | 71 | 72 | 73 | sub fetch_input{ 74 | my ($self) = @_; 75 | my $slice = $self->fetch_sequence; 76 | $self->query($slice); 77 | my %parameters; 78 | if($self->parameters_hash){ 79 | %parameters = %{$self->parameters_hash}; 80 | } 81 | my $runnable = Bio::EnsEMBL::Analysis::Runnable::Finished::RepeatMasker->new 82 | ( 83 | -query => $self->query, 84 | -program => $self->analysis->program_file, 85 | -analysis => $self->analysis, 86 | %parameters, 87 | ); 88 | $self->runnable($runnable); 89 | return 1; 90 | } 91 | 92 | 1; 93 | -------------------------------------------------------------------------------- /modules/Bio/EnsEMBL/Analysis/Tools/CodingExonOverlapFilter.pm: -------------------------------------------------------------------------------- 1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute 2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 


package Bio::EnsEMBL::Analysis::Tools::CodingExonOverlapFilter;

use strict;
use warnings;

use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );



# Constructor. Takes no arguments and throws if any are supplied.
sub new {
  my $class = shift;

  throw("CodingExonOverlapFilter should have no args in new") if @_;

  return bless {}, $class;
}

#####################################
# filter($these, $others)
#
# Returns an array ref with the members of @$these that have no
# translateable exon overlapping, on the same strand, a translateable exon
# of any member of @$others.
#
# Only the first transcript of each object is inspected.
# NOTE(review): presumably every object has exactly one transcript -
# confirm with callers.
# The scan assumes @$others is sorted by start. The resume index also looks
# like it relies on @$these being in ascending start order - TODO confirm.
sub filter {
  my ($self, $these, $others) = @_;

  my @kept;
  my $resume_at = 0;

  foreach my $candidate (@$these) {
    my @span_hits;
    my $first_reaching;

    foreach my $idx ($resume_at .. $#{$others}) {
      my $other = $others->[$idx];

      # lies entirely before the candidate
      next if $other->end < $candidate->start;

      # first entry that reaches the candidate: the next scan resumes here
      $first_reaching = $idx unless defined $first_reaching;

      # starts beyond the candidate: stop scanning
      last if $other->start > $candidate->end;

      push @span_hits, $other;
    }

    $resume_at = $first_reaching if defined $first_reaching;

    my $clashes = 0;
    if (@span_hits) {
      my @own_exons = @{$candidate->get_all_Transcripts->[0]->get_all_translateable_Exons};
      HIT: foreach my $hit (@span_hits) {
        foreach my $hit_exon (@{$hit->get_all_Transcripts->[0]->get_all_translateable_Exons}) {
          foreach my $own_exon (@own_exons) {
            next if $hit_exon->strand != $own_exon->strand;
            next if $hit_exon->end < $own_exon->start;
            next if $hit_exon->start > $own_exon->end;

            # one same-strand coding-exon overlap rejects the candidate
            $clashes = 1;
            last HIT;
          }
        }
      }
    }

    push @kept, $candidate unless $clashes;
  }

  return \@kept;
}
1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/IgSegBuilder.pm.example:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

Bio::EnsEMBL::Analysis::Config::GeneBuild::IgSegBuilder

=head1 SYNOPSIS

    use Bio::EnsEMBL::Analysis::Config::GeneBuild::IgSegBuilder;

=cut


package Bio::EnsEMBL::Analysis::Config::GeneBuild::IgSegBuilder;

use strict;
use vars qw( %Config );

# Hash containing config info, keyed by logic name with a DEFAULT entry
%Config = (
  IGSEG_CONFIG_BY_LOGIC => {
    DEFAULT => {

      TRANDB_DATABASES_NAME => 'IG_EXONERATE_DB',
      LV_LOGICS => [],
      D_LOGICS  => [],
      J_LOGICS  => [],
      C_LOGICS  => [],

      LV_OUTPUT_BIOTYPE => 'LV_segment',
      D_OUTPUT_BIOTYPE  => 'D_segment',
      J_OUTPUT_BIOTYPE  => 'J_segment',
      C_OUTPUT_BIOTYPE  => 'C_segment',

      SUPPORTING_FEATURE_OUTPUT_LOGIC => '',

      OUTPUTDB_DATABASES_NAME => 'IG_OUTPUT_DB',

      # D/J genes that are not closer than the distance
      # below to a V/C gene are rejected
      D_J_PROXIMITY_THRESHOLD => 200000,

    },
  }
);

# import: called by "use Bio::EnsEMBL::Analysis::Config::GeneBuild::IgSegBuilder [LIST]".
# Creates a package scalar in the calling package for every requested
# %Config key (all keys when no list is given), aliased to the hash value.
# Dies if a requested name has no defined value in %Config.
sub import {
  my ($callpack) = caller(0);  # Name of the calling package
  my $pack = shift;            # Need to move package off @_

  # Use the supplied list of variables, or else everything
  my @wanted = @_;
  @wanted = keys( %Config ) unless @wanted;
  return unless @wanted;

  # Predeclare global variables in calling package
  my $declaration = join(' ', map { '$'.$_ } @wanted);
  eval "package $callpack; use vars qw($declaration)";
  die $@ if $@;

  foreach my $name (@wanted) {
    die "Error: Config: $name not known\n" unless defined $Config{$name};

    no strict 'refs';
    # Exporter does a similar job to the following
    # statement, but for function names, not
    # scalar variables:
    *{"${callpack}::$name"} = \$Config{ $name };
  }
}

1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/CloneEndsLinking.pm.example:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::Analysis::Config::CloneEndsLinking

=head1 SYNOPSIS

  use Bio::EnsEMBL::Analysis::Config::CloneEndsLinking;

=head1 DESCRIPTION

This contains the configuration for the linking of aligned
clone ends in order to produce a misc set and its associated
misc features. It needs to be run after an ExonerateAlignFeature
run with specific parameters.

The layout of the configuration is a set of hashes,
each one keyed by logic name. There is also a DEFAULT hash,
which is used as the default for all logic names.

=head1 CONTACT

=cut


package Bio::EnsEMBL::Analysis::Config::CloneEndsLinking;

use strict;
use vars qw( %Config );

%Config = (
  CLONE_END_LINKING_CONFIG_BY_LOGIC => {
    DEFAULT => {
      # must be a directory with files containing clone fasta sequences
      # with extra information in the header, like this:
      # >918936606:CH243-100A1:F:CH243:184000:36800:1098268172037:1001
      CLONE_ALIGNED_DB       => '',
      CLONE_LOGIC_NAME       => '',
      OUTDB                  => '',
      STORE_DNAALIGNFEATURES => 1,
    },
  }
);

# import
#   Invoked by "use <this package> [qw(NAME ...)]". Aliases each requested
#   top-level key of %Config into the calling package as a scalar of the
#   same name; with no names given, every key of %Config is exported.
#   Dies with "Error: Config: NAME not known" when a requested name has no
#   defined entry in %Config.
sub import {
  my ($class, @requested) = @_;
  my ($target_package) = caller(0);

  # Export the names asked for, or else everything
  my @names = @requested ? @requested : keys %Config;
  return unless @names;

  # Predeclare the scalars in the calling package so that they can be used
  # there unqualified under "use strict"
  my $sigiled_names = join(' ', map { '$' . $_ } @names);
  eval "package $target_package; use vars qw($sigiled_names)";
  die $@ if $@;

  for my $name (@names) {
    die "Error: Config: $name not known\n" unless defined $Config{$name};

    # Point the caller's scalar at the stored value; no copy is made
    no strict 'refs';
    *{"${target_package}::$name"} = \$Config{$name};
  }
}

1;

--------------------------------------------------------------------------------
/scripts/genebuild/slice_coding_gene_cnt.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl
#
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

# This script checks every toplevel slice of a given db and flags up slices
# of at least 5mb that have no protein coding gene.
#
# Every offending slice is reported with a warning, the summary counts are
# printed, and the script then throws (non-zero exit) if at least one
# offending slice was found.
#
# Options: -dbname -host -user -port -pass -driver

use strict;
use warnings;

use Getopt::Long;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use feature 'say';
use Bio::EnsEMBL::Utils::Exception qw(throw warning);

my $dbname = '';
my $host   = '';
my $user   = '';
my $port   = '';
my $pass   = '';
my $driver = '';

GetOptions('dbname:s' => \$dbname,
           'host:s'   => \$host,
           'user:s'   => \$user,
           'port:s'   => \$port,
           'pass:s'   => \$pass,
           'driver:s' => \$driver,
          );

# Slices shorter than this are only counted, never checked for genes
my $min_slice_length = 5000000;

my $slice_cnt       = 0;  # all toplevel slices seen
my $slice_with_gene = 0;  # checked slices with >= 1 protein coding gene
my $slice_no_gene   = 0;  # checked slices without protein coding gene
my $size            = 0;  # slices shorter than $min_slice_length
my @slices_without_gene;  # names of the slices counted in $slice_no_gene

my $db_adaptor = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
  -dbname => $dbname,
  -host   => $host,
  -port   => $port,
  -user   => $user,
  -pass   => $pass,
  -driver => $driver,
);
my $slice_adaptor = $db_adaptor->get_SliceAdaptor();

foreach my $slice ( @{ $slice_adaptor->fetch_all('toplevel') } ) {
  $slice_cnt++;

  # counting all protein coding genes of the slice
  my $gene_cnt = 0;
  foreach my $gene ( @{ $slice->get_all_Genes } ) {
    $gene_cnt++ if $gene->biotype eq 'protein_coding';
  }

  if ($slice->length < $min_slice_length) {
    # too short to be checked
    $size++;
  }
  elsif ($gene_cnt < 1) {
    # Report now but keep going, so that every offending slice is listed
    # and the summary below is still printed
    warning("slice " . $slice->name . " has no protein coding gene");
    push @slices_without_gene, $slice->name;
    $slice_no_gene++;
  }
  else {
    $slice_with_gene++;
  }
}

#print stats found
say "Total slice = $slice_cnt";
say "Slice with genes = $slice_with_gene";
say "slice with no gene = $slice_no_gene";
say "slice less 5mb = $size";

# Fail the run, as before, when a checked slice has no protein coding gene
if ($slice_no_gene) {
  throw("slice has no protein coding gene: " . join(', ', @slices_without_gene));
}

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Funcgen/ACME.pm:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::ACME

=head1 SYNOPSIS

  my $runnable = Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::ACME->new
    (
     -db => $db,
     -input_id => 'chromosome::20:1:100000:1',
     -analysis => $analysis,
    );
  $runnable->fetch_input;
  $runnable->run;
  $runnable->write_output;

=head1 DESCRIPTION

This module provides an interface between the ensembl functional genomics
database and the Runnable ACME which wraps the R package ACME (for Algorithm
for Capturing Microarray Enrichment).

=head1 AUTHOR

Stefan Graf, Ensembl Functional Genomics - http://www.ensembl.org/

=head1 CONTACT

Post questions to the Ensembl development list: http://lists.ensembl.org/mailman/listinfo/dev

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::ACME;

use strict;
use warnings;
use Data::Dumper;

use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::RunnableDB::Funcgen;
use Bio::EnsEMBL::Analysis::Runnable::Funcgen::ACME;

use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Analysis::Config::Funcgen::ACME;

use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use vars qw(@ISA);

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::Funcgen);

=head2 new

  Arg [1..n]  : arguments handed unchanged to the parent class constructor
  Description : Instantiates a new ACME runnabledb: builds the object through
                the parent constructor, reads and checks the configuration,
                then checks the analysis object and the feature/data/result
                sets. Prints a trace line to STDOUT on entry.
  Returntype  : Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::ACME object
  Exceptions  : none raised here directly
  Example     : see SYNOPSIS

=cut

sub new {
  my ($class, @ctor_args) = @_;

  print "Analysis::RunnableDB::Funcgen::ACME::new\n";

  my $self = $class->SUPER::new(@ctor_args);

  # $CONFIG is not declared in this file; presumably exported by
  # Bio::EnsEMBL::Analysis::Config::Funcgen::ACME - TODO confirm
  $self->read_and_check_config($CONFIG);

  # the analysis object must be the correct one
  $self->check_Analysis();

  # the feature_set, data_sets and result_sets must be storable
  $self->check_Sets();

  return $self;
}

1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveLoadcDNAs.pm:
--------------------------------------------------------------------------------
#!/usr/bin/env perl

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
#Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
7 
| # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveLoadcDNAs;

use strict;
use warnings;

use Bio::SeqIO;
use Bio::EnsEMBL::Analysis::Tools::PolyAClipping qw(clip_if_necessary);

use parent ('Bio::EnsEMBL::Hive::RunnableDB::JobFactory');

# param_defaults
#   Returns the parent class defaults extended with the defaults of this
#   module: sequence_biotype, column_names, sequence_table_name, iid_type
#   and format.
sub param_defaults {
  my ($self) = @_;

  my %defaults = (
    %{$self->SUPER::param_defaults()},
    sequence_biotype => 'cdna',
    column_names => ['iid'],
    sequence_table_name => 'cdna_sequences',
    iid_type => 'db_seq',
    format => 'fasta',
  );

  return \%defaults;
}


# fetch_input
#   Reads the sequences of 'cdna_file' (in 'format') and, for each one,
#   either stores a row (accession, seq, biotype) in the table named by
#   'sequence_table_name' when 'iid_type' is 'db_seq', or otherwise writes
#   the sequence in fasta format to 'output_file'.
#   When 'process_polyA' is set, each sequence first goes through
#   clip_if_necessary; a sequence for which it returns nothing is skipped
#   with a warning.
#   The header of each kept sequence, minus a leading "xx|yy|" prefix, is
#   collected and the list is stored in the 'inputlist' param.
sub fetch_input {
  my ($self) = @_;

  my $parser = Bio::SeqIO->new(-format => $self->param('format'), -file => $self->param_required('cdna_file'));
  my $clip_polyA = ($self->param_is_defined('process_polyA') && $self->param('process_polyA')) ? 1 : 0;
  my $biotype = $self->param('sequence_biotype');

  # Destination of the sequences: a database table or a fasta file
  my $write_to_file = $self->param('iid_type') ne 'db_seq';
  my $adaptor;
  if (!$write_to_file) {
    $adaptor = $self->db->get_NakedTableAdaptor();
    $adaptor->table_name($self->param('sequence_table_name'));
  }
  else {
    $adaptor = Bio::SeqIO->new(-format => 'fasta', -file => '>'.$self->param_required('output_file'));
  }

  my @iids;
  SEQUENCE: while (my $bioseq = $parser->next_seq) {
    # the header is taken before clipping so it can be reported below
    my $header = $bioseq->id;

    if ($clip_polyA) {
      ($bioseq) = clip_if_necessary($bioseq);
      unless ($bioseq) {
        $self->warning('Sequence full of polyA for '.$header);
        next SEQUENCE;
      }
    }

    # drop a leading "word|word|" prefix from the identifier
    $header =~ s/^\w*\|\w*\|//;

    if (!$write_to_file) {
      $adaptor->store([{
        'accession' => $header,
        'seq' => $bioseq->seq,
        'biotype' => $biotype,
      }]);
    }
    else {
      $bioseq->id($header);
      $adaptor->write_seq($bioseq);
    }

    push(@iids, $header);
  }

  $self->param('inputlist', \@iids);
}

1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/ProjectedTranscriptEvidence.pm.example:
--------------------------------------------------------------------------------
1;

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

Bio::EnsEMBL::Analysis::Config::GeneBuild::ProjectedTranscriptEvidence

=head1 SYNOPSIS

  use Bio::EnsEMBL::Analysis::Config::GeneBuild::ProjectedTranscriptEvidence;

=head1 DESCRIPTION

Supplies config for Bio::EnsEMBL::Analysis::RunnableDB::ProjectedTranscriptEvidence,
which is used to align a projected transcript against the original and add the original
transcript as a transcript_supporting_feature of the projected transcript.

=head1 CONTACT

=cut


package Bio::EnsEMBL::Analysis::Config::GeneBuild::ProjectedTranscriptEvidence;

use strict;
use vars qw( %Config );

%Config = (
  PROJECTED_TRANSCRIPT_EVIDENCE_CONFIG_BY_LOGIC => {
    DEFAULT => {
      #Database to fetch the original/reference genes from
      GENEDB => 'REFERENCE_DB',
      #Database the projected (transformed) genes were written to
      #where original transcript will be added as a transcript_supporting_feature
      OUTGENEDB => 'PROJECT_DB',
      OPTIONS => '--model affine:local --bestn 1 --dnahspthreshold 50 -w 1 -s 0',
      PROGRAM => "exonerate-0.9.0",
    },

  }
);

# import
#   Invoked by "use <this package> [qw(NAME ...)]". Aliases each requested
#   top-level key of %Config into the calling package as a scalar of the
#   same name; with no names given, every key of %Config is exported.
#   Dies with "Error: Config: NAME not known" when a requested name has no
#   defined entry in %Config.
sub import {
  my ($class, @requested) = @_;
  my ($target_package) = caller(0);

  # Export the names asked for, or else everything
  my @names = @requested ? @requested : keys %Config;
  return unless @names;

  # Predeclare the scalars in the calling package so that they can be used
  # there unqualified under "use strict"
  my $sigiled_names = join(' ', map { '$' . $_ } @names);
  eval "package $target_package; use vars qw($sigiled_names)";
  die $@ if $@;

  for my $name (@names) {
    die "Error: Config: $name not known\n" unless defined $Config{$name};

    # Point the caller's scalar at the stored value; no copy is made
    no strict 'refs';
    *{"${target_package}::$name"} = \$Config{$name};
  }
}

1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/ExonerateSolexaLocalAlignment.pm.example:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

Bio::EnsEMBL::Analysis::Config::GeneBuild::ExonerateSolexaLocalAlignment

=head1 SYNOPSIS

  use Bio::EnsEMBL::Analysis::Config::GeneBuild::ExonerateSolexaLocalAlignment;

=head1 DESCRIPTION

This contains the specific configuration for
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexaLocalAlignment

=head1 CONTACT

=cut


package Bio::EnsEMBL::Analysis::Config::GeneBuild::ExonerateSolexaLocalAlignment;

use strict;
use vars qw( %Config );

%Config = (
  EXONERATE_SOLEXA_LOCAL_ALIGNMENT_CONFIG_BY_LOGIC => {
    DEFAULT => {
      # database to fetch genomic alignments from
      GENOMICDB => '',
      # only want to realign partial genomic alignments max score
      # for alignments to be included say 60% of read length?
      # No value is shipped with this example: set it before use.
      # An explicit undef is required here; a bare "SCORE => ," shifts
      # every following key/value pair of this hash by one.
      SCORE => undef,
      # logicnames of the reads you want to use blank = all
      LOGIC_NAMES => [],
      # logic name of the refined models to use, blank = all
      REFINED_LN => "",
      # database to fetch refined models from
      REFINED_DB => "",
      # maximum distance between split models before assuming they are
      # 2 separate genes?
      MAX_GAP => 20000,
    },
  }
);

# import
#   Invoked by "use <this package> [qw(NAME ...)]". Aliases each requested
#   top-level key of %Config into the calling package as a scalar of the
#   same name; with no names given, every key of %Config is exported.
#   Dies with "Error: Config: NAME not known" when a requested name has no
#   defined entry in %Config.
sub import {
  my ($class, @requested) = @_;
  my ($target_package) = caller(0);

  # Export the names asked for, or else everything
  my @names = @requested ? @requested : keys %Config;
  return unless @names;

  # Predeclare the scalars in the calling package so that they can be used
  # there unqualified under "use strict"
  my $sigiled_names = join(' ', map { '$' . $_ } @names);
  eval "package $target_package; use vars qw($sigiled_names)";
  die $@ if $@;

  for my $name (@names) {
    die "Error: Config: $name not known\n" unless defined $Config{$name};

    # Point the caller's scalar at the stored value; no copy is made
    no strict 'refs';
    *{"${target_package}::$name"} = \$Config{$name};
  }
}

1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HivecDNAManyHits.pm:
--------------------------------------------------------------------------------
#!/usr/bin/env perl

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
#Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HivecDNAManyHits;

use strict;
use warnings;
use feature 'say';


use Bio::EnsEMBL::Analysis::Tools::Utilities qw(hrdb_get_dba);
use parent ('Bio::EnsEMBL::Hive::RunnableDB::JobFactory');


# param_defaults
#   Returns the parent class defaults extended with:
#     threshold                  - a cDNA hit name seen on more transcripts
#                                  than this counts as having "many hits"
#     column_names               - ['iid']
#     many_hits_process_threshod - factor applied to 'threshold' when
#                                  comparing against 'old_db'. The key is
#                                  misspelt; it is kept as is because
#                                  existing configurations may set it.
sub param_defaults {
  my ($self) = @_;

  return {
    %{$self->SUPER::param_defaults},
    threshold => 20,
    column_names => ['iid'],
    many_hits_process_threshod => .90,
  }
}

# fetch_input
#   Counts, over all toplevel slices of 'target_db', how many transcripts
#   have each hit name as their first supporting feature, and selects the
#   hit names seen more than 'threshold' times.
#   When 'old_db' is defined, a selected hit name is dropped if the old
#   database already holds more than threshold * many_hits_process_threshod
#   transcripts supported by it.
#   The remaining names are stored in the 'inputlist' param; when none
#   remain the job completes early with an explanatory message.
#   A transcript without any supporting feature is skipped with a warning
#   instead of aborting the job.
sub fetch_input {
  my $self = shift;

  my $db = hrdb_get_dba($self->param_required('target_db'));
  my $slice_adaptor = $db->get_SliceAdaptor;
  my %hit_names;
  foreach my $slice (@{$slice_adaptor->fetch_all('toplevel', undef, 1)}) {
    foreach my $transcript (@{$slice->get_all_Transcripts}) {
      my $evidence = $transcript->get_all_supporting_features->[0];
      if (!$evidence) {
        $self->warning('Skipping a transcript without supporting feature on '.$slice->name);
        next;
      }
      ++$hit_names{$evidence->hseqname};
    }
  }

  my $threshold = $self->param('threshold');
  my @many_hits = grep { $hit_names{$_} > $threshold } keys %hit_names;

  if (!@many_hits) {
    $self->complete_early("No cDNAs had more than $threshold hits");
  }
  elsif ($self->param_is_defined('old_db')) {
    my $old_db = hrdb_get_dba($self->param_required('old_db'));
    my $transcript_adaptor = $old_db->get_TranscriptAdaptor;

    # A lower bar is used against the previous database
    $threshold *= $self->param('many_hits_process_threshod');

    my @to_process;
    foreach my $hitname (@many_hits) {
      my $transcripts = $transcript_adaptor->fetch_all_by_transcript_supporting_evidence($hitname, 'dna_align_feature');
      push(@to_process, $hitname) unless (scalar(@$transcripts) > $threshold);
    }

    if (@to_process) {
      $self->param('inputlist', \@to_process);
    }
    else {
      $self->complete_early(scalar(@many_hits).' cDNAs had more than '.$self->param('threshold').' hits but were already in the previous database');
    }
  }
  else {
    $self->param('inputlist', \@many_hits);
  }
}


1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Gsnap.pm.example:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

Bio::EnsEMBL::Analysis::Config::GeneBuild::Gsnap

=head1 SYNOPSIS

  use Bio::EnsEMBL::Analysis::Config::GeneBuild::Gsnap;

=head1 DESCRIPTION

This contains the specific configuration for
Bio::EnsEMBL::Analysis::RunnableDB::Gsnap

=head1 CONTACT

=cut


package Bio::EnsEMBL::Analysis::Config::GeneBuild::Gsnap;

use strict;
use vars qw( %Config );

%Config = (
  GSNAP_CONFIG_BY_LOGIC => {
    DEFAULT => {

      # base path to the fastq
      INDIR => "/path/to/my/input",

      # path to the output directory
      OUTDIR => "/path/to/my/output",

      # Name given to the indexed genome when using gmap build
      GENOMENAME => "",
      # Directory containing the genome files
      GENOMEDIR => "/path/to/genome/dir",
      # alignment options ( just for example )
      OPTIONS => "",

      # are the reads paired end? (1/0)
      PAIRED => "0",

      # path to the samtools binaries
      SAMTOOLS_PATH => "/software/solexa/bin/samtools",

      # optional header with additional information describing the sample
      HEADER => "",
    },
  }
);

# import
#   Invoked by "use <this package> [qw(NAME ...)]". Aliases each requested
#   top-level key of %Config into the calling package as a scalar of the
#   same name; with no names given, every key of %Config is exported.
#   Dies with "Error: Config: NAME not known" when a requested name has no
#   defined entry in %Config.
sub import {
  my ($class, @requested) = @_;
  my ($target_package) = caller(0);

  # Export the names asked for, or else everything
  my @names = @requested ? @requested : keys %Config;
  return unless @names;

  # Predeclare the scalars in the calling package so that they can be used
  # there unqualified under "use strict"
  my $sigiled_names = join(' ', map { '$' . $_ } @names);
  eval "package $target_package; use vars qw($sigiled_names)";
  die $@ if $@;

  for my $name (@names) {
    die "Error: Config: $name not known\n" unless defined $Config{$name};

    # Point the caller's scalar at the stored value; no copy is made
    no strict 'refs';
    *{"${target_package}::$name"} = \$Config{$name};
  }
}

1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Funcgen/Chipotle.pm:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::Chipotle

=head1 SYNOPSIS

  my $runnable = Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::Chipotle->new
    (
     -db => $db,
     -input_id => 'chromosome::20:1:100000:1',
     -analysis => $analysis,
    );
  $runnable->fetch_input;
  $runnable->run;
  $runnable->write_output;

=head1 DESCRIPTION

This module provides an interface between the ensembl database and
the Runnable Chipotle which wraps the program ChIPoTle

=head1 AUTHOR

Stefan Graf, Ensembl Functional Genomics - http://www.ensembl.org/

=head1 CONTACT

Post questions to the Ensembl development list: http://lists.ensembl.org/mailman/listinfo/dev

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::Chipotle;

use strict;
use warnings;
use Data::Dumper;

use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Analysis::Config::Funcgen::Chipotle;

use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::RunnableDB::Funcgen;
use Bio::EnsEMBL::Analysis::Runnable::Funcgen::Chipotle;

use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
use vars qw(@ISA);

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::Funcgen);

=head2 new

  Arg [1..n]  : arguments handed unchanged to the parent class constructor
  Description : Instantiates a new Chipotle runnabledb: builds the object
                through the parent constructor, reads and checks the
                configuration, then checks the analysis object and the
                feature/data/result sets. Prints a trace line to STDOUT on
                entry.
  Returntype  : Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::Chipotle object
  Exceptions  : none raised here directly
  Example     : see SYNOPSIS

=cut

sub new {
  my ($class, @ctor_args) = @_;

  print "Analysis::RunnableDB::Funcgen::Chipotle::new\n";

  my $self = $class->SUPER::new(@ctor_args);

  # $CONFIG is not declared in this file; presumably exported by
  # Bio::EnsEMBL::Analysis::Config::Funcgen::Chipotle - TODO confirm
  $self->read_and_check_config($CONFIG);

  # add some runnable/program special params to analysis here

  # the analysis object must be the correct one
  $self->check_Analysis();

  # the feature_set, data_sets and result_sets must be storable
  $self->check_Sets();

  return $self;
}

1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Panther.pm:
--------------------------------------------------------------------------------
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2024] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15 | 16 | 17 | # Author: Gary Williams (gw3@sanger.ac.uk) 18 | # Copyright (c) Marc Sohrmann, 2001 19 | # You may distribute this code under the same terms as perl itself 20 | # 21 | # You may distribute this module under the same terms as perl itself 22 | # 23 | # POD documentation - main docs before the code 24 | 25 | =pod 26 | 27 | =head1 NAME 28 | 29 | Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Panther 30 | 31 | =head1 SYNOPSIS 32 | 33 | my $seg = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Panther->new ( -db => $db, 34 | -input_id => $input_id, 35 | -analysis => $analysis, 36 | ); 37 | $seg->fetch_input; # gets sequence from DB 38 | $seg->run; 39 | $seg->output; 40 | $seg->write_output; # writes features to the DB 41 | 42 | =head1 DESCRIPTION 43 | 44 | This object wraps Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Panther 45 | to add functionality to read and write to databases. 46 | A Bio::EnsEMBL::Analysis::DBSQL::DBAdaptor is required for database access (db). 47 | The query sequence is provided through the input_id. 48 | The appropriate Bio::EnsEMBL::Analysis object 49 | must be passed for extraction of parameters. 50 | 51 | =head1 CONTACT 52 | 53 | Gary Williams 54 | 55 | =head1 APPENDIX 56 | 57 | The rest of the documentation details each of the object methods. 58 | Internal methods are usually preceded with a _. 

=cut

package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Panther;

use warnings ;
use strict;
use vars qw(@ISA);

use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Panther;

@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);


# fetch_input
#   Runs the parent class fetch_input with the same arguments, then builds
#   a Runnable::ProteinAnnotation::Panther from this object's query and
#   analysis and registers it through runnable().
#   Returns whatever runnable() returns.
sub fetch_input {
  my ($self, @parent_args) = @_;

  $self->SUPER::fetch_input(@parent_args);

  my @runnable_args = (-query => $self->query, -analysis => $self->analysis);
  my $panther = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Panther->new(@runnable_args);

  return $self->runnable($panther);
}
1;

--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/CollapseAffyProbes.pm.example:
--------------------------------------------------------------------------------
#
# package Bio::EnsEMBL::Pipeline::Config::ExonerateTranscript
#
# Cared for by EnsEMBL (http://lists.ensembl.org/mailman/listinfo/dev)
#
# Copyright GRL & EBI
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::Pipeline::Config::Affy::Exonerate2Affy

=head1 SYNOPSIS

use Bio::EnsEMBL::Pipeline::Config::Exonerate2Genes;

=head1 DESCRIPTION

This contains the configuration for step 1 of the
process which maps Affymetric probes to the Genome.

The layout of the configuration is a set of hashes,
each one keyed by logic name. There is also a DEFAULT hash,
which is used as the default for all logic names (this
was the configuration pattern stolen from Exonerate2Genes,
although in this case it's very unlikely you will need to have
different configs by logic name).
=head1 CONTACT

=cut


package Bio::EnsEMBL::Analysis::Config::CollapseAffyProbes;

use strict;
use warnings;

# Hash containing config info
# -- one hashnode per logic name, with a 'DEFAULT' logic name provided
#

our %Config = (
    AFFY_CONFIG => {
        DEFAULT => {
            # All input probes must be kept in one huge (possibly redundant) fasta file
            QUERYSEQS => '/ecs2/work3/vvi/osgb/affy/data/all_probes.fa',
            # The output of this module writes a set of affy probes into the OUTDB.affy_probe table,
            # and also writes the nonredundant probes into this fasta file,
            # with the fasta headers keyed with the affy probes' internal id.
            NON_REDUNDANT_PROBE_SEQS => '/ecs2/work3/vvi/osgb/affy/data/all_nr_probes.fa',
            # DB containing all affy_arrays, affy_probes and (next step) affy_features
            OUTDB => {
                -dbname => 'vivek_homo_sapiens_test_26_35',
                -host   => 'ecs2',
                -port   => '3362',
                -user   => 'ensadmin',
                -pass   => 'xxxxx',
            },
        },
    }
);

# import
#
# Called implicitly by "use Bio::EnsEMBL::Analysis::Config::CollapseAffyProbes
# LIST". For every requested top-level key of %Config (or all keys when no
# list is given) a package scalar of the same name is declared in the calling
# package and aliased to the corresponding %Config value.
#
# Dies with "Error: Config: NAME not known\n" if a requested name has no
# defined entry in %Config, and re-throws any error raised while declaring
# the variables in the caller.
sub import {
    my ($callpack) = caller(0);    # Name of the calling package
    my $pack = shift;              # Need to move package off @_

    # Get list of variables supplied, or else everything
    my @vars = @_ ? @_ : keys(%Config);
    return unless @vars;

    # Validate every requested name BEFORE it is interpolated into the
    # string eval below. Otherwise an unknown (or malformed) name would be
    # declared in the caller, or even executed as code, before being rejected.
    foreach my $name (@vars) {
        die "Error: Config: $name not known\n"
            unless defined $Config{$name};
    }

    # Predeclare global variables in calling package. "use vars" is needed
    # here (rather than "our") because the declaration must take effect in
    # the caller's package, not in a lexical scope of this file.
    eval "package $callpack; use vars qw("
        . join(' ', map { '$' . $_ } @vars) . ")";
    die $@ if $@;

    foreach my $name (@vars) {
        no strict 'refs';
        # Exporter does a similar job to the following
        # statement, but for function names, not
        # scalar variables:
        *{"${callpack}::$name"} = \$Config{$name};
    }
}

1;
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveIndexGenome.pm:
--------------------------------------------------------------------------------
# Copyright [1999-2016] the EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveIndexGenome;

use strict;
use warnings;

use File::Spec;

use parent ('Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveBaseRunnableDB');


=head2 fetch_input

 Arg [1]    : None
 Description: Create the command to execute genome indexing with STAR.
              The directory part of 'wide_genome_file' is used as the genome
              directory; if it already contains a file named 'SA' the job is
              completed early. Otherwise the argument list is built from
              'wide_short_read_aligner' plus the optional 'use_threading',
              'annotation_gtf', 'read_length' and 'extra_options' parameters
              and stored in the 'commandline' parameter as an array ref.
              'extra_options' may be an array ref of arguments or a string,
              which is split on whitespace.
 Returntype : None
 Exceptions : Throw if splitpath does not return an existing directory name

=cut

sub fetch_input {
  my ($self) = @_;

  my $genome_file = $self->param('wide_genome_file');
  my (undef, $dirname) = File::Spec->splitpath($genome_file);
  $self->throw("File::Spec->splitpath failed, $dirname does not exist") unless (-d $dirname);
  if (-e File::Spec->catfile($dirname, 'SA')) {
    $self->complete_early($genome_file.' is already indexed!');
  }
  else {
    # The command is executed with the list form of system (no shell), so
    # every option and every value has to be its own list element.
    my @command = ($self->param('wide_short_read_aligner'), '--runMode', 'genomeGenerate');
    push(@command, '--runThreadN', $self->param('use_threading'))
      if ($self->param_is_defined('use_threading') and $self->param('use_threading') > 0);
    push(@command, '--genomeDir', $dirname);
    # Pass the path as given rather than the bare file name, otherwise the
    # file is only found when the working directory is the genome directory.
    push(@command, '--genomeFastaFiles', $genome_file);
    push(@command, '--sjdbGTFfile', $self->param('annotation_gtf'))
      if ($self->param_is_defined('annotation_gtf'));
    push(@command, '--sjdbOverhang', $self->param('read_length')-1)
      if ($self->param_is_defined('read_length') and $self->param('read_length') > 1);
    if ($self->param_is_defined('extra_options')) {
      my $extra_options = $self->param('extra_options');
      push(@command, ref($extra_options) eq 'ARRAY' ? @$extra_options : split(' ', $extra_options));
    }
    $self->param('commandline', \@command);
  }
}


=head2 run

 Arg [1]    : None
 Description: Run the STAR command, it will generate the indexes
 Returntype : None
 Exceptions : Throws if STAR fails; the message includes the command and
              the reason (could not start, signal or exit code)

=cut

sub run {
  my ($self) = @_;

  my @command = @{$self->param('commandline')};
  if (system(@command)) {
    my $status = $?;
    my $reason;
    if ($status == -1) {
      $reason = "failed to start: $!";
    }
    elsif ($status & 127) {
      $reason = 'killed by signal '.($status & 127);
    }
    else {
      $reason = 'exit code '.($status >> 8);
    }
    $self->throw('Could not execute: '.join(' ', @command)." ($reason)");
  }
}


=head2 write_output

 Arg [1]    : None
 Description: Return 1 to override SUPER method
 Returntype : Integer 1
 Exceptions : None

=cut

sub write_output {
  my ($self) = @_;

  return 1;
}

1;
--------------------------------------------------------------------------------