├── scripts
├── Merge
│ ├── README-vega-check.txt
│ ├── merge-refseq.conf
│ └── merge.conf
├── protein
│ ├── THESE_SCRIPTS_ARE_USED_TO_RUN_THE_PROTEIN_ANNOTATION_PIPELINE
│ └── chunk_protein_file.pl
├── assembly_patches
│ ├── remove_patch_karyotype.sql
│ └── remove_patch_raw_compute.sql
├── imgt
│ └── kill_list.txt
├── cdna_update
│ └── find_N.pl
├── genebuild
│ ├── parse_embl_cds2uniprotkb.pl
│ ├── sncrna
│ │ ├── filter_cm.pl
│ │ ├── repeats_dump.pl
│ │ └── dump_prefilter_features.pl
│ ├── find_seq_in_fasta.pl
│ ├── convert_genome_dump.pl
│ └── slice_coding_gene_cnt.pl
├── databases
│ └── process_uniprot_isoforms.pl
├── chunk_fasta_file.pl
├── markers
│ ├── map_weight.pl
│ └── marker_match.pl
└── delete_big_dir.pl
├── requirements.txt
├── modules
├── t
│ ├── test-genome-DBs
│ │ └── pararge_aegeria
│ │ │ └── core
│ │ │ ├── seq_region_attrib.txt
│ │ │ ├── meta_coord.txt
│ │ │ ├── seq_region_synonym.txt
│ │ │ ├── coord_system.txt
│ │ │ ├── analysis.txt
│ │ │ ├── seq_region.txt
│ │ │ ├── assembly.txt
│ │ │ ├── external_db.txt
│ │ │ ├── attrib_type.txt
│ │ │ ├── analysis_description.txt
│ │ │ └── meta.txt
│ ├── MultiTestDB.conf.default
│ ├── repeatcoverage.t
│ ├── hiveassemblycomponents_rb.t
│ ├── hiveloadgenomesequences_rb.t
│ ├── hiveprocessassemblyreport_rb.t
│ ├── prepare_local_tests.sh
│ ├── hivecreatedirectories_rb.t
│ ├── hiverepeatcoverage_rb.t
│ └── filter_t.t
└── Bio
│ └── EnsEMBL
│ └── Analysis
│ ├── RunnableDB
│ ├── Bam2Genes.pm
│ ├── Bam2Introns.pm
│ ├── Solexa2Genes.pm
│ ├── BlastRNASeqPep.pm
│ ├── ExonerateSolexa.pm
│ ├── RefineSolexaGenes.pm
│ ├── Solexa2GenesLiteNew.pm
│ ├── ExonerateSolexaTranscript.pm
│ ├── ExonerateSolexaLocalAlignment.pm
│ ├── ProteinAnnotation
│ │ ├── PrositePattern.pm.retired
│ │ ├── Hamap_wormbase.pm
│ │ ├── PrositeProfile.pm
│ │ ├── PrositeProfile_wormbase.pm
│ │ ├── Prints.pm
│ │ ├── Prints_wormbase.pm
│ │ ├── PrositePattern.pm
│ │ ├── PrositePattern_wormbase.pm
│ │ ├── Coil.pm
│ │ ├── Signalp.pm
│ │ ├── Hmmpfam.pm
│ │ ├── PIRSF.pm
│ │ ├── Superfamily.pm
│ │ ├── IPRScan.pm
│ │ ├── Pfam_wormbase.pm
│ │ ├── Tmhmm.pm
│ │ ├── Superfamily_wormbase.pm
│ │ ├── Seg.pm
│ │ └── Panther.pm
│ ├── Finished
│ │ ├── EPCR.pm
│ │ └── RepeatMasker.pm
│ ├── Accumulator.pm
│ ├── Snap.pm
│ ├── Fgenesh.pm
│ └── Funcgen
│ │ ├── ACME.pm
│ │ └── Chipotle.pm
│ ├── Config
│ ├── GeneBuild
│ │ ├── Bam2Genes.pm.example
│ │ ├── Solexa2Genes.pm.example
│ │ ├── Solexa2GenesLiteNew.pm
│ │ ├── RefineSolexaGenes.pm.example
│ │ ├── BlastRNASeqPep.pm.example
│ │ ├── OrthologueEvaluatorExonerate.pm.example
│ │ ├── Sam2Bam.pm.example
│ │ ├── BuildChecks.pm.example
│ │ ├── IgSegBuilder.pm.example
│ │ ├── ProjectedTranscriptEvidence.pm.example
│ │ ├── ExonerateSolexaLocalAlignment.pm.example
│ │ └── Gsnap.pm.example
│ ├── AddStableIds.pm.example
│ ├── S3Config.pm.example
│ ├── CloneEndsLinking.pm.example
│ └── CollapseAffyProbes.pm.example
│ ├── Hive
│ ├── Config
│ │ ├── genome_annotation.ini
│ │ ├── sample_genes_registry_conf.pl
│ │ └── BamMergeStatic.pm
│ └── RunnableDB
│ │ ├── HiveRunExternalCmd.pm
│ │ ├── HiveCreateFastqDownloadJobs.pm
│ │ ├── HiveStoreUnmappedcDNAs.pm
│ │ ├── HiveLoadProteins.pm
│ │ ├── HiveSequencesToFiles.pm
│ │ ├── HiveLoadmRNAs.pm
│ │ ├── HiveDBSeqFiles.pm
│ │ ├── HiveLoadcDNAs.pm
│ │ ├── HivecDNAManyHits.pm
│ │ └── HiveIndexGenome.pm
│ ├── Tools
│ ├── BlastDBTracking
│ │ └── Entry.pm
│ ├── IMGT
│ │ └── Seq
│ │ │ └── RichSeqIMGT.pm
│ ├── SoftwareConfigLoad.pm
│ ├── Stashes.pm
│ ├── GenomeOverlapFilter.pm
│ ├── PacBioTranscriptFilter.pm
│ ├── GeneBuildUtils
│ │ └── HomologyUtils.pm
│ ├── AllExonOverlapFilter.pm
│ └── CodingExonOverlapFilter.pm
│ └── Runnable
│ ├── DustMasker.pm
│ ├── ProteinAnnotation
│ └── PrositeProfile.pm
│ └── SamtoolsMerge.pm
├── cpanfile
├── travisci
└── MultiTestDB.conf.mysql
├── requirements_p36_ncrna.txt
├── pull_request_template.md
├── .gitignore
└── sql
└── repeat_db_tables.sql
/scripts/Merge/README-vega-check.txt:
--------------------------------------------------------------------------------
1 | TBC
2 |
--------------------------------------------------------------------------------
/scripts/protein/THESE_SCRIPTS_ARE_USED_TO_RUN_THE_PROTEIN_ANNOTATION_PIPELINE:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | deeptools ~= 3.4.1
2 | gspread
3 | oauth2client
4 | google-auth
5 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/seq_region_attrib.txt:
--------------------------------------------------------------------------------
1 | 1 367 1
2 | 1 6 1
3 |
--------------------------------------------------------------------------------
/cpanfile:
--------------------------------------------------------------------------------
1 | requires 'Bio::DB::HTS';
2 | requires 'Proc::ProcessTable';
3 | requires 'Bio::DB::EUtilities';
4 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/meta_coord.txt:
--------------------------------------------------------------------------------
1 | gene 1 43270
2 | exon 1 3426
3 | transcript 1 6190
4 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/seq_region_synonym.txt:
--------------------------------------------------------------------------------
1 | 1 1 LR990895.1 50710
2 | 2 1 NC_053180.1 1830
3 |
--------------------------------------------------------------------------------
/travisci/MultiTestDB.conf.mysql:
--------------------------------------------------------------------------------
1 | {
2 | 'port' => '3306',
3 | 'driver' => 'mysql',
4 | 'user' => 'root',
5 | 'host' => '127.0.0.1',
6 | }
7 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Genes.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Genes.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Introns.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Bam2Introns.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2Genes.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2Genes.pm
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/coord_system.txt:
--------------------------------------------------------------------------------
1 | 1 1 primary_assembly ilParAegt1.1 1 default_version
2 | 2 1 contig \N 2 default_version,sequence_level
3 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/BlastRNASeqPep.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/BlastRNASeqPep.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexa.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexa.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/RefineSolexaGenes.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/RefineSolexaGenes.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2GenesLiteNew.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/Solexa2GenesLiteNew.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Bam2Genes.pm.example:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Bam2Genes.pm.example
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/analysis.txt:
--------------------------------------------------------------------------------
1 | 1 2021-05-08 10:07:46 ensembl \N \N \N \N \N \N \N \N \N \N \N
2 | 2 2021-05-08 10:08:27 ncrna \N \N \N \N \N \N \N \N \N \N \N
3 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2Genes.pm.example:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2Genes.pm.example
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2GenesLiteNew.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/Solexa2GenesLiteNew.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaTranscript.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaTranscript.pm
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/RefineSolexaGenes.pm.example:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/Config/GeneBuild/RefineSolexaGenes.pm.example
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaLocalAlignment.pm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ensembl/ensembl-analysis/HEAD/modules/Bio/EnsEMBL/Analysis/RunnableDB/ExonerateSolexaLocalAlignment.pm
--------------------------------------------------------------------------------
/requirements_p36_ncrna.txt:
--------------------------------------------------------------------------------
1 | # We can only use python <= 3.6 because it needs scipy 0.18.1
2 | # We need to reinstall numpy as pandas will update to the latest version possible
3 | pandas
4 | scipy==0.18.1
5 | scikit-learn==0.18.1
6 | numpy==1.17.5
7 |
--------------------------------------------------------------------------------
/scripts/assembly_patches/remove_patch_karyotype.sql:
--------------------------------------------------------------------------------
-- Delete every karyotype row whose seq_region carries a 'patch_novel' or
-- 'patch_fix' attribute. Only rows of the karyotype table are removed.
DELETE karyotype
  FROM attrib_type
  JOIN seq_region_attrib
    ON seq_region_attrib.attrib_type_id = attrib_type.attrib_type_id
  JOIN karyotype
    ON karyotype.seq_region_id = seq_region_attrib.seq_region_id
 WHERE attrib_type.code IN ('patch_novel', 'patch_fix');
2 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Hive/Config/genome_annotation.ini:
--------------------------------------------------------------------------------
1 | assembly_accessions=[]
2 | output_path=
3 | release_number=
4 | email_address=
5 | genebuilder_id=
6 | pipeline_name=
7 | user_r=
8 | user_w=
9 | password=
10 | pipe_db_host=
11 | databases_host=
12 | dna_db_host=
13 | pipe_db_port=
14 | databases_port=
15 | dna_db_port=
16 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/seq_region.txt:
--------------------------------------------------------------------------------
1 | 1 1 1 21295481
2 | 2 contig_1 2 42060
3 | 3 contig_2 2 43299
4 | 4 contig_3 2 13106
5 | 5 contig_4 2 20932
6 | 6 contig_5 2 10928
7 | 7 contig_6 2 17392
8 | 8 contig_7 2 7483
9 | 9 contig_8 2 26761
10 | 10 contig_9 2 149864
11 | 11 contig_10 2 16265
12 | 12 contig_11 2 22067
13 | 13 contig_12 2 18568
14 | 14 contig_13 2 72768
15 |
--------------------------------------------------------------------------------
/modules/t/MultiTestDB.conf.default:
--------------------------------------------------------------------------------
1 | # This provides a default set of configurations which is merged with your
2 | # provided MultiTestDB.conf.
3 | #
4 | # You should provide a MultiTestDB.conf which defines the databases
5 |
6 | {
7 | 'databases' => {
8 | 'homo_sapiens' => {
9 | 'core' => 'Bio::EnsEMBL::DBSQL::DBAdaptor',
10 | },
11 | 'pararge_aegeria' => {
12 | 'core' => 'Bio::EnsEMBL::DBSQL::DBAdaptor',
13 | },
14 | },
15 | }
16 |
--------------------------------------------------------------------------------
/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Requirements
2 | When creating your Pull request, please fill out the template below:
3 |
4 | # PR details
5 | _Is this a fix/ update/ new feature?_
6 |
7 | _Include a short description_
8 |
9 | _Include links to JIRA tickets_
10 |
11 | # Testing
12 | _Have you tested it?_
13 |
14 | # Assign to the weekly GitHub reviewer
15 | _If you are a member of Ensembl, please check the Genebuild weekly Rotas and assign this week's GitHub reviewer to the PR_
16 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/assembly.txt:
--------------------------------------------------------------------------------
1 | 1 8 1190846 1198328 1 7483 1
2 | 1 9 1254816 1281576 1 26761 1
3 | 1 10 1551975 1701838 1 149864 1
4 | 1 11 2139403 2155667 1 16265 1
5 | 1 12 2914577 2936643 1 22067 1
6 | 1 2 3594096 3636155 1 42060 1
7 | 1 3 7512206 7555504 1 43299 1
8 | 1 4 7747315 7760420 1 13106 1
9 | 1 5 9996681 10017612 1 20932 1
10 | 1 6 13645738 13656665 1 10928 1
11 | 1 7 15616608 15633999 1 17392 1
12 | 1 13 13399324 13417891 1 18568 1
13 | 1 14 14122744 14195511 1 72768 1
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | blib/
2 | .build/
3 | _build/
4 | cover_db/
5 | inc/
6 | Build
7 | !Build/
8 | Build.bat
9 | .last_cover_stats
10 | MANIFEST.bak
11 | META.yml
12 | MYMETA.yml
13 | nytprof.out
14 | pm_to_blib
15 | .DS_Store
16 | Thumbs.db
17 | *.swp
18 | *.swo
19 | *~
20 | \#*\#
21 | /.emacs.desktop
22 | /.emacs.desktop.lock
# Byte-compiled Emacs Lisp files (a bare ".elc" would only match a file with that exact name)
*.elc
24 | auto-save-list
25 | tramp
26 | .\#*
27 | # Org-mode
28 | .org-id-locations
29 | *_archive
30 | a.out
31 | *.o
32 | *.obj
33 | *.class
34 | modules/t/MultiTestDB.conf
35 | modules/Bio/EnsEMBL/Analysis/Config/General.pm
36 |
--------------------------------------------------------------------------------
/scripts/imgt/kill_list.txt:
--------------------------------------------------------------------------------
1 | AF062232 human; heavy chain orphon (chr15) not annotated as such
2 | AF062120 human; heavy chain orphon (chr15) not annotated as such
3 | HSIGHZF human; heavy chain orphon (chr15) not annotated as such
4 | HSIGVH441 human; heavy chain orphon (chr15) not annotated as such
5 | HSIGHXX27 human; heavy chain orphon (chr15) not annotated as such
6 | HSIGV79 human; heavy chain orphon (chr15) not annotated as such
7 | HSIGLC16 human; light chain orphon (chr22, distal). Probable pseudo
8 | MM07554 mouse; C-REGION contains J-REGION
9 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/external_db.txt:
--------------------------------------------------------------------------------
1 | 1830 RefSeq_genomic \N KNOWN 193 RefSeq Genomic MISC \N \N This external_db_id can be used in the seq_region_synonym table. For species such as human, cow, dog we store chromosome names (1-22, X,Y) in the name column of the seq_region table. The RefSeq_genomic is stored as a synonym. eg. NC_000011.10, NT_187365.1
2 | 50710 INSDC \N KNOWNXREF 5 International Nucleotide Sequence Database Collaboration MISC INSDC \N This external_db_id was initially made for use in the seq_region_synonym table. For species such as human, cow, dog we store chromosome names (1-22, X,Y) in the name column of the seq_region table. The INSDC accession is stored as a synonym.
3 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Tools/BlastDBTracking/Entry.pm:
--------------------------------------------------------------------------------
package Bio::EnsEMBL::Analysis::Tools::BlastDBTracking::Entry;

# Read-only Moose value object holding one set of tracking fields.
# NOTE(review): presumably one record per tracked BLAST database file -
# confirm against the code that constructs these objects.
#
# Required constructor arguments:
#   filename        (Str)
#   version         (Str)
# Optional constructor arguments:
#   sanger_version  (Int)
#   installation    (Int)
#   count           (Int)
#   checksum        (Str)
#   from_file       (Bool)

use warnings ;
use strict ;
use namespace::autoclean;
use Moose;

has filename       => ( is => 'ro', isa => 'Str', required => 1 );
has version        => ( is => 'ro', isa => 'Str', required => 1 );
has sanger_version => ( is => 'ro', isa => 'Int', );
has installation   => ( is => 'ro', isa => 'Int', );
has count          => ( is => 'ro', isa => 'Int', );
has checksum       => ( is => 'ro', isa => 'Str', );
has from_file      => ( is => 'ro', isa => 'Bool', );

# Freeze the metaclass: the class is fully declared above, and an immutable
# class gets an inlined (faster) constructor and accessors.
__PACKAGE__->meta->make_immutable;

1;
17 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/attrib_type.txt:
--------------------------------------------------------------------------------
1 | 6 toplevel Top Level Top Level Non-Redundant Sequence Region
2 | 367 karyotype_rank Rank in the karyotype For a given seq_region, if it is part of the species karyotype, will indicate its rank
3 | 554 is_canonical Ensembl Canonical This transcript is the chosen canonical for its gene. For protein-coding genes, this is the MANE_Select transcript if there is one. If not, the canonical transcript is chosen by a pipeline that takes into account several criteria including transcript support (TSL), functional importance (APPRIS), representation in RefSeq and UniProt databases, length and coverage of pathogenic variants, where available. For non protein-coding genes, it is usually the longest transcript with the same biotype as its parent gene.
4 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveRunExternalCmd.pm:
--------------------------------------------------------------------------------
1 | package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveRunExternalCmd;
2 |
3 | use strict;
4 | use warnings;
5 |
6 | use base ('Bio::EnsEMBL::Hive::Process');
7 | use Bio::EnsEMBL::Compara::Utils::RunCommand;
8 |
# Default parameter values for this runnable.
# Returns a hash reference with a single key, 'cmd' (the command to run),
# which defaults to undef.
sub param_defaults {
    my %defaults = (
        'cmd' => undef,    # command to run
    );
    return \%defaults;
}
14 |
# Execute the command given in the 'cmd' parameter (fetched with
# param_required) through Compara's RunCommand, asking it to die on failure.
# The captured standard output, with its trailing newline removed, is stored
# in the 'stdout' param and the captured standard error in the 'stderr' param.
sub run {
    my ($self) = @_;

    my $command = $self->param_required('cmd');

    my $runner = Bio::EnsEMBL::Compara::Utils::RunCommand->new_and_exec(
        $command,
        { die_on_failure => 1 },
    );

    # Keep stdout as a hive param so it can be used in flow_into
    my $captured_out = $runner->out;
    chomp $captured_out;

    $self->param('stdout', $captured_out);
    $self->param('stderr', $runner->err);
}
29 |
# Flow the value of the 'stdout' param on branch 1, under the key 'stdout',
# so that it is available as #stdout#.
sub write_output {
    my ($self) = @_;

    my %output_id = ( stdout => $self->param('stdout') );
    $self->dataflow_output_id(\%output_id, 1);
}
36 |
37 | 1;
38 |
--------------------------------------------------------------------------------
/modules/t/repeatcoverage.t:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | use strict;
17 | use warnings;
18 |
19 | use Test::More;
20 |
# Smoke test: the module under test must at least load.
my $module_under_test = 'Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveRepeatCoverage';
use_ok($module_under_test);

# Placeholder block: no assertions yet, it only emits the TODO reason as a note.
TODO: {
    local $TODO = 'Proper tests needed';
    note $TODO;
}

done_testing;
29 |
--------------------------------------------------------------------------------
/modules/t/hiveassemblycomponents_rb.t:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | use strict;
17 | use warnings;
18 |
19 | use Test::More;
20 |
# Smoke test: the module under test must at least load.
my $module_under_test = 'Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveLoadAssemblyComponents';
use_ok($module_under_test);

# Placeholder block: no assertions yet, it only emits the TODO reason as a note.
TODO: {
    local $TODO = 'Proper tests needed';
    note $TODO;
}

done_testing;
29 |
--------------------------------------------------------------------------------
/modules/t/hiveloadgenomesequences_rb.t:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | use strict;
17 | use warnings;
18 |
19 | use Test::More;
20 |
# Smoke test: the module under test must at least load.
my $module_under_test = 'Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveLoadGenomeSequences';
use_ok($module_under_test);

# Placeholder block: no assertions yet, it only emits the TODO reason as a note.
TODO: {
    local $TODO = 'Proper tests needed';
    note $TODO;
}

done_testing;
29 |
--------------------------------------------------------------------------------
/modules/t/hiveprocessassemblyreport_rb.t:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | use strict;
17 | use warnings;
18 |
19 | use Test::More;
20 |
# Smoke test: the module under test must at least load.
my $module_under_test = 'Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveProcessAssemblyReport';
use_ok($module_under_test);

# Placeholder block: no assertions yet, it only emits the TODO reason as a note.
TODO: {
    local $TODO = 'Proper tests needed';
    note $TODO;
}

done_testing;
29 |
--------------------------------------------------------------------------------
/modules/t/test-genome-DBs/pararge_aegeria/core/analysis_description.txt:
--------------------------------------------------------------------------------
1 | 1 Annotation produced by the Ensembl genebuild. Ensembl 1 {"caption": "Genes (Ensembl)", "colour_key": "[biotype]", "default": {"MultiBottom": "collapsed_label", "MultiTop": "gene_label", "alignsliceviewbottom": "as_collapsed_label", "contigviewbottom": "transcript_label", "contigviewtop": "gene_label", "cytoview": "gene_label"}, "key": "ensembl", "label_key": "[biotype]", "multi_name": "Ensembl genes"}
2 | 2 Non-coding RNAs (ncRNAs) predicted using sequences from RFAM and miRBase. See article. ncRNAs 1 {"caption": "Genes (Ensembl)", "colour_key": "[biotype]", "default": {"MultiBottom": "collapsed_label", "MultiTop": "gene_label", "alignsliceviewbottom": "as_collapsed_label", "contigviewbottom": "transcript_label", "contigviewtop": "gene_label", "cytoview": "gene_label"}, "key": "ensembl", "label_key": "[biotype]", "multi_name": "Ensembl genes"}
3 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositePattern.pm.retired:
--------------------------------------------------------------------------------
1 |
2 | =pod
3 |
4 | =head1 NAME
5 |
6 | Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern
7 |
8 | =head1 SYNOPSIS
9 |
10 | my $tmhmm = Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern->
11 | new (
12 | -db => $db,
13 | -input_id => $input_id,
14 | -analysis => $analysis
15 | );
16 | $tmhmm->fetch_input; # gets sequence from DB
17 | $tmhmm->run;
18 | $tmhmm->write_output; # writes features to DB
19 |
20 | =head1 DESCRIPTION
21 |
22 | =cut
23 |
24 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositePattern;
25 |
26 | use strict;
27 | use vars qw(@ISA);
28 |
29 |
30 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern;
31 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
32 |
33 |
34 | @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);
35 |
36 |
# Run the parent class's fetch_input with the same arguments, then build a
# PrositePattern runnable from this object's query, analysis and the
# key/value pairs of parameters_hash, and register it via runnable().
sub fetch_input {
    my $self = shift;
    my @args = @_;

    $self->SUPER::fetch_input(@args);

    # Kept as a flat list (not a hash) so the constructor sees the arguments
    # in the same order as before.
    my @runnable_args = (
        -query    => $self->query,
        -analysis => $self->analysis,
        %{ $self->parameters_hash },
    );

    my $runnable =
        Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositePattern->new(@runnable_args);

    $self->runnable($runnable);
}
49 |
50 | 1;
51 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Tools/IMGT/Seq/RichSeqIMGT.pm:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | package Bio::EnsEMBL::Analysis::Tools::IMGT::Seq::RichSeqIMGT;
17 | use warnings ;
18 | use strict;
19 |
20 | use base qw(Bio::Seq::RichSeq);
21 |
22 |
# Constructor. Delegates to the parent constructor with all arguments, then
# extracts the DATA_CLASS argument (via _rearrange) and, when it is defined,
# stores it through data_class(). Returns the new object.
sub new {
    my $caller = shift;
    my @params = @_;

    # standard new call..
    my $seq = $caller->SUPER::new(@params);

    my ($data_class) = $seq->_rearrange( ['DATA_CLASS'], @params );
    $seq->data_class($data_class) if defined $data_class;

    return $seq;
}
36 |
37 |
# Combined getter/setter for the '_data_class' slot.
# With an argument (even undef) the slot is overwritten first; the current
# slot value is always returned.
sub data_class {
    my ( $self, @rest ) = @_;

    $self->{'_data_class'} = $rest[0] if @rest;

    return $self->{'_data_class'};
}
47 |
48 |
49 | 1;
50 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/Hamap_wormbase.pm:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::Hamap_wormbase;
18 | use warnings ;
19 | use vars qw(@ISA);
20 |
21 | use strict;
22 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
23 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Hamap_wormbase;
24 |
25 | @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);
26 |
27 |
# Run the parent class's fetch_input with the same arguments, then build a
# Hamap_wormbase runnable from this object's query and analysis and register
# it via runnable().
sub fetch_input {
    my $self = shift;
    my @args = @_;

    $self->SUPER::fetch_input(@args);

    my @runnable_args = (
        -query    => $self->query,
        -analysis => $self->analysis,
    );

    my $runnable =
        Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Hamap_wormbase->new(@runnable_args);

    $self->runnable($runnable);
}
38 |
39 |
40 | 1;
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/sql/repeat_db_tables.sql:
--------------------------------------------------------------------------------
1 |
-- One row per assembly: an auto-incremented id, the GCA accession string
-- and the id of the species it belongs to.
CREATE TABLE assembly (

  assembly_id   INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  gca           VARCHAR(14) NOT NULL,
  species_id    INT(10) NOT NULL,

  -- No comma after the last definition: a trailing comma is a syntax error.
  PRIMARY KEY (assembly_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
11 |
12 |
-- One row per species: an auto-incremented id, the taxon id, a common name
-- and a group name.
CREATE TABLE species (

  species_id    INT(10) NOT NULL AUTO_INCREMENT,
  taxon_id      INT(10) UNSIGNED NOT NULL,
  common_name   VARCHAR(40) NOT NULL,
  group_name    VARCHAR(40) NOT NULL,

  -- No comma after the last definition: a trailing comma is a syntax error.
  PRIMARY KEY (species_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
23 |
24 |
-- Links a repeat class to the species and assembly it was recorded for,
-- one row per repeat_sequence_id.
-- NOTE(review): species_id is UNSIGNED here but signed in the species and
-- assembly tables; the column types are left untouched -- confirm which
-- signedness is intended before aligning them.
CREATE TABLE repeat_sequence (

  repeat_sequence_id   INT(10) NOT NULL AUTO_INCREMENT,
  repeat_class_id      INT(10) NOT NULL,
  species_id           INT(10) UNSIGNED NOT NULL,
  assembly_id          INT(10) UNSIGNED NOT NULL,

  -- No trailing comma after the last definition: MySQL rejects it.
  PRIMARY KEY (repeat_sequence_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
35 |
36 |
-- Repeat definitions: name, class, type and the repeat sequence itself,
-- one row per repeat_class_id. Name, class and type are each indexed for
-- lookups.
CREATE TABLE repeat_class (

  repeat_class_id   INT(10) NOT NULL AUTO_INCREMENT,
  repeat_name       VARCHAR(255) NOT NULL,
  repeat_class      VARCHAR(100) NOT NULL,
  repeat_type       VARCHAR(40) NOT NULL,
  repeat_sequence   LONGTEXT NOT NULL,

  PRIMARY KEY (repeat_class_id),
  KEY name (repeat_name),
  KEY class (repeat_class),
  -- No trailing comma after the last definition: MySQL rejects it.
  KEY type (repeat_type)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
51 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositeProfile.pm:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | =head1 NAME
17 |
18 | =head1 SYNOPSIS
19 |
20 | =head1 DESCRIPTION
21 |
22 | =cut
23 |
24 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositeProfile;
25 | use warnings ;
26 | use vars qw(@ISA);
27 |
28 | use strict;
29 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
30 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile;
31 |
32 | @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);
33 |
34 |
# Prepare the job: delegate to the parent class' fetch_input first, then
# create the PrositeProfile runnable for this object's query and analysis
# and hand it to runnable().
sub fetch_input {
  my $self = shift;

  # Any extra arguments go straight through to the parent implementation.
  $self->SUPER::fetch_input(@_);

  my %runnable_args = (
    -query    => $self->query,
    -analysis => $self->analysis,
  );
  my $profile_runnable =
    Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile->new(%runnable_args);

  return $self->runnable($profile_runnable);
}
45 |
46 |
47 | 1;
48 |
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveCreateFastqDownloadJobs.pm:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | package Bio::EnsEMBL::Analysis::Hive::RunnableDB::HiveCreateFastqDownloadJobs;
19 |
20 | use strict;
21 | use warnings;
22 | use feature 'say';
23 |
24 | use base ('Bio::EnsEMBL::Hive::RunnableDB::JobFactory');
25 |
=head2 write_output

 Arg [1]    : None
 Description: Reads the tab-delimited file named by the 'inputfile' parameter
              and takes the fourth column of every line. For each non-empty
              value one output id of the form {iid => <value>} is created and
              all of them are flowed on the branch given by the
              'fan_branch_code' parameter.
              The column extraction mirrors `cut -f4`, which this code used to
              shell out to: a line without any tab is used whole, and a line
              with tabs but fewer than four columns yields nothing.
 Returntype : None meaningful (the value of the dataflow_output_id call)
 Exceptions : Throws if 'inputfile' is not set or the file cannot be read

=cut


sub write_output {
  my ($self) = @_;

  my $inputfile = $self->param_required('inputfile');

  # Parse the file in Perl rather than through the shell: no dependency on
  # bash-only $'\t' quoting, and file names with spaces or shell
  # metacharacters are safe.
  open(my $input_fh, '<', $inputfile)
    or die "Could not open '$inputfile' for reading: $!";

  my @output_ids;
  while (my $line = <$input_fh>) {
    chomp $line;

    my $fastq;
    if (index($line, "\t") < 0) {
      # Same as cut: a line without the delimiter is passed through whole.
      $fastq = $line;
    }
    else {
      # Limit -1 keeps trailing empty columns so the field positions are stable.
      $fastq = (split(/\t/, $line, -1))[3] // '';
    }

    if ($fastq ne "") {
      push(@output_ids, {iid => $fastq});
    }
  }
  close($input_fh) or die "Could not close '$inputfile': $!";

  $self->dataflow_output_id(\@output_ids, $self->param('fan_branch_code'));
}
50 |
51 | 1;
52 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/RunnableDB/ProteinAnnotation/PrositeProfile_wormbase.pm:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | =head1 NAME
17 |
18 | =head1 SYNOPSIS
19 |
20 | =head1 DESCRIPTION
21 |
22 | =cut
23 |
24 | package Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation::PrositeProfile_wormbase;
25 | use warnings ;
26 | use vars qw(@ISA);
27 |
28 | use strict;
29 | use Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation;
30 | use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile_wormbase;
31 |
32 | @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::ProteinAnnotation);
33 |
34 |
# Prepare the job: call the parent class' fetch_input, then instantiate the
# PrositeProfile (WormBase flavour) runnable with this object's query and
# analysis and store it via runnable().
sub fetch_input {
  my $self = shift;

  # The caller's remaining arguments are passed on unchanged.
  $self->SUPER::fetch_input(@_);

  my $runnable_class =
    'Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::PrositeProfile_wormbase';

  return $self->runnable(
    $runnable_class->new(
      -query    => $self->query,
      -analysis => $self->analysis,
    )
  );
}
45 |
46 |
47 | 1;
48 |
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/modules/t/prepare_local_tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
# Prepares a local checkout for running the test suite:
#   1. reminds the user to create MultiTestDB.conf next to this script;
#   2. locates a source directory, either ../ensembl/$BASEDIR or an
#      ensembl/modules entry found in PERL5LIB;
#   3. copies everything found there into $BASEDIR of this checkout.

CURRENT_DIR=$(dirname "$0")
if [ ! -e "$CURRENT_DIR/MultiTestDB.conf" ]; then
  echo "You should create a file $CURRENT_DIR/MultiTestDB.conf containing connection details to a database:"
  # Print a template of the expected configuration. The script carries on
  # afterwards; the missing file is only reported, not treated as fatal.
  cat <<EOF
{
  port => 3306,
  user => EHIVE_USER,
  pass => EHIVE_PASS,
  host => HOST,
  driver => 'mysql',
}
EOF
  echo
fi

BASEDIR="modules/t/test-genome-DBs/homo_sapiens/core"
ENSEMBLDIR="../ensembl/$BASEDIR"
if [ ! -e "$ENSEMBLDIR" ]; then
  if [ -n "$PERL5LIB" ]; then
    # Split PERL5LIB on ':' in the shell itself; this avoids relying on
    # sed understanding '\n' in a replacement, which is not portable.
    IFS=':' read -r -a PERL5LIB_DIRS <<< "$PERL5LIB"
    for D in "${PERL5LIB_DIRS[@]}"; do
      # The last entry containing "ensembl/modules" wins.
      if [[ "$D" == *ensembl/modules* ]]; then
        # NOTE(review): this selects the ensembl modules directory itself,
        # not its t/test-genome-DBs/homo_sapiens/core sub-directory, so the
        # copy loop below copies the contents of that modules directory --
        # confirm this is intended.
        ENSEMBLDIR=$D
      fi
    done
  else
    printf "\033[31mPERL5LIB is not set\033[0m\n"
    exit 1
  fi
fi

if [ ! -e "$BASEDIR" ]; then
  mkdir -p "$BASEDIR"
fi

for F in "${ENSEMBLDIR}"/*; do
  # We also want the SQLite table in case we start testing it too
  cp -r "$F" "$BASEDIR"
done
55 |
--------------------------------------------------------------------------------
/modules/Bio/EnsEMBL/Analysis/Tools/SoftwareConfigLoad.pm:
--------------------------------------------------------------------------------
1 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | package Bio::EnsEMBL::Analysis::Tools::SoftwareConfigLoad;
17 |
18 | use strict;
19 | use warnings;
20 | use JSON;
21 | use File::Basename;
22 | use File::Spec;
23 | use Exporter 'import';
24 | our @EXPORT_OK = qw(get_software_path);
25 |
# JSON file with the software paths; it is looked up next to this module.
my $config_file = File::Spec->catfile(dirname(__FILE__), 'SoftwareConfig.json'); # Find suitable location

# Decoded content of $config_file. It is filled on the first call to
# get_software_path() and reused afterwards, so the file is read and parsed
# once per process; later changes to the file are not picked up.
my $config;

# Read $config_file and return the decoded JSON structure.
# Dies if the file cannot be opened; decode_json dies on malformed JSON.
sub _load_software_config {
  open(my $fh, '<', $config_file)
    or die "Could not open config file '$config_file': $!";
  my $json_text = do { local $/; <$fh> };
  close($fh);

  return decode_json($json_text);
}

# Return the path configured for a tool.
#
# Arg [1] : software type, a key of the "software_paths" section of the config
# Arg [2] : tool name, a key within that software type
# Returns : the value stored under software_paths -> type -> tool
# Dies    : when the software type or the tool is missing from the config;
#           the message lists the available keys in sorted order.
sub get_software_path {
  my ($software_type, $tool) = @_;

  $config //= _load_software_config();
  my $software_paths = $config->{software_paths} // {};

  # Validate inputs
  unless ($software_type && exists $software_paths->{$software_type}) {
    # "// ''" keeps the message readable (and warning free) for undef input.
    die "Software type '" . ($software_type // '') . "' not found in config. Available types: "
      . join(", ", sort keys %{$software_paths}) . "\n";
  }

  my $tools = $software_paths->{$software_type};
  unless (defined $tool && exists $tools->{$tool}) {
    die "Tool '" . ($tool // '') . "' not found for software type '$software_type'. Available tools: "
      . join(", ", sort keys %{$tools}) . "\n";
  }

  return $tools->{$tool};
}

# A module must end with a true value; do not rely on the last statement.
1;
52 |
--------------------------------------------------------------------------------
/scripts/cdna_update/find_N.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings ;
3 | use strict;
4 |
5 |
6 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
7 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 |
21 |
22 | #script to parse a fasta file and identify sequences with large strings of 'N's
23 |
24 | #perl find_N.pl missing_fasta.out >many_n.out
25 |
my $percent = 2; #percentage of sequence which must be consecutive Ns
my $total_percent = 5 * $percent; #total Ns

my $data = $ARGV[0];
die "Usage: perl find_N.pl <fasta_file>\n" unless defined $data;

# Read the file one FASTA record at a time: records are separated by a
# newline followed by '>'.
local $/ = "\n>";

open(my $fasta_fh, '<', $data) or die ("Can't read $data $! \n");

while (my $record = <$fasta_fh>) {
  #have a sequence:

  # Drop the '>' markers (the leading one of the first record and the
  # trailing one left over from the record separator).
  $record =~ s/>//g;

  # The length is taken over the whole record, header line included.
  my $len = length $record;
  my $max_n = sprintf "%.0f", (($len / 100) * $percent); #threshold number of Ns which we want to flag
  my $n_total = 0;

  # The identifier is the leading run of word characters and dots.
  my $name;
  if ($record =~ /^([\w\.]+)\s+[\w\s]+/) {
    $name = $1;
  }

  # NOTE(review): $n_total is a count of N characters, yet it is compared
  # with $total_percent, which is described as a percentage -- confirm the
  # intended threshold. The comparison is kept exactly as it was.
  while ($record =~ /(N+)/g) { #will match greedily
    my $run_length = length $1;
    $n_total += $run_length;
    if ($run_length >= $max_n && $n_total >= $total_percent) {
      # An unparsable header still gives an (empty) line, without a warning.
      print(($name // ''), "\n"); #print the seq id
      last;
    }
  }
}

close($fasta_fh);
65 |
66 |
67 |
--------------------------------------------------------------------------------
/scripts/genebuild/parse_embl_cds2uniprotkb.pl:
--------------------------------------------------------------------------------
1 | # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
2 | # Copyright [2016-2024] EMBL-European Bioinformatics Institute
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | #!/usr/bin/env perl
17 |
18 | use strict;
19 | use warnings;
20 |
21 | use Getopt::Long;
22 |
23 | my $fasta_file;
24 | my $map_file;
25 | my $edited_fasta_file;
26 |
27 | &GetOptions(
28 | 'fasta_file:s' => \$fasta_file,
29 | 'map_file:s' => \$map_file,
30 | 'edited_fasta_file:s' => \$edited_fasta_file
31 | );
32 |
33 | my %map_ids;
34 |
35 | open (EFF,">".$edited_fasta_file) || die "Could not open edited_fasta_file for writing\n";
36 | open (MAP, "$map_file") or die "Can't open ".$map_file."\n";
37 |
38 | while(