├── LICENSE.md
├── README.md
├── python
    ├── base
    │   ├── blastout.py
    │   ├── blastout.pyc
    │   ├── fasta.py
    │   ├── fasta.pyc
    │   ├── gff3.py
    │   ├── gff3.pyc
    │   ├── goterm.py
    │   ├── goterm.pyc
    │   ├── low.py
    │   ├── low.pyc
    │   ├── misa.py
    │   ├── misa.pyc
    │   ├── muscle.py
    │   ├── muscle.pyc
    │   ├── needlemanwunsch.py
    │   ├── needlemanwunsch.pyc
    │   ├── newick.py
    │   ├── newick.pyc
    │   ├── orthocluster.py
    │   ├── orthocluster.pyc
    │   ├── orthomcl.py
    │   ├── orthomcl.pyc
    │   ├── pfam.py
    │   ├── pfam.pyc
    │   ├── sciroko.py
    │   ├── sciroko.pyc
    │   ├── stats.py
    │   └── stats.pyc
    ├── blast
    │   ├── benchmark_blast.py
    │   ├── blast-best-hit-per-query.py
    │   ├── cluster-paralogs.py
    │   ├── cluster_sequences.py
    │   ├── homology_blast.py
    │   ├── paralogs-from-selfblast.py
    │   ├── parse-best-blast-hit.py
    │   ├── parse-blastout-xml.py
    │   ├── parse_blast_annotate.py
    │   ├── parse_blast_out2.py
    │   ├── parse_blast_out3.py
    │   └── remove-from-blastout.py
    ├── fasta
    │   ├── assembly-stats.py
    │   ├── concatenate-alignments.py
    │   ├── create-clusters.py
    │   ├── create_fasta_clusters.py
    │   ├── fasta-extract-fragment.py
    │   ├── fasta-length-per-file.py
    │   ├── fasta-length-per-sequence.py
    │   ├── fasta-length-stats.py
    │   ├── fasta-length.py
    │   ├── fasta-sort.py
    │   ├── fasta-starts-with-meth.py
    │   ├── fasta-to-swapsc-input.py
    │   ├── fasta2flat.py
    │   ├── fasta2phylip.py
    │   ├── fastasplit.py
    │   ├── gc-content-from-fasta.py
    │   ├── generate-fasta-aa-nt.py
    │   ├── get-all-possible-translations.py
    │   ├── get-cluster-sequences.py
    │   ├── get-sequence-between-genes.py
    │   ├── import-fasta-sequence.py
    │   ├── index-fasta.py
    │   ├── reduce_fasta_file.py
    │   ├── remove-from-fasta.py
    │   ├── remove-stopcodons-from-fasta.py
    │   ├── rename-fasta-sequences.py
    │   ├── rename-geneids.py
    │   ├── seqlength.py
    │   ├── stockholm-to-fasta.py
    │   ├── translatedprot_from_gb_to_fasta.py
    │   └── uniprot-dat-to-fasta.py
    ├── geneontology
    │   ├── add-parental-go-terms.py
    │   ├── go-enrichment.py
    │   ├── go-enrichment2.py
    │   ├── go-from-blastout.py
    │   ├── go2slim.py
    │   ├── goflat2grouptable.py
    │   ├── goflat2topgo.py
    │   ├── goid2name-from-obo-xml.py
    │   ├── goid2name-from-obo.py
    │   ├── goodness-of-fit.py
    │   └── goterms-to-xdom.py
    ├── generic
    │   ├── add-basename-as-first-col.py
    │   ├── add-species-as-first-col.py
    │   ├── add_to_xdom.py
    │   ├── addid2xdom.py
    │   ├── difference.py
    │   ├── flat-split-by-lines.py
    │   ├── flat2line.py
    │   ├── flat2matrix.py
    │   ├── flat2sqlinject.py
    │   ├── flat2xdom.py
    │   ├── grab-columns.py
    │   ├── intersection.py
    │   ├── map.py
    │   ├── parallel-processes.py
    │   ├── search-replace.py
    │   ├── subtract.py
    │   ├── text2range.py
    │   ├── xdom2flat.py
    │   └── z-score-stats.py
    ├── gff
    │   ├── droso-chromosome-reconstruction.py
    │   ├── droso-introns-exons.py
    │   ├── get-missing-exons.py
    │   ├── gff2orthocluster.py
    │   ├── intra-and-intergenic-orthologous-regions.py
    │   ├── overlapping-cds-from-gff.py
    │   ├── plot-genomic-region.py
    │   └── splice-forms-from-gff.py
    ├── kegg
    │   ├── kegg-enzyme2ko.py
    │   ├── kegg-extractor.py
    │   ├── kegg-parser.py
    │   └── kegg2xdom.py
    ├── latex-bibtex
    │   ├── bibtex-number-of-coauthors.py
    │   └── latex-rename.py
    ├── misa
    │   ├── add-features-to-misa.py
    │   ├── add-localization-to-misa.py
    │   ├── exonic-ssrs-to-genes.py
    │   ├── gc-content-from-misa.py
    │   ├── get-transcript-and-protein-per-droso-gene.py
    │   ├── import-into-sqlite3.py
    │   ├── misa-global-stats.py
    │   ├── misa-single-genome-stats.py
    │   ├── orth-all-pairwise-ssr-comparison.py
    │   ├── orth-all-vs-all-ssr-comparison.py
    │   ├── orth-report-conserved-ssrs.py
    │   ├── orth-ssr-comparison.py
    │   ├── ortho-pairwise-exon-intron.py
    │   ├── ortho-pairwise-intra-intergenic.py
    │   ├── qc-orthologous-regions.py
    │   ├── split-compound-ssrs.py
    │   ├── ssr-to-amino-acid.py
    │   ├── ssr-to-pfam.py
    │   └── test
    ├── openreadingframe
    │   ├── ORFPREDICTORRR.py
    │   ├── orf_prediction_part1.py
    │   ├── orf_prediction_part2.py
    │   └── stats_predicted_orfs.py
    ├── orthomcl
    │   ├── add-blasthits-to-cluster.py
    │   ├── build-clusters-nt.py
    │   ├── build-clusters.py
    │   ├── build-counts-table.py
    │   ├── build-orthomcl-like-output.py
    │   ├── cluster2arath.py
    │   ├── geneid2cluster.py
    │   ├── map-orthomcl-clusters.py
    │   ├── orthomcl-blastparse.py
    │   ├── paralogs-per-cluster.py
    │   ├── remove-paralogs.py
    │   ├── speciesids4orthomcl.py
    │   ├── table-of-gene-id-per-cluster.py
    │   └── tree-for-codeml.py
    ├── paml
    │   ├── PAML_Ka_Ks.py
    │   ├── codeml-parallel.py
    │   ├── get-paml-results.py
    │   ├── paml-codeml.py
    │   ├── paml-lrt-bic.py
    │   ├── parse_codeml-modelA.py
    │   ├── parse_codeml.py
    │   └── plot-codeml-model-A-digest.py
    ├── pfam
    │   ├── pfam-domain-counts.py
    │   ├── pfam-filter-output.py
    │   ├── pfam-mapping.py
    │   ├── pfam-pid2arrangement.py
    │   ├── pfam-pid2clan.py
    │   └── pfamtable-from-pid-annotation.py
    ├── phylip
    │   └── create-distance-matrix.py
    ├── sciroko
    │   ├── import-into-sqlite3.py
    │   └── sciroko-single-genome-stats.py
    ├── signalp
    │   └── signalp-report-hits.py
    └── swapsc
    │   ├── evolve4swapsc.py
    │   ├── parse-swapsc.py
    │   ├── swapsc.py
    │   ├── swapsee-table-annotation.py
    │   └── swapsee.py
└── ruby
    ├── geneontology
        ├── go-enrichment-summary.rb
        ├── go-eval.rb
        ├── termcloud-from-go-enrichment.rb
        ├── termcloud-from-go-enrichment2-comp.rb
        ├── termcloud-from-go-enrichment2.rb
        └── termtable-from-go-enrichment2.rb
    ├── generic
        └── wordwrap.rb
    ├── pfam
        ├── hmmout_annotation.rb
        └── length2hmmout.rb
    └── swapsc
        ├── bio-graphics-plot.rb
        └── visualize-swapsc.rb


/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | =====================
 3 | 
 4 | Copyright (c) 2012 Lothar Wissler
 5 | ---------------------------------
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 8 | 
 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10 | 
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | This repository contains a broad, but probably incomplete, collection of scripts I developed for my bioinformatics analyses.
 2 | 
 3 | 
 4 | Organization of files
 5 | ---------------------
 6 | Thus far, *python* and *ruby* scripts are included in separate folders. I have further created subfolders for scripts that relate to specific programs/databases (e.g. Pfam, GeneOntology) or file formats (e.g. fasta, gff). The *base* folder includes sources that may be imported and therefore required by other scripts.
 7 | 
 8 | 
 9 | Documentation
10 | -------------
11 | Documentation is almost non-existent, and where present in scripts, it may be outdated. However, the naming of the scripts themselves, as well as of their subroutines, should make it easy to reconstruct how/when/where each of the programs can be applied.
12 | 
13 | 


--------------------------------------------------------------------------------
/python/base/blastout.py:
--------------------------------------------------------------------------------
 1 | import string
 2 | 
 3 | # =============================================================================
 4 | class BlastHit:
 5 |   def __init__(self, line):
 6 |     cols = line.split("\t")
 7 |     self.qid, self.hid = cols.pop(0), cols.pop(0)
 8 |     self.identity = float(cols.pop(0))
 9 |     self.alnlen = int(cols.pop(0))
10 |     self.mismatch = int(cols.pop(0))
11 |     self.gap = int(cols.pop(0))
12 |     self.qstart = int(cols.pop(0))
13 |     self.qstop = int(cols.pop(0))
14 |     self.hstart = int(cols.pop(0))
15 |     self.hstop = int(cols.pop(0))
16 |     self.evalue = float(cols.pop(0))
17 |     self.score = float(cols.pop(0))
18 |     
19 |   def to_s(self):
20 |     out = []
21 |     out += [self.qid, self.hid, str(self.identity), str(self.alnlen)]
22 |     out += [str(self.mismatch), str(self.gap), str(self.qstart), str(self.qstop)]
23 |     out += [str(self.hstart), str(self.hstop), str(self.evalue), str(self.score)]
24 |     return string.join(out, "\t")
25 | 
26 | # =============================================================================
def get_query_hash(blastoutfile, evalue=10.0):
  """Read a tabular BLAST file and group its hits by query id.

  Empty lines, lines starting with '#', and lines that do not have exactly
  twelve tab-separated columns are skipped. Hits whose e-value is greater
  than `evalue` are dropped.

  Returns a dict mapping query id -> list of BlastHit objects in file order.
  """
  qh = {}
  # `with` guarantees the handle is closed even if a line fails to parse
  with open(blastoutfile) as fo:
    for line in fo:
      line = line.rstrip()
      if len(line) == 0 or line.startswith('#'): continue
      if len(line.split("\t")) != 12: continue
      blasthit = BlastHit(line)
      if blasthit.evalue > evalue: continue
      qh.setdefault(blasthit.qid, []).append(blasthit)
  return qh
39 | 
40 | # =============================================================================
def get_sequence_hash(fastafile):
  """Read a FASTA file into a dict mapping sequence id -> sequence string.

  The id is the first run of non-whitespace characters after '>'. Sequence
  lines are stripped and concatenated. Lines that appear before the first
  header are ignored. When an id occurs more than once, the last record wins.
  """
  # imported locally: this module's file-level imports do not include `re`
  import re
  chunks = {}
  current = None
  with open(fastafile) as fo:
    for line in fo:
      if line.startswith(">"):
        key = re.match(r">(\S+)", line).group(1)
        current = chunks[key] = []
      elif current is not None:
        # collect pieces and join once, instead of re-copying the string per line
        current.append(line.strip())
  return dict((key, "".join(parts)) for key, parts in chunks.items())
54 | 


--------------------------------------------------------------------------------
/python/base/blastout.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/blastout.pyc


--------------------------------------------------------------------------------
/python/base/fasta.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import gzip 
 3 | 
 4 | # =============================================================================
 5 | def get_sequence_hash(fastafile):
 6 |   seqhash = {}
 7 |   key = ""
 8 |   if fastafile.endswith('.gz'): fo = gzip.open(fastafile)
 9 |   else: fo = open(fastafile)
10 |   for line in fo:
11 |     if line.startswith(">"):
12 |       gid = re.match(">(\S+)", line).group(1)
13 |       key = gid
14 |       seqhash[key] = ""
15 |     else:
16 |       if key != "": seqhash[key] += line.strip()
17 |   fo.close()
18 |   return seqhash
19 |   
20 | # =============================================================================
def get_length_hash(fastafile):
  """Return a dict mapping each FASTA sequence id to its total length.

  Files whose name ends in '.gz' are opened with gzip, all others with the
  builtin open. The id is the first run of non-whitespace characters after
  '>'. The length is the sum of the stripped lengths of the lines that
  follow the header. Lines before the first header are ignored.
  """
  lengths = {}
  current = ""
  opener = gzip.open if fastafile.endswith('.gz') else open
  handle = opener(fastafile)
  for record in handle:
    if not record.startswith(">"):
      # sequence line: only counted once a header has been seen
      if current != "": lengths[current] += len(record.strip())
      continue
    current = re.match(">(\S+)", record).group(1)
    lengths[current] = 0
  handle.close()
  return lengths
35 | 


--------------------------------------------------------------------------------
/python/base/fasta.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/fasta.pyc


--------------------------------------------------------------------------------
/python/base/gff3.py:
--------------------------------------------------------------------------------
 1 | import sys, string
 2 | 
 3 | def get_gff_hash(gffile):
 4 |   hash = {}
 5 |   fo = open(gffile)
 6 |   for line in fo:
 7 |     gf = GeneFeature(line)
 8 |     if not hash.has_key(gf.seqid): hash[gf.seqid] = []
 9 |     hash[gf.seqid].append(gf)
10 |   fo.close()
11 |   return hash
12 |   
13 | 
class GeneFeature():
  """A single feature line of a GFF3 file (nine tab-separated columns)."""

  def __init__(self, line):
    """Parse one GFF3 line.

    Prints a diagnostic to stderr and exits with status 1 when the line
    does not have exactly nine tab-separated columns. start and stop are
    converted to int; all other columns are kept as strings.
    """
    columns = line.rstrip().split("\t")
    if not len(columns) == 9:
      sys.stderr.write("GFF3 with incorrect number of columns. Expected: 9 | Observed: %s\n" % len(columns))
      sys.stderr.write("\"%s\"\n" % line)
      sys.exit(1)
    self.seqid = columns[0]
    self.source = columns[1]
    self.ftype = columns[2]
    self.start = int(columns[3])
    self.stop = int(columns[4])
    self.score = columns[5]
    self.strand = columns[6]
    self.phase = columns[7]
    self.attributes = columns[8]

  def _attribute_pairs(self):
    """Return the attribute column as an ordered list of (key, value) tuples."""
    pairs = []
    for e in self.attributes.split(";"):
      if e == '': continue
      # split on the first '=' only, so values may themselves contain '='
      k, v = e.split("=", 1)
      pairs.append((k, v))
    return pairs

  def get_attributes(self):
    """Return the attribute column as a dict of key -> value."""
    return dict(self._attribute_pairs())

  def set_attribute(self, key, value):
    """Set `key` to `value` in the attribute column.

    An existing key is updated in place and the original attribute order is
    kept. A new key is appended. The resulting string always ends with ';'.
    """
    pairs = self._attribute_pairs()
    if key in [k for k, v in pairs]:
      self.attributes = "".join(["%s=%s;" % (k, value if k == key else v) for k, v in pairs])
    else:
      # without this separator "a=b" + "c=d;" would fuse into "a=bc=d;"
      if self.attributes != "" and not self.attributes.endswith(";"):
        self.attributes += ";"
      self.attributes += "%s=%s;" % (key, value)

  def to_string(self):
    """Return the feature as a tab-separated GFF3 line (no trailing newline)."""
    return "\t".join([self.seqid, self.source, self.ftype, str(self.start), str(self.stop), self.score, self.strand, self.phase, self.attributes])
55 | 


--------------------------------------------------------------------------------
/python/base/gff3.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/gff3.pyc


--------------------------------------------------------------------------------
/python/base/goterm.py:
--------------------------------------------------------------------------------
 1 | class GOTerm():
 2 |   def __init__(self, lines):
 3 |     self.id = ""
 4 |     self.name = ""
 5 |     self.namespace = ""
 6 |     self.definition = ""
 7 |     self.is_a = []
 8 |     self.alt_ids = []
 9 |     self.xrefs = []
10 |     self.synonyms = []
11 |     self.obsolete = 0
12 | 
13 |     for line in lines:
14 |       line = line.strip()
15 |       if line.startswith("id: "): self.id = line[line.index(":")+2:]
16 |       if line.startswith("name: "): self.name = line[line.index(":")+2:]
17 |       if line.startswith("namespace: "): self.namespace = line[line.index(":")+2:]
18 |       if line.startswith("def: "): self.definition = line[line.index(":")+2:]
19 |       if line.startswith("is_a: "): self.is_a.append( line[line.index(":")+2:] )
20 |       if line.startswith("is_obsolete: true"): self.obsolete = 1
21 |       if line.startswith("alt_id: "): self.alt_ids.append( line[line.index(":")+2:] )
22 |       if line.startswith("xref: "): self.xrefs.append( line[line.index(":")+2:] )
23 |       if line.startswith("synonym: "): self.synonyms.append( line[line.index(":")+2:] )
24 | 
25 |   def get_id(self): return self.id
26 |   def get_name(self): return self.name
27 |   def get_namespace(self): return self.namespace
28 |   def get_definition(self): return self.definition
29 |   def get_is_a(self): return self.is_a
30 |   def get_is_a_goids(self): return [e.split()[0] for e in self.is_a]
31 |   def get_alt_ids(self): return self.alt_ids
32 |   def get_xrefs(self): return self.xrefs
33 |   def get_synonyms(self): return self.synonyms
34 |   def get_is_obsolete(self): return self.obsolete
35 | 


--------------------------------------------------------------------------------
/python/base/goterm.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/goterm.pyc


--------------------------------------------------------------------------------
/python/base/low.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/low.pyc


--------------------------------------------------------------------------------
/python/base/misa.py:
--------------------------------------------------------------------------------
 1 | import string
 2 | 
 3 | class MisaSSRspecies():
 4 |   def __init__(self, line):
 5 |     self.feature = 0
 6 |     columns = line.rstrip().split("\t")
 7 |     self.species = columns.pop(0)
 8 |     self.geneid = columns.pop(0)
 9 |     self.ssrnr = int(columns.pop(0))
10 |     self.type = columns.pop(0)
11 |     self.pattern = columns.pop(0)
12 |     self.length = int(columns.pop(0))
13 |     self.startpos = int(columns.pop(0))
14 |     self.endpos = int(columns.pop(0))
15 |     if len(columns) > 0: self.feature = columns.pop(0)
16 |     if self.type != "c" and self.type != "c*":
17 |       self.motif = self.pattern[1:self.pattern.index(")")]
18 |       if self.pattern.endswith("*"): self.repeats = int(self.pattern[self.pattern.index(")")+1:-1])
19 |       else: self.repeats = int(self.pattern[self.pattern.index(")")+1:])
20 | 
21 |   def to_s(self):
22 |     array = [self.species, self.geneid, str(self.ssrnr), self.type, self.pattern, str(self.length), str(self.startpos), str(self.endpos)]
23 |     return string.join(array, "\t")
24 | 
25 |   def is_perfect_match_to(self, other):
26 |     if self.pattern != other.pattern: return 0
27 |     return 1
28 | 
29 |   def is_polymorphic_to(self, other):
30 |     if self.motif != other.motif: return 0
31 |     if self.repeats == other.repeats: return 0
32 |     return 1
33 | 
34 |   def is_shifted_to(self, other):
35 |     if self.motif == other.motif: return 0
36 |     if self.type != other.type: return 0
37 |     m = self.motif
38 |     for i in range(len(self.motif)):
39 |       m = m[1:] + m[0]
40 |       if m == other.motif: return 1
41 |     return 0
42 | 
43 | 
class MisaSSR():
  """One SSR record from a tab-separated line.

  Columns read, in order: gene id, SSR number, type, pattern, length,
  start position, end position, and an optional feature column.
  For every type other than "c" and "c*" the pattern is expected to look
  like "(MOTIF)N" or "(MOTIF)N*", and is split into `motif` and `repeats`.
  """

  def __init__(self, line):
    self.feature = 0
    # types "c"/"c*" have no single motif; define the attributes anyway so
    # the comparison methods below never raise AttributeError
    self.motif = None
    self.repeats = None
    columns = line.rstrip().split("\t")
    self.geneid = columns.pop(0)
    self.ssrnr = int(columns.pop(0))
    self.type = columns.pop(0)
    self.pattern = columns.pop(0)
    self.length = int(columns.pop(0))
    self.startpos = int(columns.pop(0))
    self.endpos = int(columns.pop(0))
    if len(columns) > 0: self.feature = columns.pop(0)
    if self.type != "c" and self.type != "c*":
      close = self.pattern.index(")")
      self.motif = self.pattern[1:close]
      count = self.pattern[close+1:]
      # a trailing "*" is a marker, not part of the repeat count
      if count.endswith("*"): count = count[:-1]
      self.repeats = int(count)

  def to_s(self):
    """Return the first seven columns as a tab-separated string (feature is omitted)."""
    array = [self.geneid, str(self.ssrnr), self.type, self.pattern, str(self.length), str(self.startpos), str(self.endpos)]
    return "\t".join(array)

  def is_perfect_match_to(self, other):
    """Return 1 when both records have the identical pattern string, else 0."""
    if self.pattern != other.pattern: return 0
    return 1

  def is_polymorphic_to(self, other):
    """Return 1 when both share the motif but differ in repeat count, else 0."""
    if self.motif is None or other.motif is None: return 0
    if self.motif != other.motif: return 0
    if self.repeats == other.repeats: return 0
    return 1

  def is_shifted_to(self, other):
    """Return 1 when other's motif is a rotation of this motif (same type), else 0."""
    if self.motif is None or other.motif is None: return 0
    if self.motif == other.motif: return 0
    if self.type != other.type: return 0
    m = self.motif
    for i in range(len(self.motif)):
      m = m[1:] + m[0]
      if m == other.motif: return 1
    return 0
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/python/base/misa.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/misa.pyc


--------------------------------------------------------------------------------
/python/base/muscle.py:
--------------------------------------------------------------------------------
 1 | import tempfile, os, fasta
 2 | 
 3 | # =============================================================================
 4 | def align(sequences, ids, outfile=False):
 5 |   h, infile = tempfile.mkstemp()
 6 |   os.close(h)
 7 |   fw = open(infile, 'w')
 8 |   for i in range(len(sequences)): fw.write(">" + ids[i] + "\n" + sequences[i] + "\n")
 9 |   fw.close()
10 |   h, outfile = tempfile.mkstemp()
11 |   os.close(h)
12 |   os.system("muscle -in %s -out %s -quiet 2> /dev/null" %(infile, outfile))
13 |   os.unlink(infile)
14 |   aligned_sequences = []
15 |   alnhash = fasta.get_sequence_hash(outfile)
16 |   for gid in ids: aligned_sequences.append(alnhash[gid])
17 |   os.unlink(outfile)
18 |   return aligned_sequences
19 | 


--------------------------------------------------------------------------------
/python/base/muscle.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/muscle.pyc


--------------------------------------------------------------------------------
/python/base/needlemanwunsch.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | # =============================================================================
 4 | def align(array1, array2, gap = -2, match = 1, mismatch = -1):
 5 |     """Performs Needleman-Wunsch alignment of string1 and string2.
 6 |     Prints out the alignment and returns the array of scores and pointers(arrows).
 7 | 
 8 |     Example usage from an interactive shell:
 9 |         from NeedlemanWunsch import NW
10 |         Scores, Pointers = NW('PELICAN','COELACANTH')
11 | 
12 |     This is modified from a Perl implementation in the book BLAST by Korf, et al.
13 |     """
14 |     # initialize scoring and 'arrow' matrices to 0
15 |     Scores = [[0 for x in range(len(array2)+1)] for y in range(len(array1)+1)]
16 |     Pointers = [[0 for x in range(len(array2)+1)] for y in range(len(array1)+1)]
17 | 
18 |     # initialize borders
19 |     # for pointers (arrows), use 2 for diagonal, -1 for horizontal, and 1 for vertical moves (an arbitrary system).
20 |     # I have tried to consistently use i for rows (vertical positions) in the score and pointer tables, and j for columns (horizontal positions).
21 |     for i in range(len(array1)+1):
22 |         Scores[i][0] = gap*i
23 |         Pointers[i][0] = 1 
24 |     for j in range(len(array2)+1):
25 |         Scores[0][j] = gap*j
26 |         Pointers[0][j] = -1
27 | 
28 |     # fill with scores
29 |     for i in range(1,len(array1)+1):
30 |         for j in range(1,len(array2)+1):
31 |             letter1 = array1[i-1]
32 |             letter2 = array2[j-1]
33 |             if letter1 == letter2: 
34 |               DiagonalScore = Scores[i-1][j-1] + match
35 |             else: DiagonalScore = Scores[i-1][j-1] + mismatch
36 |             HorizontalScore = Scores[i][j-1] + gap 
37 |             UpScore = Scores[i-1][j] + gap
38 |             # TempScores is list of the three scores and their pointers
39 |             TempScores = [[DiagonalScore,2],[HorizontalScore,-1],[UpScore,1]]
40 |             # Now we keep the highest score, and the associated direction (pointer)
41 |             Scores[i][j], Pointers[i][j] = max(TempScores)
42 | 
43 |     # backtrace from the last entry.  
44 |     [i,j] = [len(array1),len(array2)]
45 |     align1 = []
46 |     align2 = []
47 |     while [i,j] != [0,0]:
48 |         if Pointers[i][j] == 2:
49 |             align1.append(array1[i-1])
50 |             align2.append(array2[j-1])
51 |             i = i - 1
52 |             j = j - 1
53 |         elif Pointers[i][j] == -1:
54 |             align1.append('-')
55 |             align2.append(array2[j-1])
56 |             j = j - 1
57 |         else:
58 |             align1.append(array1[i-1])
59 |             align2.append('-')
60 |             i = i - 1
61 | 
62 |     # the alignments have been created backwards, so we need to reverse them:
63 |     align1 = align1[::-1]
64 |     align2 = align2[::-1]
65 | 
66 |     # print out alignment
67 |     #print align1
68 |     #print align2
69 | 
70 |     # in case you want to look at the scores and pointers, the function returns them
71 |     return [Scores,Pointers, [align1, align2]]
72 | 
73 | 


--------------------------------------------------------------------------------
/python/base/needlemanwunsch.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/needlemanwunsch.pyc


--------------------------------------------------------------------------------
/python/base/newick.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/newick.pyc


--------------------------------------------------------------------------------
/python/base/orthocluster.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | # =============================================================================
 4 | def parse(clusterfile, poshash={}):
 5 |   nspecies = 0
 6 |   clusterhash = {}
 7 |   fo = open(clusterfile)
 8 |   while 1:
 9 |     line = fo.readline()
10 |     if not line: break
11 |     if line.startswith("No. of sequence"): nspecies = int(line.split()[-1])
12 |     if line.startswith("CL-"):
13 |       cols = line.split()
14 |       sc = OrthoCluster(cols[0])
15 |       ngenes = int(max(cols[1:nspecies+1]))
16 |       fo.readline()
17 |       for i in range(ngenes):
18 |         cols = fo.readline().split()
19 |         for j in range(nspecies):
20 |           cols.pop(0)
21 |           cols.pop(0)
22 |           strand = cols.pop(0)
23 |           scaffold = cols.pop(0)
24 |           geneid = cols.pop(0)
25 |           if poshash.has_key(geneid):
26 |             startpos, endpos = poshash[geneid][1:3]
27 |           else:
28 |             print >> sys.stderr, "geneid", geneid, "not found in poshash"
29 |             startpos, endpos = None, None
30 |           sr = SyntenicRegion(geneid, scaffold, strand, startpos, endpos)
31 |           sc.add_syntenic_region(sr, j)
32 |       clusterhash[sc.id] = sc
33 |   fo.close()
34 |   return nspecies, clusterhash
35 | 
36 | 
37 | # =============================================================================
class SyntenicRegion():
  """Plain record holding one gene's id, scaffold, strand and start/end position."""

  def __init__(self, geneid, scaffold, strand, startpos, endpos):
    # all values are stored as passed in; no conversion or validation
    self.geneid, self.scaffold, self.strand = geneid, scaffold, strand
    self.startpos, self.endpos = startpos, endpos
45 | 
46 | # =============================================================================
class OrthoCluster():
  """A cluster id together with its syntenic regions, grouped by an index.

  `syntenic_regions` maps index -> list of regions in insertion order.
  """

  def __init__(self, clusterid):
    self.id = clusterid
    self.syntenic_regions = {}

  def add_syntenic_region(self, sr, index):
    """Append `sr` to the list stored under `index`, creating the list if needed."""
    # setdefault replaces dict.has_key(), which Python 3 removed
    self.syntenic_regions.setdefault(index, []).append(sr)
55 | 


--------------------------------------------------------------------------------
/python/base/orthocluster.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/orthocluster.pyc


--------------------------------------------------------------------------------
/python/base/orthomcl.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | class OrthoMCLCluster():
 4 |   def __init__(self, line):
 5 |     descr, genedefs = line.split("\t")
 6 |     genedefs = genedefs.split()
 7 |     self.name = descr[:descr.index('(')].lower()
 8 |     self.geneHash = {}
 9 |     self.speciesHash = {}
10 |     for genedef in genedefs:
11 |       geneid = genedef[:genedef.index('(')]
12 |       species = genedef[genedef.index('(')+1:-1]
13 |       self.geneHash[geneid] = species
14 |       if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid)
15 |       else: self.speciesHash[species] = [geneid]
16 | 
17 |   def add_gene(self, geneid, species):
18 |     if not self.geneHash.has_key(geneid):
19 |       self.speciesHash[species].append(geneid)
20 |       self.geneHash[geneid] = species
21 |   def get_name(self): return self.name
22 |   def get_count(self): return len(self.geneHash)
23 |   def get_gene_hash(self): return self.geneHash
24 |   def get_species_hash(self): return self.speciesHash
25 |   def to_s(self):
26 |     sys.stdout.write(self.name + "(" + str(len(self.geneHash)) + " genes, " + str(len(self.speciesHash)) + ")\t")
27 |     first = 1
28 |     for geneid, species in self.geneHash.iteritems():
29 |       if first == 0: sys.stdout.write(" ")
30 |       first = 0
31 |       sys.stdout.write(geneid + "(" + species + ")")
32 |     sys.stdout.write("\n")
33 | 
34 | 


--------------------------------------------------------------------------------
/python/base/orthomcl.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/orthomcl.pyc


--------------------------------------------------------------------------------
/python/base/pfam.py:
--------------------------------------------------------------------------------
 1 | def read_hmmout(ifile, evalue=10, matchreq=0.0):
 2 |   hash = {}
 3 |   fo = open(ifile)
 4 |   for line in fo:
 5 |     cols = line.strip().split()
 6 |     if len(cols) == 16: 
 7 |       i = []
 8 |       i.append(line.index("\t"))
 9 |       if line.count(" ") > 0: i.append(line.index(" "))
10 |       line = line[min(i):]
11 |     pd = PfamDomain(line)
12 |     if float(pd.get_attr('E-value')) > evalue: continue
13 |     if matchreq > 0 and ((float(pd.get_attr('alignment_end'))-float(pd.get_attr('alignment_start')))/float(pd.get_attr('hmm_length'))) < matchreq: continue
14 |     #print pd.get_attr('seq_id'), pd.get_attr('hmm_name')
15 |     #print pd.get_attr('seq_id')
16 |     if not hash.has_key(pd.get_attr('seq_id')): hash[pd.get_attr('seq_id')] = []
17 |     hash[pd.get_attr('seq_id')].append(pd)
18 |   fo.close()
19 |   return hash
20 |   
21 | 
22 | class PfamDomain():
23 |   def __init__(self, line):
24 |     self.attributes = ['seq_id', 'alignment_start', 'alignment_end', 'envelope_start', 'envelope_end', 'hmm_acc', 'hmm_name', 'type', 'hmm_start', 'hmm_end', 'hmm_length', 'bit_score', 'E-value', 'significance', 'clan']
25 |     line = line.strip()
26 |     self.values =  line.split()
27 | 
28 |   def get_attr(self, name):
29 |     if not name in self.attributes: return ""
30 |     return self.values[ self.attributes.index(name) ]
31 |   
32 |   def covers(self, position):
33 |     position = int(position)
34 |     if int(self.get_attr('alignment_start')) <= position and int(self.get_attr('alignment_end')) >= position:
35 |       return True
36 |     return False
37 | 


--------------------------------------------------------------------------------
/python/base/pfam.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/pfam.pyc


--------------------------------------------------------------------------------
/python/base/sciroko.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/sciroko.pyc


--------------------------------------------------------------------------------
/python/base/stats.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | import rpy2.robjects 
 3 | R = rpy2.robjects.r
 4 | 
 5 | # =============================================================================
 6 | def correlate(x, y, method="pearson"):
 7 |   """
 8 |   performs a correlation between two vectors (assumed floats) and a given 
 9 |   correlation method. returns cor.coefficient and p-value.
10 |   """
11 |   xr = rpy2.robjects.FloatVector(x)
12 |   yr = rpy2.robjects.FloatVector(y)
13 |   res = R['cor.test'](xr, yr, method=method)
14 |   #for i in range(len(res)):
15 |   #  k = res.names[i]
16 |   #  v = res[i]
17 |   #  print i, "|", k, "=", v
18 |   p = res.subset('p.value')[0][0]
19 |   cor = res.subset('estimate')[0][0]
20 |   return cor, p
21 |   
22 | # =============================================================================
23 | def average(array):
24 |   return numpy.average(array)  # numpy.average without weights, i.e. the arithmetic mean
25 |   
26 | # =============================================================================
27 | def median(array):
28 |   return numpy.median(array)  # delegates to numpy.median
29 |   
30 | 
31 | # =============================================================================
32 | def stdev(array):
33 |   return numpy.std(array)  # numpy.std with its defaults (ddof=0: population, not sample, standard deviation)
34 |   
35 | 


--------------------------------------------------------------------------------
/python/base/stats.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lotharwissler/bioinformatics/83a53771222ecb0759e3b4bfa2018d2cd7647643/python/base/stats.pyc


--------------------------------------------------------------------------------
/python/blast/blast-best-hit-per-query.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import getopt      # comand line argument handling
 6 | from low import *  # custom functions, written by myself
 7 | import blastout
 8 | 
 9 | # =============================================================================  
10 | def show_help( ):
11 |   """ displays the program parameter list and usage information """
12 |   stdout( "usage: " + sys.argv[0] + " -f <path>" )
13 |   stdout( " " )
14 |   stdout( " option    description" )
15 |   stdout( " -h        help (this text here)" )
16 |   stdout( " -f        blastout file (-m 8)" )
17 |   stdout( " " )
18 |   sys.exit(1)
19 | 
20 | # =============================================================================
21 | def handle_arguments():
22 |   """ verifies the presence of all necessary arguments and returns the data dir """
23 | 
24 |   if len ( sys.argv ) == 1:
25 |     stderr( "no arguments provided." )
26 |     show_help()  
27 |   
28 |   try: # check for the right arguments
29 |     keys, values = getopt.getopt( sys.argv[1:], "hf:" )
30 |   except getopt.GetoptError:
31 |     stderr( "invalid arguments provided." )
32 |     show_help()
33 | 
34 |   args = {}
35 |   for key, value in keys:
36 |     if key == '-f': args['blastoutfile'] = value
37 |     
38 |   for key in ['blastoutfile']:
39 |     if key.endswith("file"):
40 |       if not args_file_exists(args, key): show_help()
41 |     elif key.endswith("dir"):
42 |       if not args_dir_exists(args, key): show_help()
43 |     elif not args.has_key(key):
44 |       print >> sys.stderr, "missing argument", key
45 |       show_help()
46 |   return args
47 | 
48 | # =============================================================================
49 | def statusbar(current, total, message="", width=40):
50 |   progress = 1.0*current/total
51 |   if message != "": message = "[" + message + "]"
52 |   progressbar = "=" * int(progress*width)
53 |   while len(progressbar) < width: progressbar += " " 
54 |   sys.stderr.write("\r   0% " + progressbar + " 100% " + message)
55 |   if progress == 1.0: sys.stderr.write("\n")
56 |   
57 | 
58 | # =============================================================================
59 | # === MAIN ====================================================================
60 | # =============================================================================
61 | def main( args ):
62 |   q2hits = blastout.get_query_hash(args['blastoutfile'])
63 |   for qid, blasthits in q2hits.iteritems():
64 |     print blasthits[0].to_s()
65 | 
66 | # =============================================================================
67 | args = handle_arguments()  # command line options as a dict
68 | main( args )  # executed whenever the script is run
69 | 
70 | 


--------------------------------------------------------------------------------
/python/blast/parse-best-blast-hit.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 		# low level handling, such as command line stuff
 4 | import string			# string methods available
 5 | import re					# regular expressions
 6 | import getopt			# comand line argument handling
 7 | import math				# match functions
 8 | from low import *	# custom functions, written by myself
 9 | 
10 | 
11 | 
12 | # =============================================================================	
13 | def show_help( ):
14 | 	""" displays the program parameter list and usage information """
15 | 	stdout( "usage: " + sys.argv[0] + " -b <path> [-f <path>]" )
16 | 	stdout( " " )
17 | 	stdout( " option    description" )
18 | 	stdout( " -h        help (this text here)" )
19 | 	stdout( " -f        path to the fasta file containing the record ids." )
20 | 	stdout( " -b        path to the blast.best-hit file of swiss-prot" )
21 | 	stdout( " " )
22 | 	
23 | 	sys.exit(1)
24 | 
25 | 
26 | # =============================================================================
27 | def handle_arguments():
28 | 	""" verifies the presence of all necessary arguments and returns the data dir """
29 | 	if len ( sys.argv ) == 1:
30 | 		stderr( "no arguments provided." )
31 | 		show_help()	
32 | 	
33 | 	try: # check for the right arguments
34 | 		keys, values = getopt.getopt( sys.argv[1:], "hb:f:" )
35 | 	except getopt.GetoptError:
36 | 		stderr( "invalid arguments provided." )
37 | 		show_help()
38 | 		
39 | 	blastbesthitfile, recordfile = '', ''
40 | 	for key, value in keys:
41 | 		if key == '-b':
42 | 			if not file_exists( value ):
43 | 				stderr( "invalid path in " + key )
44 | 				show_help()
45 | 			else:
46 | 				blastbesthitfile = value
47 | 		
48 | 		if key == '-f':
49 | 			if not file_exists( value ):
50 | 				stderr( "invalid path in " + key )
51 | 				show_help()
52 | 			else:
53 | 				recordfile = value
54 | 		
55 | 	if blastbesthitfile == '':
56 | 		stderr( "blast.best-hit file missing." )
57 | 		show_help()
58 | 	elif not file_exists( blastbesthitfile ):
59 | 		stderr( "blast.best-hit file does not exist." )
60 | 		show_help()
61 | 		
62 | 	if recordfile == '':
63 | 		stderr( "recordfile missing." )
64 | 		show_help()
65 | 	elif not file_exists( recordfile ):
66 | 		stderr( "recordfile does not exist." )
67 | 		show_help()
68 | 		
69 | 	return blastbesthitfile, recordfile
70 | 
71 | 
72 | # =============================================================================
73 | def parse_best_blast_hits( blastbesthitfile, recordfile ):
74 | 	""" """
75 | 	
76 | 	records = []
77 | 	fo = open( recordfile, 'r' )
78 | 	for line in fo:
79 | 		records.append(line.strip().replace('\n',''))
80 | 	
81 | 	fo = open( blastbesthitfile, 'r' )
82 | 	for line in fo:
83 | 		columns = line.split()
84 | 		if columns[0] in records:
85 | 			print columns[0]
86 | 			print "   hit   :", string.join(columns[10:], ' ')[1:]
87 | 			print "   evalue:", columns[4], "\n"
88 | 		
89 | 	fo.close()
90 | 	
91 | 	
92 | 
93 | # =============================================================================
94 | # === MAIN ====================================================================
95 | # =============================================================================
96 | 
97 | blastbesthitfile, recordfile = handle_arguments()  # both paths come from the command line
98 | parse_best_blast_hits( blastbesthitfile, recordfile )  # prints the matching hits to stdout
99 | 


--------------------------------------------------------------------------------
/python/blast/parse-blastout-xml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys, getopt, string
 4 | from Bio.Seq import Seq
 5 | from Bio.Blast import NCBIXML
 6 | from Bio.Alphabet import IUPAC
 7 | 
 8 | #==============================================================================
 9 | def show_help():  # prints the help text below to stdout and exits with status 1
10 |   print """%s parses BLASTX XML output to STDOUT
11 |   
12 |   Options:
13 |   -f:\tBLASTX output in XML format
14 |   -n:\tnumber of best hits to be parsed (default: 1)
15 |   -e:\tmaximum e-value to accept hits (default: 1e-5)
16 | 
17 | 	What this program does:
18 | 	It takes the best hit's start and endposition from BLAST, applies it to the sequence in your query (e.g. the CAP3-output),
19 | 	and translates to the left resp. right from the start resp. end of your CAP3-output, until a Start-orStopcodon appears.
20 |   """ % sys.argv[0]
21 | 
22 |   sys.exit(1)  # NOTE(review): main() in this file only prints a hit table; the translation described above is not visible here
23 | 
24 | 
25 | # =============================================================================
26 | def handle_arguments():
27 |   """ verifies the presence of all necessary arguments and returns the data dir """
28 |   if len ( sys.argv ) == 1:
29 |     sys.stderr.write( "no arguments provided.\n" )
30 |     show_help()  
31 |   
32 |   try: # check for the right arguments
33 |     keys, values = getopt.getopt( sys.argv[1:], "hf:n:e:" )
34 |   except getopt.GetoptError:
35 |     sys.stderr.write( "invalid arguments provided.\n" )
36 |     show_help()
37 | 
38 |   args = {}
39 |   args['numhits'] = 1
40 |   args['evalue'] = float('1e-5')
41 |   for key, value in keys:
42 |     if key == '-f': args['blastfile'] = value
43 |     if key == '-n': args['numhits'] = int(value)
44 |     if key == '-e': args['evalue'] = float(value)
45 |     
46 |   if not args.has_key('blastfile'):
47 |     sys.stderr.write( "blastx XML file argument missing.\n" )
48 |     show_help()
49 |   elif not os.path.exists( args.get('blastfile') ) or not os.path.isfile( args.get('blastfile') ):
50 |     sys.stderr.write( "blastx XML file does not exist.\n" )
51 |     show_help()
52 | 
53 |   return args
54 | 
55 | 
56 | #==============================================================================
57 | def main(args):
58 |   #print "Working..."
59 |   header = ['query', 'hit', 'frame', 'query_startpos', 'query_endpos', 'subject_startpos', 'subject_endpos', 'evalue', 'score']
60 |   print '#', string.join(header, "\t")
61 |   XML = open( args.get('blastfile') )
62 |   blast_records = NCBIXML.parse(XML)
63 | 
64 |   for i in blast_records:
65 |   #  print i.query
66 |     count = 0
67 |     while count < args.get('numhits'):
68 |       count += 1
69 |       hit = i.alignments.pop(0)
70 |       hsp = hit.hsps[0]
71 |       if hsp.expect > args.get('evalue'): break
72 | #      print i.query, hit.title.split()[0], hsp.frame[0], hsp.query_start, hsp.query_start -1+ len(hsp.query)*3, hsp.sbjct_start, hsp.sbjct_start -1+ len(hsp.sbjct), hsp.expect, hsp.score
73 |       print string.join([i.query, hit.title.split()[0], 
74 |         str(hsp.frame[0]), 
75 |         str(hsp.query_start),
76 |         str(hsp.query_start -1+ len(hsp.query.replace('-', ''))*3), 
77 |         str(hsp.sbjct_start), 
78 |         str(hsp.sbjct_start -1+ len(hsp.sbjct)), 
79 |         str(hsp.expect),
80 |         str(hsp.score)], "\t")
81 | 
82 | 
83 | # =============================================================================
84 | args = handle_arguments()  # command line options as a dict
85 | main( args )  # executed whenever the script is run
86 | 


--------------------------------------------------------------------------------
/python/blast/remove-from-blastout.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | 
 9 | # =============================================================================  
10 | def show_help( ):
11 |   """ displays the program parameter list and usage information """
12 |   stdout( "usage: " + sys.argv[0] + " -f <path> -i <path>" )
13 |   stdout( " " )
14 |   stdout( " option    description" )
15 |   stdout( " -h        help (this text here)" )
16 |   stdout( " -b        blastout file (-m 8)" )
17 |   stdout( " -i        file with the IDs to keep" )
18 |   stdout( " " )
19 |   
20 |   sys.exit(1)
21 | 
22 | # =============================================================================
23 | def handle_arguments():
24 |   """ verifies the presence of all necessary arguments and returns the data dir """
25 |   if len ( sys.argv ) == 1:
26 |     stderr( "no arguments provided." )
27 |     show_help()  
28 |   
29 |   try: # check for the right arguments
30 |     keys, values = getopt.getopt( sys.argv[1:], "hi:b:" )
31 |   except getopt.GetoptError:
32 |     stderr( "invalid arguments provided." )
33 |     show_help()
34 |   
35 |   args = {}
36 |   args['verbose'] = 0
37 |   for key, value in keys:
38 |     if key == '-b': args['in-blastout'] = value
39 |     if key == '-i':  args['in-ids'] = value
40 | 
41 |   if not args.has_key('in-blastout'):
42 |     stderr( "in-blastout file missing." )
43 |     show_help()
44 |   if not args.has_key('in-ids'):
45 |     stderr( "in-ids file missing." )
46 |     show_help()
47 |     
48 |   if not file_exists( args.get('in-blastout') ):
49 |     stderr( "in-blastout file does not exist." )
50 |     show_help()
51 |   if not file_exists( args.get('in-ids') ):
52 |     stderr( "in-ids file does not exist." )
53 |     show_help()
54 |   
55 |   return args
56 | 
57 | # =============================================================================
58 | def get_ids_to_remove( args ):
59 |   """
60 |   reads in the in-ids file and gathers all IDs to which
61 |   the out fasta file will be reduced to.
62 |   """
63 |   fo = open( args.get('in-ids'), 'r' )
64 |   ids = {}
65 |   for line in fo:
66 |     line = line.rstrip()
67 |     ids[ line.replace('>','') ] = 1
68 |   fo.close()
69 |   return ids
70 |   
71 |   
72 | # =============================================================================
73 | def reduce_blastout( args, rmids ):
74 |   """
75 |   reads in in-fasta and creates out-fasta that only contains the records
76 |   whose id is contained in the hash keepids.
77 |   """
78 |   
79 |   retained = 0
80 |   fo = open( args.get('in-blastout') )
81 |   for line in fo:
82 |     line = line.rstrip()
83 |     if len(line) == 0: continue
84 |     hid, qid = line.split("\t")[0:2]
85 |     if rmids.has_key(hid) or rmids.has_key(qid): continue
86 |     print line
87 |     retained += 1
88 |   fo.close()
89 |     
90 | 
91 | # =============================================================================
92 | # === MAIN ====================================================================
93 | # =============================================================================
94 | 
95 | args = handle_arguments(  )  # command line options as a dict
96 | rmids = get_ids_to_remove( args )  # dict keyed by the ids listed in the in-ids file
97 | reduce_blastout( args, rmids )  # prints the blastout lines that mention none of those ids
98 | 


--------------------------------------------------------------------------------
/python/fasta/concatenate-alignments.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from collections import defaultdict
 9 | 
10 | 
11 | # =============================================================================  
12 | def show_help( ):
13 |   """ displays the program parameter list and usage information """
14 |   stdout( "usage: " + sys.argv[0] + " -f <path> -i -n" )
15 |   stdout( " " )
16 |   stdout( " option    description" )
17 |   stdout( " -h        help (this text here)" )
18 |   stdout( " -e        file extension, e.g. \".muscle\"" )
19 |   stdout( " " )
20 |   sys.exit(1)
21 | 
22 | # =============================================================================
23 | def handle_arguments():
24 |   """ verifies the presence of all necessary arguments and returns the data dir """
25 |   if len ( sys.argv ) == 1:
26 |     stderr( "no arguments provided." )
27 |     show_help()  
28 |   
29 |   try: # check for the right arguments
30 |     keys, values = getopt.getopt( sys.argv[1:], "he:" )
31 |   except getopt.GetoptError:
32 |     stderr( "invalid arguments provided." )
33 |     show_help()
34 | 
35 |   args = {}
36 |   for key, value in keys:
37 |     if key == '-e': args['ext'] = value
38 |     
39 |   if not args.has_key('ext'):
40 |     stderr( "ext argument missing." )
41 |     show_help()
42 |   
43 |   return args
44 | 
45 |   
46 | # =============================================================================
47 | def aln_is_conserved(file, min=0.85):
48 |   popenout = os.popen("~/bin/t-coffee -other_pg seq_reformat -in %s -output sim | tail -n 1" % file)
49 |   out = popenout.read()
50 |   popenout.close()
51 |   identity = float(out.split()[-1])
52 |   if identity > min: return 1
53 |   else: return 0
54 |   
55 | 
56 | # =============================================================================
57 | # === MAIN ====================================================================
58 | # =============================================================================
59 | def main( args ):
60 |   
61 |   added = 0
62 |   seqhash = defaultdict(str)
63 |   ext = args['ext']
64 |   for file in os.listdir('.'):
65 |     if added == 1500: break
66 |     if not file.endswith(ext): continue
67 |     if not aln_is_conserved(file): continue
68 |     fo = open(file)
69 |     for line in fo:
70 |       line = line.rstrip()
71 |       if line.startswith(">"):
72 |         id = line[1:]
73 |         if id.count(" ") > 0: id = id[:id.index(" ")]
74 |       else:
75 |         seqhash[id] += line
76 |     fo.close()
77 |     added += 1
78 |   for id, seq in seqhash.iteritems():
79 |     print ">" + id
80 |     print seq
81 | 
82 | # =============================================================================
83 | args = handle_arguments()  # command line options as a dict
84 | main( args )  # executed whenever the script is run
85 | 
86 | 


--------------------------------------------------------------------------------
/python/fasta/create-clusters.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
10 | # =============================================================================  
11 | def show_help( ):
12 |   """ displays the program parameter list and usage information """
13 |   stdout( "usage: " + sys.argv[0] + " -f <path>" )
14 |   stdout( " " )
15 |   stdout( " option    description" )
16 |   stdout( " -h        help (this text here)" )
17 |   stdout( " -f        ortholog cluster flat file to import" )
18 |   stdout( " -p        prefix to put in front of the number" )
19 |   stdout( " " )
20 |   sys.exit(1)
21 | 
22 | # =============================================================================
23 | def handle_arguments():
24 |   """ verifies the presence of all necessary arguments and returns the data dir """
25 |   if len ( sys.argv ) == 1:
26 |     stderr( "no arguments provided." )
27 |     show_help()  
28 |   
29 |   try: # check for the right arguments
30 |     keys, values = getopt.getopt( sys.argv[1:], "hf:p:" )
31 |   except getopt.GetoptError:
32 |     stderr( "invalid arguments provided." )
33 |     show_help()
34 | 
35 |   args = {}
36 |   args['prefix'] = 'orth.cluster.'
37 |   for key, value in keys:
38 |     if key == '-f': args['file'] = value
39 |     if key == '-p': args['prefix'] = value
40 |     
41 |   if not args.has_key('file'):
42 |     stderr( "import file argument missing." )
43 |     show_help()
44 |   elif not file_exists( args.get('file') ):
45 |     stderr( "import file does not exist." )
46 |     show_help()
47 |     
48 |   return args
49 | 
50 | 
51 | # =============================================================================
52 | # === MAIN ====================================================================
53 | # =============================================================================
54 | def main( args ):
55 |   
56 | 
57 |   counter = 0
58 |   fo = open( args.get('file') )
59 | 
60 |   for line in fo:
61 |     counter += 1
62 |     fw = open( args.get('prefix') + add_leading_zeroes( counter, 3 ) + '.ids', 'w' )
63 |     ids = line.split()
64 |     for id in ids: fw.write( id + "\n" )
65 |     fw.close()
66 | 
67 |   fo.close()
68 | 
69 | # =============================================================================
70 | args = handle_arguments()  # command line options as a dict
71 | main( args )  # executed whenever the script is run
72 | 
73 | 


--------------------------------------------------------------------------------
/python/fasta/fasta-length-per-file.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | 
 4 | def usage():
 5 |   print >> sys.stderr, "reports for each fasta sequence the length in tab format"
 6 |   print >> sys.stderr, "usage: " + sys.argv[0] + " fastafile"
 7 |   sys.exit(1)
 8 | 
 9 | 
10 | def plausi():
11 |   if len(sys.argv) != 2: usage()
12 |   inFile = sys.argv[1]
13 |   return inFile
14 | 
15 | 
16 | def parse_fasta_file(file):
17 |   lengthHash = {}
18 |   fo = open(file)
19 |   id = ""
20 |   length = 0
21 |   for line in fo:
22 |     line = line.strip()
23 |     if line.startswith(">"):
24 |       continue
25 |     else:
26 |       length += len(line)
27 |   fo.close()
28 |   base = file
29 |   if base.count(".") > 0: base = base[:base.index(".")]
30 |   if base.count("_") > 0: base = base[:base.index("_")]
31 |   print base + "\t" + str(length)
32 | 
33 | 
34 | def main():
35 |   inFile = plausi()
36 |   parse_fasta_file( inFile )
37 | 
38 | 
39 | main()  # executed whenever the script is run
40 | 


--------------------------------------------------------------------------------
/python/fasta/fasta-length-per-sequence.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | 
 4 | def usage():
 5 |   print >> sys.stderr, "reports for each fasta sequence the length in tab format"
 6 |   print >> sys.stderr, "usage: " + sys.argv[0] + " fastafile"
 7 |   sys.exit(1)
 8 | 
 9 | 
10 | def plausi():
11 |   if len(sys.argv) != 2: usage()
12 |   inFile = sys.argv[1]
13 |   return inFile
14 | 
15 | 
16 | def parse_fasta_file(file):
17 |   lengthHash = {}
18 |   fo = open(file)
19 |   id = ""
20 |   for line in fo:
21 |     line = line.strip()
22 |     if line.startswith(">"):
23 |       id = line[1:]
24 |       if id.count(" ") > 0: id = id[:id.index(" ")]
25 |       lengthHash[id] = 0
26 |     else:
27 |       lengthHash[id] += len(line)
28 |   for id, length in lengthHash.iteritems():
29 |     print id + "\t" + str(length)
30 | 
31 | 
32 | def main():
33 |   inFile = plausi()
34 |   parse_fasta_file( inFile )
35 | 
36 | 
37 | main()  # executed whenever the script is run
38 | 


--------------------------------------------------------------------------------
/python/fasta/fasta-length-stats.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | 
 4 | def usage():
 5 |   print >> sys.stderr, "reports for each fasta sequence the length in tab format"
 6 |   print >> sys.stderr, "usage: " + sys.argv[0] + " fastafile"
 7 |   sys.exit(1)
 8 | 
 9 | 
10 | def plausi():
11 |   if len(sys.argv) != 2: usage()
12 |   inFile = sys.argv[1]
13 |   return inFile
14 | 
15 | 
16 | def parse_fasta_file(file):
17 |   lengthHash = {}
18 |   fo = open(file)
19 |   id = ""
20 |   for line in fo:
21 |     line = line.strip()
22 |     if line.startswith(">"):
23 |       id = line[1:]
24 |       if id.count(" ") > 0: id = id[:id.index(" ")]
25 |       lengthHash[id] = 0
26 |     else:
27 |       lengthHash[id] += len(line)
28 |   for id, length in lengthHash.iteritems():
29 |     print id + "\t" + str(length)
30 | 
31 | 
32 | def main():
33 |   inFile = plausi()
34 |   parse_fasta_file( inFile )
35 | 
36 | 
37 | main()  # executed whenever the script is run
38 | 


--------------------------------------------------------------------------------
/python/fasta/fasta-length.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | 
 4 | def usage():
 5 |   print >> sys.stderr, "reports all fasta files with one or more sequences < or > n characters"
 6 |   print >> sys.stderr, "usage: " + sys.argv[0] + " folder \"<> n\""
 7 |   sys.exit(1)
 8 | 
 9 | 
10 | def plausi():
11 |   if len(sys.argv) != 3: usage()
12 |   inFolder, inCutoff = sys.argv[1:3]
13 |   inCut, inThreshold = inCutoff.split()
14 |   inThreshold = int(inThreshold)
15 |   return inFolder, inCut, inThreshold
16 | 
17 | 
18 | def parse_fasta_file(file):
19 |   lengthHash = {}
20 |   fo = open(file)
21 |   id = ""
22 |   for line in fo:
23 |     line = line.strip()
24 |     if line.startswith(">"):
25 |       id = line[1:]
26 |       lengthHash[id] = 0
27 |     else:
28 |       lengthHash[id] += len(line)
29 |   lengths = lengthHash.values()
30 |   lengths.sort()
31 |   return lengths[0]
32 | 
33 | 
34 | def test_threshold(length, inCut, inThreshold):
35 |   if inCut == ">" and length > inThreshold: return 1
36 |   if inCut == "<" and length < inThreshold: return 1
37 |   return 0
38 | 
39 | 
40 | def main():
41 |   inFolder, inCut, inThreshold = plausi()
42 |   for filename in os.listdir(inFolder):
43 |     if not filename.endswith(".fasta"): continue
44 |     minlength = parse_fasta_file( filename )
45 |     report = test_threshold(minlength, inCut, inThreshold)
46 |     if report: 
47 |       print os.path.split(filename)[1] + "\t" + str(minlength)
48 |     
49 | 
50 | 
51 | main()  # executed whenever the script is run
52 | 


--------------------------------------------------------------------------------
/python/fasta/fasta-sort.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | import anydbm
 9 | 
10 | # =============================================================================	
11 | def show_help( ):
12 |   """ displays the program parameter list and usage information """
13 |   stdout( "usage: " + sys.argv[0] + " -f <path>" )
14 |   stdout( " " )
15 |   stdout( " option    description" )
16 |   stdout( " -h        help (this text here)" )
17 |   stdout( " -f        fasta file" )
18 |   stdout( " " )
19 | 
20 |   sys.exit(1)
21 | 
22 | # =============================================================================
23 | def handle_arguments():
24 |   """ verifies the presence of all necessary arguments and returns the data dir """
25 |   if len ( sys.argv ) == 1:
26 |     stderr( "no arguments provided." )
27 |     show_help()	
28 | 
29 |   try: # check for the right arguments
30 |     keys, values = getopt.getopt( sys.argv[1:], "hf:m:" )
31 |   except getopt.GetoptError:
32 |     stderr( "invalid arguments provided." )
33 |     show_help()
34 | 
35 |   args = {}
36 |   for key, value in keys:
37 |     if key == '-f':	args['aln'] = value
38 |         
39 |   if not args.has_key('aln'):
40 |     stderr( "fasta file missing." )
41 |     show_help()
42 |   if not file_exists( args.get('aln') ):
43 |     stderr( "fasta file does not exist." )
44 |     show_help()
45 |     
46 |   return args
47 | 
48 | # =============================================================================
49 | # =============================================================================
50 | def main( args ):
51 | 
52 |   #sys.stderr.write(args.get('aln') + "\t")
53 |   #sys.stderr.flush()
54 |   # create evolver control file based on the M0 out file
55 | 
56 |   hash = {}
57 |   id = ""
58 |   fo = open( args.get('aln') )
59 |   for line in fo:
60 |     line = line.rstrip()
61 |     if line.startswith(">"): 
62 |       id = line[1:]
63 |       hash[id] = ""
64 |     else:
65 |       hash[id] += line
66 |   fo.close()
67 | 
68 |   sorted_keys = hash.keys()
69 |   sorted_keys.sort()
70 |   for id in sorted_keys:
71 |     print ">" + id
72 |     seq = hash[id]
73 |     i = 0
74 |     while i < len(seq):
75 |       end = min([i+60, len(seq)])
76 |       print seq[i:end]
77 |       i += 60
78 | 
79 |   
80 | # =============================================================================
81 | # === MAIN ====================================================================
82 | # =============================================================================
83 | 
84 | args = handle_arguments(  )  # command line options as a dict
85 | main( args )  # executed whenever the script is run
86 | 


--------------------------------------------------------------------------------
/python/fasta/fasta-starts-with-meth.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | 
 4 | def usage():
 5 |   print >> sys.stderr, "reports for each fasta sequence the length in tab format"
 6 |   print >> sys.stderr, "usage: " + sys.argv[0] + " fastafile"
 7 |   sys.exit(1)
 8 | 
 9 | 
10 | def plausi():
11 |   if len(sys.argv) != 2: usage()
12 |   inFile = sys.argv[1]
13 |   return inFile
14 | 
15 | 
def parse_fasta_file(file):
  """Print one tab separated line '<id> <flag>' per sequence of a FASTA file.

  file -- path of the FASTA file.

  The id is the header text after '>' up to the first space. The flag is 1
  when the first residue of the sequence is 'M' (upper or lower case) and 0
  otherwise. Records without any sequence data produce no line, and text in
  front of the first header is ignored.
  """
  id = ""
  awaiting_first_residue = False   # True between a header and its first residue

  with open(file) as fo:
    for line in fo:
      line = line.strip()
      if line.startswith(">"):
        id = line[1:].split(" ", 1)[0]
        awaiting_first_residue = True
      elif awaiting_first_residue and line:
        # blank lines are skipped so that line[0] always exists
        awaiting_first_residue = False
        if line[0].upper() == 'M': print("%s\t1" % id)
        else: print("%s\t0" % id)
29 | 
30 | 
def main():
  """Take the input path from the command line and report on that file."""
  parse_fasta_file( plausi() )
34 | 
35 | 
# run the script
main()
37 | 


--------------------------------------------------------------------------------
/python/fasta/fasta-to-swapsc-input.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | import anydbm
 9 | 
10 | # =============================================================================	
def show_help( ):
  """ displays the program parameter list and usage information, then exits with status 1 """
  # -f and -m are both mandatory (handle_arguments refuses to continue
  # without either of them), so the usage line has to name both
  stdout( "usage: " + sys.argv[0] + " -f <path> -m <path>" )
  stdout( " " )
  stdout( " option    description" )
  stdout( " -h        help (this text here)" )
  stdout( " -f        nt alignment file (fasta)" )
  stdout( " -m        paml M0 out file" )
  stdout( " " )

  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the keys 'aln' and 'm0';
  every problem is reported through stderr() followed by show_help() """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # -h takes no value, -f and -m expect one each
    options, leftover = getopt.getopt( sys.argv[1:], "hf:m:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, value in options:
    if flag == '-f': args['aln'] = value
    if flag == '-m': args['m0'] = value

  # same order as always: alignment first, M0 file second
  checks = (
    ( 'aln', "aln file missing.", "aln file does not exist." ),
    ( 'm0', "M0 file missing.", "M0 file does not exist." ),
  )
  for name, missing, not_found in checks:
    if name not in args:
      stderr( missing )
      show_help()
    if not file_exists( args.get(name) ):
      stderr( not_found )
      show_help()

  return args
56 | 
57 | # =============================================================================
58 | # =============================================================================
def main( args ):
  """Print an alignment preceded by the two counts taken from a PAML M0 file.

  args -- dict with 'm0' (path of the M0 output file) and 'aln' (path of the
          alignment in FASTA format).

  The first line of the M0 file that consists of exactly two whitespace
  separated integers supplies nspecies and length. They are printed as
  '  <nspecies>  <length>' followed by an empty line, then the alignment is
  echoed with the leading '>' removed from header lines.

  Exits with status 1 when the M0 file holds no such line.
  """
  dimensions = re.compile(r"\s+\d+\s+\d+\s*$")
  nspecies, length = None, None

  # iterating stops at end of file; the former readline() loop spun forever
  # there because readline() keeps returning "" once the file is exhausted
  with open( args.get('m0') ) as fo:
    for line in fo:
      if dimensions.match(line):
        nspecies, length = line.split()[0:2]
        break

  if nspecies is None:
    sys.stderr.write("no line with sequence count and length found in M0 file.\n")
    sys.exit(1)

  with open( args.get('aln') ) as fo:
    print("  " + nspecies + "  " + length + "\n")
    for line in fo:
      line = line.rstrip()
      if line.startswith(">"): print(line[1:])
      else: print(line)
79 | 
80 |   
81 | # =============================================================================
82 | # === MAIN ====================================================================
83 | # =============================================================================
84 | 
# script entry point: check the command line, then convert
args = handle_arguments()
main( args )
87 | 


--------------------------------------------------------------------------------
/python/fasta/fasta2flat.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import getopt					# comand line argument handling
 6 | from low import *			# custom functions, written by myself
 7 | import fasta
 8 | 
 9 | # =============================================================================	
def show_help( ):
  """ hands the usage text line by line to stdout() and exits with status 1 """
  usage_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        fasta file",
    " ",
  )
  for text in usage_lines:
    stdout( text )

  sys.exit(1)
20 | 
21 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the key 'fastafile';
  every problem is reported through stderr() followed by show_help() """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # -h takes no value, -f expects one
    options, leftover = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, value in options:
    if flag == '-f': args['fastafile'] = value

  if 'fastafile' not in args:
    stderr( "fasta file missing." )
    show_help()
  if not file_exists( args.get('fastafile') ):
    stderr( "fasta file does not exist." )
    show_help()

  return args
46 | 
47 | # =============================================================================
def get_sequences(file):
  """ reads a fasta file and returns the tuple (text, seqcount, alnlength)

  text      -- for every record: newline, id (header up to the first space),
               newline, then the sequence lines glued together
  seqcount  -- number of header lines
  alnlength -- summed line length of the first record's sequence
  """
  pieces = []
  n_records = 0
  first_length = 0
  with open(file) as handle:
    for raw in handle:
      content = raw.rstrip()
      if not content.startswith(">"):
        pieces.append(content)
        if n_records == 1:
          first_length += len(content)
        continue
      name = content[1:]
      cut_at = name.find(" ")
      if cut_at >= 0:
        name = name[:cut_at]
      pieces.append("\n" + name + "\n")
      n_records += 1
  return "".join(pieces), n_records, first_length
64 | 
65 | # =============================================================================
66 | # =============================================================================
def main( args ):
    """ prints one tab separated 'key<TAB>value' line per entry delivered by
    fasta.get_sequence_hash() for args['fastafile'] (presumably id and sequence) """
    sequences = fasta.get_sequence_hash(args['fastafile'])
    for gid, seq in sequences.iteritems():
      print("\t".join([gid, seq]))
70 | 
71 | # =============================================================================
72 | # === MAIN ====================================================================
73 | # =============================================================================
74 | 
# script entry point: check the command line, then print the flat table
args = handle_arguments()
main( args )
77 | 


--------------------------------------------------------------------------------
/python/fasta/fasta2phylip.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | 
 9 | # =============================================================================	
def show_help( ):
  """ hands the usage text line by line to stdout() and exits with status 1 """
  usage_lines = [ "usage: " + sys.argv[0] + " -f <path>" ]
  usage_lines.append( " " )
  usage_lines.append( " option    description" )
  usage_lines.append( " -h        help (this text here)" )
  usage_lines.append( " -f        fasta file" )
  usage_lines.append( " " )
  for text in usage_lines:
    stdout( text )

  sys.exit(1)
20 | 
21 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the key 'aln';
  every problem is reported through stderr() followed by show_help() """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # -m is accepted by the parser although nothing below reads it
    options, leftover = getopt.getopt( sys.argv[1:], "hf:m:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, value in options:
    if flag == '-f': args['aln'] = value

  if 'aln' not in args:
    stderr( "fasta file missing." )
    show_help()
  if not file_exists( args.get('aln') ):
    stderr( "fasta file does not exist." )
    show_help()

  return args
46 | 
47 | # =============================================================================
def get_sequences(file):
  """ reads a fasta file and returns the tuple (text, seqcount, alnlength)

  text      -- for every record: newline, id (header up to the first space),
               newline, then the sequence lines glued together
  seqcount  -- number of header lines
  alnlength -- summed line length of the first record's sequence
  """
  blocks = []
  headers_seen = 0
  length_of_first = 0
  fo = open(file)
  for entry in fo:
    entry = entry.rstrip()
    if entry.startswith(">"):
      # keep the header only up to its first space
      label = entry[1:].split(" ", 1)[0]
      blocks.append("\n" + label + "\n")
      headers_seen += 1
    else:
      blocks.append(entry)
      if headers_seen == 1:
        length_of_first += len(entry)
  fo.close()
  text = "".join(blocks)
  return text, headers_seen, length_of_first
64 | 
65 | # =============================================================================
66 | # =============================================================================
def main( args ):
    """ writes ' <seqcount> <alnlength>' to stdout, directly followed by the
    text block built by get_sequences() for args['aln'] """
    text, seqcount, alnlength = get_sequences(args['aln'])
    header = " %s %s" %( seqcount, alnlength )
    # no newline after the header: text itself starts with one
    sys.stdout.write(header)
    print(text)
72 | 
73 | # =============================================================================
74 | # === MAIN ====================================================================
75 | # =============================================================================
76 | 
# script entry point: check the command line, then convert
args = handle_arguments()
main( args )
79 | 


--------------------------------------------------------------------------------
/python/fasta/fastasplit.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | import math
 8 | from low import *			# custom functions, written by myself
 9 | 
10 | # =============================================================================	
def show_help( ):
  """ hands the usage text line by line to stdout() and exits with status 1 """
  usage_lines = (
    "usage: " + sys.argv[0] + " -f <path> -n <x> -i <x>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        fasta file",
    " -n        size of each new fasta file (# seq)",
    " -i        number of fasta files to split into",
    " ",
  )
  for text in usage_lines:
    stdout( text )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with 'fasta' plus 'n' and/or
  'i' (both converted to int); every problem is reported through stderr()
  followed by show_help() """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # -h takes no value, -f, -n and -i expect one each
    options, leftover = getopt.getopt( sys.argv[1:], "hf:n:i:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, value in options:
    if flag == '-f': args['fasta'] = value
    if flag == '-n': args['n'] = int(value)
    if flag == '-i': args['i'] = int(value)

  # at least one of the two split modes has to be chosen
  if not ( 'n' in args or 'i' in args ):
    stderr( "n or i missing." )
    show_help()

  if 'fasta' not in args:
    stderr( "fasta file missing." )
    show_help()
  if not file_exists( args.get('fasta') ):
    stderr( "fasta file does not exist." )
    show_help()

  return args
54 | 
55 | 	
56 | # =============================================================================
57 | # =============================================================================
def main( args ):
  """Split a FASTA file into numbered part files written next to the input.

  args -- dict with 'fasta' (input path) and 'i' (number of files to split
          into) and/or 'n' (number of sequences per file); 'i' takes
          precedence when both are present.

  Part files are named <fasta>.<counter>, the counter text coming from
  add_leading_zeroes(counter, 6). A summary is reported through infomsg().

  Raises ValueError when the requested split size is smaller than 1.
  """
  source = args.get('fasta')

  # count the records here instead of calling grep through a shell: the path
  # is no longer pasted into a command line (spaces, quotes), and counting now
  # uses the same "line starts with '>'" rule as the splitting loop below
  with open( source ) as handle:
    total = sum( 1 for line in handle if line[0] == ">" )

  if 'i' in args:
    if args.get('i') < 1: raise ValueError("-i must be at least 1")
    cut = int(math.ceil( 1.0 * total / args.get('i') ))
  else:
    cut = args.get('n')
  # a split size of 0 used to end in a ZeroDivisionError inside the loop
  if total > 0 and cut < 1: raise ValueError("-n must be at least 1")

  seqcount = 0
  filecount = 1
  fw = open( source + '.' + add_leading_zeroes(filecount, 6), 'w' )
  try:
    with open( source ) as handle:
      for line in handle:
        if line[0] == ">":
          seqcount += 1
          # open the next part as soon as the current one holds 'cut' records
          if seqcount > 1 and (cut == 1 or (seqcount % cut) == 1):
            filecount += 1
            fw.close()
            fw = open( source + '.' + add_leading_zeroes(filecount, 6), 'w' )
        fw.write(line)
  finally:
    fw.close()

  infomsg( "total.seq.count: %s | split.count: %s | file.count: %s" %(total, cut, filecount) )
86 | 	
87 | # =============================================================================
88 | # === MAIN ====================================================================
89 | # =============================================================================
90 | 
# script entry point: check the command line, then split the file
args = handle_arguments()
main( args )
93 | 


--------------------------------------------------------------------------------
/python/fasta/gc-content-from-fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from goterm import GOTerm
 9 | from collections import defaultdict
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """ hands the usage text line by line to stdout() and exits with status 1 """
  usage_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        DNA fasta file",
    " ",
  )
  for text in usage_lines:
    stdout( text )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the key 'file';
  every problem is reported through stderr() followed by show_help() """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # -h takes no value, -f expects one
    options, leftover = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, value in options:
    if flag == '-f': args['file'] = value

  if 'file' not in args:
    stderr( "fasta file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "fasta file does not exist." )
    show_help()

  return args
48 | 
49 |   
50 | # =============================================================================
51 | # === MAIN ====================================================================
52 | # =============================================================================
def main( args ):
  """Print '<name><TAB><GC fraction>' for a DNA FASTA file.

  args -- dict whose 'file' entry is the path of the FASTA file.

  Only A, T, G and C are counted (case-insensitive); header lines and all
  other characters are left out. The GC fraction is (G + C) / (A + T + G + C).
  The printed name is the given path cut at its first '.' and after that at
  its first '_'. A file without any A/T/G/C is reported with 0.0 instead of
  failing on a division by zero.
  """
  counts = {'A':0, 'T':0, 'G':0, 'C':0}
  with open(args['file']) as fo:
    for line in fo:
      if line.startswith(">"): continue
      line = line.rstrip().upper()
      for char in counts:
        counts[char] += line.count(char)

  total = sum(counts.values())
  if total > 0: gc = 1.0 * (counts['G'] + counts['C']) / total
  else: gc = 0.0

  base = args['file']
  if base.count(".") > 0: base = base[:base.index(".")]
  if base.count("_") > 0: base = base[:base.index("_")]

  print(base + "\t" + str(gc))
70 | 
71 | # =============================================================================
# script entry point: check the command line, then report the GC content
args = handle_arguments()
main( args )
74 | 
75 | 


--------------------------------------------------------------------------------
/python/fasta/generate-fasta-aa-nt.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | import anydbm					# index databases (file hash)
 9 | from Bio import SeqIO # biopython stuff, to parse fasta files for instance
10 | 
11 | # =============================================================================	
def show_help( ):
  """ hands the usage text line by line to stdout() and exits with status 1 """
  usage_lines = (
    "usage: " + sys.argv[0] + " -i <path> [-d <path>]",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -i        ID file",
    " -d        directory to search for orthologs",
    " ",
  )
  for text in usage_lines:
    stdout( text )

  sys.exit(1)
23 | 
24 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the keys 'idfile' and
  'dir'; 'dir' defaults to './' and always ends with '/'. Every problem is
  reported through stderr() followed by show_help() """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # check for the right arguments
    keys, values = getopt.getopt( sys.argv[1:], "hi:d:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for key, value in keys:
    if key == '-d': args['dir'] = value
    if key == '-i': args['idfile'] = value

  if 'dir' in args and not dir_exists( args.get('dir') ):
    stderr( "dir folder does not exist." )
    show_help()
  if 'dir' not in args: args['dir'] = './'
  if not args.get('dir').endswith('/'): args['dir'] = args.get('dir') + '/'

  if 'idfile' not in args:
    stderr( "id file missing." )
    show_help()
  if not file_exists( args.get('idfile') ):
    stderr( "id file does not exist" )
    # the parentheses were missing here, so show_help was never called and
    # the script carried on with an id file that is not there
    show_help()

  return args
56 | 
57 |   
58 | # =============================================================================
59 | # =============================================================================
def main( args ):
  """ collects the sequences of all ids listed in args['idfile'] into
  <idfile>.aa and <idfile>.nt

  step 1: every id is looked up with 'grep -l' in all files whose path starts
          with args['dir']
  step 2: for each hit ending in .aa or .nt, 'grep -A 100' fetches the match
          plus the 100 lines after it; the first line is dropped, '>' + id is
          appended to the output file and the following lines are appended
          until a line starting with '>' shows up
  """
  id_path = args.get('idfile')
  wanted = read_from_file( id_path ).splitlines()
  prefix = args.get('dir')

  # id -> list of files that mention it
  hits = {}
  for seqid in wanted:
    pipe = os.popen("grep -l \"%s\" %s*" %(seqid, prefix))
    listing = pipe.read()
    pipe.close()
    hits[ seqid ] = listing.splitlines()

  aafile = id_path + '.aa'
  ntfile = id_path + '.nt'
  for seqid, paths in hits.iteritems():
    for path in paths:
      is_protein = path.endswith('.aa')
      if not ( is_protein or path.endswith('.nt') ): continue

      pipe = os.popen("grep -A 100 \"%s\" %s" %(seqid, path))
      record = pipe.read().splitlines()
      pipe.close()
      # NOTE(review): pop raises IndexError when grep returned nothing, and
      # -A 100 caps a record at 100 lines - confirm both are acceptable
      record.pop(0)

      if is_protein: outfile = aafile
      else: outfile = ntfile

      # NOTE(review): ids and sequence lines reach the shell unescaped
      os.system( "echo \">%s\" >> %s" %( seqid, outfile ) )
      for row in record:
        if row.startswith(">"): break
        os.system( "echo \"%s\" >> %s" %( row, outfile ) )
91 | 
92 | # =============================================================================
93 | # === MAIN ====================================================================
94 | # =============================================================================
95 | 
# script entry point: check the command line, then collect the sequences
args = handle_arguments()
main( args )
98 | 


--------------------------------------------------------------------------------
/python/fasta/index-fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys 				# low level handling, such as command line stuff
 3 | import getopt					# comand line argument handling
 4 | import anydbm					# index databases (file hash)
 5 | from low import *			# collection of generic self-defined functions
 6 | 
 7 | 
 8 | # =============================================================================	
 9 | def show_help( ):
10 | 	""" displays the program parameter list and usage information """
11 | 	stdout( "usage: " + sys.argv[0] + " -f <path> -o <path>" )
12 | 	stdout( " " )
13 | 	stdout( " option    description" )
14 | 	stdout( " -h        help (this text here)" )
15 | 	stdout( " -f        input fasta file" )
16 | 	stdout( " -o        output dbm file" )
17 | 	stdout( " " )
18 | 	sys.exit(1)
19 | 	
20 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the keys 'fasta' and
  'out'; every problem is reported through stderr() followed by show_help() """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # -h takes no value, -f and -o expect one each
    options, leftover = getopt.getopt( sys.argv[1:], "hf:o:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, value in options:
    if flag == '-f': args['fasta'] = value
    if flag == '-o': args['out'] = value

  if 'fasta' not in args:
    stderr( "fasta file missing." )
    show_help()
  if not file_exists( args.get('fasta') ):
    stderr( "fasta file does not exist." )
    show_help()

  # the output file only has to be named, it is not required to exist
  if 'out' not in args:
    stderr( "out file missing." )
    show_help()

  return args
50 | 
51 | 	
52 | # =============================================================================
53 | # =============================================================================
def main( args ):
  """Index a FASTA file into a dbm database that maps ids to sequences.

  args -- dict with 'fasta' (input path) and 'out' (path handed to
          anydbm.open with flag 'c').

  The key of a record is the run of non-whitespace characters directly after
  '>'; the value is the record's sequence lines joined together. Records
  without an id or without sequence data are not stored, and text in front
  of the first header is dropped. Progress is written to stderr.
  """
  import re  # this script has no top-level 'import re'; keep the dependency local

  source = args.get('fasta')

  # count the headers here rather than through a shell 'grep', so the path
  # is never pasted into a command line
  with open( source ) as fo:
    total = sum( 1 for line in fo if line.startswith('>') )

  DBM = anydbm.open( args.get('out'), 'c' )
  added = 0
  key, value = '', ''
  try:
    with open( source ) as fo:
      for line in fo:
        line = line.rstrip()
        if line.startswith('>'):
          if key != '' and value != '':
            added += 1
            DBM[ key ] = value
            sys.stderr.write('\r\tindexing:\t%s\t%01.2f%%' %(added,100.0*added/total) )
            sys.stderr.flush()
          found = re.match(r">(\S+)", line)
          # a header without an id yields an empty key, i.e. the record is skipped
          if found: key = found.group(1)
          else: key = ''
          # always start with an empty sequence, otherwise text collected
          # before this header would end up in this record
          value = ''
        else:
          value += line
    if key != '' and value != '':
      added += 1
      DBM[ key ] = value
  finally:
    DBM.close()

  # an input without headers used to fail here with a division by zero
  if total > 0: share = 100.0*added/total
  else: share = 0.0
  sys.stderr.write('\r\tindexing:\t%s\t%01.2f%%\ndone.\n' %(added,share) )
81 | 
82 | # =============================================================================
83 | # === MAIN ====================================================================
84 | # =============================================================================
# script entry point: check the command line, then build the index
args = handle_arguments()
main( args )
87 | 
88 | 


--------------------------------------------------------------------------------
/python/fasta/remove-stopcodons-from-fasta.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import os, sys     # low level handling, such as command line stuff
  4 | import string      # string methods available
  5 | import re          # regular expressions
  6 | import getopt      # comand line argument handling
  7 | from low import *  # custom functions, written by myself
  8 | 
  9 | 
 10 | # =============================================================================  
 11 | def show_help( ):
 12 |   """ displays the program parameter list and usage information """
 13 |   stdout( "usage: " + sys.argv[0] + " -f <fasta> ")
 14 |   stdout( " " )
 15 |   stdout( " option    description" )
 16 |   stdout( " -h        help (this text here)" )
 17 |   stdout( " -f        fasta file" )
 18 |   stdout( " " )
 19 |   sys.exit(1)
 20 | 
 21 | # =============================================================================
 22 | def handle_arguments():
 23 |   """ verifies the presence of all necessary arguments and returns the data dir """
 24 |   if len ( sys.argv ) == 1:
 25 |     stderr( "no arguments provided." )
 26 |     show_help()  
 27 |   
 28 |   try: # check for the right arguments
 29 |     keys, values = getopt.getopt( sys.argv[1:], "hf:" )
 30 |   except getopt.GetoptError:
 31 |     stderr( "invalid arguments provided." )
 32 |     show_help()
 33 | 
 34 |   args = {}
 35 |   for key, value in keys:
 36 |     if key == '-f': args['fasta'] = value
 37 |     
 38 |   if not args.has_key('fasta'):
 39 |     stderr( "fasta file argument missing." )
 40 |     show_help()
 41 |   elif not file_exists( args.get('fasta') ):
 42 |     stderr( "fasta file does not exist." )
 43 |     show_help()
 44 |   
 45 |   return args
 46 | 
 47 | 
 48 | # =============================================================================
 49 | def parse_fasta(file):
 50 |   hash = {}
 51 |   fo = open(file)
 52 |   STOPCODONS = ["TAA", "TGA", "TAG"]
 53 |   id = ""
 54 |   for line in fo:
 55 |     line = line.strip()
 56 |     if line.startswith(">"):
 57 |       id = line[1:]
 58 |       if id.count(" ") > 0: id = id[:id.index(" ")]
 59 |       hash[id] = ""
 60 |     else:
 61 |       sequence = line.upper()
 62 |       i = 0
 63 |       while i < len(sequence):
 64 |         codon = sequence[i:i+3]
 65 |         if codon in STOPCODONS:
 66 |           hash[id] += "---"
 67 |         else:
 68 |           hash[id] += codon
 69 |         i += 3
 70 |   return hash
 71 | 
 72 | # =============================================================================
 73 | def replace_stop_codons(hash):
 74 |   for id, sequence in hash.iteritems():
 75 |     i = 0
 76 |     while i < len(sequence):
 77 |       codon = sequence[i:i+3]
 78 |       if codon in STOPCODONS:
 79 |         sequence[i:i+3] = "---"
 80 |       i += 3
 81 | 
 82 |   return hash
 83 | 
 84 | 
 85 | # =============================================================================
 86 | # === MAIN ====================================================================
 87 | # =============================================================================
 88 | def main( args ):
 89 |   
 90 |   hash = parse_fasta(args['fasta'])
 91 |   width = 60
 92 |   for id, sequence in hash.iteritems():
 93 |     print ">" + id
 94 |     i = 0
 95 |     while i < len(sequence):
 96 |       part = sequence[i:min([len(sequence),i+60])]
 97 |       print part
 98 |       i += 60
 99 | 
100 | # =============================================================================
# script entry point: check the command line, then mask the stop codons
args = handle_arguments()
main( args )
103 | 
104 | 


--------------------------------------------------------------------------------
/python/fasta/rename-fasta-sequences.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
10 | # =============================================================================  
def show_help( ):
  """ displays the program parameter list and usage information, then exits with status 1 """
  # -f and -m are both mandatory (handle_arguments refuses to continue
  # without either of them), so the usage line has to name both
  stdout( "usage: " + sys.argv[0] + " -f <path> -m <path>" )
  stdout( " " )
  stdout( " option    description" )
  stdout( " -h        help (this text here)" )
  stdout( " -f        fasta file to import" )
  stdout( " -m        tab delimited file that maps a regex to the replacement name, one per line" )
  stdout( " " )
  sys.exit(1)
21 | 
22 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the keys 'file' (fasta
  file) and 'mapping' (mapping file); every problem is reported through
  stderr() followed by show_help() """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # check for the right arguments
    keys, values = getopt.getopt( sys.argv[1:], "hf:m:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for key, value in keys:
    if key == '-f': args['file'] = value
    if key == '-m': args['mapping'] = value

  if 'file' not in args:
    stderr( "fasta file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "fasta file does not exist." )
    show_help()

  if 'mapping' not in args:
    stderr( "mapping file argument missing." )
    show_help()
  # this used to test the fasta path a second time, so a mapping file that
  # does not exist went unnoticed until it was opened
  elif not file_exists( args.get('mapping') ):
    stderr( "mapping file does not exist." )
    show_help()

  return args
55 | 
56 | 
57 | # =============================================================================
def get_mapping(mfile):
  """Read the regex -> replacement rules from a tab delimited file.

  mfile -- path of the mapping file, one 'regex<TAB>replacement' per line.

  Lines that do not consist of exactly two tab separated fields are skipped;
  reading stops at the first empty line. Returns a dict whose keys are the
  compiled regular expressions and whose values are the replacement strings.
  The rules keep the order of the file, so that the rule listed first is also
  the one tried first.
  """
  try:
    from collections import OrderedDict as ordered
  except ImportError:   # Python older than 2.7: fall back to a plain dict
    ordered = dict

  hash = ordered()
  # 'with' closes the file even when a rule is not a valid regular expression
  with open( mfile, "r" ) as fo:
    for line in fo:
      line = line.rstrip()
      if len(line) == 0: break
      fields = line.split("\t")
      if len(fields) != 2: continue
      regex, replacement = fields
      hash[re.compile(regex)] = replacement
  return hash
69 | 
70 | # =============================================================================
def apply_replacement(idline, maphash):
  """Rename a FASTA header line with the first mapping rule that matches.

  idline  -- header line including the leading '>'
  maphash -- dict mapping regular expressions to replacement strings

  The rules are tried in the iteration order of maphash. For the first regex
  found in the text after '>', its first occurrence is replaced and no
  further rule is tried. Returns the new header line, or idline unchanged
  when no rule matches.
  """
  # the former 'id = idline[1:].split()[0]' was never used and raised an
  # IndexError for a header that is empty after the '>'
  header = idline[1:]
  for regex, replacement in maphash.items():
    renamed, hits = re.subn(regex, replacement, header, count=1)
    if hits:
      return '>' + renamed
  return idline
78 | 
79 | # =============================================================================
80 | # === MAIN ====================================================================
81 | # =============================================================================
def main( args ):
  """ prints the fasta file args['file'] with every header line passed
  through apply_replacement(), using the rules read from args['mapping'] """
  rules = get_mapping( args.get('mapping') )

  handle = open( args.get('file') )
  for row in handle:
    row = row.rstrip()
    if not row.startswith(">"):
      print(row)
    else:
      print(apply_replacement(row, rules))
  handle.close()
92 | 
93 | # =============================================================================
# script entry point: check the command line, then rename the sequences
args = handle_arguments()
main( args )
96 | 
97 | 


--------------------------------------------------------------------------------
/python/fasta/stockholm-to-fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | 
 9 | # =============================================================================	
def show_help( ):
	""" passes the usage text line by line to stdout() and exits with status 1 """
	help_lines = (
		"usage: " + sys.argv[0] + " -s <path> -k \"regex\" -v \"regex\"",
		" ",
		" option    description",
		" -h        help (this text here)",
		" -s        stockholm file",
		" -k        regular expression for the key",
		" -v        regular expression for the value",
		" ",
	)
	for text in help_lines:
		stdout( text )
	sys.exit(1)
21 | 
22 | # =============================================================================
def handle_arguments():
	""" parses the command line and returns a dict with the keys 'stockholm'
	(path), 'keyregex' and 'valueregex' (compiled patterns); show_help() is
	called when an argument is missing or invalid """
	if len ( sys.argv ) == 1:
		stderr( "no arguments provided." )
		show_help()

	try:
		opts, rest = getopt.getopt( sys.argv[1:], "hs:k:v:" )
	except getopt.GetoptError:
		stderr( "invalid arguments provided." )
		show_help()

	args = {}
	for flag, text in opts:
		if flag == '-s':
			args['stockholm'] = text
		elif flag == '-k':
			# the appended group captures the rest of the line after the pattern
			args['keyregex'] = re.compile(text + '(.*)$' )
		elif flag == '-v':
			args['valueregex'] = re.compile(text + '(.*)$' )

	if 'keyregex' not in args:
		stderr( "key regex missing." )
		show_help()

	if 'valueregex' not in args:
		stderr( "value regex missing." )
		show_help()

	if 'stockholm' not in args:
		stderr( "stockholm file missing." )
		show_help()
	if not file_exists( args.get('stockholm') ):
		stderr( "stockholm file does not exist." )
		show_help()

	return args
57 | 
58 | 	
59 | # =============================================================================
60 | # =============================================================================
def main( args ):
	""" scans the stockholm file and prints one fasta record per key.

	a line matching the key regex starts a new record (a pending complete record
	is printed first); a line matching the value regex sets the value of the
	current record to group 1 of the match. a record is only printed if both its
	key and its value are non-empty """
	key_pattern = args.get('keyregex')
	value_pattern = args.get('valueregex')
	handle = open( args.get('stockholm') )
	name, content = '', ''
	for line in handle:
		key_hit = re.search( key_pattern, line )
		if key_hit:
			if name != '' and content != '':
				print(">%s" % name)
				print(content)
				name, content = '', ''
			name = key_hit.group(1).strip()
		value_hit = re.search( value_pattern, line )
		if value_hit:
			content = value_hit.group(1).strip()
	handle.close()
	# the last record has no following key line that would trigger its output
	if name != '' and content != '':
		print(">%s" % name)
		print(content)
80 | 	
81 | 	
82 | 	
83 | # =============================================================================
84 | # === MAIN ====================================================================
85 | # =============================================================================
86 | 
# script entry point: collect the command line arguments, then run main()
args = handle_arguments(  )
main( args )


--------------------------------------------------------------------------------
/python/fasta/translatedprot_from_gb_to_fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 		# low level handling, such as command line stuff
 4 | import string			# string methods available
 5 | import re					# regular expressions
 6 | from low import *	# custom functions, written by myself
 7 | 
 8 | 
 9 | # =============================================================================
def get_translatedfasta_from_gb( file ):
	"""
	prints every CDS feature of a genbank flat file that has a /translation
	qualifier as a fasta record: the header is
	">protein_id|db_xref[|product] (source)", followed by the protein sequence.

	file : path of the genbank file to read
	"""

	def write_output( source, hash ):
		# without a translation there is no sequence to print
		if 'translation' not in hash: return
		# missing qualifiers become empty fields instead of breaking the join
		L = [ ">", hash.get('protein_id', ''), "|", hash.get('db_xref', '') ]
		if 'product' in hash: L.append("|" + hash['product'])
		L.append(" (" + source + ")")
		print(''.join(L))
		print(hash['translation'])

	with open(file) as fo:
		# read general infos (everything up to the FEATURES line)
		source = ''
		for line in fo:
			if re.search('FEATURES', line): break
			if re.match('SOURCE', line):
				found = re.search(r'SOURCE\s+(.*)', line)
				if found: source = found.group(1)

		# read gene infos
		hash = {}
		hit = 0
		in_translation = False
		for line in fo:
			# a line that is not indented by 21 blanks starts a new feature/section
			if not re.match('                     ', line):
				if len(hash) > 0: write_output( source, hash )
				hash = {}
				hit = 0
				in_translation = False
			if re.match('     CDS', line): hit = 1
			if not hit: continue

			if in_translation:
				# continuation line of a multi-line translation
				residues = re.search("([a-zA-Z]+)", line)
				if residues: hash['translation'] += residues.group(1)
				# the closing quote ends the translation
				if '"' in line: in_translation = False
				continue

			# catch every qualifier that opens and closes on this line
			qualifier = re.search(r'/(\S+)="(.*)"', line)
			if qualifier: hash[qualifier.group(1)] = qualifier.group(2)

			# catch the (possibly multi-line) translation sequence; the closing
			# quote is excluded so one-line translations do not end in '"'
			translation = re.search(r'/translation="([^"\n]*)("?)', line)
			if translation:
				hash['translation'] = translation.group(1)
				in_translation = translation.group(2) == ''
		if len(hash) > 0: write_output( source, hash )
49 | 
50 | # =============================================================================
51 | # === MAIN ====================================================================
52 | # =============================================================================
53 | 
# without any file name on the command line there is nothing to do
if len( sys.argv ) == 1:
	print("no arguments provided. you need to specify the gb file(s) to parse.")
	sys.exit(1)

# convert every existing file; report the others and carry on
for file in sys.argv[1:]:
	if file_exists(file):
		get_translatedfasta_from_gb( file )
	else:
		print(" ".join(["gb file not found (or is a dir):", file]))


--------------------------------------------------------------------------------
/python/fasta/uniprot-dat-to-fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | 
 9 | 
10 | # =============================================================================  
def show_help( ):
  """ passes the usage text line by line to stdout() and exits with status 1 """
  help_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        uniprot dat file",
    " ",
  )
  for text in help_lines:
    stdout( text )
  sys.exit(1)
20 | 
21 | # =============================================================================
def handle_arguments():
  """ parses the command line into a dict ('datfile' -> value of -f); calls
  show_help() if the arguments cannot be parsed or the check of the dat file
  argument fails """

  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    opts, rest = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, text in opts:
    if flag == '-f': args['datfile'] = text

  required = ['datfile']
  for name in required:
    # the suffix of the argument name selects the kind of check
    if name.endswith("file"):
      passed = args_file_exists(args, name)
    elif name.endswith("dir"):
      passed = args_dir_exists(args, name)
    else:
      passed = True
    if not passed: show_help()
  return args
45 | 
46 | # =============================================================================  
def parse_until_doubleslash(fo):
  """ reads one record from an open uniprot dat file, up to the next line that
  starts with "//".

  fo      : open file object, positioned at the start of a record
  returns : (hash, end) where hash maps each two-letter line code to the
            concatenated text of its lines (joined by one blank) and "SEQ" to
            the sequence with all blanks removed; end is True if an empty
            line or the end of the file was reached before a "//" line """
  hash, end = {}, False
  while True:
    raw = fo.readline()
    line = raw.strip()
    if line.startswith("//"): break
    if len(line) == 0:
      end = True
      break
    cols = [e.strip() for e in line.split(" ", 1)]
    # sequence lines are the ones that start with blanks; checking the raw line
    # keeps a final sequence line of exactly two residues from being mistaken
    # for a line code
    if raw[:1].isspace() or len(cols[0]) != 2:
      key = "SEQ"
      value = line.replace(" ", "")
    elif len(cols) != 2:
      # a line code without any text carries no information
      continue
    else:
      key, value = cols
    if key not in hash: hash[key] = ""
    if key != "SEQ" and len(hash[key]) > 0 and hash[key][-1] != " " and not value.startswith(" "): hash[key] += " "
    hash[key] += value
  return hash, end
68 |   
69 | # =============================================================================
70 | # === MAIN ====================================================================
71 | # =============================================================================
def main( args ):
  """ prints every record of the dat file args['datfile'] as fasta: the header
  is the first word of the ID text plus the OC text, followed by the sequence """
  handle = open(args['datfile'])
  while True:
    record, finished = parse_until_doubleslash(handle)
    if finished: break
    header = ">" + record["ID"].split()[0] + " " + record["OC"]
    print(header)
    print(record["SEQ"])
  handle.close()
80 | 
81 | # =============================================================================
# script entry point: collect the command line arguments, then run main()
args = handle_arguments()
main( args )
84 | 
85 | 


--------------------------------------------------------------------------------
/python/geneontology/go-enrichment.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | import rpy
 4 | 
 5 | 
 6 | def usage():
 7 |   print >> sys.stderr, "usage: " + sys.argv[0] + " universe-topGO.table  testset.ids"
 8 |   sys.exit(1)
 9 | 
10 | 
def plausi():
  """ checks that exactly two command line arguments were given and returns them
  as (universe file, testset file); otherwise usage() is called """
  if len(sys.argv) != 3:
    usage()
  universe, testset = sys.argv[1], sys.argv[2]
  return universe, testset
15 | 
16 | 
def init_R():
  """ returns the rpy R interface after loading the topGO library and
  registering the generic "GOFisherTestUnder" for "classicCount" objects (a
  fisher test with alternative "less").

  if topGO cannot be loaded, it is installed once through the bioconductor
  biocLite script and loaded again; if that fails as well, a message is written
  to stderr and the program exits with status 1. """
  R = rpy.r
  try:
    R.library('topGO')
  except Exception:
    # Exception instead of a bare except, so that ctrl-c is not swallowed
    try:
      R.source("http://bioconductor.org/biocLite.R")
      R.biocLite('topGO')
      R.library('topGO')
    except Exception:
      # report the failure on stderr and with a non-zero exit status
      sys.stderr.write("Problem importing R libraries.\n")
      sys.exit(1)

  R('if(!isGeneric("GOFisherTestUnder")) setGeneric("GOFisherTestUnder", function(object) standardGeneric("GOFisherTestUnder"))')
  R('setMethod("GOFisherTestUnder", "classicCount", function(object) { contMat <- contTable(object); if(all(contMat == 0)) p.value <- 1 else p.value <- fisher.test(contMat, alternative = "less")$p.value; return(p.value) })')
  return R
33 | 
34 | 
def main():
  """ runs the over- and underrepresentation tests for the ontologies MF, BP
  and CC inside the R session returned by init_R() and writes one csv file per
  test direction and ontology (topGO.over.Sig.<ontology>.csv and
  topGO.under.Sig.<ontology>.csv) into the current directory """
  inUniverse, inTestset = plausi()
  R = init_R()
  # NOTE(review): both file names are pasted unescaped into R source code; a
  # path containing a double quote would break the R statement
  R('GOmap = readMappings(file = "' + inUniverse + '")')
  R('refset  = names(GOmap)')
  R('testset = scan(file="' + inTestset + '", what=character())')
  # 0/1 factor over the reference set: 1 marks the ids that are in the testset
  R('genes_of_interest = factor(as.integer(refset %in% testset))')
  R('names(genes_of_interest) <- refset')
  for ontology in ["MF", "BP", "CC"]:
    # overrepresentation: "classic" and "weight01" runs of the fisher statistic,
    # each score vector additionally adjusted with method "fdr"
    R('tgData = new("topGOdata", ontology = "' + ontology + '", allGenes = genes_of_interest, annot = annFUN.gene2GO, gene2GO = GOmap)')
    R('fisherRes = runTest(tgData, algorithm="classic", statistic="fisher")')
    R('fisherResCor = p.adjust(score(fisherRes), method="fdr")')
    R('weightRes = runTest(tgData, algorithm="weight01", statistic="fisher")')
    R('weightResCor = p.adjust(score(weightRes), method="fdr")')
    R('allRes    = GenTable(tgData, classic=fisherRes, weight=weightRes, orderBy="weight", ranksOf="classic", topNodes=150)')
    R('allRes$fisher.FDR = fisherResCor[allRes$GO.ID]')
    R('allRes$weight.FDR = weightResCor[allRes$GO.ID]')
    R('write.csv(allRes, "topGO.over.Sig.' + ontology + '.csv")')

    # underrepresentation: same steps, using the GOFisherTestUnder statistic;
    # the R variables of the first half are overwritten
    R('tgData = new("topGOdata", ontology = "' + ontology + '", allGenes = genes_of_interest, annot = annFUN.gene2GO, gene2GO = GOmap)')
    R('test.stat <- new("classicCount", testStatistic = GOFisherTestUnder, name ="Fisher test underrepresentation")')
    R('fisherRes <- getSigGroups(tgData, test.stat)')
    R('fisherResCor = p.adjust(score(fisherRes), method="fdr")')
    R('test.stat <- new("weightCount", testStatistic = GOFisherTestUnder, name ="Fisher test underrepresentation")')
    R('weightRes <- getSigGroups(tgData, test.stat)')
    R('weightResCor = p.adjust(score(weightRes), method="fdr")')
    R('allRes    = GenTable(tgData, classic=fisherRes, weight=weightRes, orderBy="weight", ranksOf="classic", topNodes=150)')
    R('allRes$fisher.FDR = fisherResCor[allRes$GO.ID]')
    R('allRes$weight.FDR = weightResCor[allRes$GO.ID]')
    R('write.csv(allRes, "topGO.under.Sig.' + ontology + '.csv")')
65 |  
66 | main()
67 | 


--------------------------------------------------------------------------------
/python/geneontology/go-from-blastout.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | import anydbm					# index databases (file hash)
 9 | from Bio import SeqIO # biopython stuff, to parse fasta files for instance
10 | 
11 | # =============================================================================	
def show_help( ):
	""" passes the usage text line by line to stdout() and exits with status 1 """
	# the usage line names -f, the only option with a value in the getopt spec
	stdout( "usage: " + sys.argv[0] + " -f <path>" )
	stdout( " " )
	stdout( " option    description" )
	stdout( " -h        help (this text here)" )
	stdout( " -f        blast.out file" )
	stdout( " " )

	sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
	""" parses the command line and returns a dict with the key 'file' (value of
	-f); show_help() is called when the argument is missing or invalid """
	if len ( sys.argv ) == 1:
		stderr( "no arguments provided." )
		show_help()

	try:
		opts, rest = getopt.getopt( sys.argv[1:], "hf:" )
	except getopt.GetoptError:
		stderr( "invalid arguments provided." )
		show_help()

	args = {}
	for flag, text in opts:
		if flag == '-f':
			args['file'] = text

	if 'file' not in args:
		stderr( "blast.out file missing." )
		show_help()
	if not file_exists( args.get('file') ):
		stderr( "blast.out file does not exist." )
		show_help()

	return args
48 | 
49 | 
50 | 
51 | # =============================================================================
def parse_descr( text ):
  """ extracts GO annotations from a description text.

  returns a dict {GO id: description} with one entry for every occurrence of
  GO:<digits> "<description>" evidence; if the text contains no GO id followed
  by the word evidence, a note is written to stderr and the dict is empty """
  if re.search("GO:\d+.*evidence", text) is None:
    sys.stderr.write("return None.\n")
    return {}
  pairs = re.findall( '(GO:\d+)\s*\"([^"]+)\"\s*evidence', text )
  return dict(pairs)
62 |   
63 | 
64 | # =============================================================================
65 | # =============================================================================
def main( args ):
  """ prints the tab-separated file args['file'] with one output line per GO
  term: the column holding the GO annotations is replaced by two columns, GO id
  and description, and the line is repeated for every term parse_descr() finds.

  the annotation column is the first column containing "GO:<digits>" in the
  first line that has such a column; lines before that one, lines too short to
  have this column and lines without GO terms produce no output """
  descr_index = None
  with open( args.get('file') ) as fo:
    for line in fo:
      cols = line.rstrip().split("\t")
      if descr_index is None:
        for index, col in enumerate(cols):
          if re.search(r"GO:\d+", col):
            descr_index = index
            break
      # no GO column found so far, or this line is too short to have it
      if descr_index is None or descr_index >= len(cols): continue
      go_hash = parse_descr( cols[descr_index] )
      for goterm, godescr in go_hash.items():
        expanded = cols[:descr_index] + [goterm, godescr] + cols[descr_index+1:]
        print("\t".join(expanded))
89 | 	
90 | # =============================================================================
91 | # === MAIN ====================================================================
92 | # =============================================================================
93 | 
# script entry point: collect the command line arguments, then run main()
args = handle_arguments(  )
main( args )
96 | 


--------------------------------------------------------------------------------
/python/geneontology/goflat2topgo.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
10 | # =============================================================================  
def show_help( ):
  """ passes the usage text line by line to stdout() and exits with status 1 """
  help_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        GO flat file to import [tab delimited]",
    " ",
  )
  for text in help_lines:
    stdout( text )
  sys.exit(1)
20 | 
21 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the key 'file' (value of
  -f); show_help() is called when the argument is missing or invalid """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    opts, rest = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, text in opts:
    if flag == '-f':
      args['file'] = text

  if 'file' not in args:
    stderr( "import file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "import file does not exist." )
    show_help()

  return args
47 | 
48 | 
49 | # =============================================================================
50 | # === MAIN ====================================================================
51 | # =============================================================================
def main( args ):
  """ groups the GO terms of the two-column file args['file'] (gene id <TAB> GO
  term) by gene id and prints one line per gene: the id, a tab and the GO terms
  joined by ", ".

  only the text before the first blank of the first column is used as gene id.
  empty lines are ignored; lines that do not have exactly two tab-separated
  columns are reported on stderr and skipped. """
  from collections import defaultdict
  goHash = defaultdict(list)
  with open( args.get('file') ) as fo:
    for line in fo:
      line = line.strip()
      if len(line) == 0: continue
      cols = line.split("\t")
      if len(cols) != 2:
        sys.stderr.write("skipping malformed line: " + line + "\n")
        continue
      geneid, goterm = cols
      geneid = geneid.split(" ", 1)[0]
      goHash[geneid].append(goterm)
  for geneid, goterms in goHash.items():
    print(geneid + "\t" + ", ".join(goterms))
66 | 
67 | # =============================================================================
# script entry point: collect the command line arguments, then run main()
args = handle_arguments()
main( args )
70 | 
71 | 


--------------------------------------------------------------------------------
/python/geneontology/goid2name-from-obo-xml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from collections import defaultdict
 9 | from xml.dom import minidom
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """ passes the usage text line by line to stdout() and exits with status 1 """
  help_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        go term obo-xml file",
    " ",
  )
  for text in help_lines:
    stdout( text )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the key 'obo' (value of
  -f); show_help() is called when the argument is missing or invalid """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    opts, rest = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, text in opts:
    if flag == '-f':
      args['obo'] = text

  if 'obo' not in args:
    stderr( "obo file argument missing." )
    show_help()
  elif not file_exists( args.get('obo') ):
    stderr( "obo file does not exist." )
    show_help()

  return args
48 | 
49 | # =============================================================================
class GOTerm():
  """ id, name, namespace and alternative ids read from one xml element """

  def __init__(self, xml):
    def first_text(tag):
      # text of the first element with this tag name below xml
      return xml.getElementsByTagName(tag)[0].firstChild.data

    self.id = first_text("id")
    self.name = first_text("name")
    self.namespace = first_text("namespace")
    self.alt_ids = []
    for node in xml.getElementsByTagName("alt_id"):
      self.alt_ids.append(node.firstChild.data)
57 | 
58 | # =============================================================================
def read_obo( file ):
  """ parses the obo-xml file and returns a dict that maps the id and every
  alternative id of each 'term' element to its GOTerm object; the number of
  dict entries is reported on stderr """
  terms = {}
  document = minidom.parse(file)
  for node in document.getElementsByTagName('term'):
    goterm = GOTerm(node)
    terms[goterm.id] = goterm
    for alt_id in goterm.alt_ids:
      # an alternative id never replaces an entry that is already present
      if alt_id not in terms: terms[alt_id] = goterm
  sys.stderr.write("goterms read from obo: %s" % len(terms) + "\n")
  return terms
69 | 
70 | # =============================================================================
71 | # === MAIN ====================================================================
72 | # =============================================================================
def main( args ):
  """ prints one line "id <TAB> name" for every entry that read_obo() returns
  for the file args['obo'] """
  terms = read_obo(args['obo'])
  for goid in terms:
    print(goid + "\t" + terms[goid].name)
78 | 
79 | # =============================================================================
# script entry point: collect the command line arguments, then run main()
args = handle_arguments()
main( args )
82 | 
83 | 


--------------------------------------------------------------------------------
/python/geneontology/goid2name-from-obo.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from goterm import GOTerm
 9 | from collections import defaultdict
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """ passes the usage text line by line to stdout() and exits with status 1 """
  help_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        go obo file",
    " ",
  )
  for text in help_lines:
    stdout( text )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the key 'obo' (value of
  -f); show_help() is called when the argument is missing or invalid """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    opts, rest = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, text in opts:
    if flag == '-f':
      args['obo'] = text

  if 'obo' not in args:
    stderr( "obo file argument missing." )
    show_help()
  elif not file_exists( args.get('obo') ):
    stderr( "obo file does not exist." )
    show_help()

  return args
48 | 
49 |   
50 | # =============================================================================
def read_obo( file ):
  """ reads an obo file and returns a dict {id: name}.

  file    : path of the obo file
  returns : dict with one entry per stanza that has both an "id:" and a
            "name:" line; a new stanza starts at every line beginning with
            "[Term]" or "[Typedef]"

  the number of entries is reported on stderr. """
  hash = {}
  goterm = {}

  def store( stanza ):
    # only stanzas that are complete (id and name) are kept
    if 'id' in stanza and 'name' in stanza: hash[stanza['id']] = stanza['name']

  with open(file) as fo:
    for line in fo:
      line = line.rstrip()
      if line.startswith("[Term]") or line.startswith("[Typedef]"):
        store(goterm)
        goterm = {}
      elif line.startswith("id:"):
        parts = line.split()
        # an "id:" line without a value is ignored
        if len(parts) > 1: goterm['id'] = parts[1]
      elif line.startswith("name:"):
        goterm['name'] = " ".join(line.split()[1:])
  # the last stanza has no following stanza header that would store it
  store(goterm)
  sys.stderr.write("goterms read from obo: %s\n" % len(hash))
  return hash
67 | 
68 | 
69 | # =============================================================================
70 | # === MAIN ====================================================================
71 | # =============================================================================
def main( args ):
  """ prints one line "id <TAB> name" for every entry that read_obo() returns
  for the file args['obo'] """
  names = read_obo(args['obo'])
  for goid in names:
    print(goid + "\t" + names[goid])
77 | 
78 | # =============================================================================
# script entry point: collect the command line arguments, then run main()
args = handle_arguments()
main( args )
81 | 
82 | 


--------------------------------------------------------------------------------
/python/generic/add-basename-as-first-col.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from collections import defaultdict
 9 | import fileinput
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """ passes the usage text line by line to stdout() and exits with status 1 """
  help_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        input file (will be rewritten on the fly!) - basename is everything before the first dot",
    " -l        basename to lower case",
    " -u        basename to upper case",
    " ",
  )
  for text in help_lines:
    stdout( text )
  sys.exit(1)
24 | 
25 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the keys 'file' (value of
  -f), 'lower' (True if -l was given) and 'upper' (True if -u was given);
  show_help() is called when an argument is missing or invalid """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    opts, rest = getopt.getopt( sys.argv[1:], "hf:ul" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {'lower':False, 'upper':False}
  switches = {'-l': 'lower', '-u': 'upper'}
  for flag, text in opts:
    if flag == '-f':
      args['file'] = text
    elif flag in switches:
      args[switches[flag]] = True

  if 'file' not in args:
    stderr( "fasta file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "fasta file does not exist." )
    show_help()

  # the two case conversions exclude each other
  if args['lower'] and args['upper']:
    stderr( "cannot select both lower and upper." )
    show_help()

  return args
56 | 
57 |   
58 | # =============================================================================
59 | # === MAIN ====================================================================
60 | # =============================================================================
def main( args ):
  """ rewrites the file args['file'] in place: every line gets the basename of
  the file and a tab prepended, and its trailing whitespace removed.

  the basename is the file name (without directories) up to its first dot,
  converted to lower or upper case if args['lower'] or args['upper'] is set. """
  filename = os.path.split(args['file'])[1]
  # everything before the first dot; unlike repeated os.path.splitext() calls
  # this also terminates for names that start with a dot (basename is then "")
  basename = filename.split(".", 1)[0]
  if args['lower']: basename = basename.lower()
  if args['upper']: basename = basename.upper()
  # with inplace=1 the printed text replaces the content of the input file
  for line in fileinput.input(args['file'],inplace=1):
    print(basename + "\t" + line.rstrip())
69 | 
70 | # =============================================================================
# script entry point: collect the command line arguments, then run main()
args = handle_arguments()
main( args )
73 | 
74 | 


--------------------------------------------------------------------------------
/python/generic/add-species-as-first-col.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from collections import defaultdict
 9 | import fileinput
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """ passes the usage text line by line to stdout() and exits with status 1 """
  help_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        input file (will be rewritten on the fly!)",
    " -l        species name to lower case",
    " -u        species name to upper case",
    " ",
  )
  for text in help_lines:
    stdout( text )
  sys.exit(1)
24 | 
25 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the keys 'file' (value of
  -f), 'lower' (True if -l was given) and 'upper' (True if -u was given);
  show_help() is called when an argument is missing or invalid """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    opts, rest = getopt.getopt( sys.argv[1:], "hf:ul" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {'lower':False, 'upper':False}
  switches = {'-l': 'lower', '-u': 'upper'}
  for flag, text in opts:
    if flag == '-f':
      args['file'] = text
    elif flag in switches:
      args[switches[flag]] = True

  if 'file' not in args:
    stderr( "fasta file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "fasta file does not exist." )
    show_help()

  # the two case conversions exclude each other
  if args['lower'] and args['upper']:
    stderr( "cannot select both lower and upper." )
    show_help()

  return args
56 | 
57 |   
58 | # =============================================================================
59 | # === MAIN ====================================================================
60 | # =============================================================================
def main( args ):
  """ rewrites the file args['file'] in place: every line gets the species name
  and a tab prepended, and its trailing whitespace removed.

  the species name is the first four characters of the file name, converted to
  lower or upper case if args['lower'] or args['upper'] is set. """
  # take the prefix from the file name, not from a leading directory part
  species = os.path.basename(args['file'])[:4]
  if args['lower']: species = species.lower()
  if args['upper']: species = species.upper()
  # with inplace=1 the printed text replaces the content of the input file
  for line in fileinput.input(args['file'],inplace=1):
    print(species + "\t" + line.rstrip())
67 | 
68 | # =============================================================================
# script entry point: collect the command line arguments, then run main()
args = handle_arguments()
main( args )
71 | 
72 | 


--------------------------------------------------------------------------------
/python/generic/add_to_xdom.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys 				# low level handling, such as command line stuff
 3 | import getopt					# comand line argument handling
 4 | import anydbm					# index databases (file hash)
 5 | from low import *			# collection of generic self-defined functions
 6 | 
 7 | 
 8 | # =============================================================================	
 9 | def show_help( ):
10 | 	""" displays the program parameter list and usage information """
11 | 	stdout( "usage: " + sys.argv[0] + " -f <path> -o <path>" )
12 | 	stdout( " " )
13 | 	stdout( " option    description" )
14 | 	stdout( " -h        help (this text here)" )
15 | 	stdout( " -f        xdom file" )
16 | 	stdout( " -i        indexed ndb file" )
17 | 	stdout( " -n        column to look up [0..n]" )
18 | 	stdout( " " )
19 | 	sys.exit(1)
20 | 	
21 | # =============================================================================
def handle_arguments():
  """ parses and validates the command line.

  returns a dict with the keys 'xdom' (path of the xdom file), 'dbm' (path of
  the indexed dbm file) and 'column' (int, index of the column to look up).
  every problem is reported through stderr() followed by show_help(), which
  exits the program.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # check for the right arguments
    keys, values = getopt.getopt( sys.argv[1:], "hf:i:n:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for key, value in keys:
    if key == '-h': show_help()
    if key == '-f': args['xdom'] = value
    if key == '-i': args['dbm'] = value
    if key == '-n':
      try:
        args['column'] = int(value)
      except ValueError:
        # a non-numeric value used to end in a traceback instead of the usage text
        stderr( "column index must be an integer." )
        show_help()

  if 'xdom' not in args:
    stderr( "xdom file missing." )
    show_help()
  if not file_exists( args.get('xdom') ):
    stderr( "xdom file does not exist." )
    show_help()

  if 'dbm' not in args:
    stderr( "dbm file missing." )
    show_help()
  if not file_exists( args.get('dbm') ):
    stderr( "dbm file does not exist." )
    show_help()

  if 'column' not in args:
    stderr( "column index missing." )
    show_help()

  return args
59 | 
60 | 	
61 | # =============================================================================
62 | # =============================================================================
def main( args ):
  """ prints the xdom file with every data line extended by its dbm entry.

  args -- dict with 'dbm' (path of the dbm index), 'xdom' (path of the xdom
          file) and 'column' (index of the whitespace-separated column whose
          content is the lookup key)

  lines starting with '>' are printed unchanged. for every other line the
  key is looked up in the dbm and the stored value is appended after a tab.
  lines whose key is unknown, and lines that are blank or too short to have
  the requested column, are printed unchanged; the problem is reported on
  stderr so that the data stream on stdout stays clean.
  """
  n = args.get('column')
  DBM = anydbm.open( args.get('dbm'), 'r' )
  try:
    fo = open( args.get('xdom') )
    try:
      for line in fo:
        line = line.rstrip()
        if line.startswith('>'):
          print( line )
          continue
        try:
          pid = line.split()[ n ]
        except IndexError:
          # blank or short line: nothing to look up (used to abort with a traceback)
          if line: sys.stderr.write( "line has no column %s: %s\n" % (n, line) )
          print( line )
          continue
        if not DBM.has_key( pid ):
          sys.stderr.write( "DBM does not contain the following key: %s\n" % pid )
        else:
          # plain indexing works for every dbm flavour anydbm may hand back
          line += "\t" + DBM[pid]
        print( line )
    finally:
      fo.close()
  finally:
    DBM.close()
88 | 
89 | # =============================================================================
90 | # === MAIN ====================================================================
91 | # =============================================================================
# script entry point: check the command line, then annotate the xdom file
args = handle_arguments()
main(args)
94 | 
95 | 


--------------------------------------------------------------------------------
/python/generic/difference.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sets
 4 | import sys, os
 5 | 
 6 | def get_lines( file ):
 7 |   lines = []
 8 |   fo = open(file)
 9 |   for line in fo:
10 |     line = line.rstrip()
11 |     lines.append(line)
12 | 
13 |   return sets.Set(lines)
14 | 
# prints every line of the first file that does not occur in the second file
if len(sys.argv) < 3:
  # without this check a missing argument ended in an IndexError traceback
  sys.stderr.write("usage: " + sys.argv[0] + " <file1> <file2>\n")
  sys.exit(1)
l1 = get_lines(sys.argv[1])
l2 = get_lines(sys.argv[2])
for e in l1.difference(l2):
  print(e)
19 | 


--------------------------------------------------------------------------------
/python/generic/flat-split-by-lines.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import getopt      # comand line argument handling
 6 | from low import *
 7 | 
 8 | # =============================================================================  
 9 | def show_help( ):
10 |   """ displays the program parameter list and usage information """
11 |   print "splits a flat file into chunks. Options: (1) N number of lines per chunk. (2) N number of chunks of equal size"
12 |   print "usage: " + sys.argv[0] + " -f <file> [-i <chunks> -l <lines>]"
13 |   print " "
14 |   print " option    description"
15 |   print " -h        help (this text here)"
16 |   print " -f        flat file to split"
17 |   print " -l        number of lines per chunk"
18 |   print " -i        number of equally sized chunks"
19 |   print " "
20 |   sys.exit(1)
21 | 
22 | # =============================================================================
def handle_arguments():
  """ parses and validates the command line.

  returns a dict with 'file' (path of an existing file) and at least one of
  'l' (lines per chunk) and 'i' (number of chunks), both positive ints.
  every problem is reported on stderr and ends in show_help(), which exits.
  """
  def complain( message ):
    # report the problem, then print the usage text (show_help exits)
    sys.stderr.write( message + "\n" )
    show_help()

  if len ( sys.argv ) == 1:
    complain( "no arguments provided." )

  try: # check for the right arguments
    keys, values = getopt.getopt( sys.argv[1:], "hf:i:l:" )
  except getopt.GetoptError:
    complain( "invalid arguments provided." )

  args = {}
  for key, value in keys:
    if key == '-h': show_help()
    if key == '-f': args['file'] = value
    if key in ('-l', '-i'):
      try:
        number = int(value)
      except ValueError:
        number = 0
      # zero or negative sizes would later divide by zero in main()
      if number < 1:
        complain( key + " requires a positive integer." )
      args[key[1:]] = number

  if 'file' not in args:
    complain( "import file argument missing." )
  elif not file_exists( args.get('file') ):
    complain( "import file does not exist." )

  if 'l' not in args and 'i' not in args:
    complain( "l or i missing." )

  return args
53 | 
54 | 
def get_number_of_lines(file):
  """ returns the number of lines of the text file at path `file`.

  the handle is closed before returning (it used to be left open).
  """
  fo = open(file)
  try:
    lines = 0
    for line in fo: lines += 1
    return lines
  finally:
    fo.close()
60 | 
61 | # =============================================================================
def get_lines_in(ifile):
  """ counts the lines of the file at path ifile and returns that count """
  handle = open(ifile)
  total = sum( 1 for _ in handle )
  handle.close()
  return total
68 |   
69 | # =============================================================================
70 | # === MAIN ====================================================================
71 | # =============================================================================
def main( args ):
  """ splits args['file'] into consecutive chunk files.

  args -- dict with 'file' (path) and 'i' (number of chunks) and/or
          'l' (lines per chunk); 'i' takes precedence when both are present

  chunk k is written to <file>.<suffix>, where the suffix comes from
  add_leading_zeroes(k, digits) -- presumably k zero-padded to `digits`
  characters; the helper lives outside this file.
  raises ValueError when the resulting chunk size is smaller than 1.
  """
  import math  # the import block of this file does not import math

  path = args.get('file')
  totallines = get_lines_in(path)
  if 'i' in args:
    # at least one line per chunk, so that an empty input cannot yield a chunk size of 0
    rotate = max( int(math.ceil( 1.0 * totallines / args.get('i') )), 1 )
  else:
    rotate = args.get('l')
  if rotate < 1:
    raise ValueError( "chunk size must be at least 1 line" )

  # number of digits of the chunk count; int() matters because math.ceil
  # returns a float on Python 2 and str(3.0) would count as three characters
  chunks = max( int(math.ceil( 1.0 * totallines / rotate )), 1 )
  digits = len(str(chunks))

  linecount, filecount = 0, 1
  fo = open( path )
  try:
    fw = open( path + '.' + add_leading_zeroes(filecount, digits), 'w' )
    try:
      for line in fo:
        linecount += 1
        # every `rotate` lines a new chunk starts (never before the first line)
        if linecount > 1 and (linecount - 1) % rotate == 0:
          filecount += 1
          fw.close()
          fw = open( path + '.' + add_leading_zeroes(filecount, digits), 'w' )
        fw.write(line)
    finally:
      fw.close()
  finally:
    fo.close()
91 |   
92 | 
93 | # =============================================================================
# script entry point: check the command line, then split the file
args = handle_arguments()
main(args)
96 | 
97 | 


--------------------------------------------------------------------------------
/python/generic/flat2line.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import getopt      # comand line argument handling
 6 | from collections import defaultdict
 7 | from low import *  # custom functions, written by myself
 8 | 
 9 | # =============================================================================  
def show_help( ):
  """ writes the option overview through stdout() and ends the program with status 1 """
  helptext = [
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        flat file to import",
    " -d        delimiter (default: ', ' | allowed: any string, tab, space",
    " ",
  ]
  for text in helptext:
    stdout( text )
  sys.exit(1)
20 | 
21 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with 'file' and 'delimiter'.

  'delimiter' defaults to ', '; the words "tab" and "space" are translated
  into the respective character, any other value is used literally.
  problems are reported via stderr() followed by show_help().
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:p:d:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-f': args['file'] = argument
    elif flag == '-d': args['delimiter'] = argument

  if 'file' not in args:
    stderr( "import file argument missing." )
    show_help()
  elif not file_exists( args['file'] ):
    stderr( "import file does not exist." )
    show_help()

  named = { "tab": "\t", "space": " " }
  if 'delimiter' in args:
    args['delimiter'] = named.get( args['delimiter'], args['delimiter'] )
  else:
    args['delimiter'] = ', '

  return args
52 | 
53 | 
54 | # =============================================================================
55 | # === MAIN ====================================================================
56 | # =============================================================================
def main( args ):
  """ collapses a two-column flat file into one output line per key.

  args -- dict with 'file' (path of a tab-delimited file: key <TAB> value)
          and 'delimiter' (string put between the values of one key,
          ', ' when absent)

  prints key <TAB> value1<delimiter>value2... for every key, keys in order of
  first appearance. blank lines are skipped; a line without a tab is
  reported on stderr and skipped; additional tabs stay part of the value.
  """
  delimiter = args.get('delimiter', ', ')
  grouped, order = {}, []
  fo = open( args.get('file') )
  try:
    for line in fo:
      line = line.rstrip()
      if not line: continue
      columns = line.split("\t", 1)
      if len(columns) != 2:
        # used to abort with "need more than 1 value to unpack"
        sys.stderr.write( "skipping line without value column: " + line + "\n" )
        continue
      key, value = columns
      if key not in grouped:
        grouped[key] = []
        order.append(key)
      grouped[key].append(value)
  finally:
    fo.close()

  for key in order:
    print( key + "\t" + delimiter.join(grouped[key]) )
69 | 
70 | # =============================================================================
# script entry point: check the command line, then print the collapsed lines
args = handle_arguments()
main(args)
73 | 
74 | 


--------------------------------------------------------------------------------
/python/generic/flat2matrix.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
10 | # =============================================================================  
def show_help( ):
  """ writes the option overview through stdout() and ends the program with status 1 """
  helptext = [
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        flat file to import [tab delimited]",
    " -a        index of the first dimension key [default: 0]",
    " -b        index of the second dimension key [default: 1]",
    " -v        index of the value [default: 2]",
    " -o        order: comma-separated list of keys in which to output the matrix [default: alphabetically sorted]",
    " ",
  ]
  for text in helptext:
    stdout( text )
  sys.exit(1)
24 | 
25 | # =============================================================================
def handle_arguments():
  """ parses and validates the command line.

  returns a dict with 'file' (path of an existing file), the column indices
  'key1', 'key2' and 'value' (ints, defaults 0, 1, 2) and, when -o was given,
  'order' (list of keys). problems are reported via stderr() followed by
  show_help(), which exits.
  """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # check for the right arguments
    keys, values = getopt.getopt( sys.argv[1:], "hf:a:b:v:o:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  index_options = { '-a': 'key1', '-b': 'key2', '-v': 'value' }
  args = {'key1':0, 'key2':1, 'value':2}
  for key, value in keys:
    if key == '-h': show_help()
    if key == '-f': args['file'] = value
    if key == '-o': args['order'] = value.split(",")
    if key in index_options:
      try:
        args[index_options[key]] = int(value)
      except ValueError:
        # a non-numeric index used to end in a traceback instead of the usage text
        stderr( "column index for " + key + " must be an integer." )
        show_help()

  if 'file' not in args:
    stderr( "import file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "import file does not exist." )
    show_help()

  return args
55 | 
56 | 
57 | # =============================================================================
58 | # === MAIN ====================================================================
59 | # =============================================================================
def main( args ):
  """ turns a flat list of (key1, key2, value) records into a comma-separated matrix.

  args -- dict with 'file' (tab-delimited input), 'key1', 'key2', 'value'
          (column indices) and optionally 'order' (list of keys that fixes
          the row/column order; default: all keys sorted)

  output: one header line with the keys, then one line per key holding the
  key followed by one cell per column key. a cell is looked up as (row, col)
  and, failing that, as (col, row); missing pairs are printed as NA.
  blank input lines are skipped.
  """
  matrix = {}
  keys, seen = [], set()
  fo = open( args.get('file') )
  try:
    for line in fo:
      col = line.strip().split("\t")
      if col == ['']: continue   # blank line (used to raise IndexError)
      key1, key2, value = col[args['key1']], col[args['key2']], col[args['value']]
      # tuple keys cannot collide the way concatenated strings can
      matrix[(key1, key2)] = value
      for k in (key1, key2):
        if k not in seen:
          seen.add(k)
          keys.append(k)
  finally:
    fo.close()
  if 'order' in args: keys = args['order']
  else: keys.sort()

  print( ",".join(keys) )
  for i in keys:
    row = [i]
    for j in keys:
      if (i, j) in matrix: row.append( matrix[(i, j)] )
      elif (j, i) in matrix: row.append( matrix[(j, i)] )
      else: row.append( 'NA' )
    print( ",".join(row) )
84 | 
85 | 
86 | # =============================================================================
# script entry point: check the command line, then print the matrix
args = handle_arguments()
main(args)
89 | 
90 | 


--------------------------------------------------------------------------------
/python/generic/flat2xdom.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
10 | # =============================================================================  
def show_help( ):
  """ writes the option overview through stdout() and ends the program with status 1 """
  helptext = [
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        fasta file to import",
    " -p        prefix to put in fron of the key",
    " -d        delimiter (default: space | allowed: ; , tab space",
    " ",
  ]
  for text in helptext:
    stdout( text )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with 'file', 'delimiter' and optionally 'prefix'.

  'delimiter' is one of ";", ",", "tab", "space"; anything else (or nothing)
  becomes "space". problems are reported via stderr() followed by show_help().
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:p:d:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-f': args['file'] = argument
    elif flag == '-p': args['prefix'] = argument
    elif flag == '-d': args['delimiter'] = argument

  if 'file' not in args:
    stderr( "import file argument missing." )
    show_help()
  elif not file_exists( args['file'] ):
    stderr( "import file does not exist." )
    show_help()

  # a missing delimiter reads as None here and falls back to the default as well
  if args.get('delimiter') not in ( ";", ",", "tab", "space" ):
    args['delimiter'] = 'space'

  return args
53 | 
54 | 
55 | # =============================================================================
56 | # === MAIN ====================================================================
57 | # =============================================================================
def main( args ):
  """ converts a flat file into xdom-style output.

  args -- dict with 'file' (path), 'delimiter' ("tab", "space" or a literal
          separator string) and optionally 'prefix'

  the first column of every line is the id. whenever the id differs from the
  id of the previous line, a header line ">" + prefix + id is printed; the
  remaining columns follow tab-joined on their own line. blank lines are
  skipped (in space mode they used to raise IndexError).
  """
  delimiter = args.get('delimiter')
  prefix = args.get('prefix', '')
  fo = open( args.get('file') )
  try:
    oldid = ""
    for line in fo:
      line = line.rstrip()
      if not line: continue
      if delimiter == "tab":
        columns = line.split("\t")
      elif delimiter == "space":
        columns = line.split()
      else:
        columns = line.split( delimiter )
      id = columns[0]
      if id != oldid:
        oldid = id
        print( ">" + prefix + id )
      print( "\t".join( columns[1:] ) )
  finally:
    fo.close()
79 | 
80 | # =============================================================================
# script entry point: check the command line, then print the xdom records
args = handle_arguments()
main(args)
83 | 
84 | 


--------------------------------------------------------------------------------
/python/generic/grab-columns.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
10 | # =============================================================================  
def show_help( ):
  """ prints the usage text through stdout() and exits with status 1.

  the usage line names the options the parser really accepts (-f, -r, -1);
  it used to show "-i -n", which do not exist.
  """
  stdout( "usage: " + sys.argv[0] + " -f <path> -r <regex> [-1]" )
  stdout( " " )
  stdout( " option    description" )
  stdout( " -h        help (this text here)" )
  stdout( " -f        tab delimited input file" )
  stdout( " -1        keep first column" )
  stdout( " -r        regex for the column header to mark as to keep" )
  stdout( " " )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses and validates the command line.

  returns a dict with 'file' (path of an existing file), 'regex' (compiled
  pattern applied to the column headers) and 'keepfirstcol' (0 or 1).
  problems are reported via stderr() followed by show_help(), which exits.
  """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # check for the right arguments
    keys, values = getopt.getopt( sys.argv[1:], "hf:r:1" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  args['keepfirstcol'] = 0
  for key, value in keys:
    if key == '-h': show_help()
    if key == '-f': args['file'] = value
    if key == '-1': args['keepfirstcol'] = 1
    if key == '-r':
      try:
        args['regex'] = re.compile(value)
      except re.error:
        # a malformed pattern used to end in a traceback instead of the usage text
        stderr( "invalid regular expression: " + value )
        show_help()

  if 'file' not in args:
    stderr( "import file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "import file does not exist." )
    show_help()

  if 'regex' not in args:
    stderr( "regex argument missing." )
    show_help()

  return args
55 | 
56 | 
57 | # =============================================================================
def get_header( file ):
  """ returns the first line of the file at path `file`, trailing whitespace removed """
  handle = open(file)
  firstline = handle.readline()
  handle.close()
  return firstline.rstrip()
63 | 
64 | 
65 | # =============================================================================
66 | # === MAIN ====================================================================
67 | # =============================================================================
def main( args ):
  """ prints the selected columns of a tab-delimited file.

  args -- dict with 'file' (path), 'regex' (compiled pattern) and
          'keepfirstcol' (true to always keep column 0)

  a column is kept when its header (first line of the file) matches the
  regex. every line, the header included, is printed reduced to the kept
  columns. rows that are shorter than the header get empty cells for the
  missing columns; line.rstrip() drops trailing empty cells, which used to
  end in an IndexError.
  """
  regex = args.get('regex')
  keepindices = None
  fo = open( args.get('file') )
  try:
    for line in fo:
      columns = line.rstrip().split("\t")
      if keepindices is None:
        # first line = header: decide once which column indices survive
        keepindices = []
        for i in range(len(columns)):
          if regex.search(columns[i]) or (i == 0 and args.get('keepfirstcol')):
            keepindices.append(i)
      out = []
      for i in keepindices:
        if i < len(columns): out.append( columns[i] )
        else: out.append( "" )
      print( "\t".join(out) )
  finally:
    fo.close()
89 | 
90 | # =============================================================================
# script entry point: check the command line, then print the kept columns
args = handle_arguments()
main(args)
93 | 
94 | 


--------------------------------------------------------------------------------
/python/generic/intersection.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sets
 4 | import sys, os
 5 | 
 6 | def get_lines_in_hash(file):
 7 |   hash = {}
 8 |   fo = open(file)
 9 |   for line in fo: hash[line.strip()] = 1
10 |   fo.close()
11 |   return hash
12 | 
def get_lines( file ):
  """ returns the set of distinct lines of a text file, trailing whitespace removed.

  file -- path of the file to read

  the result is a built-in set (the sets module is deprecated); the file
  handle is closed even when reading fails.
  """
  fo = open(file)
  try:
    return set( line.rstrip() for line in fo )
  finally:
    fo.close()
21 | 
def terminate():
  """ reports the input requirement on stderr and exits with status 1.

  written with sys.stderr.write so that the function does not depend on the
  Python-2-only "print >>" statement.
  """
  sys.stderr.write("provide at least two valid input files as input arguments\n")
  sys.exit(1)
25 | 
26 | 
# at least two arguments are required and every one has to be an existing file
if len(sys.argv[1:]) < 2: terminate()
for inputfile in sys.argv[1:]:
  if not os.path.isfile(inputfile): terminate()

# one lookup dict per input file
allhashes = []
for file in sys.argv[1:]:
  allhashes.append( get_lines_in_hash(file) )

# a line of the first file is printed when every file contains it
refkeys = allhashes[0].keys()
for refkey in refkeys:
  if all( refkey in hash for hash in allhashes ):
    print(refkey)
43 | 
44 | #l1 = get_lines(sys.argv[1])
45 | #l2 = get_lines(sys.argv[2])
46 | #for e in l1.intersection(l2):
47 | #  print e
48 | 


--------------------------------------------------------------------------------
/python/generic/subtract.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sets
 4 | import sys, os
 5 | 
 6 | def get_lines( file ):
 7 |   lines = []
 8 |   fo = open(file)
 9 |   for line in fo:
10 |     line = line.rstrip()
11 |     lines.append(line)
12 | 
13 |   return sets.Set(lines)
14 | 
# prints the lines of the first file that occur in none of the other files
if len(sys.argv) < 2:
  # without this check a missing argument ended in an IndexError traceback
  sys.stderr.write("usage: " + sys.argv[0] + " <reference-file> [<file> ...]\n")
  sys.exit(1)

ref = get_lines(sys.argv[1])
for filename in sys.argv[2:]:
  # drop everything the other file contains in one set operation
  ref.difference_update( get_lines(filename) )

for e in ref: print(e)
22 | 


--------------------------------------------------------------------------------
/python/generic/text2range.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
# trailing run of digits at the end of a text; raw string so that \d is not
# treated as a (deprecated) string escape
REGEX = re.compile(r"(\d+)$")
11 | 
12 | # =============================================================================  
def show_help( ):
  """ writes the option overview through stdout() and ends the program with status 1 """
  helptext = [
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        text flat file to analyze",
    " ",
  ]
  for text in helptext:
    stdout( text )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict holding 'file', the path given with -f.

  problems are reported via stderr() followed by show_help().
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-f': args['file'] = argument

  if 'file' not in args:
    stderr( "import file argument missing." )
    show_help()
  elif not file_exists( args['file'] ):
    stderr( "import file does not exist." )
    show_help()

  return args
48 | 
49 | 
50 | # =============================================================================
def is1higherthan( text1, text2, regex=REGEX ):
  """ returns 1 if text2 equals text1 with its number increased by exactly one, else 0.

  text1, text2 -- strings of the form <prefix><number>
  regex        -- compiled pattern whose first group captures the number
                  (default: the trailing digits)

  both texts need the same prefix (the part before the number). a text in
  which the regex finds no number is never "one higher"; this case used to
  raise an IndexError.
  """
  def splittext( text, regex ):
    # (prefix, number) of the text, or None when the text carries no number
    match = regex.search( text )
    if match is None: return None
    return text[:match.start()], int( match.group(1) )

  part1 = splittext( text1, regex )
  part2 = splittext( text2, regex )
  if part1 is None or part2 is None: return 0
  id1, number1 = part1
  id2, number2 = part2
  if id1 != id2: return 0
  if (number1 + 1) == number2: return 1
  return 0
61 | 
62 | # =============================================================================
63 | # === MAIN ====================================================================
64 | # =============================================================================
def main( args ):
  """ condenses a list of numbered names into ranges.

  args -- dict with 'file': path of a text file holding one name per line

  consecutive lines whose number grows by exactly one (see is1higherthan)
  form one range; for every range the first and the last name are printed,
  separated by a tab. blank lines are ignored.
  the final range now includes the last line of the file. previously the
  last line was only used as a terminator and never reported, and a file
  with a single line produced no output at all.
  """
  fo = open( args.get('file') )
  try:
    lines = [ line.rstrip() for line in fo ]
  finally:
    fo.close()
  lines = [ line for line in lines if line ]
  if not lines: return

  started_at = previous = lines[0]
  for current in lines[1:]:
    if not is1higherthan( previous, current ):
      # the run ends at the previous line; the current line opens a new one
      print( started_at + "\t" + previous )
      started_at = current
    previous = current
  print( started_at + "\t" + previous )
79 |     
80 | # =============================================================================
# script entry point: check the command line, then print the ranges
args = handle_arguments()
main(args)
83 | 
84 | 


--------------------------------------------------------------------------------
/python/generic/xdom2flat.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import math        # match functions
 8 | from low import *  # custom functions, written by myself
 9 | 
10 | # =============================================================================  
def show_help( ):
  """ prints the usage text through stdout() and exits with status 1.

  -p is still accepted by the option parser of this script but has no
  effect, which the help text now says.
  """
  stdout( "usage: " + sys.argv[0] + " -f <path>" )
  stdout( " " )
  stdout( " option    description" )
  stdout( " -h        help (this text here)" )
  stdout( " -f        fasta file to import" )
  stdout( " -p        prefix (accepted, but currently ignored)" )
  stdout( " -d        delimiter (default: space | allowed: ; , tab space)" )
  stdout( " " )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with 'file' and 'delimiter'.

  'delimiter' is one of ";", ",", "tab", "space"; anything else (or nothing)
  becomes "space". -p is accepted by the parser but not stored.
  problems are reported via stderr() followed by show_help().
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:p:d:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-f': args['file'] = argument
    elif flag == '-d': args['delimiter'] = argument

  if 'file' not in args:
    stderr( "import file argument missing." )
    show_help()
  elif not file_exists( args['file'] ):
    stderr( "import file does not exist." )
    show_help()

  # a missing delimiter reads as None here and falls back to the default as well
  if args.get('delimiter') not in ( ";", ",", "tab", "space" ):
    args['delimiter'] = 'space'

  return args
52 | 
53 | 
54 | # =============================================================================
55 | # === MAIN ====================================================================
56 | # =============================================================================
def main( args ):
  """ flattens an xdom-style file: every data line is printed behind the id of its record.

  args -- dict with 'file' (path) and 'delimiter' ("space", "tab", ";" or
          ","; anything else is treated as "space")

  a line starting with ">" sets the current id (the text after ">"); every
  other line is printed as id + delimiter + line. data lines in front of the
  first ">" header have no id: they are reported on stderr and skipped
  (they used to raise a NameError).
  """
  delimiters = { "space": " ", "tab": "\t", ";": ";", ",": "," }
  delim = delimiters.get( args.get('delimiter'), " " )

  id = None
  fo = open( args.get('file') )
  try:
    for line in fo:
      line = line.rstrip()
      if line.startswith(">"):
        id = line[1:]
        continue
      if id is None:
        sys.stderr.write( "skipping line before first '>' header: " + line + "\n" )
        continue
      print( id + delim + line )
  finally:
    fo.close()
72 | 
73 | # =============================================================================
# script entry point: check the command line, then print the flat records
args = handle_arguments()
main(args)
76 | 
77 | 


--------------------------------------------------------------------------------
/python/generic/z-score-stats.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, string
 3 | from low import *
 4 | from collections import defaultdict
 5 | import rpy2.robjects as robjects
 6 | R = robjects.r
 7 | 
 8 | 
 9 | # =============================================================================
def usage():
  """ prints the one-line usage text on stderr and exits with status 1.

  written with sys.stderr.write so that the function does not depend on the
  Python-2-only "print >>" statement.
  """
  sys.stderr.write("usage: " + sys.argv[0] + " paralog-count.tab" + "\n")
  sys.exit(1)
13 | 
14 | 
def plausi():
  """ plausibility check of the command line: expects exactly one argument and returns it """
  argc = len(sys.argv)
  if argc != 2: usage()
  return sys.argv[1]
19 | 
20 | 
def R_mean_and_sd(pylist):
  """ hands pylist to R as an integer vector and returns the tuple (mean, sd) computed by R's mean and sd """
  vector = robjects.IntVector(pylist)
  return R['mean'](vector)[0], R['sd'](vector)[0]
26 | 
27 | 
def Zscore(x, mean, sd):
  """ returns (x - mean) / sd as a float, or 0 when sd is 0 """
  if sd != 0:
    return (1.0*x - mean)/sd
  return 0
31 | 
def main():
  """ counts, per species, the clusters whose gene count is an outlier within its row.

  reads the tab-delimited table named on the command line: the header holds
  the species names from the second column on, every further line holds a
  cluster id followed by one gene count per species. for each row the Z-score
  of every count is computed from the row's mean and sd, and the scores
  beyond +-2 / +-3 are tallied per species. the tally is printed as a
  tab-delimited table: one column per species (sorted), one row per category.
  """
  inCounts = plausi()
  fo = open(inCounts)
  lines = fo.readlines()
  fo.close()
  speciesArray = lines.pop(0).rstrip().split("\t")[1:]
  results = defaultdict(lambda: defaultdict(int))
  for line in lines:
    genecounts = line.rstrip().split("\t")[1:]
    mean, sd = R_mean_and_sd(genecounts)
    for i, rawcount in enumerate(genecounts):
      gc, species = int(rawcount), speciesArray[i]
      z = Zscore(gc, mean, sd)
      # scores within [-2, 2] fall through to the else branch and are not tallied
      if z > 3: zcat = 'Z > 3'
      elif z > 2: zcat = 'Z > 2'
      elif z < -3: zcat = 'Z < -3'
      elif z < -2: zcat = 'Z < -2'
      else: continue
      results[species][zcat] += 1

  speciesArray.sort()
  sys.stdout.write("\t" + "\t".join(speciesArray) + "\n")
  for zcat in ['Z > 3', 'Z > 2', 'Z < -3', 'Z < -2']:
    cells = [ str(results[spec][zcat]) for spec in speciesArray ]
    sys.stdout.write(zcat + "".join( "\t" + cell for cell in cells ) + "\n")
63 | 
64 | 
65 | main()
66 | 


--------------------------------------------------------------------------------
/python/gff/droso-chromosome-reconstruction.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import getopt      # comand line argument handling
 6 | from collections import defaultdict
 7 | from low import *  # custom functions, written by myself
 8 | 
 9 | # =============================================================================  
def show_help( ):
  """Emit the usage line on stderr, the option table through stdout(), then exit with status 1."""
  sys.stderr.write("usage: " + sys.argv[0] + " -d <gff-folder>" + "\n")
  option_table = (
    " option    description",
    " -h        help (this text here)",
    " -d        folder with gff files to parse",
    " ",
  )
  for row in option_table:
    stdout( row )
  sys.exit(1)
18 | 
19 | # =============================================================================
def handle_arguments():
  """Parse the command line and return {'dir': <gff folder ending in '/'>}.

  Any problem (no arguments, unknown option, missing -d, folder rejected by
  dir_exists()) is reported and followed by show_help(), which exits.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # only -h and -d <value> are accepted
    options, leftover = getopt.getopt( sys.argv[1:], "hd:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-d':
      args['dir'] = argument

  if 'dir' not in args:
    sys.stderr.write( "gff dir argument missing." + "\n" )
    show_help()
  elif not dir_exists( args.get('dir') ):
    sys.stderr.write( "gff dir does not exist." + "\n" )
    show_help()

  # later code concatenates folder + file name, so force a trailing slash
  if not args['dir'].endswith("/"):
    args['dir'] = args['dir'] + '/'
  return args
45 | 
46 | 
47 | # =============================================================================
48 | # === MAIN ====================================================================
49 | # =============================================================================
def main( args ):
  """Print one tab-separated line per gene found in the GFF files of args['dir'].

  args: dict whose 'dir' entry is the folder to scan (with trailing '/').

  Every file ending in .gff or .gff.gz is read; for each 9-column record of
  type "gene" the line
      species <TAB> ID <TAB> seqid <TAB> start <TAB> stop <TAB> strand
  is written to stdout. The species is the part of the file name before the
  first "-" (or before the first "." when the name has no dash).

  Compressed files are read through the gzip module, so the input folder is
  never modified and no shell command is built from file names.
  """
  # function-scope imports: this script's header imports neither module
  import gzip
  import re

  # the trailing ";" is not required, so an ID that is the last attribute
  # of column 9 is found as well
  id_regex = re.compile("ID=([^;]+)")

  def process_gff_line(line, species):
    """Write the output line for one GFF record, ignoring non-gene records."""
    if line.startswith("#") or len(line.rstrip()) == 0: return
    columns = line.rstrip().split("\t")
    if len(columns) != 9: return
    if columns[2] != "gene": return
    seqid, start, stop, strand, descr = columns[0], columns[3], columns[4], columns[6], columns[8]
    found = id_regex.search(descr)
    if found is None:
      # a gene without ID cannot be reported; warn instead of crashing
      sys.stderr.write("skipping gene without ID attribute: " + line.rstrip() + "\n")
      return
    sys.stdout.write("\t".join([species, found.group(1), seqid, start, stop, strand]) + "\n")

  for filename in os.listdir(args['dir']):
    compressed = filename.endswith(".gff.gz")
    if not compressed and not filename.endswith(".gff"): continue
    if "-" in filename:
      species = filename[:filename.index("-")]
    else:
      species = filename.split(".", 1)[0]
    path = args['dir'] + filename
    if compressed:
      handle = gzip.open(path, "rb")
    else:
      handle = open(path, "rb")
    try:
      for raw in handle:
        # binary mode yields bytes on Python 3 and str on Python 2
        if isinstance(raw, str): line = raw
        else: line = raw.decode("utf-8", "replace")
        process_gff_line(line, species)
    finally:
      handle.close()
78 | 
79 | 
80 | 
81 | # =============================================================================
82 | args = handle_arguments()
83 | main( args )
84 | 
85 | 


--------------------------------------------------------------------------------
/python/gff/droso-introns-exons.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import getopt      # comand line argument handling
 6 | from collections import defaultdict
 7 | from low import *  # custom functions, written by myself
 8 | 
 9 | # =============================================================================  
def show_help( ):
  """Print usage (stderr) and the option overview (via stdout()), then exit with status 1."""
  usage_line = "usage: " + sys.argv[0] + " -d <gff-folder>"
  sys.stderr.write(usage_line + "\n")
  stdout( " option    description" )
  for option_line in [ " -h        help (this text here)", " -d        folder with gff files to parse" ]:
    stdout( option_line )
  stdout( " " )
  sys.exit(1)
18 | 
19 | # =============================================================================
def handle_arguments():
  """Validate the command line and return a dict with the gff folder under 'dir'.

  The returned folder always ends in '/'. Every failure path prints a
  message and calls show_help(), which terminates the program.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    parsed, positional = getopt.getopt( sys.argv[1:], "hd:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for option, value in parsed:
    if option != '-d': continue
    args['dir'] = value

  if 'dir' not in args:
    sys.stderr.write( "gff dir argument missing." + "\n" )
    show_help()
  elif not dir_exists( args.get('dir') ):
    sys.stderr.write( "gff dir does not exist." + "\n" )
    show_help()

  if not args['dir'].endswith("/"):
    args['dir'] += '/'
  return args
45 | 
46 | 
47 | # =============================================================================
48 | # === MAIN ====================================================================
49 | # =============================================================================
def main( args ):
  """Print one tab-separated line per exon/intron found in the GFF files of args['dir'].

  args: dict whose 'dir' entry is the folder to scan (with trailing '/').

  Every file ending in .gff or .gff.gz is read; for each 9-column record of
  type "exon" or "intron" the line
      species <TAB> type <TAB> seqid <TAB> start <TAB> stop
  is written to stdout. The species is the part of the file name before the
  first "-" (or before the first "." when the name has no dash).

  Compressed files are read through the gzip module, so the input folder is
  never modified and no shell command is built from file names.
  """
  import gzip  # function-scope import: not part of this script's header

  wanted_types = ("exon", "intron")

  def process_gff_line(line, species):
    """Write the output line for one GFF record, ignoring other feature types."""
    if line.startswith("#") or len(line.rstrip()) == 0: return
    columns = line.rstrip().split("\t")
    if len(columns) != 9: return
    feature_type = columns[2]
    if feature_type not in wanted_types: return
    seqid, start, stop = columns[0], columns[3], columns[4]
    sys.stdout.write("\t".join([species, feature_type, seqid, start, stop]) + "\n")

  for filename in os.listdir(args['dir']):
    compressed = filename.endswith(".gff.gz")
    if not compressed and not filename.endswith(".gff"): continue
    if "-" in filename:
      species = filename[:filename.index("-")]
    else:
      species = filename.split(".", 1)[0]
    path = args['dir'] + filename
    if compressed:
      handle = gzip.open(path, "rb")
    else:
      handle = open(path, "rb")
    try:
      for raw in handle:
        # binary mode yields bytes on Python 3 and str on Python 2
        if isinstance(raw, str): line = raw
        else: line = raw.decode("utf-8", "replace")
        process_gff_line(line, species)
    finally:
      handle.close()
78 | 
79 | 
80 | 
81 | # =============================================================================
82 | args = handle_arguments()
83 | main( args )
84 | 
85 | 


--------------------------------------------------------------------------------
/python/gff/gff2orthocluster.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | import gff3
 9 | from collections import defaultdict
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """Print the usage text line by line through stdout() and exit with status 1."""
  help_lines = [
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        gff3 file",
    " ",
  ]
  for help_line in help_lines:
    stdout( help_line )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """Parse the command line and return {'gff': <path given with -f>}.

  Missing or invalid arguments, and a path rejected by file_exists(), are
  reported through stderr() followed by show_help(), which exits.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-f':
      args['gff'] = argument

  if 'gff' not in args:
    stderr( "gff argument missing." )
    show_help()
  elif not file_exists( args.get('gff') ):
    stderr( "gff does not exist." )
    show_help()

  return args
48 | 
49 |   
50 | # =============================================================================
51 | # === MAIN ====================================================================
52 | # =============================================================================
def main( args ):
  """Print id, seqid, start, stop and strand of every gene in the gff3 file args['gff'].

  Comment lines, blank lines and lines without exactly nine tab-separated
  columns are ignored. The strand is written as '1' for '+' and as "-1" for
  anything else.
  """
  handle = open(args['gff'])
  for record in handle:
    unusable = (
      record.startswith("#")
      or len(record.strip()) == 0
      or len(record.split("\t")) != 9
    )
    if unusable: continue
    gf = gff3.GeneFeature(record.rstrip())
    if gf.type != "gene": continue
    id = gf.get_attributes()['ID']
    strand = '1' if gf.strand == '+' else "-1"
    fields = [id, gf.seqid, str(gf.start), str(gf.stop), strand]
    sys.stdout.write("\t".join(fields) + "\n")
  handle.close()
67 | 
68 | # =============================================================================
69 | args = handle_arguments()
70 | main( args )
71 | 
72 | 


--------------------------------------------------------------------------------
/python/kegg/kegg-enzyme2ko.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | import anydbm					# index databases (file hash)
 9 | from Bio import SeqIO # biopython stuff, to parse fasta files for instance
10 | 
11 | # =============================================================================	
def show_help( ):
  """Print the usage text line by line through stdout() and exit with status 1."""
  for help_line in (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        kegg ko file file",
    " ",
  ):
    stdout( help_line )

  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """Parse the command line and return {'file': <path given with -f>}.

  Missing or invalid arguments, and a path rejected by file_exists(), are
  reported through stderr() followed by show_help(), which exits.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-f':
      args['file'] = argument

  if 'file' not in args:
    stderr( "kegg file missing." )
    show_help()
  if not file_exists( args.get('file') ):
    stderr( "kegg file does not exist." )
    show_help()

  return args
48 | 
49 | 
50 | # =============================================================================
def strip_tags(value):
  "Return value without <...> markup and without the span from the first '[' to the last ']'."
  for pattern in (r'<[^>]*?>', r'\[.*\]'):
    value = re.sub(pattern, '', value)
  return value
56 | 
57 | # =============================================================================
58 | # =============================================================================
def main( args ):
  """Print "KO<TAB>EC number" pairs found in the KEGG ko file args['file'].

  A record starts at the first line matching ENTRY <K...> and ends at a line
  starting with "///". Inside a record every line containing
  "<space>EC:<space><number>" yields one output line with the first EC
  number on that line.
  """
  fo = open( args.get('file'), 'r' )
  ko_regex = re.compile( r"^ENTRY\s+(K\S+)" )
  enzyme_regex = re.compile( r"\s+EC:\s+([0-9.]+)" )

  ko = ""
  for line in fo:
    line = line.rstrip()
    if line.startswith("///"):
      ko = ""   # record finished, wait for the next ENTRY
      continue
    if ko == "":
      entry_hit = ko_regex.search( line )
      if entry_hit: ko = entry_hit.group(1)
      continue
    enzyme_hit = enzyme_regex.search( line )
    if enzyme_hit:
      sys.stdout.write( "%s\t%s\n" % ( ko, enzyme_hit.group(1) ) )

  fo.close()
78 | 
79 | 
80 | # =============================================================================
81 | # === MAIN ====================================================================
82 | # =============================================================================
83 | 
84 | args = handle_arguments(  )
85 | main( args )
86 | 


--------------------------------------------------------------------------------
/python/kegg/kegg-parser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | import anydbm					# index databases (file hash)
 9 | from Bio import SeqIO # biopython stuff, to parse fasta files for instance
10 | 
11 | # =============================================================================	
def show_help( ):
  """Send the usage text to stdout() one line at a time, then exit with status 1."""
  usage_line = "usage: " + sys.argv[0] + " -f <path>"
  body = [
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        kegg html file",
    " ",
  ]
  for help_line in [ usage_line ] + body:
    stdout( help_line )

  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """Validate the command line and return a dict holding the -f value under 'file'.

  Each failure path prints a message with stderr() and then calls
  show_help(), which terminates the program.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    parsed, positional = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for option, value in parsed:
    if option != '-f': continue
    args['file'] = value

  if 'file' not in args:
    stderr( "kegg file missing." )
    show_help()
  if not file_exists( args.get('file') ):
    stderr( "kegg file does not exist." )
    show_help()

  return args
48 | 
49 | 
50 | # =============================================================================
def strip_tags(value):
  "Return value with <...> tags removed and everything between the first '[' and the last ']' dropped."
  without_markup = re.sub(r'<[^>]*?>', '', value)
  return re.sub(r'\[.*\]', '', without_markup)
56 | 
57 | # =============================================================================
58 | # =============================================================================
def main( args ):
  """Print header fields and '/'-joined hierarchy paths from the KEGG file args['file'].

  Lines starting with #ENTRY, #NAME or #DEFINITION are echoed as
  "#<field><TAB><value>". Lines starting with one capital letter followed by
  whitespace are hierarchy lines: the letter is the level and the first
  whitespace-free token after it (passed through strip_tags()) is the label.
  A deeper level pushes the label, the same level prints the current path
  and replaces its last element, a shallower level only pops one element.

  NOTE(review): nothing is printed when the level decreases or at end of
  file, so some paths never appear in the output - confirm whether that is
  intended.
  """
  fo = open( args.get('file'), 'r' )
  statics = {
    'entry': r'^#ENTRY\s+(\S+)',
    'name': r'^#NAME\s+(\S+)',
    'definition': r'^#DEFINITION\s+(.*)$',
  }
  oldlevel = ""
  hier = []
  for line in fo:
    for name, regex in statics.items():
      header_hit = re.search( regex, line )
      if header_hit:
        sys.stdout.write( "#%s\t%s\n" % (name, header_hit.group(1)) )

    level_hit = re.match( r'[A-Z]\s+(.*)$', line )
    if level_hit is None: continue
    currentlevel = line[0]
    rest = level_hit.group(1).strip()
    if not re.search( r'\S+', rest ): continue
    rest = re.match( r'(\S+)', rest ).group(1)
    if currentlevel < oldlevel:
      hier.pop()
    else:
      if currentlevel == oldlevel:
        sys.stdout.write( '/'.join( hier ) + "\n" )
        hier.pop()
      hier.append( strip_tags(rest) )

    oldlevel = currentlevel

  fo.close()
90 | 
91 | 
92 | # =============================================================================
93 | # === MAIN ====================================================================
94 | # =============================================================================
95 | 
96 | args = handle_arguments(  )
97 | main( args )
98 | 


--------------------------------------------------------------------------------
/python/kegg/kegg2xdom.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import os, sys 				# low level handling, such as command line stuff
  4 | import string					# string methods available
  5 | import re							# regular expressions
  6 | import getopt					# comand line argument handling
  7 | from low import *			# custom functions, written by myself
  8 | import anydbm					# index databases (file hash)
  9 | from Bio import SeqIO # biopython stuff, to parse fasta files for instance
 10 | 
 11 | # =============================================================================	
 12 | def show_help( ):
 13 |   """ displays the program parameter list and usage information """
 14 |   stdout( "usage: " + sys.argv[0] + " -f <path>" )
 15 |   stdout( " " )
 16 |   stdout( " option    description" )
 17 |   stdout( " -h        help (this text here)" )
 18 |   stdout( " -f        kegg KO annotation file" )
 19 |   stdout( " " )
 20 | 
 21 |   sys.exit(1)
 22 | 
 23 | # =============================================================================
 24 | def handle_arguments():
 25 |   """ verifies the presence of all necessary arguments and returns the data dir """
 26 |   if len ( sys.argv ) == 1:
 27 |     stderr( "no arguments provided." )
 28 |     show_help()	
 29 | 
 30 |   try: # check for the right arguments
 31 |     keys, values = getopt.getopt( sys.argv[1:], "hf:" )
 32 |   except getopt.GetoptError:
 33 |     stderr( "invalid arguments provided." )
 34 |     show_help()
 35 | 
 36 |   args = {}
 37 |   for key, value in keys:
 38 |     if key == '-f':	args['file'] =  value
 39 |         
 40 |   if not args.has_key('file'):
 41 |     stderr( "kegg file missing." )
 42 |     show_help()
 43 |   if not file_exists( args.get('file') ):
 44 |     stderr( "kegg file does not exist." )
 45 |     show_help()
 46 | 
 47 |   return args
 48 | 
 49 | 
 50 | # =============================================================================
 51 | def strip_tags(value):
 52 |   "Return the given HTML with all tags (+ KEGG tags) stripped."
 53 |   value = re.sub(r'<[^>]*?>', '', value)
 54 |   value = re.sub(r'\[.*\]', '', value)
 55 |   return value
 56 | 
 57 | 
 58 | def read_KOs( file ):
 59 | 
 60 |   def next_entry(fo):
 61 |     pathlist = []
 62 |     definition = ""
 63 |     line = fo.readline().rstrip()
 64 |     if line == '': 
 65 |       return fo, None, None
 66 |     entry = re.match('^ENTRY\s+(\S+)', line).group(1)
 67 |     line = fo.readline().rstrip()
 68 |     line = fo.readline().rstrip()
 69 |     if re.match( '^DEFINITION\s+(.*)$',line):
 70 |       definition = re.search( '^DEFINITION\s+(.*)$', line ).group(1)
 71 |       line = fo.readline().rstrip()
 72 |     while line.startswith('CLASS') or line.startswith(' '):
 73 |       if re.search('\[\S+:\S+\]', line):
 74 |         pathlist.append( re.search('\[(\S+:\S+)\]',line).group(1) )
 75 |       line = fo.readline().rstrip()
 76 |       
 77 |     while line != '///':
 78 |       line = fo.readline().rstrip()
 79 | 
 80 |     if definition != "": entry += "\t" + definition
 81 |     return fo, entry, pathlist
 82 |   
 83 |   fo = open( file )
 84 |   kohash = {}
 85 |   while 1:
 86 |     fo, id, pathlist = next_entry( fo )
 87 |     if id == None: break
 88 |     print ">%s\n%s" %(id, string.join(pathlist,"\t"))
 89 |     
 90 |   fo.close()
 91 | 
 92 | # =============================================================================
 93 | # =============================================================================
 94 | def main( args ):
 95 |   
 96 |   kohash = read_KOs( args.get('file') )
 97 | 
 98 | 
 99 | # =============================================================================
100 | # === MAIN ====================================================================
101 | # =============================================================================
102 | 
103 | args = handle_arguments(  )
104 | main( args )
105 | 


--------------------------------------------------------------------------------
/python/latex-bibtex/bibtex-number-of-coauthors.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import os, sys, re, string
  3 | 
  4 | class BibtexEntry:
  5 | 
  6 | 
  7 |   def __init__(self, lines):
  8 |     self.ATTRIBUTE_REGEX = re.compile("\s{2}(\S+)\s{1}=\s\{(.*)\}*$")
  9 |     self.BIBTEXSTART_REGEX = re.compile("@([A-Z]+)\{(\S+),$")
 10 |     self.key = ""
 11 |     self.bibtype = ""
 12 |     self.attributehash = {}
 13 |     while 1:
 14 |       if len(lines) == 0: break
 15 |       line = lines.pop(0)
 16 | 
 17 |       # end of entry
 18 |       if line.startswith("}"): break
 19 | 
 20 |       # bibtex entry start line and key definition
 21 |       if self.BIBTEXSTART_REGEX.match(line):
 22 |         self.bibtype = self.BIBTEXSTART_REGEX.match(line).group(1)
 23 |         self.key     = self.BIBTEXSTART_REGEX.match(line).group(2)
 24 |         continue
 25 | 
 26 |       # bibtex attribute start
 27 |       if self.ATTRIBUTE_REGEX.match(line):
 28 |         attr  = self.ATTRIBUTE_REGEX.match(line).group(1)
 29 |         value = self.ATTRIBUTE_REGEX.match(line).group(2)
 30 |         self.attributehash[attr] = value
 31 |       else: self.attributehash[attr] += " " + line.strip()
 32 | 
 33 |     for attr, value in self.attributehash.iteritems():
 34 |       if value.endswith("}"): self.attributehash[attr] = value[:-1]
 35 |       elif value.endswith("},"): self.attributehash[attr] = value[:-2]
 36 | 
 37 |   def get_key(self): return self.key
 38 |   def get_first_author(self): return self.attributehash['author'].split(" and ")[0]
 39 |   def get_attr(self, name):
 40 |     if self.attributehash.has_key(name): return self.attributehash[name]
 41 |     return ""
 42 | 
 43 |   def get_author_count(self, return_str=0): 
 44 |     count = self.attributehash['author'].count(" and ") +1
 45 |     if return_str: return "%s" % count
 46 |     else: return count
 47 | 
 48 |   def annotate(self):
 49 |     self.attributehash['annotate'] = "(%s co-authors)" % self.get_author_count()
 50 |     self.attributehash['note'] = "(%s co-authors)" % self.get_author_count()
 51 | 
 52 |   def to_s(self, escape_title=1, annotate=0):
 53 |     print "@" + self.bibtype + "{" + self.key + ","
 54 |     all_attrs = self.attributehash.keys()
 55 |     for i in range(len(all_attrs)):
 56 |       attr = all_attrs[i]
 57 |       if i == len(all_attrs)-1: comma = ""
 58 |       else: comma = ","
 59 |       if attr == "title" and escape_title: print "  " + attr + " = \"{" + self.attributehash[attr] + "}\"" + comma
 60 |       else: print "  " + attr + " = {" + self.attributehash[attr] + "}" + comma
 61 |     print "}"
 62 | 
 63 | 
 64 | 
 65 | 
 66 | 
 67 | def usage():
 68 |   print >> sys.stderr, "usage: " + sys.argv[0] + " db.bib  [n=max-coauthors]"
 69 |   sys.exit(1)
 70 | 
 71 | 
 72 | def plausi():
 73 |   if len(sys.argv) != 3: usage()
 74 |   inFile, inAuthors = sys.argv[1:3]
 75 |   return inFile, int(inAuthors)
 76 | 
 77 | 
 78 | def main():
 79 |   inFile, inAuthors = plausi()
 80 |   fo = open(inFile)
 81 |   while 1:
 82 |     line = fo.readline().rstrip()
 83 |     if line.startswith("%"): continue
 84 |     if line.startswith("@comment"): break
 85 |     if line.startswith("@"):
 86 |       lines = []
 87 |       lines.append(line)
 88 |       while 1:
 89 |         line = fo.readline().rstrip()
 90 |         lines.append(line)
 91 |         if line.startswith("}"): break
 92 |       b = BibtexEntry(lines)
 93 |       count = b.get_author_count()
 94 |       if count > inAuthors: b.annotate()
 95 |       b.to_s()
 96 | 
 97 |   fo.close()
 98 | 
 99 | 
100 | main()
101 | 


--------------------------------------------------------------------------------
/python/latex-bibtex/latex-rename.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys 				# low level handling, such as command line stuff
 4 | import string					# string methods available
 5 | import re							# regular expressions
 6 | import getopt					# comand line argument handling
 7 | from low import *			# custom functions, written by myself
 8 | 
 9 | # =============================================================================	
def show_help( ):
  """Print the tool description and usage through stdout() and exit with status 1."""
  description = [
    "renames files so that they can be included in latex documents.",
    "this means that all dots are removed except for the last one of the actual file extension.",
    "dots are replaced by \"_\" by default.",
  ]
  synopsis = [
    "usage: " + sys.argv[0] + " -f <path> [-r <x>]",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        fasta file",
    " -r        replace dot with this sign (default: \"_\")",
    " ",
  ]
  for help_line in description + synopsis:
    stdout( help_line )
  sys.exit(1)
23 | 
24 | # =============================================================================
def handle_arguments():
  """Parse the command line and return a dict with 'file' (-f) and optionally 'r' (-r).

  Missing or invalid arguments, and a path rejected by file_exists(), are
  reported through stderr() followed by show_help(), which exits.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:r:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, argument in options:
    if flag == '-f':
      args['file'] = argument
    if flag == '-r':
      args['r'] = str(argument)

  if 'file' not in args:
    stderr( "file missing." )
    show_help()
  if not file_exists( args.get('file') ):
    stderr( "file does not exist." )
    show_help()

  return args
50 | 
51 | 	
52 | # =============================================================================
53 | # =============================================================================
def main( args ):
  """Rename args['file'] so that only the dot of the file extension remains.

  args: dict with 'file' (path of the file to rename) and optionally 'r'
        (replacement for the removed dots, "_" when absent).

  All dots in the base name are replaced; the directory part and the
  extension are kept. The rename is done with os.rename(), so file names
  containing blanks or shell metacharacters are handled and a failure
  raises OSError instead of passing silently. Nothing happens when the name
  contains no dot to replace.
  """
  oldfilename = args.get('file')
  path, filename = os.path.split(oldfilename)
  base, ext = os.path.splitext(filename)
  replacement = args.get('r', '_')
  newfilename = os.path.join(path, base.replace('.', replacement) + ext)
  if newfilename == oldfilename:
    return
  os.rename(oldfilename, newfilename)
66 | 	
67 | # =============================================================================
68 | # === MAIN ====================================================================
69 | # =============================================================================
70 | 
71 | args = handle_arguments(  )
72 | main( args )
73 | 


--------------------------------------------------------------------------------
/python/misa/gc-content-from-misa.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from collections import defaultdict
 9 | from misa import MisaSSRspecies
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """Print the usage text line by line through stdout() and exit with status 1."""
  help_text = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        all.misa out file",
    " ",
  )
  for help_line in help_text:
    stdout( help_line )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """Parse the command line and return {'file': <path given with -f>}.

  Missing or invalid arguments, and a path rejected by file_exists(), are
  reported through stderr() followed by show_help(), which exits.
  """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    parsed, positional = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for option, value in parsed:
    if option == '-f':
      args['file'] = value

  if 'file' not in args:
    stderr( "fasta file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "fasta file does not exist." )
    show_help()

  return args
48 | 
49 |   
50 | # =============================================================================
51 | # === MAIN ====================================================================
52 | # =============================================================================
def main( args ):
  """Print the GC fraction of all SSR bases per species for the file args['file'].

  Each input line is parsed with MisaSSRspecies; for every species the
  occurrences of A, T, G and C in the motif, multiplied by the repeat
  number, are summed up. One line "species<TAB>fraction" is written per
  species, sorted by species name, where fraction = (G + C) / (A + T + G + C).
  """
  specieshash = {}
  fo = open(args['file'])
  try:
    for line in fo:
      m = MisaSSRspecies(line)
      if m.species not in specieshash: specieshash[m.species] = defaultdict(int)
      for char in ['A', 'T', 'G', 'C']:
        specieshash[m.species][char] += m.motif.count(char) * m.repeats
  finally:
    fo.close()  # release the handle even when a line cannot be parsed

  for species in sorted(specieshash.keys()):
    counts = specieshash[species]
    total = sum(counts.values())
    gc = 1.0 * (counts['G'] + counts['C']) / total
    sys.stdout.write(species + "\t" + str(gc) + "\n")
68 | 
69 | # =============================================================================
70 | args = handle_arguments()
71 | main( args )
72 | 
73 | 


--------------------------------------------------------------------------------
/python/misa/import-into-sqlite3.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import sqlite3
 8 | from low import *  # custom functions, written by myself
 9 | from misa import MisaSSRspecies
10 | from collections import defaultdict
11 | 
12 | 
13 | # =============================================================================  
14 | def show_help( ):
15 |   """ displays the program parameter list and usage information """
16 |   stdout( "usage: " + sys.argv[0] + " -f <path>" )
17 |   stdout( " " )
18 |   stdout( " option    description" )
19 |   stdout( " -h        help (this text here)" )
20 |   stdout( " -f        misa output file with an additional first column = speciesname" )
21 |   stdout( " -d        db file" )
22 |   stdout( " " )
23 |   sys.exit(1)
24 | 
25 | # =============================================================================
26 | def handle_arguments():
27 |   """ verifies the presence of all necessary arguments and returns the data dir """
28 |   if len ( sys.argv ) == 1:
29 |     stderr( "no arguments provided." )
30 |     show_help()  
31 |   
32 |   try: # check for the right arguments
33 |     keys, values = getopt.getopt( sys.argv[1:], "hf:d:" )
34 |   except getopt.GetoptError:
35 |     stderr( "invalid arguments provided." )
36 |     show_help()
37 | 
38 |   args = {}
39 |   for key, value in keys:
40 |     if key == '-f': args['file'] = value
41 |     if key == '-d': args['db'] = value
42 |     
43 |   if not args.has_key('file'):
44 |     stderr( "misa file argument missing." )
45 |     show_help()
46 |   elif not file_exists( args.get('file') ):
47 |     stderr( "misa file does not exist." )
48 |     show_help()
49 |   
50 |   return args
51 | 
52 | # =============================================================================
53 | def init_db(conn):
54 |   conn.execute("CREATE TABLE IF NOT EXISTS ssrs(id INTEGER PRIMARY KEY ASC, species VARCHAR(4), chr VARCHAR(50), startpos INTEGER, endpos INTEGER, ssr_type VARCHAR(2), motif VARCHAR(20), repeats INTEGER)")
55 | 
56 | 
57 | # =============================================================================
58 | # === MAIN ====================================================================
59 | # =============================================================================
60 | def main( args ):
61 | 
62 |   conn = sqlite3.connect(args['db'])
63 |   init_db(conn)
64 |   
65 |   fo = open(args['file'])
66 |   for line in fo:
67 |     if line.startswith("ID\t"): continue
68 |     m = MisaSSRspecies(line)
69 |     sql = "INSERT INTO ssrs(species, chr, startpos, endpos, ssr_type, motif, repeats) VALUES (\'%s\', \'%s\', %s, %s, \'%s\', \'%s\', %s)" %(m.species, m.geneid, m.startpos, m.endpos, m.type, m.motif, m.repeats)
70 |     conn.execute(sql)
71 |   res = conn.execute("SELECT COUNT(*) FROM ssrs")
72 |   entries = res.fetchall()[0][0]
73 |   print "done. entries added:", entries
74 |   conn.commit()
75 |   conn.close()
76 | 
77 | 
78 | # =============================================================================
79 | args = handle_arguments()
80 | main( args )
81 | 
82 | 


--------------------------------------------------------------------------------
/python/misa/split-compound-ssrs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from misa import MisaSSR
 9 | from collections import defaultdict
10 | 
11 | 
12 | # =============================================================================  
13 | def show_help( ):
14 |   """ displays the program parameter list and usage information """
15 |   stdout( "usage: " + sys.argv[0] + " -f <path>" )
16 |   stdout( " " )
17 |   stdout( " option    description" )
18 |   stdout( " -h        help (this text here)" )
19 |   stdout( " -f        misa outptu file" )
20 |   stdout( " " )
21 |   sys.exit(1)
22 | 
23 | # =============================================================================
24 | def handle_arguments():
25 |   """ verifies the presence of all necessary arguments and returns the data dir """
26 |   if len ( sys.argv ) == 1:
27 |     stderr( "no arguments provided." )
28 |     show_help()  
29 |   
30 |   try: # check for the right arguments
31 |     keys, values = getopt.getopt( sys.argv[1:], "hf:" )
32 |   except getopt.GetoptError:
33 |     stderr( "invalid arguments provided." )
34 |     show_help()
35 | 
36 |   args = {}
37 |   for key, value in keys:
38 |     if key == '-f': args['file'] = value
39 |     
40 |   if not args.has_key('file'):
41 |     stderr( "misa file argument missing." )
42 |     show_help()
43 |   elif not file_exists( args.get('file') ):
44 |     stderr( "misa file does not exist." )
45 |     show_help()
46 |   
47 |   return args
48 | 
49 | 
50 | # =============================================================================
51 | # === MAIN ====================================================================
52 | # =============================================================================
53 | def main( args ):
54 |   
55 |   fo = open(args['file'])
56 |   for line in fo:
57 |     if line.startswith("ID\t"): continue
58 |     m = MisaSSR(line)
59 |     if m.type != "c" and m.type != "c*": print m.to_s()
60 |     else:
61 |       startpos = m.startpos
62 |       separatepatterns = re.findall("\([ATGC]+\)\d+[*]{0,1}",m.pattern)
63 |       for separatepattern in separatepatterns:
64 |         motif = separatepattern[1:separatepattern.index(")")]
65 |         if separatepattern.endswith("*"): repeats = int(separatepattern[separatepattern.index(")")+1:-1])
66 |         else: repeats = int(separatepattern[separatepattern.index(")")+1:])
67 |         length = len(motif)*repeats
68 |         endpos = startpos + length -1
69 |         print string.join([m.geneid, str(m.ssrnr), "p" + str(len(motif)), separatepattern, str(length), str(startpos), str(endpos)], "\t")
70 |         startpos = endpos+1
71 | 
72 | 
73 | 
74 | # =============================================================================
75 | args = handle_arguments()
76 | main( args )
77 | 
78 | 


--------------------------------------------------------------------------------
/python/openreadingframe/ORFPREDICTORRR.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys,os,getopt
 4 | 
 5 | OUTFILEPART2 = 'tmp.orf.part2.fasta'
 6 | 
 7 | #==============================================================================
 8 | def usage():
 9 |   print """Hello!
10 |   
11 |   Following options are possible:
12 |   -i:\tparsed BLASTX best hit definitions
13 |   -j:\tinput sequences in FASTA format
14 |   -t:\tminimum length for in silico predicted ORFs
15 |   """
16 | 
17 | #==============================================================================
18 | def main( XMLfile, CAP3, threshold ):
19 |   # First, elongate BLAST-hits
20 | 
21 |   os.system("orf_prediction_part1.py -b "+str(XMLfile)+" -f "+str(CAP3) )
22 |   #print "BLASTelongator has finished. Starting 2nd part..."
23 |   # It has written to temp and now comes Ina's script
24 | 
25 |   os.system("orf_prediction_part2.py"+" -t "+str(threshold)+" -f " + OUTFILEPART2 )
26 |   #print "ORF-Prediction has finished. Removing temp-files.."
27 |   #os.system("cat BLASTelongatorHits.out SimulatedORFS.out > "+str(outfile))  
28 |   #os.system("rm BLASTelongatorHits.out")
29 |   os.system("rm " + OUTFILEPART2)
30 |   #os.system("rm SimulatedORFS.out")
31 |   #print "Done. See you soon!"
32 | 
33 | 
34 | 
35 | #==============================================================================
36 | # MAIN ========================================================================
37 | #==============================================================================
38 | try:  
39 |   if len(sys.argv) > 1:
40 |     opts, args = getopt.getopt(sys.argv[1:],"i:j:t:h")
41 |   else:
42 |     usage()
43 |     #print "Hello! I take at least 3 arguments. I have the following options: -i defines the input XML-file which you want to use, -j defines the CAP3-outputfile, -t defines the threshold for in silico predicted proteins, -o defines the outfile, -h gives you more help! See you soon! You provided:" 
44 |     sys.exit()
45 | except getopt.GetoptError, err:
46 |   print "Something went wrong - maybe this helps: " + str(err)
47 |   sys.exit()
48 | 
49 | for o, a in opts:
50 |   if o == "-h":
51 |     usage()
52 |     sys.exit()
53 |   elif o == "-i":
54 |     if os.path.exists(a):
55 |       if os.path.isfile(a):
56 |         XMLfile = a
57 |       elif os.path.isdir(a):
58 |         print "Specified XML-file is a directory!"
59 |     else:
60 |       print "Something is wrong with the XML-file, maybe it doesn't exist?"
61 |   elif o == "-j":
62 |     if os.path.exists(a):
63 |       if os.path.isfile(a):
64 |         CAP3 = a
65 |       elif os.path.isdir(a):
66 |         print "Specified CAP3-file is a directory!"
67 |     else:
68 |       print "Something is wrong with the CAP3-file, maybe it doesn't exist?"
69 |   elif o == "-t":
70 |     threshold = a
71 |   else:
72 |     print "Something went wrong ;_;. Maybe the file you specified doesn't exist?"
73 | 
74 | if len(opts) == 3:
75 |   main( XMLfile, CAP3, threshold )
76 | else:
77 |   print len(opts)
78 |   print "Again, hello to you! You do not have the required amount of arguments given. Please specify them. For more, see -h! I AM THE PREDICTOR"
79 | 


--------------------------------------------------------------------------------
/python/orthomcl/add-blasthits-to-cluster.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, string, anydbm
 3 | from low import *
 4 | from orthomcl import OrthoMCLCluster
 5 | 
 6 | 
 7 | # =============================================================================
 8 | def usage():
 9 |   print >> sys.stderr, "add significant BLAST hits (e.g. in-paralogs) to an existing orthomcl cluster.\n"
10 |   print >> sys.stderr, "usage:      (1) " + sys.argv[0] + " noparalogs.orthomcl.out  blastout.add.dbm" 
11 |   print >> sys.stderr, "         or (2) " + sys.argv[0] + " noparalogs.orthomcl.out  all.fasta  all.gg  all.blastout" 
12 |   sys.exit(1)
13 | 
14 | 
15 | def plausi():
16 |   if len(sys.argv) != 3 and len(sys.argv) != 5: usage()
17 |   return sys.argv[1:]
18 | 
19 | 
20 | def read_gg(inGG):
21 |   outHash, speciesArray = {}, []
22 |   fo = open(inGG)
23 |   for line in fo: 
24 |     line = line.rstrip()
25 |     cols = line.split()
26 |     species = str(cols[0])[:-1]
27 |     if not species in speciesArray: speciesArray.append(species)
28 |     for col in cols[1:]:
29 |       outHash[col] = species
30 |   fo.close()
31 |   return outHash, speciesArray
32 | 
33 | 
34 | def get_seq_lengths(file):
35 |   lengthHash, id = {}, ""
36 |   fo = open(file)
37 |   for line in fo: 
38 |     line = line.strip()
39 |     if line.startswith(">"):
40 |       id = line[1:]
41 |       if id.count(" ") > 0: id = id[:id.index(" ")]
42 |       lengthHash[id] = 0 
43 |     else: lengthHash[id] += len(line)
44 |   return lengthHash
45 | 
46 | 
47 | def main():
48 |   args = plausi()
49 |   in_orthomcl = args[0]
50 |   EVALUE = float('1e-20')
51 |   IDENTITY = 30.0
52 |   if len(args) == 4:
53 |     in_fasta, in_gg, in_blast = args[1:4]
54 |     gene2species, speciesArray = read_gg(in_gg)
55 |     gene2length = get_seq_lengths(in_fasta)
56 |     dbmfile = in_blast + ".add.dbm"
57 |     dbm = anydbm.open(dbmfile, "c")
58 |     fo = open(in_blast)
59 |     for line in fo: 
60 |       line = line.rstrip()
61 |       cols = line.split("\t")
62 |       qid, hid, evalue, identity = cols[0], cols[1], float(cols[10]), float(cols[2])
63 |       # ignore self-hits and between-species hits, check e-value threshold
64 |       if qid == hid: continue
65 |       if gene2species[qid] != gene2species[hid]: continue
66 |       if evalue > EVALUE: continue
67 |       if identity < IDENTITY: continue
68 |       # check that blast alignment spans at least 75% of the longer sequence
69 |       alnlength, qlength, hlength = int(cols[3]), gene2length[qid], gene2length[hid]
70 |       lengthcutoff = 0.80 * max([qlength, hlength])
71 |       if alnlength < lengthcutoff: continue
72 |       if not dbm.has_key(qid): dbm[qid] = ""
73 |       else: dbm[qid] += " "
74 |       dbm[qid] += hid
75 |     fo.close()
76 |     dbm.close()
77 |   else: dbmfile = args[1]
78 |   dbm = anydbm.open(dbmfile)
79 | 
80 |   fo = open(in_orthomcl)
81 |   for line in fo:
82 |     o = OrthoMCLCluster(line.rstrip())
83 |     oldsize = o.get_count()
84 |     additions = []
85 |     for geneid, species in o.get_gene_hash().iteritems():
86 |       if not dbm.has_key(geneid): continue
87 |       [additions.append([x, species]) for x in dbm[geneid].split()]
88 | 
89 |     for x, species in additions: o.add_gene(x,species)
90 |     o.to_s()
91 |     newsize = o.get_count()
92 |     print >> sys.stderr, "%s\t%s\t%s" %(o.get_name(), oldsize, newsize)
93 | 
94 | main()
95 | 


--------------------------------------------------------------------------------
/python/orthomcl/build-counts-table.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | from low import *
 4 | 
 5 | # takes a clustering output file and an orthomcl.gg file
 6 | # orthomcl.gg file format:
 7 | # speciesname1: id1 id2 id3 id4 .... full genome
 8 | # speciesname2: id1 id2 id3 id4 .... full genome
 9 | #
10 | # clustering file format (tab separated, lines starting with "#" are ignored):
11 | # clustername <TAB> genecount <TAB> id1, id2, id3, ...
12 | #
13 | # with these infos, the goal is to build a table of gene counts:
14 | # the species of every gene id in a cluster is looked up in the orthomcl.gg
15 | # data, and for each cluster we count how many of its genes belong to each
16 | # species. the output (stdout, tab separated) starts with a header line that
17 | # lists the species names in sorted order, followed by one line per cluster
18 | # with the cluster name and the per-species counts in the same order.
19 | # note: an earlier version of this comment described a t-coffee based
20 | # selection of one sequence per species, which this script does not perform.
21 | 
22 | 
23 | 
24 | def usage():
25 |   print >> sys.stderr, "usage: " + sys.argv[0] + " clustering.out  orthomcl.gg"
26 |   sys.exit(1)
27 | 
28 | 
29 | def plausi():
30 |   if len(sys.argv) != 3: usage()
31 |   inClustering, inGG = sys.argv[1:3]
32 |   return inClustering, inGG
33 | 
34 | 
35 | def get_number_of_species(inGG):
36 |   count = 0
37 |   fo = open(inGG)
38 |   for line in fo: count += 1
39 |   fo.close()
40 |   return count
41 | 
42 | 
43 | def read_gg(inGG):
44 |   outHash = {}
45 |   speciesArray = []
46 |   fo = open(inGG)
47 |   for line in fo:
48 |     line = line.rstrip()
49 |     cols = line.split()
50 |     species = str(cols[0])[:-1]
51 |     if not species in speciesArray: speciesArray.append(species)
52 |     for col in cols[1:]:
53 |       outHash[col] = species
54 |   fo.close()
55 |   return outHash, speciesArray
56 | 
57 | 
58 | def main():
59 |   inClustering, inGG = plausi()
60 |   speciesHash, speciesArray = read_gg(inGG)
61 |   speciesArray.sort()
62 |   
63 |   sys.stdout.write("\t")
64 |   sys.stdout.write(string.join(speciesArray, "\t"))
65 |   sys.stdout.write("\n")
66 | 
67 |   fo = open(inClustering)
68 |   for line in fo:
69 |     if line.startswith("#"): continue
70 |     line = line.rstrip()
71 |     cluster, count, geneids = line.split("\t")[0:3]
72 |     geneids = geneids.split(", ")
73 |     tab = [0]*len(speciesArray)
74 |     for id in geneids:
75 |       species = speciesHash[id]
76 |       tab[ speciesArray.index(species) ] += 1
77 |     sys.stdout.write(cluster + "\t")
78 |     sys.stdout.write( string.join([str(e) for e in tab], "\t") )
79 |     sys.stdout.write("\n")
80 |   fo.close()
81 | 
82 | 
83 | main()
84 | 


--------------------------------------------------------------------------------
/python/orthomcl/build-orthomcl-like-output.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | from low import *
 4 | 
 5 | # takes a clustering output file and an orthomcl.gg file
 6 | # orthomcl.gg file format:
 7 | # speciesname1: id1 id2 id3 id4 .... full genome
 8 | # speciesname2: id1 id2 id3 id4 .... full genome
 9 | #
10 | # clustering file format (tab separated, lines starting with "#" are ignored):
11 | # clustername <TAB> genecount <TAB> id1, id2, id3, ...
12 | #
13 | # with these infos, the goal is to rewrite the clustering in an orthomcl-like
14 | # layout: the species of every gene id is looked up in the orthomcl.gg data,
15 | # and each cluster is written to stdout as one line of the form
16 | #   clustername(<genecount> genes, <n> taxa): <TAB> id1(species) id2(species) ...
17 | # where <genecount> is copied from the input and <n> is the number of
18 | # distinct species found among the gene ids of the cluster.
19 | # note: an earlier version of this comment described a t-coffee based
20 | # selection of one sequence per species, which this script does not perform.
21 | 
22 | 
23 | 
24 | def usage():
25 |   print >> sys.stderr, "usage: " + sys.argv[0] + " clustering.out  orthomcl.gg"
26 |   sys.exit(1)
27 | 
28 | 
29 | def plausi():
30 |   if len(sys.argv) != 3: usage()
31 |   inClustering, inGG = sys.argv[1:3]
32 |   return inClustering, inGG
33 | 
34 | 
35 | def get_number_of_species(inGG):
36 |   count = 0
37 |   fo = open(inGG)
38 |   for line in fo: count += 1
39 |   fo.close()
40 |   return count
41 | 
42 | 
43 | def read_gg(inGG):
44 |   outHash = {}
45 |   speciesArray = []
46 |   fo = open(inGG)
47 |   for line in fo:
48 |     line = line.rstrip()
49 |     cols = line.split()
50 |     species = str(cols[0])[:-1]
51 |     if not species in speciesArray: speciesArray.append(species)
52 |     for col in cols[1:]:
53 |       outHash[col] = species
54 |   fo.close()
55 |   return outHash, speciesArray
56 | 
57 | 
58 | def main():
59 |   inClustering, inGG = plausi()
60 |   speciesHash, speciesArray = read_gg(inGG)
61 |   
62 |   fo = open(inClustering)
63 |   for line in fo:
64 |     if line.startswith("#"): continue
65 |     line = line.rstrip()
66 |     cluster, count, geneids = line.split("\t")[0:3]
67 |     geneids = geneids.split(", ")
68 |     currentSpecies = []
69 |     for id in geneids: currentSpecies.append(speciesHash[id])
70 |     speciesCount = len(set(currentSpecies))
71 |     sys.stdout.write("%s(%s genes, %s taxa):\t" %(cluster, count, speciesCount)) 
72 |     for id in geneids: 
73 |       species = speciesHash[id]
74 |       sys.stdout.write(id + "(" + species + ") ")
75 |     sys.stdout.write("\n")
76 |   fo.close()
77 | 
78 | 
79 | main()
80 | 


--------------------------------------------------------------------------------
/python/orthomcl/cluster2arath.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, string
 3 | from low import *
 4 | from orthomcl import OrthoMCLCluster
 5 | 
 6 | 
 7 | # =============================================================================
 8 | def usage():
 9 |   print >> sys.stderr, "usage: " + sys.argv[0] + " noparalogs.orthomcl.out" 
10 |   sys.exit(1)
11 | 
12 | 
13 | def plausi():
14 |   if len(sys.argv) != 2: usage()
15 |   inFile = sys.argv[1]
16 |   return inFile
17 | 
18 | 
19 | def main():
20 |   inFile = plausi()
21 |   fo = open(inFile)
22 |   for line in fo:
23 |     o = OrthoMCLCluster(line.rstrip())
24 |     print o.get_name() + "\t" + o.get_species_hash()['Arath'][0]
25 | 
26 | 
27 | main()
28 | 


--------------------------------------------------------------------------------
/python/orthomcl/geneid2cluster.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, string
 3 | from low import *
 4 | from orthomcl import OrthoMCLCluster
 5 | 
 6 | 
 7 | # =============================================================================
 8 | def usage():
 9 |   print >> sys.stderr, "prints a mapping between each gene id and its cluster from orthomcl output\n"
10 |   print >> sys.stderr, "usage: " + sys.argv[0] + " orthomcl.out" 
11 |   sys.exit(1)
12 | 
13 | 
14 | def plausi():
15 |   if len(sys.argv) != 2: usage()
16 |   inFile = sys.argv[1]
17 |   return inFile
18 | 
19 | 
20 | def main():
21 |   inFile = plausi()
22 |   fo = open(inFile)
23 |   for line in fo:
24 |     o = OrthoMCLCluster(line.rstrip())
25 |     name = o.get_name()
26 |     geneHash = o.get_gene_hash()
27 |     for geneid, species in geneHash.iteritems(): print geneid + "\t" + name
28 | 
29 | 
30 | main()
31 | 


--------------------------------------------------------------------------------
/python/orthomcl/map-orthomcl-clusters.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, string
 3 | from low import *
 4 | 
 5 | 
 6 | def usage():
 7 |   print >> sys.stderr, "usage: ", sys.argv[0], " <from>  <to>  [<orthomcl.out>]"
 8 |   print >> sys.stderr, "from/to: speciesname or \"cluster\""
 9 |   sys.exit(1)
10 | 
11 | 
12 | def plausi():
13 |   if len(sys.argv) < 3: usage()
14 |   inTo = sys.argv[2].lower()
15 |   inFrom = sys.argv[1].lower()
16 |   if len(sys.argv) > 3:
17 |     inTable = sys.argv[3]
18 |   else:
19 |     inTable = "/home/low/workspace/back-to-the-sea-orf-cluster-verification/32-new-orthologs/3-orthomcl-v1.4/noparalogs_orthomcl.out"
20 |   if not os.path.exists(inTable) or not os.path.isfile(inTable) or not os.path.getsize(inTable) > 0: 
21 |     print >> sys.stderr, "specified orthomcl table file does not exist, is not a file, or is empty\n"
22 |     usage()
23 |   return inFrom, inTo, inTable
24 | 
25 | 
26 | class OrthoCluster():
27 |   def __init__(self, line):
28 |     descr, genedefs = line.split("\t")
29 |     genedefs = genedefs.split()
30 |     self.name = descr[:descr.index('(')].lower()
31 |     self.geneHash = {}
32 |     self.speciesHash = {}
33 |     for genedef in genedefs:
34 |       geneid = genedef[:genedef.index('(')]
35 |       species = genedef[genedef.index('(')+1:-1].lower()
36 |       self.geneHash[geneid] = species
37 |       if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid)
38 |       else: self.speciesHash[species] = [geneid]
39 | 
40 |   def get_name(self): return self.name
41 |   def get_count(self): return len(self.geneHash)
42 |   def get_gene_hash(self): return self.geneHash
43 |   def get_species_hash(self): return self.speciesHash
44 |     
45 | 
46 | 
47 | def main():
48 |   inFrom, inTo, inTable = plausi()
49 |   fo = open(inTable)
50 |   for line in fo:
51 |     o = OrthoCluster(line.rstrip())
52 |     speciesHash = o.get_species_hash()
53 |     name = o.get_name()
54 |     mapfrom, mapto = "", ""
55 |     if inFrom == "cluster": mapfrom = name
56 |     else: mapfrom = speciesHash[inFrom][0]
57 |     if inTo == "cluster": mapto = name
58 |     else: mapto = speciesHash[inTo][0]
59 |     print mapfrom + "\t" + mapto
60 |   fo.close()
61 | 
62 | 
63 | main()
64 | 


--------------------------------------------------------------------------------
/python/orthomcl/paralogs-per-cluster.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | import string
 4 | 
 5 | 
 6 | def usage():
 7 |   print >> sys.stderr, "usage: " + sys.argv[0] + " orthomcl.out"
 8 |   sys.exit(1)
 9 | 
10 | 
11 | def plausi():
12 |   if len(sys.argv) != 2: usage()
13 |   inFile = sys.argv[1]
14 |   return inFile
15 | 
16 | 
17 | class OrthoCluster():
18 |   def __init__(self, line):
19 |     descr, genedefs = line.split("\t")
20 |     genedefs = genedefs.split()
21 |     self.name = descr[:descr.index('(')].lower()
22 |     self.geneHash = {}
23 |     self.speciesHash = {}
24 |     for genedef in genedefs:
25 |       geneid = genedef[:genedef.index('(')]
26 |       species = genedef[genedef.index('(')+1:-1]
27 |       self.geneHash[geneid] = species
28 |       if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid)
29 |       else: self.speciesHash[species] = [geneid]
30 | 
31 |   def get_name(self): return self.name
32 |   def get_count(self): return len(self.geneHash)
33 |   def get_gene_hash(self): return self.geneHash
34 |   def get_species_hash(self): return self.speciesHash
35 | 
36 | 
37 | def get_species_from_first_line(inFile):
38 |   fo = open(inFile)
39 |   line = fo.readline()
40 |   o = OrthoCluster(line.rstrip())
41 |   fo.close()
42 |   species = o.get_species_hash().keys()
43 |   species.sort()
44 |   return species
45 | 
46 | 
47 | def parse_orthocml_out(inFile):
48 |   speciesList = get_species_from_first_line(inFile)
49 |   print >> sys.stdout, "\t" + string.join(speciesList, "\t")
50 |   fo = open(inFile)
51 |   for line in fo:
52 |     o = OrthoCluster(line.rstrip())
53 |     speciesHash = o.get_species_hash()
54 |     sys.stdout.write(o.get_name())
55 |     for s in speciesList:
56 |       count = 0
57 |       if speciesHash.has_key(s): count = len(speciesHash[s])
58 |       sys.stdout.write("\t%s" % count)
59 |     sys.stdout.write("\n")
60 |         
61 |   fo.close()
62 | 
63 | 
64 | def main():
65 |   inFile = plausi()
66 |   parse_orthocml_out(inFile)
67 | 
68 | 
69 | 
70 | main()
71 | 


--------------------------------------------------------------------------------
/python/orthomcl/speciesids4orthomcl.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, string
 3 | 
 4 | 
 5 | def usage():
 6 |   print >> sys.stderr, "usage: " + sys.argv[0] + " folder with genomes (*.fasta or *.fasta.gz)"
 7 |   sys.exit(1)
 8 | 
 9 | 
10 | def plausi():
11 |   if len(sys.argv) != 2: usage()
12 |   inFolder = sys.argv[1]
13 |   if not os.path.exists(inFolder) or not os.path.isdir(inFolder): 
14 |     print >> sys.stderr, "specified input folder does not exist or is not a directory\n"
15 |     usage()
16 |   if not inFolder.endswith('/'): inFolder += '/'
17 |   return inFolder
18 | 
19 | 
20 | def iterate_folder(inFolder):
21 |   inFiles = []
22 |   for fname in os.listdir(inFolder):
23 |     if not fname.endswith('.fasta') and not fname.endswith('.fasta.gz'): continue
24 |     inFiles.append(inFolder + fname)
25 |   return inFiles
26 | 
27 | 
28 | def process_file(inFile):
29 |   gzip = 0
30 |   if inFile.endswith('.gz'): gzip = 1
31 | 
32 |   if gzip:
33 |     ec = os.system('gunzip ' + inFile)
34 |     inFile = os.path.splitext(inFile)[0]
35 | 
36 |   filename = os.path.split(inFile)[1]
37 |   outName = os.path.splitext(filename)[0]
38 | 
39 |   sys.stdout.write(outName + ": ")
40 | 
41 |   ids = {}
42 |   fo = open(inFile)
43 |   for line in fo:
44 |     if not line.startswith(">"): continue
45 |     line = line.rstrip()
46 |     id = line[1:]
47 |     if id.count(" ") > 0: id = id[:id.index(" ")]
48 |     ids[id] = 1
49 | 
50 |   sys.stdout.write( string.join(ids.keys(), " ") )
51 |   sys.stdout.write("\n")
52 | 
53 |   if gzip: ec = os.system('gzip ' + inFile)
54 |   
55 | 
56 | def main():
57 |   inFolder = plausi()
58 |   inFiles = iterate_folder(inFolder)
59 |   for inFile in inFiles: process_file(inFile)
60 | 
61 | main()
62 | 


--------------------------------------------------------------------------------
/python/orthomcl/table-of-gene-id-per-cluster.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, string
 3 | from low import *
 4 | 
 5 | 
 6 | def usage():
 7 |   print >> sys.stderr, "usage: ", sys.argv[0], " [<noparalogs.orthomcl.out>]"
 8 |   sys.exit(1)
 9 | 
10 | 
11 | def plausi():
12 |   if len(sys.argv) < 2: usage()
13 |   inOrtho = sys.argv[1]
14 |   if not os.path.exists(inOrtho) or not os.path.isfile(inOrtho) or not os.path.getsize(inOrtho) > 0: 
15 |     print >> sys.stderr, "specified orthomcl file does not exist, is not a file, or is empty\n"
16 |     usage()
17 |   return inOrtho
18 | 
19 | 
20 | class OrthoCluster():
21 |   def __init__(self, line):
22 |     descr, genedefs = line.split("\t")
23 |     genedefs = genedefs.split()
24 |     self.name = descr[:descr.index('(')].lower()
25 |     self.geneHash = {}
26 |     self.speciesHash = {}
27 |     for genedef in genedefs:
28 |       geneid = genedef[:genedef.index('(')]
29 |       species = genedef[genedef.index('(')+1:-1].lower()
30 |       self.geneHash[geneid] = species
31 |       if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid)
32 |       else: self.speciesHash[species] = [geneid]
33 | 
34 |   def get_name(self): return self.name
35 |   def get_count(self): return len(self.geneHash)
36 |   def get_gene_hash(self): return self.geneHash
37 |   def get_species_hash(self): return self.speciesHash
38 |     
39 | 
40 | 
41 | def main():
42 |   inOrtho = plausi()
43 |   fo = open(inOrtho)
44 |   speciesCols = 0
45 |   for line in fo:
46 |     o = OrthoCluster(line.rstrip())
47 |     SH = o.get_species_hash()
48 |     if not speciesCols:
49 |       speciesCols = SH.keys()
50 |       speciesCols.sort()
51 |       print "OrthoMCL.ID" + "\t" + string.join(speciesCols, "\t")
52 | 
53 |     name = o.get_name()
54 |     print name + "\t" + string.join( [SH[x][0] for x in speciesCols], "\t")
55 | 
56 |   fo.close()
57 | 
58 | 
59 | main()
60 | 


--------------------------------------------------------------------------------
/python/orthomcl/tree-for-codeml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys
 3 | import string
 4 | 
 5 | 
 6 | def usage():
 7 |   print >> sys.stderr, "usage: " + sys.argv[0] + " orthomcl.out  base.tree"
 8 |   sys.exit(1)
 9 | 
10 | 
11 | def plausi():
12 |   if len(sys.argv) != 3: usage()
13 |   inOrtho, inTree = sys.argv[1:3]
14 |   return inOrtho, inTree
15 | 
16 | 
17 | class OrthoCluster():
18 |   def __init__(self, line):
19 |     descr, genedefs = line.split("\t")
20 |     genedefs = genedefs.split()
21 |     self.name = descr[:descr.index('(')].lower()
22 |     self.geneHash = {}
23 |     self.speciesHash = {}
24 |     for genedef in genedefs:
25 |       geneid = genedef[:genedef.index('(')]
26 |       species = genedef[genedef.index('(')+1:-1] + "1"
27 |       self.geneHash[geneid] = species
28 |       if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid)
29 |       else: self.speciesHash[species] = [geneid]
30 | 
31 |   def get_name(self): return self.name
32 |   def get_count(self): return len(self.geneHash)
33 |   def get_gene_hash(self): return self.geneHash
34 |   def get_species_hash(self): return self.speciesHash
35 | 
36 | 
37 | def get_species_from_first_line(inFile):
38 |   fo = open(inFile)
39 |   line = fo.readline()
40 |   o = OrthoCluster(line.rstrip())
41 |   fo.close()
42 |   species = o.get_species_hash().keys()
43 |   species.sort()
44 |   return species
45 | 
46 | 
def parse_orthocml_out(inFile, tree):
  """Write two tree files per cluster line of inFile.

  For every cluster, the first species that holds more than one gene is
  located. Its label in `tree` is replaced by a two-leaf subtree
  "(<sp>1,<sp>2)", once with " #1" on the first copy (<name>.tree.1) and
  once with " #1" on the second copy (<name>.tree.2). Files are written to
  the current working directory.

  Blank lines are ignored. Clusters in which no species has more than one
  gene are skipped with a note on stderr; previously an arbitrary species
  was picked for them.
  """
  fo = open(inFile)
  try:
    for line in fo:
      line = line.rstrip()
      if not line:
        continue
      o = OrthoCluster(line)
      name = o.get_name()

      duplicated = None
      for species, genelist in o.get_species_hash().items():
        if len(genelist) > 1:
          duplicated = species
          break
      if duplicated is None:
        sys.stderr.write("skipping cluster " + name + ": no species with more than one gene\n")
        continue

      # species labels carry a trailing "1" (added by OrthoCluster); strip it
      # to build the labels of the two copies
      base = duplicated[:-1]
      variants = (
        (".tree.1", '(' + base + '1 #1,' + base + '2)'),
        (".tree.2", '(' + base + '1,' + base + '2 #1)'),
      )
      for suffix, replacement in variants:
        fw = open(name + suffix, "w")
        try:
          fw.write(tree.replace(duplicated, replacement))
        finally:
          fw.close()
  finally:
    fo.close()
67 | 
68 | 
def read_tree_from_file(file):
  """Return the content of `file` as a single string: every line is stripped
  of surrounding whitespace and the lines are concatenated without separator."""
  handle = open(file)
  pieces = [row.strip() for row in handle]
  handle.close()
  return "".join(pieces)
76 | 
77 | 
def main():
  """Read the base tree, then write the per-cluster tree files."""
  ortho_path, tree_path = plausi()
  parse_orthocml_out(ortho_path, read_tree_from_file(tree_path))



main()
86 | 


--------------------------------------------------------------------------------
/python/paml/get-paml-results.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import os, sys 				# low level handling, such as command line stuff
  4 | import string					# string methods available
  5 | import re							# regular expressions
  6 | import getopt					# comand line argument handling
  7 | from low import *			# custom functions, written by myself
  8 | import anydbm
  9 | 
 10 | # =============================================================================	
 11 | def show_help( ):
 12 |   """ displays the program parameter list and usage information """
 13 |   stdout( "usage: " + sys.argv[0] + " -f <path>" )
 14 |   stdout( " " )
 15 |   stdout( " option    description" )
 16 |   stdout( " -h        help (this text here)" )
 17 |   stdout( " -f        nt alignment file" )
 18 |   stdout( " " )
 19 | 
 20 |   sys.exit(1)
 21 | 
 22 | # =============================================================================
 23 | def handle_arguments():
 24 |   """ verifies the presence of all necessary arguments and returns the data dir """
 25 |   if len ( sys.argv ) == 1:
 26 |     stderr( "no arguments provided." )
 27 |     show_help()	
 28 | 
 29 |   try: # check for the right arguments
 30 |     keys, values = getopt.getopt( sys.argv[1:], "hf:t:p:" )
 31 |   except getopt.GetoptError:
 32 |     stderr( "invalid arguments provided." )
 33 |     show_help()
 34 | 
 35 |   args = {}
 36 |   for key, value in keys:
 37 |     if key == '-f':	args['aln'] = value
 38 |         
 39 |   if not args.has_key('aln'):
 40 |     stderr( "aln file missing." )
 41 |     show_help()
 42 |   if not file_exists( args.get('aln') ):
 43 |     stderr( "aln file does not exist." )
 44 |     show_help()
 45 |     
 46 |   return args
 47 | 
 48 | # =============================================================================
 49 | def get_aln_length_from_file( filename ):
 50 |   fo = open( filename )
 51 |   firstline = fo.readline()
 52 |   n, length = firstline.split()
 53 |   fo.close()
 54 |   return length
 55 | 
 56 | # =============================================================================
 57 | def get_lnL_from_file( filename, model ):
 58 |   file = filename + '.paml.out.' + model
 59 |   np, lnL = None, None
 60 |   if not file_exists( file ): 
 61 |     stderr( "File does not exist: %s" %file )
 62 |     return np, lnL
 63 | 
 64 |   fo = open( file )
 65 |   for line in fo:
 66 |     if line.startswith("lnL"):
 67 |       #print filename, model, line
 68 |       np = re.match("lnL\(.*\s+np:\s*(\d+)", line ).group(1)
 69 |       lnL = re.match("lnL\(.*\):\s+([0-9.-]+)", line ).group(1)
 70 |       break
 71 |   fo.close()
 72 |   return np, lnL
 73 | 
 74 | # =============================================================================
 75 | # =============================================================================
 76 | def main( args ):
 77 |   
 78 |   models = ["M0", "M3K2", "M3K3", "M7", "M8", "Free"]
 79 |   filename = args.get('aln')
 80 |   
 81 |   line = []
 82 |   line.append( filename )
 83 |   length = get_aln_length_from_file( filename )
 84 |   line.append( length )
 85 |   for M in models:
 86 |     np, lnL = get_lnL_from_file( filename, M )
 87 |     if np == None or lnL == None:
 88 |       stderr( "%s: None returned for model %s (%s/%s)" %( filename, M, np, lnL ) )
 89 |       sys.exit(1)
 90 |     line.append( M )
 91 |     line.append( np )
 92 |     line.append( lnL )
 93 |   print string.join(line,"\t")
 94 | 
 95 | # =============================================================================
 96 | # === MAIN ====================================================================
 97 | # =============================================================================
 98 | 
 99 | args = handle_arguments(  )
100 | main( args )
101 | 


--------------------------------------------------------------------------------
/python/paml/parse_codeml-modelA.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, re
 3 | 
 4 | def usage():
 5 |   print >> sys.stderr, "usage: " + sys.argv[0] + " folder  (files end with *.MAalt)"
 6 |   sys.exit(1)
 7 | 
 8 | 
 9 | def plausi():
10 |   if len(sys.argv) != 2: usage()
11 |   inFolder = sys.argv[1]
12 |   return inFolder
13 | 
14 | 
def parse_from_file(inFile):
  """Print the BEB site lines of one result file.

  The file is scanned for a line starting with "ns =" (the number after
  "ls =" on it is kept as the length) and for the "Bayes Empirical Bayes"
  header. The line after the header is skipped; every following line up to
  the next empty line is printed to stdout as
  "<basename>TAB<length>TAB<line>". Lines ending in "*" are counted and
  "<basename>TAB<count>" is written to stderr.

  If the file ends before the BEB header is seen, a note is written to
  stderr and the function returns (the old loop never terminated in that
  case). If no length was found, "NA" is printed in its place.
  """
  beb_header = "Bayes Empirical Bayes (BEB) analysis (Yang, Wong & Nielsen 2005. Mol. Biol. Evol. 22:1107-1118)"
  basename = os.path.split(inFile)[1]
  length = "NA"
  sites = None

  fo = open(inFile)
  try:
    while True:
      raw = fo.readline()
      if raw == "":
        break  # end of file: readline() yields "" only here, blank lines are "\n"
      line = raw.rstrip()

      if line.startswith("ns ="):
        sys.stderr.write(inFile + "\n")
        found = re.search(r"ls =\s+(\d+)", line)
        if found:
          length = found.group(1)

      if not line.startswith(beb_header):
        continue

      fo.readline()  # skip: Positive sites for foreground lineages Prob(w>1):
      sites = 0
      line = fo.readline().rstrip()
      while line != "":
        if line.endswith("*"):
          sites += 1
        sys.stdout.write(basename + "\t" + length + "\t" + line + "\n")
        line = fo.readline().rstrip()
      break
  finally:
    fo.close()

  if sites is None:
    sys.stderr.write(basename + "\tno BEB section found\n")
    return
  sys.stderr.write(basename + "\t" + str(sites) + "\n")
41 | 
42 | 
def parse_all_files(inFolder):
  """Run parse_from_file on every directory entry of inFolder whose name ends with ".MAalt"."""
  wanted = [entry for entry in os.listdir(inFolder) if entry.endswith(".MAalt")]
  for entry in wanted:
    parse_from_file(inFolder + "/" + entry)
47 | 
48 | 
def main():
  """Parse every *.MAalt file in the folder given on the command line."""
  parse_all_files(plausi())


main()
55 | 


--------------------------------------------------------------------------------
/python/paml/parse_codeml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, re
 3 | 
# Model suffixes; for every base name one file "<base>.codeml.<model>" is
# expected per entry, and results are printed in this order.
MODELS = ["M0", "M7", "M8", "Free", "M1a", "M2a", "MT1", "MT2", "MT3", "MT4", "MT5", "MT6"]
 5 | 
 6 | 
 7 | def usage():
 8 |   print >> sys.stderr, "usage: " + sys.argv[0] + " folder"
 9 |   sys.exit(1)
10 | 
11 | 
def plausi():
  """Return the single folder argument; call usage() on any other argument count."""
  if len(sys.argv) != 2:
    usage()
  return sys.argv[1]
16 | 
17 | 
def get_all_base_files(inFolder):
  """Return the distinct base names of the entries in inFolder, sorted.

  The base name is the part of an entry name in front of its first dot
  ("x.codeml.M0" -> "x"). Entries whose name starts with a dot are skipped:
  os.path.splitext() never strips a leading dot, so the previous
  strip-until-no-dot loop never terminated on them.
  """
  basenames = set()
  for entry in os.listdir(inFolder):
    if entry.startswith('.'):
      continue
    basenames.add(entry.split('.', 1)[0])
  return sorted(basenames)
26 | 
27 | 
def parse_all_from_basefile(file):
  """Print lnL and np of every model in MODELS for one base file.

  For each model the file "<file>.codeml.<model>" is read and np / lnL are
  taken from its first line starting with "lnL(". Output on stdout is one
  line: the base file followed by "TAB<model>:<lnL>,<np>" per model.

  Nothing is printed on stdout when any model file is missing or has no
  parsable lnL line; a message goes to stderr instead. Previously a file
  without an lnL line silently reused the values of the preceding model
  (or raised NameError for the first model).
  """
  targets = [(model, file + ".codeml." + model) for model in MODELS]
  for model, path in targets:
    if not os.path.isfile(path):
      sys.stderr.write("bad stuff happening with file " + file + " / " + path + "\n")
      return

  modelHash = {}
  for model, path in targets:
    lnL, np = None, None  # reset per file so values never leak between models
    fo = open(path)
    try:
      for line in fo:
        if line.startswith("lnL("):
          np_match = re.match(r"lnL.*\s+np:\s*(\d+)", line)
          lnL_match = re.match(r"lnL\(.*\):\s+([0-9.-]+)", line)
          if np_match and lnL_match:
            np, lnL = np_match.group(1), lnL_match.group(1)
          break  # only the first lnL line is considered
    finally:
      fo.close()
    if lnL is None or np is None:
      sys.stderr.write("no parsable lnL line in " + path + "\n")
      return
    modelHash[model] = [lnL, np]

  sys.stdout.write(file)
  for model in MODELS:
    sys.stdout.write("\t" + model + ":" + modelHash[model][0] + "," + modelHash[model][1])
  sys.stdout.write("\n")
51 | 
52 | 
def main():
  """Parse the codeml results of every base file found in the given folder.

  get_all_base_files() returns bare names, so they are joined with the
  folder before use; otherwise the files were only found when the folder
  was the current directory. normpath keeps the printed name unchanged for
  that case ("./x" -> "x").
  """
  inFolder = plausi()
  for basefile in get_all_base_files(inFolder):
    parse_all_from_basefile(os.path.normpath(os.path.join(inFolder, basefile)))


main()
61 | 


--------------------------------------------------------------------------------
/python/phylip/create-distance-matrix.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | from low import *  # custom functions, written by myself
 8 | from goterm import GOTerm
 9 | from collections import defaultdict
10 | 
11 | 
12 | # =============================================================================  
def show_help( ):
  """ displays the program parameter list and usage information, then exits with status 1 """
  # the usage line used to advertise "-i -n"; getopt only accepts -h and -f
  stdout( "usage: " + sys.argv[0] + " -f <path>" )
  stdout( " " )
  stdout( " option    description" )
  stdout( " -h        help (this text here)" )
  stdout( " -f        input file" )
  stdout( " " )
  sys.exit(1)
22 | 
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the input path under 'file';
  shows the help (which exits) when the argument is missing or the file is absent """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, val in options:
    if flag == '-f':
      args['file'] = val

  if 'file' not in args:
    stderr( "input file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "input file does not exist." )
    show_help()

  return args
48 | 
49 |   
50 | # =============================================================================
def read_input(file):
  """Read "<spA>,<spB>TAB<rate>" lines and return (species, distances).

  species   -- sorted list of all distinct names seen in the pairs
  distances -- dict mapping the pair, with its members sorted and joined by
               ",", to 1 - rate formatted with four decimals

  Keys are normalised to sorted order because that is the form the matrix
  writer looks them up in; a pair given as "b,a" was unreachable before.
  Blank lines are skipped.
  """
  hash = {}
  species = set()
  fo = open(file)
  try:
    for line in fo:
      line = line.rstrip()
      if not line:
        continue
      pair, rate = line.split("\t")
      members = pair.split(",")
      species.update(members)
      # "+ 0.0" turns a rounded negative zero into plain zero
      distance = round(1 - float(rate), 4) + 0.0
      hash[",".join(sorted(members))] = "%.4f" % distance
  finally:
    fo.close()
  return sorted(species), hash
66 | 
67 | 
68 | # =============================================================================
69 | # === MAIN ====================================================================
70 | # =============================================================================
def main( args ):
  """ prints a square distance matrix: a count line (species + 1), an
  "outgroup" row at distance 1.0000 to every species, then one row per
  species with the name padded to 10 characters """
  speciesarray, hash = read_input(args['file'])
  emit = sys.stdout.write
  emit("\t" + str(len(speciesarray)+1) + "\n")
  emit("outgroup  " + "0.0000" + "\t" + "\t".join(["1.0000"]*len(speciesarray)) + "\n")
  for sp1 in speciesarray:
    # first cell: padded name directly followed by the distance to the outgroup
    cells = [sp1.ljust(10) + "1.0000"]
    for sp2 in speciesarray:
      if sp1 == sp2:
        cells.append("0.0000")
      else:
        cells.append(hash[",".join(sorted([sp1, sp2]))])
    emit("\t".join(cells) + "\n")

# =============================================================================
args = handle_arguments()
main( args )
90 | 
91 | 


--------------------------------------------------------------------------------
/python/sciroko/import-into-sqlite3.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os, sys     # low level handling, such as command line stuff
 4 | import string      # string methods available
 5 | import re          # regular expressions
 6 | import getopt      # comand line argument handling
 7 | import sqlite3
 8 | from low import *  # custom functions, written by myself
 9 | from sciroko import SSR
10 | from collections import defaultdict
11 | 
12 | 
13 | # =============================================================================  
def show_help( ):
  """ prints the usage line and option table via stdout(), then exits with status 1 """
  help_lines = (
    "usage: " + sys.argv[0] + " -f <path>",
    " ",
    " option    description",
    " -h        help (this text here)",
    " -f        sciroko output file",
    " -d        db file",
    " ",
  )
  for text in help_lines:
    stdout( text )
  sys.exit(1)
24 | 
25 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with the sciroko output
  path under 'file' and the database path under 'db'

  Shows the help (which exits) when either argument is missing or the
  sciroko file does not exist. The -d check is new: without it a missing
  -d only surfaced later as a KeyError on args['db']. """
  if len ( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try: # check for the right arguments
    keys, values = getopt.getopt( sys.argv[1:], "hf:d:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for key, value in keys:
    if key == '-f': args['file'] = value
    if key == '-d': args['db'] = value

  if 'file' not in args:
    stderr( "sciroko file argument missing." )
    show_help()
  elif not file_exists( args.get('file') ):
    stderr( "sciroko file does not exist." )
    show_help()

  if 'db' not in args:
    stderr( "db file argument missing." )
    show_help()

  return args
51 | 
52 | # =============================================================================
def init_db(conn):
  """Create the ssrs table on the given connection unless it already exists."""
  columns = [
    "id INTEGER PRIMARY KEY ASC",
    "organism VARCHAR(50)",
    "chr VARCHAR(50)",
    "startpos INTEGER",
    "endpos INTEGER",
    "motif VARCHAR(10)",
    "motif_std VARCHAR(10)",
    "length INTEGER",
    "score INTEGER",
    "mismatches INTEGER",
    "binpos INTEGER",
    "seq VARCHAR(255)",
  ]
  conn.execute("CREATE TABLE IF NOT EXISTS ssrs(" + ", ".join(columns) + ")")
55 | 
56 | 
57 | # =============================================================================
58 | # === MAIN ====================================================================
59 | # =============================================================================
def main( args ):
  """ loads the sciroko output file into the ssrs table of the database

  Only lines starting with "RAL" are imported; each is parsed with SSR and
  inserted as one row. Afterwards the total number of rows in the table is
  printed (note: this is the table total, which includes rows present
  before this run) and the transaction is committed.

  Values are bound as query parameters instead of being formatted into the
  SQL text, so quotes inside a field can no longer break or alter the
  statement. NOTE(review): assumes the SSR attributes are plain str/int
  values that sqlite3 can bind -- confirm against the sciroko module. """
  insert = "INSERT INTO ssrs(organism, chr, startpos, endpos, motif, motif_std, length, score, mismatches, binpos, seq) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

  conn = sqlite3.connect(args['db'])
  try:
    init_db(conn)

    fo = open(args['file'])
    try:
      for line in fo:
        if not line.startswith("RAL"): continue
        m = SSR(line)
        conn.execute(insert, (m.organism, m.chromosome, m.startpos, m.endpos, m.motif, m.motif_std, m.length, m.score, m.mismatches, m.megabase, m.seq))
    finally:
      fo.close()

    res = conn.execute("SELECT COUNT(*) FROM ssrs")
    entries = res.fetchall()[0][0]
    sys.stdout.write("done. entries added: " + str(entries) + "\n")
    conn.commit()
  finally:
    conn.close()


# =============================================================================
args = handle_arguments()
main( args )
81 | 
82 | 


--------------------------------------------------------------------------------
/python/swapsc/swapsee-table-annotation.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys 				# low level handling, such as command line stuff
 3 | import getopt					# comand line argument handling
 4 | import anydbm					# index databases (file hash)
 5 | from low import *			# collection of generic self-defined functions
 6 | 
 7 | 
 8 | # =============================================================================	
 9 | def show_help( ):
10 | 	""" displays the program parameter list and usage information """
11 | 	stdout( "usage: " + sys.argv[0] + " -f <path> -e <string> [-i <n>]" )
12 | 	stdout( " " )
13 | 	stdout( " option    description" )
14 | 	stdout( " -h        help (this text here)" )
15 | 	stdout( " -f        table file" )
16 | 	stdout( " -i        table column index containing the lookup name [default: 0]" )
17 | 	stdout( " -c        annotation file column to use [default: all]" )
18 | 	stdout( " -l        annotation file line(s) to use [default: first]" )
19 | 	stdout( " -e        file extension to look for (= lookupname.extension)" )
20 | 	stdout( " " )
21 | 	sys.exit(1)
22 | 	
23 | # =============================================================================
def handle_arguments():
  """ parses the command line and returns a dict with 'file' (table path),
  'col' (integer column index, 0 unless -i is given) and 'ext' (value of -e);
  shows the help (which exits) when a required argument is missing or the
  table file is absent """
  if len( sys.argv ) == 1:
    stderr( "no arguments provided." )
    show_help()

  try:
    options, leftover = getopt.getopt( sys.argv[1:], "hf:i:e:" )
  except getopt.GetoptError:
    stderr( "invalid arguments provided." )
    show_help()

  args = {}
  for flag, val in options:
    if flag == '-f':
      args['file'] = val
    if flag == '-i':
      args['col'] = int( val )
    if flag == '-e':
      args['ext'] = val

  if 'file' not in args:
    stderr( "table file missing." )
    show_help()
  if not file_exists( args.get('file') ):
    stderr( "table file does not exist." )
    show_help()

  args.setdefault('col', 0)

  if 'ext' not in args:
    stderr( "file extension missing." )
    show_help()

  return args
57 | 
58 | 	
59 | # =============================================================================
60 | # =============================================================================
def main( args ):
  """ walks over the table file; for each row the lookup name is taken from
  the configured column, the extension is appended, and - if that file
  exists - its lines are read. The annotation step itself is not
  implemented yet (see TODO), so nothing is printed. """
  table = open( args.get('file') )
  for row in table:
    columns = row.rstrip().split("\t")
    lookupfile = columns[ args.get('col') ] + args['ext']
    if not file_exists( lookupfile ):
      continue
    ft = open( lookupfile )
    lines = ft.readlines()
    ft.close()
    # TODO:
    # get lines, get column
    # then add to table
    # print the new line

  table.close()

# =============================================================================
# === MAIN ====================================================================
# =============================================================================
args = handle_arguments(  )
main( args )
84 | 
85 | 


--------------------------------------------------------------------------------
/ruby/geneontology/go-eval.rb:
--------------------------------------------------------------------------------
#!/usr/bin/ruby
 2 | =begin
 3 | =end
 4 | 
 5 | class GOterm
 6 |   attr_accessor :id, :name, :namespace, :parents
 7 |   def initialize
 8 |     @parents = Array.new
 9 |   end
10 | end
11 | 
# Reads an OBO-style file and returns a Hash mapping term id => GOterm.
#
# A "[Term]" line starts a new term; "id:", "name:", "namespace:" and
# "is_a:" lines fill it; an empty line, the next stanza header or the end of
# the file stores it. Only ids and parents of the form GO:<digits> are used.
#
# Fixes over the previous version:
# * reaching the end of the file no longer raises (gets returned nil, which
#   was then chomped)
# * lines outside a [Term] stanza (file header, other "[...]" stanzas) are
#   ignored instead of hitting nil
# * a final term without a trailing empty line is kept
# * the file handle is closed
def load_obo_definition(file)
  goterm = Hash.new
  g = nil
  # stores the term collected so far, if it has an id
  flush = lambda { goterm[g.id] = g if g && g.id }

  File.open(file) do |obofile|
    obofile.each_line do |raw|
      line = raw.chomp
      if line =~ /^\[Term\]/
        flush.call
        g = GOterm.new
      elsif line =~ /^\[/ || line =~ /^$/
        # any other stanza header or the blank line ending a stanza
        flush.call
        g = nil
      elsif g.nil?
        next
      elsif line =~ /^id:\s+(GO:\d+)/
        g.id = $1
      elsif line =~ /^name:\s+(.*)$/
        g.name = $1
      elsif line =~ /^namespace:\s+(\S+)$/
        g.namespace = $1
      elsif line =~ /^is_a:\s+(GO:\d+)/
        g.parents << $1
      end
    end
  end
  flush.call
  return goterm
end
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/ruby/generic/wordwrap.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | require 'ftools'
 3 | 
# Prints the usage text to STDOUT and terminates the script with exit status 1.
def exit_with_usage
  # the text below is a single multi-line string; its whitespace is printed as is
  STDOUT.print """ 
  #{$0} <file> [<line length>]
  
  this script inserts newlines in front of all words that would exceed
  a given threshold for max line length.
  default max length: 80

  """
  exit(1)
end
15 | 
# --- argument handling ---------------------------------------------------
exit_with_usage unless ARGV.length > 0
# File.exist? replaces File.exists?, which newer Ruby versions no longer provide
exit_with_usage unless File.exist? ARGV[0]
MAXLENGTH = (ARGV[1] || 80).to_i
# a non-numeric or non-positive length cannot be wrapped to
exit_with_usage unless MAXLENGTH > 0

STDERR.puts "INPUT FILE:\t%s" % ARGV[0]
STDERR.puts "MAXLENGTH:\t%s" % MAXLENGTH

# --- wrapping --------------------------------------------------------------
# Lines that fit are copied unchanged. Longer lines are split into words and
# re-joined with single spaces; a newline is emitted in front of the word
# that would push the line over MAXLENGTH. Continuation lines of a line that
# starts with "#" or "%" get that character as their first word.
File.open(ARGV[0]) do |f|
  f.each_line do |line|
    # compare against the requested limit (this used to be a fixed 80)
    if line.chomp.length <= MAXLENGTH
      STDOUT.print line
      next
    end

    words = line.chomp.split
    first = line[0..0]
    newline = Array.new
    words.each do |word|
      newline << word
      if newline.join(" ").length > MAXLENGTH
        newline[-1] = "\n"
        STDOUT.print newline.join(" ")
        if first == "#" or first == "%"
          newline = [first, word]
        else
          newline = [word]
        end
      end
    end
    STDOUT.puts newline.join(" ").chomp
  end
end

STDERR.puts "STATUS:   \tdone.\n"
50 | 


--------------------------------------------------------------------------------
/ruby/pfam/hmmout_annotation.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | 
 3 | require 'optparse'
 4 | 
 5 | PFAMFILE = "/global/databases/pfam/current/pfam_scan_db/Pfam-A.hmm"
 6 | 
 7 | class String
 8 |   def valid_float?
 9 |     # The double negation turns this into an actual boolean true - if you're 
10 |     # okay with "truthy" values (like 0.0), you can remove it.
11 |     !!Float(self) rescue false
12 |   end
13 | end
14 | 
15 | 
16 | # =============================================================================
# Parses the command line (-f and -c, both required) and returns
# { :hmmfile => ..., :cutoff => ... }. Prints the problem plus the option
# summary and exits when an option is missing, unknown or lacks its argument.
def get_opt
  options = {}
  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{$0} -f <file> -c <value>"
    opts.on('-f FILE or DIR', 'single hmmout file (pfam_scan output with first column = protein length), or a directory where all *.hmmout files will be processed') do |file|
      options[:hmmfile] = file
    end
    opts.on('-c CUTOFF', '[evalueFloat|GA|TC|NC]') do |v|
      options[:cutoff] = v
    end
  end

  begin
    parser.parse!
    missing = [:hmmfile, :cutoff].find_all { |param| options[param].nil? }
    unless missing.empty?
      puts "Missing options: #{missing.join(', ')}"
      puts parser
      exit
    end
  rescue OptionParser::InvalidOption, OptionParser::MissingArgument
    puts $!.to_s
    puts parser
    exit
  end

  options
end
42 | 
# Reads an HMM library file and returns a Hash mapping each entry's NAME to
# a Hash with the second whitespace-separated token of its NAME, GA, NC and
# TC lines.
#
# An entry starts at a line beginning with "HMMER3" and ends at a line
# beginning with "//". Entries lacking one of the four fields are reported
# on STDERR and left out.
#
# Fixes over the previous version:
# * lines in front of the first "HMMER3" line are ignored (they used to hit
#   a nil entry)
# * the current entry is dropped after "//", so lines of a following record
#   without an "HMMER3" line can no longer overwrite a stored entry
# * the file handle is closed
def get_cutoffs(file=PFAMFILE)
  cutoffHash = Hash.new
  capture = %w( NAME GA NC TC )
  entry = nil
  File.open(file, 'r') do |reader|
    reader.each_line do |line|
      entry = {} if line[0,6] == 'HMMER3'
      next if entry.nil?
      capture.each{|e| entry[e] = line.split[1] if line[0,e.length] == e }
      next unless line[0,2] == "//"
      if entry.length != capture.count
        STDERR.puts "FATAL ERROR: not all required fields found for an entry: #{entry.inspect}"
      else
        cutoffHash[entry['NAME']] = entry
      end
      entry = nil
    end
  end
  return cutoffHash
end
61 | 
62 | 
63 | # ==============================================================================
# Writes the lines of +file+ that pass +cutoff+ to "<file>.<cutoff>",
# tab-separated.
#
# cutoff is either a number (lines whose column 14 is below it are kept) or
# one of GA / TC / NC (lines whose column 13 exceeds the threshold of that
# kind stored for the family named in column 8 are kept; names starting
# with "Pfam-B" are always kept). Column numbers are 1-based and refer to
# whitespace-separated fields.
#
# Fixes over the previous version:
# * the cutoff is validated before the output file is created
# * empty lines, lines starting with "#" and lines with fewer than 14
#   fields are skipped instead of hitting nil / being kept by accident
# * families without stored thresholds are reported on STDERR and skipped
# * the per-line debug output is gone and the threshold count goes to
#   STDERR, so STDOUT stays clean
# * both files are closed even when an error occurs
def filter_hmmout(file, cutoff)
  by_evalue = cutoff.valid_float? # e-value cutoff given
  unless by_evalue or ['GA', 'TC', 'NC'].include?(cutoff)
    abort("invalid value given for cutoff method (#{cutoff}). allowed values are GA, NC, and TC.")
  end

  cutoffHash = nil
  unless by_evalue
    cutoffHash = get_cutoffs()
    STDERR.puts "--- cutoffHash: #{cutoffHash.count} ---"
  end

  File.open(file + "." + cutoff, 'w') do |fw|
    File.open(file, 'r') do |f|
      f.each_line do |line|
        cols = line.chomp.split
        next if cols.length < 14 or cols[0][0,1] == '#'

        if by_evalue
          next unless cols[13].to_f < cutoff.to_f
        else
          name, bitscore = cols[7], cols[12].to_f
          unless name[0,6] == 'Pfam-B'
            thresholds = cutoffHash[name]
            if thresholds.nil?
              STDERR.puts "no #{cutoff} threshold known for #{name}, line skipped"
              next
            end
            next unless bitscore > thresholds[cutoff].to_f
          end
        end
        fw.puts cols.join("\t")
      end
    end
  end
end
86 | 
87 | 
88 | # ==============================================================================
89 | # =MAIN=========================================================================
90 | # ==============================================================================
91 | 
# Filter either every *.hmmout file of a directory or the single file given.
options = get_opt()
if File.directory?(options[:hmmfile])
  Dir.glob(options[:hmmfile] + '/*.hmmout').each do |hmmfile|
    filter_hmmout(hmmfile, options[:cutoff])
  end
else
  filter_hmmout(options[:hmmfile], options[:cutoff])
end
98 | 
99 | 


--------------------------------------------------------------------------------
/ruby/pfam/length2hmmout.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby -w
 2 | HEADER = /^>(\S+)\s?.?/
 3 | unless (ARGV.size == 2)
 4 | 	puts "Usage: #{$0} fasta hmmout [NOTE: will change input hmmout!]"
 5 | 	exit
 6 | end
 7 | lengths	= Hash.new
 8 | seq 		= String.new
 9 | pid			= nil 
10 | f 			= File.open(ARGV[0], "r")
11 | c				= 0
12 | while(line = f.gets)
13 | 	line.chomp!
14 | 	if (m = HEADER.match(line))
15 | 		lengths[pid] = seq.length.to_s unless (pid.nil?)
16 | 		pid = m[1]
17 | 		seq = String.new
18 | 		c += 1
19 | 		STDERR.print "\r*** Reading fasta entries: #{c}... "
20 | 		next
21 | 	end
22 | 	seq += line	
23 | end
24 | lengths[pid] = seq.length unless (pid.nil?)
25 | f.close
26 | STDERR.puts "done."
27 | oldhmmout = Array.new
28 | IO.foreach(ARGV[1]) {|x| oldhmmout << x}
29 | f = File.open(ARGV[1], "w")
30 | oldhmmout.each do |line|
31 | 	next if (/^#.+/.match(line))
32 | 	fields = line.split
33 | 	unless (lengths.has_key?(fields[0]))
34 | 			puts "*** NO LENGTH FOUND FOR >#{fields[0]}<"
35 | 			present = false
36 | 			next
37 | 	end
38 | 	line.chomp!
39 | 	f.puts lengths[fields[0]].to_s + "\t" + line + "\n"
40 | end
41 | f.close
42 | 


--------------------------------------------------------------------------------
/ruby/swapsc/bio-graphics-plot.rb:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/ruby
  2 | # generates a diagram of where in the sequence accelerated evolution / positive selection / negative selection took place
  3 | require 'rubygems'
  4 | require 'bio-graphics'
  5 | 
  6 | DEBUG = false
  7 | 
  8 | 
  9 | 
 10 | ###############################################################################
 11 | class SwapscFeature
 12 | 
 13 |   attr_accessor :start, :stop, :category
 14 |   
 15 |   def initialize(start,stop,category)
 16 |     @start = start
 17 |     @stop = stop
 18 |     @category = category
 19 |     @added = false
 20 |   end
 21 | 
 22 |   def <=> other
 23 |     @start <=> other.start
 24 |   end  
 25 | 
 26 |   def added?
 27 |     return @added
 28 |   end
 29 | 
 30 |   def add_to_track(track)
 31 |     track.add_feature( Bio::Feature.new(@category, '%s..%s' % [ @start, @stop ]), :colour => $categories[@category][:color] )
 32 |     $categories[@category][:stats] += (@stop - @start +1)
 33 |     $categories[@category][:branchstats] += (@stop - @start +1)
 34 |     @added = true
 35 |   end
 36 | end
 37 | ###############################################################################
 38 | 
 39 | ###############################################################################
 40 | 
 41 | if ARGV[0] and not File.exists?(ARGV[0])
 42 |   puts "error: invalid path to file specified."
 43 |   ARGV[0] = nil
 44 | end
 45 | 
 46 | unless ARGV[0] or ARGV[1]
 47 |   puts "generates a diagram of where in the sequence accelerated evolution / positive selection / negative selection took place\n"
 48 |   puts "expected format [tab-delimited]:"
 49 |   puts "PANEL length"
 50 |   puts "TRACK name label"
 51 |   puts "FEATURE range color [label]"
 52 |   puts "usage: visualize-swapsc.rb <flatfile> <outfile>\n"
 53 |   exit 1
 54 | end
 55 | 
 56 | # === MAIN ====================================================================
 57 | # =============================================================================
 58 | 
 59 | # 1. read flatfile and save the input
 60 | # 2. process input, create the plot
 61 | panel = nil
 62 | track = nil
 63 | tracks = Array.new
 64 | features = Array.new
 65 | 
 66 | f = File.open( ARGV[0], "r" )
 67 | #STDERR.print( ARGV[0] + "\t" )
 68 | while line = f.gets
 69 |   next if line == nil
 70 |   line.chomp!
 71 |   cols = line.split("\t")
 72 |   if cols[0] == "PANEL"
 73 |     panel = Bio::Graphics::Panel.new( cols[1].to_i, :width => 800, :format => :png )
 74 |   elsif cols[0] == "TRACK"
 75 |     i, name, label = line.split("\t")
 76 |     if label == "true"
 77 |       label = true
 78 |     else
 79 |       label = false
 80 |     end
 81 |     track = panel.add_track(name, :label => label)
 82 |   elsif cols[0] == "FEATURE"
 83 |     if line.split("\t").length == 4
 84 |       i, range, color, label = line.split("\t")
 85 |       color = color.split(',').collect{|c| c.to_f}
 86 |       track.add_feature( Bio::Feature.new("feature", range), :colour => color, :label => label )
 87 |     else
 88 |       i, range, color = line.split("\t")
 89 |       color = color.split(',').collect{|c| c.to_f}
 90 |       track.add_feature( Bio::Feature.new("feature", range), :colour => color )
 91 |     end
 92 |   else
 93 |     STDERR.puts "unknown line descriptor \"#{cols[0]}\"" unless cols[0].nil?
 94 |   end
 95 | end
 96 | f.close
 97 | panel.draw(ARGV[1])
 98 | 
 99 | #STDERR.puts "done."
100 | 


--------------------------------------------------------------------------------