├── LICENSE.md ├── README.md ├── python ├── base │ ├── blastout.py │ ├── blastout.pyc │ ├── fasta.py │ ├── fasta.pyc │ ├── gff3.py │ ├── gff3.pyc │ ├── goterm.py │ ├── goterm.pyc │ ├── low.py │ ├── low.pyc │ ├── misa.py │ ├── misa.pyc │ ├── muscle.py │ ├── muscle.pyc │ ├── needlemanwunsch.py │ ├── needlemanwunsch.pyc │ ├── newick.py │ ├── newick.pyc │ ├── orthocluster.py │ ├── orthocluster.pyc │ ├── orthomcl.py │ ├── orthomcl.pyc │ ├── pfam.py │ ├── pfam.pyc │ ├── sciroko.py │ ├── sciroko.pyc │ ├── stats.py │ └── stats.pyc ├── blast │ ├── benchmark_blast.py │ ├── blast-best-hit-per-query.py │ ├── cluster-paralogs.py │ ├── cluster_sequences.py │ ├── homology_blast.py │ ├── paralogs-from-selfblast.py │ ├── parse-best-blast-hit.py │ ├── parse-blastout-xml.py │ ├── parse_blast_annotate.py │ ├── parse_blast_out2.py │ ├── parse_blast_out3.py │ └── remove-from-blastout.py ├── fasta │ ├── assembly-stats.py │ ├── concatenate-alignments.py │ ├── create-clusters.py │ ├── create_fasta_clusters.py │ ├── fasta-extract-fragment.py │ ├── fasta-length-per-file.py │ ├── fasta-length-per-sequence.py │ ├── fasta-length-stats.py │ ├── fasta-length.py │ ├── fasta-sort.py │ ├── fasta-starts-with-meth.py │ ├── fasta-to-swapsc-input.py │ ├── fasta2flat.py │ ├── fasta2phylip.py │ ├── fastasplit.py │ ├── gc-content-from-fasta.py │ ├── generate-fasta-aa-nt.py │ ├── get-all-possible-translations.py │ ├── get-cluster-sequences.py │ ├── get-sequence-between-genes.py │ ├── import-fasta-sequence.py │ ├── index-fasta.py │ ├── reduce_fasta_file.py │ ├── remove-from-fasta.py │ ├── remove-stopcodons-from-fasta.py │ ├── rename-fasta-sequences.py │ ├── rename-geneids.py │ ├── seqlength.py │ ├── stockholm-to-fasta.py │ ├── translatedprot_from_gb_to_fasta.py │ └── uniprot-dat-to-fasta.py ├── geneontology │ ├── add-parental-go-terms.py │ ├── go-enrichment.py │ ├── go-enrichment2.py │ ├── go-from-blastout.py │ ├── go2slim.py │ ├── goflat2grouptable.py │ ├── goflat2topgo.py │ ├── goid2name-from-obo-xml.py │ 
├── goid2name-from-obo.py │ ├── goodness-of-fit.py │ └── goterms-to-xdom.py ├── generic │ ├── add-basename-as-first-col.py │ ├── add-species-as-first-col.py │ ├── add_to_xdom.py │ ├── addid2xdom.py │ ├── difference.py │ ├── flat-split-by-lines.py │ ├── flat2line.py │ ├── flat2matrix.py │ ├── flat2sqlinject.py │ ├── flat2xdom.py │ ├── grab-columns.py │ ├── intersection.py │ ├── map.py │ ├── parallel-processes.py │ ├── search-replace.py │ ├── subtract.py │ ├── text2range.py │ ├── xdom2flat.py │ └── z-score-stats.py ├── gff │ ├── droso-chromosome-reconstruction.py │ ├── droso-introns-exons.py │ ├── get-missing-exons.py │ ├── gff2orthocluster.py │ ├── intra-and-intergenic-orthologous-regions.py │ ├── overlapping-cds-from-gff.py │ ├── plot-genomic-region.py │ └── splice-forms-from-gff.py ├── kegg │ ├── kegg-enzyme2ko.py │ ├── kegg-extractor.py │ ├── kegg-parser.py │ └── kegg2xdom.py ├── latex-bibtex │ ├── bibtex-number-of-coauthors.py │ └── latex-rename.py ├── misa │ ├── add-features-to-misa.py │ ├── add-localization-to-misa.py │ ├── exonic-ssrs-to-genes.py │ ├── gc-content-from-misa.py │ ├── get-transcript-and-protein-per-droso-gene.py │ ├── import-into-sqlite3.py │ ├── misa-global-stats.py │ ├── misa-single-genome-stats.py │ ├── orth-all-pairwise-ssr-comparison.py │ ├── orth-all-vs-all-ssr-comparison.py │ ├── orth-report-conserved-ssrs.py │ ├── orth-ssr-comparison.py │ ├── ortho-pairwise-exon-intron.py │ ├── ortho-pairwise-intra-intergenic.py │ ├── qc-orthologous-regions.py │ ├── split-compound-ssrs.py │ ├── ssr-to-amino-acid.py │ ├── ssr-to-pfam.py │ └── test ├── openreadingframe │ ├── ORFPREDICTORRR.py │ ├── orf_prediction_part1.py │ ├── orf_prediction_part2.py │ └── stats_predicted_orfs.py ├── orthomcl │ ├── add-blasthits-to-cluster.py │ ├── build-clusters-nt.py │ ├── build-clusters.py │ ├── build-counts-table.py │ ├── build-orthomcl-like-output.py │ ├── cluster2arath.py │ ├── geneid2cluster.py │ ├── map-orthomcl-clusters.py │ ├── orthomcl-blastparse.py │ ├── 
paralogs-per-cluster.py │ ├── remove-paralogs.py │ ├── speciesids4orthomcl.py │ ├── table-of-gene-id-per-cluster.py │ └── tree-for-codeml.py ├── paml │ ├── PAML_Ka_Ks.py │ ├── codeml-parallel.py │ ├── get-paml-results.py │ ├── paml-codeml.py │ ├── paml-lrt-bic.py │ ├── parse_codeml-modelA.py │ ├── parse_codeml.py │ └── plot-codeml-model-A-digest.py ├── pfam │ ├── pfam-domain-counts.py │ ├── pfam-filter-output.py │ ├── pfam-mapping.py │ ├── pfam-pid2arrangement.py │ ├── pfam-pid2clan.py │ └── pfamtable-from-pid-annotation.py ├── phylip │ └── create-distance-matrix.py ├── sciroko │ ├── import-into-sqlite3.py │ └── sciroko-single-genome-stats.py ├── signalp │ └── signalp-report-hits.py └── swapsc │ ├── evolve4swapsc.py │ ├── parse-swapsc.py │ ├── swapsc.py │ ├── swapsee-table-annotation.py │ └── swapsee.py └── ruby ├── geneontology ├── go-enrichment-summary.rb ├── go-eval.rb ├── termcloud-from-go-enrichment.rb ├── termcloud-from-go-enrichment2-comp.rb ├── termcloud-from-go-enrichment2.rb └── termtable-from-go-enrichment2.rb ├── generic └── wordwrap.rb ├── pfam ├── hmmout_annotation.rb └── length2hmmout.rb └── swapsc ├── bio-graphics-plot.rb └── visualize-swapsc.rb /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | ===================== 3 | 4 | Copyright (c) 2012 Lothar Wissler 5 | --------------------------------- 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains a broad, but probably incomplete, collection of scripts I developed for my bioinformatics analyses. 2 | 3 | 4 | Organization of files 5 | --------------------- 6 | Thus far, *python* and *ruby* scripts are included in separate folders. I have further created subfolders for scripts that relate to specific programs/databases (e.g. Pfam, GeneOntology) or file formats (e.g. fasta, gff). The *base* folder includes sources that may be imported and therefore required by other scripts. 7 | 8 | 9 | Documentation 10 | ------------- 11 | Documentation is almost non-existent, and if present in scripts, it may be outdated. However, naming of the scripts themselves as well as their subroutines should make it easy to reconstruct how/when/where each of the programs can be applied. 
# =============================================================================
class BlastHit:
    """One hit parsed from a line of tabular BLAST output (``-m 8``).

    The line must contain the 12 tab-separated columns in this order:
    query id, hit id, percent identity, alignment length, mismatches,
    gap openings, query start, query stop, hit start, hit stop, e-value
    and bit score. A line with fewer columns raises ``IndexError``; a
    non-numeric value in a numeric column raises ``ValueError``.
    """

    def __init__(self, line):
        cols = line.split("\t")
        self.qid, self.hid = cols[0], cols[1]
        self.identity = float(cols[2])
        self.alnlen = int(cols[3])
        self.mismatch = int(cols[4])
        self.gap = int(cols[5])
        self.qstart = int(cols[6])
        self.qstop = int(cols[7])
        self.hstart = int(cols[8])
        self.hstop = int(cols[9])
        self.evalue = float(cols[10])
        # float() tolerates the trailing newline of an unstripped line
        self.score = float(cols[11])

    def to_s(self):
        """Return the hit as a tab-separated line (no trailing newline)."""
        fields = [
            self.qid, self.hid, self.identity, self.alnlen,
            self.mismatch, self.gap, self.qstart, self.qstop,
            self.hstart, self.hstop, self.evalue, self.score,
        ]
        # str.join instead of string.join, which no longer exists in Python 3
        return "\t".join(str(field) for field in fields)


# =============================================================================
def get_query_hash(blastoutfile, evalue=10.0):
    """Group the hits of a tabular BLAST output file by query id.

    Empty lines, lines starting with ``#`` and lines that do not have
    exactly 12 tab-separated columns are skipped, as are hits whose
    e-value is greater than *evalue*.

    Returns a dict mapping query id -> list of BlastHit, in file order.
    """
    qh = {}
    # the with-block closes the file even if a malformed line raises
    with open(blastoutfile) as fo:
        for line in fo:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            if len(line.split("\t")) != 12:
                continue
            blasthit = BlastHit(line)
            if blasthit.evalue > evalue:
                continue
            qh.setdefault(blasthit.qid, []).append(blasthit)
    return qh


# =============================================================================
def get_sequence_hash(fastafile):
    """Read a FASTA file into a dict mapping sequence id -> sequence.

    The id is the first whitespace-delimited token after ``>``. Sequence
    lines are stripped and concatenated. Lines before the first header
    are ignored, and a repeated id replaces the earlier record.
    """
    # This module only imports `string` at file level, so the regex module
    # is imported here; without it the function raised NameError.
    import re

    chunks = {}
    key = ""
    with open(fastafile) as fo:
        for line in fo:
            if line.startswith(">"):
                key = re.match(r">(\S+)", line).group(1)
                chunks[key] = []
            elif key != "":
                chunks[key].append(line.strip())
    # join once per record instead of growing a string line by line
    return dict((gid, "".join(parts)) for gid, parts in chunks.items())
def get_gff_hash(gffile):
    """Read a GFF3 file into a dict mapping seqid -> list of GeneFeature.

    Blank lines and ``#`` comment/directive lines (e.g. ``##gff-version 3``)
    are skipped, and reading stops at a ``##FASTA`` directive because the
    rest of such a file is sequence data, not features. Any other line
    that does not have 9 columns terminates the program (see GeneFeature).
    """
    features = {}
    with open(gffile) as fo:
        for line in fo:
            if line.startswith("##FASTA"):
                break
            if not line.strip() or line.startswith("#"):
                continue
            gf = GeneFeature(line)
            features.setdefault(gf.seqid, []).append(gf)
    return features


class GeneFeature():
    """One feature line of a GFF3 file (9 tab-separated columns).

    ``start`` and ``stop`` are stored as ints; every other column is kept
    as the string found in the file. ``attributes`` holds the raw
    ``key=value;key=value`` text of column 9.
    """

    def __init__(self, line):
        columns = line.rstrip().split("\t")
        if len(columns) != 9:
            # report and exit with status 1, as callers of this module expect
            sys.stderr.write("GFF3 with incorrect number of columns. Expected: 9 | Observed: %s\n" % len(columns))
            sys.stderr.write("\"%s\"\n" % line)
            sys.exit(1)
        (self.seqid, self.source, self.ftype, start, stop,
         self.score, self.strand, self.phase, self.attributes) = columns
        self.start = int(start)
        self.stop = int(stop)

    def _attribute_pairs(self):
        """Return the column-9 attributes as an ordered list of (key, value)."""
        pairs = []
        for entry in self.attributes.split(";"):
            if entry == '':
                continue
            # split on the first '=' only: values may themselves contain '='
            key, value = entry.split("=", 1)
            pairs.append((key, value))
        return pairs

    def get_attributes(self):
        """Return the attributes as a dict (for a repeated key the last wins)."""
        return dict(self._attribute_pairs())

    def set_attribute(self, key, value):
        """Set attribute *key* to *value*, updating the raw attribute string.

        An existing key is replaced in place, so the order of the other
        attributes is kept. A new key is appended, with a ``;`` inserted
        first when the current string does not already end with one.
        """
        order = []
        values = {}
        for k, v in self._attribute_pairs():
            if k not in values:
                order.append(k)
            values[k] = v
        if key in values:
            values[key] = value
            self.attributes = "".join("%s=%s;" % (k, values[k]) for k in order)
        else:
            if self.attributes and not self.attributes.endswith(";"):
                self.attributes += ";"
            self.attributes += "%s=%s;" % (key, value)

    def to_string(self):
        """Return the feature as a tab-separated GFF3 line (no newline)."""
        return "\t".join([self.seqid, self.source, self.ftype, str(self.start), str(self.stop), self.score, self.strand, self.phase, self.attributes])
class GOTerm():
    """A Gene Ontology term built from the lines of one OBO stanza.

    Recognised tags are ``id``, ``name``, ``namespace``, ``def`` (single
    valued; a later line replaces an earlier one) and ``is_a``, ``alt_id``,
    ``xref``, ``synonym`` (collected into lists in input order). A line
    starting with ``is_obsolete: true`` sets the obsolete flag to 1. All
    other lines are ignored.
    """

    def __init__(self, lines):
        self.id = ""
        self.name = ""
        self.namespace = ""
        self.definition = ""
        self.is_a = []
        self.alt_ids = []
        self.xrefs = []
        self.synonyms = []
        self.obsolete = 0

        # tag -> name of the attribute that stores its (single) value
        single_valued = {
            "id": "id",
            "name": "name",
            "namespace": "namespace",
            "def": "definition",
        }
        # tag -> list that collects every value seen for it
        multi_valued = {
            "is_a": self.is_a,
            "alt_id": self.alt_ids,
            "xref": self.xrefs,
            "synonym": self.synonyms,
        }

        for raw in lines:
            entry = raw.strip()
            if entry.startswith("is_obsolete: true"):
                self.obsolete = 1
                continue
            # everything after the first ": " is the value of the tag
            tag, separator, value = entry.partition(": ")
            if not separator:
                continue
            if tag in single_valued:
                setattr(self, single_valued[tag], value)
            elif tag in multi_valued:
                multi_valued[tag].append(value)

    def get_id(self):
        return self.id

    def get_name(self):
        return self.name

    def get_namespace(self):
        return self.namespace

    def get_definition(self):
        return self.definition

    def get_is_a(self):
        return self.is_a

    def get_is_a_goids(self):
        # an is_a value looks like "GO:0048308 ! organelle inheritance";
        # the id is its first whitespace-delimited token
        return [parent.split()[0] for parent in self.is_a]

    def get_alt_ids(self):
        return self.alt_ids

    def get_xrefs(self):
        return self.xrefs

    def get_synonyms(self):
        return self.synonyms

    def get_is_obsolete(self):
        return self.obsolete
class _MisaRecord:
    """Parsing and comparison logic shared by MisaSSRspecies and MisaSSR."""

    def _read_ssr_columns(self, columns):
        """Consume the SSR columns (gene id ... optional feature) from *columns*.

        ``motif`` and ``repeats`` are only derived from the pattern for
        non-compound SSRs; for compound types ("c", "c*") both are None so
        the comparison methods can be called on any record.
        """
        self.feature = 0
        self.motif = None
        self.repeats = None
        self.geneid = columns.pop(0)
        self.ssrnr = int(columns.pop(0))
        self.type = columns.pop(0)
        self.pattern = columns.pop(0)
        self.length = int(columns.pop(0))
        self.startpos = int(columns.pop(0))
        self.endpos = int(columns.pop(0))
        if len(columns) > 0:
            self.feature = columns.pop(0)
        if self.type != "c" and self.type != "c*":
            # pattern looks like "(AT)6" or "(AT)6*"
            close = self.pattern.index(")")
            self.motif = self.pattern[1:close]
            if self.pattern.endswith("*"):
                self.repeats = int(self.pattern[close + 1:-1])
            else:
                self.repeats = int(self.pattern[close + 1:])

    def is_perfect_match_to(self, other):
        """Return 1 if both SSRs have the identical pattern, else 0."""
        if self.pattern != other.pattern:
            return 0
        return 1

    def is_polymorphic_to(self, other):
        """Return 1 if the motifs are equal but the repeat counts differ, else 0.

        Compound SSRs have no single motif and always yield 0.
        """
        if self.motif is None or other.motif is None:
            return 0
        if self.motif != other.motif:
            return 0
        if self.repeats == other.repeats:
            return 0
        return 1

    def is_shifted_to(self, other):
        """Return 1 if other's motif is a rotation of this motif, else 0.

        Identical motifs, differing SSR types and compound SSRs yield 0.
        """
        if self.motif is None or other.motif is None:
            return 0
        if self.motif == other.motif:
            return 0
        if self.type != other.type:
            return 0
        m = self.motif
        for _ in range(len(self.motif)):
            m = m[1:] + m[0]
            if m == other.motif:
                return 1
        return 0


class MisaSSRspecies(_MisaRecord):
    """MISA SSR record whose line starts with an extra species column."""

    def __init__(self, line):
        columns = line.rstrip().split("\t")
        self.species = columns.pop(0)
        self._read_ssr_columns(columns)

    def to_s(self):
        """Return the record as a tab-separated line (feature column omitted)."""
        array = [self.species, self.geneid, str(self.ssrnr), self.type, self.pattern, str(self.length), str(self.startpos), str(self.endpos)]
        return "\t".join(array)


class MisaSSR(_MisaRecord):
    """MISA SSR record: gene id, SSR nr, type, pattern, length, start, end[, feature]."""

    def __init__(self, line):
        columns = line.rstrip().split("\t")
        self._read_ssr_columns(columns)

    def to_s(self):
        """Return the record as a tab-separated line (feature column omitted)."""
        array = [self.geneid, str(self.ssrnr), self.type, self.pattern, str(self.length), str(self.startpos), str(self.endpos)]
        return "\t".join(array)
# =============================================================================
def align(array1, array2, gap = -2, match = 1, mismatch = -1):
    """Globally align two sequences with the Needleman-Wunsch algorithm.

    array1 and array2 may be strings or any indexable sequences whose
    elements can be compared with ``==``. Nothing is printed.

    Returns a list ``[Scores, Pointers, [align1, align2]]``:
      Scores    -- (len(array1)+1) x (len(array2)+1) score matrix
      Pointers  -- matrix of the same shape holding the move that produced
                   each score: 2 = diagonal, -1 = horizontal, 1 = vertical
      align1/2  -- the aligned sequences as lists, with '-' marking gaps

    Rows (i) follow array1 and columns (j) follow array2. When several
    moves give the same score the one with the larger pointer value is
    kept (diagonal, then vertical, then horizontal).

    This is modified from a Perl implementation in the book BLAST by Korf, et al.
    """
    DIAGONAL, HORIZONTAL, VERTICAL = 2, -1, 1
    ncols = len(array2) + 1

    # first row: only horizontal moves; first column of later rows: vertical
    Scores = [[gap * j for j in range(ncols)]]
    Pointers = [[HORIZONTAL] * ncols]
    for i in range(1, len(array1) + 1):
        Scores.append([gap * i] + [0] * (ncols - 1))
        Pointers.append([VERTICAL] + [0] * (ncols - 1))

    # fill the inner cells row by row
    for i, item1 in enumerate(array1, 1):
        for j, item2 in enumerate(array2, 1):
            step = match if item1 == item2 else mismatch
            candidates = (
                (Scores[i-1][j-1] + step, DIAGONAL),
                (Scores[i][j-1] + gap, HORIZONTAL),
                (Scores[i-1][j] + gap, VERTICAL),
            )
            # highest score wins; ties fall to the larger pointer value
            Scores[i][j], Pointers[i][j] = max(candidates)

    # walk back from the bottom-right cell, collecting columns in reverse
    i, j = len(array1), len(array2)
    reversed1, reversed2 = [], []
    while (i, j) != (0, 0):
        move = Pointers[i][j]
        if move == DIAGONAL:
            i -= 1
            j -= 1
            reversed1.append(array1[i])
            reversed2.append(array2[j])
        elif move == HORIZONTAL:
            j -= 1
            reversed1.append('-')
            reversed2.append(array2[j])
        else:
            i -= 1
            reversed1.append(array1[i])
            reversed2.append('-')

    reversed1.reverse()
    reversed2.reverse()
    return [Scores, Pointers, [reversed1, reversed2]]
# =============================================================================
class SyntenicRegion():
    """Plain record for one gene of a syntenic region.

    Stores the five constructor arguments unchanged as attributes of the
    same name.
    """

    def __init__(self, geneid, scaffold, strand, startpos, endpos):
        self.geneid = geneid
        self.scaffold = scaffold
        self.strand = strand
        self.startpos = startpos
        self.endpos = endpos


# =============================================================================
class OrthoCluster():
    """A cluster identified by ``id`` that groups syntenic regions by index.

    ``syntenic_regions`` maps an index (the parser in this module passes
    the species column number) to the list of regions added for it.
    """

    def __init__(self, clusterid):
        self.id = clusterid
        self.syntenic_regions = {}

    def add_syntenic_region(self, sr, index):
        """Append region *sr* to the list kept for *index*."""
        # setdefault replaces dict.has_key(), which Python 3 removed
        self.syntenic_regions.setdefault(index, []).append(sr)
class OrthoMCLCluster():
    """One cluster line of an OrthoMCL output file.

    The line has a description and a gene list separated by one tab. The
    cluster name is the lower-cased description text before its first
    ``(``. The gene list is whitespace-separated ``geneid(species)``
    entries.

    geneHash maps gene id -> species; speciesHash maps species -> list of
    gene ids in input order.
    """

    def __init__(self, line):
        descr, genedefs = line.split("\t")
        self.name = descr[:descr.index('(')].lower()
        self.geneHash = {}
        self.speciesHash = {}
        for genedef in genedefs.split():
            paren = genedef.index('(')
            geneid = genedef[:paren]
            species = genedef[paren+1:-1]
            self.geneHash[geneid] = species
            self.speciesHash.setdefault(species, []).append(geneid)

    def add_gene(self, geneid, species):
        """Add a gene to the cluster; a gene id already present is not listed twice.

        A species that is new to the cluster gets its own list (this used
        to raise KeyError).
        """
        if geneid not in self.geneHash:
            self.speciesHash.setdefault(species, []).append(geneid)
        self.geneHash[geneid] = species

    def get_name(self):
        return self.name

    def get_count(self):
        return len(self.geneHash)

    def get_gene_hash(self):
        return self.geneHash

    def get_species_hash(self):
        return self.speciesHash

    def to_s(self):
        """Write the cluster as one OrthoMCL-like line to stdout (returns None)."""
        sys.stdout.write(self.name + "(" + str(len(self.geneHash)) + " genes, " + str(len(self.speciesHash)) + ")\t")
        genes = [geneid + "(" + species + ")" for geneid, species in self.geneHash.items()]
        sys.stdout.write(" ".join(genes))
        sys.stdout.write("\n")
class PfamDomain():
    """One domain hit: a whitespace-separated line mapped onto named columns.

    ``attributes`` lists the column names in order; ``values`` holds the
    tokens of the line in the same order, all kept as strings.
    """

    def __init__(self, line):
        self.attributes = [
            'seq_id', 'alignment_start', 'alignment_end',
            'envelope_start', 'envelope_end', 'hmm_acc', 'hmm_name',
            'type', 'hmm_start', 'hmm_end', 'hmm_length', 'bit_score',
            'E-value', 'significance', 'clan',
        ]
        self.values = line.strip().split()

    def get_attr(self, name):
        """Return the string stored for column *name*, or "" for an unknown name.

        Raises IndexError when the line had too few tokens for a known
        column.
        """
        try:
            column = self.attributes.index(name)
        except ValueError:
            return ""
        return self.values[column]

    def covers(self, position):
        """Return True if *position* lies within alignment_start..alignment_end (inclusive)."""
        position = int(position)
        first = int(self.get_attr('alignment_start'))
        # the end column is only parsed when the start check passed
        return first <= position and position <= int(self.get_attr('alignment_end'))
# =============================================================================
def correlate(x, y, method="pearson"):
  """
  Run R's cor.test on two numeric vectors and return (coefficient, p-value).

  x, y   -- sequences of numbers; both are converted to R float vectors
  method -- passed through unchanged to cor.test as its `method` argument

  The computation is delegated to R through rpy2 (module-level handle R).
  NOTE(review): which method names are accepted is decided by R's
  cor.test, not by this function -- confirm against the R documentation.
  """
  xr = rpy2.robjects.FloatVector(x)
  yr = rpy2.robjects.FloatVector(y)
  res = R['cor.test'](xr, yr, method=method)
  # the result is indexed by name; the scalar sits at [0][0] of each subset
  p = res.subset('p.value')[0][0]
  cor = res.subset('estimate')[0][0]
  return cor, p

# =============================================================================
def average(array):
  """Return numpy.average(array), called without weights."""
  return numpy.average(array)

# =============================================================================
def median(array):
  """Return numpy.median(array)."""
  return numpy.median(array)


# =============================================================================
def stdev(array):
  """Return numpy.std(array), called with numpy's default arguments."""
  return numpy.std(array)
def statusbar(current, total, message="", width=40):
    """
    Redraws a one-line progress bar on stderr.

    current : number of items processed so far
    total   : number of items overall
    message : optional text, shown in square brackets behind the bar
    width   : number of characters between the 0% and 100% labels

    The line starts with a carriage return so that successive calls
    overwrite each other; a newline is written once the bar is full.
    """
    if total > 0:
        # clamp, so that current > total neither overflows the bar nor
        # suppresses the final newline
        progress = min(max(1.0 * current / total, 0.0), 1.0)
    else:
        # nothing to process: show a full bar instead of dividing by zero
        progress = 1.0
    if message != "": message = "[" + message + "]"
    progressbar = ("=" * int(progress * width)).ljust(width)
    sys.stderr.write("\r 0% " + progressbar + " 100% " + message)
    if progress == 1.0: sys.stderr.write("\n")
==================================================================== 60 | # ============================================================================= 61 | def main( args ): 62 | q2hits = blastout.get_query_hash(args['blastoutfile']) 63 | for qid, blasthits in q2hits.iteritems(): 64 | print blasthits[0].to_s() 65 | 66 | # ============================================================================= 67 | args = handle_arguments() 68 | main( args ) 69 | 70 | -------------------------------------------------------------------------------- /python/blast/parse-best-blast-hit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | import math # match functions 8 | from low import * # custom functions, written by myself 9 | 10 | 11 | 12 | # ============================================================================= 13 | def show_help( ): 14 | """ displays the program parameter list and usage information """ 15 | stdout( "usage: " + sys.argv[0] + " -b [-f ]" ) 16 | stdout( " " ) 17 | stdout( " option description" ) 18 | stdout( " -h help (this text here)" ) 19 | stdout( " -f path to the fasta file containing the record ids." ) 20 | stdout( " -b path to the blast.best-hit file of swiss-prot" ) 21 | stdout( " " ) 22 | 23 | sys.exit(1) 24 | 25 | 26 | # ============================================================================= 27 | def handle_arguments(): 28 | """ verifies the presence of all necessary arguments and returns the data dir """ 29 | if len ( sys.argv ) == 1: 30 | stderr( "no arguments provided." ) 31 | show_help() 32 | 33 | try: # check for the right arguments 34 | keys, values = getopt.getopt( sys.argv[1:], "hb:f:" ) 35 | except getopt.GetoptError: 36 | stderr( "invalid arguments provided." 
def parse_best_blast_hits( blastbesthitfile, recordfile ):
    """
    Prints hit text and e-value of selected lines of a best-hit file.

    recordfile       : text file, one record id per line
    blastbesthitfile : whitespace-separated best-hit table; a line is
                       reported if its first column is one of the record ids

    For each reported line three lines are printed: the id, the columns from
    the 11th onwards joined by blanks (minus their first character), and the
    5th column as e-value, followed by an empty line.
    Blank lines of the best-hit file are skipped.
    """
    # a set gives constant-time lookups; the handle is closed before the
    # second file is opened (it used to be overwritten while still open)
    fo = open( recordfile, 'r' )
    try:
        records = set( line.strip() for line in fo )
    finally:
        fo.close()

    fo = open( blastbesthitfile, 'r' )
    try:
        for line in fo:
            columns = line.split()
            if not columns or columns[0] not in records: continue
            print( columns[0] )
            # NOTE(review): [1:] presumably drops a leading marker character
            # of the hit description - confirm against the file format
            print( " hit : " + ' '.join(columns[10:])[1:] )
            print( " evalue: " + columns[4] + " \n" )
    finally:
        fo.close()
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys, getopt, string 4 | from Bio.Seq import Seq 5 | from Bio.Blast import NCBIXML 6 | from Bio.Alphabet import IUPAC 7 | 8 | #============================================================================== 9 | def show_help(): 10 | print """%s parses BLASTX XML output to STDOUT 11 | 12 | Options: 13 | -f:\tBLASTX output in XML format 14 | -n:\tnumber of best hits to be parsed (default: 1) 15 | -e:\tmaximum e-value to accept hits (default: 1e-5) 16 | 17 | What this program does: 18 | It takes the best hit's start and endposition from BLAST, applies it to the sequence in your query (e.g. the CAP3-output), 19 | and translates to the left resp. right from the start resp. end of your CAP3-output, until a Start-orStopcodon appears. 20 | """ % sys.argv[0] 21 | 22 | sys.exit(1) 23 | 24 | 25 | # ============================================================================= 26 | def handle_arguments(): 27 | """ verifies the presence of all necessary arguments and returns the data dir """ 28 | if len ( sys.argv ) == 1: 29 | sys.stderr.write( "no arguments provided.\n" ) 30 | show_help() 31 | 32 | try: # check for the right arguments 33 | keys, values = getopt.getopt( sys.argv[1:], "hf:n:e:" ) 34 | except getopt.GetoptError: 35 | sys.stderr.write( "invalid arguments provided.\n" ) 36 | show_help() 37 | 38 | args = {} 39 | args['numhits'] = 1 40 | args['evalue'] = float('1e-5') 41 | for key, value in keys: 42 | if key == '-f': args['blastfile'] = value 43 | if key == '-n': args['numhits'] = int(value) 44 | if key == '-e': args['evalue'] = float(value) 45 | 46 | if not args.has_key('blastfile'): 47 | sys.stderr.write( "blastx XML file argument missing.\n" ) 48 | show_help() 49 | elif not os.path.exists( args.get('blastfile') ) or not os.path.isfile( args.get('blastfile') ): 50 | sys.stderr.write( "blastx XML file does not exist.\n" ) 51 | show_help() 52 | 53 | 
def main(args):
    """
    Prints a tab-separated table of the best hits of a BLAST XML file.

    args is the dict built by handle_arguments:
      blastfile : path of the XML file
      numhits   : maximum number of alignments reported per query
      evalue    : hits of a query are reported until the first one whose
                  e-value exceeds this threshold

    Only the first HSP of each alignment is used. The query end position is
    derived from the ungapped query length times three.
    """
    header = ['query', 'hit', 'frame', 'query_startpos', 'query_endpos', 'subject_startpos', 'subject_endpos', 'evalue', 'score']
    print( '# ' + "\t".join(header) )
    numhits = max(args.get('numhits'), 0)
    XML = open( args.get('blastfile') )
    try:
        for record in NCBIXML.parse(XML):
            # slicing instead of alignments.pop(0): a query with fewer than
            # numhits alignments (or none at all) used to raise IndexError
            for hit in record.alignments[:numhits]:
                hsp = hit.hsps[0]
                if hsp.expect > args.get('evalue'): break
                print( "\t".join([record.query, hit.title.split()[0],
                    str(hsp.frame[0]),
                    str(hsp.query_start),
                    str(hsp.query_start -1+ len(hsp.query.replace('-', ''))*3),
                    str(hsp.sbjct_start),
                    str(hsp.sbjct_start -1+ len(hsp.sbjct)),
                    str(hsp.expect),
                    str(hsp.score)]) )
    finally:
        XML.close()
def handle_arguments():
    """ reads -b and -i from the command line, checks both files and returns them in a dict """
    if len( sys.argv ) == 1:
        stderr( "no arguments provided." )
        show_help()

    try: # getopt rejects unknown options
        keys, values = getopt.getopt( sys.argv[1:], "hi:b:" )
    except getopt.GetoptError:
        stderr( "invalid arguments provided." )
        show_help()

    option2name = { '-b': 'in-blastout', '-i': 'in-ids' }
    required = ( 'in-blastout', 'in-ids' )

    args = { 'verbose': 0 }
    for key, value in keys:
        if key in option2name: args[ option2name[key] ] = value

    # first make sure both were given, then that both exist
    for name in required:
        if name not in args:
            stderr( name + " file missing." )
            show_help()
    for name in required:
        if not file_exists( args.get(name) ):
            stderr( name + " file does not exist." )
            show_help()

    return args
def reduce_blastout( args, rmids ):
    """
    Prints the blastout file without the lines that mention a removed id.

    args  : dict; args['in-blastout'] is the path of the tab-separated
            blastout file
    rmids : dict (or set) of the ids to remove

    A line is dropped if its first or its second column is contained in
    rmids; empty lines are dropped as well. Everything else is printed
    unchanged (minus trailing whitespace).
    """
    fo = open( args.get('in-blastout') )
    try:
        for line in fo:
            line = line.rstrip()
            if len(line) == 0: continue
            first_id, second_id = line.split("\t")[0:2]
            if first_id in rmids or second_id in rmids: continue
            print( line )
    finally:
        # close the handle even if a malformed line raises
        fo.close()
def aln_is_conserved(file, min=0.85):
    """
    Tells whether an alignment is more similar than a threshold.

    file : path of the alignment file
    min  : threshold the reported similarity has to exceed

    Runs "~/bin/t-coffee -other_pg seq_reformat -in <file> -output sim" and
    reads the last whitespace-separated field of its output as similarity.
    Returns 1 if that value is greater than min, else 0.
    Raises ValueError if the program printed nothing.
    """
    import subprocess # local import: nothing else in this script needs it

    # argument list instead of a shell string: file names with blanks or
    # shell metacharacters are passed on literally
    command = [ os.path.expanduser("~/bin/t-coffee"), "-other_pg", "seq_reformat", "-in", file, "-output", "sim" ]
    process = subprocess.Popen( command, stdout=subprocess.PIPE, universal_newlines=True )
    out = process.communicate()[0]
    fields = out.split()
    if not fields:
        raise ValueError( "t-coffee reported no similarity for " + file )
    # NOTE(review): the sim report appears to contain percentages - confirm
    # that callers pass `min` on the same scale
    identity = float(fields[-1])
    if identity > min: return 1
    else: return 0
| 82 | # ============================================================================= 83 | args = handle_arguments() 84 | main( args ) 85 | 86 | -------------------------------------------------------------------------------- /python/fasta/create-clusters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | import math # match functions 8 | from low import * # custom functions, written by myself 9 | 10 | # ============================================================================= 11 | def show_help( ): 12 | """ displays the program parameter list and usage information """ 13 | stdout( "usage: " + sys.argv[0] + " -f " ) 14 | stdout( " " ) 15 | stdout( " option description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f ortholog cluster flat file to import" ) 18 | stdout( " -p prefix to put in front of the number" ) 19 | stdout( " " ) 20 | sys.exit(1) 21 | 22 | # ============================================================================= 23 | def handle_arguments(): 24 | """ verifies the presence of all necessary arguments and returns the data dir """ 25 | if len ( sys.argv ) == 1: 26 | stderr( "no arguments provided." ) 27 | show_help() 28 | 29 | try: # check for the right arguments 30 | keys, values = getopt.getopt( sys.argv[1:], "hf:p:" ) 31 | except getopt.GetoptError: 32 | stderr( "invalid arguments provided." ) 33 | show_help() 34 | 35 | args = {} 36 | args['prefix'] = 'orth.cluster.' 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | if key == '-p': args['prefix'] = value 40 | 41 | if not args.has_key('file'): 42 | stderr( "import file argument missing." 
def parse_fasta_file(file):
    """
    Prints "<label>TAB<length>" for one fasta file.

    file : path of the fasta file

    length is the summed length of all non-header lines (whitespace at both
    line ends excluded). The label is the file name up to its first "." and,
    within that, up to its first "_".
    """
    length = 0
    fo = open(file)
    try:
        for line in fo:
            line = line.strip()
            if not line.startswith(">"): length += len(line)
    finally:
        fo.close()
    # cut the file name only: applied to the whole path, a "." or "_" in a
    # directory name (or a leading "./") produced an empty or wrong label
    base = os.path.basename(file)
    if base.count(".") > 0: base = base[:base.index(".")]
    if base.count("_") > 0: base = base[:base.index("_")]
    print( base + "\t" + str(length) )
def parse_fasta_file(file):
    """
    Prints "<id>TAB<length>" for every sequence of a fasta file.

    file : path of the fasta file

    The id is the header text up to its first blank. Sequences are reported
    in the order of their first appearance; if an id occurs more than once,
    the length of its last record is reported. Text in front of the first
    header is ignored.
    """
    lengthHash = {}
    order = []       # ids in input order, dict order is not reliable
    seq_id = None
    fo = open(file)
    try:
        for line in fo:
            line = line.strip()
            if line.startswith(">"):
                seq_id = line[1:].split(" ", 1)[0]
                if seq_id not in lengthHash: order.append(seq_id)
                lengthHash[seq_id] = 0
            elif seq_id is not None:
                lengthHash[seq_id] += len(line)
    finally:
        fo.close()
    for seq_id in order:
        print( seq_id + "\t" + str(lengthHash[seq_id]) )
def main():
    """
    Reports the fasta files of a folder whose shortest sequence passes the
    length test given on the command line.

    For every file of the folder ending in ".fasta" the value returned by
    parse_fasta_file is handed to test_threshold; on success the file name
    and that value are printed, separated by a tab.
    """
    inFolder, inCut, inThreshold = plausi()
    for filename in os.listdir(inFolder):
        if not filename.endswith(".fasta"): continue
        # os.listdir returns bare names: without the folder prefix the files
        # were only found when the folder was the current directory
        minlength = parse_fasta_file( os.path.join(inFolder, filename) )
        if test_threshold(minlength, inCut, inThreshold):
            print( filename + "\t" + str(minlength) )
regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | import anydbm 9 | 10 | # ============================================================================= 11 | def show_help( ): 12 | """ displays the program parameter list and usage information """ 13 | stdout( "usage: " + sys.argv[0] + " -f " ) 14 | stdout( " " ) 15 | stdout( " option description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f fasta file" ) 18 | stdout( " " ) 19 | 20 | sys.exit(1) 21 | 22 | # ============================================================================= 23 | def handle_arguments(): 24 | """ verifies the presence of all necessary arguments and returns the data dir """ 25 | if len ( sys.argv ) == 1: 26 | stderr( "no arguments provided." ) 27 | show_help() 28 | 29 | try: # check for the right arguments 30 | keys, values = getopt.getopt( sys.argv[1:], "hf:m:" ) 31 | except getopt.GetoptError: 32 | stderr( "invalid arguments provided." ) 33 | show_help() 34 | 35 | args = {} 36 | for key, value in keys: 37 | if key == '-f': args['aln'] = value 38 | 39 | if not args.has_key('aln'): 40 | stderr( "fasta file missing." ) 41 | show_help() 42 | if not file_exists( args.get('aln') ): 43 | stderr( "fasta file does not exist." 
def parse_fasta_file(file):
    """
    Reports for every fasta sequence whether it starts with methionine.

    file : path of the fasta file

    Prints "<id>TAB1" if the first character of the first non-empty sequence
    line is "M" (either case), "<id>TAB0" otherwise. The id is the header
    text up to its first blank. Records without any sequence are not
    reported; text in front of the first header is ignored.
    """
    seq_id = None
    reported = True   # nothing to report until a header has been seen
    fo = open(file)
    try:
        for line in fo:
            line = line.strip()
            if line.startswith(">"):
                seq_id = line[1:].split(" ", 1)[0]
                reported = False
            elif line and not reported:
                # empty lines are passed over: indexing an empty first line
                # used to raise IndexError
                print( "%s\t%d" % (seq_id, 1 if line[0].upper() == 'M' else 0) )
                reported = True
    finally:
        fo.close()
def main( args ):
    """
    Prints the alignment in the layout expected as swapsc input.

    args['m0']  : path of the paml M0 out file; its first line that consists
                  of two integers only supplies the number of species and
                  the alignment length
    args['aln'] : path of the nucleotide alignment (fasta)

    Output: one line with both numbers, an empty line, then the alignment
    with the ">" of the header lines removed.
    Raises ValueError if the M0 file contains no such line.
    """
    dimensions = re.compile(r"\s+\d+\s+\d+\s*$")
    nspecies, length = None, None
    fo = open( args.get('m0') )
    try:
        # iterating stops at the end of the file; polling readline() never
        # terminated when no line matched
        for line in fo:
            if dimensions.match(line):
                nspecies, length = line.split()[0:2]
                break
    finally:
        fo.close()
    if nspecies is None:
        raise ValueError( "no alignment dimensions found in " + str(args.get('m0')) )

    fo = open( args.get('aln') )
    try:
        print( " " + nspecies + " " + length + "\n" )
        for line in fo:
            line = line.rstrip()
            if line.startswith(">"): print( line[1:] )
            else: print( line )
    finally:
        fo.close()
def handle_arguments():
    """ reads -f from the command line and returns it as args['fastafile'] once the file was found """
    if len( sys.argv ) == 1:
        stderr( "no arguments provided." )
        show_help()

    try: # getopt rejects unknown options
        keys, values = getopt.getopt( sys.argv[1:], "hf:" )
    except getopt.GetoptError:
        stderr( "invalid arguments provided." )
        show_help()

    # the last -f wins if the option is given repeatedly
    args = dict( ('fastafile', value) for key, value in keys if key == '-f' )

    if 'fastafile' not in args:
        stderr( "fasta file missing." )
        show_help()
    if not file_exists( args.get('fastafile') ):
        stderr( "fasta file does not exist." )
        show_help()

    return args
# ============================================================================= 74 | 75 | args = handle_arguments( ) 76 | main( args ) 77 | -------------------------------------------------------------------------------- /python/fasta/fasta2phylip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | 9 | # ============================================================================= 10 | def show_help( ): 11 | """ displays the program parameter list and usage information """ 12 | stdout( "usage: " + sys.argv[0] + " -f " ) 13 | stdout( " " ) 14 | stdout( " option description" ) 15 | stdout( " -h help (this text here)" ) 16 | stdout( " -f fasta file" ) 17 | stdout( " " ) 18 | 19 | sys.exit(1) 20 | 21 | # ============================================================================= 22 | def handle_arguments(): 23 | """ verifies the presence of all necessary arguments and returns the data dir """ 24 | if len ( sys.argv ) == 1: 25 | stderr( "no arguments provided." ) 26 | show_help() 27 | 28 | try: # check for the right arguments 29 | keys, values = getopt.getopt( sys.argv[1:], "hf:m:" ) 30 | except getopt.GetoptError: 31 | stderr( "invalid arguments provided." ) 32 | show_help() 33 | 34 | args = {} 35 | for key, value in keys: 36 | if key == '-f': args['aln'] = value 37 | 38 | if not args.has_key('aln'): 39 | stderr( "fasta file missing." ) 40 | show_help() 41 | if not file_exists( args.get('aln') ): 42 | stderr( "fasta file does not exist." 
def get_sequences(file):
    """Read a FASTA alignment and flatten it for PHYLIP-style output.

    Every header line (starting with ">") contributes a newline, the
    identifier (header text up to the first space) and another newline;
    sequence lines are appended as they are, without line breaks.

    Returns a tuple ``(text, seqcount, alnlength)`` where ``seqcount`` is the
    number of header lines and ``alnlength`` is the summed length of the
    sequence lines that belong to the first record.
    """
    seqcount, alnlength = 0, 0
    chunks = []  # joined once at the end instead of growing a string
    # "with" closes the handle even if reading fails half-way
    with open(file) as fo:
        for line in fo:
            line = line.rstrip()
            if line.startswith(">"):
                name = line[1:].split(" ", 1)[0]
                chunks.append("\n" + name + "\n")
                seqcount += 1
            else:
                chunks.append(line)
                # only the first record defines the alignment length
                if seqcount == 1:
                    alnlength += len(line)
    return "".join(chunks), seqcount, alnlength
def main( args ):
    """Split args['fasta'] into numbered files named ``<fasta>.NNNNNN``.

    With args['i'] the input is divided into that many files (records per
    file = ceil(total / i)); otherwise every file receives args['n']
    records.  If both are present, 'i' takes precedence.  A summary line is
    reported through infomsg() at the end.
    """
    fasta = args.get('fasta')

    # Count records exactly the way the split loop below recognises them
    # (lines starting with ">").  Doing this in Python also avoids pasting
    # the file name into a shell command line.
    with open(fasta) as handle:
        total = sum(1 for line in handle if line.startswith(">"))

    if 'i' in args:
        if args['i'] < 1:
            stderr( "number of files (-i) must be at least 1." )
            sys.exit(1)
        cut = int(math.ceil( 1.0 * total / args['i'] ))
    else:
        cut = args.get('n')
        if cut < 1:
            stderr( "number of sequences per file (-n) must be at least 1." )
            sys.exit(1)

    seqcount = 0
    filecount = 1
    fw = open( fasta + '.' + add_leading_zeroes(filecount, 6), 'w' )
    try:
        with open(fasta) as handle:
            for line in handle:
                if line.startswith(">"):
                    seqcount += 1
                    # start a new output file before record cut+1, 2*cut+1, ...
                    if seqcount > 1 and (seqcount - 1) % cut == 0:
                        filecount += 1
                        fw.close()
                        fw = open( fasta + '.' + add_leading_zeroes(filecount, 6), 'w' )
                fw.write(line)
    finally:
        # close() flushes; runs even if reading or writing fails
        fw.close()

    infomsg( "total.seq.count: %s | split.count: %s | file.count: %s" %(total, cut, filecount) )
def main( args ):
    """Print the GC fraction of the DNA FASTA file args['file'].

    Header lines (starting with ">") are skipped; A, T, G and C are counted
    case-insensitively on all other lines and every other character is
    ignored.  The output is one line, ``<base>\\t<gc>``, where <base> is the
    given path cut at its first "." and then at its first "_".  A file
    without any A/T/G/C yields a GC value of 0.0 instead of a division error.
    """
    counts = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
    # "with" makes sure the handle is released again
    with open(args['file']) as fo:
        for line in fo:
            if line.startswith(">"): continue
            line = line.rstrip().upper()
            for char in ['A', 'T', 'G', 'C']:
                counts[char] += line.count(char)

    total = sum(counts.values())
    if total > 0:
        gc = 1.0 * (counts['G'] + counts['C']) / total
    else:
        gc = 0.0  # nothing countable: avoid ZeroDivisionError

    base = args['file']
    # NOTE(review): the cut is applied to the whole path, so "./x.fa" gives an
    # empty label -- kept as is because downstream users may rely on it.
    base = base.split(".", 1)[0]
    base = base.split("_", 1)[0]

    print(base + "\t" + str(gc))
def handle_arguments():
    """Parse the command line and return the options as a dict.

    Recognised options are -h, -i <id file> and -d <directory>.  The returned
    dict holds 'idfile' and 'dir'; 'dir' defaults to './' and always ends
    with a slash.  On any problem a message is written via stderr() and
    show_help() is called, which terminates the program.
    """
    if len( sys.argv ) == 1:
        stderr( "no arguments provided." )
        show_help()

    try:  # check for the right arguments
        keys, values = getopt.getopt( sys.argv[1:], "hi:d:" )
    except getopt.GetoptError:
        stderr( "invalid arguments provided." )
        show_help()

    args = {}
    for key, value in keys:
        if key == '-d': args['dir'] = value
        if key == '-i': args['idfile'] = value

    if 'dir' in args and not dir_exists( args.get('dir') ):
        stderr( "dir folder does not exist." )
        show_help()
    if 'dir' not in args: args['dir'] = './'
    if not args.get('dir').endswith('/'): args['dir'] = args.get('dir') + '/'

    if 'idfile' not in args:
        stderr( "id file missing." )
        show_help()
    if not file_exists( args.get('idfile') ):
        stderr( "id file does not exist" )
        # this call was missing its parentheses, so execution carried on
        # with a non-existent id file
        show_help()

    return args
(file hash) 5 | from low import * # collection of generic self-defined functions 6 | 7 | 8 | # ============================================================================= 9 | def show_help( ): 10 | """ displays the program parameter list and usage information """ 11 | stdout( "usage: " + sys.argv[0] + " -f -o " ) 12 | stdout( " " ) 13 | stdout( " option description" ) 14 | stdout( " -h help (this text here)" ) 15 | stdout( " -f input fasta file" ) 16 | stdout( " -o output dbm file" ) 17 | stdout( " " ) 18 | sys.exit(1) 19 | 20 | # ============================================================================= 21 | def handle_arguments(): 22 | """ verifies the presence of all necessary arguments and returns the data dir """ 23 | if len ( sys.argv ) == 1: 24 | stderr( "no arguments provided." ) 25 | show_help() 26 | 27 | try: # check for the right arguments 28 | keys, values = getopt.getopt( sys.argv[1:], "hf:o:" ) 29 | except getopt.GetoptError: 30 | stderr( "invalid arguments provided." ) 31 | show_help() 32 | 33 | args = {} 34 | for key, value in keys: 35 | if key == '-f': args['fasta'] = value 36 | if key == '-o': args['out'] = value 37 | 38 | if not args.has_key('fasta'): 39 | stderr( "fasta file missing." ) 40 | show_help() 41 | if not file_exists( args.get('fasta') ): 42 | stderr( "fasta file does not exist." ) 43 | show_help() 44 | 45 | if not args.has_key('out'): 46 | stderr( "out file missing." 
def main( args ):
    """Index the FASTA file args['fasta'] into the dbm file args['out'].

    Each record is stored as id -> sequence, where the id is the first
    whitespace-delimited token after ">" and the sequence is the
    concatenation of the record's lines.  Records without an id or without
    sequence are skipped.  Progress is written to standard error.
    """
    import re  # local import: not listed in this script's own import block

    fasta = args.get('fasta')

    # number of header lines, used only for the progress percentage
    with open(fasta) as fo:
        total = sum(1 for line in fo if line.startswith('>'))

    def percent(done):
        # an input without any header must not raise ZeroDivisionError
        if total:
            return 100.0 * done / total
        return 0.0

    header = re.compile(r">(\S+)")
    added = 0
    key, parts = '', []
    DBM = anydbm.open( args.get('out'), 'c' )
    try:
        with open(fasta) as fo:
            for line in fo:
                line = line.rstrip()
                if line.startswith('>'):
                    if key != '' and parts:
                        added += 1
                        DBM[ key ] = ''.join(parts)
                        sys.stderr.write('\r\tindexing:\t%s\t%01.2f%%' %(added, percent(added)) )
                        sys.stderr.flush()
                    match = header.match(line)
                    # a header without an id gets an empty key and is skipped
                    key = match.group(1) if match else ''
                    parts = []
                elif line:
                    parts.append(line)
        # store the last record of the file
        if key != '' and parts:
            added += 1
            DBM[ key ] = ''.join(parts)
    finally:
        DBM.close()
    sys.stderr.write('\r\tindexing:\t%s\t%01.2f%%\ndone.\n' %(added, percent(added)) )
============================================================================= 11 | def show_help( ): 12 | """ displays the program parameter list and usage information """ 13 | stdout( "usage: " + sys.argv[0] + " -f ") 14 | stdout( " " ) 15 | stdout( " option description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f fasta file" ) 18 | stdout( " " ) 19 | sys.exit(1) 20 | 21 | # ============================================================================= 22 | def handle_arguments(): 23 | """ verifies the presence of all necessary arguments and returns the data dir """ 24 | if len ( sys.argv ) == 1: 25 | stderr( "no arguments provided." ) 26 | show_help() 27 | 28 | try: # check for the right arguments 29 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 30 | except getopt.GetoptError: 31 | stderr( "invalid arguments provided." ) 32 | show_help() 33 | 34 | args = {} 35 | for key, value in keys: 36 | if key == '-f': args['fasta'] = value 37 | 38 | if not args.has_key('fasta'): 39 | stderr( "fasta file argument missing." ) 40 | show_help() 41 | elif not file_exists( args.get('fasta') ): 42 | stderr( "fasta file does not exist." 
# the three DNA stop codons that get masked in the output
STOPCODONS = ("TAA", "TGA", "TAG")


# =============================================================================
def parse_fasta(file):
    """Read a DNA FASTA file and return {id: sequence} with stop codons masked.

    The id is the header text up to the first space.  Sequences are
    upper-cased and joined across lines *before* codons are examined, so the
    reading frame does not restart on every line of the file.  Sequence
    lines that appear before the first header are ignored.
    """
    chunks = {}
    name = None
    with open(file) as fo:
        for line in fo:
            line = line.strip()
            if line.startswith(">"):
                name = line[1:].split(" ", 1)[0]
                chunks[name] = []
            elif name is not None:
                chunks[name].append(line.upper())
    sequences = dict((name, "".join(parts)) for name, parts in chunks.items())
    return replace_stop_codons(sequences)

# =============================================================================
def replace_stop_codons(hash):
    """Replace every in-frame stop codon of each sequence with "---".

    Codons are read in frame 0 in steps of three; sequences are expected to
    be upper case.  The dict is updated in place and also returned.
    """
    for name, sequence in list(hash.items()):
        masked = []
        for i in range(0, len(sequence), 3):
            codon = sequence[i:i+3]
            masked.append("---" if codon in STOPCODONS else codon)
        # strings are immutable, so a new string is stored back
        hash[name] = "".join(masked)
    return hash


# =============================================================================
# === MAIN ====================================================================
# =============================================================================
def main( args ):
    """Print the stop-codon-masked sequences of args['fasta'] as FASTA.

    Sequences are wrapped at 60 characters per line.
    """
    width = 60
    for name, sequence in parse_fasta(args['fasta']).items():
        print(">" + name)
        for i in range(0, len(sequence), width):
            print(sequence[i:i+width])
# =============================================================================
def handle_arguments():
    """Parse the command line and return the options as a dict.

    Recognised options are -h, -f <fasta file> and -m <mapping file>.  The
    returned dict holds 'file' and 'mapping'.  On any problem a message is
    written via stderr() and show_help() is called, which terminates the
    program.
    """
    if len( sys.argv ) == 1:
        stderr( "no arguments provided." )
        show_help()

    try:  # check for the right arguments
        keys, values = getopt.getopt( sys.argv[1:], "hf:m:" )
    except getopt.GetoptError:
        stderr( "invalid arguments provided." )
        show_help()

    args = {}
    for key, value in keys:
        if key == '-f': args['file'] = value
        if key == '-m': args['mapping'] = value

    if 'file' not in args:
        stderr( "fasta file argument missing." )
        show_help()
    elif not file_exists( args.get('file') ):
        stderr( "fasta file does not exist." )
        show_help()

    if 'mapping' not in args:
        stderr( "mapping file argument missing." )
        show_help()
    # the existence test used to look at the fasta path a second time
    elif not file_exists( args.get('mapping') ):
        stderr( "mapping file does not exist." )
        show_help()

    return args


# =============================================================================
def get_mapping(mfile):
    """Read the tab-delimited mapping file into {compiled regex: replacement}.

    Lines that do not consist of exactly two tab-separated columns (blank
    lines included) are skipped; a blank line no longer ends the reading.
    """
    hash = {}
    with open( mfile, "r" ) as fo:
        for line in fo:
            line = line.rstrip()
            columns = line.split("\t")
            if len(columns) != 2: continue
            regex, replacement = columns
            hash[re.compile(regex)] = replacement
    return hash

# =============================================================================
def apply_replacement(idline, maphash):
    """Rewrite a FASTA header line with the first mapping regex that matches.

    The text after the leading ">" is searched with each regex of maphash;
    for the first hit, its first occurrence is substituted and the new
    header (with ">" restored) is returned.  Without a hit the line is
    returned unchanged.
    """
    text = idline[1:]
    for regex, replacement in maphash.items():
        if re.search(regex, text):
            return '>' + re.sub(regex, replacement, text, count=1)
    return idline

# =============================================================================
# === MAIN ====================================================================
# =============================================================================
def main( args ):
    """Print args['file'] with its header lines renamed via args['mapping']."""
    maphash = get_mapping( args.get('mapping') )

    with open( args.get('file') ) as fo:
        for line in fo:
            line = line.rstrip()
            if line.startswith(">"): line = apply_replacement(line, maphash)
            print(line)
============================================================================= 10 | def show_help( ): 11 | """ displays the program parameter list and usage information """ 12 | stdout( "usage: " + sys.argv[0] + " -s -k \"regex\" -v \"regex\"" ) 13 | stdout( " " ) 14 | stdout( " option description" ) 15 | stdout( " -h help (this text here)" ) 16 | stdout( " -s stockholm file" ) 17 | stdout( " -k regular expression for the key" ) 18 | stdout( " -v regular expression for the value" ) 19 | stdout( " " ) 20 | sys.exit(1) 21 | 22 | # ============================================================================= 23 | def handle_arguments(): 24 | """ verifies the presence of all necessary arguments and returns the data dir """ 25 | if len ( sys.argv ) == 1: 26 | stderr( "no arguments provided." ) 27 | show_help() 28 | 29 | try: # check for the right arguments 30 | keys, values = getopt.getopt( sys.argv[1:], "hs:k:v:" ) 31 | except getopt.GetoptError: 32 | stderr( "invalid arguments provided." ) 33 | show_help() 34 | 35 | args = {} 36 | for key, value in keys: 37 | if key == '-s': args['stockholm'] = value 38 | if key == '-k': args['keyregex'] = re.compile(value + '(.*)$' ) 39 | if key == '-v': args['valueregex'] = re.compile(value + '(.*)$' ) 40 | 41 | if not args.has_key('keyregex'): 42 | stderr( "key regex missing." ) 43 | show_help() 44 | 45 | if not args.has_key('valueregex'): 46 | stderr( "value regex missing." ) 47 | show_help() 48 | 49 | if not args.has_key('stockholm'): 50 | stderr( "stockholm file missing." ) 51 | show_help() 52 | if not file_exists( args.get('stockholm') ): 53 | stderr( "stockholm file does not exist." 
) 54 | show_help() 55 | 56 | return args 57 | 58 | 59 | # ============================================================================= 60 | # ============================================================================= 61 | def main( args ): 62 | 63 | fo = open( args.get('stockholm') ) 64 | kre = args.get('keyregex') 65 | vre = args.get('valueregex') 66 | key, value = '', '' 67 | for line in fo: 68 | if re.search( kre, line ): 69 | if key != '' and value != '': 70 | print ">%s" % key 71 | print value 72 | key, value = '', '' 73 | key = re.search( kre, line ).group(1).strip() 74 | if re.search( vre, line ): 75 | value = re.search( vre, line ).group(1).strip() 76 | fo.close() 77 | if key != '' and value != '': 78 | print ">%s" % key 79 | print value 80 | 81 | 82 | 83 | # ============================================================================= 84 | # === MAIN ==================================================================== 85 | # ============================================================================= 86 | 87 | args = handle_arguments( ) 88 | main( args ) -------------------------------------------------------------------------------- /python/fasta/translatedprot_from_gb_to_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | from low import * # custom functions, written by myself 7 | 8 | 9 | # ============================================================================= 10 | def get_translatedfasta_from_gb( file ): 11 | """ 12 | """ 13 | 14 | def write_output( source, hash ): 15 | L = [ ">",hash.get('protein_id'),"|",hash.get('db_xref')] 16 | if hash.has_key('product'): L.append("|"+hash.get('product')) 17 | L.append(" ("+source+")") 18 | print string.join(L,'') 19 | print hash.get('translation') 20 | 21 | fo = open(file) 22 | # read 
general infos 23 | source = '' 24 | for line in fo: 25 | if re.search('FEATURES',line): break 26 | if re.match('SOURCE',line): 27 | source = re.search('SOURCE\s+(.*)\n',line).group(1) 28 | 29 | # read gene infos 30 | hash = {} 31 | hit = 0 32 | for line in fo: 33 | if not re.match(' ',line): 34 | if len(hash) > 0: write_output( source, hash ) 35 | hash = {} 36 | hit = 0 37 | if re.match(' CDS',line): hit = 1 38 | 39 | if hit: 40 | # catch everything except translation sequence 41 | if re.search('/(\S+)=".*"',line): 42 | hash[re.search('/(\S+)=".*"',line).group(1)] = re.search('/\S+="(.*)"',line).group(1) 43 | # catch translation sequence 44 | if re.search('/translation=',line): 45 | hash['translation'] = re.search('/translation="(.*)\n',line).group(1) 46 | elif hash.has_key('translation'): hash['translation'] += re.search("([a-zA-Z]+)",line).group(1) 47 | if len(hash) > 0: write_output( source, hash ) 48 | fo.close() 49 | 50 | # ============================================================================= 51 | # === MAIN ==================================================================== 52 | # ============================================================================= 53 | 54 | if len( sys.argv ) == 1: 55 | print "no arguments provided. you need to specify the gb file(s) to parse." 
56 | sys.exit(1) 57 | 58 | for file in sys.argv[1:]: 59 | if not file_exists(file): 60 | print "gb file not found (or is a dir):", file 61 | continue 62 | get_translatedfasta_from_gb( file ) -------------------------------------------------------------------------------- /python/fasta/uniprot-dat-to-fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | 9 | 10 | # ============================================================================= 11 | def show_help( ): 12 | """ displays the program parameter list and usage information """ 13 | stdout( "usage: " + sys.argv[0] + " -f " ) 14 | stdout( " " ) 15 | stdout( " option description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f uniprot dat file" ) 18 | stdout( " " ) 19 | sys.exit(1) 20 | 21 | # ============================================================================= 22 | def handle_arguments(): 23 | """ verifies the presence of all necessary arguments and returns the data dir """ 24 | 25 | if len ( sys.argv ) == 1: 26 | stderr( "no arguments provided." ) 27 | show_help() 28 | 29 | try: # check for the right arguments 30 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 31 | except getopt.GetoptError: 32 | stderr( "invalid arguments provided." 
) 33 | show_help() 34 | 35 | args = {} 36 | for key, value in keys: 37 | if key == '-f': args['datfile'] = value 38 | 39 | for key in ['datfile']: 40 | if key.endswith("file"): 41 | if not args_file_exists(args, key): show_help() 42 | elif key.endswith("dir"): 43 | if not args_dir_exists(args, key): show_help() 44 | return args 45 | 46 | # ============================================================================= 47 | def parse_until_doubleslash(fo): 48 | hash, end = {}, False 49 | line = fo.readline().strip() 50 | while not line.startswith("//"): 51 | if len(line) == 0: 52 | end = True 53 | break 54 | if len(line.split(" ", 1)[0]) != 2: 55 | key = "SEQ" 56 | value = line.strip().replace(" ", "") 57 | else: 58 | cols = [e.strip() for e in line.split(" ", 1)] 59 | if len(cols) != 2: 60 | line = fo.readline().strip() 61 | continue 62 | key, value = [e.strip() for e in line.split(" ", 1)] 63 | if not hash.has_key(key): hash[key] = "" 64 | if key != "SEQ" and len(hash[key]) > 0 and hash[key][-1] != " " and not value.startswith(" "): hash[key] += " " 65 | hash[key] += value 66 | line = fo.readline().strip() 67 | return hash, end 68 | 69 | # ============================================================================= 70 | # === MAIN ==================================================================== 71 | # ============================================================================= 72 | def main( args ): 73 | fo = open(args['datfile']) 74 | while 1: 75 | hash, end = parse_until_doubleslash(fo) 76 | if end: break 77 | print ">" + hash["ID"].split()[0] + " " + hash["OC"] 78 | print hash["SEQ"] 79 | fo.close() 80 | 81 | # ============================================================================= 82 | args = handle_arguments() 83 | main( args ) 84 | 85 | -------------------------------------------------------------------------------- /python/geneontology/go-enrichment.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/python 2 | import os, sys 3 | import rpy 4 | 5 | 6 | def usage(): 7 | print >> sys.stderr, "usage: " + sys.argv[0] + " universe-topGO.table testset.ids" 8 | sys.exit(1) 9 | 10 | 11 | def plausi(): 12 | if len(sys.argv) != 3: usage() 13 | inUniverse, inTestset = sys.argv[1:3] 14 | return inUniverse, inTestset 15 | 16 | 17 | def init_R(): 18 | R = rpy.r 19 | try: 20 | R.library('topGO') 21 | except: 22 | try: 23 | R.source("http://bioconductor.org/biocLite.R") 24 | R.biocLite('topGO') 25 | R.library('topGO') 26 | except: 27 | print "Problem importing R libraries." 28 | sys.exit() 29 | 30 | R('if(!isGeneric("GOFisherTestUnder")) setGeneric("GOFisherTestUnder", function(object) standardGeneric("GOFisherTestUnder"))') 31 | R('setMethod("GOFisherTestUnder", "classicCount", function(object) { contMat <- contTable(object); if(all(contMat == 0)) p.value <- 1 else p.value <- fisher.test(contMat, alternative = "less")$p.value; return(p.value) })') 32 | return R 33 | 34 | 35 | def main(): 36 | inUniverse, inTestset = plausi() 37 | R = init_R() 38 | R('GOmap = readMappings(file = "' + inUniverse + '")') 39 | R('refset = names(GOmap)') 40 | R('testset = scan(file="' + inTestset + '", what=character())') 41 | R('genes_of_interest = factor(as.integer(refset %in% testset))') 42 | R('names(genes_of_interest) <- refset') 43 | for ontology in ["MF", "BP", "CC"]: 44 | R('tgData = new("topGOdata", ontology = "' + ontology + '", allGenes = genes_of_interest, annot = annFUN.gene2GO, gene2GO = GOmap)') 45 | R('fisherRes = runTest(tgData, algorithm="classic", statistic="fisher")') 46 | R('fisherResCor = p.adjust(score(fisherRes), method="fdr")') 47 | R('weightRes = runTest(tgData, algorithm="weight01", statistic="fisher")') 48 | R('weightResCor = p.adjust(score(weightRes), method="fdr")') 49 | R('allRes = GenTable(tgData, classic=fisherRes, weight=weightRes, orderBy="weight", ranksOf="classic", topNodes=150)') 50 | R('allRes$fisher.FDR = fisherResCor[allRes$GO.ID]') 51 | 
R('allRes$weight.FDR = weightResCor[allRes$GO.ID]') 52 | R('write.csv(allRes, "topGO.over.Sig.' + ontology + '.csv")') 53 | 54 | R('tgData = new("topGOdata", ontology = "' + ontology + '", allGenes = genes_of_interest, annot = annFUN.gene2GO, gene2GO = GOmap)') 55 | R('test.stat <- new("classicCount", testStatistic = GOFisherTestUnder, name ="Fisher test underrepresentation")') 56 | R('fisherRes <- getSigGroups(tgData, test.stat)') 57 | R('fisherResCor = p.adjust(score(fisherRes), method="fdr")') 58 | R('test.stat <- new("weightCount", testStatistic = GOFisherTestUnder, name ="Fisher test underrepresentation")') 59 | R('weightRes <- getSigGroups(tgData, test.stat)') 60 | R('weightResCor = p.adjust(score(weightRes), method="fdr")') 61 | R('allRes = GenTable(tgData, classic=fisherRes, weight=weightRes, orderBy="weight", ranksOf="classic", topNodes=150)') 62 | R('allRes$fisher.FDR = fisherResCor[allRes$GO.ID]') 63 | R('allRes$weight.FDR = weightResCor[allRes$GO.ID]') 64 | R('write.csv(allRes, "topGO.under.Sig.' 
+ ontology + '.csv")') 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /python/geneontology/go-from-blastout.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | import anydbm # index databases (file hash) 9 | from Bio import SeqIO # biopython stuff, to parse fasta files for instance 10 | 11 | # ============================================================================= 12 | def show_help( ): 13 | """ displays the program parameter list and usage information """ 14 | stdout( "usage: " + sys.argv[0] + " -c -o " ) 15 | stdout( " " ) 16 | stdout( " option description" ) 17 | stdout( " -h help (this text here)" ) 18 | stdout( " -f blast.out file" ) 19 | stdout( " " ) 20 | 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | 40 | if not args.has_key('file'): 41 | stderr( "blast.out file missing." ) 42 | show_help() 43 | if not file_exists( args.get('file') ): 44 | stderr( "blast.out file does not exist." 
def parse_descr(text):
    """Extract GO ids and their quoted descriptions from a description string.

    Every occurrence of ``GO:<digits> "<description>" evidence`` contributes
    one entry.

    Args:
        text: the description column of one input line.

    Returns:
        dict mapping GO id to description; empty when ``text`` holds no GO
        id followed by the word "evidence" (a note is then written to
        stderr).
    """
    found = {}
    if not re.search(r"GO:\d+.*evidence", text):
        sys.stderr.write("return None.\n")
        return found
    for match in re.finditer(r'(GO:\d+)\s*"([^"]+)"\s*evidence', text):
        found[match.group(1)] = match.group(2)
    return found


def main(args):
    """Print one output line per GO term found in a tab-delimited file.

    The column holding the GO annotations is detected on the first line
    that contains a ``GO:<digits>`` token and reused for all later lines.
    Each input line is then printed once per GO term, with that column
    replaced by two columns: the GO id and its description.

    Lines read before any GO column has been detected, and lines too short
    to have that column, are skipped instead of raising.

    Args:
        args: dict with key 'file', the path of the input file.
    """
    descr_index = None
    with open(args.get('file')) as fo:
        for line in fo:
            cols = line.rstrip().split("\t")
            if descr_index is None:
                for index, col in enumerate(cols):
                    if re.search(r"GO:\d+", col):
                        descr_index = index
                        break
            if descr_index is None or descr_index >= len(cols):
                continue
            go_hash = parse_descr(cols[descr_index])
            for goterm, godescr in go_hash.items():
                expanded = cols[:descr_index] + [goterm, godescr] + cols[descr_index + 1:]
                print("\t".join(expanded))
argument handling 7 | import math # match functions 8 | from low import * # custom functions, written by myself 9 | 10 | # ============================================================================= 11 | def show_help( ): 12 | """ displays the program parameter list and usage information """ 13 | stdout( "usage: " + sys.argv[0] + " -f " ) 14 | stdout( " " ) 15 | stdout( " option description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f GO flat file to import [tab delimited]" ) 18 | stdout( " " ) 19 | sys.exit(1) 20 | 21 | # ============================================================================= 22 | def handle_arguments(): 23 | """ verifies the presence of all necessary arguments and returns the data dir """ 24 | if len ( sys.argv ) == 1: 25 | stderr( "no arguments provided." ) 26 | show_help() 27 | 28 | try: # check for the right arguments 29 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 30 | except getopt.GetoptError: 31 | stderr( "invalid arguments provided." ) 32 | show_help() 33 | 34 | args = {} 35 | for key, value in keys: 36 | if key == '-f': args['file'] = value 37 | 38 | if not args.has_key('file'): 39 | stderr( "import file argument missing." ) 40 | show_help() 41 | elif not file_exists( args.get('file') ): 42 | stderr( "import file does not exist." 
def main(args):
    """Collapse a two-column (gene id, GO term) flat file to one line per gene.

    Each non-blank input line must consist of exactly two tab-separated
    columns. A gene id containing a space is cut at the first space. For
    every gene id one line is printed: the id, a tab, and its GO terms
    joined by ", " in the order they were read.

    Args:
        args: dict with key 'file', the path of the flat file.

    Raises:
        ValueError: if a non-blank line does not have exactly two columns.
    """
    from collections import defaultdict
    go_by_gene = defaultdict(list)
    with open(args.get('file')) as fo:
        for line in fo:
            line = line.strip()
            if not line:
                # blank lines (e.g. at the end of the file) carry no pair
                continue
            geneid, goterm = line.split("\t")
            if " " in geneid:
                geneid = geneid[:geneid.index(" ")]
            go_by_gene[geneid].append(goterm)
    for geneid, goterms in go_by_gene.items():
        print(geneid + "\t" + ", ".join(goterms))
class GOTerm(object):
    """One <term> element of an obo-xml document.

    Instance attributes:
        id, name, namespace: text of the first <id>, <name> and <namespace>
            element found below the term element.
        alt_ids: list with the text of every non-empty <alt_id> element
            found below the term element.
    """

    def __init__(self, xml):
        """Fill the attributes from ``xml``, a DOM element for one term."""
        self.id = xml.getElementsByTagName("id")[0].firstChild.data
        self.name = xml.getElementsByTagName("name")[0].firstChild.data
        self.namespace = xml.getElementsByTagName("namespace")[0].firstChild.data
        # an empty <alt_id/> has no text child and is ignored
        self.alt_ids = [
            node.firstChild.data
            for node in xml.getElementsByTagName("alt_id")
            if node.firstChild is not None
        ]


def read_obo(file):
    """Parse an obo-xml document into a dict of GOTerm objects.

    Every term is stored under its primary id; it is additionally stored
    under each of its alternative ids unless that key is already taken.
    The number of keys is reported on stderr.

    Args:
        file: file name or file object, as accepted by ``minidom.parse``.

    Returns:
        dict mapping GO id (primary or alternative) to GOTerm.
    """
    terms = {}
    xmldoc = minidom.parse(file)
    for node in xmldoc.getElementsByTagName('term'):
        goterm = GOTerm(node)
        terms[goterm.id] = goterm
        for alt_id in goterm.alt_ids:
            if alt_id not in terms:
                terms[alt_id] = goterm
    sys.stderr.write("goterms read from obo: %s\n" % len(terms))
    return terms
"\t" + goterm.name 78 | 79 | # ============================================================================= 80 | args = handle_arguments() 81 | main( args ) 82 | 83 | -------------------------------------------------------------------------------- /python/geneontology/goid2name-from-obo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | from goterm import GOTerm 9 | from collections import defaultdict 10 | 11 | 12 | # ============================================================================= 13 | def show_help( ): 14 | """ displays the program parameter list and usage information """ 15 | stdout( "usage: " + sys.argv[0] + " -f " ) 16 | stdout( " " ) 17 | stdout( " option description" ) 18 | stdout( " -h help (this text here)" ) 19 | stdout( " -f go obo file" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['obo'] = value 39 | 40 | if not args.has_key('obo'): 41 | stderr( "obo file argument missing." ) 42 | show_help() 43 | elif not file_exists( args.get('obo') ): 44 | stderr( "obo file does not exist." 
def read_obo(file):
    """Read the id-to-name mapping of all [Term] stanzas from a flat obo file.

    A stanza starts at a "[Term]" or "[Typedef]" line and runs to the next
    such line or to the end of the file. Within a stanza the "id:" line
    supplies the key and the "name:" line the value. Only [Term] stanzas
    that have both are stored; [Typedef] stanzas merely end the preceding
    stanza. The stanza at the end of the file is stored as well. The number
    of stored terms is reported on stderr.

    Args:
        file: path of the obo file.

    Returns:
        dict mapping term id to term name.
    """
    names = {}
    current = {}
    in_term = False
    with open(file) as fo:
        for line in fo:
            line = line.rstrip()
            if line.startswith(("[Term]", "[Typedef]")):
                if in_term and 'id' in current and 'name' in current:
                    names[current['id']] = current['name']
                current = {}
                in_term = line.startswith("[Term]")
            elif line.startswith("id:"):
                parts = line.split()
                if len(parts) > 1:
                    current['id'] = parts[1]
            elif line.startswith("name:"):
                current['name'] = " ".join(line.split()[1:])
    # the final stanza has no following header to trigger its storage
    if in_term and 'id' in current and 'name' in current:
        names[current['id']] = current['name']
    sys.stderr.write("goterms read from obo: %s\n" % len(names))
    return names
15 | stdout( "usage: " + sys.argv[0] + " -f " ) 16 | stdout( " " ) 17 | stdout( " option description" ) 18 | stdout( " -h help (this text here)" ) 19 | stdout( " -f input file (will be rewritten on the fly!) - basename is everything before the first dot" ) 20 | stdout( " -l basename to lower case" ) 21 | stdout( " -u basename to upper case" ) 22 | stdout( " " ) 23 | sys.exit(1) 24 | 25 | # ============================================================================= 26 | def handle_arguments(): 27 | """ verifies the presence of all necessary arguments and returns the data dir """ 28 | if len ( sys.argv ) == 1: 29 | stderr( "no arguments provided." ) 30 | show_help() 31 | 32 | try: # check for the right arguments 33 | keys, values = getopt.getopt( sys.argv[1:], "hf:ul" ) 34 | except getopt.GetoptError: 35 | stderr( "invalid arguments provided." ) 36 | show_help() 37 | 38 | args = {'lower':False, 'upper':False} 39 | for key, value in keys: 40 | if key == '-f': args['file'] = value 41 | if key == '-l': args['lower'] = True 42 | if key == '-u': args['upper'] = True 43 | 44 | if not args.has_key('file'): 45 | stderr( "fasta file argument missing." ) 46 | show_help() 47 | elif not file_exists( args.get('file') ): 48 | stderr( "fasta file does not exist." ) 49 | show_help() 50 | 51 | if args['lower'] and args['upper']: 52 | stderr( "cannot select both lower and upper." 
def main(args):
    """Prefix every line of a file, in place, with the file's basename and a tab.

    The basename is the file name (without directories) up to its first
    dot; when nothing precedes the first dot (e.g. ".hidden") the whole
    file name is used. Trailing whitespace of each line is removed.

    Args:
        args: dict with keys 'file' (path of the file to rewrite), 'lower'
            and 'upper' (booleans selecting a case conversion of the
            basename).
    """
    filename = os.path.split(args['file'])[1]
    # split once at the first dot; repeatedly applying os.path.splitext
    # never terminates for names with a leading dot
    basename = filename.split(".", 1)[0] or filename
    if args['lower']:
        basename = basename.lower()
    if args['upper']:
        basename = basename.upper()
    stream = fileinput.input(args['file'], inplace=1)
    try:
        for line in stream:
            # with inplace=1, standard output is redirected into the file
            print(basename + "\t" + line.rstrip())
    finally:
        stream.close()
def main(args):
    """Prefix every line of a file, in place, with a species tag and a tab.

    The species tag is the first four characters of the file name; leading
    directories of the given path are ignored. Trailing whitespace of each
    line is removed.

    Args:
        args: dict with keys 'file' (path of the file to rewrite), 'lower'
            and 'upper' (booleans selecting a case conversion of the tag).
    """
    # use the file name, not the path, so "dir/homo.txt" gives "homo"
    species = os.path.basename(args['file'])[:4]
    if args['lower']:
        species = species.lower()
    if args['upper']:
        species = species.upper()
    stream = fileinput.input(args['file'], inplace=1)
    try:
        for line in stream:
            # with inplace=1, standard output is redirected into the file
            print(species + "\t" + line.rstrip())
    finally:
        stream.close()
getopt # comand line argument handling 4 | import anydbm # index databases (file hash) 5 | from low import * # collection of generic self-defined functions 6 | 7 | 8 | # ============================================================================= 9 | def show_help( ): 10 | """ displays the program parameter list and usage information """ 11 | stdout( "usage: " + sys.argv[0] + " -f -o " ) 12 | stdout( " " ) 13 | stdout( " option description" ) 14 | stdout( " -h help (this text here)" ) 15 | stdout( " -f xdom file" ) 16 | stdout( " -i indexed ndb file" ) 17 | stdout( " -n column to look up [0..n]" ) 18 | stdout( " " ) 19 | sys.exit(1) 20 | 21 | # ============================================================================= 22 | def handle_arguments(): 23 | """ verifies the presence of all necessary arguments and returns the data dir """ 24 | if len ( sys.argv ) == 1: 25 | stderr( "no arguments provided." ) 26 | show_help() 27 | 28 | try: # check for the right arguments 29 | keys, values = getopt.getopt( sys.argv[1:], "hf:i:n:" ) 30 | except getopt.GetoptError: 31 | stderr( "invalid arguments provided." ) 32 | show_help() 33 | 34 | args = {} 35 | for key, value in keys: 36 | if key == '-f': args['xdom'] = value 37 | if key == '-i': args['dbm'] = value 38 | if key == '-n': args['column'] = int(value) 39 | 40 | if not args.has_key('xdom'): 41 | stderr( "xdom file missing." ) 42 | show_help() 43 | if not file_exists( args.get('xdom') ): 44 | stderr( "xdom file does not exist." ) 45 | show_help() 46 | 47 | if not args.has_key('dbm'): 48 | stderr( "dbm file missing." ) 49 | show_help() 50 | if not file_exists( args.get('dbm') ): 51 | stderr( "dbm file does not exist." ) 52 | show_help() 53 | 54 | if not args.has_key('column'): 55 | stderr( "column index missing." 
def get_lines(file):
    """Return the set of distinct lines of a file.

    Trailing whitespace (including the newline) is removed from each line
    before it is added.

    Args:
        file: path of the file to read.

    Returns:
        A built-in ``set`` of strings.
    """
    with open(file) as fo:
        return set(line.rstrip() for line in fo)
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import getopt # comand line argument handling 6 | from low import * 7 | 8 | # ============================================================================= 9 | def show_help( ): 10 | """ displays the program parameter list and usage information """ 11 | print "splits a flat file into chunks. Options: (1) N number of lines per chunk. (2) N number of chunks of equal size" 12 | print "usage: " + sys.argv[0] + " -f [-i -l ]" 13 | print " " 14 | print " option description" 15 | print " -h help (this text here)" 16 | print " -f flat file to split" 17 | print " -l number of lines per chunk" 18 | print " -i number of equally sized chunks" 19 | print " " 20 | sys.exit(1) 21 | 22 | # ============================================================================= 23 | def handle_arguments(): 24 | """ verifies the presence of all necessary arguments and returns the data dir """ 25 | if len ( sys.argv ) == 1: 26 | print >> sys.stderr, "no arguments provided." 27 | show_help() 28 | 29 | try: # check for the right arguments 30 | keys, values = getopt.getopt( sys.argv[1:], "hf:i:l:" ) 31 | except getopt.GetoptError: 32 | print >> sys.stderr, "invalid arguments provided." 33 | show_help() 34 | 35 | args = {} 36 | for key, value in keys: 37 | if key == '-f': args['file'] = value 38 | if key == '-l': args['l'] = int(value) 39 | if key == '-i': args['i'] = int(value) 40 | 41 | if not args.has_key('file'): 42 | print >> sys.stderr, "import file argument missing." 43 | show_help() 44 | elif not file_exists( args.get('file') ): 45 | print >> sys.stderr, "import file does not exist." 46 | show_help() 47 | 48 | if not args.has_key('l') and not args.has_key('i'): 49 | print >> sys.stderr, "l or i missing." 
def get_number_of_lines(file):
    """Return the number of lines in ``file``; same count as get_lines_in()."""
    return get_lines_in(file)


def get_lines_in(ifile):
    """Return the number of lines in the file at path ``ifile``."""
    with open(ifile) as fo:
        return sum(1 for _ in fo)


def main(args):
    """Split a flat file into numbered chunk files written next to it.

    With 'i' in ``args`` the file is cut into that many chunks of equal
    size (the last one may be shorter); otherwise every chunk receives 'l'
    lines. Chunk k is written to "<file>.<k>", where <k> is formatted by
    ``add_leading_zeroes``. An empty input yields a single empty chunk.

    Args:
        args: dict with key 'file' (path of the file to split) and either
            'i' (number of chunks) or 'l' (lines per chunk), both ints.

    Exits:
        With status 1 and a message on stderr when the chosen number is
        smaller than 1.
    """
    import math  # not among this script's visible imports; kept local

    path = args.get('file')
    totallines = get_lines_in(path)
    by_chunks = 'i' in args
    requested = args.get('i') if by_chunks else args.get('l')
    if requested < 1:
        sys.stderr.write("number of chunks / lines per chunk must be at least 1.\n")
        sys.exit(1)
    if by_chunks:
        # at least one line per chunk, so an empty file cannot give rotate == 0
        rotate = max(1, int(math.ceil(1.0 * totallines / requested)))
    else:
        rotate = requested

    chunks = int(math.ceil(1.0 * totallines / rotate))
    # The width is taken from the float rendering ("4.0"), which is what
    # str(math.ceil(...)) yields under Python 2; this keeps chunk file names
    # as they have always been.
    digits = len(str(float(chunks)))

    filecount = 1
    fw = open(path + '.' + add_leading_zeroes(filecount, digits), 'w')
    try:
        with open(path) as fo:
            for linecount, line in enumerate(fo, 1):
                # start the next chunk on the first line after every full one
                if linecount > 1 and (rotate == 1 or linecount % rotate == 1):
                    filecount += 1
                    fw.close()
                    fw = open(path + '.' + add_leading_zeroes(filecount, digits), 'w')
                fw.write(line)
    finally:
        fw.close()
def main(args):
    """Group the values of a two-column flat file by key, one line per key.

    Each non-blank input line must consist of exactly two tab-separated
    columns (key, value). For every key one line is printed: the key, a
    tab, and its values joined by the configured delimiter in the order
    they were read.

    Args:
        args: dict with keys 'file' (path of the flat file) and 'delimiter'
            (string placed between the values).

    Raises:
        ValueError: if a non-blank line does not have exactly two columns.
    """
    grouped = defaultdict(list)
    with open(args.get('file')) as fo:
        for line in fo:
            line = line.rstrip()
            if not line:
                # blank lines carry no key/value pair
                continue
            key, value = line.split("\t")
            grouped[key].append(value)

    delimiter = args.get('delimiter')
    for key, values in grouped.items():
        print(key + "\t" + delimiter.join(values))
here)" ) 17 | stdout( " -f flat file to import [tab delimited]" ) 18 | stdout( " -a index of the first dimension key [default: 0]" ) 19 | stdout( " -b index of the second dimension key [default: 1]" ) 20 | stdout( " -v index of the value [default: 2]" ) 21 | stdout( " -o order: comma-separated list of keys in which to output the matrix [default: alphabetically sorted]" ) 22 | stdout( " " ) 23 | sys.exit(1) 24 | 25 | # ============================================================================= 26 | def handle_arguments(): 27 | """ verifies the presence of all necessary arguments and returns the data dir """ 28 | if len ( sys.argv ) == 1: 29 | stderr( "no arguments provided." ) 30 | show_help() 31 | 32 | try: # check for the right arguments 33 | keys, values = getopt.getopt( sys.argv[1:], "hf:a:b:v:o:" ) 34 | except getopt.GetoptError: 35 | stderr( "invalid arguments provided." ) 36 | show_help() 37 | 38 | args = {'key1':0, 'key2':1, 'value':2} 39 | for key, value in keys: 40 | if key == '-f': args['file'] = value 41 | if key == '-a': args['key1'] = int(value) 42 | if key == '-b': args['key2'] = int(value) 43 | if key == '-v': args['value'] = int(value) 44 | if key == '-o': args['order'] = value.split(",") 45 | 46 | if not args.has_key('file'): 47 | stderr( "import file argument missing." ) 48 | show_help() 49 | elif not file_exists( args.get('file') ): 50 | stderr( "import file does not exist." 
def main(args):
    """Print a square, comma-separated matrix built from a tab-delimited flat file.

    Every non-blank line supplies two keys and a value, taken from the
    columns selected by args['key1'], args['key2'] and args['value']. The
    first output line lists the keys; each following line starts with a row
    key followed by one cell per column key. A cell holds the value stored
    for (row, column), else the value stored for (column, row), else "NA".

    Args:
        args: dict with keys 'file' (path), 'key1', 'key2', 'value' (column
            indices) and optionally 'order' (list of keys fixing the output
            order; by default the keys are sorted).
    """
    cells = {}
    keys, seen = [], set()
    with open(args.get('file')) as fo:
        for line in fo:
            stripped = line.strip()
            if not stripped:
                continue
            col = stripped.split("\t")
            key1, key2, value = col[args['key1']], col[args['key2']], col[args['value']]
            # a tuple key cannot collide the way concatenated strings can
            cells[(key1, key2)] = value
            for key in (key1, key2):
                if key not in seen:
                    seen.add(key)
                    keys.append(key)
    if 'order' in args:
        keys = args['order']
    else:
        keys.sort()

    sys.stdout.write(",".join(keys) + "\n")
    for i in keys:
        row = [i]
        for j in keys:
            row.append(cells.get((i, j), cells.get((j, i), 'NA')))
        sys.stdout.write(",".join(row) + "\n")
description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f fasta file to import" ) 18 | stdout( " -p prefix to put in fron of the key" ) 19 | stdout( " -d delimiter (default: space | allowed: ; , tab space" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:p:d:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | if key == '-p': args['prefix'] = value 40 | if key == '-d': args['delimiter'] = value 41 | 42 | if not args.has_key('file'): 43 | stderr( "import file argument missing." ) 44 | show_help() 45 | elif not file_exists( args.get('file') ): 46 | stderr( "import file does not exist." 
def main( args ):
    """Convert a delimited flat file into xdom records on stdout.

    args is the option dictionary: 'file' (input path), 'delimiter' ("tab",
    "space", or a literal separator such as ";" or ",") and, optionally,
    'prefix' (text placed in front of every record id).

    The first column of each line is the record id.  Whenever the id differs
    from the id of the previous line a ">" header line is written; the
    remaining columns of every line are written tab-separated below it.
    Blank (whitespace-only) lines are skipped instead of crashing or
    producing an empty record.
    """
    delimiter = args.get('delimiter')
    prefix = args.get('prefix', '')
    out = sys.stdout.write
    oldid = ""
    with open( args.get('file') ) as fo:
        for line in fo:
            line = line.rstrip()
            if not line: continue   # nothing to split; columns[0] would not exist in space mode
            if delimiter == "tab":
                columns = line.split("\t")
            elif delimiter == "space":
                columns = line.split()
            else:
                columns = line.split( delimiter )
            record_id = columns[0]
            if record_id != oldid:
                oldid = record_id
                out(">" + prefix + record_id + "\n")
            out("\t".join(columns[1:]) + "\n")
| stdout( " -f tab delimited input file" ) 18 | stdout( " -1 keep first column" ) 19 | stdout( " -r regex for the column header to mark as to keep" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:r:1" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | args['keepfirstcol'] = 0 38 | for key, value in keys: 39 | if key == '-f': args['file'] = value 40 | if key == '-r': args['regex'] = re.compile(value) 41 | if key == '-1': args['keepfirstcol'] = 1 42 | 43 | if not args.has_key('file'): 44 | stderr( "import file argument missing." ) 45 | show_help() 46 | elif not file_exists( args.get('file') ): 47 | stderr( "import file does not exist." ) 48 | show_help() 49 | 50 | if not args.has_key('regex'): 51 | stderr( "regex argument missing." 
def main( args ):
    """Print only the selected columns of a tab-delimited file.

    args is the option dictionary: 'file' (input path), 'regex' (compiled
    pattern searched in each header field) and 'keepfirstcol' (truthy to keep
    column 0 even when its header does not match).

    The first line of the file is the header: every column whose header field
    matches the regex is selected.  All lines, the header included, are then
    written to stdout reduced to the selected columns, tab-separated.

    Only the line terminator is removed before splitting, so empty trailing
    fields are preserved and rows ending in empty cells no longer fail.

    Raises IndexError if a row has fewer columns than a selected header index.
    """
    regex = args.get('regex')
    keepfirstcol = args.get('keepfirstcol')
    keepindices = None   # determined from the first (header) line
    out = sys.stdout.write
    with open(args.get('file')) as fo:
        for line in fo:
            # rstrip() without arguments would also eat trailing tabs, i.e. empty fields
            columns = line.rstrip("\r\n").split("\t")
            if keepindices is None:
                keepindices = [
                    i for i, header in enumerate(columns)
                    if regex.search(header) or (i == 0 and keepfirstcol)
                ]
            out("\t".join([columns[i] for i in keepindices]) + "\n")
"provide at least two valid input files as input arguments" 24 | sys.exit(1) 25 | 26 | 27 | if len(sys.argv[1:]) < 2: terminate() 28 | for inputfile in sys.argv[1:]: 29 | if not os.path.isfile(inputfile): terminate() 30 | 31 | allhashes = [] 32 | for file in sys.argv[1:]: 33 | allhashes.append( get_lines_in_hash(file) ) 34 | 35 | refkeys = allhashes[0].keys() 36 | for refkey in refkeys: 37 | found = 0 38 | for hash in allhashes: 39 | if hash.has_key(refkey): found += 1 40 | else: break 41 | if found == len(allhashes): 42 | print refkey 43 | 44 | #l1 = get_lines(sys.argv[1]) 45 | #l2 = get_lines(sys.argv[2]) 46 | #for e in l1.intersection(l2): 47 | # print e 48 | -------------------------------------------------------------------------------- /python/generic/subtract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sets 4 | import sys, os 5 | 6 | def get_lines( file ): 7 | lines = [] 8 | fo = open(file) 9 | for line in fo: 10 | line = line.rstrip() 11 | lines.append(line) 12 | 13 | return sets.Set(lines) 14 | 15 | ref = get_lines(sys.argv[1]) 16 | for filename in sys.argv[2:]: 17 | l = get_lines(filename) 18 | for e in l: 19 | if e in ref: ref.remove(e) 20 | 21 | for e in ref: print e 22 | -------------------------------------------------------------------------------- /python/generic/text2range.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | import math # match functions 8 | from low import * # custom functions, written by myself 9 | 10 | REGEX = re.compile("(\d+)$") 11 | 12 | # ============================================================================= 13 | def show_help( ): 14 | """ displays the program parameter list and usage 
information """ 15 | stdout( "usage: " + sys.argv[0] + " -f " ) 16 | stdout( " " ) 17 | stdout( " option description" ) 18 | stdout( " -h help (this text here)" ) 19 | stdout( " -f text flat file to analyze" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | 40 | if not args.has_key('file'): 41 | stderr( "import file argument missing." ) 42 | show_help() 43 | elif not file_exists( args.get('file') ): 44 | stderr( "import file does not exist." 
def main( args ):
    """Collapse runs of consecutively numbered lines into "first<TAB>last" ranges.

    args is the option dictionary; only 'file' (path of the text file) is
    read.  Two neighbouring lines belong to the same run when
    is1higherthan() reports that the second one is exactly one higher than
    the first.  For every run one line "start<TAB>end" is written to stdout
    (start and end are identical for a run of length one).

    The last input line is now always part of the output: previously the
    final range was closed one line early, so the last line of the file was
    dropped and a one-line file produced no output at all.
    """
    with open( args.get('file') ) as fo:
        lines = fo.readlines()
    if not lines: return

    out = sys.stdout.write
    started_at = previous = lines[0]
    for current in lines[1:]:
        if not is1higherthan( previous, current ):
            # run ends at `previous`; `current` opens the next run
            out(started_at.rstrip() + "\t" + previous.rstrip() + "\n")
            started_at = current
        previous = current
    # flush the run that is still open at end of input
    out(started_at.rstrip() + "\t" + previous.rstrip() + "\n")
| stdout( "usage: " + sys.argv[0] + " -f " ) 14 | stdout( " " ) 15 | stdout( " option description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f fasta file to import" ) 18 | stdout( " -p prefix to put in fron of the key" ) 19 | stdout( " -d delimiter (default: space | allowed: ; , tab space" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:p:d:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | if key == '-d': args['delimiter'] = value 40 | 41 | if not args.has_key('file'): 42 | stderr( "import file argument missing." ) 43 | show_help() 44 | elif not file_exists( args.get('file') ): 45 | stderr( "import file does not exist." 
def main( args ):
    """Convert xdom records into a flat file with one "id<delim>line" row per data line.

    args is the option dictionary: 'file' (input path) and 'delimiter'
    ("space", "tab", ";" or ","; anything else falls back to a single space,
    the same default handle_arguments() applies).

    A line starting with ">" sets the current record id (the text after ">");
    every following line is written to stdout prefixed with that id and the
    delimiter.  Lines that appear before the first ">" header have no id to
    attach to and are skipped; previously they aborted the run with an
    unbound-variable error.
    """
    delimiters = {"space": " ", "tab": "\t", ";": ";", ",": ","}
    delim = delimiters.get( args.get('delimiter'), " " )

    current_id = None
    out = sys.stdout.write
    with open( args.get('file') ) as fo:
        for line in fo:
            line = line.rstrip()
            if line.startswith(">"):
                current_id = line[1:]
                continue
            if current_id is None: continue   # data before any header: nothing to prefix it with
            out(current_id + delim + line + "\n")
| inCounts = plausi() 34 | fo = open(inCounts) 35 | lines = fo.readlines() 36 | fo.close() 37 | header = lines.pop(0).rstrip().split("\t") 38 | speciesArray = header[1:] 39 | results = defaultdict(lambda: defaultdict(int)) 40 | for line in lines: 41 | line = line.rstrip() 42 | columns = line.split("\t") 43 | cluster = columns[0] 44 | genecounts = columns[1:] 45 | mean, sd = R_mean_and_sd(genecounts) 46 | for i in range(len(genecounts)): 47 | gc, species = int(genecounts[i]), speciesArray[i] 48 | z = Zscore(gc, mean, sd) 49 | if abs(z) < 2: continue 50 | if z > 3: results[species]['Z > 3'] += 1 51 | elif z > 2: results[species]['Z > 2'] += 1 52 | elif z < -3: results[species]['Z < -3'] += 1 53 | elif z < -2: results[species]['Z < -2'] += 1 54 | 55 | speciesArray.sort() 56 | print "\t" + string.join(speciesArray, "\t") 57 | for zcat in ['Z > 3', 'Z > 2', 'Z < -3', 'Z < -2']: 58 | sys.stdout.write(zcat) 59 | for spec in speciesArray: 60 | count = str(results[spec][zcat]) 61 | sys.stdout.write("\t" + count) 62 | sys.stdout.write("\n") 63 | 64 | 65 | main() 66 | -------------------------------------------------------------------------------- /python/gff/droso-chromosome-reconstruction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import getopt # comand line argument handling 6 | from collections import defaultdict 7 | from low import * # custom functions, written by myself 8 | 9 | # ============================================================================= 10 | def show_help( ): 11 | """ displays the program parameter list and usage information """ 12 | print >> sys.stderr, "usage: " + sys.argv[0] + " -d " 13 | stdout( " option description" ) 14 | stdout( " -h help (this text here)" ) 15 | stdout( " -d folder with gff files to parse" ) 16 | stdout( " " ) 17 | sys.exit(1) 18 | 19 | # 
def main( args ):
    """Print the location of every gene found in the GFF files of a folder.

    args is the option dictionary; only 'dir' is read and it must end with
    "/" (handle_arguments() appends it).  Every file in that folder whose
    name ends with ".gff" or ".gff.gz" is parsed; the species name is the
    part of the file name before the first "-".

    For each record of type "gene" one tab-separated line is written to
    stdout: species, gene id (ID attribute), seqid, start, stop, strand.

    Compressed files are read through the gzip module.  The earlier
    implementation ran "gunzip"/"gzip" through the shell on an unquoted path,
    which rewrote the input files in place, left them decompressed when
    parsing failed, and broke on paths containing spaces or shell
    metacharacters.

    Raises ValueError if a matching file name contains no "-".
    """
    # local imports: gzip is new to this script and re is not imported at file level
    import gzip
    import re

    # the ID may be the last attribute, so a trailing ";" must not be required
    id_regex = re.compile(r"ID=([^;]+)")

    def process_gff_line(line, species):
        """Write one output line if `line` is a nine-column gene record."""
        if line.startswith("#") or len(line.rstrip()) == 0: return
        columns = line.rstrip().split("\t")
        if len(columns) != 9: return
        if columns[2] != "gene": return
        seqid, start, stop, strand, descr = columns[0], columns[3], columns[4], columns[6], columns[8]
        found = id_regex.search(descr)
        if found is None:
            sys.stderr.write("skipping gene record without ID attribute: " + line.rstrip() + "\n")
            return
        sys.stdout.write("\t".join([species, found.group(1), seqid, start, stop, strand]) + "\n")

    # =============================================================================

    for filename in os.listdir(args['dir']):
        if not filename.endswith((".gff", ".gff.gz")): continue
        species = filename[:filename.index("-")]
        path = args['dir'] + filename
        if filename.endswith(".gz"): fo = gzip.open(path, "rb")
        else: fo = open(path)
        try:
            for line in fo:
                # gzip yields bytes on Python 3; plain text files already yield str
                if not isinstance(line, str): line = line.decode("utf-8")
                process_gff_line(line, species)
        finally:
            fo.close()
def main( args ):
    """Print the location of every exon and intron found in the GFF files of a folder.

    args is the option dictionary; only 'dir' is read and it must end with
    "/" (handle_arguments() appends it).  Every file in that folder whose
    name ends with ".gff" or ".gff.gz" is parsed; the species name is the
    part of the file name before the first "-".

    For each record of type "exon" or "intron" one tab-separated line is
    written to stdout: species, feature type, seqid, start, stop.

    Compressed files are read through the gzip module.  The earlier
    implementation ran "gunzip"/"gzip" through the shell on an unquoted path,
    which rewrote the input files in place, left them decompressed when
    parsing failed, and broke on paths containing spaces or shell
    metacharacters.

    Raises ValueError if a matching file name contains no "-".
    """
    import gzip   # local import: gzip is new to this script

    def process_gff_line(line, species):
        """Write one output line if `line` is a nine-column exon/intron record."""
        if line.startswith("#") or len(line.rstrip()) == 0: return
        columns = line.rstrip().split("\t")
        if len(columns) != 9: return
        feature_type = columns[2]
        if feature_type not in ("exon", "intron"): return
        seqid, start, stop = columns[0], columns[3], columns[4]
        sys.stdout.write("\t".join([species, feature_type, seqid, start, stop]) + "\n")

    # =============================================================================

    for filename in os.listdir(args['dir']):
        if not filename.endswith((".gff", ".gff.gz")): continue
        species = filename[:filename.index("-")]
        path = args['dir'] + filename
        if filename.endswith(".gz"): fo = gzip.open(path, "rb")
        else: fo = open(path)
        try:
            for line in fo:
                # gzip yields bytes on Python 3; plain text files already yield str
                if not isinstance(line, str): line = line.decode("utf-8")
                process_gff_line(line, species)
        finally:
            fo.close()
sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | import gff3 9 | from collections import defaultdict 10 | 11 | 12 | # ============================================================================= 13 | def show_help( ): 14 | """ displays the program parameter list and usage information """ 15 | stdout( "usage: " + sys.argv[0] + " -f " ) 16 | stdout( " " ) 17 | stdout( " option description" ) 18 | stdout( " -h help (this text here)" ) 19 | stdout( " -f gff3 file" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['gff'] = value 39 | 40 | if not args.has_key('gff'): 41 | stderr( "gff argument missing." ) 42 | show_help() 43 | elif not file_exists( args.get('gff') ): 44 | stderr( "gff does not exist." 
def main( args ):
    """Write one tab-separated line per gene of a GFF3 file to stdout.

    args is the option dictionary; only 'gff' (path of the GFF3 file) is
    read.  Comment lines, blank lines and lines without exactly nine
    tab-separated columns are ignored.  Every remaining line is parsed with
    gff3.GeneFeature; for features of type "gene" the output columns are:
    ID attribute, seqid, start, stop and the strand encoded as "1" for "+"
    and "-1" for anything else.
    """
    fo = open(args['gff'])
    for record in fo:
        is_comment = record.startswith("#")
        if is_comment or not record.strip() or len(record.split("\t")) != 9:
            continue
        feature = gff3.GeneFeature(record.rstrip())
        if feature.type != "gene":
            continue
        gene_id = feature.get_attributes()['ID']
        strand = '1' if feature.strand == '+' else "-1"
        fields = [gene_id, feature.seqid, str(feature.start), str(feature.stop), strand]
        sys.stdout.write("\t".join(fields) + "\n")
    fo.close()
def main( args ):
    """Print "KO<TAB>EC number" pairs found in a KEGG ko flat file.

    args is the option dictionary; only 'file' (path of the ko file) is read.
    The file is scanned record by record: a line starting with "///" ends the
    current record, the first "ENTRY K..." line after that sets the current
    KO id, and every later line of the record containing an "EC: <number>"
    field produces one output line with the KO id and the first EC number on
    that line.
    """
    entry_pattern = re.compile( r"^ENTRY\s+(K\S+)" )
    ec_pattern = re.compile( r"\s+EC:\s+([0-9.]+)" )

    current_ko = ""
    fo = open( args.get('file'), 'r' )
    for raw in fo:
        text = raw.rstrip()
        if text.startswith("///"):
            current_ko = ""          # record terminator: forget the KO id
            continue
        if current_ko == "":
            # still looking for the ENTRY line of this record
            found = entry_pattern.search( text )
            if found:
                current_ko = found.group(1)
            continue
        found = ec_pattern.search( text )
        if found:
            sys.stdout.write( "%s\t%s\n" % ( current_ko, found.group(1) ) )
    fo.close()
==================================================================== 82 | # ============================================================================= 83 | 84 | args = handle_arguments( ) 85 | main( args ) 86 | -------------------------------------------------------------------------------- /python/kegg/kegg-parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | import anydbm # index databases (file hash) 9 | from Bio import SeqIO # biopython stuff, to parse fasta files for instance 10 | 11 | # ============================================================================= 12 | def show_help( ): 13 | """ displays the program parameter list and usage information """ 14 | stdout( "usage: " + sys.argv[0] + " -f " ) 15 | stdout( " " ) 16 | stdout( " option description" ) 17 | stdout( " -h help (this text here)" ) 18 | stdout( " -f kegg html file" ) 19 | stdout( " " ) 20 | 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | 40 | if not args.has_key('file'): 41 | stderr( "kegg file missing." ) 42 | show_help() 43 | if not file_exists( args.get('file') ): 44 | stderr( "kegg file does not exist." 
def strip_tags(value):
    """Return value with HTML tags and square-bracket (KEGG) tags removed.

    Two substitutions are applied in order: every "<...>" tag is deleted,
    then everything from the first "[" to the last "]" on a line is deleted
    (the bracket pattern is greedy, so text between two bracket tags on the
    same line is removed as well).
    """
    patterns = (r'<[^>]*?>', r'\[.*\]')
    for pattern in patterns:
        value = re.sub(pattern, '', value)
    return value
def handle_arguments():
    """Parse and validate the command line and return the options as a dict.

    Returns:
        dict with the single key ``'file'`` holding the path given via ``-f``.

    The function does not return when the command line is unusable: for a
    missing argument list, an option getopt rejects, a missing ``-f`` or a
    path that ``file_exists`` reports as absent, a message is sent through
    ``stderr`` and ``show_help()`` is called, which ends the process.
    ``-h`` shows the help text directly.
    """
    if len(sys.argv) == 1:
        stderr("no arguments provided.")
        show_help()

    try:  # check for the right arguments
        keys, values = getopt.getopt(sys.argv[1:], "hf:")
    except getopt.GetoptError:
        stderr("invalid arguments provided.")
        show_help()

    args = {}
    for key, value in keys:
        # -h is advertised in the help text, so honour it explicitly instead
        # of silently ignoring it when it is combined with -f.
        if key == '-h':
            show_help()
        if key == '-f':
            args['file'] = value

    if 'file' not in args:
        stderr("kegg file missing.")
        show_help()
    if not file_exists(args.get('file')):
        stderr("kegg file does not exist.")
        show_help()

    return args
def read_KOs(file):
    """Print one FASTA-like record per entry of a KEGG KO annotation file.

    For every entry the output is ``>ID`` (followed by a tab and the
    DEFINITION text when one is present) and, on the next line, the
    tab-separated ``[X:Y]`` references collected from the CLASS block.

    Args:
        file: path of the KO annotation file.

    Returns:
        None; all results are written to standard output.

    Raises:
        ValueError: if a record does not start with an ``ENTRY`` line.

    Reading stops at end of file or at the first empty line found where a
    record is expected to start.
    """
    entry_re = re.compile(r'^ENTRY\s+(\S+)')
    definition_re = re.compile(r'^DEFINITION\s+(.*)$')
    reference_re = re.compile(r'\[(\S+:\S+)\]')

    def next_entry(fo):
        """Consume one record from *fo*; return (entry, pathlist) or (None, None)."""
        line = fo.readline().rstrip()
        if line == '':
            return None, None
        found = entry_re.match(line)
        if found is None:
            raise ValueError("expected an ENTRY line, got: %r" % line)
        entry = found.group(1)

        # The line after ENTRY is skipped unread (presumably NAME - confirm
        # against the input format); the one after it may be DEFINITION.
        fo.readline()
        line = fo.readline().rstrip()
        definition = ""
        found = definition_re.match(line)
        if found:
            definition = found.group(1)
            line = fo.readline().rstrip()

        # CLASS block: the CLASS line itself plus its indented continuations.
        pathlist = []
        while line.startswith('CLASS') or line.startswith(' '):
            found = reference_re.search(line)
            if found:
                pathlist.append(found.group(1))
            line = fo.readline().rstrip()

        # Skip the rest of the record. readline() returns '' only at end of
        # file, so a missing final '///' can no longer spin forever.
        while line != '///':
            raw = fo.readline()
            if raw == '':
                break
            line = raw.rstrip()

        if definition != "":
            entry += "\t" + definition
        return entry, pathlist

    with open(file) as fo:
        while True:
            entry, pathlist = next_entry(fo)
            if entry is None:
                break
            sys.stdout.write(">%s\n%s\n" % (entry, "\t".join(pathlist)))
class BibtexEntry:
    """A single BibTeX entry parsed from a list of text lines.

    Attributes:
        key: citation key taken from the ``@TYPE{key,`` line ("" if none seen).
        bibtype: entry type as written in the source ("" if none seen).
        attributehash: dict mapping attribute name to its value with the
            closing ``}`` / ``},`` removed.
    """

    # Compiled once for the class; still reachable as self.<NAME>.
    ATTRIBUTE_REGEX = re.compile(r"\s{2}(\S+)\s{1}=\s\{(.*)\}*$")
    # Entry types are matched in either case (@ARTICLE and @article).
    BIBTEXSTART_REGEX = re.compile(r"@([A-Za-z]+)\{(\S+),$")

    def __init__(self, lines):
        """Parse one entry from the front of *lines*.

        Lines are consumed (removed from the list) up to and including the
        first line starting with ``}``, or all of them if no such line exists.
        A line that is neither a start line nor an attribute line is treated
        as the continuation of the previous attribute; such a line appearing
        before any attribute is ignored.
        """
        self.key = ""
        self.bibtype = ""
        self.attributehash = {}

        attr = None      # name of the attribute currently being filled
        consumed = 0
        for line in lines:
            consumed += 1

            # end of entry
            if line.startswith("}"):
                break

            # bibtex entry start line and key definition
            start = self.BIBTEXSTART_REGEX.match(line)
            if start:
                self.bibtype = start.group(1)
                self.key = start.group(2)
                continue

            # bibtex attribute start, otherwise continuation of the last one
            field = self.ATTRIBUTE_REGEX.match(line)
            if field:
                attr = field.group(1)
                self.attributehash[attr] = field.group(2)
            elif attr is not None:
                self.attributehash[attr] += " " + line.strip()

        # Same effect as the repeated pop(0) of the lines read, in one step.
        del lines[:consumed]

        # The greedy value group keeps the closing brace (and comma); drop it.
        for name, value in list(self.attributehash.items()):
            if value.endswith("},"):
                self.attributehash[name] = value[:-2]
            elif value.endswith("}"):
                self.attributehash[name] = value[:-1]

    def get_key(self):
        """Return the citation key."""
        return self.key

    def get_first_author(self):
        """Return the first name of the ' and '-separated author list, or ""."""
        return self.get_attr('author').split(" and ")[0]

    def get_attr(self, name):
        """Return the value of attribute *name*, or "" when it is absent."""
        return self.attributehash.get(name, "")

    def get_author_count(self, return_str=0):
        """Return the number of authors (0 when there is no author attribute).

        With a true *return_str* the count is returned as a string.
        """
        if 'author' in self.attributehash:
            count = self.attributehash['author'].count(" and ") + 1
        else:
            count = 0
        if return_str:
            return "%s" % count
        return count

    def annotate(self):
        """Store "(N co-authors)" in both the 'annotate' and 'note' attributes."""
        label = "(%s co-authors)" % self.get_author_count()
        self.attributehash['annotate'] = label
        self.attributehash['note'] = label

    def to_s(self, escape_title=1, annotate=0):
        """Write the entry in BibTeX syntax to standard output.

        With a true *escape_title* the title value is wrapped as ``"{...}"``.
        *annotate* is accepted for interface compatibility and is not used.
        """
        write = sys.stdout.write
        write("@" + self.bibtype + "{" + self.key + "," + "\n")
        names = list(self.attributehash)
        last = len(names) - 1
        for position, attr in enumerate(names):
            comma = "" if position == last else ","
            value = self.attributehash[attr]
            if attr == "title" and escape_title:
                write(" " + attr + " = \"{" + value + "}\"" + comma + "\n")
            else:
                write(" " + attr + " = {" + value + "}" + comma + "\n")
        write("}" + "\n")
) 15 | stdout( "usage: " + sys.argv[0] + " -f [-r ]" ) 16 | stdout( " " ) 17 | stdout( " option description" ) 18 | stdout( " -h help (this text here)" ) 19 | stdout( " -f fasta file" ) 20 | stdout( " -r replace dot with this sign (default: \"_\")" ) 21 | stdout( " " ) 22 | sys.exit(1) 23 | 24 | # ============================================================================= 25 | def handle_arguments(): 26 | """ verifies the presence of all necessary arguments and returns the data dir """ 27 | if len ( sys.argv ) == 1: 28 | stderr( "no arguments provided." ) 29 | show_help() 30 | 31 | try: # check for the right arguments 32 | keys, values = getopt.getopt( sys.argv[1:], "hf:r:" ) 33 | except getopt.GetoptError: 34 | stderr( "invalid arguments provided." ) 35 | show_help() 36 | 37 | args = {} 38 | for key, value in keys: 39 | if key == '-f': args['file'] = value 40 | if key == '-r': args['r'] = str(value) 41 | 42 | if not args.has_key('file'): 43 | stderr( "file missing." ) 44 | show_help() 45 | if not file_exists( args.get('file') ): 46 | stderr( "file does not exist." 
def main(args):
    """Rename a file so that its base name contains no dots.

    Every dot in the file name except the one introducing the extension is
    replaced; the directory part of the path is left untouched.

    Args:
        args: dict with ``'file'`` (path to rename) and optionally ``'r'``
            (replacement string, default ``"_"``).
    """
    oldfilename = args.get('file')
    path, filename = os.path.split(oldfilename)
    base, ext = os.path.splitext(filename)
    replacement = args.get('r', '_')
    newfilename = os.path.join(path, base.replace('.', replacement) + ext)
    # Rename through the OS API rather than a shell "mv" so that names with
    # spaces or shell metacharacters are handled verbatim; nothing to do when
    # the name has no inner dots.
    if newfilename != oldfilename:
        os.rename(oldfilename, newfilename)
def main(args):
    """Print the GC content of all SSRs per species.

    Each line of ``args['file']`` is parsed with ``MisaSSRspecies``; the
    counts of A, T, G and C in the motif, multiplied by the repeat number,
    are accumulated per species. For every species (sorted by name) a line
    ``species<TAB>gc`` is written to standard output, where gc is
    (G + C) / (A + T + G + C).

    Args:
        args: dict whose ``'file'`` entry is the path of the MISA file.
    """
    specieshash = defaultdict(lambda: defaultdict(int))
    with open(args['file']) as fo:
        for line in fo:
            m = MisaSSRspecies(line)
            counts = specieshash[m.species]
            for char in 'ATGC':
                counts[char] += m.motif.count(char) * m.repeats

    for species in sorted(specieshash):
        counts = specieshash[species]
        total = sum(counts.values())
        if total == 0:
            # No A/T/G/C counted for this species: a ratio is undefined, so
            # report it instead of dying with ZeroDivisionError.
            sys.stderr.write("no A/T/G/C bases counted for %s, skipped\n" % species)
            continue
        gc = 1.0 * (counts['G'] + counts['C']) / total
        sys.stdout.write(species + "\t" + str(gc) + "\n")
def show_help():
    """Print the usage text of the SQLite import script and terminate.

    Every line is emitted through the ``stdout`` helper (presumably supplied
    by ``from low import *`` - confirm), after which the process exits with
    status 1.
    """
    help_lines = (
        "usage: " + sys.argv[0] + " -f ",
        " ",
        " option description",
        " -h help (this text here)",
        " -f misa output file with an additional first column = speciesname",
        " -d db file",
        " ",
    )
    for text in help_lines:
        stdout(text)
    sys.exit(1)
def main(args):
    """Load the SSRs of a MISA file into the ``ssrs`` table of a SQLite db.

    The table is created by ``init_db`` when missing. Every line of
    ``args['file']`` except the ``ID<TAB>...`` header is parsed with
    ``MisaSSRspecies`` and inserted as one row. Finally the total number of
    rows in the table is printed and the transaction is committed.

    Args:
        args: dict with ``'db'`` (SQLite file) and ``'file'`` (MISA file).
    """
    insert = ("INSERT INTO ssrs(species, chr, startpos, endpos, ssr_type, motif, repeats) "
              "VALUES (?, ?, ?, ?, ?, ?, ?)")

    conn = sqlite3.connect(args['db'])
    try:
        init_db(conn)

        with open(args['file']) as fo:
            for line in fo:
                if line.startswith("ID\t"):
                    continue
                m = MisaSSRspecies(line)
                # Bound parameters: values containing quotes can neither break
                # the statement nor inject SQL.
                conn.execute(insert, (m.species, m.geneid, m.startpos, m.endpos,
                                      m.type, m.motif, m.repeats))

        res = conn.execute("SELECT COUNT(*) FROM ssrs")
        entries = res.fetchone()[0]
        sys.stdout.write("done. entries added: %s\n" % entries)
        conn.commit()
    finally:
        # Release the database handle even when parsing or inserting fails;
        # uncommitted rows are discarded in that case.
        conn.close()
def main(args):
    """Split compound SSRs of a MISA file into their individual repeats.

    Lines starting with ``ID<TAB>`` are skipped. An SSR whose type is neither
    ``c`` nor ``c*`` is printed unchanged via its ``to_s()``. For a compound
    SSR every ``(MOTIF)N`` component of its pattern is printed as a separate
    tab-separated line: gene id, SSR number, ``p<motif length>``, component,
    length, start and end position. The first component starts at the start
    position of the compound SSR; each following one starts directly after
    the previous component.

    NOTE(review): bases lying between two components in the pattern are not
    counted when positions are advanced - confirm this matches the MISA
    pattern format.

    Args:
        args: dict whose ``'file'`` entry is the path of the MISA file.
    """
    # group 1 = motif, group 2 = repeat count; an optional '*' may follow
    component_re = re.compile(r"\(([ATGC]+)\)(\d+)[*]{0,1}")

    with open(args['file']) as fo:
        for line in fo:
            if line.startswith("ID\t"):
                continue
            m = MisaSSR(line)
            if m.type != "c" and m.type != "c*":
                sys.stdout.write(str(m.to_s()) + "\n")
                continue

            startpos = m.startpos
            for component in component_re.finditer(m.pattern):
                separatepattern = component.group(0)
                motif = component.group(1)
                repeats = int(component.group(2))
                length = len(motif) * repeats
                endpos = startpos + length - 1
                fields = [m.geneid, str(m.ssrnr), "p" + str(len(motif)),
                          separatepattern, str(length), str(startpos), str(endpos)]
                sys.stdout.write("\t".join(fields) + "\n")
                startpos = endpos + 1
def main(XMLfile, CAP3, threshold):
    """Run both ORF prediction steps and remove the temporary FASTA file.

    Step 1 calls ``orf_prediction_part1.py -b XMLfile -f CAP3``; step 2 calls
    ``orf_prediction_part2.py -t threshold -f OUTFILEPART2``. Afterwards the
    file named by ``OUTFILEPART2`` is deleted if it exists. A step that
    cannot be started is reported on stderr and the remaining steps still
    run, as they did when the commands were launched through a shell.

    Args:
        XMLfile: path of the parsed BLASTX best hit definitions.
        CAP3: path of the input sequences in FASTA format.
        threshold: minimum length for in silico predicted ORFs.
    """
    import subprocess  # the file imports only sys, os and getopt

    def run(command):
        """Run *command* (an argument list) without a shell; return its status."""
        try:
            return subprocess.call(command)
        except OSError as err:
            sys.stderr.write("could not run %s: %s\n" % (command[0], err))
            return 127

    # First, elongate BLAST-hits. Arguments are passed as a list so paths
    # with spaces or shell metacharacters reach the scripts unchanged.
    run(["orf_prediction_part1.py", "-b", str(XMLfile), "-f", str(CAP3)])

    # Second part works on the temporary file of the first one.
    run(["orf_prediction_part2.py", "-t", str(threshold), "-f", OUTFILEPART2])

    if os.path.exists(OUTFILEPART2):
        os.remove(OUTFILEPART2)
You provided:" 44 | sys.exit() 45 | except getopt.GetoptError, err: 46 | print "Something went wrong - maybe this helps: " + str(err) 47 | sys.exit() 48 | 49 | for o, a in opts: 50 | if o == "-h": 51 | usage() 52 | sys.exit() 53 | elif o == "-i": 54 | if os.path.exists(a): 55 | if os.path.isfile(a): 56 | XMLfile = a 57 | elif os.path.isdir(a): 58 | print "Specified XML-file is a directory!" 59 | else: 60 | print "Something is wrong with the XML-file, maybe it doesn't exist?" 61 | elif o == "-j": 62 | if os.path.exists(a): 63 | if os.path.isfile(a): 64 | CAP3 = a 65 | elif os.path.isdir(a): 66 | print "Specified CAP3-file is a directory!" 67 | else: 68 | print "Something is wrong with the CAP3-file, maybe it doesn't exist?" 69 | elif o == "-t": 70 | threshold = a 71 | else: 72 | print "Something went wrong ;_;. Maybe the file you specified doesn't exist?" 73 | 74 | if len(opts) == 3: 75 | main( XMLfile, CAP3, threshold ) 76 | else: 77 | print len(opts) 78 | print "Again, hello to you! You do not have the required amount of arguments given. Please specify them. For more, see -h! I AM THE PREDICTOR" 79 | -------------------------------------------------------------------------------- /python/orthomcl/add-blasthits-to-cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys, string, anydbm 3 | from low import * 4 | from orthomcl import OrthoMCLCluster 5 | 6 | 7 | # ============================================================================= 8 | def usage(): 9 | print >> sys.stderr, "add significant BLAST hits (e.g. 
def get_seq_lengths(file):
    """Return the length of every sequence in a FASTA file.

    The sequence id is the header text after ``>`` up to the first space.
    Surrounding whitespace of every line is ignored and blank lines are
    skipped. When an id occurs more than once, only the last occurrence is
    counted.

    Args:
        file: path of the FASTA file.

    Returns:
        dict mapping sequence id to its length in characters.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    lengthHash = {}
    current = None
    with open(file) as fo:
        for line in fo:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                current = line[1:].split(" ", 1)[0]
                lengthHash[current] = 0
            elif current is None:
                raise ValueError("sequence data before first FASTA header in %s" % file)
            else:
                lengthHash[current] += len(line)
    return lengthHash
gene2length[hid] 70 | lengthcutoff = 0.80 * max([qlength, hlength]) 71 | if alnlength < lengthcutoff: continue 72 | if not dbm.has_key(qid): dbm[qid] = "" 73 | else: dbm[qid] += " " 74 | dbm[qid] += hid 75 | fo.close() 76 | dbm.close() 77 | else: dbmfile = args[1] 78 | dbm = anydbm.open(dbmfile) 79 | 80 | fo = open(in_orthomcl) 81 | for line in fo: 82 | o = OrthoMCLCluster(line.rstrip()) 83 | oldsize = o.get_count() 84 | additions = [] 85 | for geneid, species in o.get_gene_hash().iteritems(): 86 | if not dbm.has_key(geneid): continue 87 | [additions.append([x, species]) for x in dbm[geneid].split()] 88 | 89 | for x, species in additions: o.add_gene(x,species) 90 | o.to_s() 91 | newsize = o.get_count() 92 | print >> sys.stderr, "%s\t%s\t%s" %(o.get_name(), oldsize, newsize) 93 | 94 | main() 95 | -------------------------------------------------------------------------------- /python/orthomcl/build-counts-table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys 3 | from low import * 4 | 5 | # takes an input protein fasta file and an orthomcl.gg file 6 | # orthomcl.gg file format: 7 | # speciesname1: id1 id2 id3 id4 .... full genome 8 | # speciesname2: id1 id2 id3 id4 .... full genome 9 | # 10 | # with these infos, the goal is to get only one protein sequence per species 11 | # we use t-coffee to find the most similar protein sequence per species 12 | # to the whole cluster. so in case one species contributes several sequences 13 | # to a cluster, we choose the one species to keep which has the highest average 14 | # similarity to the rest of the cluster. if more than 1 sequence yield the highest 15 | # avgsim, we determine whether these protein sequences are (1) all identical, 16 | # or whether they are (2) slightly different. In case (1), we choose any sequence 17 | # randomly because it does not matter. 
def main():
    """Print a cluster x species table of gene counts.

    The species list comes from the .gg file and is sorted to form the
    columns. The header row is an empty cell followed by the species names.
    Every non-comment line of the clustering file is read as
    ``cluster<TAB>count<TAB>gene ids`` with the ids separated by ", "; the
    output row is the cluster name followed by the number of its genes per
    species. Blank lines are skipped.

    Raises:
        KeyError: if a gene id of a cluster is not listed in the .gg file.
    """
    inClustering, inGG = plausi()
    speciesHash, speciesArray = read_gg(inGG)
    speciesArray.sort()
    # species -> column, so each gene costs one dict lookup instead of a
    # list.index() scan
    column = dict((species, i) for i, species in enumerate(speciesArray))

    # str.join is used instead of string.join: this file does not import the
    # string module itself.
    sys.stdout.write("\t" + "\t".join(speciesArray) + "\n")

    with open(inClustering) as fo:
        for line in fo:
            if line.startswith("#"):
                continue
            line = line.rstrip()
            if not line:
                continue
            cluster, count, geneids = line.split("\t")[0:3]
            tab = [0] * len(speciesArray)
            for geneid in geneids.split(", "):
                tab[column[speciesHash[geneid]]] += 1
            sys.stdout.write(cluster + "\t" + "\t".join([str(e) for e in tab]) + "\n")
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys 3 | from low import * 4 | 5 | # takes an input protein fasta file and an orthomcl.gg file 6 | # orthomcl.gg file format: 7 | # speciesname1: id1 id2 id3 id4 .... full genome 8 | # speciesname2: id1 id2 id3 id4 .... full genome 9 | # 10 | # with these infos, the goal is to get only one protein sequence per species 11 | # we use t-coffee to find the most similar protein sequence per species 12 | # to the whole cluster. so in case one species contributes several sequences 13 | # to a cluster, we choose the one species to keep which has the highest average 14 | # similarity to the rest of the cluster. if more than 1 sequence yield the highest 15 | # avgsim, we determine whether these protein sequences are (1) all identical, 16 | # or whether they are (2) slightly different. In case (1), we choose any sequence 17 | # randomly because it does not matter. In case (2), we sum up all pairwise 18 | # similarities for each candidate sequence, and keep only the one sequence 19 | # with the highest sum. If these are identical as well, we again choose randomly 20 | # (should happen very rarely). 
def read_gg(inGG):
    """Read an orthomcl .gg file.

    Each non-blank line is split on whitespace: the first field is the
    species name followed by one trailing character (the ``:`` of the
    ``speciesname: id1 id2 ...`` format), the remaining fields are gene ids
    of that species.

    Args:
        inGG: path of the .gg file.

    Returns:
        (outHash, speciesArray): a dict mapping gene id to species name, and
        the list of species names in order of first appearance.
    """
    outHash = {}
    speciesArray = []
    seen = set()
    with open(inGG) as fo:
        for line in fo:
            cols = line.split()
            if not cols:
                # blank line (e.g. at the end of the file)
                continue
            species = str(cols[0])[:-1]
            if species not in seen:
                seen.add(species)
                speciesArray.append(species)
            for col in cols[1:]:
                outHash[col] = species
    return outHash, speciesArray
def main():
    """Print ``cluster name<TAB>first Arath gene`` for every cluster.

    Each line of the input file is parsed with ``OrthoMCLCluster``; the first
    gene listed under the species ``'Arath'`` in its species hash is printed
    next to the cluster name. A cluster without such a gene is reported on
    stderr and skipped.
    """
    inFile = plausi()
    with open(inFile) as fo:
        for line in fo:
            o = OrthoMCLCluster(line.rstrip())
            try:
                arath = o.get_species_hash()['Arath'][0]
            except (KeyError, IndexError):
                # one cluster lacking the species must not abort the mapping
                sys.stderr.write("no Arath gene in cluster %s, skipped\n" % o.get_name())
                continue
            sys.stdout.write(o.get_name() + "\t" + arath + "\n")
inTable = "/home/low/workspace/back-to-the-sea-orf-cluster-verification/32-new-orthologs/3-orthomcl-v1.4/noparalogs_orthomcl.out" 20 | if not os.path.exists(inTable) or not os.path.isfile(inTable) or not os.path.getsize(inTable) > 0: 21 | print >> sys.stderr, "specified orthomcl table file does not exist, is not a file, or is empty\n" 22 | usage() 23 | return inFrom, inTo, inTable 24 | 25 | 26 | class OrthoCluster(): 27 | def __init__(self, line): 28 | descr, genedefs = line.split("\t") 29 | genedefs = genedefs.split() 30 | self.name = descr[:descr.index('(')].lower() 31 | self.geneHash = {} 32 | self.speciesHash = {} 33 | for genedef in genedefs: 34 | geneid = genedef[:genedef.index('(')] 35 | species = genedef[genedef.index('(')+1:-1].lower() 36 | self.geneHash[geneid] = species 37 | if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid) 38 | else: self.speciesHash[species] = [geneid] 39 | 40 | def get_name(self): return self.name 41 | def get_count(self): return len(self.geneHash) 42 | def get_gene_hash(self): return self.geneHash 43 | def get_species_hash(self): return self.speciesHash 44 | 45 | 46 | 47 | def main(): 48 | inFrom, inTo, inTable = plausi() 49 | fo = open(inTable) 50 | for line in fo: 51 | o = OrthoCluster(line.rstrip()) 52 | speciesHash = o.get_species_hash() 53 | name = o.get_name() 54 | mapfrom, mapto = "", "" 55 | if inFrom == "cluster": mapfrom = name 56 | else: mapfrom = speciesHash[inFrom][0] 57 | if inTo == "cluster": mapto = name 58 | else: mapto = speciesHash[inTo][0] 59 | print mapfrom + "\t" + mapto 60 | fo.close() 61 | 62 | 63 | main() 64 | -------------------------------------------------------------------------------- /python/orthomcl/paralogs-per-cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys 3 | import string 4 | 5 | 6 | def usage(): 7 | print >> sys.stderr, "usage: " + sys.argv[0] + " orthomcl.out" 8 | sys.exit(1) 9 | 10 
| 11 | def plausi(): 12 | if len(sys.argv) != 2: usage() 13 | inFile = sys.argv[1] 14 | return inFile 15 | 16 | 17 | class OrthoCluster(): 18 | def __init__(self, line): 19 | descr, genedefs = line.split("\t") 20 | genedefs = genedefs.split() 21 | self.name = descr[:descr.index('(')].lower() 22 | self.geneHash = {} 23 | self.speciesHash = {} 24 | for genedef in genedefs: 25 | geneid = genedef[:genedef.index('(')] 26 | species = genedef[genedef.index('(')+1:-1] 27 | self.geneHash[geneid] = species 28 | if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid) 29 | else: self.speciesHash[species] = [geneid] 30 | 31 | def get_name(self): return self.name 32 | def get_count(self): return len(self.geneHash) 33 | def get_gene_hash(self): return self.geneHash 34 | def get_species_hash(self): return self.speciesHash 35 | 36 | 37 | def get_species_from_first_line(inFile): 38 | fo = open(inFile) 39 | line = fo.readline() 40 | o = OrthoCluster(line.rstrip()) 41 | fo.close() 42 | species = o.get_species_hash().keys() 43 | species.sort() 44 | return species 45 | 46 | 47 | def parse_orthocml_out(inFile): 48 | speciesList = get_species_from_first_line(inFile) 49 | print >> sys.stdout, "\t" + string.join(speciesList, "\t") 50 | fo = open(inFile) 51 | for line in fo: 52 | o = OrthoCluster(line.rstrip()) 53 | speciesHash = o.get_species_hash() 54 | sys.stdout.write(o.get_name()) 55 | for s in speciesList: 56 | count = 0 57 | if speciesHash.has_key(s): count = len(speciesHash[s]) 58 | sys.stdout.write("\t%s" % count) 59 | sys.stdout.write("\n") 60 | 61 | fo.close() 62 | 63 | 64 | def main(): 65 | inFile = plausi() 66 | parse_orthocml_out(inFile) 67 | 68 | 69 | 70 | main() 71 | -------------------------------------------------------------------------------- /python/orthomcl/speciesids4orthomcl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys, string 3 | 4 | 5 | def usage(): 6 | print >> 
sys.stderr, "usage: " + sys.argv[0] + " folder with genomes (*.fasta or *.fasta.gz)" 7 | sys.exit(1) 8 | 9 | 10 | def plausi(): 11 | if len(sys.argv) != 2: usage() 12 | inFolder = sys.argv[1] 13 | if not os.path.exists(inFolder) or not os.path.isdir(inFolder): 14 | print >> sys.stderr, "specified input folder does not exist or is not a directory\n" 15 | usage() 16 | if not inFolder.endswith('/'): inFolder += '/' 17 | return inFolder 18 | 19 | 20 | def iterate_folder(inFolder): 21 | inFiles = [] 22 | for fname in os.listdir(inFolder): 23 | if not fname.endswith('.fasta') and not fname.endswith('.fasta.gz'): continue 24 | inFiles.append(inFolder + fname) 25 | return inFiles 26 | 27 | 28 | def process_file(inFile): 29 | gzip = 0 30 | if inFile.endswith('.gz'): gzip = 1 31 | 32 | if gzip: 33 | ec = os.system('gunzip ' + inFile) 34 | inFile = os.path.splitext(inFile)[0] 35 | 36 | filename = os.path.split(inFile)[1] 37 | outName = os.path.splitext(filename)[0] 38 | 39 | sys.stdout.write(outName + ": ") 40 | 41 | ids = {} 42 | fo = open(inFile) 43 | for line in fo: 44 | if not line.startswith(">"): continue 45 | line = line.rstrip() 46 | id = line[1:] 47 | if id.count(" ") > 0: id = id[:id.index(" ")] 48 | ids[id] = 1 49 | 50 | sys.stdout.write( string.join(ids.keys(), " ") ) 51 | sys.stdout.write("\n") 52 | 53 | if gzip: ec = os.system('gzip ' + inFile) 54 | 55 | 56 | def main(): 57 | inFolder = plausi() 58 | inFiles = iterate_folder(inFolder) 59 | for inFile in inFiles: process_file(inFile) 60 | 61 | main() 62 | -------------------------------------------------------------------------------- /python/orthomcl/table-of-gene-id-per-cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys, string 3 | from low import * 4 | 5 | 6 | def usage(): 7 | print >> sys.stderr, "usage: ", sys.argv[0], " []" 8 | sys.exit(1) 9 | 10 | 11 | def plausi(): 12 | if len(sys.argv) < 2: usage() 13 | inOrtho = 
sys.argv[1] 14 | if not os.path.exists(inOrtho) or not os.path.isfile(inOrtho) or not os.path.getsize(inOrtho) > 0: 15 | print >> sys.stderr, "specified orthomcl file does not exist, is not a file, or is empty\n" 16 | usage() 17 | return inOrtho 18 | 19 | 20 | class OrthoCluster(): 21 | def __init__(self, line): 22 | descr, genedefs = line.split("\t") 23 | genedefs = genedefs.split() 24 | self.name = descr[:descr.index('(')].lower() 25 | self.geneHash = {} 26 | self.speciesHash = {} 27 | for genedef in genedefs: 28 | geneid = genedef[:genedef.index('(')] 29 | species = genedef[genedef.index('(')+1:-1].lower() 30 | self.geneHash[geneid] = species 31 | if self.speciesHash.has_key(species): self.speciesHash[species].append(geneid) 32 | else: self.speciesHash[species] = [geneid] 33 | 34 | def get_name(self): return self.name 35 | def get_count(self): return len(self.geneHash) 36 | def get_gene_hash(self): return self.geneHash 37 | def get_species_hash(self): return self.speciesHash 38 | 39 | 40 | 41 | def main(): 42 | inOrtho = plausi() 43 | fo = open(inOrtho) 44 | speciesCols = 0 45 | for line in fo: 46 | o = OrthoCluster(line.rstrip()) 47 | SH = o.get_species_hash() 48 | if not speciesCols: 49 | speciesCols = SH.keys() 50 | speciesCols.sort() 51 | print "OrthoMCL.ID" + "\t" + string.join(speciesCols, "\t") 52 | 53 | name = o.get_name() 54 | print name + "\t" + string.join( [SH[x][0] for x in speciesCols], "\t") 55 | 56 | fo.close() 57 | 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /python/orthomcl/tree-for-codeml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys 3 | import string 4 | 5 | 6 | def usage(): 7 | print >> sys.stderr, "usage: " + sys.argv[0] + " orthomcl.out base.tree" 8 | sys.exit(1) 9 | 10 | 11 | def plausi(): 12 | if len(sys.argv) != 3: usage() 13 | inOrtho, inTree = sys.argv[1:3] 14 | return inOrtho, inTree 15 | 16 | 17 
class OrthoCluster():
    """One cluster parsed from a single line of OrthoMCL output.

    The line is expected to look like::

        NAME(<n> genes,<m> taxa):<TAB> gene1(species) gene2(species) ...

    The cluster name (text before the first '(') is stored lower-cased.
    Every species code gets the suffix "1" appended; that suffixed form is
    the string parse_orthocml_out() later searches for in the base tree.
    """

    def __init__(self, line):
        """Parse `line`; raises ValueError if it does not contain exactly one tab."""
        descr, genedefs = line.split("\t")
        self.name = descr[:descr.index('(')].lower()
        self.geneHash = {}      # gene id -> suffixed species code
        self.speciesHash = {}   # suffixed species code -> list of gene ids
        for genedef in genedefs.split():
            paren = genedef.index('(')
            geneid = genedef[:paren]
            # text between '(' and the closing ')' is the species code
            species = genedef[paren + 1:-1] + "1"
            self.geneHash[geneid] = species
            self.speciesHash.setdefault(species, []).append(geneid)

    def get_name(self): return self.name
    def get_count(self): return len(self.geneHash)
    def get_gene_hash(self): return self.geneHash
    def get_species_hash(self): return self.speciesHash


def get_species_from_first_line(inFile):
    """Return the sorted list of (suffixed) species codes found in the first
    cluster line of `inFile`."""
    with open(inFile) as fo:
        line = fo.readline()
    o = OrthoCluster(line.rstrip())
    return sorted(o.get_species_hash())


def parse_orthocml_out(inFile, tree):
    """Write two branch-labelled tree files per cluster of `inFile`.

    For each cluster, the species that contributes more than one gene is
    looked up in `tree` (as "<species>1") and replaced by a two-leaf clade
    "(<species>1,<species>2)".  Two variants are written to the current
    directory: "<cluster>.tree.1" carries the " #1" label on the first copy,
    "<cluster>.tree.2" carries it on the second copy.

    Blank lines are ignored.  Clusters in which no species has more than one
    gene are skipped with a note on stderr; previously such clusters silently
    got an arbitrary species labelled.  If several species qualify, the
    alphabetically first one is used so the result does not depend on
    dictionary ordering.
    """
    with open(inFile) as fo:
        for line in fo:
            line = line.rstrip()
            if not line:
                continue
            o = OrthoCluster(line)
            name = o.get_name()
            duplicated = sorted(
                sp for sp, genelist in o.get_species_hash().items()
                if len(genelist) > 1
            )
            if not duplicated:
                sys.stderr.write(
                    "skipping cluster %s: no species with more than one gene\n" % name
                )
                continue

            species = duplicated[0]
            base = species[:-1]  # species code without the "1" suffix
            variants = (
                (".tree.1", '(' + base + '1 #1,' + base + '2)'),
                (".tree.2", '(' + base + '1,' + base + '2 #1)'),
            )
            for suffix, replacement in variants:
                with open(name + suffix, "w") as fw:
                    fw.write(tree.replace(species, replacement))


def read_tree_from_file(file):
    """Return the content of `file` as one string, with every line stripped of
    surrounding whitespace and the line breaks removed."""
    with open(file) as fo:
        return "".join(line.strip() for line in fo)


def main():
    """Read the command line, load the base tree and write the per-cluster trees."""
    inOrtho, inTree = plausi()
    tree = read_tree_from_file(inTree)
    parse_orthocml_out(inOrtho, tree)

| 84 | 85 | main() 86 | -------------------------------------------------------------------------------- /python/paml/get-paml-results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, written by myself 8 | import anydbm 9 | 10 | # ============================================================================= 11 | def show_help( ): 12 | """ displays the program parameter list and usage information """ 13 | stdout( "usage: " + sys.argv[0] + " -f " ) 14 | stdout( " " ) 15 | stdout( " option description" ) 16 | stdout( " -h help (this text here)" ) 17 | stdout( " -f nt alignment file" ) 18 | stdout( " " ) 19 | 20 | sys.exit(1) 21 | 22 | # ============================================================================= 23 | def handle_arguments(): 24 | """ verifies the presence of all necessary arguments and returns the data dir """ 25 | if len ( sys.argv ) == 1: 26 | stderr( "no arguments provided." ) 27 | show_help() 28 | 29 | try: # check for the right arguments 30 | keys, values = getopt.getopt( sys.argv[1:], "hf:t:p:" ) 31 | except getopt.GetoptError: 32 | stderr( "invalid arguments provided." ) 33 | show_help() 34 | 35 | args = {} 36 | for key, value in keys: 37 | if key == '-f': args['aln'] = value 38 | 39 | if not args.has_key('aln'): 40 | stderr( "aln file missing." ) 41 | show_help() 42 | if not file_exists( args.get('aln') ): 43 | stderr( "aln file does not exist." 
) 44 | show_help() 45 | 46 | return args 47 | 48 | # ============================================================================= 49 | def get_aln_length_from_file( filename ): 50 | fo = open( filename ) 51 | firstline = fo.readline() 52 | n, length = firstline.split() 53 | fo.close() 54 | return length 55 | 56 | # ============================================================================= 57 | def get_lnL_from_file( filename, model ): 58 | file = filename + '.paml.out.' + model 59 | np, lnL = None, None 60 | if not file_exists( file ): 61 | stderr( "File does not exist: %s" %file ) 62 | return np, lnL 63 | 64 | fo = open( file ) 65 | for line in fo: 66 | if line.startswith("lnL"): 67 | #print filename, model, line 68 | np = re.match("lnL\(.*\s+np:\s*(\d+)", line ).group(1) 69 | lnL = re.match("lnL\(.*\):\s+([0-9.-]+)", line ).group(1) 70 | break 71 | fo.close() 72 | return np, lnL 73 | 74 | # ============================================================================= 75 | # ============================================================================= 76 | def main( args ): 77 | 78 | models = ["M0", "M3K2", "M3K3", "M7", "M8", "Free"] 79 | filename = args.get('aln') 80 | 81 | line = [] 82 | line.append( filename ) 83 | length = get_aln_length_from_file( filename ) 84 | line.append( length ) 85 | for M in models: 86 | np, lnL = get_lnL_from_file( filename, M ) 87 | if np == None or lnL == None: 88 | stderr( "%s: None returned for model %s (%s/%s)" %( filename, M, np, lnL ) ) 89 | sys.exit(1) 90 | line.append( M ) 91 | line.append( np ) 92 | line.append( lnL ) 93 | print string.join(line,"\t") 94 | 95 | # ============================================================================= 96 | # === MAIN ==================================================================== 97 | # ============================================================================= 98 | 99 | args = handle_arguments( ) 100 | main( args ) 101 | 
-------------------------------------------------------------------------------- /python/paml/parse_codeml-modelA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys, re 3 | 4 | def usage(): 5 | print >> sys.stderr, "usage: " + sys.argv[0] + " folder (files end with *.MAalt)" 6 | sys.exit(1) 7 | 8 | 9 | def plausi(): 10 | if len(sys.argv) != 2: usage() 11 | inFolder = sys.argv[1] 12 | return inFolder 13 | 14 | 15 | def parse_from_file(inFile): 16 | basename = os.path.split(inFile)[1] 17 | fo = open(inFile) 18 | line = fo.readline().rstrip() 19 | while 1: 20 | if line.startswith("ns ="): 21 | print >> sys.stderr, inFile 22 | length = re.search("ls =\s+(\d+)", line).group(1) 23 | 24 | if not line.startswith("Bayes Empirical Bayes (BEB) analysis (Yang, Wong & Nielsen 2005. Mol. Biol. Evol. 22:1107-1118)"): 25 | line = fo.readline().rstrip() 26 | else: 27 | line = fo.readline().rstrip() # Positive sites for foreground lineages Prob(w>1): 28 | line = fo.readline().rstrip() 29 | if re.match("^$", line): 30 | sites = 0 31 | else: 32 | sites = 0 33 | while not re.match("^$", line): 34 | if line.endswith("*"): 35 | sites += 1 36 | print basename + "\t" + length + "\t" + line 37 | line = fo.readline().rstrip() 38 | break 39 | print >> sys.stderr, basename + "\t" + str(sites) 40 | fo.close() 41 | 42 | 43 | def parse_all_files(inFolder): 44 | for filename in os.listdir(inFolder): 45 | if not filename.endswith(".MAalt"): continue 46 | parse_from_file(inFolder + "/" + filename) 47 | 48 | 49 | def main(): 50 | inFolder = plausi() 51 | parse_all_files(inFolder) 52 | 53 | 54 | main() 55 | -------------------------------------------------------------------------------- /python/paml/parse_codeml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys, re 3 | 4 | MODELS = ["M0", "M7", "M8", "Free", "M1a", "M2a", "MT1", "MT2", "MT3", "MT4", 
"MT5", "MT6"] 5 | 6 | 7 | def usage(): 8 | print >> sys.stderr, "usage: " + sys.argv[0] + " folder" 9 | sys.exit(1) 10 | 11 | 12 | def plausi(): 13 | if len(sys.argv) != 2: usage() 14 | inFolder = sys.argv[1] 15 | return inFolder 16 | 17 | 18 | def get_all_base_files(inFolder): 19 | fileHash = {} 20 | for file in os.listdir(inFolder): 21 | filename = os.path.split(file)[1] 22 | basename = filename 23 | while basename.count('.') > 0: basename = os.path.splitext(basename)[0] 24 | fileHash[basename] = 1 25 | return fileHash.keys() 26 | 27 | 28 | def parse_all_from_basefile(file): 29 | filesToParse = [] 30 | for m in MODELS: filesToParse.append(file + ".codeml." + m) 31 | for f in filesToParse: 32 | if not os.path.exists(f) or not os.path.isfile(f): 33 | print >> sys.stderr, "bad stuff happening with file", file, "/", f 34 | return 35 | 36 | modelHash = {} 37 | for f in filesToParse: 38 | fo = open(f) 39 | for line in fo: 40 | if line.startswith("lnL("): 41 | np = re.match("lnL.*\s+np:\s*(\d+)", line ).group(1) 42 | lnL = re.match("lnL\(.*\):\s+([0-9.-]+)", line ).group(1) 43 | break 44 | modelHash[ os.path.splitext(f)[1][1:] ] = [lnL, np] 45 | fo.close() 46 | 47 | sys.stdout.write(file) 48 | for m in MODELS: 49 | sys.stdout.write("\t" + m + ":" + modelHash[m][0] + "," + modelHash[m][1]) 50 | sys.stdout.write("\n") 51 | 52 | 53 | def main(): 54 | inFolder = plausi() 55 | basefiles = get_all_base_files(inFolder) 56 | for basefile in basefiles: 57 | parse_all_from_basefile(basefile) 58 | 59 | 60 | main() 61 | -------------------------------------------------------------------------------- /python/phylip/create-distance-matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | from low import * # custom functions, 
written by myself 8 | from goterm import GOTerm 9 | from collections import defaultdict 10 | 11 | 12 | # ============================================================================= 13 | def show_help( ): 14 | """ displays the program parameter list and usage information """ 15 | stdout( "usage: " + sys.argv[0] + " -f -i -n" ) 16 | stdout( " " ) 17 | stdout( " option description" ) 18 | stdout( " -h help (this text here)" ) 19 | stdout( " -f input file" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." ) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | 40 | if not args.has_key('file'): 41 | stderr( "input file argument missing." ) 42 | show_help() 43 | elif not file_exists( args.get('file') ): 44 | stderr( "input file does not exist." 
) 45 | show_help() 46 | 47 | return args 48 | 49 | 50 | # ============================================================================= 51 | def read_input(file): 52 | hash = {} 53 | speciesarray = [] 54 | fo = open(file) 55 | for line in fo: 56 | line = line.rstrip() 57 | pair, rate = line.split("\t") 58 | rate = str(round(1-float(rate),4)) 59 | while len(rate) < 6: rate += "0" 60 | hash[pair] = rate 61 | speciesarray.extend(pair.split(",")) 62 | fo.close() 63 | speciesarray = list(set(speciesarray)) 64 | speciesarray.sort() 65 | return speciesarray, hash 66 | 67 | 68 | # ============================================================================= 69 | # === MAIN ==================================================================== 70 | # ============================================================================= 71 | def main( args ): 72 | speciesarray, hash = read_input(args['file']) 73 | print "\t" + str(len(speciesarray)+1) 74 | print "outgroup " + "0.0000" + "\t" + string.join(["1.0000"]*len(speciesarray), "\t") 75 | for sp1 in speciesarray: 76 | line = sp1 77 | while len(line) < 10: line += " " 78 | line += "1.0000" 79 | for sp2 in speciesarray: 80 | key = [sp1,sp2] 81 | key.sort() 82 | key = string.join(key, ",") 83 | if sp1 == sp2: line += "\t" + "0.0000" 84 | else: line += "\t" + hash[key] 85 | print line 86 | 87 | # ============================================================================= 88 | args = handle_arguments() 89 | main( args ) 90 | 91 | -------------------------------------------------------------------------------- /python/sciroko/import-into-sqlite3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os, sys # low level handling, such as command line stuff 4 | import string # string methods available 5 | import re # regular expressions 6 | import getopt # comand line argument handling 7 | import sqlite3 8 | from low import * # custom functions, written by myself 9 | 
from sciroko import SSR 10 | from collections import defaultdict 11 | 12 | 13 | # ============================================================================= 14 | def show_help( ): 15 | """ displays the program parameter list and usage information """ 16 | stdout( "usage: " + sys.argv[0] + " -f " ) 17 | stdout( " " ) 18 | stdout( " option description" ) 19 | stdout( " -h help (this text here)" ) 20 | stdout( " -f sciroko output file" ) 21 | stdout( " -d db file" ) 22 | stdout( " " ) 23 | sys.exit(1) 24 | 25 | # ============================================================================= 26 | def handle_arguments(): 27 | """ verifies the presence of all necessary arguments and returns the data dir """ 28 | if len ( sys.argv ) == 1: 29 | stderr( "no arguments provided." ) 30 | show_help() 31 | 32 | try: # check for the right arguments 33 | keys, values = getopt.getopt( sys.argv[1:], "hf:d:" ) 34 | except getopt.GetoptError: 35 | stderr( "invalid arguments provided." ) 36 | show_help() 37 | 38 | args = {} 39 | for key, value in keys: 40 | if key == '-f': args['file'] = value 41 | if key == '-d': args['db'] = value 42 | 43 | if not args.has_key('file'): 44 | stderr( "sciroko file argument missing." ) 45 | show_help() 46 | elif not file_exists( args.get('file') ): 47 | stderr( "sciroko file does not exist." 
) 48 | show_help() 49 | 50 | return args 51 | 52 | # ============================================================================= 53 | def init_db(conn): 54 | conn.execute("CREATE TABLE IF NOT EXISTS ssrs(id INTEGER PRIMARY KEY ASC, organism VARCHAR(50), chr VARCHAR(50), startpos INTEGER, endpos INTEGER, motif VARCHAR(10), motif_std VARCHAR(10), length INTEGER, score INTEGER, mismatches INTEGER, binpos INTEGER, seq VARCHAR(255))") 55 | 56 | 57 | # ============================================================================= 58 | # === MAIN ==================================================================== 59 | # ============================================================================= 60 | def main( args ): 61 | 62 | conn = sqlite3.connect(args['db']) 63 | init_db(conn) 64 | 65 | fo = open(args['file']) 66 | for line in fo: 67 | if not line.startswith("RAL"): continue 68 | m = SSR(line) 69 | sql = "INSERT INTO ssrs(organism, chr, startpos, endpos, motif, motif_std, length, score, mismatches, binpos, seq) VALUES (\'%s\', \'%s\', %s, %s, \'%s\', \'%s\', %s, %s, %s, %s, \'%s\')" %(m.organism, m.chromosome, m.startpos, m.endpos, m.motif, m.motif_std, m.length, m.score, m.mismatches, m.megabase, m.seq) 70 | conn.execute(sql) 71 | res = conn.execute("SELECT COUNT(*) FROM ssrs") 72 | entries = res.fetchall()[0][0] 73 | print "done. 
entries added:", entries 74 | conn.commit() 75 | conn.close() 76 | 77 | 78 | # ============================================================================= 79 | args = handle_arguments() 80 | main( args ) 81 | 82 | -------------------------------------------------------------------------------- /python/swapsc/swapsee-table-annotation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, sys # low level handling, such as command line stuff 3 | import getopt # comand line argument handling 4 | import anydbm # index databases (file hash) 5 | from low import * # collection of generic self-defined functions 6 | 7 | 8 | # ============================================================================= 9 | def show_help( ): 10 | """ displays the program parameter list and usage information """ 11 | stdout( "usage: " + sys.argv[0] + " -f -e [-i ]" ) 12 | stdout( " " ) 13 | stdout( " option description" ) 14 | stdout( " -h help (this text here)" ) 15 | stdout( " -f table file" ) 16 | stdout( " -i table column index containing the lookup name [default: 0]" ) 17 | stdout( " -c annotation file column to use [default: all]" ) 18 | stdout( " -l annotation file line(s) to use [default: first]" ) 19 | stdout( " -e file extension to look for (= lookupname.extension)" ) 20 | stdout( " " ) 21 | sys.exit(1) 22 | 23 | # ============================================================================= 24 | def handle_arguments(): 25 | """ verifies the presence of all necessary arguments and returns the data dir """ 26 | if len ( sys.argv ) == 1: 27 | stderr( "no arguments provided." ) 28 | show_help() 29 | 30 | try: # check for the right arguments 31 | keys, values = getopt.getopt( sys.argv[1:], "hf:i:e:" ) 32 | except getopt.GetoptError: 33 | stderr( "invalid arguments provided." 
) 34 | show_help() 35 | 36 | args = {} 37 | for key, value in keys: 38 | if key == '-f': args['file'] = value 39 | if key == '-i': args['col'] = int( value ) 40 | if key == '-e': args['ext'] = value 41 | 42 | if not args.has_key('file'): 43 | stderr( "table file missing." ) 44 | show_help() 45 | if not file_exists( args.get('file') ): 46 | stderr( "table file does not exist." ) 47 | show_help() 48 | 49 | if not args.has_key('col'): 50 | args['col'] = 0 51 | 52 | if not args.has_key('ext'): 53 | stderr( "file extension missing." ) 54 | show_help() 55 | 56 | return args 57 | 58 | 59 | # ============================================================================= 60 | # ============================================================================= 61 | def main( args ): 62 | fo = open( args.get('file') ) 63 | for line in fo: 64 | line = line.rstrip() 65 | columns = line.split("\t") 66 | lookup = columns[ args.get('col') ] 67 | lookupfile = lookup + args['ext'] 68 | if file_exists( lookupfile): 69 | ft = open( lookupfile ) 70 | lines = ft.readlines() 71 | ft.close() 72 | # TODO: 73 | # get lines, get column 74 | # then add to table 75 | # print the new line 76 | 77 | fo.close() 78 | 79 | # ============================================================================= 80 | # === MAIN ==================================================================== 81 | # ============================================================================= 82 | args = handle_arguments( ) 83 | main( args ) 84 | 85 | -------------------------------------------------------------------------------- /ruby/geneontology/go-eval.rb: -------------------------------------------------------------------------------- 1 | #/usr/bin/ruby 2 | =begin 3 | =end 4 | 5 | class GOterm 6 | attr_accessor :id, :name, :namespace, :parents 7 | def initialize 8 | @parents = Array.new 9 | end 10 | end 11 | 12 | def load_obo_definition(file) 13 | goterm = Hash.new 14 | obofile = File.open(file) 15 | while line = 
obofile.gets.chomp 16 | if line =~ /^\[Term\]/ 17 | g = GOterm.new 18 | elsif line =~ /^id:/ 19 | g.id = line.scan(/^id:\s+(GO:\d+)/).first.first 20 | elsif line =~ /^name:/ 21 | g.name = line.scan(/^name:\s+(.*)$/).first.first 22 | elsif line =~ /^namespace:/ 23 | g.namespace = line.scan(/^namespace:\s+(\S+)$/).first.first 24 | elsif line =~ /^is_a:/ 25 | g.parents << line.scan(/^is_a:\s+(GO:\d+)/).first.first 26 | elsif line =~ /^$/ 27 | goterm[g.id] = g 28 | end 29 | end 30 | return goterm 31 | end 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /ruby/generic/wordwrap.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require 'ftools' 3 | 4 | def exit_with_usage 5 | STDOUT.print """ 6 | #{$0} [] 7 | 8 | this script inserts newlines in front of all words that would exceed 9 | a given threshold for max line length. 10 | default max length: 80 11 | 12 | """ 13 | exit(1) 14 | end 15 | 16 | exit_with_usage unless ARGV.length > 0 17 | exit_with_usage unless File.exists? 
ARGV[0] 18 | MAXLENGTH = (ARGV[1] || 80).to_i 19 | 20 | STDERR.puts "INPUT FILE:\t%s" % ARGV[0] 21 | STDERR.puts "MAXLENGTH:\t%s" % MAXLENGTH 22 | 23 | f = File.open(ARGV[0]) 24 | while line = f.gets 25 | if line.length < 80 26 | STDOUT.print line 27 | else 28 | words = line.chomp.split 29 | first = line[0..0] 30 | pos = 0 31 | newline = Array.new 32 | words.each do |word| 33 | newline << word 34 | if newline.join(" ").length > MAXLENGTH 35 | newline[-1] = "\n" 36 | STDOUT.print newline.join(" ") 37 | if first == "#" or first == "%" 38 | newline = [first, word] 39 | else 40 | newline = [word] 41 | end 42 | end 43 | end 44 | STDOUT.puts newline.join(" ").chomp 45 | end 46 | end 47 | f.close 48 | 49 | STDERR.puts "STATUS: \tdone.\n" 50 | -------------------------------------------------------------------------------- /ruby/pfam/hmmout_annotation.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'optparse' 4 | 5 | PFAMFILE = "/global/databases/pfam/current/pfam_scan_db/Pfam-A.hmm" 6 | 7 | class String 8 | def valid_float? 9 | # The double negation turns this into an actual boolean true - if you're 10 | # okay with "truthy" values (like 0.0), you can remove it. 11 | !!Float(self) rescue false 12 | end 13 | end 14 | 15 | 16 | # ============================================================================= 17 | def get_opt 18 | options = {} 19 | optparse = OptionParser.new do |opts| 20 | opts.banner = "Usage: #{$0} -f -c " 21 | opts.on( '-f FILE or DIR', 'single hmmout file (pfam_scan output with first column = protein length), or a directory where all *.hmmout files will be processed' 22 | ){|file| options[:hmmfile] = file} 23 | opts.on( '-c CUTOFF', '[evalueFloat|GA|TC|NC]' 24 | ){|v| options[:cutoff] = v} 25 | end 26 | begin 27 | optparse.parse! 28 | mandatory = [:hmmfile, :cutoff] 29 | missing = mandatory.select{|param| options[param].nil?} 30 | if not missing.empty? 
31 | puts "Missing options: #{missing.join(', ')}" 32 | puts optparse 33 | exit 34 | end 35 | rescue OptionParser::InvalidOption, OptionParser::MissingArgument 36 | puts $!.to_s 37 | puts optparse 38 | exit 39 | end 40 | return options 41 | end 42 | 43 | def get_cutoffs(file=PFAMFILE) 44 | cutoffHash = Hash.new 45 | capture = %w( NAME GA NC TC ) 46 | @name = nil 47 | reader = File.open(file, 'r') 48 | while (line = reader.gets) 49 | entry = {} if line[0,6] == 'HMMER3' 50 | capture.each{|e| entry[e] = line.split[1] if line[0,e.length] == e } 51 | if line[0,2] == "//" 52 | if entry.length != capture.count 53 | STDERR.puts "FATAL ERROR: not all required fields found for an entry: #{entry.inspect}" 54 | next 55 | end 56 | cutoffHash[entry['NAME']] = entry 57 | end 58 | end 59 | return cutoffHash 60 | end 61 | 62 | 63 | # ============================================================================== 64 | def filter_hmmout(file, cutoff) 65 | fw = File.open(file + "." + cutoff, 'w') 66 | f = File.open(file, 'r') 67 | if cutoff.valid_float? # e-value cutoff given 68 | e = cutoff.to_f 69 | f.each{|line| cols = line.chomp.split; fw.puts cols.join("\t") if cols[13].to_f < e} 70 | else 71 | e = cutoff if ['GA', 'TC', 'NC'].include?(cutoff) 72 | abort("invalid value given for cutoff method (#{cutoff}). allowed values are GA, NC, and TC.") if e.nil? 
73 | cutoffHash = get_cutoffs() 74 | puts "--- cutoffHash: #{cutoffHash.count} ---" 75 | f.each{|line| 76 | cols = line.chomp.split; 77 | name, bitscore = cols[7], cols[12].to_f 78 | puts name, cutoffHash[name] 79 | next unless name[0,6] == 'Pfam-B' or bitscore > cutoffHash[name][e].to_f 80 | fw.puts cols.join("\t") 81 | } 82 | end 83 | f.close 84 | fw.close 85 | end 86 | 87 | 88 | # ============================================================================== 89 | # =MAIN========================================================================= 90 | # ============================================================================== 91 | 92 | options = get_opt() 93 | unless File.directory?(options[:hmmfile]) 94 | filter_hmmout(options[:hmmfile], options[:cutoff]) 95 | else 96 | Dir.glob(options[:hmmfile] + '/*.hmmout').each{|hmmfile| filter_hmmout(hmmfile, options[:cutoff])} 97 | end 98 | 99 | -------------------------------------------------------------------------------- /ruby/pfam/length2hmmout.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -w 2 | HEADER = /^>(\S+)\s?.?/ 3 | unless (ARGV.size == 2) 4 | puts "Usage: #{$0} fasta hmmout [NOTE: will change input hmmout!]" 5 | exit 6 | end 7 | lengths = Hash.new 8 | seq = String.new 9 | pid = nil 10 | f = File.open(ARGV[0], "r") 11 | c = 0 12 | while(line = f.gets) 13 | line.chomp! 14 | if (m = HEADER.match(line)) 15 | lengths[pid] = seq.length.to_s unless (pid.nil?) 16 | pid = m[1] 17 | seq = String.new 18 | c += 1 19 | STDERR.print "\r*** Reading fasta entries: #{c}... " 20 | next 21 | end 22 | seq += line 23 | end 24 | lengths[pid] = seq.length unless (pid.nil?) 25 | f.close 26 | STDERR.puts "done." 
# A stretch of sequence (start..stop) assigned to a category. Instances
# compare by their start coordinate and remember whether they have already
# been placed on a track.
class SwapscFeature

  attr_accessor :start, :stop, :category

  # start, stop:: boundaries of the stretch
  # category::    key used to look up this feature's settings in $categories
  def initialize(start,stop,category)
    @start, @stop, @category = start, stop, category
    @added = false
  end

  # Orders features by their start coordinate.
  def <=>(other)
    start <=> other.start
  end

  # True once add_to_track has been called on this feature.
  def added?
    @added
  end

  # Adds this feature to +track+ in the colour configured for its category,
  # adds its length (stop - start + 1) to the category's :stats and
  # :branchstats counters in $categories, and marks the feature as added.
  def add_to_track(track)
    settings = $categories[@category]
    location = '%s..%s' % [ @start, @stop ]
    track.add_feature( Bio::Feature.new(@category, location), :colour => settings[:color] )
    span = @stop - @start + 1
    settings[:stats] += span
    settings[:branchstats] += span
    @added = true
  end
end
# Print the usage text and stop unless BOTH arguments are available: ARGV[0]
# (the input flatfile) is opened below and ARGV[1] is handed to panel.draw.
# ARGV[0] may have been reset to nil by the file-existence check that precedes
# this point in the script.
unless ARGV[0] and ARGV[1]
  puts "generates a diagram of where in the sequence accelerated evolution / positive selection / negative selection took place\n"
  puts "expected format [tab-delimited]:"
  puts "PANEL length"
  puts "TRACK name label"
  puts "FEATURE range color [label]"
  puts "usage: visualize-swapsc.rb <flatfile> <outfile>\n"
  exit 1
end

# === MAIN ====================================================================
# =============================================================================

# Read the flatfile line by line and build the plot as we go:
#   PANEL    creates the panel (width 800, PNG output)
#   TRACK    adds a track to the panel; its label is shown only if the third
#            column is the literal string "true"
#   FEATURE  adds a feature to the most recently added track; the colour is a
#            comma-separated list of numbers, the label column is optional
panel = nil
track = nil

# Block form closes the file even if a line cannot be processed.
File.open( ARGV[0], "r" ) do |f|
  f.each_line do |line|
    cols = line.chomp.split("\t")
    case cols[0]
    when "PANEL"
      panel = Bio::Graphics::Panel.new( cols[1].to_i, :width => 800, :format => :png )
    when "TRACK"
      abort("error: TRACK line found before any PANEL line.") if panel.nil?
      name, label = cols[1], cols[2]
      track = panel.add_track(name, :label => (label == "true"))
    when "FEATURE"
      abort("error: FEATURE line found before any TRACK line.") if track.nil?
      range, color, label = cols[1], cols[2], cols[3]
      if color.nil?
        STDERR.puts "skipping malformed FEATURE line: \"#{line.chomp}\""
        next
      end
      rgb = color.split(',').collect{|c| c.to_f}
      feature = Bio::Feature.new("feature", range)
      # Only a line with exactly four columns carries a label.
      if cols.length == 4
        track.add_feature( feature, :colour => rgb, :label => label )
      else
        track.add_feature( feature, :colour => rgb )
      end
    else
      # Blank lines yield no columns and are skipped silently.
      STDERR.puts "unknown line descriptor \"#{cols[0]}\"" unless cols[0].nil?
    end
  end
end

abort("error: no PANEL line found in #{ARGV[0]}; nothing to draw.") if panel.nil?
panel.draw(ARGV[1])