├── .gitignore ├── testProtein.py ├── timetest.py ├── aminoAcids.txt ├── README ├── testBinding.py ├── dotPlot.py ├── drawPopulation.py ├── chemistry.py ├── testCell.py ├── solutions.py ├── evolveCell.py ├── analysis.py ├── graphDrawer.py ├── biochemistry.py └── analyseGenomes.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | Genomes/ 4 | Images/ 5 | -------------------------------------------------------------------------------- /testProtein.py: -------------------------------------------------------------------------------- 1 | import biochemistry 2 | 3 | p = biochemistry.Protein('QNLTTTTTTL', 'solution') 4 | p.output() 5 | -------------------------------------------------------------------------------- /timetest.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | t = timeit.Timer('x = virtualCell.Solution(1000) \nx.setMetabolites("default")', 'import virtualCell') 4 | 5 | test_result = t.repeat(3, 100000) 6 | print min(test_result) -------------------------------------------------------------------------------- /aminoAcids.txt: -------------------------------------------------------------------------------- 1 | L 0, 0, 0, 0 2 | M 2,-2,-2,-4 3 | N -2, 2,-4,-2 4 | O -2,-4, 2,-2 5 | P -4,-2,-2, 2 6 | Q 2,-1, 1,-1 7 | R -1, 2,-1, 1 8 | S 1,-1, 2,-1 9 | T -1, 1,-1, 2 10 | U 2, 1, 0,-4 11 | V 1, 2,-4, 0 12 | W 0,-4, 2, 1 13 | X -4, 0, 1, 2 14 | Y 2,-3, 2,-3 15 | Z -3, 2,-3, 2 -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Python simulation of a single-celled organism 2 | Peter Collingridge 2010 3 | 4 | > testCell.py 5 | This is the top-level module 6 | * Gives an example of how to create and update cells 7 | * Runs simulation for 48000 units of time 8 | 9 | > virtualCell.py 10 | * Defines Solution and Cell classes 11 
| * Imports 12 | 13 | > biochemistry.py 14 | * Contains all_metabolites list and all_reactions dictionary 15 | * Defines Metabolite, Reaction and Protein classes 16 | 17 | > graphDrawer.py 18 | * Takes multiple time series and plots them as an SVG 19 | -------------------------------------------------------------------------------- /testBinding.py: -------------------------------------------------------------------------------- 1 | import virtualCell 2 | import drawSVGGraph 3 | 4 | DNA = 'AADADD' # Promoter 5 | DNA += 'BB' # Bind DNA 6 | DNA += 'AA BBBABBBA AA' # With QPQP 7 | DNA += 'BA' # Ribosome 8 | DNA += 'ACAA' # ATPase 9 | DNA += 'DDAAAA' # End 10 | 11 | DNA += 'ADDADD' # Promoter 12 | DNA += 'BB' # Bind DNA 13 | DNA += 'AA BBBBBBBA AA' # With QQQP 14 | DNA += 'DDAAAA' # End 15 | 16 | solution = virtualCell.Solution(10000.0) 17 | cell = solution.addCell(1000.0) 18 | cell.metabolites['EH'].amount += 80 # Add ATP 19 | cell.metabolites['JG'].amount += 80 # Add Amino acids 20 | 21 | cell.addDNA(DNA) 22 | cell.addProtein('QLQPQPLPNL', 2.0) 23 | 24 | #print "\n -Proteins-" 25 | #cell.output('proteins') 26 | #cell.output('metabolites') 27 | 28 | sim_time = 5001 29 | data_collection_functions = {\ 30 | #'ribosome': (lambda cell: cell.proteins['QLQPQPLPNL'].amount), 31 | #'repressor': (lambda cell: cell.proteins['QLQQQPL'].amount)}#, 32 | 'tf on tf': (lambda cell: cell.proteins['QLQPQPLPNL'].binding_domains[0].targets[cell.genes[0]][1]), 33 | 'tf on in': (lambda cell: cell.proteins['QLQPQPLPNL'].binding_domains[0].targets[cell.genes[1]][1]), 34 | 'in on tf': (lambda cell: cell.proteins['QLQQQPL'].binding_domains[0].targets[cell.genes[0]][1]), 35 | 'in on in': (lambda cell: cell.proteins['QLQQQPL'].binding_domains[0].targets[cell.genes[1]][1])} 36 | #'ribosome occupancy': (lambda cell: cell.genes[0].occupancy, []))} 37 | 38 | data_collection = dict([(key, []) for key in data_collection_functions.keys()]) 39 | 40 | for t in range(sim_time): 41 | 
cell.metabolites['EH'].amount = 80 42 | solution.update() 43 | for d in data_collection.keys(): 44 | data_collection[d].append(data_collection_functions[d](cell)) 45 | 46 | for k, v in data_collection.items(): 47 | print k, v[-1] 48 | 49 | print "\n -Proteins-" 50 | cell.output('proteins') 51 | cell.output('metabolites') 52 | 53 | g = drawSVGGraph.Graph() 54 | g.x_axis_label = "Time" 55 | g.data = data_collection 56 | print g.data.keys() 57 | 58 | g.outputSVG('test', width=400, height=300) -------------------------------------------------------------------------------- /dotPlot.py: -------------------------------------------------------------------------------- 1 | def initiliseSVG(filename, (width, height)): 2 | svg = open(filename + '.svg', 'w') 3 | svg.write('\n') 4 | svg.write("""""") 5 | svg.write('\n 9 | 23 | """) 24 | 25 | return svg 26 | 27 | def plot_dot_plot(DNA, dot_colours=None): 28 | for i in range(num_NTs): 29 | for j in range(i, num_NTs): 30 | 31 | if DNA[i] == DNA[j]: 32 | svg.write('\n' % dot_colours[DNA[i]]) 36 | else: 37 | svg.write('/>\n') 38 | svg.write('') 39 | 40 | def plot_dot_lines(DNA): 41 | for i in range(num_NTs-2): 42 | line_length = 0 43 | 44 | for j in range(1, num_NTs-i): 45 | if DNA[j] == DNA[j+i]: 46 | line_length += 1 47 | else: 48 | if line_length > 5: 49 | print i, j, line_length 50 | (x, y) = (scale * (j-line_length), scale * (j+i-line_length)) 51 | svg.write('\n' % (scale*j, scale*(j+i), x, y)) 52 | line_length = 0 53 | svg.write('') 54 | 55 | DNA = 'BACBCCDBCBCAADADDAABABDBBADDCCAAADADBACCBCADCCDDAABCABDDAABABDBCAACCACACABDCAADDAABABDBBADCBDDDAABACBCCDBCBCAADADDAABABDBBADDADDABADCABCBDDAABCABDDAABABDBCAADDCDCDDDDAABABDBBADBACDDAABACBCCDBCBCAADADDAABABDBBADDACDAADBAAAADDDAABCABDDAABABDBCAAAAACCDCABDDAABABDBCAAADDDDDABCCAACBADCDADDAABABDBBADCBCDDAABABDBBADDCDABBDDAABABDBCAAACACCDCCDCBAB' 56 | num_NTs = len(DNA) 57 | scale = 1 58 | 59 | svg = initiliseSVG('test', (scale*num_NTs+2, scale*num_NTs+2)) 60 | dot_colours = {'A': 
'red', 'B': 'blue', 'C': 'green', 'D': 'black'} 61 | 62 | #plot_dot_plot(DNA) 63 | plot_dot_lines(DNA) 64 | -------------------------------------------------------------------------------- /drawPopulation.py: -------------------------------------------------------------------------------- 1 | class PopulationDiagram(): 2 | def __init__(self, genomes): 3 | self.genomes = genomes 4 | 5 | self.cols = 8 6 | self.rows = 16 7 | self.max_radius = 16 8 | 9 | def findRanges(self): 10 | fitness = [] 11 | gene_numbers = [] 12 | 13 | for g in self.genomes: 14 | fitness.append(g.fitness) 15 | gene_numbers.append(len(g.genes)) 16 | 17 | self.min_gene_number = min(gene_numbers) 18 | self.max_gene_number = max(gene_numbers) 19 | self.gene_number_range = self.max_gene_number - self.min_gene_number 20 | self.max_fitness = max(fitness) 21 | 22 | def plotPopulation(self): 23 | self.findRanges() 24 | self.circles = [] 25 | 26 | (x, y) = (self.max_radius, self.max_radius) 27 | column = 1 28 | row_height = self.max_radius 29 | 30 | for g in self.genomes: 31 | size = int(self.max_radius * g.fitness/self.max_fitness) 32 | if g.colour == None: 33 | colour = (0, 0, int(255 * (len(g.genes) - self.min_gene_number)/self.gene_number_range)) 34 | else: 35 | colour = g.colour 36 | 37 | self.circles.append((x, y, size, colour)) 38 | x += self.max_radius*2 39 | column += 1 40 | 41 | if column > self.cols: 42 | column = 1 43 | x = self.max_radius 44 | y += row_height + size 45 | row_height = size 46 | 47 | def outputPlot(self, filename): 48 | (width, height) = (10+self.max_radius*2*8, 10+self.max_radius*2*16) 49 | svg = open(filename + '.svg', 'w') 50 | svg.write('\n') 51 | svg.write("""""") 52 | svg.write('\n 55 | 66 | """) 67 | 68 | for c in self.circles: 69 | svg.write('\n' % (c[3][0], c[3][1], c[3][2])) 71 | svg.write('') 72 | 73 | -------------------------------------------------------------------------------- /chemistry.py: 
class Chemistry:
    """Container for all potential chemicals and the reactions between them.

    Attributes:
        chemicals: names of every element and molecule, in the order added.
        masses: element name -> mass.
        charges: element name -> charge.
        stabilities: chemical name -> stability score; addReaction derives
            the rate constants of a reaction from these scores.
        reactions: Reaction objects in the order they were added.
    """

    def __init__(self):
        self.chemicals = []
        self.masses = {}
        self.charges = {}
        self.stabilities = {}
        self.reactions = []

    def addElements(self, names, masses, charges):
        """Register elements together with their masses and charges.

        Args:
            names: element names.
            masses: one mass per name, in the same order.
            charges: one (non-zero) charge per name, in the same order.

        Raises:
            ValueError: if masses or charges has fewer entries than names.
        """
        # Validate before mutating so a short list cannot leave the
        # container half-filled.
        if len(masses) < len(names) or len(charges) < len(names):
            raise ValueError("every element needs both a mass and a charge")

        for name, mass, charge in zip(names, masses, charges):
            self.chemicals.append(name)
            self.masses[name] = mass
            self.charges[name] = charge
            # 16.0 (not 16) forces true division, so integer masses and
            # charges are not truncated under Python 2.
            self.stabilities[name] = 16.0 * mass / charge ** 2

    def addMolecules(self, molecules):
        """Register two-element molecules.

        Each molecule is a string whose first two characters name elements
        that were already registered with addElements.
        """
        for molecule in molecules:
            first, second = molecule[0], molecule[1]
            self.chemicals.append(molecule)
            self.stabilities[molecule] = (self.masses[first] * self.masses[second]
                                          * self.charges[first] * self.charges[second])

    def addReaction(self, substrates, products):
        """Append a Reaction whose rate constants come from stabilities.

        The forward constant is 2.4 divided by the summed stability of the
        substrates; the reverse constant uses the products instead.
        """
        k1 = 2.4 / sum(self.stabilities[s] for s in substrates)
        k2 = 2.4 / sum(self.stabilities[p] for p in products)
        self.reactions.append(Reaction(substrates, products, k1, k2))


class Reaction:
    """A reversible reaction: substrates <-> products with rates k1 / k2."""

    def __init__(self, substrates, products, k1, k2):
        self.substrates = substrates
        self.products = products
        self.k1 = k1
        self.k2 = k2

    def __repr__(self):
        return "Reaction(%r, %r, %r, %r)" % (
            self.substrates, self.products, self.k1, self.k2)


def defineMetabolitesAndReactions():
    """Build the fixed chemistry used by the simulation.

    Returns:
        A (chemicals, reactions) tuple. chemicals is the list of the 8
        element names followed by the 12 molecule names; reactions is the
        list of 27 Reaction objects (12 hydrolysis/synthesis, 12
        transferase and 3 double transferase reactions, in that order).
    """
    c = Chemistry()

    # Define elements
    elements = ['E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']
    masses = [1.0, 2.0, 3.0, 4.0, 2.5, 5.0, 7.5, 10.0]
    charges = [1, 2, 2, 1, 1, 2, 3.2, 1.6]
    c.addElements(elements, masses, charges)

    # Define molecular species: EH, EL, FG, FK, IH, IL, JG, JK ...
    molecules = [elements[x * 4 + y] + elements[z * 4 - y + 3]
                 for x in (0, 1) for y in (0, 1) for z in (0, 1)]
    # ... followed by FF, FJ, JF, JJ
    molecules += [elements[x] + elements[y] for x in (1, 5) for y in (1, 5)]
    c.addMolecules(molecules)

    # Define 12 hydrolysis/synthesis reactions (XY <-> X + Y)
    for m in molecules:
        c.addReaction([m], [m[0], m[1]])

    # Define 12 transferase reactions
    for x, y in zip((0, 2, 4, 6), (7, 6, 7, 6)):
        c.addReaction([molecules[x], elements[y]], [molecules[x + 1], elements[y - 4]])
    for x, y in zip((0, 1, 2, 3), (4, 4, 5, 5)):
        c.addReaction([molecules[x], elements[y]], [molecules[x + 4], elements[y - 4]])
    for x, y in zip((9, 10, 11, 11), (8, 8, 9, 10)):
        c.addReaction([molecules[x], elements[1]], [molecules[y], elements[5]])

    # Define 3 double transferase reactions
    for x, y in [(0, 5), (2, 7), (8, 11)]:
        c.addReaction([molecules[x], molecules[y]], [molecules[x + 1], molecules[y - 1]])

    return elements + molecules, c.reactions
'AAAAADBBAABBBBBBBAAABAACAADDAAAAAADAADAAACADAADDAAAAAADAADAAACBDAADDAAAAAADAADAABCADAADDAAAAAADAADAAADAAABBBADDDAAAAAADAADAABAAAABBDAADDAAAAAADAADACACDDAAAAAADAADACADDDAAAAAADAADAAACAAACAADDAAAAABACDAADCCCCBDBCCACDBCBBDBCCCAACADBBAABACDDCDACABCABCCBCA' 31 | 32 | cell.addDNA(DNA) 33 | 34 | for seq in cell.proteins: 35 | cell.proteins[seq].amount += 0.25 36 | cell.proteins['QLQQQPLPNL'].amount += 0.75 37 | cell.output() 38 | 39 | # Data recording options 40 | data_collection_functions = {\ 41 | '[JG]': (lambda cell: cell.metabolites['JG'].amount), 42 | '[EH]': (lambda cell: cell.metabolites['EH'].amount), 43 | '[tf]': (lambda cell: 10*cell.proteins['QLQQQPLPNL'].amount)} 44 | #'tf on tf': (lambda cell: cell.proteins['QLQPQPLPNL'].binding_domains[0].targets[cell.genes[0]][1])} 45 | data_collection = dict([(key, []) for key in data_collection_functions.keys()]) 46 | 47 | # Run Simulation 48 | run_time = 100 49 | for t in range(run_time): 50 | #cell.metabolites['EH'].amount = 80 # Keep ATP constant 51 | #cell.metabolites['JG'].amount = 40 # Keep amino acids constant 52 | solution.update() 53 | 54 | for d in data_collection.keys(): 55 | data_collection[d].append(data_collection_functions[d](cell)) 56 | 57 | # Output 58 | print "\n\t-Solution-\t-Cell-" 59 | metabolites = solution.metabolites.keys() 60 | metabolites.sort() 61 | 62 | for m in metabolites: 63 | print '%s\t%.3f%%\t\t%2.3f%%' % (m, solution.metabolites[m].concentration(), cell.metabolites[m].concentration()) 64 | 65 | cell.output('proteins') 66 | 67 | g = drawSVGGraph.Graph() 68 | g.x_axis_label = "Time" 69 | g.y_axis_label = "Concentration" 70 | g.data = data_collection 71 | 72 | #g.outputSVG('tf graph', width=400, height=250) 73 | -------------------------------------------------------------------------------- /solutions.py: -------------------------------------------------------------------------------- 1 | import biochemistry 2 | 3 | default_metabolites = dict([(m, 0.08/2 ** i) for i, m in 
class Solution():
    """A volume of liquid holding metabolites, proteins and cells.

    Attributes:
        volume: volume of the solution.
        DNA: list of DNA strings held by the solution.
        cells: Cell objects created through addCell.
        proteins: protein sequence -> protein object.
        metabolites: chemical name -> biochemistry.Metabolite, one entry
            for every name in biochemistry.CHEMICALS.
    """

    def __init__(self, volume, metabolites='default'):
        """Create the solution and fill it with metabolites.

        Args:
            volume: volume of the solution.
            metabolites: the string 'default' to use the module-level
                default_metabolites, or a dict mapping chemical name to the
                amount per unit volume.
        """
        self.volume = volume
        self.DNA = []
        self.cells = []
        self.proteins = {}

        self.metabolites = dict((m, biochemistry.Metabolite(m, self.volume))
                                for m in biochemistry.CHEMICALS)

        # Explicit conditional: the old `x and a or b` idiom fell through to
        # the string 'default' whenever default_metabolites was empty.
        if metabolites == 'default':
            metabolite_dict = default_metabolites
        else:
            metabolite_dict = metabolites
        self._setMetabolites(metabolite_dict)

        for name, metabolite in self.metabolites.items():
            metabolite.name = "%s(out)" % name

    def _setMetabolites(self, metabolites):
        """Set each named metabolite's amount to value * self.volume."""
        for name, amount in metabolites.items():
            self.metabolites[name].amount = amount * self.volume

    def addCell(self, volume, metabolites='default'):
        """Create a Cell inside this solution, register it and return it."""
        new_cell = Cell(volume, self, metabolites)
        self.cells.append(new_cell)
        return new_cell

    def update(self, ticks=1):
        """Advance every cell in the solution by `ticks` update steps."""
        for _ in range(ticks):
            for cell in self.cells:
                cell.update()

    def output(self, output_type='all'):
        """Print a report.

        Args:
            output_type: 'proteins', 'metabolites', 'cells' or 'all'
                ('all' prints the protein and the metabolite sections).
        """
        # Independent `if`s: with the previous if/elif chain 'all' matched
        # the protein branch first, so metabolites were never printed.
        if output_type in ('proteins', 'all'):
            print("\n-Proteins-")
            for protein in self.proteins.values():
                protein.output()

        if output_type in ('metabolites', 'all'):
            print("\n-Metabolites-")
            for name in sorted(self.metabolites):
                print('%s\t%.4f%%' % (name, self.metabolites[name].concentration()))

        if output_type == 'cells':
            print("%d cells" % len(self.cells))

            for cell in self.cells:
                cell.output('proteins')
# Alphabet used for random insertions and point mutations.
_NUCLEOTIDES = ['A', 'B', 'C', 'D']


def addRandomSequence(seq):
    """Return seq extended by a random tail of nucleotides.

    After each appended nucleotide there is a 1% chance of stopping, so the
    tail has a geometrically distributed length.
    """
    tail = []
    while random.random() < 0.99:
        tail.append(random.choice(_NUCLEOTIDES))
    return seq + ''.join(tail)


def copySequenceWithErrors(template):
    """Return a copy of template containing occasional random mutations.

    For each position, with probability 0.999 the item is copied
    unchanged. Otherwise, with probability 0.75 a random nucleotide is
    written in its place; in the remaining cases copying jumps to a random
    position of the template, which duplicates or deletes a stretch.
    """
    copied = []
    n = 0

    while n < len(template):
        if random.random() < 0.999:
            copied.append(template[n])
        elif random.random() < 0.75:
            # Point mutation: substitute the item at this position.
            copied.append(random.choice(_NUCLEOTIDES))
        else:
            # Jump: carry on copying from just after a random position.
            n = random.randint(0, len(template) - 1)
        n += 1

    return ''.join(copied)


def breedCells(cells):
    """Return mutated daughter genomes bred from a ranked list of cells.

    The first 16 cells receive 12, 8, 8, 8, 4, 4, 4, 4 and then eight times
    1 daughters (60 in total); four further daughters come from parents
    picked at random from the whole list.

    Args:
        cells: objects with a DNA attribute, best first; at least 16.

    Returns:
        A list of 64 daughter DNA sequences.
    """
    # NOTE(review): solutions.Cell stores DNA as a list of strings, while
    # copySequenceWithErrors mutates per item -- confirm which DNA type the
    # callers are expected to hand over.
    offspring = [12, 8, 8, 8, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1]
    daughter_DNA = []

    # Was `len(offpring)`: the misspelt name raised NameError on every call.
    for rank, brood_size in enumerate(offspring):
        for _ in range(brood_size):
            daughter_DNA.append(copySequenceWithErrors(cells[rank].DNA))

    for _ in range(4):
        # Bounded by the list actually passed in rather than by the global
        # NUMBER_OF_CELLS, so the function cannot index past its argument.
        parent = random.randint(0, len(cells) - 1)
        daughter_DNA.append(copySequenceWithErrors(cells[parent].DNA))

    return daughter_DNA
outputFile.write('\n') 45 | 46 | for cell in solution.cells: 47 | outputFile.write('%f\t%s\n' % (cell.metabolites['EH'].amount, cell.DNA[0])) 48 | 49 | print "Generation: %d, Genes: %d, Fitness: %.4f" % (generation, len(solution.cells[0].proteins.keys()), solution.cells[0].metabolites['EH'].amount) 50 | 51 | # Define metabolites in pool 52 | solution_metabolites = virtualCell.default_metabolites 53 | solution_metabolites['FK'] = 0.20 54 | solution_metabolites['IL'] = 0.08 55 | solution_metabolites['FG'] = 0.08 56 | solution_metabolites['JG'] = 0.04 57 | solution_metabolites['EL'] = 0.01 58 | 59 | # Create generation 0 60 | GENERATION_TIME = 48000 61 | NUM_GENERATIONS = 10 62 | NUMBER_OF_CELLS = 64 63 | SOLUTION_VOLUME = 2000000.0 64 | CELL_VOLUME = 1000.0 65 | outputFile = file('110525 genomes.txt','w') 66 | 67 | ancestral_DNA = 'AAAAAD BB AA BBBBBBBA AA BA ACAA DDAAAA' # Transcription factor 68 | ancestral_DNA += 'AADAAD AA ACADAA DDAAAA' # FG pore 69 | ancestral_DNA += 'AADAAD AA ACBDAA DDAAAA' # FK pore 70 | ancestral_DNA += 'AADAAD AA BCADAA DDAAAA' # JG pore 71 | ancestral_DNA += 'AADAAD AA ADAA AB BBAA DDAAAA' # G/I antiporter 72 | ancestral_DNA += 'AADAAD AA BAAA AB BDAA DDAAAA' # H/K antiporter 73 | ancestral_DNA += 'AADAAD AC AC DDAAAA' # FGase 74 | ancestral_DNA += 'AADAAD AC AD DDAAAA' # FKase 75 | ancestral_DNA += 'AADAAD AA ACAA AC AA DDAAAA' # F-driven EHase 76 | ancestral_metabolites = {'E':0.8, 'F':0.4, 'G':0.2, 'H':0.1, 'I':0.05, 'J':0.025, 'K':0.0125, 'L':0.00625} 77 | 78 | daughter_DNA = [addRandomSequence(copySequenceWithErrors(ancestral_DNA)) for n in range(NUMBER_OF_CELLS)] 79 | daughter_metabolites = [] 80 | 81 | for generation in range(NUM_GENERATIONS): 82 | solution = virtualCell.Solution(SOLUTION_VOLUME, solution_metabolites) 83 | 84 | # Create cells 85 | for n in range(NUMBER_OF_CELLS): 86 | cell = solution.addCell(CELL_VOLUME) 87 | cell.addDNA(daughter_DNA[n]) 88 | 89 | for p in cell.proteins.values(): 90 | p.amount = 1 91 | 92 | # Run 
class EvolutionaryRun:
    """Parsed contents of a genome log stored in the 'Genomes' directory.

    The file consists of one section per generation: a header line starting
    with '>', a line of 'name:value, ' metabolite pairs, and then one
    'fitness<TAB>genome' line per cell.

    Attributes:
        generations: number of generation headers found.
        metabolites: one {name: float} dict per generation.
        genomes: one list of genome strings per generation.
        fitnesses: one list of fitness strings (unparsed) per generation.
    """

    def __init__(self, filename):
        """Read and parse Genomes/<filename>."""
        self.generations = 0
        self.metabolites = []
        self.genomes = []
        self.fitnesses = []

        genomes = []
        fitnesses = []

        # `with` closes the handle even if parsing fails part-way.
        with open(os.path.join('Genomes', filename), 'r') as evoFile:
            lines = iter(evoFile)

            for line in lines:
                if line.startswith('>'):
                    self.generations += 1
                    # The metabolite line follows the header directly; a
                    # header at the very end of the file yields {}.
                    self.metabolites.append(self._getMetabolites(next(lines, '')))
                    if genomes:
                        self.genomes.append(genomes)
                        self.fitnesses.append(fitnesses)
                        genomes = []
                        fitnesses = []
                else:
                    # Strip both line-ending characters in one call; the
                    # chained rstrip('\r').rstrip('\n') left the '\r' of a
                    # CRLF ending attached to the genome.
                    fields = line.rstrip('\r\n').split('\t')
                    if len(fields) < 2:
                        continue    # blank or malformed line
                    genomes.append(fields[1])
                    fitnesses.append(fields[0])

        # The last generation has no following header to flush it.
        self.genomes.append(genomes)
        self.fitnesses.append(fitnesses)

    def _getMetabolites(self, metaboliteString):
        """Parse 'E:0.5, F:0.25, \\n' into {'E': 0.5, 'F': 0.25}.

        The text after the final ', ' separator (normally just the line
        ending) is discarded.
        """
        metaboliteDictionary = {}

        for metabolite in metaboliteString.split(', ')[:-1]:
            m = metabolite.split(':')
            metaboliteDictionary[m[0]] = float(m[1])

        return metaboliteDictionary
def interpretGene(sequence):
    """Describe the reaction a gene encodes, e.g. 'FG out -> FG in'.

    The sequence is read two nucleotides (one codon) at a time; a trailing
    unpaired nucleotide is ignored. A codon found in
    biochemistry.codon_to_function selects a function: 'r...' adds the
    translation step immediately, while 't...' (transport) and 'e...'
    (enzyme) are completed by the following codon.

    Args:
        sequence: gene sequence as a string of nucleotides.

    Returns:
        'substrate + ... -> product + ...', or '' when no function was
        decoded.
    """
    # NOTE(review): the branch nesting below was reconstructed from a copy of
    # the file whose indentation had been lost -- confirm against the
    # original, in particular where enz_func is reset in the 'e' branch.
    substrates = []
    products = []
    enz_func = None

    for n in range(1, len(sequence), 2):
        codon = sequence[n - 1] + sequence[n]

        if enz_func is None:
            if codon in biochemistry.codon_to_function:
                enz_func = biochemistry.codon_to_function[codon]

                if enz_func[0] == 'r':
                    substrates.append('JG')
                    products.append('new protein')
                    enz_func = None

        elif enz_func[0] == 't':
            m = biochemistry.codon_to_metabolite[codon]
            if enz_func[1] == 'f':
                substrates.append('%s out' % m)
                products.append('%s in' % m)
            else:
                substrates.append('%s in' % m)
                products.append('%s out' % m)
            enz_func = None

        elif enz_func[0] == 'e':
            rxn = biochemistry.all_reactions.get(codon, None)

            if rxn:
                if enz_func[1] == 'f':
                    substrates.extend(rxn.substrates)
                    products.extend(rxn.products)
                else:
                    # Reverse direction: swap the two sides.
                    substrates.extend(rxn.products)
                    products.extend(rxn.substrates)
            enz_func = None

    if not substrates:
        return ''

    # join() copes with an empty product list, where indexing products[-1]
    # would raise IndexError.
    return '%s -> %s' % (' + '.join('%s' % s for s in substrates),
                         ' + '.join('%s' % p for p in products))
'CABCAA-CABCAA-BACC-BACD-BADD-BAADBBBC-BAACBBBA-BCAC-BCAD-BAABBCAA-BCAABDBB'.replace('-', 'DDAA') 133 | 134 | e = EvolutionaryRun('Gen 14 genomes.txt') 135 | g = Genome(ancestral_DNA) 136 | g = Genome(e.genomes[0][0]) 137 | g.findProteins() 138 | g.outputProteins() 139 | 140 | -------------------------------------------------------------------------------- /graphDrawer.py: -------------------------------------------------------------------------------- 1 | class Graph(): 2 | def __init__ (self): 3 | self.series = {} 4 | self.variables = {} 5 | self.border = (50, 5, 60, 45) 6 | self.scaleX = 1.0 7 | self.scaleY = 1.0 8 | self.colours = ['#0060e5', '#001060', '#e52060', '#a00030', '#00c020', '#006010' ] 9 | 10 | self.X_axis = Axis(400) 11 | self.Y_axis = Axis(300) 12 | 13 | def addSeries(self, name): 14 | n = len(self.series.keys()) 15 | newSeries = DataSeries(name, n) 16 | self.series[name] = newSeries 17 | 18 | def addDataToSeries(self, name, data): 19 | self.series[name].data.append(data) 20 | 21 | def outputSeries(self, filename, series, X_range=None, Y_range=None): 22 | self.initiliseSVG('Graphs/'+filename) 23 | 24 | X_values = [] 25 | Y_values = [] 26 | 27 | for s in series: 28 | X_values.append(len(self.series[s].data)) 29 | Y_values.append(max(self.series[s].data)) 30 | 31 | if X_range == None: 32 | self.X_axis.range = (0, max(X_values)) 33 | else: 34 | self.X_axis.range = (X_range[0], X_range[1]) 35 | 36 | if Y_range == None: 37 | self.Y_axis.range = (0, max(Y_values)) 38 | else: 39 | self.Y_axis.range = (Y_range[0], Y_range[1]) 40 | 41 | self.scaleX = 1.0 * self.X_axis.length / (self.X_axis.range[1] - self.X_axis.range[0]) 42 | self.scaleY = 1.0 * self.Y_axis.length / (self.Y_axis.range[1] - self.Y_axis.range[0]) 43 | 44 | self.X_axis.drawX(self.svg, self.border[0], self.Y_axis.length + self.border[1], self.scaleX) 45 | self.Y_axis.drawY(self.svg, self.border[0], self.Y_axis.length + self.border[1], self.scaleY) 46 | 47 | for n in range(len(series)): 48 | 
self.drawPlot(self.series[series[n]], self.colours[n]) 49 | 50 | self.drawLabels(series) 51 | self.svg.write('') 52 | 53 | def initiliseSVG(self, name): 54 | width = self.X_axis.length + self.border[0] + self.border[2] 55 | height = self.Y_axis.length + self.border[1] + self.border[3] 56 | 57 | self.svg = open(name + '.svg', 'w') 58 | self.svg.write('\n') 59 | self.svg.write("""""") 60 | self.svg.write('\n 63 | 101 | """) 102 | 103 | def drawPlot(self, series, colour): 104 | # Draw the data as a line 105 | 106 | self.svg.write(' self.X_axis.range[0]: 116 | self.svg.write('L%d, %.3f ' % (x, y)) 117 | else: 118 | self.svg.write('d="M%d, %.3f ' % (x, y)) 119 | 120 | x += self.scaleX 121 | 122 | self.svg.write('">\n') 123 | self.svg.write('\n' % (series.number, series.number)) 125 | self.svg.write('\n') 126 | 127 | def drawLabels(self, series): 128 | x = self.border[0] + self.X_axis.length + 10 129 | y = self.border[1] + 75 130 | 131 | for s in series: 132 | self.svg.write('%s\n' % (self.series[s].colour, self.series[s].label)) 134 | y += self.Y_axis.length / len(self.series) 135 | 136 | def writeFinalValue(self, series): 137 | pass 138 | 139 | class DataSeries(): 140 | def __init__(self, label, number): 141 | self.colour = '#000000' 142 | self.number = number 143 | self.label = label 144 | self.data = [] 145 | 146 | class Axis(): 147 | def __init__(self, length): 148 | self.length = length 149 | self.range = (0, 1) 150 | self.tick_interval = 0.2 151 | self.tick_number = 5 152 | self.label = 'Generation' 153 | 154 | def drawX(self, svg, x, y, dx): 155 | if self.range[1] > 4: 156 | self.tick_interval = (self.range[1] - self.range[0]) / self.tick_number 157 | else: 158 | self.tick_interval = (self.range[1] - self.range[0]) / self.tick_number 159 | 160 | if self.tick_interval * dx == 0: return 161 | 162 | svg.write(' \n' % (x, y, x+self.length, y)) 163 | 164 | labelX = x+self.length/2 - 5*len(self.label) 165 | svg.write(' %s\n' % (labelX, y+40, self.label)) 166 | 167 | 
label = self.range[0] 168 | while label <= self.range[1]: 169 | svg.write(' \n' % (x, y-0.5, x, y+8)) 170 | labelX = x - len(str(label))*4 171 | svg.write(' %s\n' % (labelX, y+20, label)) 172 | 173 | x += self.tick_interval * dx 174 | label += self.tick_interval 175 | 176 | def drawY(self, svg, x, y, dy): 177 | if self.range[1] > 4: 178 | self.tick_interval = int(self.range[1] - self.range[0]) / self.tick_number 179 | else: 180 | self.tick_interval = (self.range[1] - self.range[0]) / self.tick_number 181 | 182 | if self.tick_interval * dy == 0: return 183 | 184 | svg.write(' \n' % (x, y, x, y-self.length)) 185 | 186 | label = self.range[0] 187 | while label <= self.range[1]: 188 | svg.write(' \n' % (x-8, y, x+0.5, y)) 189 | labelX = x - len("%.4f" % label)*7 190 | svg.write(' %.03f\n' % (labelX, y+4, label)) 191 | 192 | y -= self.tick_interval * dy 193 | label += self.tick_interval 194 | 195 | -------------------------------------------------------------------------------- /biochemistry.py: -------------------------------------------------------------------------------- 1 | from chemistry import defineMetabolitesAndReactions 2 | 3 | # --- Initialise chemistry --- 4 | CHEMICALS, REACTIONS = defineMetabolitesAndReactions() 5 | default_metabolites = dict([(m, 0.08/2 ** i) for i, m in enumerate(CHEMICALS[:8])]) 6 | 7 | # Map codons to enzyme functions 8 | NUCLEOTIDES = ['A', 'B', 'C', 'D'] 9 | CODONS = [a+b for a in NUCLEOTIDES for b in NUCLEOTIDES] 10 | 11 | # --- Define mapping from amino acid sequence to protein function --- 12 | AMINO_ACID_CODE = 'LMNOPQRSTUVWXYZ' 13 | TRANSLATE = dict(zip(CODONS, AMINO_ACID_CODE)) 14 | AA_TO_CHEMICAL = dict(zip([a+b for a in AMINO_ACID_CODE[1:] for b in AMINO_ACID_CODE[1:]], CHEMICALS)) 15 | AA_TO_REACTION = dict(zip([a+b for a in AMINO_ACID_CODE[1:] for b in AMINO_ACID_CODE[1:]], REACTIONS)) 16 | 17 | enzyme_functions = 'tf,tr,ef,er,ribosome,binding'.split(',') 18 | aa_to_function = dict(zip(AMINO_ACID_CODE, enzyme_functions)) 19 
def Translate(DNA):
    """ Converts a DNA sequence (nucleotides A, B, C, D) into a peptide
        sequence (amino acids L-Z), stopping at the first 'DD' codon """

    residues = []

    # Step through whole codons only; an unpaired final base is dropped
    for start in range(0, len(DNA) - 1, 2):
        codon = DNA[start:start + 2]
        if codon == 'DD':
            break
        residues.append(TRANSLATE[codon])

    return ''.join(residues)

class Metabolite:
    """ An amount of a named chemical held in a given volume """

    def __init__(self, name, volume=100.0):
        self.name = name
        self.volume = volume
        self.amount = 0.0

    def concentration(self):
        """ Amount as a percentage of the volume """
        return 100.0 * self.amount / self.volume

class BindingDomain:
    """ A four-residue protein domain that recognises gene promoters """

    def __init__(self, sequence):
        self.sequence = sequence
        # gene -> [binding strength, amount bound]
        self.targets = {}

    def findPromoterStrengths(self, genes):
        """ Records every gene whose promoter is bound by this domain.
            Residues 1 and 3 are scored with couplets1 and residues 2 and 4
            with couplets2, against overlapping promoter couplets; a gene
            is kept only if all three neighbouring sums are positive """

        first, second, third, fourth = [amino_acids[self.sequence[k]] for k in range(4)]

        for gene in genes:
            promoter = gene.promoter
            i1 = first.couplets1[promoter[0:2]]
            i2 = second.couplets2[promoter[1:3]]
            i3 = third.couplets1[promoter[3:5]]
            i4 = fourth.couplets2[promoter[4:6]]
            c1 = i1 + i2
            c2 = i2 + i3
            c3 = i3 + i4

            if c1 <= 0 or c2 <= 0 or c3 <= 0:
                continue
            self.targets[gene] = [c1 * c2 * c3, 0.0]

class AminoAcid:
    """ Interaction scores of one amino acid with nucleotides and couplets """

    def __init__(self, interactions):
        self.interactions = {}
        self.couplets1 = {}
        self.couplets2 = {}

        # One integer score per nucleotide, in NUCLEOTIDES order
        for index, nucleotide in enumerate(NUCLEOTIDES):
            self.interactions[nucleotide] = int(interactions[index])

        # Couplet scores are weighted sums of the two single-base scores
        for nt1 in NUCLEOTIDES:
            for nt2 in NUCLEOTIDES:
                score1 = self.interactions[nt1]
                score2 = self.interactions[nt2]
                self.couplets1[nt1+nt2] = 0.7 * score1 + 0.3 * score2
                self.couplets2[nt1+nt2] = 0.4 * score1 + 0.6 * score2

class Gene:
    """ A six-base promoter followed by a protein-coding region """

    def __init__(self, sequence):
        promoter, coding_region = sequence[:6], sequence[6:]
        self.promoter = promoter
        self.protein_code = Translate(coding_region)
        self.occupancy = 0
sequence, solution): 80 | self.sequence = sequence 81 | self.length = len(sequence) 82 | self.solution = solution 83 | 84 | self.degradation_rate = 0.000004 85 | self.amount = 0.0 86 | self.amount_bound = 0.0 87 | 88 | self.f_rate = 1.0 89 | self.r_rate = 1.0 90 | self.net_rxn = 0 91 | 92 | self.functions = [self.degrade] 93 | self.binding_domains = [] 94 | self.substrates = [] 95 | self.products = [] 96 | 97 | self._interpretSequence() 98 | 99 | def _interpretSequence(self): 100 | ribosome = False 101 | catalytic = False 102 | domain = None 103 | binding_seq = '' 104 | metabolite = '' 105 | 106 | for aa in self.sequence: 107 | # Find enzyme function 108 | if domain == None: 109 | domain = aa_to_function.get(aa) 110 | #print 'Found domain ', domain 111 | 112 | if domain == 'ribosome': 113 | ribosome = True 114 | domain = None 115 | self.r_rate *= 0.25 116 | self.substrates.append(self.solution.metabolites['JG']) 117 | 118 | # Transporters 119 | elif domain.startswith('t'): 120 | if aa == 'L': 121 | if metabolite in CHEMICALS: 122 | catalytic = True 123 | if domain[1] == 'f': 124 | self.setMetabolites([metabolite], [metabolite], self.solution.solution) 125 | else: 126 | self.setMetabolites([metabolite], [metabolite], self.solution, self.solution.solution) 127 | domain = None 128 | direction = None 129 | metabolite = '' 130 | else: 131 | metabolite += AA_TO_CHEMICAL.get(aa, '') 132 | 133 | # Enzymes 134 | elif domain.startswith('e'): 135 | if aa in AA_TO_REACTION.keys(): 136 | catalytic = True 137 | r = AA_TO_REACTION[aa] 138 | 139 | if domain[1] == 'f': 140 | self.setMetabolites(r.substrates, r.products) 141 | self.f_rate *= r.k1 142 | self.r_rate *= r.k2 143 | else: 144 | self.setMetabolites(r.products, r.substrates) 145 | self.f_rate *= r.k2 146 | self.r_rate *= r.k1 147 | domain = None 148 | 149 | # Binding Proteins 150 | elif domain == 'binding': 151 | if aa == 'L': 152 | domain = 'binding sequence' 153 | binding_seq = '' 154 | 155 | elif domain == 'binding 
sequence': 156 | if aa == 'L': 157 | if binding_seq and len(binding_seq) >=4 : 158 | self.binding_domains.append(BindingDomain(binding_seq)) 159 | domain = None 160 | else: 161 | binding_seq += aa 162 | 163 | if self.binding_domains: 164 | self.functions.append(self.bind) 165 | if ribosome: 166 | self.functions.extend([self.find_reaction_rate, self.translate]) 167 | elif catalytic: 168 | self.functions.extend([self.find_reaction_rate, self.catalyse]) 169 | 170 | for domain in self.binding_domains: 171 | domain.findPromoterStrengths(self.solution.genes) 172 | 173 | def setMetabolites(self, substrates, products, sol1=None, sol2=None): 174 | if sol1 == None: sol1 = self.solution 175 | if sol2 == None: sol2 = self.solution 176 | 177 | for s in substrates: 178 | self.substrates.append(sol1.metabolites[s]) 179 | 180 | for p in products: 181 | self.products.append(sol2.metabolites[p]) 182 | 183 | def output(self): 184 | print "Sequence: %s" % self.sequence 185 | print "Amount: %s" % self.amount 186 | 187 | if self.substrates: self._outputReaction() 188 | if self.binding_domains: self._outputBindingProperties() 189 | print 190 | 191 | def _outputReaction(self): 192 | print "Catalyses:" 193 | print " %s -> %s" % (' + '.join([s.name for s in self.substrates]), ' + '.join([p.name for p in self.products])) 194 | #print "\t%f" % self.net_rxn 195 | 196 | def _outputBindingProperties(self): 197 | for site in self.binding_domains: 198 | for gene, (strength, tmp) in site.targets.items(): 199 | print "Binds sequence %s with strength %0.2f" % (gene.promoter, strength) 200 | 201 | def degrade(self): 202 | degradation = self.amount * self.degradation_rate 203 | self.amount -= degradation 204 | self.solution.metabolites['JG'].amount += degradation 205 | 206 | def find_reaction_rate(self): 207 | substrate_bound = self.f_rate 208 | product_bound = self.r_rate 209 | 210 | for s in self.substrates: 211 | substrate_bound *= s.amount / s.volume 212 | for p in self.products: 213 | 
product_bound *= p.amount / p.volume 214 | 215 | self.net_rxn = substrate_bound - product_bound 216 | 217 | def catalyse(self): 218 | self.net_rxn *= self.amount 219 | for s in self.substrates: 220 | s.amount -= self.net_rxn 221 | for p in self.products: 222 | p.amount += self.net_rxn 223 | 224 | def bind(self): 225 | free_protein = self.amount - self.amount_bound 226 | 227 | for domain in self.binding_domains: 228 | for gene, (strength, amount_bound) in domain.targets.items(): 229 | association = ((1.0-gene.occupancy)/len(self.solution.genes))*free_protein/(free_protein*len(self.binding_domains)+1.0) 230 | association -= (amount_bound+association) * 1.0 / (strength + 1.0) 231 | gene.occupancy += association 232 | domain.targets[gene][1] += association 233 | self.amount_bound += association 234 | 235 | #print "amount bound", self.amount_bound 236 | 237 | def translate(self): 238 | if self.net_rxn > 0 and self.amount_bound > 0: 239 | for domain in self.binding_domains: 240 | for gene, (strength, amount_bound) in domain.targets.items(): 241 | self.solution.addProtein(gene.protein_code, self.net_rxn *amount_bound/len(gene.protein_code)) 242 | 243 | self.net_rxn *= self.amount_bound 244 | for s in self.substrates: 245 | s.amount -= self.net_rxn 246 | for p in self.products: 247 | p.amount += self.net_rxn 248 | 249 | def update(self): 250 | for function in self.functions: 251 | function() 252 | 253 | # Define amino acids for their interactions 254 | 255 | amino_acids = {} 256 | for line in open('aminoAcids.txt'): 257 | data = line.rstrip('\n').split('\t') 258 | interactions = data[1].split(',') 259 | amino_acids[data[0]] = AminoAcid(interactions) -------------------------------------------------------------------------------- /analyseGenomes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import graphDrawer 3 | import biochemistry_old 4 | import numpy as np 5 | from collections import Counter 6 | 7 | class Genome(): 
8 | def __init__ (self, seq, fitness): 9 | self.seq = seq 10 | self.fitness = fitness 11 | self.genes = seq.split('DDAA') 12 | self.proteins = {} 13 | self.colour = None 14 | 15 | def findProteins(self): 16 | for g in self.genes: 17 | if len(g) > 3: 18 | protein = interpretGene(g) 19 | 20 | if protein: 21 | if protein in self.proteins: 22 | self.proteins[protein] += 1 23 | else: 24 | self.proteins[protein] = 1 25 | 26 | def outputProteins(self): 27 | proteins = self.proteins.keys() 28 | proteins.sort() 29 | 30 | for p in proteins: 31 | print self.proteins[p], p 32 | 33 | def interpretGene(sequence): 34 | substrates = [] 35 | products = [] 36 | enz_func = None 37 | 38 | n = 1 39 | while n < len(sequence): 40 | codon = sequence[n-1] + sequence[n] 41 | 42 | if enz_func == None: 43 | if codon == 'BA': 44 | enz_func = 'tf' 45 | elif codon == 'BB': 46 | enz_func = 'tr' 47 | if codon == 'BC': 48 | enz_func = 'ef' 49 | elif codon == 'BD': 50 | enz_func = 'er' 51 | 52 | elif enz_func[0] == 't': 53 | m = biochemistry_old.codon_to_metabolite[codon] 54 | if enz_func[1] == 'f': 55 | substrates.append('%s out' % m) 56 | products.append('%s in' % m) 57 | else: 58 | substrates.append('%s in' % m) 59 | products.append('%s out' % m) 60 | enz_func = None 61 | 62 | elif enz_func[0] == 'e': 63 | if codon in biochemistry_old.all_reactions.keys(): 64 | r = biochemistry_old.all_reactions[codon] 65 | 66 | if enz_func[1] == 'f': 67 | for s in r.substrates: 68 | substrates.append(s) 69 | for p in r.products: 70 | products.append(p) 71 | else: 72 | for s in r.substrates: 73 | products.append(s) 74 | for p in r.products: 75 | substrates.append(p) 76 | enz_func = None 77 | 78 | n += 2 79 | 80 | protein = '' 81 | 82 | # Treat equivalent proteins equally 83 | substrates.sort() 84 | products.sort() 85 | if substrates > products: 86 | substrates, products = products, substrates 87 | 88 | if len(substrates) > 0: 89 | for s in substrates[:-1]: 90 | protein += '%s + ' % s 91 | protein += '%s -> ' % 
substrates[-1] 92 | 93 | for p in products[:-1]: 94 | protein += '%s + ' % p 95 | protein += '%s' % products[-1] 96 | 97 | return protein 98 | 99 | def compareProteomes(genome1, genome2): 100 | p1 = genome1.proteins 101 | p2 = genome2.proteins 102 | differences = [0] 103 | 104 | for p in p1.keys(): 105 | if p in p2: 106 | if p1[p] > p2[p]: 107 | differences.append('Lost %d copies of %s' % (p1[p]-p2[p], p)) 108 | differences[0] += p1[p]-p2[p] 109 | if p1[p] < p2[p]: 110 | differences.append('Gained %d copies of %s' % (p2[p]-p1[p], p)) 111 | differences[0] += p2[p]-p1[p] 112 | else: 113 | differences.append('Lost %d copies of %s' % (p1[p], p)) 114 | differences[0] += p1[p] 115 | 116 | for p in p2.keys(): 117 | if p not in p1: 118 | differences.append('New protein %s' % p) 119 | differences[0] += p2[p] 120 | 121 | return differences 122 | 123 | def outputGenomeDifferences(genomes): 124 | for n in range(1, len(genomes)): 125 | if genomes[n-1].seq != genomes[n].seq: 126 | differences = compareProteomes(genomes[n-1], genomes[n]) 127 | 128 | if len(differences) > 0: 129 | print '>Generation %d, %f' % (n, genomes[n].fitness) 130 | print ' %d nts, %d genes' % (len(genomes[n].seq), len(genomes[n].genes)) 131 | 132 | for d in differences: 133 | print ' ', d 134 | 135 | def graphNumberOfProteinPerGeneration(proteins_per_generation): 136 | from drawSVGGraph import Graph 137 | 138 | num_proteins_per_generation = [len(proteins)for proteins in proteins_per_generation] 139 | 140 | g = Graph() 141 | g.addData({'proteins': num_proteins_per_generation}) 142 | g.x_gridlines = False 143 | g.plot() 144 | g.write('test_graph') 145 | 146 | def findFirstGenerationForProteins(proteomes_per_generation): 147 | """ Iteration through list of sets of proteins 148 | to find the first generation in which each protein appeared. 
""" 149 | 150 | first_generation_for_protein = {} 151 | 152 | for generation, proteome in enumerate(proteomes_per_generation): 153 | for protein in proteome: 154 | if protein not in first_generation_for_protein: 155 | first_generation_for_protein[protein] = generation 156 | 157 | return first_generation_for_protein 158 | 159 | def createImageOfProteinsPerGeneration(proteins_per_generation, filtered_proteins): 160 | 161 | import numpy as np 162 | width = len(proteins_per_generation) 163 | height = len(filtered_proteins) 164 | picture = np.zeros((width, height), 'uint64') 165 | 166 | for generation in range(len(proteins_per_generation)): 167 | for y, protein in enumerate(filtered_proteins): 168 | if protein in proteins_per_generation[generation]: 169 | picture[generation, y] = 255 170 | 171 | from imageIO import saveSurface 172 | saveSurface('test.png', picture) 173 | 174 | def filterGenomeByFitnessPosition(filename, position): 175 | genomeFile = file(filename, 'r') 176 | genomes = [] 177 | 178 | for line in genomeFile: 179 | metabolites = line 180 | 181 | for n in range(generation): 182 | genomeFile.next() 183 | 184 | data = genomeFile.next() 185 | temp = data.rstrip('\r\n').split('\t') 186 | 187 | g = Genome(temp[0], float(temp[1])) 188 | g.findProteins() 189 | genomes.append(g) 190 | 191 | for n in range(127 - generation): 192 | genomeFile.next() 193 | 194 | return genomes 195 | 196 | def colourByDistance(genomes): 197 | distance_matrix = {} 198 | 199 | for i in range(len(genomes)): 200 | for j in range(i+1, len(genomes)): 201 | d = compareProteomes(genomes[i], genomes[j]) 202 | distance_matrix[(i, j)] = d[0] 203 | distance_matrix[(j, i)] = d[0] 204 | 205 | max_distance = max(distance_matrix.values()) 206 | 207 | for i in distance_matrix.keys(): 208 | if distance_matrix[i] == max_distance: 209 | (g1, g2) = i 210 | break 211 | 212 | for n in range(len(genomes)): 213 | if n != g1: 214 | d1 = int(255 * distance_matrix[(n, g1)] / max_distance) 215 | else: 216 | d1 = 0 
217 | 218 | if n != g2: 219 | d2 = int(255 * distance_matrix[(n, g2)] / max_distance) 220 | else: 221 | d2 = 0 222 | 223 | genomes[n].colour = (0, d1, d2) 224 | 225 | return genomes 226 | 227 | def createProteinDistanceMatrix(genomes): 228 | total_distance = 0 229 | 230 | for i in range(len(genomes)): 231 | for j in range(i+1, len(genomes)): 232 | d = compareProteomes(genomes[i], genomes[j]) 233 | total_distance += d[0] 234 | 235 | print total_distance 236 | 237 | def PlotPop(genomes): 238 | genomes = [] 239 | for g in range(16): 240 | g = readGenomes2(genomeFile, g) 241 | genomes.extend(g[:16]) 242 | genomes = colourByDistance(genomes) 243 | 244 | pd = drawPopulation.PopulationDiagram(genomes) 245 | pd.plotPopulation() 246 | pd.outputPlot('test circles') 247 | 248 | def getMetabolites(filename): 249 | """ Open file and extract metabolite concentrations for each generation. 250 | Return a list of dictionaries: list[generation][metabolite] = amount. """ 251 | 252 | genomeFile = file(filename, 'r') 253 | metabolites = [] 254 | 255 | for line in genomeFile: 256 | metaboliteDictionary = {} 257 | 258 | for n in line.split(', ')[:-1]: 259 | m = n.split(':') 260 | metaboliteDictionary[m[0]] = float(m[1]) 261 | metabolites.append(metaboliteDictionary) 262 | 263 | for n in range(128): 264 | genomeFile.next() 265 | return metabolites 266 | 267 | def plotMetabolites(): 268 | # Get metabolites from file of just metabolites 269 | filename = os.path.join("Genomes", "Gen 1920 metabolites.txt") 270 | metabolites_per_generation = [] 271 | 272 | with open(filename, 'r') as fin: 273 | for line in fin: 274 | d = {} 275 | for metabolites in line.split(', '): 276 | m, v = metabolites.split(':') 277 | d[m] = float(v) 278 | metabolites_per_generation.append(d) 279 | 280 | # Only look at the first 1000 generations 281 | metabolites_per_generation = metabolites_per_generation[:1000] 282 | 283 | def getMetabolite(m): 284 | """ Get list of concentrations over the generations for a given 
metabolite. 285 | Return as a percentage of the initial concentration and bin into bins of 5. """ 286 | 287 | initial_concentration = metabolites_per_generation[0][m] 288 | return [1000 * (generation[m] - initial_concentration) for generation in metabolites_per_generation] 289 | 290 | 291 | metabolites_of_interest = ['K', 'F', 'G'] 292 | metabolites_of_interest = ['E', 'H', 'I'] 293 | metabolites_of_interest = ['IL', 'FG', 'FK'] 294 | metabolites_of_interest = ['E', 'H', 'I', 'IL', 'K', 'F', 'G', 'FG', 'FK'] 295 | metabolite_data = {metabolite: getMetabolite(metabolite) for metabolite in metabolites_of_interest} 296 | for m in metabolites_of_interest: 297 | print m, metabolite_data[m][-1] 298 | 299 | from drawSVGGraph import Graph 300 | 301 | 302 | g = Graph({'height': 300, 'width': 450}) 303 | g.addData(metabolite_data) 304 | 305 | g.x_gridlines = False 306 | g.colours = ['#3C9DD0', '#034569', '#0C0874'] 307 | g.colours = ['#111'] * 9 308 | g.addStyle('.gridlines', {'opacity':0.2}) 309 | g.x_axis_label = "Generations" 310 | g.y_axis_label = "Change in concentration" 311 | #g.min_y = 0 312 | 313 | g.plot() 314 | g.write('Conc of E over generations') 315 | 316 | def getGenomeArray(filename, start_organism=0, end_organism=128, start_generation=0, end_generation=None): 317 | """ Returns a 2D array of genomes in which each row contains a list of genomes for a generation 318 | and each column represents an organism of a given fitness. 
319 | 320 | genome_array[10][2] = Genome object for the third fittest organism in generation 10 321 | """ 322 | 323 | genome_array = [] 324 | genomes = [] 325 | generation = 0 326 | 327 | with open(filename, 'r') as f: 328 | for line in f: 329 | if line.startswith('EL'): 330 | generation += 1 331 | 332 | if genomes: 333 | genome_array.append(genomes) 334 | 335 | genomes = [] 336 | organism = 0 337 | 338 | if generation < start_generation: 339 | continue # should actually keep continuing until next EL 340 | elif end_generation and generation > end_generation: 341 | return genome_array 342 | else: 343 | if start_organism <= organism < end_organism: 344 | temp = line.rstrip().split('\t') 345 | genome = Genome(temp[0], temp[1]) 346 | genome.findProteins() 347 | genomes.append(genome) 348 | organism += 1 349 | 350 | if genomes: 351 | genome_array.append(genomes) 352 | 353 | return genome_array 354 | 355 | def getProteinCounts(genome_array): 356 | """ Given a 2D genome array, return a hash of all the proteins and 357 | the number of times they occur in the population. """ 358 | 359 | protein_counts = Counter() 360 | 361 | for genomes in genome_array: 362 | for genome in genomes: 363 | protein_counts.update(genome.proteins) 364 | 365 | return protein_counts 366 | 367 | def getProteinCountsPerGenome(genome_array, proteins): 368 | """ Given a 2D genome_array and a list of proteins, 369 | return a 2D array of counts for each protein for each organism. 370 | 371 | protein_counts[1][3] = counts for protein 4 in organism 2 372 | """ 373 | 374 | protein_counts = np.zeros((0, len(proteins))) 375 | 376 | for genomes in genome_array: 377 | for genome in genomes: 378 | counts = np.array([genome.proteins.get(protein, 0) for protein in proteins]) 379 | protein_counts = np.vstack((protein_counts, counts)) 380 | 381 | return protein_counts 382 | 383 | def PCA(X): 384 | """ Do principle component analysis on matrix X. 
""" 385 | 386 | m, n = X.shape 387 | sigma = X.T.dot(X) 388 | U, S, V = np.linalg.svd(sigma) 389 | 390 | return U, S, V 391 | 392 | def displayOrganisms(simple_organisms, n_organisms, n_generations): 393 | picture = np.zeros((n_generations, n_organisms, 3), 'uint8') 394 | 395 | # Normalise values into the range 0-255 396 | min_values = np.min(simple_organisms, axis=0) 397 | max_values = np.max(simple_organisms, axis=0) 398 | scale = 255 / (max_values - min_values) 399 | 400 | 401 | for i in range(len(min_values)): 402 | simple_organisms[:,i] -= min_values[i] 403 | simple_organisms[:,i] *= scale[i] 404 | 405 | #Add row of zeros for red colour 406 | zeros = np.zeros((len(simple_organisms), 1)) 407 | colours = np.hstack((zeros, simple_organisms)) 408 | 409 | organism = 0 410 | for x in range(n_generations): 411 | for y in range(n_organisms): 412 | 413 | picture[x, y, :] = colours[organism, :] 414 | organism += 1 415 | 416 | from imageIO import saveSurface 417 | saveSurface('evolution_pca.png', picture) 418 | 419 | def plotEvolutionImage(genome_file, generations, organisms, protein_threshold): 420 | """ Convert each organism in a given set of generations and of a certain fitness into a colour 421 | by finding all the proteins that occur more than a certain threshold. 422 | Then convert each organism into a vector of counts for each protein. 423 | Use PCA to reduce each organism vector into a vector of two components. 424 | Then display each organism as a pixel, showing the top organisms over time. 425 | """ 426 | 427 | genome_array = getGenomeArray(genome_file, end_organism=organisms, end_generation=generations) 428 | protein_counts = getProteinCounts(genome_array) 429 | 430 | filtered_proteins = [protein for protein in protein_counts.iterkeys() if protein_counts[protein] > protein_threshold] 431 | filtered_proteins.sort(key=lambda x: protein_counts[x], reverse=True) 432 | print "Found %s proteins occuring above the threshold." 
% len(filtered_proteins) 433 | 434 | proteins_per_organims = getProteinCountsPerGenome(genome_array, filtered_proteins) 435 | U, S, V = PCA(proteins_per_organims) 436 | principle_components = (U[:,:2]) 437 | simple_organism = proteins_per_organims.dot(principle_components) 438 | displayOrganisms(simple_organism, organisms, generations) 439 | 440 | def plotFitness(fitnesses): 441 | 442 | from drawSVGGraph import Graph 443 | 444 | g = Graph({'height': 200, 'width': 500}) 445 | g.addData({'max': [f[0] for f in fitnesses], 'median': [f[63] for f in fitnesses]}) 446 | 447 | g.x_gridlines = False 448 | g.colours = ['#3C9DD0', '#034569', '#0C0874'] 449 | g.addStyle('.gridlines', {'opacity':0.2}) 450 | g.x_axis_label = "Generations" 451 | g.y_axis_label = "Concentration of IH" 452 | g.div_x = 480 453 | 454 | g.plot() 455 | g.write('Fitness over generations') 456 | 457 | if __name__ == '__main__': 458 | genome_file = os.path.join("Genomes", "Gen 1920 genomes.txt") 459 | 460 | genome_array = getGenomeArray(genome_file, end_organism=64, end_generation=200) 461 | protein_counts = getProteinCounts(genome_array) 462 | filtered_proteins = [protein for protein in protein_counts.iterkeys() if protein_counts[protein] > 10] 463 | filtered_proteins.sort(key=lambda x: protein_counts[x], reverse=True) 464 | print len(protein_counts) 465 | print len(filtered_proteins) 466 | print filtered_proteins[0], protein_counts[filtered_proteins[0]] 467 | 468 | #plotEvolutionImage(genome_file, 200, 64, 10) 469 | 470 | # Recover from projection 471 | # recovered_organisms = simple_organism.dot(principle_components.T) 472 | 473 | #plotMetabolites() 474 | 475 | #fitnesses = getFitnessPerGeneration(genomeFile) 476 | #plotFitness(fitnesses) 477 | 478 | #with open('results.txt', 'w') as fout: 479 | # for name, count in sorted(proteins.iteritems(), key=lambda x: x[1], reverse=True)[:100]: 480 | # fout.write("%s\t%d\n" % (name, count)) 481 | 482 | #genomes = readGenomes(genomeFile, 1920) 483 | #for protein, 
amount in genomes[0].proteins.iteritems(): 484 | # print protein, amount 485 | 486 | #proteins_per_generation, protein_counts = getProteinCounts(genomeFile, endOrganism=64, max_generations=1900) 487 | #first_appearance = findFirstGenerationForProteins(proteins_per_generation) 488 | #graphNumberOfProteinPerGeneration(proteins_per_generation) 489 | 490 | #createImageOfProteinsPerGeneration(proteins_per_generation, filtered_proteins) 491 | 492 | --------------------------------------------------------------------------------