def load_file(filename, attributes):
    """
    Load a whitespace-delimited text file into a new relation.

    filename: path of the file to read. Every non-blank line becomes one
        tuple whose values are the whitespace-separated fields of that line
        (all values are therefore strings).
    attributes: the attribute names for the relation (e.g. a Python tuple),
        passed straight through to Rel.

    Returns the populated Rel.
    """

    # Imported at function scope so this block carries its own dependency on
    # the project-local rel module.
    from rel import Rel

    r = Rel(attributes)

    # Open the file the caller asked for (the path used to be hard-coded to
    # "ccat.txt", ignoring ``filename``) and make sure the handle is closed
    # even if a line fails to load.
    with open(filename) as source:
        for line in source:
            fields = tuple(line.split())
            # A blank line would otherwise add an empty tuple, which has no
            # value for any attribute.
            if fields:
                r.add_tuple(fields)

    return r
from collections import defaultdict
from itertools import product


def cartesian_product(sets, done=()):
    """
    Generate the cartesian product of a sequence of iterables.

    sets: a sequence of iterables, one per position of the result.
    done: a tuple prefix prepended to every generated tuple.

    Yields one tuple per combination, with the leftmost iterable varying
    slowest. An empty ``sets`` yields ``done`` exactly once.
    """
    for combination in product(*sets):
        yield done + combination


def non_contig_slice(seq, indices):
    """
    Return a tuple of ``seq[i]`` for each ``i`` in ``indices``, in order.

    ``seq`` may be anything that supports indexing: a sequence indexed by
    position or a mapping indexed by key.
    """
    return tuple(seq[i] for i in indices)


class DependencyAnalysis(object):
    """
    Finds dependencies between groups of attributes of a relation.

    ``rel`` only needs a ``tuples()`` method that yields one dictionary
    (attribute -> value) per tuple.
    """

    def __init__(self, rel):
        """
        Record, for every attribute, the set of values it takes in ``rel``.
        """
        self.rel = rel
        # attribute -> set of every value seen for that attribute
        self.possible_values = defaultdict(set)

        for tup in self.rel.tuples():
            for attribute, value in tup.items():
                self.possible_values[attribute].add(value)

    def _value_sets(self, cols):
        """
        Return the possible-value set of each attribute in ``cols``.
        """
        # .get() instead of indexing: indexing the defaultdict with an
        # attribute that never occurred would silently insert an empty set
        # into possible_values as a side effect of a read.
        return tuple(self.possible_values.get(col, frozenset()) for col in cols)

    def find_dependencies(self, cols_i, cols_j):
        """
        Yield the values of ``cols_i`` that restrict the values of ``cols_j``.

        cols_i, cols_j: tuples of attribute names.

        For every combination of possible values of the ``cols_i`` attributes,
        yields ``(i_value, j_values)`` when the set of ``cols_j`` value
        combinations occurring together with ``i_value`` is a proper subset
        of all possible ``cols_j`` combinations. ``i_value`` is a tuple and
        ``j_values`` is a set of tuples. A combination of ``cols_i`` values
        that never occurs in the relation is reported with an empty set.
        """
        # The full set of dependent-value combinations does not change per
        # i_value, so build it once.
        all_j_values = set(cartesian_product(self._value_sets(cols_j)))

        # One pass over the relation groups the dependent values by their
        # determinant values, instead of rescanning the whole relation for
        # every candidate i_value.
        observed = defaultdict(set)
        for tup in self.rel.tuples():
            observed[non_contig_slice(tup, cols_i)].add(
                non_contig_slice(tup, cols_j))

        for i_value in cartesian_product(self._value_sets(cols_i)):
            j_values = observed.get(i_value, set())
            # "<" on sets is the proper-subset test
            if j_values < all_j_values:
                yield i_value, j_values
# Relational Python
# by James Tauber
#
# Improvements to Rel class suggested by Kent Johnson


class Rel(object):
    """
    A relation.

    Essentially a set of dictionaries (called tuples) where each dictionary has
    identical keys (called attributes).

    Internally, each tuple is stored as a Python tuple rather than a dictionary
    and the relation also keeps an ordered list of the attributes which is used
    as the index into the tuples.
    """

    def __init__(self, attributes, dictset=()):
        """
        create a relation with the given attributes.

        attributes: iterable of attribute names; the order in which it is
            iterated becomes the column order of the stored tuples.
        dictset: optional iterable of initial tuples, each either a
            dictionary keyed by attribute or a Python tuple that is already
            in attribute order.
        """

        self.attributes_ = tuple(attributes)
        self.tuples_ = set(self._convert_dict(d) for d in dictset)

    def attributes(self):
        """
        return the set of attributes.
        """

        return set(self.attributes_)

    def _convert_dict(self, tup):
        """
        convert a dictionary to the internal representation of a tuple.
        """

        # don't convert if already a tuple
        if isinstance(tup, tuple):
            return tup
        return tuple(tup[attribute] for attribute in self.attributes_)

    def add(self, tup=None, **kwargs):
        """
        add the given dictionary or keyword args to the relation as a tuple.
        """

        if tup is None:
            tup = kwargs
        self.tuples_.add(self._convert_dict(tup))

    def add_tuple(self, tup):
        """
        add the given python tuple to the relation

        The values must already be in attribute order. Any other sequence
        (e.g. a list) is converted to a tuple so that it can be stored in
        the underlying set.
        """
        self.tuples_.add(tuple(tup))

    def add_multiple(self, tupset):
        """
        add the given dictionaries to the relation as tuples.
        """

        self.tuples_.update(self._convert_dict(tup) for tup in tupset)

    def _tuples(self):
        """
        return the set of tuples in their internal (Python tuple) form.
        """
        return self.tuples_

    def tuples(self):
        """
        return a generator over the tuples in this relation.

        Each item the generator yields is a dictionary.
        """

        for tup in self._tuples():
            yield dict(zip(self.attributes_, tup))

    def _render_lines(self):
        """
        return the lines of the tabular form of the relation, top to bottom.
        """

        # rows come from self.tuples() rather than self.tuples_ so that this
        # also works on views, where tuples() is dynamic. Every cell goes
        # through str() so non-string values (e.g. integers) can be measured
        # and padded.
        header = [str(attribute) for attribute in self.attributes_]
        rows = [[str(value) for value in self._convert_dict(tupdict)]
                for tupdict in self.tuples()]

        columns = range(len(header))
        col_width = [max([len(header[col])] + [len(row[col]) for row in rows])
                     for col in columns]

        hline = "".join("+-" + ("-" * width) + "-" for width in col_width) + "+"

        def line(row):
            cells = ["| " + row[col].ljust(col_width[col]) + " "
                     for col in columns]
            return "".join(cells) + "|"

        return ([hline, line(header), hline] +
                [line(row) for row in rows] +
                [hline])

    def display(self):
        """
        display the relation in tabular form.
        """

        for text in self._render_lines():
            # a single parenthesised argument prints the same under both the
            # print statement and the print function
            print(text)


def project(orig_dict, attributes):
    """
    return a copy of orig_dict restricted to the keys in attributes.
    """
    return dict((key, value) for key, value in orig_dict.items()
                if key in attributes)


def _aligned_tuples(rel, attributes):
    """
    return rel's tuples as a set of Python tuples ordered by attributes.
    """
    # goes through tuples() so that views and relations whose attributes are
    # stored in a different order are handled correctly
    return set(tuple(tup[attribute] for attribute in attributes)
               for tup in rel.tuples())


def _require_same_attributes(rel_1, rel_2):
    """
    raise AssertionError unless both relations have the same attribute set.
    """
    # raised explicitly so the check is not stripped when running with -O
    if rel_1.attributes() != rel_2.attributes():
        raise AssertionError("relations must have the same attributes")


def PROJECT(orig_rel, attributes):
    """
    return a new relation holding only the given attributes of orig_rel.

    Tuples that become identical after projection are stored once.
    """
    return Rel(attributes,
               [project(tup, attributes) for tup in orig_rel.tuples()])


def RESTRICT(orig_rel, restriction):
    """
    return a new relation with the tuples of orig_rel for which
    restriction(tuple_as_dictionary) is true.
    """
    # attributes_ (ordered) rather than attributes() (a set) keeps the
    # column order of the original relation
    return Rel(orig_rel.attributes_,
               [tup for tup in orig_rel.tuples() if restriction(tup)])


def INTERSECT(rel_1, rel_2):
    """
    return a new relation with the tuples present in both relations.

    Raises AssertionError if the relations have different attributes.
    The result uses rel_1's attribute order.
    """
    _require_same_attributes(rel_1, rel_2)
    heading = rel_1.attributes_
    return Rel(heading,
               _aligned_tuples(rel_1, heading) & _aligned_tuples(rel_2, heading))


def UNION(rel_1, rel_2):
    """
    return a new relation with the tuples present in either relation.

    Raises AssertionError if the relations have different attributes.
    The result uses rel_1's attribute order.
    """
    _require_same_attributes(rel_1, rel_2)
    heading = rel_1.attributes_
    return Rel(heading,
               _aligned_tuples(rel_1, heading) | _aligned_tuples(rel_2, heading))


class _View(Rel):
    """
    Shared behaviour of the read-only views below.

    A view stores no tuples of its own; subclasses compute tuples() from
    orig_rel every time it is called.
    """

    def __init__(self, orig_rel, attributes):
        Rel.__init__(self, attributes)
        self.orig_rel = orig_rel

    def _tuples(self):
        # derive the internal form from the dynamic tuples() instead of the
        # (always empty) tuples_ set inherited from Rel
        return set(self._convert_dict(tup) for tup in self.tuples())

    def add(self, tup=None, **kwargs):
        raise Exception("cannot add to a view")  # pragma: no cover

    def add_tuple(self, tup):
        raise Exception("cannot add to a view")  # pragma: no cover

    def add_multiple(self, tupset):
        raise Exception("cannot add to a view")  # pragma: no cover


class PROJECT_VIEW(_View):
    """
    A live projection of orig_rel onto the given attributes.

    Tuples added to orig_rel after the view was created show up in the view.
    """

    def __init__(self, orig_rel, attributes):
        _View.__init__(self, orig_rel, attributes)

    def tuples(self):
        # distinct source tuples can project to the same dictionary; yield
        # each projected tuple only once, as PROJECT does
        seen = set()
        for tup in self.orig_rel.tuples():
            projected = project(tup, self.attributes_)
            key = frozenset(projected.items())
            if key not in seen:
                seen.add(key)
                yield projected


class RESTRICT_VIEW(_View):
    """
    A live restriction of orig_rel to the tuples for which
    restriction(tuple_as_dictionary) is true.
    """

    def __init__(self, orig_rel, restriction):
        # attributes_ (ordered) keeps the column order of orig_rel
        _View.__init__(self, orig_rel, orig_rel.attributes_)
        self.restriction = restriction

    def tuples(self):
        for tup in self.orig_rel.tuples():
            if self.restriction(tup):
                yield tup