├── importer.py
├── dep_examples.py
├── README
├── dep.py
├── rel_examples.py
└── rel.py


/importer.py:
--------------------------------------------------------------------------------
 1 | from rel import Rel
 2 | 
 3 | def load_file(filename, attributes):
 4 |     """
 5 |     load the space-delimited file with the given filenames assuming the
 6 |     attributes given (as a pythontuple)
 7 |     """
 8 |     
 9 |     r = Rel(attributes)
10 |     
11 |     for line in open("ccat.txt"):
12 |         r.add_tuple(tuple(line.strip().split()))
13 |     
14 |     return r
15 | 


--------------------------------------------------------------------------------
/dep_examples.py:
--------------------------------------------------------------------------------
 1 | from dep import DependencyAnalysis
 2 | 
 3 | from rel import Rel
 4 | 
 5 | r = Rel(("x", "y"))
 6 | r.add(x="A", y=1)
 7 | r.add(x="B", y=1)
 8 | r.add(x="C", y=1)
 9 | r.add(x="A", y=2)
10 | r.add(x="C", y=2)
11 | 
12 | d = DependencyAnalysis(r)
13 | 
14 | # this will print "('B',) : (1,)" because the only dependency between the x
15 | # and y columns is that if x = "B" then y can only be 1.
16 | 
17 | for t in d.find_dependencies(("x",), ("y",)):
18 |     print str(t[0]), ":", ", ".join([str(v) for v in t[1]])
19 | 
20 | r.add(x="B", y=2)
21 | 
22 | d2 = DependencyAnalysis(r)
23 | 
24 | # this will print nothing because there is now no dependency between x and y
25 | # in other words, y can take all values regardless of what x is.
26 | 
27 | for t in d2.find_dependencies(("x",), ("y",)):
28 |     print str(t[0]), ":", ", ".join([str(v) for v in t[1]])
29 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | Rel is an exploration of the relational model and data analysis in Python.
 2 | 
 3 | I'm starting off just bringing together code I had on my blog from various
 4 | posts in 2005, initially focusing on implementing relations, a few relational
 5 | operators and exploring functional dependency analysis.
 6 | 
 7 | Still to come is broader support of the relational model, use of namedtuples,
 8 | use of itertools, importers and exporters (including possible support for
 9 | Django's fixtures format) and more utility functions I have scattered all
10 | over the place in various data analysis scripts I've written over the years.
11 | 
12 | 
13 | BLOG POSTS
14 | 
15 | http://jtauber.com/blog/2005/11/09/relational_python/
16 | http://jtauber.com/blog/2005/11/10/relational_python:_basic_class_for_relations/
17 | http://jtauber.com/blog/2005/11/11/relational_python:_displaying_relations/
18 | http://jtauber.com/blog/2005/11/17/relational_python:_projection/
19 | http://jtauber.com/blog/2005/11/30/relational_python:_restrict/
20 | 
21 | http://jtauber.com/blog/2005/05/26/finding_dependencies_in_tabular_data/
22 | http://jtauber.com/blog/2005/05/27/finding_dependencies_in_tabular_data,_part_2/
23 | 
24 | 


--------------------------------------------------------------------------------
/dep.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | 
 3 | def cartesian_product(sets, done=()):
 4 |     if sets:
 5 |         for element in sets[0]:
 6 |             for tup in cartesian_product(sets[1:], done + (element,)):
 7 |                 yield tup
 8 |     else:
 9 |         yield done
10 | 
def non_contig_slice(seq, indices):
    """
    return a tuple of seq[i] for each i in indices, in the order given.

    `seq` can be anything that supports item lookup (a sequence indexed by
    position or a mapping indexed by key).
    """
    # build the tuple in one pass instead of repeated tuple concatenation
    return tuple(seq[i] for i in indices)
16 | 
class DependencyAnalysis:
    """
    finds dependencies between groups of columns in a relation.

    `rel` must provide a tuples() method yielding one dictionary per tuple
    (attribute -> value). On construction, the set of values seen for each
    attribute is recorded in `possible_values`.
    """

    def __init__(self, rel):
        self.rel = rel
        # attribute -> set of every value that attribute takes in rel
        self.possible_values = defaultdict(set)

        for tup in self.rel.tuples():
            for attribute, value in tup.items():
                self.possible_values[attribute].add(value)

    def find_dependencies(self, cols_i, cols_j):
        """
        yield (i_value, j_values) pairs where the cols_i columns constrain
        the cols_j columns.

        `cols_i` and `cols_j` are sequences of attribute names. For every
        combination `i_value` of recorded values for cols_i, `j_values` is
        the set of cols_j value-tuples occurring together with it in the
        relation. The pair is yielded only when `j_values` is a proper subset
        of all combinations of recorded cols_j values.
        """
        i_domains = non_contig_slice(self.possible_values, cols_i)

        # every combination the cols_j columns could take; the same for every
        # i_value, so compute it once
        all_j_values = set(
            cartesian_product(non_contig_slice(self.possible_values, cols_j)))

        # group the cols_j values by their cols_i values in a single pass
        # over the relation rather than rescanning it per i_value
        observed = defaultdict(set)
        for tup in self.rel.tuples():
            observed[non_contig_slice(tup, cols_i)].add(
                non_contig_slice(tup, cols_j))

        for i_value in cartesian_product(i_domains):
            # .get() so combinations that never occur are not added to the map
            j_values = observed.get(i_value, set())
            if j_values < all_j_values:
                yield i_value, j_values
35 | 


--------------------------------------------------------------------------------
/rel_examples.py:
--------------------------------------------------------------------------------
 1 | from rel import Rel, PROJECT, RESTRICT, INTERSECT, UNION, PROJECT_VIEW, RESTRICT_VIEW
 2 | 
 3 | dept = Rel(("DNO", "DNAME", "BUDGET"))
 4 | 
 5 | dept.add(DNO="D1", DNAME="Marketing", BUDGET="10M")
 6 | dept.add(DNO="D2", DNAME="Development", BUDGET="12M")
 7 | dept.add(DNO="D3", DNAME="Research", BUDGET="5M")
 8 | 
 9 | emp = Rel(("ENO", "ENAME", "DNO", "SALARY"))
10 | 
11 | emp.add(ENO="E1", ENAME="Lopez", DNO="D1", SALARY="40K")
12 | emp.add(ENO="E2", ENAME="Cheng", DNO="D1", SALARY="42K")
13 | emp.add(ENO="E3", ENAME="Finzi", DNO="D2", SALARY="30K")
14 | 
15 | emp2 = Rel(("ENO", "ENAME", "DNO", "SALARY"))
16 | emp2.add_multiple([
17 |     dict(ENO="E3", ENAME="Finzi", DNO="D2", SALARY="30K"),
18 |     dict(ENO="E4", ENAME="Saito", DNO="D2", SALARY="35K")
19 | ])
20 | 
21 | dept.display()
22 | emp.display()
23 | emp2.display()
24 | 
25 | print
26 | print "PROJECT"
27 | PROJECT(emp, ("ENO", "ENAME")).display()
28 | 
29 | print
30 | print "RESTRICT"
31 | RESTRICT(emp, lambda tup: tup["SALARY"] <= "40K").display()
32 | 
33 | print
34 | print "INTERSECT"
35 | INTERSECT(emp, emp2).display()
36 | 
37 | print
38 | print "UNION"
39 | UNION(emp, emp2).display()
40 | 
41 | p = PROJECT_VIEW(emp, ("ENO", "ENAME"))
42 | p.display()
43 | emp.add(ENO="E4", ENAME="Saito", DNO="D2", SALARY="35K")
44 | p.display()
45 | 
46 | r = RESTRICT_VIEW(emp2, lambda tup: tup["SALARY"] <= "40K")
47 | r.display()
48 | emp2.add(ENO="E1", ENAME="Lopez", DNO="D1", SALARY="40K")
49 | r.display()
50 | 


--------------------------------------------------------------------------------
/rel.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Relational Python
  3 | # by James Tauber
  4 | #
  5 | # Improvements to Rel class suggested by Kent Johnson
  6 | 
  7 | 
  8 | class Rel:
  9 |     """
 10 |     A relation.
 11 |     
 12 |     Essentially a set of dictionaries (called tuples) where each dictionary has 
 13 |     identical keys (called attributes).
 14 |     
 15 |     Internally, each tuple is stored as a Python tuple rather than a dictionary
 16 |     and the relation also keeps an ordered list of the attributes which is used
 17 |     as the index into the tuples.
 18 |     """
 19 |     
 20 |     def __init__(self, attributes, dictset=set()):
 21 |         """
 22 |         create a relation with the given attributes.
 23 |         """
 24 |         
 25 |         self.attributes_ = tuple(attributes)
 26 |         self.tuples_ = set()
 27 |         self.tuples_.update(set([self._convert_dict(d) for d in dictset]))
 28 |     
 29 |     def attributes(self):
 30 |         """
 31 |         return the set of attributes.
 32 |         """
 33 |         
 34 |         return set(self.attributes_)
 35 |     
 36 |     def _convert_dict(self, tup):
 37 |         """
 38 |         convert a dictionary to the internal representation of a tuple.
 39 |         """
 40 |         
 41 |         # don't convert if already a tuple
 42 |         if isinstance(tup, tuple):
 43 |             return tup
 44 |         else:
 45 |             return tuple([tup[attribute] for attribute in self.attributes_])
 46 |     
 47 |     def add(self, tup=None, **kwargs):
 48 |         """
 49 |         add the given dictionary or keyword args to the relation as a tuple.
 50 |         """
 51 |         
 52 |         if tup is None:
 53 |             tup = kwargs
 54 |         self.tuples_.add(self._convert_dict(tup))
 55 |     
 56 |     def add_tuple(self, tup):
 57 |         """
 58 |         add the given python tuple to the relation
 59 |         """
 60 |         self.tuples_.add(tup)
 61 |     
 62 |     def add_multiple(self, tupset):
 63 |         """
 64 |         add the given dictionaries to the relation as tuples.
 65 |         """
 66 |         
 67 |         self.tuples_.update(set([self._convert_dict(tup) for tup in tupset]))
 68 |     
 69 |     def _tuples(self):
 70 |         return self.tuples_
 71 |     
 72 |     def tuples(self):
 73 |         """
 74 |         return a generator over the tuples in this relation.
 75 |         
 76 |         Each item the generator yields is a dictionary.
 77 |         """
 78 |         
 79 |         for tup in self._tuples():
 80 |             yield dict(zip(self.attributes_, tup))
 81 |     
 82 |     def display(self):
 83 |         """
 84 |         display the relation in tabular form.
 85 |         """
 86 |         
 87 |         # if it seems inefficient that display uses self.tuples() rather than
 88 |         # self.tuples_, it is because that way it will work on views where
 89 |         # tuples() is dynamic
 90 |         
 91 |         columns = range(len(self.attributes_))
 92 |         
 93 |         col_width = [len(self.attributes_[col]) for col in columns]
 94 |         
 95 |         for tupdict in self.tuples():
 96 |             tup = self._convert_dict(tupdict)
 97 |             for col in columns:
 98 |                 col_width[col] = max(col_width[col], len(tup[col]))
 99 |         
100 |         hline = ""
101 |         for col in columns:
102 |             hline += "+-" + ("-" * col_width[col]) + "-"
103 |         hline += "+"
104 |         
105 |         def line(row):
106 |             l = ""
107 |             for col in columns:
108 |                 value = row[col]
109 |                 l += "| " + value + (" " * (col_width[col] - len(value))) + " "
110 |             l += "|"
111 |             return l
112 |         
113 |         print hline
114 |         print line(self.attributes_)
115 |         print hline
116 |         
117 |         for tup in self.tuples():
118 |             print line(self._convert_dict(tup))
119 |         
120 |         print hline
121 | 
122 | 
def project(orig_dict, attributes):
    """
    return a new dictionary holding only the items of orig_dict whose key is
    in attributes.
    """
    kept = {}
    for key, value in orig_dict.items():
        if key in attributes:
            kept[key] = value
    return kept
125 | 
126 | 
def PROJECT(orig_rel, attributes):
    """
    return a new relation with the given attributes, holding the projection
    of every tuple of orig_rel onto those attributes.
    """
    projected = []
    for tup in orig_rel.tuples():
        projected.append(project(tup, attributes))
    return Rel(attributes, projected)
129 | 
130 | 
def RESTRICT(orig_rel, restriction):
    """
    return a new relation holding the tuples of orig_rel for which
    restriction(tup) is true.

    `restriction` is called with each tuple as a dictionary. The result keeps
    orig_rel's attribute order.
    """
    # attributes_ is the ordered form; attributes() returns a set, which
    # would lose the column order
    return Rel(
        orig_rel.attributes_,
        [tup for tup in orig_rel.tuples() if restriction(tup)])
133 | 
134 | 
def INTERSECT(rel_1, rel_2):
    """
    return a new relation holding the tuples present in both rel_1 and rel_2.

    Both relations must have the same set of attributes (checked with an
    assert). The result uses rel_1's attribute order.
    """
    assert rel_1.attributes() == rel_2.attributes()

    # Go through tuples() so views contribute their dynamic tuples, and put
    # every tuple into rel_1's column order so values are compared and stored
    # under the right attribute even if rel_2 orders its columns differently.
    to_row = rel_1._convert_dict
    rows_1 = set(to_row(tup) for tup in rel_1.tuples())
    rows_2 = set(to_row(tup) for tup in rel_2.tuples())

    return Rel(rel_1.attributes_, rows_1 & rows_2)
138 | 
139 | 
def UNION(rel_1, rel_2):
    """
    return a new relation holding the tuples present in rel_1, rel_2 or both.

    Both relations must have the same set of attributes (checked with an
    assert). The result uses rel_1's attribute order.
    """
    assert rel_1.attributes() == rel_2.attributes()

    # Go through tuples() so views contribute their dynamic tuples, and put
    # every tuple into rel_1's column order so values are stored under the
    # right attribute even if rel_2 orders its columns differently.
    to_row = rel_1._convert_dict
    rows_1 = set(to_row(tup) for tup in rel_1.tuples())
    rows_2 = set(to_row(tup) for tup in rel_2.tuples())

    return Rel(rel_1.attributes_, rows_1 | rows_2)
143 | 
144 | 
class PROJECT_VIEW(Rel):
    """
    A read-only projection of another relation onto the given attributes.

    Tuples are computed from orig_rel each time tuples() is called, so later
    changes to orig_rel show up in the view.
    """

    def __init__(self, orig_rel, attributes):
        Rel.__init__(self, attributes)
        self.orig_rel = orig_rel

    def add(self, tup=None, **kwargs):
        """
        always raises: tuples cannot be added to a view.

        Takes the same arguments as Rel.add so every calling form gets the
        same exception.
        """
        raise Exception("cannot add to a PROJECT_VIEW") # pragma: no cover

    def _tuples(self):
        """
        return the view's current tuples as a set of Python tuples.
        """
        # the inherited backing set is never filled for a view, so compute
        # the set from the dynamic tuples() instead
        return set(self._convert_dict(tup) for tup in self.tuples())

    def tuples(self):
        """
        return a generator over the projected tuples, as dictionaries.

        Projections that are equal are yielded only once.
        """
        seen = set()
        for tup in self.orig_rel.tuples():
            projected = project(tup, self.attributes_)
            # dictionaries are not hashable, so track a frozenset of items
            key = frozenset(projected.items())
            if key not in seen:
                seen.add(key)
                yield projected
157 | 
158 | 
class RESTRICT_VIEW(Rel):
    """
    A read-only restriction of another relation to the tuples for which
    restriction(tup) is true.

    Tuples are computed from orig_rel each time tuples() is called, so later
    changes to orig_rel show up in the view.
    """

    def __init__(self, orig_rel, restriction):
        # attributes_ is the ordered form; attributes() returns a set, which
        # would lose the column order
        Rel.__init__(self, orig_rel.attributes_)
        self.orig_rel = orig_rel
        self.restriction = restriction

    def add(self, tup=None, **kwargs):
        """
        always raises: tuples cannot be added to a view.

        Takes the same arguments as Rel.add so every calling form gets the
        same exception.
        """
        raise Exception("cannot add to a RESTRICT_VIEW") # pragma: no cover

    def _tuples(self):
        """
        return the view's current tuples as a set of Python tuples.
        """
        # the inherited backing set is never filled for a view, so compute
        # the set from the dynamic tuples() instead
        return set(self._convert_dict(tup) for tup in self.tuples())

    def tuples(self):
        """
        return a generator over the tuples of orig_rel (as dictionaries) that
        satisfy the restriction.
        """
        for tup in self.orig_rel.tuples():
            if self.restriction(tup):
                yield tup
173 | 


--------------------------------------------------------------------------------