├── v3 ├── collection.py ├── index.py ├── graph.py └── differential_dataflow.py ├── v4 ├── collection.py ├── version.py ├── index.py ├── graph.py └── differential_dataflow.py ├── v1 ├── collection.py ├── index.py └── difference_sequence.py ├── v2 ├── collection.py ├── index.py ├── graph.py └── differential_dataflow.py ├── example.py ├── index.py ├── graph.py ├── order.py ├── README.md ├── v0 └── collection.py ├── collection.py └── differential_dataflow.py /v3/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def consolidate(self): 40 | """Produce as output a collection that is logically equivalent to the input 41 | but which combines identical instances of the same record into one 42 | (record, multiplicity) pair. 43 | """ 44 | consolidated = defaultdict(int) 45 | for (data, multiplicity) in self._inner: 46 | consolidated[data] += multiplicity 47 | consolidated = [ 48 | (data, multiplicity) 49 | for (data, multiplicity) in consolidated.items() 50 | if multiplicity != 0 51 | ] 52 | consolidated.sort() 53 | return Collection(consolidated) 54 | 55 | def _extend(self, other): 56 | self._inner.extend(other._inner) 57 | -------------------------------------------------------------------------------- /v4/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def consolidate(self): 40 | """Produce as output a collection that is logically equivalent to the input 41 | but which combines identical instances of the same record into one 42 | (record, multiplicity) pair. 
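        For example, [('a', 1), ('b', 2), ('a', 2), ('b', -2)] consolidates to [('a', 3)]:
        the two 'a' multiplicities sum to 3, and the 'b' multiplicities cancel out and are dropped.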
43 | """ 44 | consolidated = defaultdict(int) 45 | for (data, multiplicity) in self._inner: 46 | consolidated[data] += multiplicity 47 | consolidated = [ 48 | (data, multiplicity) 49 | for (data, multiplicity) in consolidated.items() 50 | if multiplicity != 0 51 | ] 52 | consolidated.sort() 53 | return Collection(consolidated) 54 | 55 | def _extend(self, other): 56 | self._inner.extend(other._inner) 57 | -------------------------------------------------------------------------------- /v1/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair. 50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def _extend(self, other): 63 | self._inner.extend(other._inner) 64 | -------------------------------------------------------------------------------- /v2/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 
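Records are stored as (data, multiplicity) pairs; a negative multiplicity represents a
retraction (deletion) of that record.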
2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair. 50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def _extend(self, other): 63 | self._inner.extend(other._inner) 64 | -------------------------------------------------------------------------------- /v4/version.py: -------------------------------------------------------------------------------- 1 | """The implementation of totally ordered, multidimensional versions (times) for use within a differential dataflow. 2 | """ 3 | 4 | 5 | class Version: 6 | """A totally ordered version (time), consisting of a tuple of 7 | integers, ordered lexicographically. 8 | 9 | All versions within a scope of a dataflow must have the same dimension/number 10 | of coordinates. 
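    For example, Version((0, 3)) < Version((1, 0)) < Version((1, 2)) under the
    lexicographic order.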
11 | """ 12 | 13 | def __init__(self, version): 14 | if isinstance(version, int): 15 | assert version >= 0 16 | self.inner = (version,) 17 | elif isinstance(version, list) or isinstance(version, tuple): 18 | for i in version: 19 | assert isinstance(i, int) 20 | assert i >= 0 21 | self.inner = tuple(version) 22 | else: 23 | assert 0 > 1 24 | 25 | def __repr__(self): 26 | return f"Version({self.inner})" 27 | 28 | def __eq__(self, other): 29 | return self.inner == other.inner 30 | 31 | def __lt__(self, other): 32 | return self.inner.__lt__(other.inner) 33 | 34 | def __le__(self, other): 35 | return self.__lt__(other) or self.__eq__(other) 36 | 37 | def __hash__(self): 38 | return hash(self.inner) 39 | 40 | def _validate(self, other): 41 | assert len(self.inner) > 0 42 | assert len(self.inner) == len(other.inner) 43 | 44 | def extend(self): 45 | elements = [e for e in self.inner] 46 | elements.append(0) 47 | return Version(elements) 48 | 49 | def truncate(self): 50 | elements = [e for e in self.inner] 51 | elements.pop() 52 | return Version(elements) 53 | 54 | def apply_step(self, step, max_value): 55 | assert step > 0 56 | assert len(self.inner) > 1 57 | elements = [e for e in self.inner] 58 | 59 | pos = 1 60 | while True: 61 | if elements[-pos] < max_value or pos == len(elements): 62 | elements[-pos] += step 63 | break 64 | else: 65 | elements[-pos] = 0 66 | pos += 1 67 | output = Version(elements) 68 | assert output > self 69 | return output 70 | -------------------------------------------------------------------------------- /v1/index.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collection import Collection 3 | 4 | 5 | class Index: 6 | def __init__(self, compaction_frontier=None): 7 | self._index = defaultdict(list) 8 | 9 | def __repr__(self): 10 | return "Index({self._index})" 11 | 12 | def add_value(self, key, value): 13 | """Add a (value, multiplicity) pair for the requested key.""" 14 | self._index[key].append(value) 15 | 16 | def append(self, other): 17 | """Combine all of the data in other into self.""" 18 | for (key, data) in other._index.items(): 19 | self._index[key].extend(data) 20 | 21 | def get(self, key): 22 | if key in self._index: 23 | return self._index[key] 24 | return [] 25 | 26 | def join(self, other): 27 | """Produce a bounded collection trace containing (key, (val1, val2)) 28 | for all (key, val1) in the first index, and (key, val2) in the second 29 | index. 
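        For example, if this index maps 'k' to [(1, 2)] and the other index maps 'k'
        to [(3, 1)], the output collection contains (('k', (1, 3)), 2).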
30 | """ 31 | out = [] 32 | for (key, data1) in self._index.items(): 33 | if key not in other._index: 34 | continue 35 | data2 = other._index[key] 36 | 37 | for (val1, mul1) in data1: 38 | for (val2, mul2) in data2: 39 | out.append(((key, (val1, val2)), mul1 * mul2)) 40 | return Collection(out) 41 | 42 | def compact(self, keys=[]): 43 | def consolidate_values(values): 44 | consolidated = defaultdict(int) 45 | for (value, multiplicity) in values: 46 | consolidated[value] += multiplicity 47 | 48 | return [ 49 | (value, multiplicity) 50 | for (value, multiplicity) in consolidated.items() 51 | if multiplicity != 0 52 | ] 53 | 54 | if keys == []: 55 | keys = [key for key in self._index.keys()] 56 | 57 | for key in keys: 58 | if key not in self._index: 59 | continue 60 | data = self._index.pop(key) 61 | consolidated = consolidate_values(data) 62 | 63 | if consolidated != []: 64 | self._index[key].extend(consolidated) 65 | -------------------------------------------------------------------------------- /v2/index.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collection import Collection 3 | 4 | 5 | class Index: 6 | def __init__(self, compaction_frontier=None): 7 | self._index = defaultdict(list) 8 | 9 | def __repr__(self): 10 | return "Index({self._index})" 11 | 12 | def add_value(self, key, value): 13 | """Add a (value, multiplicity) pair for the requested key.""" 14 | self._index[key].append(value) 15 | 16 | def append(self, other): 17 | """Combine all of the data in other into self.""" 18 | for (key, data) in other._index.items(): 19 | self._index[key].extend(data) 20 | 21 | def get(self, key): 22 | if key in self._index: 23 | return self._index[key] 24 | return [] 25 | 26 | def join(self, other): 27 | """Produce a bounded collection trace containing (key, (val1, val2)) 28 | for all (key, val1) in the first index, and (key, val2) in the second 29 | index. 
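        Multiplicities multiply: a value with multiplicity m1 joined against a value
        with multiplicity m2 appears in the output with multiplicity m1 * m2.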
30 | """ 31 | out = [] 32 | for (key, data1) in self._index.items(): 33 | if key not in other._index: 34 | continue 35 | data2 = other._index[key] 36 | 37 | for (val1, mul1) in data1: 38 | for (val2, mul2) in data2: 39 | out.append(((key, (val1, val2)), mul1 * mul2)) 40 | return Collection(out) 41 | 42 | def compact(self, keys=[]): 43 | def consolidate_values(values): 44 | consolidated = defaultdict(int) 45 | for (value, multiplicity) in values: 46 | consolidated[value] += multiplicity 47 | 48 | return [ 49 | (value, multiplicity) 50 | for (value, multiplicity) in consolidated.items() 51 | if multiplicity != 0 52 | ] 53 | 54 | if keys == []: 55 | keys = [key for key in self._index.keys()] 56 | 57 | for key in keys: 58 | if key not in self._index: 59 | continue 60 | data = self._index.pop(key) 61 | consolidated = consolidate_values(data) 62 | 63 | if consolidated != []: 64 | self._index[key].extend(consolidated) 65 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from collection import Collection 2 | from order import Version, Antichain 3 | from differential_dataflow import GraphBuilder 4 | 5 | 6 | def game_of_life(collection): 7 | maybe_live_cells = collection.map(lambda data: ((data[0] - 1, data[1] - 1), ())) 8 | maybe_live_cells = maybe_live_cells.concat( 9 | collection.map(lambda data: ((data[0] - 1, data[1]), ())) 10 | ) 11 | 12 | maybe_live_cells = maybe_live_cells.concat( 13 | collection.map(lambda data: ((data[0] - 1, data[1] + 1), ())) 14 | ) 15 | maybe_live_cells = maybe_live_cells.concat( 16 | collection.map(lambda data: ((data[0], data[1] - 1), ())) 17 | ) 18 | maybe_live_cells = maybe_live_cells.concat( 19 | collection.map(lambda data: ((data[0], data[1] + 1), ())) 20 | ) 21 | maybe_live_cells = maybe_live_cells.concat( 22 | collection.map(lambda data: ((data[0] + 1, data[1] - 1), ())) 23 | ) 24 | maybe_live_cells = maybe_live_cells.concat( 25 | collection.map(lambda data: ((data[0] + 1, data[1]), ())) 26 | ) 27 | maybe_live_cells = maybe_live_cells.concat( 28 | collection.map(lambda data: ((data[0] + 1, data[1] + 1), ())) 29 | ) 30 | 31 | maybe_live_cells = maybe_live_cells.count() 32 | live_with_three_neighbors = maybe_live_cells.filter(lambda data: data[1] == 3).map( 33 | lambda data: (data[0], ()) 34 | ) 35 | live_with_two_neighbors = ( 36 | maybe_live_cells.filter(lambda data: data[1] == 2) 37 | .join(collection.map(lambda data: (data, ()))) 38 | .map(lambda data: (data[0], ())) 39 | ) 40 | live_next_round = ( 41 | live_with_two_neighbors.concat(live_with_three_neighbors) 42 | .distinct() 43 | .map(lambda data: data[0]) 44 | ) 45 | 46 | return live_next_round 47 | 48 | 49 | graph_builder = GraphBuilder(Antichain([Version(0)])) 50 | input_a, input_a_writer = graph_builder.new_input() 51 | output = input_a.iterate(game_of_life).debug("iterate").connect_reader() 52 | graph = graph_builder.finalize() 53 | 54 | input_a_writer.send_data( 55 | Version(0), Collection([((2, 2), 1), ((2, 3), 1), ((2, 4), 1), ((3, 2), 1)]) 56 | ) 57 | input_a_writer.send_frontier(Antichain([Version(1)])) 58 | 59 | while output.probe_frontier_less_than(Antichain([Version(1)])): 60 | graph.step() 61 | -------------------------------------------------------------------------------- /v2/graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow 
program.""" 2 | 3 | from collections import deque 4 | 5 | 6 | class DifferenceStreamReader: 7 | """A read handle to a dataflow edge that receives data from a writer. 8 | 9 | The data received over this edge are Collection objects that represent difference 10 | collections representing a single logical collection undergoing changes. 11 | """ 12 | 13 | def __init__(self, queue): 14 | self._queue = queue 15 | 16 | def drain(self): 17 | out = [] 18 | while len(self._queue) > 0: 19 | out.append(self._queue.pop()) 20 | 21 | return out 22 | 23 | def is_empty(self): 24 | return len(self._queue) == 0 25 | 26 | 27 | class DifferenceStreamWriter: 28 | """A write handle to a dataflow edge that is allowed to publish data.""" 29 | 30 | def __init__(self): 31 | self._queues = [] 32 | 33 | def send_data(self, collection): 34 | for q in self._queues: 35 | q.appendleft(collection) 36 | 37 | def _new_reader(self): 38 | q = deque() 39 | self._queues.append(q) 40 | return DifferenceStreamReader(q) 41 | 42 | 43 | class Operator: 44 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 45 | one outgoing edge (write handle). 46 | """ 47 | 48 | def __init__(self, inputs, output, f): 49 | self.inputs = inputs 50 | self.output = output 51 | self.f = f 52 | self.pending_work = False 53 | 54 | def run(self): 55 | self.f() 56 | 57 | def pending_work(self): 58 | if self.pending_work is True: 59 | return True 60 | for input_listener in self.inputs: 61 | if input_listener.is_empty() is False: 62 | return True 63 | return False 64 | 65 | 66 | class UnaryOperator(Operator): 67 | """A convenience implementation of a dataflow operator that has a handle to one 68 | incoming stream of data, and one handle to an outgoing stream of data. 69 | """ 70 | 71 | def __init__(self, input_a, output, f): 72 | super().__init__([input_a], output, f) 73 | 74 | def input_messages(self): 75 | return self.inputs[0].drain() 76 | 77 | 78 | class BinaryOperator(Operator): 79 | """A convenience implementation of a dataflow operator that has a handle to two 80 | incoming streams of data, and one handle to an outgoing stream of data. 81 | """ 82 | 83 | def __init__(self, input_a, input_b, output, f): 84 | super().__init__([input_a, input_b], output, f) 85 | 86 | def input_a_messages(self): 87 | return self.inputs[0].drain() 88 | 89 | def input_b_messages(self): 90 | return self.inputs[1].drain() 91 | 92 | 93 | class Graph: 94 | """An implementation of a dataflow graph. 95 | 96 | This implementation needs to keep the entire set of nodes so that they 97 | may be run, and only keeps a set of read handles to all edges for debugging 98 | purposes. Calling this a graph instead of a 'bag of nodes' is misleading, because 99 | this object does not actually know anything about the connections between the 100 | various nodes. 101 | """ 102 | 103 | def __init__(self, streams, operators): 104 | self.streams = streams 105 | self.operators = operators 106 | 107 | def step(self): 108 | for op in self.operators: 109 | op.run() 110 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | """The implementation of index structures roughly analogous to differential arrangements for manipulating and 2 | accessing (key, value) structured data across multiple versions (times). 
3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | from order import Version, Antichain 8 | 9 | 10 | class Index: 11 | """A map from a difference collection trace's keys -> versions at which 12 | the key has nonzero multiplicity -> (value, multiplicities) that changed. 13 | 14 | Used in operations like join and reduce where the operation needs to 15 | exploit the key-value structure of the data to run efficiently. 16 | 17 | This implementation supports the general case of partially ordered versions. 18 | """ 19 | 20 | def __init__(self): 21 | self.inner = defaultdict(lambda: defaultdict(list)) 22 | # TODO: take an initial time? 23 | self.compaction_frontier = None 24 | 25 | def _validate(self, requested_version): 26 | if self.compaction_frontier is None: 27 | return True 28 | if isinstance(requested_version, Antichain): 29 | assert self.compaction_frontier.less_equal(requested_version) 30 | elif isinstance(requested_version, Version): 31 | assert self.compaction_frontier.less_equal_version(requested_version) 32 | 33 | def reconstruct_at(self, key, requested_version): 34 | self._validate(requested_version) 35 | out = [] 36 | for (version, values) in self.inner[key].items(): 37 | if version.less_equal(requested_version): 38 | out.extend(values) 39 | return out 40 | 41 | def versions(self, key): 42 | return [version for version in self.inner[key].keys()] 43 | 44 | def add_value(self, key, version, value): 45 | self._validate(version) 46 | self.inner[key][version].append(value) 47 | 48 | def append(self, other): 49 | for (key, versions) in other.inner.items(): 50 | for (version, data) in versions.items(): 51 | self.inner[key][version].extend(data) 52 | 53 | def join(self, other): 54 | collections = defaultdict(list) 55 | for (key, versions) in self.inner.items(): 56 | if key not in other.inner: 57 | continue 58 | other_versions = other.inner[key] 59 | 60 | for (version1, data1) in versions.items(): 61 | for (version2, data2) in other_versions.items(): 62 | for (val1, mul1) in data1: 63 | for (val2, mul2) in data2: 64 | result_version = version1.join(version2) 65 | collections[result_version].append( 66 | ((key, (val1, val2)), mul1 * mul2) 67 | ) 68 | return [ 69 | (version, Collection(c)) for (version, c) in collections.items() if c != [] 70 | ] 71 | 72 | def compact(self, compaction_frontier, keys=[]): 73 | self._validate(compaction_frontier) 74 | 75 | def consolidate_values(values): 76 | consolidated = defaultdict(int) 77 | for (value, multiplicity) in values: 78 | consolidated[value] += multiplicity 79 | 80 | return [ 81 | (value, multiplicity) 82 | for (value, multiplicity) in consolidated.items() 83 | if multiplicity != 0 84 | ] 85 | 86 | if keys == []: 87 | keys = [key for key in self.inner.keys()] 88 | 89 | for key in keys: 90 | versions = self.inner[key] 91 | to_compact = [ 92 | version 93 | for version in versions.keys() 94 | if compaction_frontier.less_equal_version(version) is not True 95 | ] 96 | to_consolidate = set() 97 | for version in to_compact: 98 | values = versions.pop(version) 99 | new_version = version.advance_by(compaction_frontier) 100 | versions[new_version].extend(values) 101 | to_consolidate.add(new_version) 102 | for version in to_consolidate: 103 | values = versions.pop(version) 104 | versions[version] = consolidate_values(values) 105 | assert self.compaction_frontier is None or self.compaction_frontier.less_equal( 106 | compaction_frontier 107 | ) 108 | self.compaction_frontier = compaction_frontier 109 | 
-------------------------------------------------------------------------------- /v3/index.py: -------------------------------------------------------------------------------- 1 | """The implementation of index structures roughly analogous to differential arrangements for manipulating and 2 | accessing (key, value) structured data across multiple versions (times). 3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | 8 | 9 | class Index: 10 | """A map from a difference collection trace's keys -> versions at which 11 | the key has nonzero multiplicity -> (value, multiplicities) that changed. 12 | 13 | Used in operations like join and reduce where the operation needs to 14 | exploit the key-value structure of the data to run efficiently. 15 | 16 | This implementation is specialized for the case when versions are integers. 17 | """ 18 | 19 | def __init__(self, compaction_frontier=None): 20 | self._index = defaultdict(lambda: defaultdict(list)) 21 | self.compaction_frontier = compaction_frontier 22 | 23 | def __repr__(self): 24 | return "Index1D({self._index}, {self.compaction_frontier})" 25 | 26 | def _validate(self, requested_version): 27 | """Check that requests are at times allowed by the compaction frontier.""" 28 | assert ( 29 | self.compaction_frontier is None 30 | or requested_version >= self.compaction_frontier 31 | ) 32 | 33 | def reconstruct_at(self, key, requested_version): 34 | """Produce the accumulated ((key, value), multiplicity) records for the given key, at the requested version.""" 35 | self._validate(requested_version) 36 | out = [] 37 | for (version, values) in self._index[key].items(): 38 | if version <= requested_version: 39 | out.extend(values) 40 | return out 41 | 42 | def add_value(self, key, version, value): 43 | """Add a (value, multiplicity) pair for the requested key and version.""" 44 | self._validate(version) 45 | self._index[key][version].append(value) 46 | 47 | def append(self, other): 48 | """Combine all of the data in other into self.""" 49 | for (key, versions) in other._index.items(): 50 | for (version, data) in versions.items(): 51 | self._index[key][version].extend(data) 52 | 53 | def join(self, other): 54 | """Produce a bounded collection trace containing (key, (val1, val2)) 55 | for all (key, val1) in the first index, and (key, val2) in the second 56 | index. 57 | 58 | All outputs are produced at output version = max(version of record 1, 59 | version of record 2). 60 | """ 61 | collections = defaultdict(list) 62 | for (key, versions) in self._index.items(): 63 | if key not in other._index: 64 | continue 65 | other_versions = other._index[key] 66 | 67 | for (version1, data1) in versions.items(): 68 | for (version2, data2) in other_versions.items(): 69 | result_version = max(version1, version2) 70 | for (val1, mul1) in data1: 71 | for (val2, mul2) in data2: 72 | collections[result_version].append( 73 | ((key, (val1, val2)), mul1 * mul2) 74 | ) 75 | return [ 76 | (version, Collection(c)) for (version, c) in collections.items() if c != [] 77 | ] 78 | 79 | def compact(self, compaction_version, keys=[]): 80 | """Combine all changes observed before the requested compaction_version 81 | into the compaction_version. 
82 | """ 83 | self._validate(compaction_version) 84 | 85 | def consolidate_values(values): 86 | consolidated = defaultdict(int) 87 | for (value, multiplicity) in values: 88 | consolidated[value] += multiplicity 89 | 90 | return [ 91 | (value, multiplicity) 92 | for (value, multiplicity) in consolidated.items() 93 | if multiplicity != 0 94 | ] 95 | 96 | if keys == []: 97 | keys = [key for key in self._index.keys()] 98 | 99 | for key in keys: 100 | versions = self._index[key] 101 | to_compact = [ 102 | version for version in versions.keys() if version <= compaction_version 103 | ] 104 | values = [] 105 | for version in to_compact: 106 | values.extend(versions.pop(version)) 107 | 108 | versions[compaction_version] = consolidate_values(values) 109 | self.compaction_frontier = compaction_version 110 | -------------------------------------------------------------------------------- /v4/index.py: -------------------------------------------------------------------------------- 1 | """The implementation of index structures roughly analogous to differential arrangements for manipulating and 2 | accessing (key, value) structured data across multiple versions (times). 3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | 8 | 9 | class Index: 10 | """A map from a difference collection trace's keys -> versions at which 11 | the key has nonzero multiplicity -> (value, multiplicities) that changed. 12 | 13 | Used in operations like join and reduce where the operation needs to 14 | exploit the key-value structure of the data to run efficiently. 15 | 16 | This implementation is specialized for the case when versions are integers. 17 | """ 18 | 19 | def __init__(self, compaction_frontier=None): 20 | self._index = defaultdict(lambda: defaultdict(list)) 21 | self.compaction_frontier = compaction_frontier 22 | 23 | def __repr__(self): 24 | return "Index1D({self._index}, {self.compaction_frontier})" 25 | 26 | def _validate(self, requested_version): 27 | """Check that requests are at times allowed by the compaction frontier.""" 28 | assert ( 29 | self.compaction_frontier is None 30 | or requested_version >= self.compaction_frontier 31 | ) 32 | 33 | def reconstruct_at(self, key, requested_version): 34 | """Produce the accumulated ((key, value), multiplicity) records for the given key, at the requested version.""" 35 | self._validate(requested_version) 36 | out = [] 37 | for (version, values) in self._index[key].items(): 38 | if version <= requested_version: 39 | out.extend(values) 40 | return out 41 | 42 | def add_value(self, key, version, value): 43 | """Add a (value, multiplicity) pair for the requested key and version.""" 44 | self._validate(version) 45 | self._index[key][version].append(value) 46 | 47 | def append(self, other): 48 | """Combine all of the data in other into self.""" 49 | for (key, versions) in other._index.items(): 50 | for (version, data) in versions.items(): 51 | self._index[key][version].extend(data) 52 | 53 | def join(self, other): 54 | """Produce a bounded collection trace containing (key, (val1, val2)) 55 | for all (key, val1) in the first index, and (key, val2) in the second 56 | index. 57 | 58 | All outputs are produced at output version = max(version of record 1, 59 | version of record 2). 
60 | """ 61 | collections = defaultdict(list) 62 | for (key, versions) in self._index.items(): 63 | if key not in other._index: 64 | continue 65 | other_versions = other._index[key] 66 | 67 | for (version1, data1) in versions.items(): 68 | for (version2, data2) in other_versions.items(): 69 | result_version = max(version1, version2) 70 | for (val1, mul1) in data1: 71 | for (val2, mul2) in data2: 72 | collections[result_version].append( 73 | ((key, (val1, val2)), mul1 * mul2) 74 | ) 75 | return [ 76 | (version, Collection(c)) for (version, c) in collections.items() if c != [] 77 | ] 78 | 79 | def compact(self, compaction_version, keys=[]): 80 | """Combine all changes observed before the requested compaction_version 81 | into the compaction_version. 82 | """ 83 | self._validate(compaction_version) 84 | 85 | def consolidate_values(values): 86 | consolidated = defaultdict(int) 87 | for (value, multiplicity) in values: 88 | consolidated[value] += multiplicity 89 | 90 | return [ 91 | (value, multiplicity) 92 | for (value, multiplicity) in consolidated.items() 93 | if multiplicity != 0 94 | ] 95 | 96 | if keys == []: 97 | keys = [key for key in self._index.keys()] 98 | 99 | for key in keys: 100 | versions = self._index[key] 101 | to_compact = [ 102 | version for version in versions.keys() if version <= compaction_version 103 | ] 104 | values = [] 105 | for version in to_compact: 106 | values.extend(versions.pop(version)) 107 | 108 | versions[compaction_version] = consolidate_values(values) 109 | self.compaction_frontier = compaction_version 110 | -------------------------------------------------------------------------------- /v3/graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow program.""" 2 | 3 | from collections import deque 4 | from enum import Enum 5 | 6 | 7 | class MessageType(Enum): 8 | DATA = 1 9 | FRONTIER = 2 10 | 11 | 12 | class DifferenceStreamReader: 13 | """A read handle to a dataflow edge that receives data and frontier updates from a writer. 14 | 15 | The data received over this edge are pairs of (version, Collection) and the frontier 16 | updates. 17 | """ 18 | 19 | def __init__(self, queue): 20 | self._queue = queue 21 | 22 | def drain(self): 23 | out = [] 24 | while len(self._queue) > 0: 25 | out.append(self._queue.pop()) 26 | 27 | return out 28 | 29 | def is_empty(self): 30 | return len(self._queue) == 0 31 | 32 | def probe_frontier_less_than(self, frontier): 33 | for (typ, msg) in self._queue: 34 | if typ == MessageType.FRONTIER: 35 | received_frontier = msg 36 | if received_frontier >= frontier: 37 | return False 38 | return True 39 | 40 | 41 | class DifferenceStreamWriter: 42 | """A write handle to a dataflow edge that is allowed to publish data and send 43 | frontier updates. 
44 | """ 45 | 46 | def __init__(self): 47 | self._queues = [] 48 | self.frontier = None 49 | 50 | def send_data(self, version, collection): 51 | assert self.frontier is None or self.frontier <= version 52 | for q in self._queues: 53 | q.appendleft((MessageType.DATA, (version, collection))) 54 | 55 | def send_frontier(self, frontier): 56 | assert self.frontier is None or self.frontier <= frontier 57 | 58 | self.frontier = frontier 59 | for q in self._queues: 60 | q.appendleft((MessageType.FRONTIER, frontier)) 61 | 62 | def _new_reader(self): 63 | q = deque() 64 | self._queues.append(q) 65 | return DifferenceStreamReader(q) 66 | 67 | 68 | class Operator: 69 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 70 | one outgoing edge (write handle). 71 | """ 72 | 73 | def __init__(self, inputs, output, f, initial_frontier): 74 | self.inputs = inputs 75 | self.output = output 76 | self.f = f 77 | self.pending_work = False 78 | self.input_frontiers = [initial_frontier for _ in self.inputs] 79 | self.output_frontier = initial_frontier 80 | 81 | def run(self): 82 | self.f() 83 | 84 | def pending_work(self): 85 | if self.pending_work is True: 86 | return True 87 | for input_listener in self.inputs: 88 | if input_listener.is_empty() is False: 89 | return True 90 | return False 91 | 92 | def frontiers(self): 93 | return (self.input_frontiers, self.output_frontier) 94 | 95 | 96 | class UnaryOperator(Operator): 97 | """A convenience implementation of a dataflow operator that has a handle to one 98 | incoming stream of data, and one handle to an outgoing stream of data. 99 | """ 100 | 101 | def __init__(self, input_a, output, f, initial_frontier): 102 | super().__init__([input_a], output, f, initial_frontier) 103 | 104 | def input_messages(self): 105 | return self.inputs[0].drain() 106 | 107 | def input_frontier(self): 108 | return self.input_frontiers[0] 109 | 110 | def set_input_frontier(self, frontier): 111 | self.input_frontiers[0] = frontier 112 | 113 | 114 | class BinaryOperator(Operator): 115 | """A convenience implementation of a dataflow operator that has a handle to two 116 | incoming streams of data, and one handle to an outgoing stream of data. 117 | """ 118 | 119 | def __init__(self, input_a, input_b, output, f, initial_frontier): 120 | super().__init__([input_a, input_b], output, f, initial_frontier) 121 | 122 | def input_a_messages(self): 123 | return self.inputs[0].drain() 124 | 125 | def input_a_frontier(self): 126 | return self.input_frontiers[0] 127 | 128 | def set_input_a_frontier(self, frontier): 129 | self.input_frontiers[0] = frontier 130 | 131 | def input_b_messages(self): 132 | return self.inputs[1].drain() 133 | 134 | def input_b_frontier(self): 135 | return self.input_frontiers[1] 136 | 137 | def set_input_b_frontier(self, frontier): 138 | self.input_frontiers[1] = frontier 139 | 140 | 141 | class Graph: 142 | """An implementation of a dataflow graph. 143 | 144 | This implementation needs to keep the entire set of nodes so that they 145 | may be run, and only keeps a set of read handles to all edges for debugging 146 | purposes. Calling this a graph instead of a 'bag of nodes' is misleading, because 147 | this object does not actually know anything about the connections between the 148 | various nodes. 
149 | """ 150 | 151 | def __init__(self, streams, operators): 152 | self.streams = streams 153 | self.operators = operators 154 | 155 | def step(self): 156 | for op in self.operators: 157 | op.run() 158 | -------------------------------------------------------------------------------- /v4/graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow program.""" 2 | 3 | from collections import deque 4 | from enum import Enum 5 | 6 | 7 | class MessageType(Enum): 8 | DATA = 1 9 | FRONTIER = 2 10 | 11 | 12 | class DifferenceStreamReader: 13 | """A read handle to a dataflow edge that receives data and frontier updates from a writer. 14 | 15 | The data received over this edge are pairs of (version, Collection) and the frontier 16 | updates are either integers (in the one dimensional case) or Antichains (in the general 17 | case). 18 | """ 19 | 20 | def __init__(self, queue): 21 | self._queue = queue 22 | 23 | def drain(self): 24 | out = [] 25 | while len(self._queue) > 0: 26 | out.append(self._queue.pop()) 27 | 28 | return out 29 | 30 | def is_empty(self): 31 | return len(self._queue) == 0 32 | 33 | def probe_frontier_less_than(self, frontier): 34 | for (typ, msg) in self._queue: 35 | if typ == MessageType.FRONTIER: 36 | received_frontier = msg 37 | if received_frontier >= frontier: 38 | return False 39 | return True 40 | 41 | 42 | class DifferenceStreamWriter: 43 | """A write handle to a dataflow edge that is allowed to publish data and send 44 | frontier updates. 45 | """ 46 | 47 | def __init__(self): 48 | self._queues = [] 49 | self.frontier = None 50 | 51 | def send_data(self, version, collection): 52 | if self.frontier is not None and self.frontier > version: 53 | print(f"frontier {self.frontier}, version: {version}") 54 | assert self.frontier is None or self.frontier <= version 55 | for q in self._queues: 56 | q.appendleft((MessageType.DATA, (version, collection))) 57 | 58 | def send_frontier(self, frontier): 59 | assert self.frontier is None or self.frontier <= frontier 60 | 61 | self.frontier = frontier 62 | for q in self._queues: 63 | q.appendleft((MessageType.FRONTIER, frontier)) 64 | 65 | def _new_reader(self): 66 | q = deque() 67 | self._queues.append(q) 68 | return DifferenceStreamReader(q) 69 | 70 | 71 | class Operator: 72 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 73 | one outgoing edge (write handle). 74 | """ 75 | 76 | def __init__(self, inputs, output, f, initial_frontier): 77 | self.inputs = inputs 78 | self.output = output 79 | self.f = f 80 | self.pending_work = False 81 | self.input_frontiers = [initial_frontier for _ in self.inputs] 82 | self.output_frontier = initial_frontier 83 | 84 | def run(self): 85 | self.f() 86 | 87 | def pending_work(self): 88 | if self.pending_work is True: 89 | return True 90 | for input_listener in self.inputs: 91 | if input_listener.is_empty() is False: 92 | return True 93 | return False 94 | 95 | def frontiers(self): 96 | return (self.input_frontiers, self.output_frontier) 97 | 98 | 99 | class UnaryOperator(Operator): 100 | """A convenience implementation of a dataflow operator that has a handle to one 101 | incoming stream of data, and one handle to an outgoing stream of data. 
102 | """ 103 | 104 | def __init__(self, input_a, output, f, initial_frontier): 105 | super().__init__([input_a], output, f, initial_frontier) 106 | 107 | def input_messages(self): 108 | return self.inputs[0].drain() 109 | 110 | def input_frontier(self): 111 | return self.input_frontiers[0] 112 | 113 | def set_input_frontier(self, frontier): 114 | self.input_frontiers[0] = frontier 115 | 116 | 117 | class BinaryOperator(Operator): 118 | """A convenience implementation of a dataflow operator that has a handle to two 119 | incoming streams of data, and one handle to an outgoing stream of data. 120 | """ 121 | 122 | def __init__(self, input_a, input_b, output, f, initial_frontier): 123 | super().__init__([input_a, input_b], output, f, initial_frontier) 124 | 125 | def input_a_messages(self): 126 | return self.inputs[0].drain() 127 | 128 | def input_a_frontier(self): 129 | return self.input_frontiers[0] 130 | 131 | def set_input_a_frontier(self, frontier): 132 | self.input_frontiers[0] = frontier 133 | 134 | def input_b_messages(self): 135 | return self.inputs[1].drain() 136 | 137 | def input_b_frontier(self): 138 | return self.input_frontiers[1] 139 | 140 | def set_input_b_frontier(self, frontier): 141 | self.input_frontiers[1] = frontier 142 | 143 | 144 | class Graph: 145 | """An implementation of a dataflow graph. 146 | 147 | This implementation needs to keep the entire set of nodes so that they 148 | may be run, and only keeps a set of read handles to all edges for debugging 149 | purposes. Calling this a graph instead of a 'bag of nodes' is misleading, because 150 | this object does not actually know anything about the connections between the 151 | various nodes. 152 | """ 153 | 154 | def __init__(self, streams, operators): 155 | self.streams = streams 156 | self.operators = operators 157 | 158 | def step(self): 159 | for op in self.operators: 160 | op.run() 161 | -------------------------------------------------------------------------------- /graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow program.""" 2 | 3 | from collections import deque 4 | from enum import Enum 5 | 6 | 7 | class MessageType(Enum): 8 | DATA = 1 9 | FRONTIER = 2 10 | 11 | 12 | class DifferenceStreamReader: 13 | """A read handle to a dataflow edge that receives data and frontier updates from a writer. 14 | 15 | The data received over this edge are pairs of (version, Collection) and the frontier 16 | updates are either integers (in the one dimensional case) or Antichains (in the general 17 | case). 18 | """ 19 | 20 | def __init__(self, queue): 21 | self._queue = queue 22 | 23 | def drain(self): 24 | out = [] 25 | while len(self._queue) > 0: 26 | out.append(self._queue.pop()) 27 | 28 | return out 29 | 30 | def is_empty(self): 31 | return len(self._queue) == 0 32 | 33 | def probe_frontier_less_than(self, frontier): 34 | for (typ, msg) in self._queue: 35 | if typ == MessageType.FRONTIER: 36 | received_frontier = msg 37 | if frontier.less_equal(received_frontier): 38 | return False 39 | return True 40 | 41 | 42 | class DifferenceStreamWriter: 43 | """A write handle to a dataflow edge that is allowed to publish data and send 44 | frontier updates. 
45 | """ 46 | 47 | def __init__(self): 48 | self._queues = [] 49 | self.frontier = None 50 | 51 | def send_data(self, version, collection): 52 | if isinstance(version, int): 53 | assert self.frontier is None or self.frontier <= version 54 | else: 55 | assert self.frontier is None or self.frontier.less_equal_version(version) 56 | for q in self._queues: 57 | q.appendleft((MessageType.DATA, (version, collection))) 58 | 59 | def send_frontier(self, frontier): 60 | if isinstance(frontier, int): 61 | assert self.frontier is None or self.frontier <= frontier 62 | else: 63 | assert self.frontier is None or self.frontier.less_equal(frontier) 64 | 65 | self.frontier = frontier 66 | for q in self._queues: 67 | q.appendleft((MessageType.FRONTIER, frontier)) 68 | 69 | def _new_reader(self): 70 | q = deque() 71 | self._queues.append(q) 72 | return DifferenceStreamReader(q) 73 | 74 | 75 | class Operator: 76 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 77 | one outgoing edge (write handle). 78 | """ 79 | 80 | def __init__(self, inputs, output, f, initial_frontier): 81 | self.inputs = inputs 82 | self.output = output 83 | self.f = f 84 | self.pending_work = False 85 | self.input_frontiers = [initial_frontier for _ in self.inputs] 86 | self.output_frontier = initial_frontier 87 | 88 | def run(self): 89 | self.f() 90 | 91 | def pending_work(self): 92 | if self.pending_work is True: 93 | return True 94 | for input_listener in self.inputs: 95 | if input_listener.is_empty() is False: 96 | return True 97 | return False 98 | 99 | def frontiers(self): 100 | return (self.input_frontiers, self.output_frontier) 101 | 102 | 103 | class UnaryOperator(Operator): 104 | """A convenience implementation of a dataflow operator that has a handle to one 105 | incoming stream of data, and one handle to an outgoing stream of data. 106 | """ 107 | 108 | def __init__(self, input_a, output, f, initial_frontier): 109 | super().__init__([input_a], output, f, initial_frontier) 110 | 111 | def input_messages(self): 112 | return self.inputs[0].drain() 113 | 114 | def input_frontier(self): 115 | return self.input_frontiers[0] 116 | 117 | def set_input_frontier(self, frontier): 118 | self.input_frontiers[0] = frontier 119 | 120 | 121 | class BinaryOperator(Operator): 122 | """A convenience implementation of a dataflow operator that has a handle to two 123 | incoming streams of data, and one handle to an outgoing stream of data. 124 | """ 125 | 126 | def __init__(self, input_a, input_b, output, f, initial_frontier): 127 | super().__init__([input_a, input_b], output, f, initial_frontier) 128 | 129 | def input_a_messages(self): 130 | return self.inputs[0].drain() 131 | 132 | def input_a_frontier(self): 133 | return self.input_frontiers[0] 134 | 135 | def set_input_a_frontier(self, frontier): 136 | self.input_frontiers[0] = frontier 137 | 138 | def input_b_messages(self): 139 | return self.inputs[1].drain() 140 | 141 | def input_b_frontier(self): 142 | return self.input_frontiers[1] 143 | 144 | def set_input_b_frontier(self, frontier): 145 | self.input_frontiers[1] = frontier 146 | 147 | 148 | class Graph: 149 | """An implementation of a dataflow graph. 150 | 151 | This implementation needs to keep the entire set of nodes so that they 152 | may be run, and only keeps a set of read handles to all edges for debugging 153 | purposes. 
Calling this a graph instead of a 'bag of nodes' is misleading, because 154 | this object does not actually know anything about the connections between the 155 | various nodes. 156 | """ 157 | 158 | def __init__(self, streams, operators): 159 | self.streams = streams 160 | self.operators = operators 161 | 162 | def step(self): 163 | for op in self.operators: 164 | op.run() 165 | -------------------------------------------------------------------------------- /order.py: -------------------------------------------------------------------------------- 1 | """The implementation of partially ordered versions (times) for use within a differential dataflow. 2 | """ 3 | 4 | 5 | class Version: 6 | """A partially, or totally ordered version (time), consisting of a tuple of 7 | integers. 8 | 9 | All versions within a scope of a dataflow must have the same dimension/number 10 | of coordinates. One dimensional versions are totally ordered. Multidimensional 11 | versions are partially ordered by the product partial order. 12 | """ 13 | 14 | def __init__(self, version): 15 | if isinstance(version, int): 16 | assert version >= 0 17 | self.inner = (version,) 18 | elif isinstance(version, list) or isinstance(version, tuple): 19 | for i in version: 20 | assert isinstance(i, int) 21 | assert i >= 0 22 | self.inner = tuple(version) 23 | else: 24 | assert 0 > 1 25 | 26 | def __repr__(self): 27 | return f"Version({self.inner})" 28 | 29 | def __eq__(self, other): 30 | return self.inner == other.inner 31 | 32 | # The less than implementation used to sort versions must respect the partial 33 | # order (important for reduce). 34 | def __lt__(self, other): 35 | return self.inner.__lt__(other.inner) 36 | 37 | def __hash__(self): 38 | return hash(self.inner) 39 | 40 | def _validate(self, other): 41 | assert len(self.inner) > 0 42 | assert len(self.inner) == len(other.inner) 43 | 44 | def less_equal(self, other): 45 | self._validate(other) 46 | 47 | for (i1, i2) in zip(self.inner, other.inner): 48 | if i1 > i2: 49 | return False 50 | return True 51 | 52 | def less_than(self, other): 53 | if self.less_equal(other) is True and self.inner != other.inner: 54 | return True 55 | return False 56 | 57 | def join(self, other): 58 | self._validate(other) 59 | out = [] 60 | 61 | for (i1, i2) in zip(self.inner, other.inner): 62 | out.append(max(i1, i2)) 63 | return Version(out) 64 | 65 | def meet(self, other): 66 | self._validate(other) 67 | out = [] 68 | 69 | for (i1, i2) in zip(self.inner, other.inner): 70 | out.append(min(i1, i2)) 71 | return Version(out) 72 | 73 | # TODO the proof for this is in the sharing arrangements paper. 74 | def advance_by(self, frontier): 75 | if frontier.inner == (): 76 | return self 77 | result = self.join(frontier.inner[0]) 78 | for elem in frontier.inner: 79 | result = result.meet(self.join(elem)) 80 | return result 81 | 82 | def extend(self): 83 | elements = [e for e in self.inner] 84 | elements.append(0) 85 | return Version(elements) 86 | 87 | def truncate(self): 88 | elements = [e for e in self.inner] 89 | elements.pop() 90 | return Version(elements) 91 | 92 | def apply_step(self, step): 93 | assert step > 0 94 | elements = [e for e in self.inner] 95 | elements[-1] += step 96 | return Version(elements) 97 | 98 | 99 | # This keeps the min antichain. 100 | # I fully stole this from frank. 
TODO: Understand this better 101 | class Antichain: 102 | """A minimal set of incomparable versions.""" 103 | 104 | def __init__(self, elements): 105 | self.inner = [] 106 | for element in elements: 107 | self._insert(element) 108 | 109 | def __repr__(self): 110 | return f"Antichain({self.inner})" 111 | 112 | def _insert(self, element): 113 | for e in self.inner: 114 | if e.less_equal(element): 115 | return 116 | self.inner = [x for x in self.inner if element.less_equal(x) is not True] 117 | self.inner.append(element) 118 | 119 | # TODO: is it true that the set of versions <= meet(x, y) is the intersection of the set of versions <= x and the set of versions <= y? 120 | def meet(self, other): 121 | out = Antichain([]) 122 | for element in self.inner: 123 | out._insert(element) 124 | for element in other.inner: 125 | out._insert(element) 126 | 127 | return out 128 | 129 | def _equals(self, other): 130 | elements_1 = [x for x in self.inner] 131 | elements_2 = [y for y in other.inner] 132 | 133 | if len(elements_1) != len(elements_2): 134 | return False 135 | elements_1.sort() 136 | elements_2.sort() 137 | 138 | for (x, y) in zip(elements_1, elements_2): 139 | if x != y: 140 | return False 141 | return True 142 | 143 | # Returns true if other dominates self 144 | # in other words self < other means 145 | # self <= other AND self != other 146 | def less_than(self, other): 147 | if self.less_equal(other) is not True: 148 | return False 149 | 150 | if self._equals(other): 151 | return False 152 | 153 | return True 154 | 155 | def less_equal(self, other): 156 | for o in other.inner: 157 | less_equal = False 158 | for s in self.inner: 159 | if s.less_equal(o): 160 | less_equal = True 161 | if less_equal == False: 162 | return False 163 | return True 164 | 165 | def less_equal_version(self, version): 166 | for elem in self.inner: 167 | if elem.less_equal(version): 168 | return True 169 | return False 170 | 171 | def extend(self): 172 | out = Antichain([]) 173 | for elem in self.inner: 174 | out._insert(elem.extend()) 175 | return out 176 | 177 | def truncate(self): 178 | out = Antichain([]) 179 | for elem in self.inner: 180 | out._insert(elem.truncate()) 181 | return out 182 | 183 | def apply_step(self, step): 184 | out = Antichain([]) 185 | for elem in self.inner: 186 | out._insert(elem.apply_step(step)) 187 | return out 188 | 189 | def _elements(self): 190 | return [x for x in self.inner] 191 | 192 | 193 | if __name__ == "__main__": 194 | 195 | v0_0 = Version([0, 0]) 196 | v1_0 = Version([1, 0]) 197 | v0_1 = Version([0, 1]) 198 | v1_1 = Version([1, 1]) 199 | v2_0 = Version([2, 0]) 200 | 201 | assert v0_0.less_than(v1_0) 202 | assert v0_0.less_than(v0_1) 203 | assert v0_0.less_than(v1_1) 204 | assert v0_0.less_equal(v1_0) 205 | assert v0_0.less_equal(v0_1) 206 | assert v0_0.less_equal(v1_1) 207 | 208 | assert v1_0.less_than(v1_0) is not True 209 | assert v1_0.less_equal(v1_0) 210 | assert v1_0.less_equal(v0_1) is not True 211 | assert v0_1.less_equal(v1_0) is not True 212 | assert v0_1.less_equal(v1_1) 213 | assert v1_0.less_equal(v1_1) 214 | assert v0_0.less_equal(v1_1) 215 | 216 | assert Antichain([v0_0]).less_equal(Antichain([v1_0])) 217 | assert Antichain([v0_0])._equals(Antichain([v1_0])) is not True 218 | assert Antichain([v0_0]).less_than(Antichain([v1_0])) 219 | assert Antichain([v2_0, v1_1]).less_than(Antichain([v2_0])) 220 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Differential Dataflow, in Python.
2 | 
3 | WIP.
4 | 
5 | This is an implementation of Differential Dataflow in Python that is meant as a learning
6 | tool. This implementation is not meant to be high performance - for that please go to the
7 | Rust implementation.
8 | 
9 | Simple explanation of what this code does: users get to define their computations as a composition
10 | of functional operators like map/filter/join/count/sum/etc. These computations can even have recursion.
11 | They can then send inputs to those computations and get answers back quickly and efficiently. They can keep
12 | sending new inputs, and changing the inputs in arbitrary ways, and keep getting new answers back quickly
13 | and efficiently, regardless of the computation they defined.
14 | 
15 | Small terminology note: I started using version instead of time/timestamp, and multiplicity instead of diff, throughout
16 | the code, so I will use those names here as well.
17 | 
18 | The code includes several preliminary implementations that build on concepts introduced in earlier ones, to make things easier to understand. These preliminary implementations are in the directories `v0` - `v4`. Each directory is entirely self-contained; however, a lot of the components in a given implementation are duplicated from prior ones. There are five preliminary implementations and one final one:
19 | - `v0`: defines a collection (multiset) of data and implements the various operations (join/reduce/map/etc) over
20 | a single collection. This is roughly the starting point for "what are we even trying to do?".
21 | - `v1`: defines a finite, totally ordered sequence of difference collections, to describe a location that changes. `v1` also
22 | implements the various operations over such difference collection sequences efficiently. Compared to `v0`, the main change in `v1` is that
23 | we need to use indexes to efficiently compute reductions and joins when only a small subset of keys change from one collection
24 | version to the next.
25 | - `v2`: extends the approach in `v1` to support an unbounded number of difference collections. Now, we have to explicitly construct a dataflow
26 | graph, rather than relying on the implicit graph induced by function calls. All of the data travels through dataflow edges (basically queues),
27 | to operators/nodes (basically a struct that does some `work_function` to effect a computation and holds onto state across invocations of its `work_fn`).
28 | Each operator still has to output data in order, and binary operators always need to wait for both inputs to become available before they can produce an output. Roughly, this is an approach to implementing something like differential while rejecting a lot of the timely paper.
29 | - `v3`: extends `v2` to explicitly attach a version (time) label to all messages. Operators also now receive a message when a given version/range of versions will no longer receive any more data. Versions are constrained to be integers.
30 | - `v4`: extends `v3` to allow versions to be tuples of integers that are totally ordered with the lexicographic order. This implementation is the first that supports `iterate` with changing data, but the user has to specify a cap on the number of iterations.
31 | TODO: I'm not sure that the cap on the number of iterations is strictly necessary
32 | - `final`/the toplevel of this directory: extends `v4` to support versions that are partially ordered with the product partial order. The
33 | versions still have to be tuples of integers.
34 | 
35 | This implementation is different from other implementations (to the best of my knowledge) in that it doesn't
36 | rely on a scheduler reasoning about the structure of the computation graph and scheduling operators intelligently
37 | to guarantee progress / eventual termination.
38 | 
39 | Instead, the implementation provides the following guarantees:
40 | 
41 | 1. After sending a finite number of collections and advancing the frontiers of all inputs to the dataflow graph past a finite set of
42 | versions, the output should, after a finite number of calls to `graph.step()`, see the correct outputs at those versions and also close
43 | those versions.
44 | 
45 | 2. Eventually, after all inputs have ceased sending new data or advancing frontiers, all nodes in the dataflow graph should stop producing
46 | either new data or new frontier updates iff the dataflow graph does not contain any non-convergent iterative computations.
47 | 
48 | My understanding is that for acyclic dataflow graphs these properties can be satisfied by:
49 | 
50 | A. For any set of inputs, all operators are guaranteed to produce their individual expected outputs after a finite number of executions.
51 | So, for example, `reduce` can only produce outputs at versions that are closed, so if no versions are closed, it is to be expected that `reduce`
52 | will not produce any outputs. But once a version is closed, it should produce an output for that version, and potentially others, after a finite
53 | number of executions.
54 | 
55 | B. All dataflow operators will only ever produce a finite number of output messages (new collections of data / frontier updates) in response
56 | to any one input message (input collections of data / frontier updates).
57 | 
58 | (I'm not claiming to have proved these properties, and indeed I am not even totally sure how to.)
59 | 
60 | For cyclic dataflow graphs, the situation is complicated by the existence of a feedback operator that sends messages in a cycle
61 | to another operator, but with their versions incremented.
62 | 
63 | ```
64 | def example(collection):
65 |     return (
66 |         collection.map(lambda data: data + 1)
67 |         .map(lambda data: data - 1)
68 |         .negate()
69 |         .concat(collection)
70 |         .consolidate()  # This step is mandatory for termination.
71 |     )
72 | 
73 | output = input_a.iterate(example).debug("iterate")
74 | graph = graph_builder.finalize()
75 | 
76 | input_a_writer.send_data(Version(0), Collection([(1, 1)]))
77 | input_a_writer.send_frontier(Antichain([Version(1)]))
78 | 
79 | for i in range(0, 10):
80 |     graph.step()
81 | ```
82 | 
83 | Take the simple example above. Here, every step of the iteration takes the
84 | input and applies two consecutive map operators, which are collectively a no-op,
85 | and then negates the result and concatenates it with the original input. Every input therefore
86 | produces the empty collection, and this loop should reach fixedpoint in two iterations (two, not one, because of how `iterate` works and needs to subtract the top-level input on the second iteration).
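
Concretely, applying the loop body once to the input `[(1, 1)]` gives: the two `map`s leave it as `[(1, 1)]`, `negate` turns that into `[(1, -1)]`, `concat` with the original input yields `[(1, -1), (1, 1)]`, and `consolidate` cancels the two records, leaving the empty collection.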
87 | 88 | However, if you remove the `consolidate`, which waits to produce data at a given 89 | version until all inputs have provided all of the data at that version and updated 90 | their frontiers, then there are some operator execution orderings for which this loop will continue circulating non-empty differences and never terminate. This is also 91 | a concern in the Rust implementation, which likewise requires that all paths from iterative subgraph input to output have a consolidation step 92 | that makes sure all differences at a given version meet up and get cancelled out 93 | (TODO: LINK). 94 | 95 | There's a second concern: once fixedpoint has been reached (say at `Version(0, 1)` in the example above), we know we are done with the computation for the top-level `Version(0)`. 96 | 97 | We don't then want frontier updates like: 98 | 99 | ``` 100 | Antichain([Version(1, 0), Version(0, 2)]) 101 | Antichain([Version(1, 0), Version(0, 3)]) 102 | Antichain([Version(1, 0), Version(0, 4)]) 103 | ... 104 | ``` 105 | to keep circulating through the iteration subgraph. We'd like instead for one of 106 | the operators to realize "hey, we are done with `Version(0, *)` so we can drop 107 | that from the frontier". This code assigns the feedback operator to this task, 108 | and allows it to drop antichain elements for upper-level times that have reached 109 | fixedpoint. 110 | 111 | TODO: I want to understand a bit better how timely handles this. 112 | TODO: The code for handling this in the feedback operator is not very nice. Ideally, we would be able to express this operation in a more mathematical way. Perhaps capabilities are a more reasonable interface for this? 113 | -------------------------------------------------------------------------------- /v0/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair.
50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def join(self, other): 63 | """Match pairs (k, v1) and (k, v2) from the two input collections and produce (k, (v1, v2)).""" 64 | out = [] 65 | for ((k1, v1), d1) in self._inner: 66 | for ((k2, v2), d2) in other._inner: 67 | if k1 == k2: 68 | out.append(((k1, (v1, v2)), d1 * d2)) 69 | return Collection(out) 70 | 71 | def reduce(self, f): 72 | """Apply a reduction function to all record values, grouped by key.""" 73 | keys = defaultdict(list) 74 | out = [] 75 | for ((key, val), multiplicity) in self._inner: 76 | keys[key].append((val, multiplicity)) 77 | for (key, vals) in keys.items(): 78 | results = f(vals) 79 | for (val, multiplicity) in results: 80 | out.append(((key, val), multiplicity)) 81 | return Collection(out) 82 | 83 | def count(self): 84 | """Count the number of times each key occurs in the collection.""" 85 | 86 | def count_inner(vals): 87 | out = 0 88 | for (_, multiplicity) in vals: 89 | out += multiplicity 90 | return [(out, 1)] 91 | 92 | return self.reduce(count_inner) 93 | 94 | def sum(self): 95 | """Produce the sum of all the values paired with a key, for all keys in the collection.""" 96 | 97 | def sum_inner(vals): 98 | out = 0 99 | for (val, multiplicity) in vals: 100 | out += val * multiplicity 101 | return [(out, 1)] 102 | 103 | return self.reduce(sum_inner) 104 | 105 | def min(self): 106 | """Produce the minimum value associated with each key in the collection. 107 | 108 | Note that no record may have negative multiplicity when computing the min, 109 | as it is unclear what exactly the minimum record is in that case. 110 | """ 111 | 112 | def min_inner(vals): 113 | consolidated = defaultdict(int) 114 | for (val, multiplicity) in vals: 115 | consolidated[val] += multiplicity 116 | vals = [ 117 | (val, multiplicity) 118 | for (val, multiplicity) in consolidated.items() 119 | if multiplicity != 0 120 | ] 121 | if len(vals) != 0: 122 | out = vals[0][0] 123 | for (val, multiplicity) in vals: 124 | assert multiplicity > 0 125 | if val < out: 126 | out = val 127 | return [(out, 1)] 128 | else: 129 | return [] 130 | 131 | return self.reduce(min_inner) 132 | 133 | def max(self): 134 | """Produce the maximum value associated with each key in the collection. 135 | 136 | Note that no record may have negative multiplicity when computing the max, 137 | as it is unclear what exactly the maximum record is in that case. 138 | """ 139 | 140 | def max_inner(vals): 141 | consolidated = defaultdict(int) 142 | for (val, multiplicity) in vals: 143 | consolidated[val] += multiplicity 144 | vals = [ 145 | (val, multiplicity) 146 | for (val, multiplicity) in consolidated.items() 147 | if multiplicity != 0 148 | ] 149 | if len(vals) != 0: 150 | out = vals[0][0] 151 | for (val, multiplicity) in vals: 152 | assert multiplicity > 0 153 | if val > out: 154 | out = val 155 | return [(out, 1)] 156 | else: 157 | return [] 158 | 159 | return self.reduce(max_inner) 160 | 161 | def distinct(self): 162 | """Reduce the collection to a set of elements (from a multiset). 
163 | 164 | Note that no record may have negative multiplicity when producing this set, 165 | as elements of sets may only have multiplicity one, and it is unclear that is 166 | an appropriate output for elements with negative multiplicity. 167 | """ 168 | 169 | def distinct_inner(vals): 170 | consolidated = defaultdict(int) 171 | for (val, multiplicity) in vals: 172 | consolidated[val] += multiplicity 173 | vals = [ 174 | (val, multiplicity) 175 | for (val, multiplicity) in consolidated.items() 176 | if multiplicity != 0 177 | ] 178 | for (val, multiplicity) in vals: 179 | assert multiplicity > 0 180 | return [(val, 1) for (val, _) in vals] 181 | 182 | return self.reduce(distinct_inner) 183 | 184 | def iterate(self, f): 185 | """Repeatedly invoke a function f on a collection, and return the result 186 | of applying the function an infinite number of times (fixedpoint). 187 | 188 | Note that if the function does not converge to a fixedpoint this implementation 189 | will run forever. 190 | """ 191 | curr = Collection(self._inner) 192 | while True: 193 | result = f(curr) 194 | if result._inner == curr._inner: 195 | break 196 | curr = result 197 | return curr 198 | 199 | 200 | if __name__ == "__main__": 201 | a = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1)]) 202 | b = Collection( 203 | [ 204 | (("apple", "$3"), 1), 205 | (("apple", ("granny smith", "$2")), 1), 206 | (("kiwi", "$2"), 1), 207 | ] 208 | ) 209 | c = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1), (("apple", "$2"), 20)]) 210 | d = Collection( 211 | [(("apple", 11), 1), (("apple", 3), 2), (("banana", 2), 3), (("coconut", 3), 1)] 212 | ) 213 | e = Collection([(1, 1)]) 214 | 215 | print(a.concat(b)) 216 | print(a.join(b)) 217 | print(b.join(a)) 218 | print(a.filter(lambda data: data[0] != "apple")) 219 | print(a.map(lambda data: (data[1], data[0]))) 220 | print(a.concat(b).count()) 221 | print(a.concat(b).distinct()) 222 | print(c.min()) 223 | print(c.max()) 224 | print(d.sum()) 225 | 226 | def add_one(collection): 227 | return ( 228 | collection.map(lambda data: data + 1) 229 | .concat(collection) 230 | .filter(lambda data: data <= 5) 231 | .map(lambda data: (data, ())) 232 | .distinct() 233 | .map(lambda data: data[0]) 234 | .consolidate() 235 | ) 236 | 237 | result = e.iterate(add_one).map(lambda data: (data, data * data)) 238 | print(result) 239 | -------------------------------------------------------------------------------- /collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 
2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair. 50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def join(self, other): 63 | """Match pairs (k, v1) and (k, v2) from the two input collections and produce (k, (v1, v2)).""" 64 | out = [] 65 | for ((k1, v1), d1) in self._inner: 66 | for ((k2, v2), d2) in other._inner: 67 | if k1 == k2: 68 | out.append(((k1, (v1, v2)), d1 * d2)) 69 | return Collection(out) 70 | 71 | def reduce(self, f): 72 | """Apply a reduction function to all record values, grouped by key.""" 73 | keys = defaultdict(list) 74 | out = [] 75 | for ((key, val), multiplicity) in self._inner: 76 | keys[key].append((val, multiplicity)) 77 | for (key, vals) in keys.items(): 78 | results = f(vals) 79 | for (val, multiplicity) in results: 80 | out.append(((key, val), multiplicity)) 81 | return Collection(out) 82 | 83 | def count(self): 84 | """Count the number of times each key occurs in the collection.""" 85 | 86 | def count_inner(vals): 87 | out = 0 88 | for (_, multiplicity) in vals: 89 | out += multiplicity 90 | return [(out, 1)] 91 | 92 | return self.reduce(count_inner) 93 | 94 | def sum(self): 95 | """Produce the sum of all the values paired with a key, for all keys in the collection.""" 96 | 97 | def sum_inner(vals): 98 | out = 0 99 | for (val, multiplicity) in vals: 100 | out += val * multiplicity 101 | return [(out, 1)] 102 | 103 | return self.reduce(sum_inner) 104 | 105 | def min(self): 106 | """Produce the minimum value associated with each key in the collection. 107 | 108 | Note that no record may have negative multiplicity when computing the min, 109 | as it is unclear what exactly the minimum record is in that case. 
110 | """ 111 | 112 | def min_inner(vals): 113 | consolidated = defaultdict(int) 114 | for (val, multiplicity) in vals: 115 | consolidated[val] += multiplicity 116 | vals = [ 117 | (val, multiplicity) 118 | for (val, multiplicity) in consolidated.items() 119 | if multiplicity != 0 120 | ] 121 | if len(vals) != 0: 122 | out = vals[0][0] 123 | for (val, multiplicity) in vals: 124 | assert multiplicity > 0 125 | if val < out: 126 | out = val 127 | return [(out, 1)] 128 | else: 129 | return [] 130 | 131 | return self.reduce(min_inner) 132 | 133 | def max(self): 134 | """Produce the maximum value associated with each key in the collection. 135 | 136 | Note that no record may have negative multiplicity when computing the max, 137 | as it is unclear what exactly the maximum record is in that case. 138 | """ 139 | 140 | def max_inner(vals): 141 | consolidated = defaultdict(int) 142 | for (val, multiplicity) in vals: 143 | consolidated[val] += multiplicity 144 | vals = [ 145 | (val, multiplicity) 146 | for (val, multiplicity) in consolidated.items() 147 | if multiplicity != 0 148 | ] 149 | if len(vals) != 0: 150 | out = vals[0][0] 151 | for (val, multiplicity) in vals: 152 | assert multiplicity > 0 153 | if val > out: 154 | out = val 155 | return [(out, 1)] 156 | else: 157 | return [] 158 | 159 | return self.reduce(max_inner) 160 | 161 | def distinct(self): 162 | """Reduce the collection to a set of elements (from a multiset). 163 | 164 | Note that no record may have negative multiplicity when producing this set, 165 | as elements of sets may only have multiplicity one, and it is unclear that is 166 | an appropriate output for elements with negative multiplicity. 167 | """ 168 | 169 | def distinct_inner(vals): 170 | consolidated = defaultdict(int) 171 | for (val, multiplicity) in vals: 172 | consolidated[val] += multiplicity 173 | vals = [ 174 | (val, multiplicity) 175 | for (val, multiplicity) in consolidated.items() 176 | if multiplicity != 0 177 | ] 178 | for (val, multiplicity) in vals: 179 | assert multiplicity > 0 180 | return [(val, 1) for (val, _) in vals] 181 | 182 | return self.reduce(distinct_inner) 183 | 184 | def iterate(self, f): 185 | """Repeatedly invoke a function f on a collection, and return the result 186 | of applying the function an infinite number of times (fixedpoint). 187 | 188 | Note that if the function does not converge to a fixedpoint this implementation 189 | will run forever. 
190 | """ 191 | curr = Collection(self._inner) 192 | while True: 193 | result = f(curr) 194 | if result._inner == curr._inner: 195 | break 196 | curr = result 197 | return curr 198 | 199 | def _extend(self, other): 200 | self._inner.extend(other._inner) 201 | 202 | 203 | if __name__ == "__main__": 204 | a = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1)]) 205 | b = Collection( 206 | [ 207 | (("apple", "$3"), 1), 208 | (("apple", ("granny smith", "$2")), 1), 209 | (("kiwi", "$2"), 1), 210 | ] 211 | ) 212 | c = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1), (("apple", "$2"), 20)]) 213 | d = Collection( 214 | [(("apple", 11), 1), (("apple", 3), 2), (("banana", 2), 3), (("coconut", 3), 1)] 215 | ) 216 | e = Collection([(1, 1)]) 217 | 218 | print(a.concat(b)) 219 | print(a.join(b)) 220 | print(b.join(a)) 221 | print(a.filter(lambda data: data[0] != "apple")) 222 | print(a.map(lambda data: (data[1], data[0]))) 223 | print(a.concat(b).count()) 224 | print(a.concat(b).distinct()) 225 | print(c.min()) 226 | print(c.max()) 227 | print(d.sum()) 228 | 229 | def add_one(collection): 230 | return ( 231 | collection.map(lambda data: data + 1) 232 | .concat(collection) 233 | .filter(lambda data: data <= 5) 234 | .map(lambda data: (data, ())) 235 | .distinct() 236 | .map(lambda data: data[0]) 237 | .consolidate() 238 | ) 239 | 240 | result = e.iterate(add_one).map(lambda data: (data, data * data)) 241 | print(result) 242 | -------------------------------------------------------------------------------- /v1/difference_sequence.py: -------------------------------------------------------------------------------- 1 | """The implementation of a collection that changes as a sequence of difference 2 | collections describing each change. 3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | from index import Index 8 | from itertools import zip_longest 9 | 10 | 11 | class DifferenceSequence: 12 | """A collection that goes through a sequence of changes. 13 | 14 | Each change to the collection is described in a difference collection that 15 | describes the change between the current version of the collection and the 16 | previous version. 17 | 18 | This representation is designed for the case where the differences between 19 | consecutive versions in the sequence are small, and so storing the 20 | sequence of differences is both space efficient, and enables efficient 21 | computation of the sequence of output differences. 22 | """ 23 | 24 | def __init__(self, trace): 25 | self._inner = trace 26 | 27 | def __repr__(self): 28 | return f"DifferenceSequence({self._inner})" 29 | 30 | def map(self, f): 31 | """Apply a function to all records in the collection trace.""" 32 | return DifferenceSequence([collection.map(f) for collection in self._inner]) 33 | 34 | def filter(self, f): 35 | """Filter out records where f(record) evaluates to False from all 36 | collections in the collection trace. 37 | """ 38 | return DifferenceSequence([collection.filter(f) for collection in self._inner]) 39 | 40 | def negate(self): 41 | return DifferenceSequence([collection.negate() for collection in self._inner]) 42 | 43 | def concat(self, other): 44 | """Concatenate two collection traces together.""" 45 | inputs = zip_longest(self._inner, other._inner, fillvalue=Collection()) 46 | return DifferenceSequence([a.concat(b) for (a, b) in inputs]) 47 | 48 | def consolidate(self): 49 | """Produce a collection trace where each collection in the trace 50 | is consolidated. 
51 | """ 52 | out = [] 53 | for collection in self._inner: 54 | out.append(collection.consolidate()) 55 | 56 | return DifferenceSequence(out) 57 | 58 | def join(self, other): 59 | """Match pairs (k, v1) and (k, v2) from the two input collection 60 | traces and produce a collection trace containing the corresponding 61 | (k, (v1, v2)). 62 | """ 63 | index_a = Index() 64 | index_b = Index() 65 | out = [] 66 | 67 | for (collection_a, collection_b) in zip_longest( 68 | self._inner, other._inner, fillvalue=Collection() 69 | ): 70 | delta_a = Index() 71 | delta_b = Index() 72 | result = Collection() 73 | 74 | for ((key, value), multiplicity) in collection_a._inner: 75 | delta_a.add_value(key, (value, multiplicity)) 76 | for ((key, value), multiplicity) in collection_b._inner: 77 | delta_b.add_value(key, (value, multiplicity)) 78 | 79 | result._extend(delta_a.join(index_b)) 80 | index_a.append(delta_a) 81 | result._extend(index_a.join(delta_b)) 82 | index_b.append(delta_b) 83 | # Consolidating the output is not strictly necessary and is only done here to make the output easier to inspect visually. 84 | out.append(result.consolidate()) 85 | return DifferenceSequence(out) 86 | 87 | def reduce(self, f): 88 | """Apply a reduction function to all record values, grouped by key.""" 89 | 90 | def subtract_values(first, second): 91 | result = defaultdict(int) 92 | for (v1, m1) in first: 93 | result[v1] += m1 94 | for (v2, m2) in second: 95 | result[v2] -= m2 96 | 97 | return [ 98 | (val, multiplicity) 99 | for (val, multiplicity) in result.items() 100 | if multiplicity != 0 101 | ] 102 | 103 | index = Index() 104 | index_out = Index() 105 | keys_todo = defaultdict(set) 106 | output = [] 107 | 108 | for collection in self._inner: 109 | keys_todo = set() 110 | result = [] 111 | for ((key, value), multiplicity) in collection._inner: 112 | index.add_value(key, (value, multiplicity)) 113 | keys_todo.add(key) 114 | 115 | keys = [key for key in keys_todo] 116 | for key in keys: 117 | curr = index.get(key) 118 | curr_out = index_out.get(key) 119 | out = f(curr) 120 | delta = subtract_values(out, curr_out) 121 | for (value, multiplicity) in delta: 122 | result.append(((key, value), multiplicity)) 123 | index_out.add_value(key, (value, multiplicity)) 124 | output.append(Collection(result)) 125 | index.compact(keys) 126 | index_out.compact(keys) 127 | 128 | return DifferenceSequence(output) 129 | 130 | def count(self): 131 | """Count the number of times each key occurs in each collection in the collection 132 | trace. 133 | """ 134 | 135 | def count_inner(vals): 136 | out = 0 137 | for (_, diff) in vals: 138 | out += diff 139 | return [(out, 1)] 140 | 141 | return self.reduce(count_inner) 142 | 143 | def sum(self): 144 | """Produce the sum of all the values paired with each key, for each 145 | collection in the trace. 146 | """ 147 | 148 | def sum_inner(vals): 149 | out = 0 150 | for (val, diff) in vals: 151 | out += val * diff 152 | return [(out, 1)] 153 | 154 | return self.reduce(sum_inner) 155 | 156 | def min(self): 157 | """Produce the minimum value associated with each key, for each collection in 158 | the trace. 
159 | """ 160 | 161 | def min_inner(vals): 162 | consolidated = defaultdict(int) 163 | for (val, multiplicity) in vals: 164 | consolidated[val] += multiplicity 165 | vals = [ 166 | (val, multiplicity) 167 | for (val, multiplicity) in consolidated.items() 168 | if multiplicity != 0 169 | ] 170 | if len(vals) != 0: 171 | out = vals[0][0] 172 | for (val, multiplicity) in vals: 173 | assert multiplicity > 0 174 | if val < out: 175 | out = val 176 | return [(out, 1)] 177 | else: 178 | return [] 179 | 180 | return self.reduce(min_inner) 181 | 182 | def max(self): 183 | """Produce the minimum value associated with each key, for each collection in 184 | the trace. 185 | """ 186 | 187 | def max_inner(vals): 188 | consolidated = defaultdict(int) 189 | for (val, multiplicity) in vals: 190 | consolidated[val] += multiplicity 191 | vals = [ 192 | (val, multiplicity) 193 | for (val, multiplicity) in consolidated.items() 194 | if multiplicity != 0 195 | ] 196 | if len(vals) != 0: 197 | out = vals[0][0] 198 | for (val, multiplicity) in vals: 199 | assert multiplicity > 0 200 | if val > out: 201 | out = val 202 | return [(out, 1)] 203 | else: 204 | return [] 205 | 206 | return self.reduce(max_inner) 207 | 208 | def distinct(self): 209 | def distinct_inner(vals): 210 | consolidated = defaultdict(int) 211 | for (val, multiplicity) in vals: 212 | consolidated[val] += multiplicity 213 | vals = [ 214 | (val, multiplicity) 215 | for (val, multiplicity) in consolidated.items() 216 | if multiplicity != 0 217 | ] 218 | for (val, multiplicity) in vals: 219 | assert multiplicity > 0 220 | return [(val, 1) for (val, _) in vals] 221 | 222 | return self.reduce(distinct_inner) 223 | 224 | def iterate(self, f): 225 | """Return the fixpoint of repeatedly applying f to each collection in the trace.""" 226 | # TODO 227 | 228 | 229 | if __name__ == "__main__": 230 | a = Collection([(("apple", "$5"), 3), (("banana", "$2"), 1)]) 231 | b = Collection([(("apple", "$3"), 1), (("apple", "$2"), 1), (("kiwi", "$2"), 1)]) 232 | c = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1), (("apple", "$2"), 20)]) 233 | d = Collection( 234 | [(("apple", 11), 1), (("apple", 3), 2), (("banana", 2), 3), (("coconut", 3), 1)] 235 | ) 236 | e = Collection([(1, 1)]) 237 | 238 | trace_a = DifferenceSequence( 239 | [ 240 | a, 241 | Collection([(("apple", "$5"), -1), (("apple", "$7"), 1)]), 242 | Collection([(("lemon", "$1"), 1)]), 243 | ] 244 | ) 245 | print(trace_a.map(lambda data: (data[1], data[0]))) 246 | print(trace_a.filter(lambda data: data[0] != "apple")) 247 | 248 | trace_b = DifferenceSequence( 249 | [ 250 | b, 251 | Collection([]), 252 | Collection([(("lemon", "$22"), 3), (("kiwi", "$1"), 2)]), 253 | ] 254 | ) 255 | print(trace_a.join(trace_b)) 256 | print(trace_a.join(trace_b).consolidate()) 257 | print(trace_a.min()) 258 | print(trace_a.max()) 259 | print(trace_a.distinct()) 260 | -------------------------------------------------------------------------------- /v2/differential_dataflow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from collection import Collection 4 | from graph import ( 5 | BinaryOperator, 6 | DifferenceStreamReader, 7 | DifferenceStreamWriter, 8 | Graph, 9 | UnaryOperator, 10 | ) 11 | from index import Index 12 | 13 | 14 | class DifferenceStreamBuilder: 15 | def __init__(self, graph): 16 | self._writer = DifferenceStreamWriter() 17 | self.graph = graph 18 | 19 | def connect_reader(self): 20 | return self._writer._new_reader() 21 
| 22 | def writer(self): 23 | return self._writer 24 | 25 | def map(self, f): 26 | output = DifferenceStreamBuilder(self.graph) 27 | operator = MapOperator( 28 | self.connect_reader(), 29 | output.writer(), 30 | f, 31 | ) 32 | self.graph.add_operator(operator) 33 | self.graph.add_stream(output.connect_reader()) 34 | return output 35 | 36 | def filter(self, f): 37 | output = DifferenceStreamBuilder(self.graph) 38 | operator = FilterOperator( 39 | self.connect_reader(), 40 | output.writer(), 41 | f, 42 | ) 43 | self.graph.add_operator(operator) 44 | self.graph.add_stream(output.connect_reader()) 45 | return output 46 | 47 | def negate(self): 48 | output = DifferenceStreamBuilder(self.graph) 49 | operator = NegateOperator( 50 | self.connect_reader(), 51 | output.writer(), 52 | ) 53 | self.graph.add_operator(operator) 54 | self.graph.add_stream(output.connect_reader()) 55 | return output 56 | 57 | def concat(self, other): 58 | assert id(self.graph) == id(other.graph) 59 | output = DifferenceStreamBuilder(self.graph) 60 | operator = ConcatOperator( 61 | self.connect_reader(), 62 | other.connect_reader(), 63 | output.writer(), 64 | ) 65 | self.graph.add_operator(operator) 66 | self.graph.add_stream(output.connect_reader()) 67 | return output 68 | 69 | def debug(self, name=""): 70 | output = DifferenceStreamBuilder(self.graph) 71 | operator = DebugOperator( 72 | self.connect_reader(), 73 | output.writer(), 74 | name, 75 | ) 76 | self.graph.add_operator(operator) 77 | self.graph.add_stream(output.connect_reader()) 78 | return output 79 | 80 | def join(self, other): 81 | assert id(self.graph) == id(other.graph) 82 | output = DifferenceStreamBuilder(self.graph) 83 | operator = JoinOperator( 84 | self.connect_reader(), 85 | other.connect_reader(), 86 | output.writer(), 87 | ) 88 | self.graph.add_operator(operator) 89 | self.graph.add_stream(output.connect_reader()) 90 | return output 91 | 92 | def count(self): 93 | output = DifferenceStreamBuilder(self.graph) 94 | operator = CountOperator( 95 | self.connect_reader(), 96 | output.writer(), 97 | ) 98 | self.graph.add_operator(operator) 99 | self.graph.add_stream(output.connect_reader()) 100 | return output 101 | 102 | 103 | class GraphBuilder: 104 | def __init__(self): 105 | self.streams = [] 106 | self.operators = [] 107 | 108 | def new_input(self): 109 | stream_builder = DifferenceStreamBuilder(self) 110 | self.streams.append(stream_builder.connect_reader()) 111 | return stream_builder, stream_builder.writer() 112 | 113 | def add_operator(self, operator): 114 | self.operators.append(operator) 115 | 116 | def add_stream(self, stream): 117 | self.streams.append(stream) 118 | 119 | def finalize(self): 120 | return Graph(self.streams, self.operators) 121 | 122 | 123 | class LinearUnaryOperator(UnaryOperator): 124 | def __init__(self, input_a, output, f): 125 | def inner(): 126 | for collection in self.input_messages(): 127 | self.output.send_data(f(collection)) 128 | 129 | super().__init__(input_a, output, inner) 130 | 131 | 132 | class MapOperator(LinearUnaryOperator): 133 | def __init__(self, input_a, output, f): 134 | def map_inner(collection): 135 | return collection.map(f) 136 | 137 | super().__init__(input_a, output, map_inner) 138 | 139 | 140 | class FilterOperator(LinearUnaryOperator): 141 | def __init__(self, input_a, output, f): 142 | def filter_inner(collection): 143 | return collection.filter(f) 144 | 145 | super().__init__(input_a, output, filter_inner) 146 | 147 | 148 | class NegateOperator(LinearUnaryOperator): 149 | def __init__(self, 
input_a, output): 150 | def negate_inner(collection): 151 | return collection.negate() 152 | 153 | super().__init__(input_a, output, negate_inner) 154 | 155 | 156 | class ConcatOperator(BinaryOperator): 157 | def __init__(self, input_a, input_b, output): 158 | self.input_a_pending = [] 159 | self.input_b_pending = [] 160 | 161 | def inner(): 162 | # This is not internally consistent! 163 | for collection in self.input_a_messages(): 164 | self.input_a_pending.append(collection) 165 | for collection in self.input_b_messages(): 166 | self.input_b_pending.append(collection) 167 | 168 | sent = 0 169 | for (collection_a, collection_b) in zip( 170 | self.input_a_pending, self.input_b_pending 171 | ): 172 | self.output.send_data(collection_a.concat(collection_b)) 173 | sent += 1 174 | if sent > 0: 175 | self.input_a_pending = self.input_a_pending[sent:] 176 | self.input_b_pending = self.input_b_pending[sent:] 177 | 178 | super().__init__(input_a, input_b, output, inner) 179 | 180 | 181 | class DebugOperator(UnaryOperator): 182 | def __init__(self, input_a, output, name): 183 | def inner(): 184 | for collection in self.input_messages(): 185 | print(f"debug {name} data: collection: {collection}") 186 | self.output.send_data(collection) 187 | 188 | super().__init__(input_a, output, inner) 189 | 190 | 191 | class JoinOperator(BinaryOperator): 192 | def __init__(self, input_a, input_b, output): 193 | self.index_a = Index() 194 | self.index_b = Index() 195 | self.input_a_pending = [] 196 | self.input_b_pending = [] 197 | 198 | def inner(): 199 | for collection in self.input_a_messages(): 200 | delta_a = Index() 201 | for ((key, value), multiplicity) in collection._inner: 202 | delta_a.add_value(key, (value, multiplicity)) 203 | self.input_a_pending.append(delta_a) 204 | for collection in self.input_b_messages(): 205 | delta_b = Index() 206 | for ((key, value), multiplicity) in collection._inner: 207 | delta_b.add_value(key, (value, multiplicity)) 208 | self.input_b_pending.append(delta_b) 209 | 210 | sent = 0 211 | for (delta_a, delta_b) in zip(self.input_a_pending, self.input_b_pending): 212 | result = Collection() 213 | result._extend(delta_a.join(self.index_b)) 214 | self.index_a.append(delta_a) 215 | result._extend(self.index_a.join(delta_b)) 216 | self.index_b.append(delta_b) 217 | self.output.send_data(result.consolidate()) 218 | sent += 1 219 | self.index_a.compact() 220 | self.index_b.compact() 221 | 222 | if sent > 0: 223 | self.input_a_pending = self.input_a_pending[sent:] 224 | self.input_b_pending = self.input_b_pending[sent:] 225 | 226 | super().__init__(input_a, input_b, output, inner) 227 | 228 | 229 | class ReduceOperator(UnaryOperator): 230 | def __init__(self, input_a, output, f): 231 | self.index = Index() 232 | self.index_out = Index() 233 | 234 | def subtract_values(first, second): 235 | result = defaultdict(int) 236 | for (v1, m1) in first: 237 | result[v1] += m1 238 | for (v2, m2) in second: 239 | result[v2] -= m2 240 | 241 | return [ 242 | (val, multiplicity) 243 | for (val, multiplicity) in result.items() 244 | if multiplicity != 0 245 | ] 246 | 247 | def inner(): 248 | for collection in self.input_messages(): 249 | keys_todo = set() 250 | result = [] 251 | for ((key, value), multiplicity) in collection._inner: 252 | self.index.add_value(key, (value, multiplicity)) 253 | keys_todo.add(key) 254 | keys = [key for key in keys_todo] 255 | for key in keys: 256 | curr = self.index.get(key) 257 | curr_out = self.index_out.get(key) 258 | out = f(curr) 259 | delta = subtract_values(out, 
curr_out) 260 | for (value, multiplicity) in delta: 261 | result.append(((key, value), multiplicity)) 262 | self.index_out.add_value(key, (value, multiplicity)) 263 | self.output.send_data(Collection(result)) 264 | self.index.compact(keys) 265 | self.index_out.compact(keys) 266 | 267 | super().__init__(input_a, output, inner) 268 | 269 | 270 | class CountOperator(ReduceOperator): 271 | def __init__(self, input_a, output): 272 | def count_inner(vals): 273 | out = 0 274 | for (_, diff) in vals: 275 | out += diff 276 | return [(out, 1)] 277 | 278 | super().__init__(input_a, output, count_inner) 279 | 280 | 281 | if __name__ == "__main__": 282 | graph_builder = GraphBuilder() 283 | input_a, input_a_writer = graph_builder.new_input() 284 | output = input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 285 | input_a.negate().concat(output).debug("output") 286 | graph = graph_builder.finalize() 287 | 288 | for i in range(0, 10): 289 | input_a_writer.send_data(Collection([(i, 1)])) 290 | graph.step() 291 | graph_builder = GraphBuilder() 292 | input_a, input_a_writer = graph_builder.new_input() 293 | input_b, input_b_writer = graph_builder.new_input() 294 | 295 | output = input_a.join(input_b).count().debug("count") 296 | graph = graph_builder.finalize() 297 | 298 | for i in range(0, 10): 299 | input_a_writer.send_data(Collection([((1, i), 2)])) 300 | input_a_writer.send_data(Collection([((2, i), 2)])) 301 | input_b_writer.send_data(Collection([((1, i + 2), 2)])) 302 | input_b_writer.send_data(Collection([((2, i + 3), 2)])) 303 | graph.step() 304 | graph.step() 305 | -------------------------------------------------------------------------------- /v3/differential_dataflow.py: -------------------------------------------------------------------------------- 1 | """An implementation of differential dataflow specialized for the setting where versions (times) are 2 | integers. This implementation supports all differential operations except iterate. 
3 | """ 4 | 5 | from collections import defaultdict 6 | 7 | from collection import Collection 8 | from graph import ( 9 | BinaryOperator, 10 | DifferenceStreamReader, 11 | DifferenceStreamWriter, 12 | Graph, 13 | MessageType, 14 | UnaryOperator, 15 | ) 16 | from index import Index 17 | 18 | 19 | class DifferenceStreamBuilder: 20 | def __init__(self, graph): 21 | self._writer = DifferenceStreamWriter() 22 | self.graph = graph 23 | 24 | def connect_reader(self): 25 | return self._writer._new_reader() 26 | 27 | def writer(self): 28 | return self._writer 29 | 30 | def map(self, f): 31 | output = DifferenceStreamBuilder(self.graph) 32 | operator = MapOperator( 33 | self.connect_reader(), output.writer(), f, self.graph.frontier() 34 | ) 35 | self.graph.add_operator(operator) 36 | self.graph.add_stream(output.connect_reader()) 37 | return output 38 | 39 | def filter(self, f): 40 | output = DifferenceStreamBuilder(self.graph) 41 | operator = FilterOperator( 42 | self.connect_reader(), output.writer(), f, self.graph.frontier() 43 | ) 44 | self.graph.add_operator(operator) 45 | self.graph.add_stream(output.connect_reader()) 46 | return output 47 | 48 | def negate(self): 49 | output = DifferenceStreamBuilder(self.graph) 50 | operator = NegateOperator( 51 | self.connect_reader(), output.writer(), self.graph.frontier() 52 | ) 53 | self.graph.add_operator(operator) 54 | self.graph.add_stream(output.connect_reader()) 55 | return output 56 | 57 | def concat(self, other): 58 | assert id(self.graph) == id(other.graph) 59 | output = DifferenceStreamBuilder(self.graph) 60 | operator = ConcatOperator( 61 | self.connect_reader(), 62 | other.connect_reader(), 63 | output.writer(), 64 | self.graph.frontier(), 65 | ) 66 | self.graph.add_operator(operator) 67 | self.graph.add_stream(output.connect_reader()) 68 | return output 69 | 70 | def debug(self, name=""): 71 | output = DifferenceStreamBuilder(self.graph) 72 | operator = DebugOperator( 73 | self.connect_reader(), output.writer(), name, self.graph.frontier() 74 | ) 75 | self.graph.add_operator(operator) 76 | self.graph.add_stream(output.connect_reader()) 77 | return output 78 | 79 | def join(self, other): 80 | assert id(self.graph) == id(other.graph) 81 | output = DifferenceStreamBuilder(self.graph) 82 | operator = JoinOperator( 83 | self.connect_reader(), 84 | other.connect_reader(), 85 | output.writer(), 86 | self.graph.frontier(), 87 | ) 88 | self.graph.add_operator(operator) 89 | self.graph.add_stream(output.connect_reader()) 90 | return output 91 | 92 | def count(self): 93 | output = DifferenceStreamBuilder(self.graph) 94 | operator = CountOperator( 95 | self.connect_reader(), output.writer(), self.graph.frontier() 96 | ) 97 | self.graph.add_operator(operator) 98 | self.graph.add_stream(output.connect_reader()) 99 | return output 100 | 101 | 102 | class GraphBuilder: 103 | def __init__(self, initial_frontier): 104 | self.streams = [] 105 | self.operators = [] 106 | self.initial_frontier = initial_frontier 107 | 108 | def new_input(self): 109 | stream_builder = DifferenceStreamBuilder(self) 110 | self.streams.append(stream_builder.connect_reader()) 111 | return stream_builder, stream_builder.writer() 112 | 113 | def add_operator(self, operator): 114 | self.operators.append(operator) 115 | 116 | def add_stream(self, stream): 117 | self.streams.append(stream) 118 | 119 | def frontier(self): 120 | return self.initial_frontier 121 | 122 | def finalize(self): 123 | return Graph(self.streams, self.operators) 124 | 125 | 126 | class 
LinearUnaryOperator(UnaryOperator): 127 | def __init__(self, input_a, output, f, initial_frontier): 128 | def inner(): 129 | for (typ, msg) in self.input_messages(): 130 | if typ == MessageType.DATA: 131 | version, collection = msg 132 | self.output.send_data(version, f(collection)) 133 | elif typ == MessageType.FRONTIER: 134 | frontier = msg 135 | self.set_input_frontier(frontier) 136 | 137 | if self.input_frontier() > self.output_frontier: 138 | self.output_frontier = self.input_frontier() 139 | self.output.send_frontier(self.output_frontier) 140 | 141 | super().__init__(input_a, output, inner, initial_frontier) 142 | 143 | 144 | class MapOperator(LinearUnaryOperator): 145 | def __init__(self, input_a, output, f, initial_frontier): 146 | def map_inner(collection): 147 | return collection.map(f) 148 | 149 | super().__init__(input_a, output, map_inner, initial_frontier) 150 | 151 | 152 | class FilterOperator(LinearUnaryOperator): 153 | def __init__(self, input_a, output, f, initial_frontier): 154 | def filter_inner(collection): 155 | return collection.filter(f) 156 | 157 | super().__init__(input_a, output, filter_inner, initial_frontier) 158 | 159 | 160 | class NegateOperator(LinearUnaryOperator): 161 | def __init__(self, input_a, output, initial_frontier): 162 | def negate_inner(collection): 163 | return collection.negate() 164 | 165 | super().__init__(input_a, output, negate_inner, initial_frontier) 166 | 167 | 168 | class ConcatOperator(BinaryOperator): 169 | def __init__(self, input_a, input_b, output, initial_frontier): 170 | def inner(): 171 | for (typ, msg) in self.input_a_messages(): 172 | if typ == MessageType.DATA: 173 | version, collection = msg 174 | self.output.send_data(version, collection) 175 | elif typ == MessageType.FRONTIER: 176 | frontier = msg 177 | self.set_input_a_frontier(frontier) 178 | for (typ, msg) in self.input_b_messages(): 179 | if typ == MessageType.DATA: 180 | version, collection = msg 181 | self.output.send_data(version, collection) 182 | elif typ == MessageType.FRONTIER: 183 | frontier = msg 184 | self.set_input_b_frontier(frontier) 185 | 186 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 187 | if min_input_frontier > self.output_frontier: 188 | self.output_frontier = min_input_frontier 189 | self.output.send_frontier(self.output_frontier) 190 | 191 | super().__init__(input_a, input_b, output, inner, initial_frontier) 192 | 193 | 194 | class DebugOperator(UnaryOperator): 195 | def __init__(self, input_a, output, name, initial_frontier): 196 | def inner(): 197 | for (typ, msg) in self.input_messages(): 198 | if typ == MessageType.DATA: 199 | version, collection = msg 200 | print( 201 | f"debug {name} data: version: {version} collection: {collection}" 202 | ) 203 | self.output.send_data(version, collection) 204 | elif typ == MessageType.FRONTIER: 205 | frontier = msg 206 | assert self.input_frontier() <= frontier 207 | self.set_input_frontier(frontier) 208 | print(f"debug {name} notification: frontier {frontier}") 209 | assert self.output_frontier <= self.input_frontier() 210 | if self.output_frontier < self.input_frontier(): 211 | self.output_frontier = self.input_frontier() 212 | self.output.send_frontier(self.output_frontier) 213 | 214 | super().__init__(input_a, output, inner, initial_frontier) 215 | 216 | 217 | class JoinOperator(BinaryOperator): 218 | def __init__(self, input_a, input_b, output, initial_frontier): 219 | self.index_a = Index() 220 | self.index_b = Index() 221 | 222 | def inner(): 223 | delta_a = Index() 224 
| delta_b = Index() 225 | for (typ, msg) in self.input_a_messages(): 226 | if typ == MessageType.DATA: 227 | version, collection = msg 228 | for ((key, value), multiplicity) in collection._inner: 229 | delta_a.add_value(key, version, (value, multiplicity)) 230 | elif typ == MessageType.FRONTIER: 231 | frontier = msg 232 | self.set_input_a_frontier(msg) 233 | for (typ, msg) in self.input_b_messages(): 234 | if typ == MessageType.DATA: 235 | version, collection = msg 236 | for ((key, value), multiplicity) in collection._inner: 237 | delta_b.add_value(key, version, (value, multiplicity)) 238 | elif typ == MessageType.FRONTIER: 239 | frontier = msg 240 | self.set_input_b_frontier(frontier) 241 | 242 | results = defaultdict(Collection) 243 | for (version, collection) in delta_a.join(self.index_b): 244 | results[version]._extend(collection) 245 | 246 | self.index_a.append(delta_a) 247 | 248 | for (version, collection) in self.index_a.join(delta_b): 249 | results[version]._extend(collection) 250 | 251 | for (version, collection) in results.items(): 252 | self.output.send_data(version, collection) 253 | self.index_b.append(delta_b) 254 | 255 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 256 | if min_input_frontier > self.output_frontier: 257 | self.output_frontier = min_input_frontier 258 | self.output.send_frontier(self.output_frontier) 259 | self.index_a.compact(self.output_frontier) 260 | self.index_b.compact(self.output_frontier) 261 | 262 | super().__init__(input_a, input_b, output, inner, initial_frontier) 263 | 264 | 265 | class ReduceOperator(UnaryOperator): 266 | def __init__(self, input_a, output, f, initial_frontier): 267 | self.index = Index() 268 | self.index_out = Index() 269 | self.keys_todo = defaultdict(set) 270 | 271 | def subtract_values(first, second): 272 | result = defaultdict(int) 273 | for (v1, m1) in first: 274 | result[v1] += m1 275 | for (v2, m2) in second: 276 | result[v2] -= m2 277 | 278 | return [ 279 | (val, multiplicity) 280 | for (val, multiplicity) in result.items() 281 | if multiplicity != 0 282 | ] 283 | 284 | def inner(): 285 | for (typ, msg) in self.input_messages(): 286 | if typ == MessageType.DATA: 287 | version, collection = msg 288 | for ((key, value), multiplicity) in collection._inner: 289 | self.index.add_value(key, version, (value, multiplicity)) 290 | self.keys_todo[version].add(key) 291 | elif typ == MessageType.FRONTIER: 292 | frontier = msg 293 | self.set_input_frontier(frontier) 294 | 295 | finished_versions = [ 296 | version 297 | for version in self.keys_todo.keys() 298 | if version < self.input_frontier() 299 | ] 300 | 301 | finished_versions.sort() 302 | for version in finished_versions: 303 | keys = self.keys_todo.pop(version) 304 | result = [] 305 | for key in keys: 306 | curr = self.index.reconstruct_at(key, version) 307 | curr_out = self.index_out.reconstruct_at(key, version) 308 | out = f(curr) 309 | delta = subtract_values(out, curr_out) 310 | for (value, multiplicity) in delta: 311 | result.append(((key, value), multiplicity)) 312 | self.index_out.add_value(key, version, (value, multiplicity)) 313 | if result != []: 314 | self.output.send_data(version, Collection(result)) 315 | 316 | if self.input_frontier() > self.output_frontier: 317 | self.output_frontier = self.input_frontier() 318 | self.output.send_frontier(self.output_frontier) 319 | self.index.compact(self.output_frontier) 320 | self.index_out.compact(self.output_frontier) 321 | 322 | super().__init__(input_a, output, inner, initial_frontier) 323 | 
324 | 325 | class CountOperator(ReduceOperator): 326 | def __init__(self, input_a, output, initial_frontier): 327 | def count_inner(vals): 328 | out = 0 329 | for (_, diff) in vals: 330 | out += diff 331 | return [(out, 1)] 332 | 333 | super().__init__(input_a, output, count_inner, initial_frontier) 334 | 335 | 336 | if __name__ == "__main__": 337 | graph_builder = GraphBuilder(0) 338 | input_a, input_a_writer = graph_builder.new_input() 339 | output = input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 340 | input_a.negate().concat(output).debug("output") 341 | graph = graph_builder.finalize() 342 | 343 | for i in range(0, 10): 344 | input_a_writer.send_data(i, Collection([(i, 1)])) 345 | input_a_writer.send_frontier(i) 346 | graph.step() 347 | graph_builder = GraphBuilder(0) 348 | input_a, input_a_writer = graph_builder.new_input() 349 | input_b, input_b_writer = graph_builder.new_input() 350 | 351 | output = input_a.join(input_b).count().debug("count") 352 | graph = graph_builder.finalize() 353 | 354 | for i in range(0, 10): 355 | input_a_writer.send_data(i, Collection([((1, i), 2)])) 356 | input_a_writer.send_data(i, Collection([((2, i), 2)])) 357 | input_b_writer.send_data(i, Collection([((1, i + 2), 2)])) 358 | input_b_writer.send_data(i, Collection([((2, i + 3), 2)])) 359 | input_a_writer.send_frontier(i) 360 | input_b_writer.send_frontier(i) 361 | graph.step() 362 | input_a_writer.send_frontier(11) 363 | input_b_writer.send_frontier(11) 364 | graph.step() 365 | -------------------------------------------------------------------------------- /v4/differential_dataflow.py: -------------------------------------------------------------------------------- 1 | """An implementation of differential dataflow specialized for the setting where versions (times) are 2 | integer tuples totally ordered lexicographically. This implementation supports all differential operations. 
3 | """ 4 | 5 | from collections import defaultdict 6 | 7 | from collection import Collection 8 | from graph import ( 9 | BinaryOperator, 10 | DifferenceStreamReader, 11 | DifferenceStreamWriter, 12 | Graph, 13 | MessageType, 14 | UnaryOperator, 15 | ) 16 | from index import Index 17 | from version import Version 18 | 19 | ITERATION_LIMIT = 100 20 | 21 | 22 | class DifferenceStreamBuilder: 23 | def __init__(self, graph): 24 | self._writer = DifferenceStreamWriter() 25 | self.graph = graph 26 | 27 | def connect_reader(self): 28 | return self._writer._new_reader() 29 | 30 | def writer(self): 31 | return self._writer 32 | 33 | def map(self, f): 34 | output = DifferenceStreamBuilder(self.graph) 35 | operator = MapOperator( 36 | self.connect_reader(), output.writer(), f, self.graph.frontier() 37 | ) 38 | self.graph.add_operator(operator) 39 | self.graph.add_stream(output.connect_reader()) 40 | return output 41 | 42 | def filter(self, f): 43 | output = DifferenceStreamBuilder(self.graph) 44 | operator = FilterOperator( 45 | self.connect_reader(), output.writer(), f, self.graph.frontier() 46 | ) 47 | self.graph.add_operator(operator) 48 | self.graph.add_stream(output.connect_reader()) 49 | return output 50 | 51 | def negate(self): 52 | output = DifferenceStreamBuilder(self.graph) 53 | operator = NegateOperator( 54 | self.connect_reader(), output.writer(), self.graph.frontier() 55 | ) 56 | self.graph.add_operator(operator) 57 | self.graph.add_stream(output.connect_reader()) 58 | return output 59 | 60 | def concat(self, other): 61 | assert id(self.graph) == id(other.graph) 62 | output = DifferenceStreamBuilder(self.graph) 63 | operator = ConcatOperator( 64 | self.connect_reader(), 65 | other.connect_reader(), 66 | output.writer(), 67 | self.graph.frontier(), 68 | ) 69 | self.graph.add_operator(operator) 70 | self.graph.add_stream(output.connect_reader()) 71 | return output 72 | 73 | def debug(self, name=""): 74 | output = DifferenceStreamBuilder(self.graph) 75 | operator = DebugOperator( 76 | self.connect_reader(), output.writer(), name, self.graph.frontier() 77 | ) 78 | self.graph.add_operator(operator) 79 | self.graph.add_stream(output.connect_reader()) 80 | return output 81 | 82 | def join(self, other): 83 | assert id(self.graph) == id(other.graph) 84 | output = DifferenceStreamBuilder(self.graph) 85 | operator = JoinOperator( 86 | self.connect_reader(), 87 | other.connect_reader(), 88 | output.writer(), 89 | self.graph.frontier(), 90 | ) 91 | self.graph.add_operator(operator) 92 | self.graph.add_stream(output.connect_reader()) 93 | return output 94 | 95 | def count(self): 96 | output = DifferenceStreamBuilder(self.graph) 97 | operator = CountOperator( 98 | self.connect_reader(), output.writer(), self.graph.frontier() 99 | ) 100 | self.graph.add_operator(operator) 101 | self.graph.add_stream(output.connect_reader()) 102 | return output 103 | 104 | def consolidate(self): 105 | output = DifferenceStreamBuilder(self.graph) 106 | operator = ConsolidateOperator( 107 | self.connect_reader(), output.writer(), self.graph.frontier() 108 | ) 109 | self.graph.add_operator(operator) 110 | self.graph.add_stream(output.connect_reader()) 111 | return output 112 | 113 | def distinct(self): 114 | output = DifferenceStreamBuilder(self.graph) 115 | operator = DistinctOperator( 116 | self.connect_reader(), output.writer(), self.graph.frontier() 117 | ) 118 | self.graph.add_operator(operator) 119 | self.graph.add_stream(output.connect_reader()) 120 | return output 121 | 122 | def _start_scope(self): 123 | 
new_frontier = self.graph.frontier().extend() 124 | self.graph.push_frontier(new_frontier) 125 | 126 | def _end_scope(self): 127 | self.graph.pop_frontier() 128 | 129 | def _ingress(self): 130 | output = DifferenceStreamBuilder(self.graph) 131 | operator = IngressOperator( 132 | self.connect_reader(), 133 | output.writer(), 134 | ITERATION_LIMIT, 135 | self.graph.frontier(), 136 | ) 137 | self.graph.add_operator(operator) 138 | self.graph.add_stream(output.connect_reader()) 139 | return output 140 | 141 | def _egress(self): 142 | output = DifferenceStreamBuilder(self.graph) 143 | operator = EgressOperator( 144 | self.connect_reader(), output.writer(), self.graph.frontier() 145 | ) 146 | self.graph.add_operator(operator) 147 | self.graph.add_stream(output.connect_reader()) 148 | return output 149 | 150 | def iterate(self, f): 151 | self._start_scope() 152 | feedback_stream = DifferenceStreamBuilder(self.graph) 153 | entered = self._ingress().concat(feedback_stream) 154 | result = f(entered) 155 | feedback_operator = FeedbackOperator( 156 | result.connect_reader(), 157 | 1, 158 | ITERATION_LIMIT, 159 | feedback_stream.writer(), 160 | self.graph.frontier(), 161 | ) 162 | self.graph.add_stream(feedback_stream) 163 | self.graph.add_operator(feedback_operator) 164 | self._end_scope() 165 | return result._egress() 166 | 167 | 168 | class GraphBuilder: 169 | def __init__(self, initial_frontier): 170 | self.streams = [] 171 | self.operators = [] 172 | self.frontier_stack = [initial_frontier] 173 | 174 | def new_input(self): 175 | stream_builder = DifferenceStreamBuilder(self) 176 | self.streams.append(stream_builder.connect_reader()) 177 | return stream_builder, stream_builder.writer() 178 | 179 | def add_operator(self, operator): 180 | self.operators.append(operator) 181 | 182 | def add_stream(self, stream): 183 | self.streams.append(stream) 184 | 185 | def frontier(self): 186 | return self.frontier_stack[-1] 187 | 188 | def push_frontier(self, new_frontier): 189 | self.frontier_stack.append(new_frontier) 190 | 191 | def pop_frontier(self): 192 | self.frontier_stack.pop() 193 | 194 | def finalize(self): 195 | return Graph(self.streams, self.operators) 196 | 197 | 198 | class LinearUnaryOperator(UnaryOperator): 199 | def __init__(self, input_a, output, f, initial_frontier): 200 | def inner(): 201 | for (typ, msg) in self.input_messages(): 202 | if typ == MessageType.DATA: 203 | version, collection = msg 204 | self.output.send_data(version, f(collection)) 205 | elif typ == MessageType.FRONTIER: 206 | frontier = msg 207 | self.set_input_frontier(frontier) 208 | 209 | if self.input_frontier() > self.output_frontier: 210 | self.output_frontier = self.input_frontier() 211 | self.output.send_frontier(self.output_frontier) 212 | 213 | super().__init__(input_a, output, inner, initial_frontier) 214 | 215 | 216 | class MapOperator(LinearUnaryOperator): 217 | def __init__(self, input_a, output, f, initial_frontier): 218 | def map_inner(collection): 219 | return collection.map(f) 220 | 221 | super().__init__(input_a, output, map_inner, initial_frontier) 222 | 223 | 224 | class FilterOperator(LinearUnaryOperator): 225 | def __init__(self, input_a, output, f, initial_frontier): 226 | def filter_inner(collection): 227 | return collection.filter(f) 228 | 229 | super().__init__(input_a, output, filter_inner, initial_frontier) 230 | 231 | 232 | class NegateOperator(LinearUnaryOperator): 233 | def __init__(self, input_a, output, initial_frontier): 234 | def negate_inner(collection): 235 | return collection.negate() 
236 | 237 | super().__init__(input_a, output, negate_inner, initial_frontier) 238 | 239 | 240 | class ConcatOperator(BinaryOperator): 241 | def __init__(self, input_a, input_b, output, initial_frontier): 242 | def inner(): 243 | for (typ, msg) in self.input_a_messages(): 244 | if typ == MessageType.DATA: 245 | version, collection = msg 246 | self.output.send_data(version, collection) 247 | elif typ == MessageType.FRONTIER: 248 | frontier = msg 249 | self.set_input_a_frontier(frontier) 250 | for (typ, msg) in self.input_b_messages(): 251 | if typ == MessageType.DATA: 252 | version, collection = msg 253 | self.output.send_data(version, collection) 254 | elif typ == MessageType.FRONTIER: 255 | frontier = msg 256 | self.set_input_b_frontier(frontier) 257 | 258 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 259 | if min_input_frontier > self.output_frontier: 260 | self.output_frontier = min_input_frontier 261 | self.output.send_frontier(self.output_frontier) 262 | 263 | super().__init__(input_a, input_b, output, inner, initial_frontier) 264 | 265 | 266 | class DebugOperator(UnaryOperator): 267 | def __init__(self, input_a, output, name, initial_frontier): 268 | def inner(): 269 | for (typ, msg) in self.input_messages(): 270 | if typ == MessageType.DATA: 271 | version, collection = msg 272 | print( 273 | f"debug {name} data: version: {version} collection: {collection}" 274 | ) 275 | self.output.send_data(version, collection) 276 | elif typ == MessageType.FRONTIER: 277 | frontier = msg 278 | assert self.input_frontier() <= frontier 279 | self.set_input_frontier(frontier) 280 | print(f"debug {name} notification: frontier {frontier}") 281 | assert self.output_frontier <= self.input_frontier() 282 | if self.output_frontier < self.input_frontier(): 283 | self.output_frontier = self.input_frontier() 284 | self.output.send_frontier(self.output_frontier) 285 | 286 | super().__init__(input_a, output, inner, initial_frontier) 287 | 288 | 289 | class ConsolidateOperator(UnaryOperator): 290 | def __init__(self, input_a, output, initial_frontier): 291 | self.collections = defaultdict(Collection) 292 | 293 | def inner(): 294 | for (typ, msg) in self.input_messages(): 295 | if typ == MessageType.DATA: 296 | version, collection = msg 297 | self.collections[version]._extend(collection) 298 | elif typ == MessageType.FRONTIER: 299 | frontier = msg 300 | assert self.input_frontier() <= frontier 301 | self.set_input_frontier(frontier) 302 | finished_versions = [ 303 | version 304 | for version in self.collections.keys() 305 | if version < self.input_frontier() 306 | ] 307 | for version in finished_versions: 308 | collection = self.collections.pop(version).consolidate() 309 | self.output.send_data(version, collection) 310 | assert self.output_frontier <= self.input_frontier() 311 | if self.output_frontier < self.input_frontier(): 312 | self.output_frontier = self.input_frontier() 313 | self.output.send_frontier(self.output_frontier) 314 | 315 | super().__init__(input_a, output, inner, initial_frontier) 316 | 317 | 318 | class JoinOperator(BinaryOperator): 319 | def __init__(self, input_a, input_b, output, initial_frontier): 320 | self.index_a = Index() 321 | self.index_b = Index() 322 | 323 | def inner(): 324 | delta_a = Index() 325 | delta_b = Index() 326 | for (typ, msg) in self.input_a_messages(): 327 | if typ == MessageType.DATA: 328 | version, collection = msg 329 | for ((key, value), multiplicity) in collection._inner: 330 | delta_a.add_value(key, version, (value, multiplicity)) 
331 | elif typ == MessageType.FRONTIER: 332 | frontier = msg 333 | self.set_input_a_frontier(msg) 334 | for (typ, msg) in self.input_b_messages(): 335 | if typ == MessageType.DATA: 336 | version, collection = msg 337 | for ((key, value), multiplicity) in collection._inner: 338 | delta_b.add_value(key, version, (value, multiplicity)) 339 | elif typ == MessageType.FRONTIER: 340 | frontier = msg 341 | self.set_input_b_frontier(frontier) 342 | 343 | results = defaultdict(Collection) 344 | for (version, collection) in delta_a.join(self.index_b): 345 | results[version]._extend(collection) 346 | 347 | self.index_a.append(delta_a) 348 | 349 | for (version, collection) in self.index_a.join(delta_b): 350 | results[version]._extend(collection) 351 | 352 | for (version, collection) in results.items(): 353 | self.output.send_data(version, collection) 354 | self.index_b.append(delta_b) 355 | 356 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 357 | if min_input_frontier > self.output_frontier: 358 | self.output_frontier = min_input_frontier 359 | self.output.send_frontier(self.output_frontier) 360 | self.index_a.compact(self.output_frontier) 361 | self.index_b.compact(self.output_frontier) 362 | 363 | super().__init__(input_a, input_b, output, inner, initial_frontier) 364 | 365 | 366 | class ReduceOperator(UnaryOperator): 367 | def __init__(self, input_a, output, f, initial_frontier): 368 | self.index = Index() 369 | self.index_out = Index() 370 | self.keys_todo = defaultdict(set) 371 | 372 | def subtract_values(first, second): 373 | result = defaultdict(int) 374 | for (v1, m1) in first: 375 | result[v1] += m1 376 | for (v2, m2) in second: 377 | result[v2] -= m2 378 | 379 | return [ 380 | (val, multiplicity) 381 | for (val, multiplicity) in result.items() 382 | if multiplicity != 0 383 | ] 384 | 385 | def inner(): 386 | for (typ, msg) in self.input_messages(): 387 | if typ == MessageType.DATA: 388 | version, collection = msg 389 | for ((key, value), multiplicity) in collection._inner: 390 | self.index.add_value(key, version, (value, multiplicity)) 391 | self.keys_todo[version].add(key) 392 | elif typ == MessageType.FRONTIER: 393 | frontier = msg 394 | self.set_input_frontier(frontier) 395 | 396 | finished_versions = [ 397 | version 398 | for version in self.keys_todo.keys() 399 | if version < self.input_frontier() 400 | ] 401 | 402 | finished_versions.sort() 403 | for version in finished_versions: 404 | keys = self.keys_todo.pop(version) 405 | result = [] 406 | for key in keys: 407 | curr = self.index.reconstruct_at(key, version) 408 | curr_out = self.index_out.reconstruct_at(key, version) 409 | out = f(curr) 410 | delta = subtract_values(out, curr_out) 411 | for (value, multiplicity) in delta: 412 | result.append(((key, value), multiplicity)) 413 | self.index_out.add_value(key, version, (value, multiplicity)) 414 | if result != []: 415 | self.output.send_data(version, Collection(result)) 416 | 417 | if self.input_frontier() > self.output_frontier: 418 | self.output_frontier = self.input_frontier() 419 | self.output.send_frontier(self.output_frontier) 420 | self.index.compact(self.output_frontier) 421 | self.index_out.compact(self.output_frontier) 422 | 423 | super().__init__(input_a, output, inner, initial_frontier) 424 | 425 | 426 | class CountOperator(ReduceOperator): 427 | def __init__(self, input_a, output, initial_frontier): 428 | def count_inner(vals): 429 | out = 0 430 | for (_, diff) in vals: 431 | out += diff 432 | return [(out, 1)] 433 | 434 | 
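        # count_inner sums the multiplicities of all values observed for one key
        # and reports a single (count, 1) record; for example, given the per-key
        # values [("a", 2), ("b", 3)] it returns [(5, 1)]. ReduceOperator then
        # emits only the difference between this output and the previously
        # reported count for that key.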
super().__init__(input_a, output, count_inner, initial_frontier) 435 | 436 | 437 | class DistinctOperator(ReduceOperator): 438 | def __init__(self, input_a, output, initial_frontier): 439 | def distinct_inner(vals): 440 | consolidated = defaultdict(int) 441 | for (val, diff) in vals: 442 | consolidated[val] += diff 443 | for (val, diff) in consolidated.items(): 444 | assert diff >= 0 445 | return [(val, 1) for (val, diff) in consolidated.items() if diff != 0] 446 | 447 | super().__init__(input_a, output, distinct_inner, initial_frontier) 448 | 449 | 450 | class FeedbackOperator(UnaryOperator): 451 | def __init__(self, input_a, step, iteration_limit, output, initial_frontier): 452 | def inner(): 453 | for (typ, msg) in self.input_messages(): 454 | if typ == MessageType.DATA: 455 | version, collection = msg 456 | if version.inner[-1] < iteration_limit: 457 | self.output.send_data( 458 | version.apply_step(step, iteration_limit), collection 459 | ) 460 | elif typ == MessageType.FRONTIER: 461 | frontier = msg 462 | assert self.input_frontier() <= frontier 463 | self.set_input_frontier(frontier) 464 | 465 | candidate_output_frontier = self.input_frontier().apply_step( 466 | step, iteration_limit 467 | ) 468 | assert self.output_frontier <= candidate_output_frontier 469 | if self.output_frontier < candidate_output_frontier: 470 | self.output_frontier = candidate_output_frontier 471 | self.output.send_frontier(self.output_frontier) 472 | 473 | super().__init__(input_a, output, inner, initial_frontier) 474 | 475 | def connect_loop(output): 476 | self.output = output 477 | 478 | 479 | class IngressOperator(UnaryOperator): 480 | def __init__(self, input_a, output, iteration_limit, initial_frontier): 481 | def inner(): 482 | for (typ, msg) in self.input_messages(): 483 | if typ == MessageType.DATA: 484 | version, collection = msg 485 | new_version = version.extend() 486 | self.output.send_data(new_version, collection) 487 | self.output.send_data( 488 | new_version.apply_step(1, iteration_limit), collection.negate() 489 | ) 490 | elif typ == MessageType.FRONTIER: 491 | frontier = msg 492 | new_frontier = frontier.extend() 493 | assert self.input_frontier() <= new_frontier 494 | self.set_input_frontier(new_frontier) 495 | 496 | assert self.output_frontier <= self.input_frontier() 497 | if self.output_frontier < self.input_frontier(): 498 | self.output_frontier = self.input_frontier() 499 | self.output.send_frontier(self.output_frontier) 500 | 501 | super().__init__(input_a, output, inner, initial_frontier) 502 | 503 | 504 | class EgressOperator(UnaryOperator): 505 | def __init__(self, input_a, output, initial_frontier): 506 | def inner(): 507 | for (typ, msg) in self.input_messages(): 508 | if typ == MessageType.DATA: 509 | version, collection = msg 510 | new_version = version.truncate() 511 | self.output.send_data(new_version, collection) 512 | elif typ == MessageType.FRONTIER: 513 | frontier = msg 514 | new_frontier = frontier.truncate() 515 | assert self.input_frontier() <= new_frontier 516 | self.set_input_frontier(new_frontier) 517 | 518 | assert self.output_frontier <= self.input_frontier() 519 | if self.output_frontier < self.input_frontier(): 520 | self.output_frontier = self.input_frontier() 521 | self.output.send_frontier(self.output_frontier) 522 | 523 | super().__init__(input_a, output, inner, initial_frontier) 524 | 525 | 526 | if __name__ == "__main__": 527 | graph_builder = GraphBuilder(Version(0)) 528 | input_a, input_a_writer = graph_builder.new_input() 529 | output = 
input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 530 | input_a.negate().concat(output).debug("output") 531 | graph = graph_builder.finalize() 532 | 533 | for i in range(0, 10): 534 | input_a_writer.send_data(Version(i), Collection([(i, 1)])) 535 | input_a_writer.send_frontier(Version(i)) 536 | graph.step() 537 | graph_builder = GraphBuilder(Version(0)) 538 | input_a, input_a_writer = graph_builder.new_input() 539 | input_b, input_b_writer = graph_builder.new_input() 540 | 541 | output = input_a.join(input_b).count().debug("count") 542 | graph = graph_builder.finalize() 543 | 544 | for i in range(0, 10): 545 | input_a_writer.send_data(Version(i), Collection([((1, i), 2)])) 546 | input_a_writer.send_data(Version(i), Collection([((2, i), 2)])) 547 | input_b_writer.send_data(Version(i), Collection([((1, i + 2), 2)])) 548 | input_b_writer.send_data(Version(i), Collection([((2, i + 3), 2)])) 549 | input_a_writer.send_frontier(Version(i)) 550 | input_b_writer.send_frontier(Version(i)) 551 | graph.step() 552 | input_a_writer.send_frontier(Version(11)) 553 | input_b_writer.send_frontier(Version(11)) 554 | graph.step() 555 | 556 | graph_builder = GraphBuilder(Version(0)) 557 | input_a, input_a_writer = graph_builder.new_input() 558 | 559 | def geometric_series(collection): 560 | return ( 561 | collection.map(lambda data: data * 2) 562 | .concat(collection) 563 | .filter(lambda data: data <= 50) 564 | .map(lambda data: (data, ())) 565 | .distinct() 566 | .map(lambda data: data[0]) 567 | .consolidate() 568 | ) 569 | 570 | output = input_a.iterate(geometric_series).debug("iterate").connect_reader() 571 | graph = graph_builder.finalize() 572 | 573 | input_a_writer.send_data(Version(0), Collection([(1, 1)])) 574 | input_a_writer.send_frontier(Version(1)) 575 | 576 | while output.probe_frontier_less_than(Version(1)): 577 | graph.step() 578 | 579 | input_a_writer.send_data(Version(1), Collection([(16, 1), (3, 1)])) 580 | input_a_writer.send_frontier(Version(2)) 581 | 582 | while output.probe_frontier_less_than(Version(2)): 583 | graph.step() 584 | 585 | input_a_writer.send_data(Version(2), Collection([(3, -1)])) 586 | input_a_writer.send_frontier(Version(3)) 587 | 588 | while output.probe_frontier_less_than(Version(3)): 589 | graph.step() 590 | -------------------------------------------------------------------------------- /differential_dataflow.py: -------------------------------------------------------------------------------- 1 | """An implementation of differential dataflow. 2 | 3 | Compared to the Rust implementation, this implementation is both much less performant 4 | and more restrictive. Specifically, multiplicities in collections are constrained to 5 | be integers, and versions (timestamps in the Rust codebase) are constrained to be 6 | Version objects (integer tuples ordered by the product partial order). 7 | """ 8 | 9 | from collections import defaultdict 10 | 11 | from collection import Collection 12 | from graph import ( 13 | BinaryOperator, 14 | DifferenceStreamReader, 15 | DifferenceStreamWriter, 16 | Graph, 17 | MessageType, 18 | UnaryOperator, 19 | ) 20 | from index import Index 21 | from order import Version, Antichain 22 | 23 | 24 | class DifferenceStreamBuilder: 25 | """A representation of a dataflow edge as the dataflow graph is being built. 26 | 27 | This object is only used to set up the dataflow graph, and does not actually 28 | interact with any data. 
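
    A minimal usage sketch, assuming graph_builder is a GraphBuilder instance
    (the names below are illustrative only):

        input_stream, input_writer = graph_builder.new_input()
        doubled = input_stream.map(lambda data: data * 2)
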
Manually creating an instance of this object is highly 29 | unexpected - instead more normal usage would be to create an instance using 30 | the new_input method on GraphBuilder. 31 | """ 32 | 33 | def __init__(self, graph): 34 | self._writer = DifferenceStreamWriter() 35 | self.graph = graph 36 | 37 | def connect_reader(self): 38 | return self._writer._new_reader() 39 | 40 | def writer(self): 41 | return self._writer 42 | 43 | def map(self, f): 44 | output = DifferenceStreamBuilder(self.graph) 45 | operator = MapOperator( 46 | self.connect_reader(), output.writer(), f, self.graph.frontier() 47 | ) 48 | self.graph.add_operator(operator) 49 | self.graph.add_stream(output.connect_reader()) 50 | return output 51 | 52 | def filter(self, f): 53 | output = DifferenceStreamBuilder(self.graph) 54 | operator = FilterOperator( 55 | self.connect_reader(), output.writer(), f, self.graph.frontier() 56 | ) 57 | self.graph.add_operator(operator) 58 | self.graph.add_stream(output.connect_reader()) 59 | return output 60 | 61 | def negate(self): 62 | output = DifferenceStreamBuilder(self.graph) 63 | operator = NegateOperator( 64 | self.connect_reader(), output.writer(), self.graph.frontier() 65 | ) 66 | self.graph.add_operator(operator) 67 | self.graph.add_stream(output.connect_reader()) 68 | return output 69 | 70 | def concat(self, other): 71 | assert id(self.graph) == id(other.graph) 72 | output = DifferenceStreamBuilder(self.graph) 73 | operator = ConcatOperator( 74 | self.connect_reader(), 75 | other.connect_reader(), 76 | output.writer(), 77 | self.graph.frontier(), 78 | ) 79 | self.graph.add_operator(operator) 80 | self.graph.add_stream(output.connect_reader()) 81 | return output 82 | 83 | def debug(self, name=""): 84 | output = DifferenceStreamBuilder(self.graph) 85 | operator = DebugOperator( 86 | self.connect_reader(), output.writer(), name, self.graph.frontier() 87 | ) 88 | self.graph.add_operator(operator) 89 | self.graph.add_stream(output.connect_reader()) 90 | return output 91 | 92 | def join(self, other): 93 | assert id(self.graph) == id(other.graph) 94 | output = DifferenceStreamBuilder(self.graph) 95 | operator = JoinOperator( 96 | self.connect_reader(), 97 | other.connect_reader(), 98 | output.writer(), 99 | self.graph.frontier(), 100 | ) 101 | self.graph.add_operator(operator) 102 | self.graph.add_stream(output.connect_reader()) 103 | return output 104 | 105 | def count(self): 106 | output = DifferenceStreamBuilder(self.graph) 107 | operator = CountOperator( 108 | self.connect_reader(), output.writer(), self.graph.frontier() 109 | ) 110 | self.graph.add_operator(operator) 111 | self.graph.add_stream(output.connect_reader()) 112 | return output 113 | 114 | def consolidate(self): 115 | output = DifferenceStreamBuilder(self.graph) 116 | operator = ConsolidateOperator( 117 | self.connect_reader(), output.writer(), self.graph.frontier() 118 | ) 119 | self.graph.add_operator(operator) 120 | self.graph.add_stream(output.connect_reader()) 121 | return output 122 | 123 | def distinct(self): 124 | output = DifferenceStreamBuilder(self.graph) 125 | operator = DistinctOperator( 126 | self.connect_reader(), output.writer(), self.graph.frontier() 127 | ) 128 | self.graph.add_operator(operator) 129 | self.graph.add_stream(output.connect_reader()) 130 | return output 131 | 132 | def _start_scope(self): 133 | new_frontier = self.graph.frontier().extend() 134 | self.graph.push_frontier(new_frontier) 135 | 136 | def _end_scope(self): 137 | self.graph.pop_frontier() 138 | 139 | def _ingress(self): 140 | 
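        # Moves this stream into the new iteration scope: IngressOperator (defined
        # below) re-versions each record by extending its version with an innermost
        # iteration coordinate, presumably starting at 0, so a record at
        # Version([2]) would enter the scope at roughly Version([2, 0]).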
output = DifferenceStreamBuilder(self.graph) 141 | operator = IngressOperator( 142 | self.connect_reader(), output.writer(), self.graph.frontier() 143 | ) 144 | self.graph.add_operator(operator) 145 | self.graph.add_stream(output.connect_reader()) 146 | return output 147 | 148 | def _egress(self): 149 | output = DifferenceStreamBuilder(self.graph) 150 | operator = EgressOperator( 151 | self.connect_reader(), output.writer(), self.graph.frontier() 152 | ) 153 | self.graph.add_operator(operator) 154 | self.graph.add_stream(output.connect_reader()) 155 | return output 156 | 157 | def iterate(self, f): 158 | self._start_scope() 159 | feedback_stream = DifferenceStreamBuilder(self.graph) 160 | entered = self._ingress().concat(feedback_stream) 161 | result = f(entered) 162 | feedback_operator = FeedbackOperator( 163 | result.connect_reader(), 1, feedback_stream.writer(), self.graph.frontier() 164 | ) 165 | self.graph.add_stream(feedback_stream) 166 | self.graph.add_operator(feedback_operator) 167 | self._end_scope() 168 | return result._egress() 169 | 170 | 171 | class GraphBuilder: 172 | """A representation of a dataflow graph as it is being built.""" 173 | 174 | def __init__(self, initial_frontier): 175 | self.streams = [] 176 | self.operators = [] 177 | self.frontier_stack = [initial_frontier] 178 | 179 | def new_input(self): 180 | stream_builder = DifferenceStreamBuilder(self) 181 | self.streams.append(stream_builder.connect_reader()) 182 | return stream_builder, stream_builder.writer() 183 | 184 | def add_operator(self, operator): 185 | self.operators.append(operator) 186 | 187 | def add_stream(self, stream): 188 | self.streams.append(stream) 189 | 190 | def frontier(self): 191 | return self.frontier_stack[-1] 192 | 193 | def push_frontier(self, new_frontier): 194 | self.frontier_stack.append(new_frontier) 195 | 196 | def pop_frontier(self): 197 | self.frontier_stack.pop() 198 | 199 | def finalize(self): 200 | return Graph(self.streams, self.operators) 201 | 202 | 203 | class LinearUnaryOperator(UnaryOperator): 204 | def __init__(self, input_a, output, f, initial_frontier): 205 | def inner(): 206 | for (typ, msg) in self.input_messages(): 207 | if typ == MessageType.DATA: 208 | version, collection = msg 209 | self.output.send_data(version, f(collection)) 210 | elif typ == MessageType.FRONTIER: 211 | frontier = msg 212 | assert self.input_frontier().less_equal(frontier) 213 | self.set_input_frontier(frontier) 214 | 215 | assert self.output_frontier.less_equal(self.input_frontier()) 216 | if self.output_frontier.less_than(self.input_frontier()): 217 | self.output_frontier = self.input_frontier() 218 | self.output.send_frontier(self.output_frontier) 219 | 220 | super().__init__(input_a, output, inner, initial_frontier) 221 | 222 | 223 | class MapOperator(LinearUnaryOperator): 224 | def __init__(self, input_a, output, f, initial_frontier): 225 | def map_inner(collection): 226 | return collection.map(f) 227 | 228 | super().__init__(input_a, output, map_inner, initial_frontier) 229 | 230 | 231 | class FilterOperator(LinearUnaryOperator): 232 | def __init__(self, input_a, output, f, initial_frontier): 233 | def filter_inner(collection): 234 | return collection.filter(f) 235 | 236 | super().__init__(input_a, output, filter_inner, initial_frontier) 237 | 238 | 239 | class NegateOperator(LinearUnaryOperator): 240 | def __init__(self, input_a, output, initial_frontier): 241 | def negate_inner(collection): 242 | return collection.negate() 243 | 244 | super().__init__(input_a, output, negate_inner, 
initial_frontier) 245 | 246 | 247 | class ConcatOperator(BinaryOperator): 248 | def __init__(self, input_a, input_b, output, initial_frontier): 249 | def inner(): 250 | for (typ, msg) in self.input_a_messages(): 251 | if typ == MessageType.DATA: 252 | version, collection = msg 253 | self.output.send_data(version, collection) 254 | elif typ == MessageType.FRONTIER: 255 | frontier = msg 256 | assert self.input_a_frontier().less_equal(frontier) 257 | self.set_input_a_frontier(frontier) 258 | for (typ, msg) in self.input_b_messages(): 259 | if typ == MessageType.DATA: 260 | version, collection = msg 261 | self.output.send_data(version, collection) 262 | elif typ == MessageType.FRONTIER: 263 | frontier = msg 264 | assert self.input_b_frontier().less_equal(frontier) 265 | self.set_input_b_frontier(frontier) 266 | 267 | input_frontier = self.input_a_frontier().meet(self.input_b_frontier()) 268 | assert self.output_frontier.less_equal(input_frontier) 269 | if self.output_frontier.less_than(input_frontier): 270 | self.output_frontier = input_frontier 271 | self.output.send_frontier(self.output_frontier) 272 | 273 | super().__init__(input_a, input_b, output, inner, initial_frontier) 274 | 275 | 276 | class ConsolidateOperator(UnaryOperator): 277 | def __init__(self, input_a, output, initial_frontier): 278 | self.collections = defaultdict(Collection) 279 | 280 | def inner(): 281 | for (typ, msg) in self.input_messages(): 282 | if typ == MessageType.DATA: 283 | version, collection = msg 284 | self.collections[version]._extend(collection) 285 | elif typ == MessageType.FRONTIER: 286 | frontier = msg 287 | assert self.input_frontier().less_equal(frontier) 288 | self.set_input_frontier(frontier) 289 | finished_versions = [ 290 | version 291 | for version in self.collections.keys() 292 | if self.input_frontier().less_equal_version(version) is not True 293 | ] 294 | for version in finished_versions: 295 | collection = self.collections.pop(version).consolidate() 296 | self.output.send_data(version, collection) 297 | assert self.output_frontier.less_equal(self.input_frontier()) 298 | if self.output_frontier.less_than(self.input_frontier()): 299 | self.output_frontier = self.input_frontier() 300 | self.output.send_frontier(self.output_frontier) 301 | 302 | super().__init__(input_a, output, inner, initial_frontier) 303 | 304 | 305 | class DebugOperator(UnaryOperator): 306 | def __init__(self, input_a, output, name, initial_frontier): 307 | def inner(): 308 | for (typ, msg) in self.input_messages(): 309 | if typ == MessageType.DATA: 310 | version, collection = msg 311 | print( 312 | f"debug {name} data: version: {version} collection: {collection}" 313 | ) 314 | self.output.send_data(version, collection) 315 | elif typ == MessageType.FRONTIER: 316 | frontier = msg 317 | assert self.input_frontier().less_equal(frontier) 318 | self.set_input_frontier(frontier) 319 | print(f"debug {name} notification: frontier {frontier}") 320 | assert self.output_frontier.less_equal(self.input_frontier()) 321 | if self.output_frontier.less_than(self.input_frontier()): 322 | self.output_frontier = self.input_frontier() 323 | self.output.send_frontier(self.output_frontier) 324 | 325 | super().__init__(input_a, output, inner, initial_frontier) 326 | 327 | 328 | class JoinOperator(BinaryOperator): 329 | def __init__(self, input_a, input_b, output, initial_frontier): 330 | self.index_a = Index() 331 | self.index_b = Index() 332 | 333 | def inner(): 334 | delta_a = Index() 335 | delta_b = Index() 336 | for (typ, msg) in 
self.input_a_messages(): 337 | if typ == MessageType.DATA: 338 | version, collection = msg 339 | for ((key, value), multiplicity) in collection._inner: 340 | delta_a.add_value(key, version, (value, multiplicity)) 341 | elif typ == MessageType.FRONTIER: 342 | frontier = msg 343 | assert self.input_a_frontier().less_equal(frontier) 344 | self.set_input_a_frontier(frontier) 345 | for (typ, msg) in self.input_b_messages(): 346 | if typ == MessageType.DATA: 347 | version, collection = msg 348 | for ((key, value), multiplicity) in collection._inner: 349 | delta_b.add_value(key, version, (value, multiplicity)) 350 | elif typ == MessageType.FRONTIER: 351 | frontier = msg 352 | assert self.input_b_frontier().less_equal(frontier) 353 | self.set_input_b_frontier(frontier) 354 | 355 | results = defaultdict(Collection) 356 | for (version, collection) in delta_a.join(self.index_b): 357 | results[version]._extend(collection) 358 | 359 | self.index_a.append(delta_a) 360 | 361 | for (version, collection) in self.index_a.join(delta_b): 362 | results[version]._extend(collection) 363 | 364 | for (version, collection) in results.items(): 365 | self.output.send_data(version, collection) 366 | self.index_b.append(delta_b) 367 | 368 | input_frontier = self.input_a_frontier().meet(self.input_b_frontier()) 369 | assert self.output_frontier.less_equal(input_frontier) 370 | if self.output_frontier.less_than(input_frontier): 371 | self.output_frontier = input_frontier 372 | self.output.send_frontier(self.output_frontier) 373 | self.index_a.compact(self.output_frontier) 374 | self.index_b.compact(self.output_frontier) 375 | 376 | super().__init__(input_a, input_b, output, inner, initial_frontier) 377 | 378 | 379 | class ReduceOperator(UnaryOperator): 380 | def __init__(self, input_a, output, f, initial_frontier): 381 | self.index = Index() 382 | self.index_out = Index() 383 | self.keys_todo = defaultdict(set) 384 | 385 | def subtract_values(first, second): 386 | result = defaultdict(int) 387 | for (v1, m1) in first: 388 | result[v1] += m1 389 | for (v2, m2) in second: 390 | result[v2] -= m2 391 | 392 | return [ 393 | (val, multiplicity) 394 | for (val, multiplicity) in result.items() 395 | if multiplicity != 0 396 | ] 397 | 398 | def inner(): 399 | for (typ, msg) in self.input_messages(): 400 | if typ == MessageType.DATA: 401 | version, collection = msg 402 | for ((key, value), multiplicity) in collection._inner: 403 | self.index.add_value(key, version, (value, multiplicity)) 404 | self.keys_todo[version].add(key) 405 | for v2 in self.index.versions(key): 406 | self.keys_todo[version.join(v2)].add(key) 407 | elif typ == MessageType.FRONTIER: 408 | frontier = msg 409 | assert self.input_frontier().less_equal(frontier) 410 | self.set_input_frontier(frontier) 411 | 412 | finished_versions = [ 413 | version 414 | for version in self.keys_todo.keys() 415 | if self.input_frontier().less_equal_version(version) is not True 416 | ] 417 | 418 | finished_versions.sort() 419 | for version in finished_versions: 420 | keys = self.keys_todo.pop(version) 421 | result = [] 422 | for key in keys: 423 | curr = self.index.reconstruct_at(key, version) 424 | curr_out = self.index_out.reconstruct_at(key, version) 425 | out = f(curr) 426 | delta = subtract_values(out, curr_out) 427 | for (value, multiplicity) in delta: 428 | result.append(((key, value), multiplicity)) 429 | self.index_out.add_value(key, version, (value, multiplicity)) 430 | if result != []: 431 | self.output.send_data(version, Collection(result)) 432 | 433 | assert 
self.output_frontier.less_equal(self.input_frontier()) 434 | if self.output_frontier.less_than(self.input_frontier()): 435 | self.output_frontier = self.input_frontier() 436 | self.output.send_frontier(self.output_frontier) 437 | self.index.compact(self.output_frontier) 438 | self.index_out.compact(self.output_frontier) 439 | 440 | super().__init__(input_a, output, inner, initial_frontier) 441 | 442 | 443 | class CountOperator(ReduceOperator): 444 | def __init__(self, input_a, output, initial_frontier): 445 | def count_inner(vals): 446 | out = 0 447 | for (_, diff) in vals: 448 | out += diff 449 | return [(out, 1)] 450 | 451 | super().__init__(input_a, output, count_inner, initial_frontier) 452 | 453 | 454 | class DistinctOperator(ReduceOperator): 455 | def __init__(self, input_a, output, initial_frontier): 456 | def distinct_inner(vals): 457 | consolidated = defaultdict(int) 458 | for (val, diff) in vals: 459 | consolidated[val] += diff 460 | for (val, diff) in consolidated.items(): 461 | assert diff >= 0 462 | return [(val, 1) for (val, diff) in consolidated.items() if diff > 0] 463 | 464 | super().__init__(input_a, output, distinct_inner, initial_frontier) 465 | 466 | 467 | class FeedbackOperator(UnaryOperator): 468 | def __init__(self, input_a, step, output, initial_frontier): 469 | # Map from top-level version -> set of messages where we have 470 | # sent some data at that version 471 | self.in_flight_data = defaultdict(set) 472 | # Versions where a given top-level version has updated 473 | # its iteration without sending any data. 474 | self.empty_versions = defaultdict(set) 475 | 476 | def inner(): 477 | for (typ, msg) in self.input_messages(): 478 | if typ == MessageType.DATA: 479 | version, collection = msg 480 | new_version = version.apply_step(step) 481 | truncated = new_version.truncate() 482 | self.output.send_data(new_version, collection) 483 | 484 | # Record that we sent data at this version. 485 | self.in_flight_data[truncated].add(new_version) 486 | # Make sure we track that we are iterating at this top-level 487 | # version if we haven't already 488 | if truncated not in self.empty_versions: 489 | self.empty_versions[truncated] = set() 490 | elif typ == MessageType.FRONTIER: 491 | frontier = msg 492 | assert self.input_frontier().less_equal(frontier) 493 | self.set_input_frontier(frontier) 494 | 495 | # Increment the current input frontier 496 | incremented_input_frontier = self.input_frontier().apply_step(step) 497 | # Grab all of the elements from the potential output frontier. 498 | elements = incremented_input_frontier._elements() 499 | # Partition every element from this potential output frontier into one of 500 | # two sets, either elements to keep, or elements to reject. 501 | candidate_output_frontier = [] 502 | rejected = [] 503 | for elem in elements: 504 | truncated = elem.truncate() 505 | 506 | # Always keep a frontier element if there is are differences associated 507 | # with its top-level version that are still in flight. 508 | if len(self.in_flight_data[truncated]) != 0: 509 | candidate_output_frontier.append(elem) 510 | 511 | # We can stop remembering any versions that will be closed 512 | # by this frontier element. 513 | closed = { 514 | x for x in self.in_flight_data[truncated] if x.less_than(elem) 515 | } 516 | self.in_flight_data[truncated] -= closed 517 | else: 518 | # This frontier element does not have any differences associated with its 519 | # top-level version that were not closed out by prior frontier updates. 
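                    # In other words, iteration at this top-level version appears to
                    # have stopped producing new differences. The counting below
                    # gives it a few extra rounds before the version is dropped,
                    # presumably so that any update still circulating around the
                    # loop has a chance to arrive.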
520 | 521 | # Remember that we observed an "empty" update for this top-level version. 522 | self.empty_versions[truncated].add(elem) 523 | 524 | # Don't do anything if we haven't observed at least three "empty" frontier 525 | # updates for this top-level time. 526 | if len(self.empty_versions[truncated]) <= 3: 527 | candidate_output_frontier.append(elem) 528 | else: 529 | self.in_flight_data.pop(truncated) 530 | self.empty_versions.pop(truncated) 531 | rejected.append(elem) 532 | 533 | # Ensure that we can still send data at all other top-level 534 | # versions that were not rejected. 535 | for r in rejected: 536 | for truncated in self.in_flight_data.keys(): 537 | candidate_output_frontier.append(r.join(truncated.extend())) 538 | 539 | # Construct a minimal antichain from the set of candidate elements. 540 | candidate_output_frontier = Antichain(candidate_output_frontier) 541 | 542 | assert self.output_frontier.less_equal(candidate_output_frontier) 543 | if self.output_frontier.less_than(candidate_output_frontier): 544 | self.output_frontier = candidate_output_frontier 545 | self.output.send_frontier(self.output_frontier) 546 | 547 | super().__init__(input_a, output, inner, initial_frontier) 548 | 549 | def connect_loop(output): 550 | self.output = output 551 | 552 | 553 | class IngressOperator(UnaryOperator): 554 | def __init__(self, input_a, output, initial_frontier): 555 | def inner(): 556 | for (typ, msg) in self.input_messages(): 557 | if typ == MessageType.DATA: 558 | version, collection = msg 559 | new_version = version.extend() 560 | self.output.send_data(new_version, collection) 561 | self.output.send_data( 562 | new_version.apply_step(1), collection.negate() 563 | ) 564 | elif typ == MessageType.FRONTIER: 565 | frontier = msg 566 | new_frontier = frontier.extend() 567 | assert self.input_frontier().less_equal(new_frontier) 568 | self.set_input_frontier(new_frontier) 569 | 570 | assert self.output_frontier.less_equal(self.input_frontier()) 571 | if self.output_frontier.less_than(self.input_frontier()): 572 | self.output_frontier = self.input_frontier() 573 | self.output.send_frontier(self.output_frontier) 574 | 575 | super().__init__(input_a, output, inner, initial_frontier) 576 | 577 | 578 | class EgressOperator(UnaryOperator): 579 | def __init__(self, input_a, output, initial_frontier): 580 | def inner(): 581 | for (typ, msg) in self.input_messages(): 582 | if typ == MessageType.DATA: 583 | version, collection = msg 584 | new_version = version.truncate() 585 | self.output.send_data(new_version, collection) 586 | elif typ == MessageType.FRONTIER: 587 | frontier = msg 588 | new_frontier = frontier.truncate() 589 | assert self.input_frontier().less_equal(new_frontier) 590 | self.set_input_frontier(new_frontier) 591 | 592 | assert self.output_frontier.less_equal(self.input_frontier()) 593 | if self.output_frontier.less_than(self.input_frontier()): 594 | self.output_frontier = self.input_frontier() 595 | self.output.send_frontier(self.output_frontier) 596 | 597 | super().__init__(input_a, output, inner, initial_frontier) 598 | 599 | 600 | if __name__ == "__main__": 601 | graph_builder = GraphBuilder(Antichain([Version([0, 0])])) 602 | input_a, input_a_writer = graph_builder.new_input() 603 | output = input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 604 | input_a.negate().concat(output).debug("output") 605 | graph = graph_builder.finalize() 606 | 607 | for i in range(0, 10): 608 | input_a_writer.send_data(Version([0, i]), Collection([(i, 1)])) 609 | 
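        # The frontier sent below is a two-element antichain; for i > 0,
        # Version([i, 0]) and Version([0, i]) are incomparable under the product
        # partial order, and a version only counts as finished once no frontier
        # element is less than or equal to it.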
        input_a_writer.send_frontier(Antichain([Version([i, 0]), Version([0, i])]))
610 |         graph.step()
611 | 
612 |     graph_builder = GraphBuilder(Antichain([Version([0, 0])]))
613 |     input_a, input_a_writer = graph_builder.new_input()
614 |     input_b, input_b_writer = graph_builder.new_input()
615 | 
616 |     input_a.join(input_b).count().debug("count")
617 |     graph = graph_builder.finalize()
618 | 
619 |     for i in range(0, 2):
620 |         input_a_writer.send_data(Version([0, i]), Collection([((1, i), 2)]))
621 |         input_a_writer.send_data(Version([0, i]), Collection([((2, i), 2)]))
622 | 
623 |         a_frontier = Antichain([Version([i + 2, 0]), Version([0, i])])
624 |         input_a_writer.send_frontier(a_frontier)
625 |         input_b_writer.send_data(Version([i, 0]), Collection([((1, i + 2), 2)]))
626 |         input_b_writer.send_data(Version([i, 0]), Collection([((2, i + 3), 2)]))
627 |         input_b_writer.send_frontier(Antichain([Version([i, 0]), Version([0, i * 2])]))
628 |         graph.step()
629 | 
630 |     input_a_writer.send_frontier(Antichain([Version([11, 11])]))
631 |     input_b_writer.send_frontier(Antichain([Version([11, 11])]))
632 |     graph.step()
633 | 
634 |     graph_builder = GraphBuilder(Antichain([Version(0)]))
635 |     input_a, input_a_writer = graph_builder.new_input()
636 | 
637 |     def geometric_series(collection):
638 |         return (
639 |             collection.map(lambda data: data * 2)
640 |             .concat(collection)
641 |             .filter(lambda data: data <= 50)
642 |             .map(lambda data: (data, ()))
643 |             .distinct()
644 |             .map(lambda data: data[0])
645 |             .consolidate()
646 |         )
647 | 
648 |     output = input_a.iterate(geometric_series).debug("iterate").connect_reader()
649 |     graph = graph_builder.finalize()
650 | 
651 |     input_a_writer.send_data(Version(0), Collection([(1, 1)]))
652 |     input_a_writer.send_frontier(Antichain([Version(1)]))
653 | 
654 |     while output.probe_frontier_less_than(Antichain([Version(1)])):
655 |         graph.step()
656 | 
657 |     input_a_writer.send_data(Version(1), Collection([(16, 1), (3, 1)]))
658 |     input_a_writer.send_frontier(Antichain([Version(2)]))
659 | 
660 |     while output.probe_frontier_less_than(Antichain([Version(2)])):
661 |         graph.step()
662 | 
663 |     input_a_writer.send_data(Version(2), Collection([(3, -1)]))
664 |     input_a_writer.send_frontier(Antichain([Version(3)]))
665 | 
666 |     while output.probe_frontier_less_than(Antichain([Version(3)])):
667 |         graph.step()
668 | 
--------------------------------------------------------------------------------