├── v3 ├── collection.py ├── index.py ├── graph.py └── differential_dataflow.py ├── v4 ├── collection.py ├── version.py ├── index.py ├── graph.py └── differential_dataflow.py ├── v1 ├── collection.py ├── index.py └── difference_sequence.py ├── v2 ├── collection.py ├── index.py ├── graph.py └── differential_dataflow.py ├── example.py ├── index.py ├── graph.py ├── order.py ├── README.md ├── v0 └── collection.py ├── collection.py └── differential_dataflow.py /v3/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def consolidate(self): 40 | """Produce as output a collection that is logically equivalent to the input 41 | but which combines identical instances of the same record into one 42 | (record, multiplicity) pair. 43 | """ 44 | consolidated = defaultdict(int) 45 | for (data, multiplicity) in self._inner: 46 | consolidated[data] += multiplicity 47 | consolidated = [ 48 | (data, multiplicity) 49 | for (data, multiplicity) in consolidated.items() 50 | if multiplicity != 0 51 | ] 52 | consolidated.sort() 53 | return Collection(consolidated) 54 | 55 | def _extend(self, other): 56 | self._inner.extend(other._inner) 57 | -------------------------------------------------------------------------------- /v4/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def consolidate(self): 40 | """Produce as output a collection that is logically equivalent to the input 41 | but which combines identical instances of the same record into one 42 | (record, multiplicity) pair. 
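        For example, [('a', 1), ('b', 2), ('a', 2), ('b', -2)] consolidates to [('a', 3)]:
        the two 'a' multiplicities sum to 3, and the 'b' multiplicities cancel out and are dropped.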
43 | """ 44 | consolidated = defaultdict(int) 45 | for (data, multiplicity) in self._inner: 46 | consolidated[data] += multiplicity 47 | consolidated = [ 48 | (data, multiplicity) 49 | for (data, multiplicity) in consolidated.items() 50 | if multiplicity != 0 51 | ] 52 | consolidated.sort() 53 | return Collection(consolidated) 54 | 55 | def _extend(self, other): 56 | self._inner.extend(other._inner) 57 | -------------------------------------------------------------------------------- /v1/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair. 50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def _extend(self, other): 63 | self._inner.extend(other._inner) 64 | -------------------------------------------------------------------------------- /v2/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 
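Records are stored as (data, multiplicity) pairs; a negative multiplicity represents a
retraction (deletion) of that record.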
2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair. 50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def _extend(self, other): 63 | self._inner.extend(other._inner) 64 | -------------------------------------------------------------------------------- /v4/version.py: -------------------------------------------------------------------------------- 1 | """The implementation of totally ordered, multidimensional versions (times) for use within a differential dataflow. 2 | """ 3 | 4 | 5 | class Version: 6 | """A totally ordered version (time), consisting of a tuple of 7 | integers, ordered lexicographically. 8 | 9 | All versions within a scope of a dataflow must have the same dimension/number 10 | of coordinates. 
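    For example, Version((0, 3)) < Version((1, 0)) < Version((1, 2)) under the
    lexicographic order.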
11 | """ 12 | 13 | def __init__(self, version): 14 | if isinstance(version, int): 15 | assert version >= 0 16 | self.inner = (version,) 17 | elif isinstance(version, list) or isinstance(version, tuple): 18 | for i in version: 19 | assert isinstance(i, int) 20 | assert i >= 0 21 | self.inner = tuple(version) 22 | else: 23 | assert 0 > 1 24 | 25 | def __repr__(self): 26 | return f"Version({self.inner})" 27 | 28 | def __eq__(self, other): 29 | return self.inner == other.inner 30 | 31 | def __lt__(self, other): 32 | return self.inner.__lt__(other.inner) 33 | 34 | def __le__(self, other): 35 | return self.__lt__(other) or self.__eq__(other) 36 | 37 | def __hash__(self): 38 | return hash(self.inner) 39 | 40 | def _validate(self, other): 41 | assert len(self.inner) > 0 42 | assert len(self.inner) == len(other.inner) 43 | 44 | def extend(self): 45 | elements = [e for e in self.inner] 46 | elements.append(0) 47 | return Version(elements) 48 | 49 | def truncate(self): 50 | elements = [e for e in self.inner] 51 | elements.pop() 52 | return Version(elements) 53 | 54 | def apply_step(self, step, max_value): 55 | assert step > 0 56 | assert len(self.inner) > 1 57 | elements = [e for e in self.inner] 58 | 59 | pos = 1 60 | while True: 61 | if elements[-pos] < max_value or pos == len(elements): 62 | elements[-pos] += step 63 | break 64 | else: 65 | elements[-pos] = 0 66 | pos += 1 67 | output = Version(elements) 68 | assert output > self 69 | return output 70 | -------------------------------------------------------------------------------- /v1/index.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collection import Collection 3 | 4 | 5 | class Index: 6 | def __init__(self, compaction_frontier=None): 7 | self._index = defaultdict(list) 8 | 9 | def __repr__(self): 10 | return "Index({self._index})" 11 | 12 | def add_value(self, key, value): 13 | """Add a (value, multiplicity) pair for the requested key.""" 14 | self._index[key].append(value) 15 | 16 | def append(self, other): 17 | """Combine all of the data in other into self.""" 18 | for (key, data) in other._index.items(): 19 | self._index[key].extend(data) 20 | 21 | def get(self, key): 22 | if key in self._index: 23 | return self._index[key] 24 | return [] 25 | 26 | def join(self, other): 27 | """Produce a bounded collection trace containing (key, (val1, val2)) 28 | for all (key, val1) in the first index, and (key, val2) in the second 29 | index. 
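        For example, if this index maps 'k' to [(1, 2)] and the other index maps 'k'
        to [(3, 1)], the output collection contains (('k', (1, 3)), 2).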
30 | """ 31 | out = [] 32 | for (key, data1) in self._index.items(): 33 | if key not in other._index: 34 | continue 35 | data2 = other._index[key] 36 | 37 | for (val1, mul1) in data1: 38 | for (val2, mul2) in data2: 39 | out.append(((key, (val1, val2)), mul1 * mul2)) 40 | return Collection(out) 41 | 42 | def compact(self, keys=[]): 43 | def consolidate_values(values): 44 | consolidated = defaultdict(int) 45 | for (value, multiplicity) in values: 46 | consolidated[value] += multiplicity 47 | 48 | return [ 49 | (value, multiplicity) 50 | for (value, multiplicity) in consolidated.items() 51 | if multiplicity != 0 52 | ] 53 | 54 | if keys == []: 55 | keys = [key for key in self._index.keys()] 56 | 57 | for key in keys: 58 | if key not in self._index: 59 | continue 60 | data = self._index.pop(key) 61 | consolidated = consolidate_values(data) 62 | 63 | if consolidated != []: 64 | self._index[key].extend(consolidated) 65 | -------------------------------------------------------------------------------- /v2/index.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collection import Collection 3 | 4 | 5 | class Index: 6 | def __init__(self, compaction_frontier=None): 7 | self._index = defaultdict(list) 8 | 9 | def __repr__(self): 10 | return "Index({self._index})" 11 | 12 | def add_value(self, key, value): 13 | """Add a (value, multiplicity) pair for the requested key.""" 14 | self._index[key].append(value) 15 | 16 | def append(self, other): 17 | """Combine all of the data in other into self.""" 18 | for (key, data) in other._index.items(): 19 | self._index[key].extend(data) 20 | 21 | def get(self, key): 22 | if key in self._index: 23 | return self._index[key] 24 | return [] 25 | 26 | def join(self, other): 27 | """Produce a bounded collection trace containing (key, (val1, val2)) 28 | for all (key, val1) in the first index, and (key, val2) in the second 29 | index. 
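        Multiplicities multiply: a value with multiplicity m1 joined against a value
        with multiplicity m2 appears in the output with multiplicity m1 * m2.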
30 | """ 31 | out = [] 32 | for (key, data1) in self._index.items(): 33 | if key not in other._index: 34 | continue 35 | data2 = other._index[key] 36 | 37 | for (val1, mul1) in data1: 38 | for (val2, mul2) in data2: 39 | out.append(((key, (val1, val2)), mul1 * mul2)) 40 | return Collection(out) 41 | 42 | def compact(self, keys=[]): 43 | def consolidate_values(values): 44 | consolidated = defaultdict(int) 45 | for (value, multiplicity) in values: 46 | consolidated[value] += multiplicity 47 | 48 | return [ 49 | (value, multiplicity) 50 | for (value, multiplicity) in consolidated.items() 51 | if multiplicity != 0 52 | ] 53 | 54 | if keys == []: 55 | keys = [key for key in self._index.keys()] 56 | 57 | for key in keys: 58 | if key not in self._index: 59 | continue 60 | data = self._index.pop(key) 61 | consolidated = consolidate_values(data) 62 | 63 | if consolidated != []: 64 | self._index[key].extend(consolidated) 65 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from collection import Collection 2 | from order import Version, Antichain 3 | from differential_dataflow import GraphBuilder 4 | 5 | 6 | def game_of_life(collection): 7 | maybe_live_cells = collection.map(lambda data: ((data[0] - 1, data[1] - 1), ())) 8 | maybe_live_cells = maybe_live_cells.concat( 9 | collection.map(lambda data: ((data[0] - 1, data[1]), ())) 10 | ) 11 | 12 | maybe_live_cells = maybe_live_cells.concat( 13 | collection.map(lambda data: ((data[0] - 1, data[1] + 1), ())) 14 | ) 15 | maybe_live_cells = maybe_live_cells.concat( 16 | collection.map(lambda data: ((data[0], data[1] - 1), ())) 17 | ) 18 | maybe_live_cells = maybe_live_cells.concat( 19 | collection.map(lambda data: ((data[0], data[1] + 1), ())) 20 | ) 21 | maybe_live_cells = maybe_live_cells.concat( 22 | collection.map(lambda data: ((data[0] + 1, data[1] - 1), ())) 23 | ) 24 | maybe_live_cells = maybe_live_cells.concat( 25 | collection.map(lambda data: ((data[0] + 1, data[1]), ())) 26 | ) 27 | maybe_live_cells = maybe_live_cells.concat( 28 | collection.map(lambda data: ((data[0] + 1, data[1] + 1), ())) 29 | ) 30 | 31 | maybe_live_cells = maybe_live_cells.count() 32 | live_with_three_neighbors = maybe_live_cells.filter(lambda data: data[1] == 3).map( 33 | lambda data: (data[0], ()) 34 | ) 35 | live_with_two_neighbors = ( 36 | maybe_live_cells.filter(lambda data: data[1] == 2) 37 | .join(collection.map(lambda data: (data, ()))) 38 | .map(lambda data: (data[0], ())) 39 | ) 40 | live_next_round = ( 41 | live_with_two_neighbors.concat(live_with_three_neighbors) 42 | .distinct() 43 | .map(lambda data: data[0]) 44 | ) 45 | 46 | return live_next_round 47 | 48 | 49 | graph_builder = GraphBuilder(Antichain([Version(0)])) 50 | input_a, input_a_writer = graph_builder.new_input() 51 | output = input_a.iterate(game_of_life).debug("iterate").connect_reader() 52 | graph = graph_builder.finalize() 53 | 54 | input_a_writer.send_data( 55 | Version(0), Collection([((2, 2), 1), ((2, 3), 1), ((2, 4), 1), ((3, 2), 1)]) 56 | ) 57 | input_a_writer.send_frontier(Antichain([Version(1)])) 58 | 59 | while output.probe_frontier_less_than(Antichain([Version(1)])): 60 | graph.step() 61 | -------------------------------------------------------------------------------- /v2/graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow 
program.""" 2 | 3 | from collections import deque 4 | 5 | 6 | class DifferenceStreamReader: 7 | """A read handle to a dataflow edge that receives data from a writer. 8 | 9 | The data received over this edge are Collection objects that represent difference 10 | collections representing a single logical collection undergoing changes. 11 | """ 12 | 13 | def __init__(self, queue): 14 | self._queue = queue 15 | 16 | def drain(self): 17 | out = [] 18 | while len(self._queue) > 0: 19 | out.append(self._queue.pop()) 20 | 21 | return out 22 | 23 | def is_empty(self): 24 | return len(self._queue) == 0 25 | 26 | 27 | class DifferenceStreamWriter: 28 | """A write handle to a dataflow edge that is allowed to publish data.""" 29 | 30 | def __init__(self): 31 | self._queues = [] 32 | 33 | def send_data(self, collection): 34 | for q in self._queues: 35 | q.appendleft(collection) 36 | 37 | def _new_reader(self): 38 | q = deque() 39 | self._queues.append(q) 40 | return DifferenceStreamReader(q) 41 | 42 | 43 | class Operator: 44 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 45 | one outgoing edge (write handle). 46 | """ 47 | 48 | def __init__(self, inputs, output, f): 49 | self.inputs = inputs 50 | self.output = output 51 | self.f = f 52 | self.pending_work = False 53 | 54 | def run(self): 55 | self.f() 56 | 57 | def pending_work(self): 58 | if self.pending_work is True: 59 | return True 60 | for input_listener in self.inputs: 61 | if input_listener.is_empty() is False: 62 | return True 63 | return False 64 | 65 | 66 | class UnaryOperator(Operator): 67 | """A convenience implementation of a dataflow operator that has a handle to one 68 | incoming stream of data, and one handle to an outgoing stream of data. 69 | """ 70 | 71 | def __init__(self, input_a, output, f): 72 | super().__init__([input_a], output, f) 73 | 74 | def input_messages(self): 75 | return self.inputs[0].drain() 76 | 77 | 78 | class BinaryOperator(Operator): 79 | """A convenience implementation of a dataflow operator that has a handle to two 80 | incoming streams of data, and one handle to an outgoing stream of data. 81 | """ 82 | 83 | def __init__(self, input_a, input_b, output, f): 84 | super().__init__([input_a, input_b], output, f) 85 | 86 | def input_a_messages(self): 87 | return self.inputs[0].drain() 88 | 89 | def input_b_messages(self): 90 | return self.inputs[1].drain() 91 | 92 | 93 | class Graph: 94 | """An implementation of a dataflow graph. 95 | 96 | This implementation needs to keep the entire set of nodes so that they 97 | may be run, and only keeps a set of read handles to all edges for debugging 98 | purposes. Calling this a graph instead of a 'bag of nodes' is misleading, because 99 | this object does not actually know anything about the connections between the 100 | various nodes. 101 | """ 102 | 103 | def __init__(self, streams, operators): 104 | self.streams = streams 105 | self.operators = operators 106 | 107 | def step(self): 108 | for op in self.operators: 109 | op.run() 110 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | """The implementation of index structures roughly analogous to differential arrangements for manipulating and 2 | accessing (key, value) structured data across multiple versions (times). 
3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | from order import Version, Antichain 8 | 9 | 10 | class Index: 11 | """A map from a difference collection trace's keys -> versions at which 12 | the key has nonzero multiplicity -> (value, multiplicities) that changed. 13 | 14 | Used in operations like join and reduce where the operation needs to 15 | exploit the key-value structure of the data to run efficiently. 16 | 17 | This implementation supports the general case of partially ordered versions. 18 | """ 19 | 20 | def __init__(self): 21 | self.inner = defaultdict(lambda: defaultdict(list)) 22 | # TODO: take an initial time? 23 | self.compaction_frontier = None 24 | 25 | def _validate(self, requested_version): 26 | if self.compaction_frontier is None: 27 | return True 28 | if isinstance(requested_version, Antichain): 29 | assert self.compaction_frontier.less_equal(requested_version) 30 | elif isinstance(requested_version, Version): 31 | assert self.compaction_frontier.less_equal_version(requested_version) 32 | 33 | def reconstruct_at(self, key, requested_version): 34 | self._validate(requested_version) 35 | out = [] 36 | for (version, values) in self.inner[key].items(): 37 | if version.less_equal(requested_version): 38 | out.extend(values) 39 | return out 40 | 41 | def versions(self, key): 42 | return [version for version in self.inner[key].keys()] 43 | 44 | def add_value(self, key, version, value): 45 | self._validate(version) 46 | self.inner[key][version].append(value) 47 | 48 | def append(self, other): 49 | for (key, versions) in other.inner.items(): 50 | for (version, data) in versions.items(): 51 | self.inner[key][version].extend(data) 52 | 53 | def join(self, other): 54 | collections = defaultdict(list) 55 | for (key, versions) in self.inner.items(): 56 | if key not in other.inner: 57 | continue 58 | other_versions = other.inner[key] 59 | 60 | for (version1, data1) in versions.items(): 61 | for (version2, data2) in other_versions.items(): 62 | for (val1, mul1) in data1: 63 | for (val2, mul2) in data2: 64 | result_version = version1.join(version2) 65 | collections[result_version].append( 66 | ((key, (val1, val2)), mul1 * mul2) 67 | ) 68 | return [ 69 | (version, Collection(c)) for (version, c) in collections.items() if c != [] 70 | ] 71 | 72 | def compact(self, compaction_frontier, keys=[]): 73 | self._validate(compaction_frontier) 74 | 75 | def consolidate_values(values): 76 | consolidated = defaultdict(int) 77 | for (value, multiplicity) in values: 78 | consolidated[value] += multiplicity 79 | 80 | return [ 81 | (value, multiplicity) 82 | for (value, multiplicity) in consolidated.items() 83 | if multiplicity != 0 84 | ] 85 | 86 | if keys == []: 87 | keys = [key for key in self.inner.keys()] 88 | 89 | for key in keys: 90 | versions = self.inner[key] 91 | to_compact = [ 92 | version 93 | for version in versions.keys() 94 | if compaction_frontier.less_equal_version(version) is not True 95 | ] 96 | to_consolidate = set() 97 | for version in to_compact: 98 | values = versions.pop(version) 99 | new_version = version.advance_by(compaction_frontier) 100 | versions[new_version].extend(values) 101 | to_consolidate.add(new_version) 102 | for version in to_consolidate: 103 | values = versions.pop(version) 104 | versions[version] = consolidate_values(values) 105 | assert self.compaction_frontier is None or self.compaction_frontier.less_equal( 106 | compaction_frontier 107 | ) 108 | self.compaction_frontier = compaction_frontier 109 | 
-------------------------------------------------------------------------------- /v3/index.py: -------------------------------------------------------------------------------- 1 | """The implementation of index structures roughly analogous to differential arrangements for manipulating and 2 | accessing (key, value) structured data across multiple versions (times). 3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | 8 | 9 | class Index: 10 | """A map from a difference collection trace's keys -> versions at which 11 | the key has nonzero multiplicity -> (value, multiplicities) that changed. 12 | 13 | Used in operations like join and reduce where the operation needs to 14 | exploit the key-value structure of the data to run efficiently. 15 | 16 | This implementation is specialized for the case when versions are integers. 17 | """ 18 | 19 | def __init__(self, compaction_frontier=None): 20 | self._index = defaultdict(lambda: defaultdict(list)) 21 | self.compaction_frontier = compaction_frontier 22 | 23 | def __repr__(self): 24 | return "Index1D({self._index}, {self.compaction_frontier})" 25 | 26 | def _validate(self, requested_version): 27 | """Check that requests are at times allowed by the compaction frontier.""" 28 | assert ( 29 | self.compaction_frontier is None 30 | or requested_version >= self.compaction_frontier 31 | ) 32 | 33 | def reconstruct_at(self, key, requested_version): 34 | """Produce the accumulated ((key, value), multiplicity) records for the given key, at the requested version.""" 35 | self._validate(requested_version) 36 | out = [] 37 | for (version, values) in self._index[key].items(): 38 | if version <= requested_version: 39 | out.extend(values) 40 | return out 41 | 42 | def add_value(self, key, version, value): 43 | """Add a (value, multiplicity) pair for the requested key and version.""" 44 | self._validate(version) 45 | self._index[key][version].append(value) 46 | 47 | def append(self, other): 48 | """Combine all of the data in other into self.""" 49 | for (key, versions) in other._index.items(): 50 | for (version, data) in versions.items(): 51 | self._index[key][version].extend(data) 52 | 53 | def join(self, other): 54 | """Produce a bounded collection trace containing (key, (val1, val2)) 55 | for all (key, val1) in the first index, and (key, val2) in the second 56 | index. 57 | 58 | All outputs are produced at output version = max(version of record 1, 59 | version of record 2). 60 | """ 61 | collections = defaultdict(list) 62 | for (key, versions) in self._index.items(): 63 | if key not in other._index: 64 | continue 65 | other_versions = other._index[key] 66 | 67 | for (version1, data1) in versions.items(): 68 | for (version2, data2) in other_versions.items(): 69 | result_version = max(version1, version2) 70 | for (val1, mul1) in data1: 71 | for (val2, mul2) in data2: 72 | collections[result_version].append( 73 | ((key, (val1, val2)), mul1 * mul2) 74 | ) 75 | return [ 76 | (version, Collection(c)) for (version, c) in collections.items() if c != [] 77 | ] 78 | 79 | def compact(self, compaction_version, keys=[]): 80 | """Combine all changes observed before the requested compaction_version 81 | into the compaction_version. 
82 | """ 83 | self._validate(compaction_version) 84 | 85 | def consolidate_values(values): 86 | consolidated = defaultdict(int) 87 | for (value, multiplicity) in values: 88 | consolidated[value] += multiplicity 89 | 90 | return [ 91 | (value, multiplicity) 92 | for (value, multiplicity) in consolidated.items() 93 | if multiplicity != 0 94 | ] 95 | 96 | if keys == []: 97 | keys = [key for key in self._index.keys()] 98 | 99 | for key in keys: 100 | versions = self._index[key] 101 | to_compact = [ 102 | version for version in versions.keys() if version <= compaction_version 103 | ] 104 | values = [] 105 | for version in to_compact: 106 | values.extend(versions.pop(version)) 107 | 108 | versions[compaction_version] = consolidate_values(values) 109 | self.compaction_frontier = compaction_version 110 | -------------------------------------------------------------------------------- /v4/index.py: -------------------------------------------------------------------------------- 1 | """The implementation of index structures roughly analogous to differential arrangements for manipulating and 2 | accessing (key, value) structured data across multiple versions (times). 3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | 8 | 9 | class Index: 10 | """A map from a difference collection trace's keys -> versions at which 11 | the key has nonzero multiplicity -> (value, multiplicities) that changed. 12 | 13 | Used in operations like join and reduce where the operation needs to 14 | exploit the key-value structure of the data to run efficiently. 15 | 16 | This implementation is specialized for the case when versions are integers. 17 | """ 18 | 19 | def __init__(self, compaction_frontier=None): 20 | self._index = defaultdict(lambda: defaultdict(list)) 21 | self.compaction_frontier = compaction_frontier 22 | 23 | def __repr__(self): 24 | return "Index1D({self._index}, {self.compaction_frontier})" 25 | 26 | def _validate(self, requested_version): 27 | """Check that requests are at times allowed by the compaction frontier.""" 28 | assert ( 29 | self.compaction_frontier is None 30 | or requested_version >= self.compaction_frontier 31 | ) 32 | 33 | def reconstruct_at(self, key, requested_version): 34 | """Produce the accumulated ((key, value), multiplicity) records for the given key, at the requested version.""" 35 | self._validate(requested_version) 36 | out = [] 37 | for (version, values) in self._index[key].items(): 38 | if version <= requested_version: 39 | out.extend(values) 40 | return out 41 | 42 | def add_value(self, key, version, value): 43 | """Add a (value, multiplicity) pair for the requested key and version.""" 44 | self._validate(version) 45 | self._index[key][version].append(value) 46 | 47 | def append(self, other): 48 | """Combine all of the data in other into self.""" 49 | for (key, versions) in other._index.items(): 50 | for (version, data) in versions.items(): 51 | self._index[key][version].extend(data) 52 | 53 | def join(self, other): 54 | """Produce a bounded collection trace containing (key, (val1, val2)) 55 | for all (key, val1) in the first index, and (key, val2) in the second 56 | index. 57 | 58 | All outputs are produced at output version = max(version of record 1, 59 | version of record 2). 
60 | """ 61 | collections = defaultdict(list) 62 | for (key, versions) in self._index.items(): 63 | if key not in other._index: 64 | continue 65 | other_versions = other._index[key] 66 | 67 | for (version1, data1) in versions.items(): 68 | for (version2, data2) in other_versions.items(): 69 | result_version = max(version1, version2) 70 | for (val1, mul1) in data1: 71 | for (val2, mul2) in data2: 72 | collections[result_version].append( 73 | ((key, (val1, val2)), mul1 * mul2) 74 | ) 75 | return [ 76 | (version, Collection(c)) for (version, c) in collections.items() if c != [] 77 | ] 78 | 79 | def compact(self, compaction_version, keys=[]): 80 | """Combine all changes observed before the requested compaction_version 81 | into the compaction_version. 82 | """ 83 | self._validate(compaction_version) 84 | 85 | def consolidate_values(values): 86 | consolidated = defaultdict(int) 87 | for (value, multiplicity) in values: 88 | consolidated[value] += multiplicity 89 | 90 | return [ 91 | (value, multiplicity) 92 | for (value, multiplicity) in consolidated.items() 93 | if multiplicity != 0 94 | ] 95 | 96 | if keys == []: 97 | keys = [key for key in self._index.keys()] 98 | 99 | for key in keys: 100 | versions = self._index[key] 101 | to_compact = [ 102 | version for version in versions.keys() if version <= compaction_version 103 | ] 104 | values = [] 105 | for version in to_compact: 106 | values.extend(versions.pop(version)) 107 | 108 | versions[compaction_version] = consolidate_values(values) 109 | self.compaction_frontier = compaction_version 110 | -------------------------------------------------------------------------------- /v3/graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow program.""" 2 | 3 | from collections import deque 4 | from enum import Enum 5 | 6 | 7 | class MessageType(Enum): 8 | DATA = 1 9 | FRONTIER = 2 10 | 11 | 12 | class DifferenceStreamReader: 13 | """A read handle to a dataflow edge that receives data and frontier updates from a writer. 14 | 15 | The data received over this edge are pairs of (version, Collection) and the frontier 16 | updates. 17 | """ 18 | 19 | def __init__(self, queue): 20 | self._queue = queue 21 | 22 | def drain(self): 23 | out = [] 24 | while len(self._queue) > 0: 25 | out.append(self._queue.pop()) 26 | 27 | return out 28 | 29 | def is_empty(self): 30 | return len(self._queue) == 0 31 | 32 | def probe_frontier_less_than(self, frontier): 33 | for (typ, msg) in self._queue: 34 | if typ == MessageType.FRONTIER: 35 | received_frontier = msg 36 | if received_frontier >= frontier: 37 | return False 38 | return True 39 | 40 | 41 | class DifferenceStreamWriter: 42 | """A write handle to a dataflow edge that is allowed to publish data and send 43 | frontier updates. 
44 | """ 45 | 46 | def __init__(self): 47 | self._queues = [] 48 | self.frontier = None 49 | 50 | def send_data(self, version, collection): 51 | assert self.frontier is None or self.frontier <= version 52 | for q in self._queues: 53 | q.appendleft((MessageType.DATA, (version, collection))) 54 | 55 | def send_frontier(self, frontier): 56 | assert self.frontier is None or self.frontier <= frontier 57 | 58 | self.frontier = frontier 59 | for q in self._queues: 60 | q.appendleft((MessageType.FRONTIER, frontier)) 61 | 62 | def _new_reader(self): 63 | q = deque() 64 | self._queues.append(q) 65 | return DifferenceStreamReader(q) 66 | 67 | 68 | class Operator: 69 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 70 | one outgoing edge (write handle). 71 | """ 72 | 73 | def __init__(self, inputs, output, f, initial_frontier): 74 | self.inputs = inputs 75 | self.output = output 76 | self.f = f 77 | self.pending_work = False 78 | self.input_frontiers = [initial_frontier for _ in self.inputs] 79 | self.output_frontier = initial_frontier 80 | 81 | def run(self): 82 | self.f() 83 | 84 | def pending_work(self): 85 | if self.pending_work is True: 86 | return True 87 | for input_listener in self.inputs: 88 | if input_listener.is_empty() is False: 89 | return True 90 | return False 91 | 92 | def frontiers(self): 93 | return (self.input_frontiers, self.output_frontier) 94 | 95 | 96 | class UnaryOperator(Operator): 97 | """A convenience implementation of a dataflow operator that has a handle to one 98 | incoming stream of data, and one handle to an outgoing stream of data. 99 | """ 100 | 101 | def __init__(self, input_a, output, f, initial_frontier): 102 | super().__init__([input_a], output, f, initial_frontier) 103 | 104 | def input_messages(self): 105 | return self.inputs[0].drain() 106 | 107 | def input_frontier(self): 108 | return self.input_frontiers[0] 109 | 110 | def set_input_frontier(self, frontier): 111 | self.input_frontiers[0] = frontier 112 | 113 | 114 | class BinaryOperator(Operator): 115 | """A convenience implementation of a dataflow operator that has a handle to two 116 | incoming streams of data, and one handle to an outgoing stream of data. 117 | """ 118 | 119 | def __init__(self, input_a, input_b, output, f, initial_frontier): 120 | super().__init__([input_a, input_b], output, f, initial_frontier) 121 | 122 | def input_a_messages(self): 123 | return self.inputs[0].drain() 124 | 125 | def input_a_frontier(self): 126 | return self.input_frontiers[0] 127 | 128 | def set_input_a_frontier(self, frontier): 129 | self.input_frontiers[0] = frontier 130 | 131 | def input_b_messages(self): 132 | return self.inputs[1].drain() 133 | 134 | def input_b_frontier(self): 135 | return self.input_frontiers[1] 136 | 137 | def set_input_b_frontier(self, frontier): 138 | self.input_frontiers[1] = frontier 139 | 140 | 141 | class Graph: 142 | """An implementation of a dataflow graph. 143 | 144 | This implementation needs to keep the entire set of nodes so that they 145 | may be run, and only keeps a set of read handles to all edges for debugging 146 | purposes. Calling this a graph instead of a 'bag of nodes' is misleading, because 147 | this object does not actually know anything about the connections between the 148 | various nodes. 
149 | """ 150 | 151 | def __init__(self, streams, operators): 152 | self.streams = streams 153 | self.operators = operators 154 | 155 | def step(self): 156 | for op in self.operators: 157 | op.run() 158 | -------------------------------------------------------------------------------- /v4/graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow program.""" 2 | 3 | from collections import deque 4 | from enum import Enum 5 | 6 | 7 | class MessageType(Enum): 8 | DATA = 1 9 | FRONTIER = 2 10 | 11 | 12 | class DifferenceStreamReader: 13 | """A read handle to a dataflow edge that receives data and frontier updates from a writer. 14 | 15 | The data received over this edge are pairs of (version, Collection) and the frontier 16 | updates are either integers (in the one dimensional case) or Antichains (in the general 17 | case). 18 | """ 19 | 20 | def __init__(self, queue): 21 | self._queue = queue 22 | 23 | def drain(self): 24 | out = [] 25 | while len(self._queue) > 0: 26 | out.append(self._queue.pop()) 27 | 28 | return out 29 | 30 | def is_empty(self): 31 | return len(self._queue) == 0 32 | 33 | def probe_frontier_less_than(self, frontier): 34 | for (typ, msg) in self._queue: 35 | if typ == MessageType.FRONTIER: 36 | received_frontier = msg 37 | if received_frontier >= frontier: 38 | return False 39 | return True 40 | 41 | 42 | class DifferenceStreamWriter: 43 | """A write handle to a dataflow edge that is allowed to publish data and send 44 | frontier updates. 45 | """ 46 | 47 | def __init__(self): 48 | self._queues = [] 49 | self.frontier = None 50 | 51 | def send_data(self, version, collection): 52 | if self.frontier is not None and self.frontier > version: 53 | print(f"frontier {self.frontier}, version: {version}") 54 | assert self.frontier is None or self.frontier <= version 55 | for q in self._queues: 56 | q.appendleft((MessageType.DATA, (version, collection))) 57 | 58 | def send_frontier(self, frontier): 59 | assert self.frontier is None or self.frontier <= frontier 60 | 61 | self.frontier = frontier 62 | for q in self._queues: 63 | q.appendleft((MessageType.FRONTIER, frontier)) 64 | 65 | def _new_reader(self): 66 | q = deque() 67 | self._queues.append(q) 68 | return DifferenceStreamReader(q) 69 | 70 | 71 | class Operator: 72 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 73 | one outgoing edge (write handle). 74 | """ 75 | 76 | def __init__(self, inputs, output, f, initial_frontier): 77 | self.inputs = inputs 78 | self.output = output 79 | self.f = f 80 | self.pending_work = False 81 | self.input_frontiers = [initial_frontier for _ in self.inputs] 82 | self.output_frontier = initial_frontier 83 | 84 | def run(self): 85 | self.f() 86 | 87 | def pending_work(self): 88 | if self.pending_work is True: 89 | return True 90 | for input_listener in self.inputs: 91 | if input_listener.is_empty() is False: 92 | return True 93 | return False 94 | 95 | def frontiers(self): 96 | return (self.input_frontiers, self.output_frontier) 97 | 98 | 99 | class UnaryOperator(Operator): 100 | """A convenience implementation of a dataflow operator that has a handle to one 101 | incoming stream of data, and one handle to an outgoing stream of data. 
102 | """ 103 | 104 | def __init__(self, input_a, output, f, initial_frontier): 105 | super().__init__([input_a], output, f, initial_frontier) 106 | 107 | def input_messages(self): 108 | return self.inputs[0].drain() 109 | 110 | def input_frontier(self): 111 | return self.input_frontiers[0] 112 | 113 | def set_input_frontier(self, frontier): 114 | self.input_frontiers[0] = frontier 115 | 116 | 117 | class BinaryOperator(Operator): 118 | """A convenience implementation of a dataflow operator that has a handle to two 119 | incoming streams of data, and one handle to an outgoing stream of data. 120 | """ 121 | 122 | def __init__(self, input_a, input_b, output, f, initial_frontier): 123 | super().__init__([input_a, input_b], output, f, initial_frontier) 124 | 125 | def input_a_messages(self): 126 | return self.inputs[0].drain() 127 | 128 | def input_a_frontier(self): 129 | return self.input_frontiers[0] 130 | 131 | def set_input_a_frontier(self, frontier): 132 | self.input_frontiers[0] = frontier 133 | 134 | def input_b_messages(self): 135 | return self.inputs[1].drain() 136 | 137 | def input_b_frontier(self): 138 | return self.input_frontiers[1] 139 | 140 | def set_input_b_frontier(self, frontier): 141 | self.input_frontiers[1] = frontier 142 | 143 | 144 | class Graph: 145 | """An implementation of a dataflow graph. 146 | 147 | This implementation needs to keep the entire set of nodes so that they 148 | may be run, and only keeps a set of read handles to all edges for debugging 149 | purposes. Calling this a graph instead of a 'bag of nodes' is misleading, because 150 | this object does not actually know anything about the connections between the 151 | various nodes. 152 | """ 153 | 154 | def __init__(self, streams, operators): 155 | self.streams = streams 156 | self.operators = operators 157 | 158 | def step(self): 159 | for op in self.operators: 160 | op.run() 161 | -------------------------------------------------------------------------------- /graph.py: -------------------------------------------------------------------------------- 1 | """The implementation of dataflow graph edge, node, and graph objects, used to run a dataflow program.""" 2 | 3 | from collections import deque 4 | from enum import Enum 5 | 6 | 7 | class MessageType(Enum): 8 | DATA = 1 9 | FRONTIER = 2 10 | 11 | 12 | class DifferenceStreamReader: 13 | """A read handle to a dataflow edge that receives data and frontier updates from a writer. 14 | 15 | The data received over this edge are pairs of (version, Collection) and the frontier 16 | updates are either integers (in the one dimensional case) or Antichains (in the general 17 | case). 18 | """ 19 | 20 | def __init__(self, queue): 21 | self._queue = queue 22 | 23 | def drain(self): 24 | out = [] 25 | while len(self._queue) > 0: 26 | out.append(self._queue.pop()) 27 | 28 | return out 29 | 30 | def is_empty(self): 31 | return len(self._queue) == 0 32 | 33 | def probe_frontier_less_than(self, frontier): 34 | for (typ, msg) in self._queue: 35 | if typ == MessageType.FRONTIER: 36 | received_frontier = msg 37 | if frontier.less_equal(received_frontier): 38 | return False 39 | return True 40 | 41 | 42 | class DifferenceStreamWriter: 43 | """A write handle to a dataflow edge that is allowed to publish data and send 44 | frontier updates. 
45 | """ 46 | 47 | def __init__(self): 48 | self._queues = [] 49 | self.frontier = None 50 | 51 | def send_data(self, version, collection): 52 | if isinstance(version, int): 53 | assert self.frontier is None or self.frontier <= version 54 | else: 55 | assert self.frontier is None or self.frontier.less_equal_version(version) 56 | for q in self._queues: 57 | q.appendleft((MessageType.DATA, (version, collection))) 58 | 59 | def send_frontier(self, frontier): 60 | if isinstance(frontier, int): 61 | assert self.frontier is None or self.frontier <= frontier 62 | else: 63 | assert self.frontier is None or self.frontier.less_equal(frontier) 64 | 65 | self.frontier = frontier 66 | for q in self._queues: 67 | q.appendleft((MessageType.FRONTIER, frontier)) 68 | 69 | def _new_reader(self): 70 | q = deque() 71 | self._queues.append(q) 72 | return DifferenceStreamReader(q) 73 | 74 | 75 | class Operator: 76 | """A generic implementation of a dataflow operator (node) that has multiple incoming edges (read handles) and 77 | one outgoing edge (write handle). 78 | """ 79 | 80 | def __init__(self, inputs, output, f, initial_frontier): 81 | self.inputs = inputs 82 | self.output = output 83 | self.f = f 84 | self.pending_work = False 85 | self.input_frontiers = [initial_frontier for _ in self.inputs] 86 | self.output_frontier = initial_frontier 87 | 88 | def run(self): 89 | self.f() 90 | 91 | def pending_work(self): 92 | if self.pending_work is True: 93 | return True 94 | for input_listener in self.inputs: 95 | if input_listener.is_empty() is False: 96 | return True 97 | return False 98 | 99 | def frontiers(self): 100 | return (self.input_frontiers, self.output_frontier) 101 | 102 | 103 | class UnaryOperator(Operator): 104 | """A convenience implementation of a dataflow operator that has a handle to one 105 | incoming stream of data, and one handle to an outgoing stream of data. 106 | """ 107 | 108 | def __init__(self, input_a, output, f, initial_frontier): 109 | super().__init__([input_a], output, f, initial_frontier) 110 | 111 | def input_messages(self): 112 | return self.inputs[0].drain() 113 | 114 | def input_frontier(self): 115 | return self.input_frontiers[0] 116 | 117 | def set_input_frontier(self, frontier): 118 | self.input_frontiers[0] = frontier 119 | 120 | 121 | class BinaryOperator(Operator): 122 | """A convenience implementation of a dataflow operator that has a handle to two 123 | incoming streams of data, and one handle to an outgoing stream of data. 124 | """ 125 | 126 | def __init__(self, input_a, input_b, output, f, initial_frontier): 127 | super().__init__([input_a, input_b], output, f, initial_frontier) 128 | 129 | def input_a_messages(self): 130 | return self.inputs[0].drain() 131 | 132 | def input_a_frontier(self): 133 | return self.input_frontiers[0] 134 | 135 | def set_input_a_frontier(self, frontier): 136 | self.input_frontiers[0] = frontier 137 | 138 | def input_b_messages(self): 139 | return self.inputs[1].drain() 140 | 141 | def input_b_frontier(self): 142 | return self.input_frontiers[1] 143 | 144 | def set_input_b_frontier(self, frontier): 145 | self.input_frontiers[1] = frontier 146 | 147 | 148 | class Graph: 149 | """An implementation of a dataflow graph. 150 | 151 | This implementation needs to keep the entire set of nodes so that they 152 | may be run, and only keeps a set of read handles to all edges for debugging 153 | purposes. 
Calling this a graph instead of a 'bag of nodes' is misleading, because 154 | this object does not actually know anything about the connections between the 155 | various nodes. 156 | """ 157 | 158 | def __init__(self, streams, operators): 159 | self.streams = streams 160 | self.operators = operators 161 | 162 | def step(self): 163 | for op in self.operators: 164 | op.run() 165 | -------------------------------------------------------------------------------- /order.py: -------------------------------------------------------------------------------- 1 | """The implementation of partially ordered versions (times) for use within a differential dataflow. 2 | """ 3 | 4 | 5 | class Version: 6 | """A partially, or totally ordered version (time), consisting of a tuple of 7 | integers. 8 | 9 | All versions within a scope of a dataflow must have the same dimension/number 10 | of coordinates. One dimensional versions are totally ordered. Multidimensional 11 | versions are partially ordered by the product partial order. 12 | """ 13 | 14 | def __init__(self, version): 15 | if isinstance(version, int): 16 | assert version >= 0 17 | self.inner = (version,) 18 | elif isinstance(version, list) or isinstance(version, tuple): 19 | for i in version: 20 | assert isinstance(i, int) 21 | assert i >= 0 22 | self.inner = tuple(version) 23 | else: 24 | assert 0 > 1 25 | 26 | def __repr__(self): 27 | return f"Version({self.inner})" 28 | 29 | def __eq__(self, other): 30 | return self.inner == other.inner 31 | 32 | # The less than implementation used to sort versions must respect the partial 33 | # order (important for reduce). 34 | def __lt__(self, other): 35 | return self.inner.__lt__(other.inner) 36 | 37 | def __hash__(self): 38 | return hash(self.inner) 39 | 40 | def _validate(self, other): 41 | assert len(self.inner) > 0 42 | assert len(self.inner) == len(other.inner) 43 | 44 | def less_equal(self, other): 45 | self._validate(other) 46 | 47 | for (i1, i2) in zip(self.inner, other.inner): 48 | if i1 > i2: 49 | return False 50 | return True 51 | 52 | def less_than(self, other): 53 | if self.less_equal(other) is True and self.inner != other.inner: 54 | return True 55 | return False 56 | 57 | def join(self, other): 58 | self._validate(other) 59 | out = [] 60 | 61 | for (i1, i2) in zip(self.inner, other.inner): 62 | out.append(max(i1, i2)) 63 | return Version(out) 64 | 65 | def meet(self, other): 66 | self._validate(other) 67 | out = [] 68 | 69 | for (i1, i2) in zip(self.inner, other.inner): 70 | out.append(min(i1, i2)) 71 | return Version(out) 72 | 73 | # TODO the proof for this is in the sharing arrangements paper. 74 | def advance_by(self, frontier): 75 | if frontier.inner == (): 76 | return self 77 | result = self.join(frontier.inner[0]) 78 | for elem in frontier.inner: 79 | result = result.meet(self.join(elem)) 80 | return result 81 | 82 | def extend(self): 83 | elements = [e for e in self.inner] 84 | elements.append(0) 85 | return Version(elements) 86 | 87 | def truncate(self): 88 | elements = [e for e in self.inner] 89 | elements.pop() 90 | return Version(elements) 91 | 92 | def apply_step(self, step): 93 | assert step > 0 94 | elements = [e for e in self.inner] 95 | elements[-1] += step 96 | return Version(elements) 97 | 98 | 99 | # This keeps the min antichain. 100 | # I fully stole this from frank. 
TODO: Understand this better 101 | class Antichain: 102 | """A minimal set of incomparable versions.""" 103 | 104 | def __init__(self, elements): 105 | self.inner = [] 106 | for element in elements: 107 | self._insert(element) 108 | 109 | def __repr__(self): 110 | return f"Antichain({self.inner})" 111 | 112 | def _insert(self, element): 113 | for e in self.inner: 114 | if e.less_equal(element): 115 | return 116 | self.inner = [x for x in self.inner if element.less_equal(x) is not True] 117 | self.inner.append(element) 118 | 119 | # TODO: is it true that the set of versions <= meet(x, y) is the intersection of the set of versions <= x and the set of versions <= y? 120 | def meet(self, other): 121 | out = Antichain([]) 122 | for element in self.inner: 123 | out._insert(element) 124 | for element in other.inner: 125 | out._insert(element) 126 | 127 | return out 128 | 129 | def _equals(self, other): 130 | elements_1 = [x for x in self.inner] 131 | elements_2 = [y for y in other.inner] 132 | 133 | if len(elements_1) != len(elements_2): 134 | return False 135 | elements_1.sort() 136 | elements_2.sort() 137 | 138 | for (x, y) in zip(elements_1, elements_2): 139 | if x != y: 140 | return False 141 | return True 142 | 143 | # Returns true if other dominates self 144 | # in other words self < other means 145 | # self <= other AND self != other 146 | def less_than(self, other): 147 | if self.less_equal(other) is not True: 148 | return False 149 | 150 | if self._equals(other): 151 | return False 152 | 153 | return True 154 | 155 | def less_equal(self, other): 156 | for o in other.inner: 157 | less_equal = False 158 | for s in self.inner: 159 | if s.less_equal(o): 160 | less_equal = True 161 | if less_equal == False: 162 | return False 163 | return True 164 | 165 | def less_equal_version(self, version): 166 | for elem in self.inner: 167 | if elem.less_equal(version): 168 | return True 169 | return False 170 | 171 | def extend(self): 172 | out = Antichain([]) 173 | for elem in self.inner: 174 | out._insert(elem.extend()) 175 | return out 176 | 177 | def truncate(self): 178 | out = Antichain([]) 179 | for elem in self.inner: 180 | out._insert(elem.truncate()) 181 | return out 182 | 183 | def apply_step(self, step): 184 | out = Antichain([]) 185 | for elem in self.inner: 186 | out._insert(elem.apply_step(step)) 187 | return out 188 | 189 | def _elements(self): 190 | return [x for x in self.inner] 191 | 192 | 193 | if __name__ == "__main__": 194 | 195 | v0_0 = Version([0, 0]) 196 | v1_0 = Version([1, 0]) 197 | v0_1 = Version([0, 1]) 198 | v1_1 = Version([1, 1]) 199 | v2_0 = Version([2, 0]) 200 | 201 | assert v0_0.less_than(v1_0) 202 | assert v0_0.less_than(v0_1) 203 | assert v0_0.less_than(v1_1) 204 | assert v0_0.less_equal(v1_0) 205 | assert v0_0.less_equal(v0_1) 206 | assert v0_0.less_equal(v1_1) 207 | 208 | assert v1_0.less_than(v1_0) is not True 209 | assert v1_0.less_equal(v1_0) 210 | assert v1_0.less_equal(v0_1) is not True 211 | assert v0_1.less_equal(v1_0) is not True 212 | assert v0_1.less_equal(v1_1) 213 | assert v1_0.less_equal(v1_1) 214 | assert v0_0.less_equal(v1_1) 215 | 216 | assert Antichain([v0_0]).less_equal(Antichain([v1_0])) 217 | assert Antichain([v0_0])._equals(Antichain([v1_0])) is not True 218 | assert Antichain([v0_0]).less_than(Antichain([v1_0])) 219 | assert Antichain([v2_0, v1_1]).less_than(Antichain([v2_0])) 220 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Differential Dataflow, in Python.
2 | 
3 | WIP.
4 | 
5 | This is an implementation of Differential Dataflow in Python that is meant as a learning
6 | tool. This implementation is not meant to be high performance - for that please go to the
7 | Rust implementation.
8 | 
9 | Simple explanation of what this code does: users get to define their computations as a composition
10 | of functional operators like map/filter/join/count/sum/etc. These computations can even have recursion.
11 | They can then send inputs to those computations and get answers back quickly and efficiently. They can keep
12 | sending new inputs, and changing the inputs in arbitrary ways, and keep getting new answers back quickly
13 | and efficiently, regardless of the computation they defined.
14 | 
15 | Small terminology note: I started using version instead of time/timestamp, and multiplicity instead of diff, throughout
16 | the code, so I will use those names here as well.
17 | 
18 | The code includes several preliminary implementations that build on concepts introduced in earlier ones, to make things easier to understand. These preliminary implementations are in the directories `v0` - `v4`. Each directory is entirely self-contained; however, a lot of the components in a given implementation are duplicated from prior ones. There are five preliminary implementations and one final one:
19 | - `v0`: defines a collection (multiset) of data and implements the various operations (join/reduce/map/etc) over
20 | a single collection. This is roughly the starting point for "what are we even trying to do?".
21 | - `v1`: defines a finite, totally ordered sequence of difference collections, to describe a location that changes. `v1` also
22 | implements the various operations over such difference collection sequences efficiently. Compared to `v0`, the main change in `v1` is that
23 | we need to use indexes to efficiently compute reductions and joins when only a small subset of keys change from one collection
24 | version to the next.
25 | - `v2`: extends the approach in `v1` to support an unbounded number of difference collections. Now, we have to explicitly construct a dataflow
26 | graph, rather than relying on the implicit graph induced by function calls. All of the data travels through dataflow edges (basically queues),
27 | to operators/nodes (basically a struct that does some `work_function` to effect a computation and holds onto state across invocations of its `work_fn`).
28 | Each operator still has to output data in order, and binary operators always need to wait for both inputs to become available before they can produce an output. Roughly, this is an approach to implementing something like differential while rejecting a lot of the timely paper.
29 | - `v3`: extends `v2` to explicitly attach a version (time) label to all messages. Operators also now receive a message when a given version/range of versions will no longer receive any more data. Versions are constrained to be integers.
30 | - `v4`: extends `v3` to allow versions to be tuples of integers that are totally ordered with the lexicographic order. This implementation is the first that supports `iterate` with changing data, but the user has to specify a cap on the number of iterations.
31 | TODO: I'm not sure that the cap on the number of iterations is strictly necessary
32 | - `final`/the toplevel of this directory: extends `v4` to support versions that are partially ordered with the product partial order. The
33 | versions still have to be tuples of integers.
34 | 
35 | This implementation is different from other implementations (to the best of my knowledge) in that it doesn't
36 | rely on a scheduler reasoning about the structure of the computation graph and scheduling operators intelligently
37 | to guarantee progress / eventual termination.
38 | 
39 | Instead, the implementation provides the following guarantees:
40 | 
41 | 1. After sending a finite number of collections and advancing the frontiers of all inputs to the dataflow graph past a finite set of
42 | versions, the output should, after a finite number of calls to `graph.step()`, see the correct outputs at those versions and also close
43 | those versions.
44 | 
45 | 2. Eventually, after all inputs have ceased sending new data or advancing frontiers, all nodes in the dataflow graph should stop producing
46 | either new data or new frontier updates iff the dataflow graph does not contain any non-convergent iterative computations.
47 | 
48 | My understanding is that for acyclic dataflow graphs these properties can be satisfied by:
49 | 
50 | A. For any set of inputs, all operators are guaranteed to produce their individual expected outputs after a finite number of executions.
51 | So, for example, `reduce` can only produce outputs at versions that are closed, so if no versions are closed, it is to be expected that `reduce`
52 | will not produce any outputs. But once a version is closed, it should produce an output for that version, and potentially others, after a finite
53 | number of executions.
54 | 
55 | B. All dataflow operators will only ever produce a finite number of output messages (new collections of data / frontier updates) in response
56 | to any one input message (input collections of data / frontier updates).
57 | 
58 | (I'm not claiming to have proved these properties, and indeed I am not even totally sure how to.)
59 | 
60 | For cyclic dataflow graphs, the situation is complicated by the existence of a feedback operator that sends messages in a cycle
61 | to another operator, but with their versions incremented.
62 | 
63 | ```
64 | def example(collection):
65 |     return (
66 |         collection.map(lambda data: data + 1)
67 |         .map(lambda data: data - 1)
68 |         .negate()
69 |         .concat(collection)
70 |         .consolidate()  # This step is mandatory for termination.
71 |     )
72 | 
73 | output = input_a.iterate(example).debug("iterate")
74 | graph = graph_builder.finalize()
75 | 
76 | input_a_writer.send_data(Version(0), Collection([(1, 1)]))
77 | input_a_writer.send_frontier(Antichain([Version(1)]))
78 | 
79 | for i in range(0, 10):
80 |     graph.step()
81 | ```
82 | 
83 | Take the simple example above. Here, every step of the iteration takes the
84 | input and applies two consecutive map operators, which are collectively a no-op,
85 | and then negates the result and concatenates it with the original input. Every input therefore
86 | produces the empty collection, and this loop should reach fixedpoint in two iterations (two, not one, because of how `iterate` works and needs to subtract the top-level input on the second iteration).
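
Concretely, applying the loop body once to the input `[(1, 1)]` gives: the two `map`s leave it as `[(1, 1)]`, `negate` turns that into `[(1, -1)]`, `concat` with the original input yields `[(1, -1), (1, 1)]`, and `consolidate` cancels the two records, leaving the empty collection.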
87 | 88 | However, if you remove the `consolidate`, which waits to produce data at a given 89 | version until all inputs have provided all of the data at that version and updated 90 | their frontiers, then there are some operator execution orderings for which this loop will continue circulating non-empty differences and never terminate. This is also 91 | a concern in the Rust implementation, which likewise requires that all paths from iterative subgraph input to output have a consolidation step 92 | that makes sure all differences at a given version meet up and get cancelled out 93 | (TODO: LINK). 94 | 95 | There's a second concern: once fixedpoint has been reached (say at `Version(0, 1)` in the example above), we know we are done with the computation for the top-level `Version(0)`. 96 | 97 | We don't then want frontier updates like: 98 | 99 | ``` 100 | Antichain([Version(1, 0), Version(0, 2)]) 101 | Antichain([Version(1, 0), Version(0, 3)]) 102 | Antichain([Version(1, 0), Version(0, 4)]) 103 | ... 104 | ``` 105 | to keep circulating through the iteration subgraph. We'd like instead for one of 106 | the operators to realize "hey, we are done with `Version(0, *)` so we can drop 107 | that from the frontier". This code assigns the feedback operator to this task, 108 | and allows it to drop antichain elements for upper-level times that have reached 109 | fixedpoint. 110 | 111 | TODO: I want to understand a bit better how timely handles this. 112 | TODO: The code for handling this in the feedback operator is not very nice. Ideally, we would be able to express this operation in a more mathematical way. Perhaps capabilities are a more reasonable interface for this? 113 | -------------------------------------------------------------------------------- /v0/collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair.
50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def join(self, other): 63 | """Match pairs (k, v1) and (k, v2) from the two input collections and produce (k, (v1, v2)).""" 64 | out = [] 65 | for ((k1, v1), d1) in self._inner: 66 | for ((k2, v2), d2) in other._inner: 67 | if k1 == k2: 68 | out.append(((k1, (v1, v2)), d1 * d2)) 69 | return Collection(out) 70 | 71 | def reduce(self, f): 72 | """Apply a reduction function to all record values, grouped by key.""" 73 | keys = defaultdict(list) 74 | out = [] 75 | for ((key, val), multiplicity) in self._inner: 76 | keys[key].append((val, multiplicity)) 77 | for (key, vals) in keys.items(): 78 | results = f(vals) 79 | for (val, multiplicity) in results: 80 | out.append(((key, val), multiplicity)) 81 | return Collection(out) 82 | 83 | def count(self): 84 | """Count the number of times each key occurs in the collection.""" 85 | 86 | def count_inner(vals): 87 | out = 0 88 | for (_, multiplicity) in vals: 89 | out += multiplicity 90 | return [(out, 1)] 91 | 92 | return self.reduce(count_inner) 93 | 94 | def sum(self): 95 | """Produce the sum of all the values paired with a key, for all keys in the collection.""" 96 | 97 | def sum_inner(vals): 98 | out = 0 99 | for (val, multiplicity) in vals: 100 | out += val * multiplicity 101 | return [(out, 1)] 102 | 103 | return self.reduce(sum_inner) 104 | 105 | def min(self): 106 | """Produce the minimum value associated with each key in the collection. 107 | 108 | Note that no record may have negative multiplicity when computing the min, 109 | as it is unclear what exactly the minimum record is in that case. 110 | """ 111 | 112 | def min_inner(vals): 113 | consolidated = defaultdict(int) 114 | for (val, multiplicity) in vals: 115 | consolidated[val] += multiplicity 116 | vals = [ 117 | (val, multiplicity) 118 | for (val, multiplicity) in consolidated.items() 119 | if multiplicity != 0 120 | ] 121 | if len(vals) != 0: 122 | out = vals[0][0] 123 | for (val, multiplicity) in vals: 124 | assert multiplicity > 0 125 | if val < out: 126 | out = val 127 | return [(out, 1)] 128 | else: 129 | return [] 130 | 131 | return self.reduce(min_inner) 132 | 133 | def max(self): 134 | """Produce the maximum value associated with each key in the collection. 135 | 136 | Note that no record may have negative multiplicity when computing the max, 137 | as it is unclear what exactly the maximum record is in that case. 138 | """ 139 | 140 | def max_inner(vals): 141 | consolidated = defaultdict(int) 142 | for (val, multiplicity) in vals: 143 | consolidated[val] += multiplicity 144 | vals = [ 145 | (val, multiplicity) 146 | for (val, multiplicity) in consolidated.items() 147 | if multiplicity != 0 148 | ] 149 | if len(vals) != 0: 150 | out = vals[0][0] 151 | for (val, multiplicity) in vals: 152 | assert multiplicity > 0 153 | if val > out: 154 | out = val 155 | return [(out, 1)] 156 | else: 157 | return [] 158 | 159 | return self.reduce(max_inner) 160 | 161 | def distinct(self): 162 | """Reduce the collection to a set of elements (from a multiset). 
163 | 164 | Note that no record may have negative multiplicity when producing this set, 165 | as elements of sets may only have multiplicity one, and it is unclear that is 166 | an appropriate output for elements with negative multiplicity. 167 | """ 168 | 169 | def distinct_inner(vals): 170 | consolidated = defaultdict(int) 171 | for (val, multiplicity) in vals: 172 | consolidated[val] += multiplicity 173 | vals = [ 174 | (val, multiplicity) 175 | for (val, multiplicity) in consolidated.items() 176 | if multiplicity != 0 177 | ] 178 | for (val, multiplicity) in vals: 179 | assert multiplicity > 0 180 | return [(val, 1) for (val, _) in vals] 181 | 182 | return self.reduce(distinct_inner) 183 | 184 | def iterate(self, f): 185 | """Repeatedly invoke a function f on a collection, and return the result 186 | of applying the function an infinite number of times (fixedpoint). 187 | 188 | Note that if the function does not converge to a fixedpoint this implementation 189 | will run forever. 190 | """ 191 | curr = Collection(self._inner) 192 | while True: 193 | result = f(curr) 194 | if result._inner == curr._inner: 195 | break 196 | curr = result 197 | return curr 198 | 199 | 200 | if __name__ == "__main__": 201 | a = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1)]) 202 | b = Collection( 203 | [ 204 | (("apple", "$3"), 1), 205 | (("apple", ("granny smith", "$2")), 1), 206 | (("kiwi", "$2"), 1), 207 | ] 208 | ) 209 | c = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1), (("apple", "$2"), 20)]) 210 | d = Collection( 211 | [(("apple", 11), 1), (("apple", 3), 2), (("banana", 2), 3), (("coconut", 3), 1)] 212 | ) 213 | e = Collection([(1, 1)]) 214 | 215 | print(a.concat(b)) 216 | print(a.join(b)) 217 | print(b.join(a)) 218 | print(a.filter(lambda data: data[0] != "apple")) 219 | print(a.map(lambda data: (data[1], data[0]))) 220 | print(a.concat(b).count()) 221 | print(a.concat(b).distinct()) 222 | print(c.min()) 223 | print(c.max()) 224 | print(d.sum()) 225 | 226 | def add_one(collection): 227 | return ( 228 | collection.map(lambda data: data + 1) 229 | .concat(collection) 230 | .filter(lambda data: data <= 5) 231 | .map(lambda data: (data, ())) 232 | .distinct() 233 | .map(lambda data: data[0]) 234 | .consolidate() 235 | ) 236 | 237 | result = e.iterate(add_one).map(lambda data: (data, data * data)) 238 | print(result) 239 | -------------------------------------------------------------------------------- /collection.py: -------------------------------------------------------------------------------- 1 | """The implementation of collections (multisets) of data and functional operations over single collections. 
2 | """ 3 | 4 | from collections import defaultdict 5 | 6 | 7 | class Collection: 8 | """A multiset of data""" 9 | 10 | def __init__(self, dataz=None): 11 | if dataz is None: 12 | dataz = [] 13 | self._inner = dataz 14 | 15 | def __repr__(self): 16 | return f"Collection({self._inner})" 17 | 18 | def map(self, f): 19 | """Apply a function to all records in the collection.""" 20 | return Collection( 21 | [(f(data), multiplicity) for (data, multiplicity) in self._inner] 22 | ) 23 | 24 | def filter(self, f): 25 | """Filter out records for which a function f(record) evaluates to False.""" 26 | return Collection( 27 | [ 28 | (data, multiplicity) 29 | for (data, multiplicity) in self._inner 30 | if f(data) == True 31 | ] 32 | ) 33 | 34 | def negate(self): 35 | return Collection( 36 | [(data, -multiplicity) for (data, multiplicity) in self._inner] 37 | ) 38 | 39 | def concat(self, other): 40 | """Concatenate two collections together.""" 41 | out = [] 42 | out.extend(self._inner) 43 | out.extend(other._inner) 44 | return Collection(out) 45 | 46 | def consolidate(self): 47 | """Produce as output a collection that is logically equivalent to the input 48 | but which combines identical instances of the same record into one 49 | (record, multiplicity) pair. 50 | """ 51 | consolidated = defaultdict(int) 52 | for (data, multiplicity) in self._inner: 53 | consolidated[data] += multiplicity 54 | consolidated = [ 55 | (data, multiplicity) 56 | for (data, multiplicity) in consolidated.items() 57 | if multiplicity != 0 58 | ] 59 | consolidated.sort() 60 | return Collection(consolidated) 61 | 62 | def join(self, other): 63 | """Match pairs (k, v1) and (k, v2) from the two input collections and produce (k, (v1, v2)).""" 64 | out = [] 65 | for ((k1, v1), d1) in self._inner: 66 | for ((k2, v2), d2) in other._inner: 67 | if k1 == k2: 68 | out.append(((k1, (v1, v2)), d1 * d2)) 69 | return Collection(out) 70 | 71 | def reduce(self, f): 72 | """Apply a reduction function to all record values, grouped by key.""" 73 | keys = defaultdict(list) 74 | out = [] 75 | for ((key, val), multiplicity) in self._inner: 76 | keys[key].append((val, multiplicity)) 77 | for (key, vals) in keys.items(): 78 | results = f(vals) 79 | for (val, multiplicity) in results: 80 | out.append(((key, val), multiplicity)) 81 | return Collection(out) 82 | 83 | def count(self): 84 | """Count the number of times each key occurs in the collection.""" 85 | 86 | def count_inner(vals): 87 | out = 0 88 | for (_, multiplicity) in vals: 89 | out += multiplicity 90 | return [(out, 1)] 91 | 92 | return self.reduce(count_inner) 93 | 94 | def sum(self): 95 | """Produce the sum of all the values paired with a key, for all keys in the collection.""" 96 | 97 | def sum_inner(vals): 98 | out = 0 99 | for (val, multiplicity) in vals: 100 | out += val * multiplicity 101 | return [(out, 1)] 102 | 103 | return self.reduce(sum_inner) 104 | 105 | def min(self): 106 | """Produce the minimum value associated with each key in the collection. 107 | 108 | Note that no record may have negative multiplicity when computing the min, 109 | as it is unclear what exactly the minimum record is in that case. 
110 | """ 111 | 112 | def min_inner(vals): 113 | consolidated = defaultdict(int) 114 | for (val, multiplicity) in vals: 115 | consolidated[val] += multiplicity 116 | vals = [ 117 | (val, multiplicity) 118 | for (val, multiplicity) in consolidated.items() 119 | if multiplicity != 0 120 | ] 121 | if len(vals) != 0: 122 | out = vals[0][0] 123 | for (val, multiplicity) in vals: 124 | assert multiplicity > 0 125 | if val < out: 126 | out = val 127 | return [(out, 1)] 128 | else: 129 | return [] 130 | 131 | return self.reduce(min_inner) 132 | 133 | def max(self): 134 | """Produce the maximum value associated with each key in the collection. 135 | 136 | Note that no record may have negative multiplicity when computing the max, 137 | as it is unclear what exactly the maximum record is in that case. 138 | """ 139 | 140 | def max_inner(vals): 141 | consolidated = defaultdict(int) 142 | for (val, multiplicity) in vals: 143 | consolidated[val] += multiplicity 144 | vals = [ 145 | (val, multiplicity) 146 | for (val, multiplicity) in consolidated.items() 147 | if multiplicity != 0 148 | ] 149 | if len(vals) != 0: 150 | out = vals[0][0] 151 | for (val, multiplicity) in vals: 152 | assert multiplicity > 0 153 | if val > out: 154 | out = val 155 | return [(out, 1)] 156 | else: 157 | return [] 158 | 159 | return self.reduce(max_inner) 160 | 161 | def distinct(self): 162 | """Reduce the collection to a set of elements (from a multiset). 163 | 164 | Note that no record may have negative multiplicity when producing this set, 165 | as elements of sets may only have multiplicity one, and it is unclear that is 166 | an appropriate output for elements with negative multiplicity. 167 | """ 168 | 169 | def distinct_inner(vals): 170 | consolidated = defaultdict(int) 171 | for (val, multiplicity) in vals: 172 | consolidated[val] += multiplicity 173 | vals = [ 174 | (val, multiplicity) 175 | for (val, multiplicity) in consolidated.items() 176 | if multiplicity != 0 177 | ] 178 | for (val, multiplicity) in vals: 179 | assert multiplicity > 0 180 | return [(val, 1) for (val, _) in vals] 181 | 182 | return self.reduce(distinct_inner) 183 | 184 | def iterate(self, f): 185 | """Repeatedly invoke a function f on a collection, and return the result 186 | of applying the function an infinite number of times (fixedpoint). 187 | 188 | Note that if the function does not converge to a fixedpoint this implementation 189 | will run forever. 
190 | """ 191 | curr = Collection(self._inner) 192 | while True: 193 | result = f(curr) 194 | if result._inner == curr._inner: 195 | break 196 | curr = result 197 | return curr 198 | 199 | def _extend(self, other): 200 | self._inner.extend(other._inner) 201 | 202 | 203 | if __name__ == "__main__": 204 | a = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1)]) 205 | b = Collection( 206 | [ 207 | (("apple", "$3"), 1), 208 | (("apple", ("granny smith", "$2")), 1), 209 | (("kiwi", "$2"), 1), 210 | ] 211 | ) 212 | c = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1), (("apple", "$2"), 20)]) 213 | d = Collection( 214 | [(("apple", 11), 1), (("apple", 3), 2), (("banana", 2), 3), (("coconut", 3), 1)] 215 | ) 216 | e = Collection([(1, 1)]) 217 | 218 | print(a.concat(b)) 219 | print(a.join(b)) 220 | print(b.join(a)) 221 | print(a.filter(lambda data: data[0] != "apple")) 222 | print(a.map(lambda data: (data[1], data[0]))) 223 | print(a.concat(b).count()) 224 | print(a.concat(b).distinct()) 225 | print(c.min()) 226 | print(c.max()) 227 | print(d.sum()) 228 | 229 | def add_one(collection): 230 | return ( 231 | collection.map(lambda data: data + 1) 232 | .concat(collection) 233 | .filter(lambda data: data <= 5) 234 | .map(lambda data: (data, ())) 235 | .distinct() 236 | .map(lambda data: data[0]) 237 | .consolidate() 238 | ) 239 | 240 | result = e.iterate(add_one).map(lambda data: (data, data * data)) 241 | print(result) 242 | -------------------------------------------------------------------------------- /v1/difference_sequence.py: -------------------------------------------------------------------------------- 1 | """The implementation of a collection that changes as a sequence of difference 2 | collections describing each change. 3 | """ 4 | 5 | from collections import defaultdict 6 | from collection import Collection 7 | from index import Index 8 | from itertools import zip_longest 9 | 10 | 11 | class DifferenceSequence: 12 | """A collection that goes through a sequence of changes. 13 | 14 | Each change to the collection is described in a difference collection that 15 | describes the change between the current version of the collection and the 16 | previous version. 17 | 18 | This representation is designed for the case where the differences between 19 | consecutive versions in the sequence are small, and so storing the 20 | sequence of differences is both space efficient, and enables efficient 21 | computation of the sequence of output differences. 22 | """ 23 | 24 | def __init__(self, trace): 25 | self._inner = trace 26 | 27 | def __repr__(self): 28 | return f"DifferenceSequence({self._inner})" 29 | 30 | def map(self, f): 31 | """Apply a function to all records in the collection trace.""" 32 | return DifferenceSequence([collection.map(f) for collection in self._inner]) 33 | 34 | def filter(self, f): 35 | """Filter out records where f(record) evaluates to False from all 36 | collections in the collection trace. 37 | """ 38 | return DifferenceSequence([collection.filter(f) for collection in self._inner]) 39 | 40 | def negate(self): 41 | return DifferenceSequence([collection.negate() for collection in self._inner]) 42 | 43 | def concat(self, other): 44 | """Concatenate two collection traces together.""" 45 | inputs = zip_longest(self._inner, other._inner, fillvalue=Collection()) 46 | return DifferenceSequence([a.concat(b) for (a, b) in inputs]) 47 | 48 | def consolidate(self): 49 | """Produce a collection trace where each collection in the trace 50 | is consolidated. 
51 | """ 52 | out = [] 53 | for collection in self._inner: 54 | out.append(collection.consolidate()) 55 | 56 | return DifferenceSequence(out) 57 | 58 | def join(self, other): 59 | """Match pairs (k, v1) and (k, v2) from the two input collection 60 | traces and produce a collection trace containing the corresponding 61 | (k, (v1, v2)). 62 | """ 63 | index_a = Index() 64 | index_b = Index() 65 | out = [] 66 | 67 | for (collection_a, collection_b) in zip_longest( 68 | self._inner, other._inner, fillvalue=Collection() 69 | ): 70 | delta_a = Index() 71 | delta_b = Index() 72 | result = Collection() 73 | 74 | for ((key, value), multiplicity) in collection_a._inner: 75 | delta_a.add_value(key, (value, multiplicity)) 76 | for ((key, value), multiplicity) in collection_b._inner: 77 | delta_b.add_value(key, (value, multiplicity)) 78 | 79 | result._extend(delta_a.join(index_b)) 80 | index_a.append(delta_a) 81 | result._extend(index_a.join(delta_b)) 82 | index_b.append(delta_b) 83 | # Consolidating the output is not strictly necessary and is only done here to make the output easier to inspect visually. 84 | out.append(result.consolidate()) 85 | return DifferenceSequence(out) 86 | 87 | def reduce(self, f): 88 | """Apply a reduction function to all record values, grouped by key.""" 89 | 90 | def subtract_values(first, second): 91 | result = defaultdict(int) 92 | for (v1, m1) in first: 93 | result[v1] += m1 94 | for (v2, m2) in second: 95 | result[v2] -= m2 96 | 97 | return [ 98 | (val, multiplicity) 99 | for (val, multiplicity) in result.items() 100 | if multiplicity != 0 101 | ] 102 | 103 | index = Index() 104 | index_out = Index() 105 | keys_todo = defaultdict(set) 106 | output = [] 107 | 108 | for collection in self._inner: 109 | keys_todo = set() 110 | result = [] 111 | for ((key, value), multiplicity) in collection._inner: 112 | index.add_value(key, (value, multiplicity)) 113 | keys_todo.add(key) 114 | 115 | keys = [key for key in keys_todo] 116 | for key in keys: 117 | curr = index.get(key) 118 | curr_out = index_out.get(key) 119 | out = f(curr) 120 | delta = subtract_values(out, curr_out) 121 | for (value, multiplicity) in delta: 122 | result.append(((key, value), multiplicity)) 123 | index_out.add_value(key, (value, multiplicity)) 124 | output.append(Collection(result)) 125 | index.compact(keys) 126 | index_out.compact(keys) 127 | 128 | return DifferenceSequence(output) 129 | 130 | def count(self): 131 | """Count the number of times each key occurs in each collection in the collection 132 | trace. 133 | """ 134 | 135 | def count_inner(vals): 136 | out = 0 137 | for (_, diff) in vals: 138 | out += diff 139 | return [(out, 1)] 140 | 141 | return self.reduce(count_inner) 142 | 143 | def sum(self): 144 | """Produce the sum of all the values paired with each key, for each 145 | collection in the trace. 146 | """ 147 | 148 | def sum_inner(vals): 149 | out = 0 150 | for (val, diff) in vals: 151 | out += val * diff 152 | return [(out, 1)] 153 | 154 | return self.reduce(sum_inner) 155 | 156 | def min(self): 157 | """Produce the minimum value associated with each key, for each collection in 158 | the trace. 
159 | """ 160 | 161 | def min_inner(vals): 162 | consolidated = defaultdict(int) 163 | for (val, multiplicity) in vals: 164 | consolidated[val] += multiplicity 165 | vals = [ 166 | (val, multiplicity) 167 | for (val, multiplicity) in consolidated.items() 168 | if multiplicity != 0 169 | ] 170 | if len(vals) != 0: 171 | out = vals[0][0] 172 | for (val, multiplicity) in vals: 173 | assert multiplicity > 0 174 | if val < out: 175 | out = val 176 | return [(out, 1)] 177 | else: 178 | return [] 179 | 180 | return self.reduce(min_inner) 181 | 182 | def max(self): 183 | """Produce the minimum value associated with each key, for each collection in 184 | the trace. 185 | """ 186 | 187 | def max_inner(vals): 188 | consolidated = defaultdict(int) 189 | for (val, multiplicity) in vals: 190 | consolidated[val] += multiplicity 191 | vals = [ 192 | (val, multiplicity) 193 | for (val, multiplicity) in consolidated.items() 194 | if multiplicity != 0 195 | ] 196 | if len(vals) != 0: 197 | out = vals[0][0] 198 | for (val, multiplicity) in vals: 199 | assert multiplicity > 0 200 | if val > out: 201 | out = val 202 | return [(out, 1)] 203 | else: 204 | return [] 205 | 206 | return self.reduce(max_inner) 207 | 208 | def distinct(self): 209 | def distinct_inner(vals): 210 | consolidated = defaultdict(int) 211 | for (val, multiplicity) in vals: 212 | consolidated[val] += multiplicity 213 | vals = [ 214 | (val, multiplicity) 215 | for (val, multiplicity) in consolidated.items() 216 | if multiplicity != 0 217 | ] 218 | for (val, multiplicity) in vals: 219 | assert multiplicity > 0 220 | return [(val, 1) for (val, _) in vals] 221 | 222 | return self.reduce(distinct_inner) 223 | 224 | def iterate(self, f): 225 | """Return the fixpoint of repeatedly applying f to each collection in the trace.""" 226 | # TODO 227 | 228 | 229 | if __name__ == "__main__": 230 | a = Collection([(("apple", "$5"), 3), (("banana", "$2"), 1)]) 231 | b = Collection([(("apple", "$3"), 1), (("apple", "$2"), 1), (("kiwi", "$2"), 1)]) 232 | c = Collection([(("apple", "$5"), 2), (("banana", "$2"), 1), (("apple", "$2"), 20)]) 233 | d = Collection( 234 | [(("apple", 11), 1), (("apple", 3), 2), (("banana", 2), 3), (("coconut", 3), 1)] 235 | ) 236 | e = Collection([(1, 1)]) 237 | 238 | trace_a = DifferenceSequence( 239 | [ 240 | a, 241 | Collection([(("apple", "$5"), -1), (("apple", "$7"), 1)]), 242 | Collection([(("lemon", "$1"), 1)]), 243 | ] 244 | ) 245 | print(trace_a.map(lambda data: (data[1], data[0]))) 246 | print(trace_a.filter(lambda data: data[0] != "apple")) 247 | 248 | trace_b = DifferenceSequence( 249 | [ 250 | b, 251 | Collection([]), 252 | Collection([(("lemon", "$22"), 3), (("kiwi", "$1"), 2)]), 253 | ] 254 | ) 255 | print(trace_a.join(trace_b)) 256 | print(trace_a.join(trace_b).consolidate()) 257 | print(trace_a.min()) 258 | print(trace_a.max()) 259 | print(trace_a.distinct()) 260 | -------------------------------------------------------------------------------- /v2/differential_dataflow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from collection import Collection 4 | from graph import ( 5 | BinaryOperator, 6 | DifferenceStreamReader, 7 | DifferenceStreamWriter, 8 | Graph, 9 | UnaryOperator, 10 | ) 11 | from index import Index 12 | 13 | 14 | class DifferenceStreamBuilder: 15 | def __init__(self, graph): 16 | self._writer = DifferenceStreamWriter() 17 | self.graph = graph 18 | 19 | def connect_reader(self): 20 | return self._writer._new_reader() 21 
| 22 | def writer(self): 23 | return self._writer 24 | 25 | def map(self, f): 26 | output = DifferenceStreamBuilder(self.graph) 27 | operator = MapOperator( 28 | self.connect_reader(), 29 | output.writer(), 30 | f, 31 | ) 32 | self.graph.add_operator(operator) 33 | self.graph.add_stream(output.connect_reader()) 34 | return output 35 | 36 | def filter(self, f): 37 | output = DifferenceStreamBuilder(self.graph) 38 | operator = FilterOperator( 39 | self.connect_reader(), 40 | output.writer(), 41 | f, 42 | ) 43 | self.graph.add_operator(operator) 44 | self.graph.add_stream(output.connect_reader()) 45 | return output 46 | 47 | def negate(self): 48 | output = DifferenceStreamBuilder(self.graph) 49 | operator = NegateOperator( 50 | self.connect_reader(), 51 | output.writer(), 52 | ) 53 | self.graph.add_operator(operator) 54 | self.graph.add_stream(output.connect_reader()) 55 | return output 56 | 57 | def concat(self, other): 58 | assert id(self.graph) == id(other.graph) 59 | output = DifferenceStreamBuilder(self.graph) 60 | operator = ConcatOperator( 61 | self.connect_reader(), 62 | other.connect_reader(), 63 | output.writer(), 64 | ) 65 | self.graph.add_operator(operator) 66 | self.graph.add_stream(output.connect_reader()) 67 | return output 68 | 69 | def debug(self, name=""): 70 | output = DifferenceStreamBuilder(self.graph) 71 | operator = DebugOperator( 72 | self.connect_reader(), 73 | output.writer(), 74 | name, 75 | ) 76 | self.graph.add_operator(operator) 77 | self.graph.add_stream(output.connect_reader()) 78 | return output 79 | 80 | def join(self, other): 81 | assert id(self.graph) == id(other.graph) 82 | output = DifferenceStreamBuilder(self.graph) 83 | operator = JoinOperator( 84 | self.connect_reader(), 85 | other.connect_reader(), 86 | output.writer(), 87 | ) 88 | self.graph.add_operator(operator) 89 | self.graph.add_stream(output.connect_reader()) 90 | return output 91 | 92 | def count(self): 93 | output = DifferenceStreamBuilder(self.graph) 94 | operator = CountOperator( 95 | self.connect_reader(), 96 | output.writer(), 97 | ) 98 | self.graph.add_operator(operator) 99 | self.graph.add_stream(output.connect_reader()) 100 | return output 101 | 102 | 103 | class GraphBuilder: 104 | def __init__(self): 105 | self.streams = [] 106 | self.operators = [] 107 | 108 | def new_input(self): 109 | stream_builder = DifferenceStreamBuilder(self) 110 | self.streams.append(stream_builder.connect_reader()) 111 | return stream_builder, stream_builder.writer() 112 | 113 | def add_operator(self, operator): 114 | self.operators.append(operator) 115 | 116 | def add_stream(self, stream): 117 | self.streams.append(stream) 118 | 119 | def finalize(self): 120 | return Graph(self.streams, self.operators) 121 | 122 | 123 | class LinearUnaryOperator(UnaryOperator): 124 | def __init__(self, input_a, output, f): 125 | def inner(): 126 | for collection in self.input_messages(): 127 | self.output.send_data(f(collection)) 128 | 129 | super().__init__(input_a, output, inner) 130 | 131 | 132 | class MapOperator(LinearUnaryOperator): 133 | def __init__(self, input_a, output, f): 134 | def map_inner(collection): 135 | return collection.map(f) 136 | 137 | super().__init__(input_a, output, map_inner) 138 | 139 | 140 | class FilterOperator(LinearUnaryOperator): 141 | def __init__(self, input_a, output, f): 142 | def filter_inner(collection): 143 | return collection.filter(f) 144 | 145 | super().__init__(input_a, output, filter_inner) 146 | 147 | 148 | class NegateOperator(LinearUnaryOperator): 149 | def __init__(self, 
input_a, output): 150 | def negate_inner(collection): 151 | return collection.negate() 152 | 153 | super().__init__(input_a, output, negate_inner) 154 | 155 | 156 | class ConcatOperator(BinaryOperator): 157 | def __init__(self, input_a, input_b, output): 158 | self.input_a_pending = [] 159 | self.input_b_pending = [] 160 | 161 | def inner(): 162 | # This is not internally consistent! 163 | for collection in self.input_a_messages(): 164 | self.input_a_pending.append(collection) 165 | for collection in self.input_b_messages(): 166 | self.input_b_pending.append(collection) 167 | 168 | sent = 0 169 | for (collection_a, collection_b) in zip( 170 | self.input_a_pending, self.input_b_pending 171 | ): 172 | self.output.send_data(collection_a.concat(collection_b)) 173 | sent += 1 174 | if sent > 0: 175 | self.input_a_pending = self.input_a_pending[sent:] 176 | self.input_b_pending = self.input_b_pending[sent:] 177 | 178 | super().__init__(input_a, input_b, output, inner) 179 | 180 | 181 | class DebugOperator(UnaryOperator): 182 | def __init__(self, input_a, output, name): 183 | def inner(): 184 | for collection in self.input_messages(): 185 | print(f"debug {name} data: collection: {collection}") 186 | self.output.send_data(collection) 187 | 188 | super().__init__(input_a, output, inner) 189 | 190 | 191 | class JoinOperator(BinaryOperator): 192 | def __init__(self, input_a, input_b, output): 193 | self.index_a = Index() 194 | self.index_b = Index() 195 | self.input_a_pending = [] 196 | self.input_b_pending = [] 197 | 198 | def inner(): 199 | for collection in self.input_a_messages(): 200 | delta_a = Index() 201 | for ((key, value), multiplicity) in collection._inner: 202 | delta_a.add_value(key, (value, multiplicity)) 203 | self.input_a_pending.append(delta_a) 204 | for collection in self.input_b_messages(): 205 | delta_b = Index() 206 | for ((key, value), multiplicity) in collection._inner: 207 | delta_b.add_value(key, (value, multiplicity)) 208 | self.input_b_pending.append(delta_b) 209 | 210 | sent = 0 211 | for (delta_a, delta_b) in zip(self.input_a_pending, self.input_b_pending): 212 | result = Collection() 213 | result._extend(delta_a.join(self.index_b)) 214 | self.index_a.append(delta_a) 215 | result._extend(self.index_a.join(delta_b)) 216 | self.index_b.append(delta_b) 217 | self.output.send_data(result.consolidate()) 218 | sent += 1 219 | self.index_a.compact() 220 | self.index_b.compact() 221 | 222 | if sent > 0: 223 | self.input_a_pending = self.input_a_pending[sent:] 224 | self.input_b_pending = self.input_b_pending[sent:] 225 | 226 | super().__init__(input_a, input_b, output, inner) 227 | 228 | 229 | class ReduceOperator(UnaryOperator): 230 | def __init__(self, input_a, output, f): 231 | self.index = Index() 232 | self.index_out = Index() 233 | 234 | def subtract_values(first, second): 235 | result = defaultdict(int) 236 | for (v1, m1) in first: 237 | result[v1] += m1 238 | for (v2, m2) in second: 239 | result[v2] -= m2 240 | 241 | return [ 242 | (val, multiplicity) 243 | for (val, multiplicity) in result.items() 244 | if multiplicity != 0 245 | ] 246 | 247 | def inner(): 248 | for collection in self.input_messages(): 249 | keys_todo = set() 250 | result = [] 251 | for ((key, value), multiplicity) in collection._inner: 252 | self.index.add_value(key, (value, multiplicity)) 253 | keys_todo.add(key) 254 | keys = [key for key in keys_todo] 255 | for key in keys: 256 | curr = self.index.get(key) 257 | curr_out = self.index_out.get(key) 258 | out = f(curr) 259 | delta = subtract_values(out, 
curr_out) 260 | for (value, multiplicity) in delta: 261 | result.append(((key, value), multiplicity)) 262 | self.index_out.add_value(key, (value, multiplicity)) 263 | self.output.send_data(Collection(result)) 264 | self.index.compact(keys) 265 | self.index_out.compact(keys) 266 | 267 | super().__init__(input_a, output, inner) 268 | 269 | 270 | class CountOperator(ReduceOperator): 271 | def __init__(self, input_a, output): 272 | def count_inner(vals): 273 | out = 0 274 | for (_, diff) in vals: 275 | out += diff 276 | return [(out, 1)] 277 | 278 | super().__init__(input_a, output, count_inner) 279 | 280 | 281 | if __name__ == "__main__": 282 | graph_builder = GraphBuilder() 283 | input_a, input_a_writer = graph_builder.new_input() 284 | output = input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 285 | input_a.negate().concat(output).debug("output") 286 | graph = graph_builder.finalize() 287 | 288 | for i in range(0, 10): 289 | input_a_writer.send_data(Collection([(i, 1)])) 290 | graph.step() 291 | graph_builder = GraphBuilder() 292 | input_a, input_a_writer = graph_builder.new_input() 293 | input_b, input_b_writer = graph_builder.new_input() 294 | 295 | output = input_a.join(input_b).count().debug("count") 296 | graph = graph_builder.finalize() 297 | 298 | for i in range(0, 10): 299 | input_a_writer.send_data(Collection([((1, i), 2)])) 300 | input_a_writer.send_data(Collection([((2, i), 2)])) 301 | input_b_writer.send_data(Collection([((1, i + 2), 2)])) 302 | input_b_writer.send_data(Collection([((2, i + 3), 2)])) 303 | graph.step() 304 | graph.step() 305 | -------------------------------------------------------------------------------- /v3/differential_dataflow.py: -------------------------------------------------------------------------------- 1 | """An implementation of differential dataflow specialized for the setting where versions (times) are 2 | integers. This implementation supports all differential operations except iterate. 
3 | """ 4 | 5 | from collections import defaultdict 6 | 7 | from collection import Collection 8 | from graph import ( 9 | BinaryOperator, 10 | DifferenceStreamReader, 11 | DifferenceStreamWriter, 12 | Graph, 13 | MessageType, 14 | UnaryOperator, 15 | ) 16 | from index import Index 17 | 18 | 19 | class DifferenceStreamBuilder: 20 | def __init__(self, graph): 21 | self._writer = DifferenceStreamWriter() 22 | self.graph = graph 23 | 24 | def connect_reader(self): 25 | return self._writer._new_reader() 26 | 27 | def writer(self): 28 | return self._writer 29 | 30 | def map(self, f): 31 | output = DifferenceStreamBuilder(self.graph) 32 | operator = MapOperator( 33 | self.connect_reader(), output.writer(), f, self.graph.frontier() 34 | ) 35 | self.graph.add_operator(operator) 36 | self.graph.add_stream(output.connect_reader()) 37 | return output 38 | 39 | def filter(self, f): 40 | output = DifferenceStreamBuilder(self.graph) 41 | operator = FilterOperator( 42 | self.connect_reader(), output.writer(), f, self.graph.frontier() 43 | ) 44 | self.graph.add_operator(operator) 45 | self.graph.add_stream(output.connect_reader()) 46 | return output 47 | 48 | def negate(self): 49 | output = DifferenceStreamBuilder(self.graph) 50 | operator = NegateOperator( 51 | self.connect_reader(), output.writer(), self.graph.frontier() 52 | ) 53 | self.graph.add_operator(operator) 54 | self.graph.add_stream(output.connect_reader()) 55 | return output 56 | 57 | def concat(self, other): 58 | assert id(self.graph) == id(other.graph) 59 | output = DifferenceStreamBuilder(self.graph) 60 | operator = ConcatOperator( 61 | self.connect_reader(), 62 | other.connect_reader(), 63 | output.writer(), 64 | self.graph.frontier(), 65 | ) 66 | self.graph.add_operator(operator) 67 | self.graph.add_stream(output.connect_reader()) 68 | return output 69 | 70 | def debug(self, name=""): 71 | output = DifferenceStreamBuilder(self.graph) 72 | operator = DebugOperator( 73 | self.connect_reader(), output.writer(), name, self.graph.frontier() 74 | ) 75 | self.graph.add_operator(operator) 76 | self.graph.add_stream(output.connect_reader()) 77 | return output 78 | 79 | def join(self, other): 80 | assert id(self.graph) == id(other.graph) 81 | output = DifferenceStreamBuilder(self.graph) 82 | operator = JoinOperator( 83 | self.connect_reader(), 84 | other.connect_reader(), 85 | output.writer(), 86 | self.graph.frontier(), 87 | ) 88 | self.graph.add_operator(operator) 89 | self.graph.add_stream(output.connect_reader()) 90 | return output 91 | 92 | def count(self): 93 | output = DifferenceStreamBuilder(self.graph) 94 | operator = CountOperator( 95 | self.connect_reader(), output.writer(), self.graph.frontier() 96 | ) 97 | self.graph.add_operator(operator) 98 | self.graph.add_stream(output.connect_reader()) 99 | return output 100 | 101 | 102 | class GraphBuilder: 103 | def __init__(self, initial_frontier): 104 | self.streams = [] 105 | self.operators = [] 106 | self.initial_frontier = initial_frontier 107 | 108 | def new_input(self): 109 | stream_builder = DifferenceStreamBuilder(self) 110 | self.streams.append(stream_builder.connect_reader()) 111 | return stream_builder, stream_builder.writer() 112 | 113 | def add_operator(self, operator): 114 | self.operators.append(operator) 115 | 116 | def add_stream(self, stream): 117 | self.streams.append(stream) 118 | 119 | def frontier(self): 120 | return self.initial_frontier 121 | 122 | def finalize(self): 123 | return Graph(self.streams, self.operators) 124 | 125 | 126 | class 
LinearUnaryOperator(UnaryOperator): 127 | def __init__(self, input_a, output, f, initial_frontier): 128 | def inner(): 129 | for (typ, msg) in self.input_messages(): 130 | if typ == MessageType.DATA: 131 | version, collection = msg 132 | self.output.send_data(version, f(collection)) 133 | elif typ == MessageType.FRONTIER: 134 | frontier = msg 135 | self.set_input_frontier(frontier) 136 | 137 | if self.input_frontier() > self.output_frontier: 138 | self.output_frontier = self.input_frontier() 139 | self.output.send_frontier(self.output_frontier) 140 | 141 | super().__init__(input_a, output, inner, initial_frontier) 142 | 143 | 144 | class MapOperator(LinearUnaryOperator): 145 | def __init__(self, input_a, output, f, initial_frontier): 146 | def map_inner(collection): 147 | return collection.map(f) 148 | 149 | super().__init__(input_a, output, map_inner, initial_frontier) 150 | 151 | 152 | class FilterOperator(LinearUnaryOperator): 153 | def __init__(self, input_a, output, f, initial_frontier): 154 | def filter_inner(collection): 155 | return collection.filter(f) 156 | 157 | super().__init__(input_a, output, filter_inner, initial_frontier) 158 | 159 | 160 | class NegateOperator(LinearUnaryOperator): 161 | def __init__(self, input_a, output, initial_frontier): 162 | def negate_inner(collection): 163 | return collection.negate() 164 | 165 | super().__init__(input_a, output, negate_inner, initial_frontier) 166 | 167 | 168 | class ConcatOperator(BinaryOperator): 169 | def __init__(self, input_a, input_b, output, initial_frontier): 170 | def inner(): 171 | for (typ, msg) in self.input_a_messages(): 172 | if typ == MessageType.DATA: 173 | version, collection = msg 174 | self.output.send_data(version, collection) 175 | elif typ == MessageType.FRONTIER: 176 | frontier = msg 177 | self.set_input_a_frontier(frontier) 178 | for (typ, msg) in self.input_b_messages(): 179 | if typ == MessageType.DATA: 180 | version, collection = msg 181 | self.output.send_data(version, collection) 182 | elif typ == MessageType.FRONTIER: 183 | frontier = msg 184 | self.set_input_b_frontier(frontier) 185 | 186 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 187 | if min_input_frontier > self.output_frontier: 188 | self.output_frontier = min_input_frontier 189 | self.output.send_frontier(self.output_frontier) 190 | 191 | super().__init__(input_a, input_b, output, inner, initial_frontier) 192 | 193 | 194 | class DebugOperator(UnaryOperator): 195 | def __init__(self, input_a, output, name, initial_frontier): 196 | def inner(): 197 | for (typ, msg) in self.input_messages(): 198 | if typ == MessageType.DATA: 199 | version, collection = msg 200 | print( 201 | f"debug {name} data: version: {version} collection: {collection}" 202 | ) 203 | self.output.send_data(version, collection) 204 | elif typ == MessageType.FRONTIER: 205 | frontier = msg 206 | assert self.input_frontier() <= frontier 207 | self.set_input_frontier(frontier) 208 | print(f"debug {name} notification: frontier {frontier}") 209 | assert self.output_frontier <= self.input_frontier() 210 | if self.output_frontier < self.input_frontier(): 211 | self.output_frontier = self.input_frontier() 212 | self.output.send_frontier(self.output_frontier) 213 | 214 | super().__init__(input_a, output, inner, initial_frontier) 215 | 216 | 217 | class JoinOperator(BinaryOperator): 218 | def __init__(self, input_a, input_b, output, initial_frontier): 219 | self.index_a = Index() 220 | self.index_b = Index() 221 | 222 | def inner(): 223 | delta_a = Index() 224 
| delta_b = Index() 225 | for (typ, msg) in self.input_a_messages(): 226 | if typ == MessageType.DATA: 227 | version, collection = msg 228 | for ((key, value), multiplicity) in collection._inner: 229 | delta_a.add_value(key, version, (value, multiplicity)) 230 | elif typ == MessageType.FRONTIER: 231 | frontier = msg 232 | self.set_input_a_frontier(msg) 233 | for (typ, msg) in self.input_b_messages(): 234 | if typ == MessageType.DATA: 235 | version, collection = msg 236 | for ((key, value), multiplicity) in collection._inner: 237 | delta_b.add_value(key, version, (value, multiplicity)) 238 | elif typ == MessageType.FRONTIER: 239 | frontier = msg 240 | self.set_input_b_frontier(frontier) 241 | 242 | results = defaultdict(Collection) 243 | for (version, collection) in delta_a.join(self.index_b): 244 | results[version]._extend(collection) 245 | 246 | self.index_a.append(delta_a) 247 | 248 | for (version, collection) in self.index_a.join(delta_b): 249 | results[version]._extend(collection) 250 | 251 | for (version, collection) in results.items(): 252 | self.output.send_data(version, collection) 253 | self.index_b.append(delta_b) 254 | 255 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 256 | if min_input_frontier > self.output_frontier: 257 | self.output_frontier = min_input_frontier 258 | self.output.send_frontier(self.output_frontier) 259 | self.index_a.compact(self.output_frontier) 260 | self.index_b.compact(self.output_frontier) 261 | 262 | super().__init__(input_a, input_b, output, inner, initial_frontier) 263 | 264 | 265 | class ReduceOperator(UnaryOperator): 266 | def __init__(self, input_a, output, f, initial_frontier): 267 | self.index = Index() 268 | self.index_out = Index() 269 | self.keys_todo = defaultdict(set) 270 | 271 | def subtract_values(first, second): 272 | result = defaultdict(int) 273 | for (v1, m1) in first: 274 | result[v1] += m1 275 | for (v2, m2) in second: 276 | result[v2] -= m2 277 | 278 | return [ 279 | (val, multiplicity) 280 | for (val, multiplicity) in result.items() 281 | if multiplicity != 0 282 | ] 283 | 284 | def inner(): 285 | for (typ, msg) in self.input_messages(): 286 | if typ == MessageType.DATA: 287 | version, collection = msg 288 | for ((key, value), multiplicity) in collection._inner: 289 | self.index.add_value(key, version, (value, multiplicity)) 290 | self.keys_todo[version].add(key) 291 | elif typ == MessageType.FRONTIER: 292 | frontier = msg 293 | self.set_input_frontier(frontier) 294 | 295 | finished_versions = [ 296 | version 297 | for version in self.keys_todo.keys() 298 | if version < self.input_frontier() 299 | ] 300 | 301 | finished_versions.sort() 302 | for version in finished_versions: 303 | keys = self.keys_todo.pop(version) 304 | result = [] 305 | for key in keys: 306 | curr = self.index.reconstruct_at(key, version) 307 | curr_out = self.index_out.reconstruct_at(key, version) 308 | out = f(curr) 309 | delta = subtract_values(out, curr_out) 310 | for (value, multiplicity) in delta: 311 | result.append(((key, value), multiplicity)) 312 | self.index_out.add_value(key, version, (value, multiplicity)) 313 | if result != []: 314 | self.output.send_data(version, Collection(result)) 315 | 316 | if self.input_frontier() > self.output_frontier: 317 | self.output_frontier = self.input_frontier() 318 | self.output.send_frontier(self.output_frontier) 319 | self.index.compact(self.output_frontier) 320 | self.index_out.compact(self.output_frontier) 321 | 322 | super().__init__(input_a, output, inner, initial_frontier) 323 | 
324 | 325 | class CountOperator(ReduceOperator): 326 | def __init__(self, input_a, output, initial_frontier): 327 | def count_inner(vals): 328 | out = 0 329 | for (_, diff) in vals: 330 | out += diff 331 | return [(out, 1)] 332 | 333 | super().__init__(input_a, output, count_inner, initial_frontier) 334 | 335 | 336 | if __name__ == "__main__": 337 | graph_builder = GraphBuilder(0) 338 | input_a, input_a_writer = graph_builder.new_input() 339 | output = input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 340 | input_a.negate().concat(output).debug("output") 341 | graph = graph_builder.finalize() 342 | 343 | for i in range(0, 10): 344 | input_a_writer.send_data(i, Collection([(i, 1)])) 345 | input_a_writer.send_frontier(i) 346 | graph.step() 347 | graph_builder = GraphBuilder(0) 348 | input_a, input_a_writer = graph_builder.new_input() 349 | input_b, input_b_writer = graph_builder.new_input() 350 | 351 | output = input_a.join(input_b).count().debug("count") 352 | graph = graph_builder.finalize() 353 | 354 | for i in range(0, 10): 355 | input_a_writer.send_data(i, Collection([((1, i), 2)])) 356 | input_a_writer.send_data(i, Collection([((2, i), 2)])) 357 | input_b_writer.send_data(i, Collection([((1, i + 2), 2)])) 358 | input_b_writer.send_data(i, Collection([((2, i + 3), 2)])) 359 | input_a_writer.send_frontier(i) 360 | input_b_writer.send_frontier(i) 361 | graph.step() 362 | input_a_writer.send_frontier(11) 363 | input_b_writer.send_frontier(11) 364 | graph.step() 365 | -------------------------------------------------------------------------------- /v4/differential_dataflow.py: -------------------------------------------------------------------------------- 1 | """An implementation of differential dataflow specialized for the setting where versions (times) are 2 | integer tuples totally ordered lexicographically. This implementation supports all differential operations. 
3 | """ 4 | 5 | from collections import defaultdict 6 | 7 | from collection import Collection 8 | from graph import ( 9 | BinaryOperator, 10 | DifferenceStreamReader, 11 | DifferenceStreamWriter, 12 | Graph, 13 | MessageType, 14 | UnaryOperator, 15 | ) 16 | from index import Index 17 | from version import Version 18 | 19 | ITERATION_LIMIT = 100 20 | 21 | 22 | class DifferenceStreamBuilder: 23 | def __init__(self, graph): 24 | self._writer = DifferenceStreamWriter() 25 | self.graph = graph 26 | 27 | def connect_reader(self): 28 | return self._writer._new_reader() 29 | 30 | def writer(self): 31 | return self._writer 32 | 33 | def map(self, f): 34 | output = DifferenceStreamBuilder(self.graph) 35 | operator = MapOperator( 36 | self.connect_reader(), output.writer(), f, self.graph.frontier() 37 | ) 38 | self.graph.add_operator(operator) 39 | self.graph.add_stream(output.connect_reader()) 40 | return output 41 | 42 | def filter(self, f): 43 | output = DifferenceStreamBuilder(self.graph) 44 | operator = FilterOperator( 45 | self.connect_reader(), output.writer(), f, self.graph.frontier() 46 | ) 47 | self.graph.add_operator(operator) 48 | self.graph.add_stream(output.connect_reader()) 49 | return output 50 | 51 | def negate(self): 52 | output = DifferenceStreamBuilder(self.graph) 53 | operator = NegateOperator( 54 | self.connect_reader(), output.writer(), self.graph.frontier() 55 | ) 56 | self.graph.add_operator(operator) 57 | self.graph.add_stream(output.connect_reader()) 58 | return output 59 | 60 | def concat(self, other): 61 | assert id(self.graph) == id(other.graph) 62 | output = DifferenceStreamBuilder(self.graph) 63 | operator = ConcatOperator( 64 | self.connect_reader(), 65 | other.connect_reader(), 66 | output.writer(), 67 | self.graph.frontier(), 68 | ) 69 | self.graph.add_operator(operator) 70 | self.graph.add_stream(output.connect_reader()) 71 | return output 72 | 73 | def debug(self, name=""): 74 | output = DifferenceStreamBuilder(self.graph) 75 | operator = DebugOperator( 76 | self.connect_reader(), output.writer(), name, self.graph.frontier() 77 | ) 78 | self.graph.add_operator(operator) 79 | self.graph.add_stream(output.connect_reader()) 80 | return output 81 | 82 | def join(self, other): 83 | assert id(self.graph) == id(other.graph) 84 | output = DifferenceStreamBuilder(self.graph) 85 | operator = JoinOperator( 86 | self.connect_reader(), 87 | other.connect_reader(), 88 | output.writer(), 89 | self.graph.frontier(), 90 | ) 91 | self.graph.add_operator(operator) 92 | self.graph.add_stream(output.connect_reader()) 93 | return output 94 | 95 | def count(self): 96 | output = DifferenceStreamBuilder(self.graph) 97 | operator = CountOperator( 98 | self.connect_reader(), output.writer(), self.graph.frontier() 99 | ) 100 | self.graph.add_operator(operator) 101 | self.graph.add_stream(output.connect_reader()) 102 | return output 103 | 104 | def consolidate(self): 105 | output = DifferenceStreamBuilder(self.graph) 106 | operator = ConsolidateOperator( 107 | self.connect_reader(), output.writer(), self.graph.frontier() 108 | ) 109 | self.graph.add_operator(operator) 110 | self.graph.add_stream(output.connect_reader()) 111 | return output 112 | 113 | def distinct(self): 114 | output = DifferenceStreamBuilder(self.graph) 115 | operator = DistinctOperator( 116 | self.connect_reader(), output.writer(), self.graph.frontier() 117 | ) 118 | self.graph.add_operator(operator) 119 | self.graph.add_stream(output.connect_reader()) 120 | return output 121 | 122 | def _start_scope(self): 123 | 
new_frontier = self.graph.frontier().extend() 124 | self.graph.push_frontier(new_frontier) 125 | 126 | def _end_scope(self): 127 | self.graph.pop_frontier() 128 | 129 | def _ingress(self): 130 | output = DifferenceStreamBuilder(self.graph) 131 | operator = IngressOperator( 132 | self.connect_reader(), 133 | output.writer(), 134 | ITERATION_LIMIT, 135 | self.graph.frontier(), 136 | ) 137 | self.graph.add_operator(operator) 138 | self.graph.add_stream(output.connect_reader()) 139 | return output 140 | 141 | def _egress(self): 142 | output = DifferenceStreamBuilder(self.graph) 143 | operator = EgressOperator( 144 | self.connect_reader(), output.writer(), self.graph.frontier() 145 | ) 146 | self.graph.add_operator(operator) 147 | self.graph.add_stream(output.connect_reader()) 148 | return output 149 | 150 | def iterate(self, f): 151 | self._start_scope() 152 | feedback_stream = DifferenceStreamBuilder(self.graph) 153 | entered = self._ingress().concat(feedback_stream) 154 | result = f(entered) 155 | feedback_operator = FeedbackOperator( 156 | result.connect_reader(), 157 | 1, 158 | ITERATION_LIMIT, 159 | feedback_stream.writer(), 160 | self.graph.frontier(), 161 | ) 162 | self.graph.add_stream(feedback_stream) 163 | self.graph.add_operator(feedback_operator) 164 | self._end_scope() 165 | return result._egress() 166 | 167 | 168 | class GraphBuilder: 169 | def __init__(self, initial_frontier): 170 | self.streams = [] 171 | self.operators = [] 172 | self.frontier_stack = [initial_frontier] 173 | 174 | def new_input(self): 175 | stream_builder = DifferenceStreamBuilder(self) 176 | self.streams.append(stream_builder.connect_reader()) 177 | return stream_builder, stream_builder.writer() 178 | 179 | def add_operator(self, operator): 180 | self.operators.append(operator) 181 | 182 | def add_stream(self, stream): 183 | self.streams.append(stream) 184 | 185 | def frontier(self): 186 | return self.frontier_stack[-1] 187 | 188 | def push_frontier(self, new_frontier): 189 | self.frontier_stack.append(new_frontier) 190 | 191 | def pop_frontier(self): 192 | self.frontier_stack.pop() 193 | 194 | def finalize(self): 195 | return Graph(self.streams, self.operators) 196 | 197 | 198 | class LinearUnaryOperator(UnaryOperator): 199 | def __init__(self, input_a, output, f, initial_frontier): 200 | def inner(): 201 | for (typ, msg) in self.input_messages(): 202 | if typ == MessageType.DATA: 203 | version, collection = msg 204 | self.output.send_data(version, f(collection)) 205 | elif typ == MessageType.FRONTIER: 206 | frontier = msg 207 | self.set_input_frontier(frontier) 208 | 209 | if self.input_frontier() > self.output_frontier: 210 | self.output_frontier = self.input_frontier() 211 | self.output.send_frontier(self.output_frontier) 212 | 213 | super().__init__(input_a, output, inner, initial_frontier) 214 | 215 | 216 | class MapOperator(LinearUnaryOperator): 217 | def __init__(self, input_a, output, f, initial_frontier): 218 | def map_inner(collection): 219 | return collection.map(f) 220 | 221 | super().__init__(input_a, output, map_inner, initial_frontier) 222 | 223 | 224 | class FilterOperator(LinearUnaryOperator): 225 | def __init__(self, input_a, output, f, initial_frontier): 226 | def filter_inner(collection): 227 | return collection.filter(f) 228 | 229 | super().__init__(input_a, output, filter_inner, initial_frontier) 230 | 231 | 232 | class NegateOperator(LinearUnaryOperator): 233 | def __init__(self, input_a, output, initial_frontier): 234 | def negate_inner(collection): 235 | return collection.negate() 
236 | 237 | super().__init__(input_a, output, negate_inner, initial_frontier) 238 | 239 | 240 | class ConcatOperator(BinaryOperator): 241 | def __init__(self, input_a, input_b, output, initial_frontier): 242 | def inner(): 243 | for (typ, msg) in self.input_a_messages(): 244 | if typ == MessageType.DATA: 245 | version, collection = msg 246 | self.output.send_data(version, collection) 247 | elif typ == MessageType.FRONTIER: 248 | frontier = msg 249 | self.set_input_a_frontier(frontier) 250 | for (typ, msg) in self.input_b_messages(): 251 | if typ == MessageType.DATA: 252 | version, collection = msg 253 | self.output.send_data(version, collection) 254 | elif typ == MessageType.FRONTIER: 255 | frontier = msg 256 | self.set_input_b_frontier(frontier) 257 | 258 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 259 | if min_input_frontier > self.output_frontier: 260 | self.output_frontier = min_input_frontier 261 | self.output.send_frontier(self.output_frontier) 262 | 263 | super().__init__(input_a, input_b, output, inner, initial_frontier) 264 | 265 | 266 | class DebugOperator(UnaryOperator): 267 | def __init__(self, input_a, output, name, initial_frontier): 268 | def inner(): 269 | for (typ, msg) in self.input_messages(): 270 | if typ == MessageType.DATA: 271 | version, collection = msg 272 | print( 273 | f"debug {name} data: version: {version} collection: {collection}" 274 | ) 275 | self.output.send_data(version, collection) 276 | elif typ == MessageType.FRONTIER: 277 | frontier = msg 278 | assert self.input_frontier() <= frontier 279 | self.set_input_frontier(frontier) 280 | print(f"debug {name} notification: frontier {frontier}") 281 | assert self.output_frontier <= self.input_frontier() 282 | if self.output_frontier < self.input_frontier(): 283 | self.output_frontier = self.input_frontier() 284 | self.output.send_frontier(self.output_frontier) 285 | 286 | super().__init__(input_a, output, inner, initial_frontier) 287 | 288 | 289 | class ConsolidateOperator(UnaryOperator): 290 | def __init__(self, input_a, output, initial_frontier): 291 | self.collections = defaultdict(Collection) 292 | 293 | def inner(): 294 | for (typ, msg) in self.input_messages(): 295 | if typ == MessageType.DATA: 296 | version, collection = msg 297 | self.collections[version]._extend(collection) 298 | elif typ == MessageType.FRONTIER: 299 | frontier = msg 300 | assert self.input_frontier() <= frontier 301 | self.set_input_frontier(frontier) 302 | finished_versions = [ 303 | version 304 | for version in self.collections.keys() 305 | if version < self.input_frontier() 306 | ] 307 | for version in finished_versions: 308 | collection = self.collections.pop(version).consolidate() 309 | self.output.send_data(version, collection) 310 | assert self.output_frontier <= self.input_frontier() 311 | if self.output_frontier < self.input_frontier(): 312 | self.output_frontier = self.input_frontier() 313 | self.output.send_frontier(self.output_frontier) 314 | 315 | super().__init__(input_a, output, inner, initial_frontier) 316 | 317 | 318 | class JoinOperator(BinaryOperator): 319 | def __init__(self, input_a, input_b, output, initial_frontier): 320 | self.index_a = Index() 321 | self.index_b = Index() 322 | 323 | def inner(): 324 | delta_a = Index() 325 | delta_b = Index() 326 | for (typ, msg) in self.input_a_messages(): 327 | if typ == MessageType.DATA: 328 | version, collection = msg 329 | for ((key, value), multiplicity) in collection._inner: 330 | delta_a.add_value(key, version, (value, multiplicity)) 
331 | elif typ == MessageType.FRONTIER: 332 | frontier = msg 333 | self.set_input_a_frontier(msg) 334 | for (typ, msg) in self.input_b_messages(): 335 | if typ == MessageType.DATA: 336 | version, collection = msg 337 | for ((key, value), multiplicity) in collection._inner: 338 | delta_b.add_value(key, version, (value, multiplicity)) 339 | elif typ == MessageType.FRONTIER: 340 | frontier = msg 341 | self.set_input_b_frontier(frontier) 342 | 343 | results = defaultdict(Collection) 344 | for (version, collection) in delta_a.join(self.index_b): 345 | results[version]._extend(collection) 346 | 347 | self.index_a.append(delta_a) 348 | 349 | for (version, collection) in self.index_a.join(delta_b): 350 | results[version]._extend(collection) 351 | 352 | for (version, collection) in results.items(): 353 | self.output.send_data(version, collection) 354 | self.index_b.append(delta_b) 355 | 356 | min_input_frontier = min(self.input_a_frontier(), self.input_b_frontier()) 357 | if min_input_frontier > self.output_frontier: 358 | self.output_frontier = min_input_frontier 359 | self.output.send_frontier(self.output_frontier) 360 | self.index_a.compact(self.output_frontier) 361 | self.index_b.compact(self.output_frontier) 362 | 363 | super().__init__(input_a, input_b, output, inner, initial_frontier) 364 | 365 | 366 | class ReduceOperator(UnaryOperator): 367 | def __init__(self, input_a, output, f, initial_frontier): 368 | self.index = Index() 369 | self.index_out = Index() 370 | self.keys_todo = defaultdict(set) 371 | 372 | def subtract_values(first, second): 373 | result = defaultdict(int) 374 | for (v1, m1) in first: 375 | result[v1] += m1 376 | for (v2, m2) in second: 377 | result[v2] -= m2 378 | 379 | return [ 380 | (val, multiplicity) 381 | for (val, multiplicity) in result.items() 382 | if multiplicity != 0 383 | ] 384 | 385 | def inner(): 386 | for (typ, msg) in self.input_messages(): 387 | if typ == MessageType.DATA: 388 | version, collection = msg 389 | for ((key, value), multiplicity) in collection._inner: 390 | self.index.add_value(key, version, (value, multiplicity)) 391 | self.keys_todo[version].add(key) 392 | elif typ == MessageType.FRONTIER: 393 | frontier = msg 394 | self.set_input_frontier(frontier) 395 | 396 | finished_versions = [ 397 | version 398 | for version in self.keys_todo.keys() 399 | if version < self.input_frontier() 400 | ] 401 | 402 | finished_versions.sort() 403 | for version in finished_versions: 404 | keys = self.keys_todo.pop(version) 405 | result = [] 406 | for key in keys: 407 | curr = self.index.reconstruct_at(key, version) 408 | curr_out = self.index_out.reconstruct_at(key, version) 409 | out = f(curr) 410 | delta = subtract_values(out, curr_out) 411 | for (value, multiplicity) in delta: 412 | result.append(((key, value), multiplicity)) 413 | self.index_out.add_value(key, version, (value, multiplicity)) 414 | if result != []: 415 | self.output.send_data(version, Collection(result)) 416 | 417 | if self.input_frontier() > self.output_frontier: 418 | self.output_frontier = self.input_frontier() 419 | self.output.send_frontier(self.output_frontier) 420 | self.index.compact(self.output_frontier) 421 | self.index_out.compact(self.output_frontier) 422 | 423 | super().__init__(input_a, output, inner, initial_frontier) 424 | 425 | 426 | class CountOperator(ReduceOperator): 427 | def __init__(self, input_a, output, initial_frontier): 428 | def count_inner(vals): 429 | out = 0 430 | for (_, diff) in vals: 431 | out += diff 432 | return [(out, 1)] 433 | 434 | 
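        # count_inner sums the multiplicities of all values observed for one key
        # and reports a single (count, 1) record; for example, given the per-key
        # values [("a", 2), ("b", 3)] it returns [(5, 1)]. ReduceOperator then
        # emits only the difference between this output and the previously
        # reported count for that key.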
super().__init__(input_a, output, count_inner, initial_frontier) 435 | 436 | 437 | class DistinctOperator(ReduceOperator): 438 | def __init__(self, input_a, output, initial_frontier): 439 | def distinct_inner(vals): 440 | consolidated = defaultdict(int) 441 | for (val, diff) in vals: 442 | consolidated[val] += diff 443 | for (val, diff) in consolidated.items(): 444 | assert diff >= 0 445 | return [(val, 1) for (val, diff) in consolidated.items() if diff != 0] 446 | 447 | super().__init__(input_a, output, distinct_inner, initial_frontier) 448 | 449 | 450 | class FeedbackOperator(UnaryOperator): 451 | def __init__(self, input_a, step, iteration_limit, output, initial_frontier): 452 | def inner(): 453 | for (typ, msg) in self.input_messages(): 454 | if typ == MessageType.DATA: 455 | version, collection = msg 456 | if version.inner[-1] < iteration_limit: 457 | self.output.send_data( 458 | version.apply_step(step, iteration_limit), collection 459 | ) 460 | elif typ == MessageType.FRONTIER: 461 | frontier = msg 462 | assert self.input_frontier() <= frontier 463 | self.set_input_frontier(frontier) 464 | 465 | candidate_output_frontier = self.input_frontier().apply_step( 466 | step, iteration_limit 467 | ) 468 | assert self.output_frontier <= candidate_output_frontier 469 | if self.output_frontier < candidate_output_frontier: 470 | self.output_frontier = candidate_output_frontier 471 | self.output.send_frontier(self.output_frontier) 472 | 473 | super().__init__(input_a, output, inner, initial_frontier) 474 | 475 | def connect_loop(output): 476 | self.output = output 477 | 478 | 479 | class IngressOperator(UnaryOperator): 480 | def __init__(self, input_a, output, iteration_limit, initial_frontier): 481 | def inner(): 482 | for (typ, msg) in self.input_messages(): 483 | if typ == MessageType.DATA: 484 | version, collection = msg 485 | new_version = version.extend() 486 | self.output.send_data(new_version, collection) 487 | self.output.send_data( 488 | new_version.apply_step(1, iteration_limit), collection.negate() 489 | ) 490 | elif typ == MessageType.FRONTIER: 491 | frontier = msg 492 | new_frontier = frontier.extend() 493 | assert self.input_frontier() <= new_frontier 494 | self.set_input_frontier(new_frontier) 495 | 496 | assert self.output_frontier <= self.input_frontier() 497 | if self.output_frontier < self.input_frontier(): 498 | self.output_frontier = self.input_frontier() 499 | self.output.send_frontier(self.output_frontier) 500 | 501 | super().__init__(input_a, output, inner, initial_frontier) 502 | 503 | 504 | class EgressOperator(UnaryOperator): 505 | def __init__(self, input_a, output, initial_frontier): 506 | def inner(): 507 | for (typ, msg) in self.input_messages(): 508 | if typ == MessageType.DATA: 509 | version, collection = msg 510 | new_version = version.truncate() 511 | self.output.send_data(new_version, collection) 512 | elif typ == MessageType.FRONTIER: 513 | frontier = msg 514 | new_frontier = frontier.truncate() 515 | assert self.input_frontier() <= new_frontier 516 | self.set_input_frontier(new_frontier) 517 | 518 | assert self.output_frontier <= self.input_frontier() 519 | if self.output_frontier < self.input_frontier(): 520 | self.output_frontier = self.input_frontier() 521 | self.output.send_frontier(self.output_frontier) 522 | 523 | super().__init__(input_a, output, inner, initial_frontier) 524 | 525 | 526 | if __name__ == "__main__": 527 | graph_builder = GraphBuilder(Version(0)) 528 | input_a, input_a_writer = graph_builder.new_input() 529 | output = 
input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 530 | input_a.negate().concat(output).debug("output") 531 | graph = graph_builder.finalize() 532 | 533 | for i in range(0, 10): 534 | input_a_writer.send_data(Version(i), Collection([(i, 1)])) 535 | input_a_writer.send_frontier(Version(i)) 536 | graph.step() 537 | graph_builder = GraphBuilder(Version(0)) 538 | input_a, input_a_writer = graph_builder.new_input() 539 | input_b, input_b_writer = graph_builder.new_input() 540 | 541 | output = input_a.join(input_b).count().debug("count") 542 | graph = graph_builder.finalize() 543 | 544 | for i in range(0, 10): 545 | input_a_writer.send_data(Version(i), Collection([((1, i), 2)])) 546 | input_a_writer.send_data(Version(i), Collection([((2, i), 2)])) 547 | input_b_writer.send_data(Version(i), Collection([((1, i + 2), 2)])) 548 | input_b_writer.send_data(Version(i), Collection([((2, i + 3), 2)])) 549 | input_a_writer.send_frontier(Version(i)) 550 | input_b_writer.send_frontier(Version(i)) 551 | graph.step() 552 | input_a_writer.send_frontier(Version(11)) 553 | input_b_writer.send_frontier(Version(11)) 554 | graph.step() 555 | 556 | graph_builder = GraphBuilder(Version(0)) 557 | input_a, input_a_writer = graph_builder.new_input() 558 | 559 | def geometric_series(collection): 560 | return ( 561 | collection.map(lambda data: data * 2) 562 | .concat(collection) 563 | .filter(lambda data: data <= 50) 564 | .map(lambda data: (data, ())) 565 | .distinct() 566 | .map(lambda data: data[0]) 567 | .consolidate() 568 | ) 569 | 570 | output = input_a.iterate(geometric_series).debug("iterate").connect_reader() 571 | graph = graph_builder.finalize() 572 | 573 | input_a_writer.send_data(Version(0), Collection([(1, 1)])) 574 | input_a_writer.send_frontier(Version(1)) 575 | 576 | while output.probe_frontier_less_than(Version(1)): 577 | graph.step() 578 | 579 | input_a_writer.send_data(Version(1), Collection([(16, 1), (3, 1)])) 580 | input_a_writer.send_frontier(Version(2)) 581 | 582 | while output.probe_frontier_less_than(Version(2)): 583 | graph.step() 584 | 585 | input_a_writer.send_data(Version(2), Collection([(3, -1)])) 586 | input_a_writer.send_frontier(Version(3)) 587 | 588 | while output.probe_frontier_less_than(Version(3)): 589 | graph.step() 590 | -------------------------------------------------------------------------------- /differential_dataflow.py: -------------------------------------------------------------------------------- 1 | """An implementation of differential dataflow. 2 | 3 | Compared to the Rust implementation, this implementation is both much less performant 4 | and more restrictive. Specifically, multiplicities in collections are constrained to 5 | be integers, and versions (timestamps in the Rust codebase) are constrained to be 6 | Version objects (integer tuples ordered by the product partial order). 7 | """ 8 | 9 | from collections import defaultdict 10 | 11 | from collection import Collection 12 | from graph import ( 13 | BinaryOperator, 14 | DifferenceStreamReader, 15 | DifferenceStreamWriter, 16 | Graph, 17 | MessageType, 18 | UnaryOperator, 19 | ) 20 | from index import Index 21 | from order import Version, Antichain 22 | 23 | 24 | class DifferenceStreamBuilder: 25 | """A representation of a dataflow edge as the dataflow graph is being built. 26 | 27 | This object is only used to set up the dataflow graph, and does not actually 28 | interact with any data. 
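
    A minimal usage sketch, assuming graph_builder is a GraphBuilder instance
    (the names below are illustrative only):

        input_stream, input_writer = graph_builder.new_input()
        doubled = input_stream.map(lambda data: data * 2)
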
Manually creating an instance of this object is highly 29 | unexpected - instead more normal usage would be to create an instance using 30 | the new_input method on GraphBuilder. 31 | """ 32 | 33 | def __init__(self, graph): 34 | self._writer = DifferenceStreamWriter() 35 | self.graph = graph 36 | 37 | def connect_reader(self): 38 | return self._writer._new_reader() 39 | 40 | def writer(self): 41 | return self._writer 42 | 43 | def map(self, f): 44 | output = DifferenceStreamBuilder(self.graph) 45 | operator = MapOperator( 46 | self.connect_reader(), output.writer(), f, self.graph.frontier() 47 | ) 48 | self.graph.add_operator(operator) 49 | self.graph.add_stream(output.connect_reader()) 50 | return output 51 | 52 | def filter(self, f): 53 | output = DifferenceStreamBuilder(self.graph) 54 | operator = FilterOperator( 55 | self.connect_reader(), output.writer(), f, self.graph.frontier() 56 | ) 57 | self.graph.add_operator(operator) 58 | self.graph.add_stream(output.connect_reader()) 59 | return output 60 | 61 | def negate(self): 62 | output = DifferenceStreamBuilder(self.graph) 63 | operator = NegateOperator( 64 | self.connect_reader(), output.writer(), self.graph.frontier() 65 | ) 66 | self.graph.add_operator(operator) 67 | self.graph.add_stream(output.connect_reader()) 68 | return output 69 | 70 | def concat(self, other): 71 | assert id(self.graph) == id(other.graph) 72 | output = DifferenceStreamBuilder(self.graph) 73 | operator = ConcatOperator( 74 | self.connect_reader(), 75 | other.connect_reader(), 76 | output.writer(), 77 | self.graph.frontier(), 78 | ) 79 | self.graph.add_operator(operator) 80 | self.graph.add_stream(output.connect_reader()) 81 | return output 82 | 83 | def debug(self, name=""): 84 | output = DifferenceStreamBuilder(self.graph) 85 | operator = DebugOperator( 86 | self.connect_reader(), output.writer(), name, self.graph.frontier() 87 | ) 88 | self.graph.add_operator(operator) 89 | self.graph.add_stream(output.connect_reader()) 90 | return output 91 | 92 | def join(self, other): 93 | assert id(self.graph) == id(other.graph) 94 | output = DifferenceStreamBuilder(self.graph) 95 | operator = JoinOperator( 96 | self.connect_reader(), 97 | other.connect_reader(), 98 | output.writer(), 99 | self.graph.frontier(), 100 | ) 101 | self.graph.add_operator(operator) 102 | self.graph.add_stream(output.connect_reader()) 103 | return output 104 | 105 | def count(self): 106 | output = DifferenceStreamBuilder(self.graph) 107 | operator = CountOperator( 108 | self.connect_reader(), output.writer(), self.graph.frontier() 109 | ) 110 | self.graph.add_operator(operator) 111 | self.graph.add_stream(output.connect_reader()) 112 | return output 113 | 114 | def consolidate(self): 115 | output = DifferenceStreamBuilder(self.graph) 116 | operator = ConsolidateOperator( 117 | self.connect_reader(), output.writer(), self.graph.frontier() 118 | ) 119 | self.graph.add_operator(operator) 120 | self.graph.add_stream(output.connect_reader()) 121 | return output 122 | 123 | def distinct(self): 124 | output = DifferenceStreamBuilder(self.graph) 125 | operator = DistinctOperator( 126 | self.connect_reader(), output.writer(), self.graph.frontier() 127 | ) 128 | self.graph.add_operator(operator) 129 | self.graph.add_stream(output.connect_reader()) 130 | return output 131 | 132 | def _start_scope(self): 133 | new_frontier = self.graph.frontier().extend() 134 | self.graph.push_frontier(new_frontier) 135 | 136 | def _end_scope(self): 137 | self.graph.pop_frontier() 138 | 139 | def _ingress(self): 140 | 
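        # Moves this stream into the new iteration scope: IngressOperator (defined
        # below) re-versions each record by extending its version with an innermost
        # iteration coordinate, presumably starting at 0, so a record at
        # Version([2]) would enter the scope at roughly Version([2, 0]).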
output = DifferenceStreamBuilder(self.graph) 141 | operator = IngressOperator( 142 | self.connect_reader(), output.writer(), self.graph.frontier() 143 | ) 144 | self.graph.add_operator(operator) 145 | self.graph.add_stream(output.connect_reader()) 146 | return output 147 | 148 | def _egress(self): 149 | output = DifferenceStreamBuilder(self.graph) 150 | operator = EgressOperator( 151 | self.connect_reader(), output.writer(), self.graph.frontier() 152 | ) 153 | self.graph.add_operator(operator) 154 | self.graph.add_stream(output.connect_reader()) 155 | return output 156 | 157 | def iterate(self, f): 158 | self._start_scope() 159 | feedback_stream = DifferenceStreamBuilder(self.graph) 160 | entered = self._ingress().concat(feedback_stream) 161 | result = f(entered) 162 | feedback_operator = FeedbackOperator( 163 | result.connect_reader(), 1, feedback_stream.writer(), self.graph.frontier() 164 | ) 165 | self.graph.add_stream(feedback_stream) 166 | self.graph.add_operator(feedback_operator) 167 | self._end_scope() 168 | return result._egress() 169 | 170 | 171 | class GraphBuilder: 172 | """A representation of a dataflow graph as it is being built.""" 173 | 174 | def __init__(self, initial_frontier): 175 | self.streams = [] 176 | self.operators = [] 177 | self.frontier_stack = [initial_frontier] 178 | 179 | def new_input(self): 180 | stream_builder = DifferenceStreamBuilder(self) 181 | self.streams.append(stream_builder.connect_reader()) 182 | return stream_builder, stream_builder.writer() 183 | 184 | def add_operator(self, operator): 185 | self.operators.append(operator) 186 | 187 | def add_stream(self, stream): 188 | self.streams.append(stream) 189 | 190 | def frontier(self): 191 | return self.frontier_stack[-1] 192 | 193 | def push_frontier(self, new_frontier): 194 | self.frontier_stack.append(new_frontier) 195 | 196 | def pop_frontier(self): 197 | self.frontier_stack.pop() 198 | 199 | def finalize(self): 200 | return Graph(self.streams, self.operators) 201 | 202 | 203 | class LinearUnaryOperator(UnaryOperator): 204 | def __init__(self, input_a, output, f, initial_frontier): 205 | def inner(): 206 | for (typ, msg) in self.input_messages(): 207 | if typ == MessageType.DATA: 208 | version, collection = msg 209 | self.output.send_data(version, f(collection)) 210 | elif typ == MessageType.FRONTIER: 211 | frontier = msg 212 | assert self.input_frontier().less_equal(frontier) 213 | self.set_input_frontier(frontier) 214 | 215 | assert self.output_frontier.less_equal(self.input_frontier()) 216 | if self.output_frontier.less_than(self.input_frontier()): 217 | self.output_frontier = self.input_frontier() 218 | self.output.send_frontier(self.output_frontier) 219 | 220 | super().__init__(input_a, output, inner, initial_frontier) 221 | 222 | 223 | class MapOperator(LinearUnaryOperator): 224 | def __init__(self, input_a, output, f, initial_frontier): 225 | def map_inner(collection): 226 | return collection.map(f) 227 | 228 | super().__init__(input_a, output, map_inner, initial_frontier) 229 | 230 | 231 | class FilterOperator(LinearUnaryOperator): 232 | def __init__(self, input_a, output, f, initial_frontier): 233 | def filter_inner(collection): 234 | return collection.filter(f) 235 | 236 | super().__init__(input_a, output, filter_inner, initial_frontier) 237 | 238 | 239 | class NegateOperator(LinearUnaryOperator): 240 | def __init__(self, input_a, output, initial_frontier): 241 | def negate_inner(collection): 242 | return collection.negate() 243 | 244 | super().__init__(input_a, output, negate_inner, 
initial_frontier) 245 | 246 | 247 | class ConcatOperator(BinaryOperator): 248 | def __init__(self, input_a, input_b, output, initial_frontier): 249 | def inner(): 250 | for (typ, msg) in self.input_a_messages(): 251 | if typ == MessageType.DATA: 252 | version, collection = msg 253 | self.output.send_data(version, collection) 254 | elif typ == MessageType.FRONTIER: 255 | frontier = msg 256 | assert self.input_a_frontier().less_equal(frontier) 257 | self.set_input_a_frontier(frontier) 258 | for (typ, msg) in self.input_b_messages(): 259 | if typ == MessageType.DATA: 260 | version, collection = msg 261 | self.output.send_data(version, collection) 262 | elif typ == MessageType.FRONTIER: 263 | frontier = msg 264 | assert self.input_b_frontier().less_equal(frontier) 265 | self.set_input_b_frontier(frontier) 266 | 267 | input_frontier = self.input_a_frontier().meet(self.input_b_frontier()) 268 | assert self.output_frontier.less_equal(input_frontier) 269 | if self.output_frontier.less_than(input_frontier): 270 | self.output_frontier = input_frontier 271 | self.output.send_frontier(self.output_frontier) 272 | 273 | super().__init__(input_a, input_b, output, inner, initial_frontier) 274 | 275 | 276 | class ConsolidateOperator(UnaryOperator): 277 | def __init__(self, input_a, output, initial_frontier): 278 | self.collections = defaultdict(Collection) 279 | 280 | def inner(): 281 | for (typ, msg) in self.input_messages(): 282 | if typ == MessageType.DATA: 283 | version, collection = msg 284 | self.collections[version]._extend(collection) 285 | elif typ == MessageType.FRONTIER: 286 | frontier = msg 287 | assert self.input_frontier().less_equal(frontier) 288 | self.set_input_frontier(frontier) 289 | finished_versions = [ 290 | version 291 | for version in self.collections.keys() 292 | if self.input_frontier().less_equal_version(version) is not True 293 | ] 294 | for version in finished_versions: 295 | collection = self.collections.pop(version).consolidate() 296 | self.output.send_data(version, collection) 297 | assert self.output_frontier.less_equal(self.input_frontier()) 298 | if self.output_frontier.less_than(self.input_frontier()): 299 | self.output_frontier = self.input_frontier() 300 | self.output.send_frontier(self.output_frontier) 301 | 302 | super().__init__(input_a, output, inner, initial_frontier) 303 | 304 | 305 | class DebugOperator(UnaryOperator): 306 | def __init__(self, input_a, output, name, initial_frontier): 307 | def inner(): 308 | for (typ, msg) in self.input_messages(): 309 | if typ == MessageType.DATA: 310 | version, collection = msg 311 | print( 312 | f"debug {name} data: version: {version} collection: {collection}" 313 | ) 314 | self.output.send_data(version, collection) 315 | elif typ == MessageType.FRONTIER: 316 | frontier = msg 317 | assert self.input_frontier().less_equal(frontier) 318 | self.set_input_frontier(frontier) 319 | print(f"debug {name} notification: frontier {frontier}") 320 | assert self.output_frontier.less_equal(self.input_frontier()) 321 | if self.output_frontier.less_than(self.input_frontier()): 322 | self.output_frontier = self.input_frontier() 323 | self.output.send_frontier(self.output_frontier) 324 | 325 | super().__init__(input_a, output, inner, initial_frontier) 326 | 327 | 328 | class JoinOperator(BinaryOperator): 329 | def __init__(self, input_a, input_b, output, initial_frontier): 330 | self.index_a = Index() 331 | self.index_b = Index() 332 | 333 | def inner(): 334 | delta_a = Index() 335 | delta_b = Index() 336 | for (typ, msg) in 
self.input_a_messages(): 337 | if typ == MessageType.DATA: 338 | version, collection = msg 339 | for ((key, value), multiplicity) in collection._inner: 340 | delta_a.add_value(key, version, (value, multiplicity)) 341 | elif typ == MessageType.FRONTIER: 342 | frontier = msg 343 | assert self.input_a_frontier().less_equal(frontier) 344 | self.set_input_a_frontier(frontier) 345 | for (typ, msg) in self.input_b_messages(): 346 | if typ == MessageType.DATA: 347 | version, collection = msg 348 | for ((key, value), multiplicity) in collection._inner: 349 | delta_b.add_value(key, version, (value, multiplicity)) 350 | elif typ == MessageType.FRONTIER: 351 | frontier = msg 352 | assert self.input_b_frontier().less_equal(frontier) 353 | self.set_input_b_frontier(frontier) 354 | 355 | results = defaultdict(Collection) 356 | for (version, collection) in delta_a.join(self.index_b): 357 | results[version]._extend(collection) 358 | 359 | self.index_a.append(delta_a) 360 | 361 | for (version, collection) in self.index_a.join(delta_b): 362 | results[version]._extend(collection) 363 | 364 | for (version, collection) in results.items(): 365 | self.output.send_data(version, collection) 366 | self.index_b.append(delta_b) 367 | 368 | input_frontier = self.input_a_frontier().meet(self.input_b_frontier()) 369 | assert self.output_frontier.less_equal(input_frontier) 370 | if self.output_frontier.less_than(input_frontier): 371 | self.output_frontier = input_frontier 372 | self.output.send_frontier(self.output_frontier) 373 | self.index_a.compact(self.output_frontier) 374 | self.index_b.compact(self.output_frontier) 375 | 376 | super().__init__(input_a, input_b, output, inner, initial_frontier) 377 | 378 | 379 | class ReduceOperator(UnaryOperator): 380 | def __init__(self, input_a, output, f, initial_frontier): 381 | self.index = Index() 382 | self.index_out = Index() 383 | self.keys_todo = defaultdict(set) 384 | 385 | def subtract_values(first, second): 386 | result = defaultdict(int) 387 | for (v1, m1) in first: 388 | result[v1] += m1 389 | for (v2, m2) in second: 390 | result[v2] -= m2 391 | 392 | return [ 393 | (val, multiplicity) 394 | for (val, multiplicity) in result.items() 395 | if multiplicity != 0 396 | ] 397 | 398 | def inner(): 399 | for (typ, msg) in self.input_messages(): 400 | if typ == MessageType.DATA: 401 | version, collection = msg 402 | for ((key, value), multiplicity) in collection._inner: 403 | self.index.add_value(key, version, (value, multiplicity)) 404 | self.keys_todo[version].add(key) 405 | for v2 in self.index.versions(key): 406 | self.keys_todo[version.join(v2)].add(key) 407 | elif typ == MessageType.FRONTIER: 408 | frontier = msg 409 | assert self.input_frontier().less_equal(frontier) 410 | self.set_input_frontier(frontier) 411 | 412 | finished_versions = [ 413 | version 414 | for version in self.keys_todo.keys() 415 | if self.input_frontier().less_equal_version(version) is not True 416 | ] 417 | 418 | finished_versions.sort() 419 | for version in finished_versions: 420 | keys = self.keys_todo.pop(version) 421 | result = [] 422 | for key in keys: 423 | curr = self.index.reconstruct_at(key, version) 424 | curr_out = self.index_out.reconstruct_at(key, version) 425 | out = f(curr) 426 | delta = subtract_values(out, curr_out) 427 | for (value, multiplicity) in delta: 428 | result.append(((key, value), multiplicity)) 429 | self.index_out.add_value(key, version, (value, multiplicity)) 430 | if result != []: 431 | self.output.send_data(version, Collection(result)) 432 | 433 | assert 
self.output_frontier.less_equal(self.input_frontier()) 434 | if self.output_frontier.less_than(self.input_frontier()): 435 | self.output_frontier = self.input_frontier() 436 | self.output.send_frontier(self.output_frontier) 437 | self.index.compact(self.output_frontier) 438 | self.index_out.compact(self.output_frontier) 439 | 440 | super().__init__(input_a, output, inner, initial_frontier) 441 | 442 | 443 | class CountOperator(ReduceOperator): 444 | def __init__(self, input_a, output, initial_frontier): 445 | def count_inner(vals): 446 | out = 0 447 | for (_, diff) in vals: 448 | out += diff 449 | return [(out, 1)] 450 | 451 | super().__init__(input_a, output, count_inner, initial_frontier) 452 | 453 | 454 | class DistinctOperator(ReduceOperator): 455 | def __init__(self, input_a, output, initial_frontier): 456 | def distinct_inner(vals): 457 | consolidated = defaultdict(int) 458 | for (val, diff) in vals: 459 | consolidated[val] += diff 460 | for (val, diff) in consolidated.items(): 461 | assert diff >= 0 462 | return [(val, 1) for (val, diff) in consolidated.items() if diff > 0] 463 | 464 | super().__init__(input_a, output, distinct_inner, initial_frontier) 465 | 466 | 467 | class FeedbackOperator(UnaryOperator): 468 | def __init__(self, input_a, step, output, initial_frontier): 469 | # Map from top-level version -> set of messages where we have 470 | # sent some data at that version 471 | self.in_flight_data = defaultdict(set) 472 | # Versions where a given top-level version has updated 473 | # its iteration without sending any data. 474 | self.empty_versions = defaultdict(set) 475 | 476 | def inner(): 477 | for (typ, msg) in self.input_messages(): 478 | if typ == MessageType.DATA: 479 | version, collection = msg 480 | new_version = version.apply_step(step) 481 | truncated = new_version.truncate() 482 | self.output.send_data(new_version, collection) 483 | 484 | # Record that we sent data at this version. 485 | self.in_flight_data[truncated].add(new_version) 486 | # Make sure we track that we are iterating at this top-level 487 | # version if we haven't already 488 | if truncated not in self.empty_versions: 489 | self.empty_versions[truncated] = set() 490 | elif typ == MessageType.FRONTIER: 491 | frontier = msg 492 | assert self.input_frontier().less_equal(frontier) 493 | self.set_input_frontier(frontier) 494 | 495 | # Increment the current input frontier 496 | incremented_input_frontier = self.input_frontier().apply_step(step) 497 | # Grab all of the elements from the potential output frontier. 498 | elements = incremented_input_frontier._elements() 499 | # Partition every element from this potential output frontier into one of 500 | # two sets, either elements to keep, or elements to reject. 501 | candidate_output_frontier = [] 502 | rejected = [] 503 | for elem in elements: 504 | truncated = elem.truncate() 505 | 506 | # Always keep a frontier element if there is are differences associated 507 | # with its top-level version that are still in flight. 508 | if len(self.in_flight_data[truncated]) != 0: 509 | candidate_output_frontier.append(elem) 510 | 511 | # We can stop remembering any versions that will be closed 512 | # by this frontier element. 513 | closed = { 514 | x for x in self.in_flight_data[truncated] if x.less_than(elem) 515 | } 516 | self.in_flight_data[truncated] -= closed 517 | else: 518 | # This frontier element does not have any differences associated with its 519 | # top-level version that were not closed out by prior frontier updates. 
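                    # In other words, iteration at this top-level version appears to
                    # have stopped producing new differences. The counting below
                    # gives it a few extra rounds before the version is dropped,
                    # presumably so that any update still circulating around the
                    # loop has a chance to arrive.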
520 | 521 | # Remember that we observed an "empty" update for this top-level version. 522 | self.empty_versions[truncated].add(elem) 523 | 524 | # Don't do anything if we haven't observed at least three "empty" frontier 525 | # updates for this top-level time. 526 | if len(self.empty_versions[truncated]) <= 3: 527 | candidate_output_frontier.append(elem) 528 | else: 529 | self.in_flight_data.pop(truncated) 530 | self.empty_versions.pop(truncated) 531 | rejected.append(elem) 532 | 533 | # Ensure that we can still send data at all other top-level 534 | # versions that were not rejected. 535 | for r in rejected: 536 | for truncated in self.in_flight_data.keys(): 537 | candidate_output_frontier.append(r.join(truncated.extend())) 538 | 539 | # Construct a minimal antichain from the set of candidate elements. 540 | candidate_output_frontier = Antichain(candidate_output_frontier) 541 | 542 | assert self.output_frontier.less_equal(candidate_output_frontier) 543 | if self.output_frontier.less_than(candidate_output_frontier): 544 | self.output_frontier = candidate_output_frontier 545 | self.output.send_frontier(self.output_frontier) 546 | 547 | super().__init__(input_a, output, inner, initial_frontier) 548 | 549 | def connect_loop(output): 550 | self.output = output 551 | 552 | 553 | class IngressOperator(UnaryOperator): 554 | def __init__(self, input_a, output, initial_frontier): 555 | def inner(): 556 | for (typ, msg) in self.input_messages(): 557 | if typ == MessageType.DATA: 558 | version, collection = msg 559 | new_version = version.extend() 560 | self.output.send_data(new_version, collection) 561 | self.output.send_data( 562 | new_version.apply_step(1), collection.negate() 563 | ) 564 | elif typ == MessageType.FRONTIER: 565 | frontier = msg 566 | new_frontier = frontier.extend() 567 | assert self.input_frontier().less_equal(new_frontier) 568 | self.set_input_frontier(new_frontier) 569 | 570 | assert self.output_frontier.less_equal(self.input_frontier()) 571 | if self.output_frontier.less_than(self.input_frontier()): 572 | self.output_frontier = self.input_frontier() 573 | self.output.send_frontier(self.output_frontier) 574 | 575 | super().__init__(input_a, output, inner, initial_frontier) 576 | 577 | 578 | class EgressOperator(UnaryOperator): 579 | def __init__(self, input_a, output, initial_frontier): 580 | def inner(): 581 | for (typ, msg) in self.input_messages(): 582 | if typ == MessageType.DATA: 583 | version, collection = msg 584 | new_version = version.truncate() 585 | self.output.send_data(new_version, collection) 586 | elif typ == MessageType.FRONTIER: 587 | frontier = msg 588 | new_frontier = frontier.truncate() 589 | assert self.input_frontier().less_equal(new_frontier) 590 | self.set_input_frontier(new_frontier) 591 | 592 | assert self.output_frontier.less_equal(self.input_frontier()) 593 | if self.output_frontier.less_than(self.input_frontier()): 594 | self.output_frontier = self.input_frontier() 595 | self.output.send_frontier(self.output_frontier) 596 | 597 | super().__init__(input_a, output, inner, initial_frontier) 598 | 599 | 600 | if __name__ == "__main__": 601 | graph_builder = GraphBuilder(Antichain([Version([0, 0])])) 602 | input_a, input_a_writer = graph_builder.new_input() 603 | output = input_a.map(lambda data: data + 5).filter(lambda data: data % 2 == 0) 604 | input_a.negate().concat(output).debug("output") 605 | graph = graph_builder.finalize() 606 | 607 | for i in range(0, 10): 608 | input_a_writer.send_data(Version([0, i]), Collection([(i, 1)])) 609 | 
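        # The frontier sent below is a two-element antichain; for i > 0,
        # Version([i, 0]) and Version([0, i]) are incomparable under the product
        # partial order, and a version only counts as finished once no frontier
        # element is less than or equal to it.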
        input_a_writer.send_frontier(Antichain([Version([i, 0]), Version([0, i])]))
610 |         graph.step()
611 | 
612 |     graph_builder = GraphBuilder(Antichain([Version([0, 0])]))
613 |     input_a, input_a_writer = graph_builder.new_input()
614 |     input_b, input_b_writer = graph_builder.new_input()
615 | 
616 |     input_a.join(input_b).count().debug("count")
617 |     graph = graph_builder.finalize()
618 | 
619 |     for i in range(0, 2):
620 |         input_a_writer.send_data(Version([0, i]), Collection([((1, i), 2)]))
621 |         input_a_writer.send_data(Version([0, i]), Collection([((2, i), 2)]))
622 | 
623 |         a_frontier = Antichain([Version([i + 2, 0]), Version([0, i])])
624 |         input_a_writer.send_frontier(a_frontier)
625 |         input_b_writer.send_data(Version([i, 0]), Collection([((1, i + 2), 2)]))
626 |         input_b_writer.send_data(Version([i, 0]), Collection([((2, i + 3), 2)]))
627 |         input_b_writer.send_frontier(Antichain([Version([i, 0]), Version([0, i * 2])]))
628 |         graph.step()
629 | 
630 |     input_a_writer.send_frontier(Antichain([Version([11, 11])]))
631 |     input_b_writer.send_frontier(Antichain([Version([11, 11])]))
632 |     graph.step()
633 | 
634 |     graph_builder = GraphBuilder(Antichain([Version(0)]))
635 |     input_a, input_a_writer = graph_builder.new_input()
636 | 
637 |     def geometric_series(collection):
638 |         return (
639 |             collection.map(lambda data: data * 2)
640 |             .concat(collection)
641 |             .filter(lambda data: data <= 50)
642 |             .map(lambda data: (data, ()))
643 |             .distinct()
644 |             .map(lambda data: data[0])
645 |             .consolidate()
646 |         )
647 | 
648 |     output = input_a.iterate(geometric_series).debug("iterate").connect_reader()
649 |     graph = graph_builder.finalize()
650 | 
651 |     input_a_writer.send_data(Version(0), Collection([(1, 1)]))
652 |     input_a_writer.send_frontier(Antichain([Version(1)]))
653 | 
654 |     while output.probe_frontier_less_than(Antichain([Version(1)])):
655 |         graph.step()
656 | 
657 |     input_a_writer.send_data(Version(1), Collection([(16, 1), (3, 1)]))
658 |     input_a_writer.send_frontier(Antichain([Version(2)]))
659 | 
660 |     while output.probe_frontier_less_than(Antichain([Version(2)])):
661 |         graph.step()
662 | 
663 |     input_a_writer.send_data(Version(2), Collection([(3, -1)]))
664 |     input_a_writer.send_frontier(Antichain([Version(3)]))
665 | 
666 |     while output.probe_frontier_less_than(Antichain([Version(3)])):
667 |         graph.step()
668 | 
--------------------------------------------------------------------------------