├── requirements.txt
├── bulldog
    ├── __init__.py
    └── model.py
├── .gitignore
├── setup.py
├── LICENSE
├── example
    └── example.py
└── readme.MD


/requirements.txt:
--------------------------------------------------------------------------------
1 | pathos


--------------------------------------------------------------------------------
/bulldog/__init__.py:
--------------------------------------------------------------------------------
# Human-readable package name. The previous value ("example_pkg") was the
# placeholder from the packaging tutorial template, not this package's name.
name = "bulldog"
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | .idea
3 | *.pyo
4 | *.pyc
5 | *.pyi
6 | build/
7 | bulldog.egg-info/
8 | dist/
9 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | from bulldog.model import version
 3 | 
 4 | with open("README.md", "r") as fh:
 5 |     long_description = fh.read()
 6 | 
 7 | setuptools.setup(
 8 |     name="bulldog",
 9 |     version=version,
10 |     author="Luca Giacomel",
11 |     author_email="luca.giacomel@gmail.com",
12 |     description="State management for Data Science & Analytics",
13 |     long_description=long_description,
14 |     long_description_content_type="text/markdown",
15 |     url="https://github.com/luke14free/bulldog",
16 |     packages=setuptools.find_packages(),
17 |     classifiers=[
18 |         "Programming Language :: Python :: 3",
19 |         "License :: OSI Approved :: MIT License",
20 |         "Operating System :: OS Independent",
21 |     ],
22 |     install_requires=['pathos'],
23 |     python_requires='>=3.6',
24 | )
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2019 The Python Packaging Authority
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 
21 | 


--------------------------------------------------------------------------------
/example/example.py:
--------------------------------------------------------------------------------
import pickle
import time

import pandas as pd

from bulldog.model import Model, Version
 5 | 
 6 | 
 7 | def on_checkpoint_save(data, key, history):
 8 |     file_name = 'data_{}.pkl'.format(key.step)
 9 |     pickle.dump(data, open('data_{}.pkl'.format(key.step), 'wb'))
10 |     return file_name  # only the file name will be saved in memory
11 | 
12 | 
def on_checkpoint_restore(key, history):
    """Load and return the pickled data recorded in ``history`` for ``key``.

    :param key: version key of the checkpoint to restore.
    :param history: mapping from version keys to the file names returned by
        the save hook.
    :return: the unpickled object, which the model stores as its data.
    """
    file_name = history[key]
    # NOTE: unpickling can execute arbitrary code; only restore files that
    # were written by the matching save hook.
    with open(file_name, 'rb') as fh:
        return pickle.load(fh)  # store this in model.data
16 | 
17 | 
# Example model holding a single 100x100 DataFrame of ones; checkpoints are
# persisted and restored through the two pickle hooks defined above.
# The frame is built from a scalar instead of `pd.np.ones(...)`: the `pd.np`
# alias was deprecated in pandas 1.0 and is no longer available in pandas 2.
model = Model(
    data={
        'df': pd.DataFrame(1.0, index=range(100), columns=range(100)),
    },
    on_checkpoint_save=on_checkpoint_save,
    on_checkpoint_restore=on_checkpoint_restore
)
25 | 
26 | 
@model.data_modifier
def data_step(data, factor):
    """Scale the ``'df'`` entry of ``data`` in place by ``factor``.

    The returned dictionary is what the model keeps as its new data.
    """
    frame = data['df']
    frame *= factor
    return data  # returning the data is what makes the change stick
32 | 
33 | 
@model.business_logic
@model.checkpoint
def action1(data, commit):
    """Checkpointed business logic that commits ``data_step`` with factor 9.

    ``data`` is a copy handed out by the model, so the in-place division
    below never reaches the model; only the committed data modifier does.
    """
    # Local change only: we are working on a copy of the model data.
    data['df'] /= 8000
    commit("data_step", 9)
    # The return value is handed back to the caller of dispatch(); it does
    # not alter the model data.
    return data
40 | 
41 | 
@model.analysis
@model.parallelizable
def analysis(data, history):
    """Print the mean of ``data['df']`` and the name of the latest history step.

    Registered as a parallelizable analysis.
    """
    df = data['df']
    time.sleep(3)  # stand-in for an expensive computation
    # Guard against an empty history instead of failing on `[-1]`.
    last_step = list(history.keys())[-1].name if history else None
    # `pd.np` is gone from recent pandas releases; use the ndarray method.
    print('fast 1', last_step, df.values.mean())
48 | 
49 | 
@model.analysis
@model.parallelizable
def analysis2(data, history):
    """Print the mean of ``data['df']`` and the name of the latest history step.

    Second parallelizable analysis, identical to ``analysis`` except for the
    printed label.
    """
    df = data['df']
    time.sleep(3)  # stand-in for an expensive computation
    # Guard against an empty history instead of failing on `[-1]`.
    last_step = list(history.keys())[-1].name if history else None
    # `pd.np` is gone from recent pandas releases; use the ndarray method.
    print('fast 2', last_step, df.values.mean())
56 | 
57 | 
@model.analysis
def analysis3(data, history):
    """Print the mean of ``data['df']`` and the name of the latest history step.

    Not marked as parallelizable, so the model calls it sequentially.
    """
    df = data['df']
    time.sleep(3)  # stand-in for an expensive computation
    # Guard against an empty history instead of failing on `[-1]`.
    last_step = list(history.keys())[-1].name if history else None
    # `pd.np` is gone from recent pandas releases; use the ndarray method.
    print('slow', last_step, df.values.mean())
63 | 
64 | 
def main():
    """Run the demo: dispatch, commit, then revert to the checkpoint."""
    model.dispatch('action1')
    model.commit('data_step', factor=9)
    print(model.history.keys())

    # Go back to the checkpoint taken by `action1`. At this point the history
    # holds three entries, so `model.rollback(1)` selects the same key.
    checkpoint_key = Version(name='action1', step=1)
    model.revert_version(checkpoint_key)
    print(model.history.keys())
72 | 
73 | 
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
76 | 


--------------------------------------------------------------------------------
/bulldog/model.py:
--------------------------------------------------------------------------------
  1 | from functools import wraps
  2 | from typing import Callable, List, Any, Dict, Union
  3 | from pathos.multiprocessing import ProcessingPool as Pool
  4 | 
  5 | from copy import deepcopy
  6 | from collections import OrderedDict, namedtuple
  7 | from multiprocessing import cpu_count
  8 | 
  9 | version = "1.0.3"
 10 | 
 11 | FROZEN_ERROR = "model data is readonly. It can only be modified with a data_modifier"
 12 | DATA_MODIFIER_NOT_CALLABLE_ERROR = "data_modifier(s) should be committed, not called directly"
 13 | BUSINESS_LOGIC_NOT_CALLABLE_ERROR = "business_logic(s) should be dispatched, not called directly"
 14 | DATA_MODIFIER_NOT_FOUND = "{} is not a registered data_modifier"
 15 | BUSINESS_LOGIC_NOT_FOUND = "{} is not a registered business_logic"
 16 | BUSINESS_LOGIC_ALREADY_EXECUTED = "{} was already executed, raising and error since `unique_bl_steps` was set to true"
 17 | 
 18 | 
 19 | class BusinessLogicNotCallableException(Exception):
 20 |     pass
 21 | 
 22 | 
 23 | class DataModifierNotCallableException(Exception):
 24 |     pass
 25 | 
 26 | 
 27 | class DataNotDirectlyModifiableException(Exception):
 28 |     pass
 29 | 
 30 | 
 31 | class DataModifierNotFound(Exception):
 32 |     pass
 33 | 
 34 | 
 35 | class BusinessLogicNotFound(Exception):
 36 |     pass
 37 | 
 38 | 
 39 | class NoCheckpointAvailableForKey(Exception):
 40 |     pass
 41 | 
 42 | 
 43 | class BusinessLogicAlreadyExecutedException(Exception):
 44 |     pass
 45 | 
 46 | 
 47 | Version = namedtuple('Version', 'step name')
 48 | 
 49 | 
 50 | class Model:
 51 |     def __init__(self, data: Dict, max_pool_size=0, unique_bl_steps=True,
 52 |                  on_checkpoint_save: Union[None, Callable] = None,
 53 |                  on_checkpoint_restore: Union[None, Callable] = None) -> None:
 54 |         self._data = data
 55 |         self.checkpoints = []
 56 |         self.data_modifiers = {}
 57 |         self.business_logics = {}
 58 |         self.analyses = {}
 59 |         self.parallelizables = []
 60 |         self._history = OrderedDict({})
 61 |         self.unique_bl_steps = unique_bl_steps
 62 |         self.pool = Pool(max_pool_size or cpu_count())
 63 |         self.on_checkpoint_save = on_checkpoint_save
 64 |         self.on_checkpoint_restore = on_checkpoint_restore
 65 | 
 66 |     @property
 67 |     def data(self) -> Dict:
 68 |         return deepcopy(self._data)
 69 | 
 70 |     @data.setter
 71 |     def data(self, value: Dict) -> None:
 72 |         raise DataNotDirectlyModifiableException(FROZEN_ERROR)
 73 | 
 74 |     @data.deleter
 75 |     def data(self) -> None:
 76 |         raise DataNotDirectlyModifiableException(FROZEN_ERROR)
 77 | 
 78 |     @property
 79 |     def history(self) -> Dict:
 80 |         return deepcopy(self._history)
 81 | 
 82 |     @history.setter
 83 |     def history(self, value: Dict) -> None:
 84 |         raise DataNotDirectlyModifiableException(FROZEN_ERROR)
 85 | 
 86 |     @history.deleter
 87 |     def history(self) -> None:
 88 |         raise DataNotDirectlyModifiableException(FROZEN_ERROR)
 89 | 
 90 |     def data_modifier(self, func: Callable) -> Callable:
 91 |         self.data_modifiers[func.__name__] = func
 92 | 
 93 |         @wraps
 94 |         def wrapper():
 95 |             raise DataModifierNotCallableException(DATA_MODIFIER_NOT_CALLABLE_ERROR)
 96 | 
 97 |         return wrapper
 98 | 
 99 |     def business_logic(self, func: Callable) -> Callable:
100 |         self.business_logics[func.__name__] = func
101 | 
102 |         @wraps
103 |         def wrapper():
104 |             raise BusinessLogicNotCallableException(BUSINESS_LOGIC_NOT_CALLABLE_ERROR)
105 | 
106 |         return wrapper
107 | 
108 |     def run_analyses(self):
109 |         if not self.parallelizables:
110 |             for func in self.analyses.values():
111 |                 func(self.data, self._history)
112 |         else:
113 |             analyses = [i for i in self.analyses.keys() if i in self.parallelizables]
114 | 
115 |             def run_analysis_by_key(key: str) -> Any:
116 |                 if key in self.parallelizables:
117 |                     return self.analyses[key](self.data, self._history)
118 |                 else:
119 |                     return None
120 | 
121 |             self.pool.map(run_analysis_by_key, analyses)
122 |             for func_name, func in self.analyses.items():
123 |                 if func_name not in self.parallelizables:
124 |                     func(self.data, self._history)
125 | 
126 |     def commit(self, data_modifier_name: str, *commit_args: List, **commit_kwargs: Dict) -> Dict:
127 |         version_key = Version(step=len(self._history), name=data_modifier_name)
128 |         self._history[version_key] = None
129 |         try:
130 |             self._data = self.data_modifiers[data_modifier_name](self.data, *commit_args, **commit_kwargs)
131 |         except KeyError:
132 |             raise DataModifierNotFound(DATA_MODIFIER_NOT_FOUND.format(data_modifier_name))
133 |         if data_modifier_name in self.checkpoints:
134 |             self.run_analyses()
135 |             self._history[version_key] = self.data if not self.on_checkpoint_save else self.on_checkpoint_save(
136 |                 self.data, self.history)
137 |         return self._data
138 | 
139 |     def dispatch(self, business_logic_name: str, *args: List, **kwargs: Dict) -> Any:
140 |         commit = self.commit
141 |         if self.unique_bl_steps:
142 |             for key in self.history.keys():
143 |                 if key.name == business_logic_name:
144 |                     raise BusinessLogicAlreadyExecutedException(BUSINESS_LOGIC_ALREADY_EXECUTED)
145 |         try:
146 |             output = self.business_logics[business_logic_name](self.data, *args, **kwargs, commit=commit)
147 |         except KeyError:
148 |             raise BusinessLogicNotFound(BUSINESS_LOGIC_NOT_FOUND.format(business_logic_name))
149 |         version_key = Version(step=len(self._history), name=business_logic_name)
150 |         if business_logic_name in self.checkpoints:
151 |             self.run_analyses()
152 |             self._history[version_key] = self.data if not self.on_checkpoint_save else self.on_checkpoint_save(
153 |                 self.data, version_key, self.history)
154 |         else:
155 |             self._history[version_key] = None
156 |         return output
157 | 
158 |     def checkpoint(self, func: Callable) -> Callable:
159 |         self.checkpoints.append(func.__name__)
160 |         return func
161 | 
162 |     def parallelizable(self, func: Callable) -> Callable:
163 |         self.parallelizables.append(func.__name__)
164 |         return func
165 | 
166 |     def analysis(self, func: Callable) -> Callable:
167 |         self.analyses[func.__name__] = func
168 |         return func
169 | 
170 |     def revert_version(self, version_key: Version) -> None:
171 |         prev_value = self._history[version_key] if not self.on_checkpoint_restore else self.on_checkpoint_restore(
172 |             version_key, self._history)
173 |         if not prev_value:
174 |             raise NoCheckpointAvailableForKey(version_key)
175 |         self._data = prev_value
176 |         for key in self.history.keys():
177 |             if key.step > version_key.step:
178 |                 del self._history[key]
179 | 
180 |     def rollback(self, number_of_steps: int = 1) -> None:
181 |         version_key = list(self._history.keys())[-(number_of_steps + 1)]
182 |         self.revert_version(version_key)
183 | 


--------------------------------------------------------------------------------
/readme.MD:
--------------------------------------------------------------------------------
  1 | # Bulldog
  2 | 
  3 | The guardian dog that prevents you from writing poor code when doing data analysis in Python.
  4 | 
  5 | ## Installation
  6 | 
  7 | Simply run:
  8 | 
  9 | `pip install bulldog`
 10 | 
 11 | ## Philosophy
 12 | 
 13 | Bulldog is a library for writing better code in your analysis that largely borrows from the state management libraries for application development ([Redux](https://github.com/reduxjs/redux), [Flux](https://github.com/facebook/flux), [Vuex](https://github.com/vuejs/vuex), [Katana](https://github.com/BendingSpoons/katana-swift)..).
 14 | 
 15 | Bulldog models are composed of five main building blocks:
 16 | 1) `data`, which is our model initial data
 17 | 2) `data_modifiers`, which are special functions whose main task is to modify the model's data
 18 | 3) `business_logic`, which are functions whose main task is to execute the business logic and invoke `data_modifiers`
 19 | 4) `analyses`, which subscribe to changes on the model's `data`
 20 | 5) `history`, which is a backlog of all the operations that have occurred and the corresponding state of the model `data`
 21 | 
 22 | The philosophy behind bulldog is to **separate** data transformations, business logic and analyses, in order to make
 23 | clarity, testing and debugging easier to achieve.
 24 | 
 25 | ## Working with bulldog
 26 | 
 27 | To create a bulldog model simply run:
 28 | ```python
 29 | from bulldog.model import Model
 30 | import pandas as pd
 31 | 
 32 | model = Model(data={
 33 |     'df': pd.DataFrame(pd.np.ones((100, 100))),
 34 |     'other_data': [1, 2, 3]
 35 | })
 36 | ```
 37 | 
 38 | All the data is stored in our `model.data` and it's not directly modifiable. In fact, whenever we access `model.data` we are actually accessing a copy of the original model data.
 39 | 
 40 | In order to alter/modify our data we need to create some special functions called `data_modifiers`.
 41 | 
 42 | ```python
 43 | @model.data_modifier
 44 | def data_step(data, factor):
 45 |     df = data['df']
 46 |     df *= factor
 47 |     return data  # this will modify the data
 48 | ```
 49 | 
 50 | As we can see data modifiers are just simple, pure functions that take our model data as input and perform some kind of alteration on it 
 51 | and return the altered data. The signature of a data modifier is `function(data, *args, **kwargs)` and it needs to be
 52 | decorated with the `@model.data_modifier` decorator (where `model` is your instance of Bulldog's `Model`).
 53 | 
 54 | If we want to execute a `data_modifier`, rather than calling it directly we need to ask the model to commit it:
 55 | 
 56 | ```python
 57 | model.commit('data_step', factor=9)  # 'data_step' is the name of our `data_modifier`
 58 | ```
 59 | 
 60 | Note that any other way of calling the function will result in an error. E.g.:
 61 | 
 62 | ```python
 63 | data_step(data=model.data, factor=9)  # wrong; this will throw an error
 64 | ```
 65 | 
 66 | Great! but what if we need to run some business logic to conditionally modify our dataset?
 67 | Maybe we need to download some data and based on that perform some actions that will eventually 
 68 | lead us to modify our data. In this case we should use a `business_logic` function.
 69 | 
 70 | ```python
 71 | @model.business_logic
 72 | @model.checkpoint
 73 | def action1(data, commit):
 74 |     data['df'] /= 8000  # this has no effect whatsoever on our data, remember? We are modifying a copy
 75 |     if max(data['df']) < 0.38:
 76 |         commit("data_step", 9)  # but this will actually modify our data
 77 | ```
 78 | 
 79 | As we can see, `business_logic` functions have the signature `function(data, commit, *args, **kwargs)`: they take the data as input
 80 | and have the possibility of committing `data_modifier` functions to our original model.
 81 | 
 82 | You might have noted the additional `@model.checkpoint` decorator (which can also be applied to `data_modifiers`). It will basically tell our model to store the current state data after computing
 83 | this function (and store it in `model.history`), allowing us to restore it or inspect it at a later stage, which is very convenient for debugging.
 84 | 
 85 | Similarly to `data_modifiers`, `business_logic` functions cannot be executed directly and have to be dispatched through the model in this way:
 86 | 
 87 | ```python
 88 | model.dispatch('action1')
 89 | ```
 90 | 
 91 | Now, you might wonder how to run analyses on the model's data. That's fairly simple!
 92 | 
 93 | ```python
 94 | @model.analysis
 95 | @model.parallelizable
 96 | def analysis(data, history):
 97 |     df = data['df']
 98 |     time.sleep(3)
 99 |     print('fast 1', list(history.keys())[-1].name, pd.np.mean(df.values))
100 | ```
101 | 
102 | Analyses are functions with signature `function(data, history)` that are run automatically every time a checkpoint step of our model is executed.
103 | Optionally analyses can be run in parallel (if you use the `@model.parallelizable` decorator, as above). This is particularly convenient
104 | in case we are computing a large number of metrics and want to leverage our CPU as much as possible.
105 | Note that only analyses can be parallelized in Bulldog.
106 | 
107 | ### Custom checkpoints management
108 | 
109 | Out of the box, Bulldog doesn't implement any custom diffing logic for the model `data` (since it's a generic dictionary which could contain anything),
110 | but you can provide your own functions to checkpoint & restore your data. For example you might want to write/read:
111 | 
112 | 1) from a database
113 | 2) from a pickled file on disk
114 | 3) from HDF5
115 | 4) diffs from custom diffing tools (or generic ones like [csv-diff](https://github.com/aswinkarthik/csvdiff))
116 | 
117 | If you want to provide some custom save/load logic to handle checkpoint save & restore, pass these two functions to the Model initializer:
118 | 
119 | 1) `on_checkpoint_save(data, version_key, history)`: this function is responsible for saving the `data` (or a diff of it which you can compute by comparing it with your model `history`, holding every other checkpoint data)
120 | 2) `on_checkpoint_restore(version_key, history)`: this function is responsible for restoring data from a previous checkpoint
121 | 
122 | For example if you want to read from disk pickled objects you might do:
123 | 
124 | ```python
125 | def on_checkpoint_save(data, key, history):
126 |     file_name = 'data_{}.pkl'.format(key.step)
127 |     pickle.dump(data, open('data_{}.pkl'.format(key.step), 'wb'))
128 |     return file_name  # only the file name will be saved in memory
129 | 
130 | 
131 | def on_checkpoint_restore(key, history):
132 |     file_name = history[key]
133 |     return pickle.load(open(file_name, 'rb'))  # store this in model.data
134 | 
135 | 
136 | model = Model(
137 |     data={
138 |         'df': pd.DataFrame(pd.np.ones((100, 100))),
139 |     },
140 |     on_checkpoint_save=on_checkpoint_save,
141 |     on_checkpoint_restore=on_checkpoint_restore
142 | )
143 | ```
144 | 
145 | ### Advanced usage
146 | 
147 | Bulldog has a few nice features for people that use interactive editors (like `ipython` or `jupyter notebook`).
148 | 
149 | 1) You can prevent the same `business_logic` from running multiple times by setting `unique_bl_steps=True` (the default) in `Model`. This will prevent your state from being modified multiple times if you re-run cells in a notebook.
150 | 2) You can restore the model data to a previous checkpoint by running either `rollback(n_steps)` or `revert_version(Version)`. This is useful both for reproducibility/debugging and for jupyter users who don't want to re-run a whole lengthy analysis after a wrong alteration of the model data.
151 | 3) *Testing:* still to be developed. Ideally bulldog will allow you to test every single component in a much easier way and possibly also with mocked data.
152 | 
153 | 


--------------------------------------------------------------------------------