├── requirements.txt ├── bulldog ├── __init__.py └── model.py ├── .gitignore ├── setup.py ├── LICENSE ├── example └── example.py └── readme.MD /requirements.txt: -------------------------------------------------------------------------------- 1 | pathos -------------------------------------------------------------------------------- /bulldog/__init__.py: -------------------------------------------------------------------------------- 1 | name = "example_pkg" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .idea 3 | *.pyo 4 | *.pyc 5 | *.pyi 6 | build/ 7 | bulldog.egg-info/ 8 | dist/ 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from bulldog.model import version 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | setuptools.setup( 8 | name="bulldog", 9 | version=version, 10 | author="Luca Giacomel", 11 | author_email="luca.giacomel@gmail.com", 12 | description="State management for Data Science & Analytics", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/luke14free/bulldog", 16 | packages=setuptools.find_packages(), 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | ], 22 | install_requires=['pathos'], 23 | python_requires='>=3.6', 24 | ) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | 21 | -------------------------------------------------------------------------------- /example/example.py: -------------------------------------------------------------------------------- 1 | from bulldog.bulldog import Model, Version 2 | import pandas as pd 3 | import time 4 | import pickle 5 | 6 | 7 | def on_checkpoint_save(data, key, history): 8 | file_name = 'data_{}.pkl'.format(key.step) 9 | pickle.dump(data, open('data_{}.pkl'.format(key.step), 'wb')) 10 | return file_name # only the file name will be saved in memory 11 | 12 | 13 | def on_checkpoint_restore(key, history): 14 | file_name = history[key] 15 | return pickle.load(open(file_name, 'rb')) # store this in model.data 16 | 17 | 18 | model = Model( 19 | data={ 20 | 'df': pd.DataFrame(pd.np.ones((100, 100))), 21 | }, 22 | on_checkpoint_save=on_checkpoint_save, 23 | on_checkpoint_restore=on_checkpoint_restore 24 | ) 25 | 26 | 27 | @model.data_modifier 28 | def data_step(data, factor): 29 | df = data['df'] 30 | df *= factor 31 | return data # this will modify the data 32 | 33 | 34 | @model.business_logic 35 | @model.checkpoint 36 | def action1(data, commit): 37 | data['df'] /= 8000 # this has no effect whatsoever, we are modifying a copy 38 | commit("data_step", 9) 39 | return data # consequently this does nothing 40 | 41 | 42 | @model.analysis 43 | @model.parallelizable 44 | def analysis(data, history): 45 | df = data['df'] 46 | time.sleep(3) 47 | print('fast 1', list(history.keys())[-1].name, pd.np.mean(df.values)) 48 | 49 | 50 | @model.analysis 51 | @model.parallelizable 52 | def analysis2(data, history): 53 | df = data['df'] 54 | time.sleep(3) 55 | print('fast 2', list(history.keys())[-1].name, pd.np.mean(df.values)) 56 | 57 | 58 | @model.analysis 59 | def analysis3(data, history): 60 | df = data['df'] 61 | time.sleep(3) 62 | print('slow', list(history.keys())[-1].name, pd.np.mean(df.values)) 63 | 64 | 65 | def main(): 66 | model.dispatch('action1') 67 | model.commit('data_step', factor=9) 68 | print(model.history.keys()) 69 | model.revert_version(Version(name='action1', step=1)) 70 | # or equivalently `model.rollback(2)` 71 | print(model.history.keys()) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /bulldog/model.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from typing import Callable, List, Any, Dict, Union 3 | from pathos.multiprocessing import ProcessingPool as Pool 4 | 5 | from copy import deepcopy 6 | from collections import OrderedDict, namedtuple 7 | from multiprocessing import cpu_count 8 | 9 | version = "1.0.3" 10 | 11 | FROZEN_ERROR = "model data is readonly. It can only be modified with a data_modifier" 12 | DATA_MODIFIER_NOT_CALLABLE_ERROR = "data_modifier(s) should be committed, not called directly" 13 | BUSINESS_LOGIC_NOT_CALLABLE_ERROR = "business_logic(s) should be dispatched, not called directly" 14 | DATA_MODIFIER_NOT_FOUND = "{} is not a registered data_modifier" 15 | BUSINESS_LOGIC_NOT_FOUND = "{} is not a registered business_logic" 16 | BUSINESS_LOGIC_ALREADY_EXECUTED = "{} was already executed, raising and error since `unique_bl_steps` was set to true" 17 | 18 | 19 | class BusinessLogicNotCallableException(Exception): 20 | pass 21 | 22 | 23 | class DataModifierNotCallableException(Exception): 24 | pass 25 | 26 | 27 | class DataNotDirectlyModifiableException(Exception): 28 | pass 29 | 30 | 31 | class DataModifierNotFound(Exception): 32 | pass 33 | 34 | 35 | class BusinessLogicNotFound(Exception): 36 | pass 37 | 38 | 39 | class NoCheckpointAvailableForKey(Exception): 40 | pass 41 | 42 | 43 | class BusinessLogicAlreadyExecutedException(Exception): 44 | pass 45 | 46 | 47 | Version = namedtuple('Version', 'step name') 48 | 49 | 50 | class Model: 51 | def __init__(self, data: Dict, max_pool_size=0, unique_bl_steps=True, 52 | on_checkpoint_save: Union[None, Callable] = None, 53 | on_checkpoint_restore: Union[None, Callable] = None) -> None: 54 | self._data = data 55 | self.checkpoints = [] 56 | self.data_modifiers = {} 57 | self.business_logics = {} 58 | self.analyses = {} 59 | self.parallelizables = [] 60 | self._history = OrderedDict({}) 61 | self.unique_bl_steps = unique_bl_steps 62 | self.pool = Pool(max_pool_size or cpu_count()) 63 | self.on_checkpoint_save = on_checkpoint_save 64 | self.on_checkpoint_restore = on_checkpoint_restore 65 | 66 | @property 67 | def data(self) -> Dict: 68 | return deepcopy(self._data) 69 | 70 | @data.setter 71 | def data(self, value: Dict) -> None: 72 | raise DataNotDirectlyModifiableException(FROZEN_ERROR) 73 | 74 | @data.deleter 75 | def data(self) -> None: 76 | raise DataNotDirectlyModifiableException(FROZEN_ERROR) 77 | 78 | @property 79 | def history(self) -> Dict: 80 | return deepcopy(self._history) 81 | 82 | @history.setter 83 | def history(self, value: Dict) -> None: 84 | raise DataNotDirectlyModifiableException(FROZEN_ERROR) 85 | 86 | @history.deleter 87 | def history(self) -> None: 88 | raise DataNotDirectlyModifiableException(FROZEN_ERROR) 89 | 90 | def data_modifier(self, func: Callable) -> Callable: 91 | self.data_modifiers[func.__name__] = func 92 | 93 | @wraps 94 | def wrapper(): 95 | raise DataModifierNotCallableException(DATA_MODIFIER_NOT_CALLABLE_ERROR) 96 | 97 | return wrapper 98 | 99 | def business_logic(self, func: Callable) -> Callable: 100 | self.business_logics[func.__name__] = func 101 | 102 | @wraps 103 | def wrapper(): 104 | raise BusinessLogicNotCallableException(BUSINESS_LOGIC_NOT_CALLABLE_ERROR) 105 | 106 | return wrapper 107 | 108 | def run_analyses(self): 109 | if not self.parallelizables: 110 | for func in self.analyses.values(): 111 | func(self.data, self._history) 112 | else: 113 | analyses = [i for i in self.analyses.keys() if i in self.parallelizables] 114 | 115 | def run_analysis_by_key(key: str) -> Any: 116 | if key in self.parallelizables: 117 | return self.analyses[key](self.data, self._history) 118 | else: 119 | return None 120 | 121 | self.pool.map(run_analysis_by_key, analyses) 122 | for func_name, func in self.analyses.items(): 123 | if func_name not in self.parallelizables: 124 | func(self.data, self._history) 125 | 126 | def commit(self, data_modifier_name: str, *commit_args: List, **commit_kwargs: Dict) -> Dict: 127 | version_key = Version(step=len(self._history), name=data_modifier_name) 128 | self._history[version_key] = None 129 | try: 130 | self._data = self.data_modifiers[data_modifier_name](self.data, *commit_args, **commit_kwargs) 131 | except KeyError: 132 | raise DataModifierNotFound(DATA_MODIFIER_NOT_FOUND.format(data_modifier_name)) 133 | if data_modifier_name in self.checkpoints: 134 | self.run_analyses() 135 | self._history[version_key] = self.data if not self.on_checkpoint_save else self.on_checkpoint_save( 136 | self.data, self.history) 137 | return self._data 138 | 139 | def dispatch(self, business_logic_name: str, *args: List, **kwargs: Dict) -> Any: 140 | commit = self.commit 141 | if self.unique_bl_steps: 142 | for key in self.history.keys(): 143 | if key.name == business_logic_name: 144 | raise BusinessLogicAlreadyExecutedException(BUSINESS_LOGIC_ALREADY_EXECUTED) 145 | try: 146 | output = self.business_logics[business_logic_name](self.data, *args, **kwargs, commit=commit) 147 | except KeyError: 148 | raise BusinessLogicNotFound(BUSINESS_LOGIC_NOT_FOUND.format(business_logic_name)) 149 | version_key = Version(step=len(self._history), name=business_logic_name) 150 | if business_logic_name in self.checkpoints: 151 | self.run_analyses() 152 | self._history[version_key] = self.data if not self.on_checkpoint_save else self.on_checkpoint_save( 153 | self.data, version_key, self.history) 154 | else: 155 | self._history[version_key] = None 156 | return output 157 | 158 | def checkpoint(self, func: Callable) -> Callable: 159 | self.checkpoints.append(func.__name__) 160 | return func 161 | 162 | def parallelizable(self, func: Callable) -> Callable: 163 | self.parallelizables.append(func.__name__) 164 | return func 165 | 166 | def analysis(self, func: Callable) -> Callable: 167 | self.analyses[func.__name__] = func 168 | return func 169 | 170 | def revert_version(self, version_key: Version) -> None: 171 | prev_value = self._history[version_key] if not self.on_checkpoint_restore else self.on_checkpoint_restore( 172 | version_key, self._history) 173 | if not prev_value: 174 | raise NoCheckpointAvailableForKey(version_key) 175 | self._data = prev_value 176 | for key in self.history.keys(): 177 | if key.step > version_key.step: 178 | del self._history[key] 179 | 180 | def rollback(self, number_of_steps: int = 1) -> None: 181 | version_key = list(self._history.keys())[-(number_of_steps + 1)] 182 | self.revert_version(version_key) 183 | -------------------------------------------------------------------------------- /readme.MD: -------------------------------------------------------------------------------- 1 | # Bulldog 2 | 3 | The guardian dog that prevents you from writing poor code when doing data analysis in Python. 4 | 5 | ## Installation 6 | 7 | Simply run: 8 | 9 | `pip install bulldog` 10 | 11 | ## Philosophy 12 | 13 | Bulldog is a library for writing better code in your analysis that largely borrows from the state management libraries for application development ([Redux](https://github.com/reduxjs/redux), [Flux](https://github.com/facebook/flux), [Vuex](https://github.com/vuejs/vuex), [Katana](https://github.com/BendingSpoons/katana-swift)..). 14 | 15 | Bulldog models are composed of five main building blocks: 16 | 1) `data`, which is our model initial data 17 | 2) `data_modifiers`, which are special function whose main task is to modify the model's data 18 | 3) `business_logic`, which are function whose main task is to execute the business logic and invoke `data_modifiers` 19 | 4) `analyses`, which subscribe to change on the model's `data` 20 | 5) `history`, which is a backlog of all the operations that have occurred and the corresponding state of the model `data` 21 | 22 | The philosophy behind bulldog is to **separate** data transformations, business logic and analyses, in order to make 23 | clarity, testing and debugging easier to achieve. 24 | 25 | ## Working with bulldog 26 | 27 | To create a bulldog model simply run: 28 | ```python 29 | from bulldog.model import Model 30 | import pandas as pd 31 | 32 | model = Model(data={ 33 | 'df': pd.DataFrame(pd.np.ones((100, 100))), 34 | 'other_data': [1, 2, 3] 35 | }) 36 | ``` 37 | 38 | All the data is stored in our `model.data` and it's not directly modifiable. In fact, whenever we access `model.data` we are actually accessing a copy of the original model data. 39 | 40 | In order to alter/modify our data we need to create some special functions called `data_modifiers`. 41 | 42 | ```python 43 | @model.data_modifier 44 | def data_step(data, factor): 45 | df = data['df'] 46 | df *= factor 47 | return data # this will modify the data 48 | ``` 49 | 50 | As we can see data modifiers are just simple, pure functions that take our model data as input and perform some kind of alteration on it 51 | and return the altered data. The signature of a business model is `function(data, *args, **kwargs)` and it needs to be 52 | decorated with the `@model.data_modifier` decorator (where `model` is your instance of Bulldog's `Model`). 53 | 54 | If we want to execute a `data_modifier`, rather than calling it directly we need to ask the model to commit it: 55 | 56 | ```python 57 | model.commit('data_step', factor=9) # 'data_step' is the name of our `data_modifier` 58 | ``` 59 | 60 | Note that any other way of calling the function will result in an error. E.g.: 61 | 62 | ```python 63 | data_step(data=model.data, factor=9) # wrong; this will throw an error 64 | ``` 65 | 66 | Great! but what if we need to run some business logic to conditionally modify our dataset? 67 | Maybe we need to download some data and based on that perform some actions that will eventually 68 | lead us to modify our data. In this case we should use a `business_logic` function. 69 | 70 | ```python 71 | @model.business_logic 72 | @model.checkpoint 73 | def action1(data, commit): 74 | data['df'] /= 8000 # this has no effect whatsoever on our data, remember? We are modifying a copy 75 | if max(data['df']) < 0.38: 76 | commit("data_step", 9) # but this will actually modify our data 77 | ``` 78 | 79 | As we can see `business_logic` are function with the signature `function(data, commit, *args, **kwargs)` which take as input the data 80 | and have the possibility of committing `data_modifier` functions to our original model 81 | 82 | You might have noted the additional `@model.checkpoint` decorator (which can also be applied to `data_modifiers`). It will basically tell our model to store the current state data after computing 83 | this function (and store it in `model.history`), allowing us to restore it or inspect it at a later stage, which is very convenient for debugging. 84 | 85 | Similarly to `data_modifiers`, also `business_logic` cannot be execute directly, and have to be dispatched through the model in this way: 86 | 87 | ```python 88 | model.dispatch('action1') 89 | ``` 90 | 91 | Now, you might wonder how to run analyses on the model's data. That's fairly simple! 92 | 93 | ```python 94 | @model.analysis 95 | @model.parallelizable 96 | def analysis(data, history): 97 | df = data['df'] 98 | time.sleep(3) 99 | print('fast 1', list(history.keys())[-1].name, pd.np.mean(df.values)) 100 | ``` 101 | 102 | Analyses are functions with signature `function(data, history)` that are run automatically every time a checkpoint step of our model is executed. 103 | Optionally analyses can be run in parallel (if you use the `@model.parallelizable` decorator, as above). This is particularly convenient 104 | in case we are computing a large number of metrics and want to leverage our CPU as much as possible. 105 | Note that only analyses can be parallelized in Bulldog. 106 | 107 | ### Custom checkpoints management 108 | 109 | Out of the box, Bulldog doesn't implement any custom diffing logic for the model `data` (since it's a generic dictionary which could contain anything), 110 | but you can provide your own functions to checkpoint & restore your data. For example you might want to write/read: 111 | 112 | 1) from a database 113 | 2) from a pickled file on disk 114 | 3) from h5df 115 | 3) diffs from custom diffing tools (or generic ones like [csv-diff](https://github.com/aswinkarthik/csvdiff)) 116 | 117 | If you want to provide some custom save/load logic to handle checkpoint save & restore, pass these two functions to the Model initializer: 118 | 119 | 1) `on_checkpoint_save(data, version_key, history)`: this function is responsible for saving the `data` (or a diff of it which you can compute by comparing it with your model `history`, holding every other checkpoint data) 120 | 2) `on_checkpoint_restore(version_key, history)`: this function is responsible for restoring data from a previous checkpoint 121 | 122 | For example if you want to read from disk pickled objects you might do: 123 | 124 | ```python 125 | def on_checkpoint_save(data, key, history): 126 | file_name = 'data_{}.pkl'.format(key.step) 127 | pickle.dump(data, open('data_{}.pkl'.format(key.step), 'wb')) 128 | return file_name # only the file name will be saved in memory 129 | 130 | 131 | def on_checkpoint_restore(key, history): 132 | file_name = history[key] 133 | return pickle.load(open(file_name, 'rb')) # store this in model.data 134 | 135 | 136 | model = Model( 137 | data={ 138 | 'df': pd.DataFrame(pd.np.ones((100, 100))), 139 | }, 140 | on_checkpoint_save=on_checkpoint_save, 141 | on_checkpoint_restore=on_checkpoint_restore 142 | ) 143 | ``` 144 | 145 | ### Advanced usage 146 | 147 | Bulldog has a few nice features for people that use interactive editors (like `ipython` or `jupyter notebook`). 148 | 149 | 1) You can prevent the same `business_logic` from running multiple times by setting `unique_bl_step=True` in `Model`. This will prevent your state from being modified multiple times if you re-run cells in a notebook. 150 | 2) You can restore the version model data at a previous checkpoint by running either `rollback(n_steps)` or `revert_version(Version)`. This is useful both for reproducibility/debugging and for jupyter users who don't want to re-run a whole lengthy analysis after a wrong alteration of the model data. 151 | 3) *Testing:* still to be developed. Ideally bulldog will allow you to test every single component in a much easier way and possibly also with mocked data. 152 | 153 | --------------------------------------------------------------------------------