├── .gitignore ├── MANIFEST.in ├── README.markdown ├── __init__.py ├── setup.py ├── static └── examples │ ├── 1.png │ ├── 2.png │ └── 3.png └── stepist ├── __init__.py ├── app.py ├── app_config.py ├── dbs.py └── flow ├── __init__.py ├── libs ├── __init__.py └── simple_queue.py ├── session.py ├── signals.py ├── stats ├── __init__.py ├── utils.py └── worker.py ├── steps ├── __init__.py ├── factory_step.py ├── hub.py ├── next_step.py ├── reducer_step.py └── step.py ├── utils.py └── workers ├── __init__.py ├── adapters ├── __init__.py ├── rm_queue.py ├── simple_queue.py ├── sqs_queue.py └── utils.py ├── boost ├── __init__.py ├── shared_memory.py └── sockets.py ├── reducer_engine.py ├── utils.py └── worker_engine.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | setup.cfg 3 | *.egg-info 4 | /dist 5 | tests 6 | .idea 7 | 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include stepist/tests/ * 2 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/stepist.svg)](https://badge.fury.io/py/stepist) 2 | 3 | Stepist. Framework for data processing. 4 | 5 | 6 | The main Stepist goal is to simplify working with data.
7 |
8 |
9 | 10 | **What for:**
11 | - Real-time distributed services 12 | - ETL tasks 13 | - Preparing data for AI models 14 | 15 | 16 |
17 | 18 | 19 | **So, what is Stepist?**

20 | Stepist is a tool for creating a sequence of functions (called steps) which represents an execution flow.
21 | The result of each step is the input for the next step, so you end up with a graph (data pipeline) 22 | which can handle data using streaming services (Celery, RQ, Redis) or batch processing tools (Kafka). 23 | 24 |
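In practice, each step returns a dict and that dict becomes the keyword arguments of the next step; the first step of a flow is called like a regular function. A minimal sketch of that contract (the `extract`/`normalize` steps here are purely illustrative; fuller examples follow below):

```python
from stepist import App

app = App()

@app.step(None)            # next_step=None -> last step of the flow
def normalize(text):
    return dict(normalized=text.lower())

@app.step(normalize)       # the returned dict becomes normalize's kwargs
def extract(raw):
    return dict(text=raw.strip())

print(extract(raw="  Stepist  "))
# roughly: {'normalize': {'normalized': 'stepist'}}
```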
25 | 26 | **Install** 27 | 28 | ```bash 29 | # set up Redis first: https://redis.io/topics/quickstart 30 | pip install stepist 31 | 32 | ``` 33 | 34 |
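Stepist talks to Redis using default connection settings (localhost:6379). Judging by `stepist/app_config.py` and `stepist/dbs.py`, a different Redis instance can be selected by passing `redis_kwargs` to the App; a minimal sketch, assuming Redis listens on a non-default port:

```python
from stepist import App

# redis_kwargs is merged into AppConfig and forwarded to redis.Redis(...)
app = App(redis_kwargs=dict(host="127.0.0.1", port=6380))
```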
35 | 36 | ###### Basic definitions: 37 | - **App** - Collects all your objects and holds the full configuration of the system. 38 | - **Step** - Basic object. Connects multiple functions into a flow. 39 | - **Flow** - Chain of steps which starts from a simple step and ends with a step whose next_step=None. 40 | 41 |
42 | 43 | ### Examples: 44 | 45 | **Simple step by step flow. (result of each step is input for the next)** 46 | 47 | 48 | ```python 49 | from stepist import App 50 | 51 | app = App() 52 | 53 | @app.step(None) 54 | def step2(a_plus_b, a_minus_b): 55 | return dict(result=a_plus_b * 56 | a_minus_b) 57 | 58 | @app.step(step2) 59 | def step1(a, b): 60 | return dict(a_plus_b=a+b, 61 | a_minus_b=a-b) 62 | 63 | print(step1(5,5)) 64 | 65 | ``` 66 | 67 | 68 | 69 | **Simple step by step flow with workers** 70 | 71 | 72 | ```python 73 | import sys 74 | import requests 75 | 76 | from stepist import App 77 | 78 | app = App() 79 | 80 | URLS = ['https://www.python.org/', 81 | 'https://wikipedia.org/wiki/Python_(programming_language)'] 82 | 83 | @app.step(None) 84 | def step3(text, **kwargs): 85 | print(text.count('python')) 86 | 87 | @app.factory_step(step3, as_worker=True) 88 | def step2(url): 89 | r = requests.get(url) 90 | return dict(url=url, 91 | text=r.text) 92 | 93 | @app.step(step2) 94 | def step1(urls): 95 | for url in urls: 96 | yield dict(url=url) 97 | 98 | if sys.argv[1] == "worker": 99 | app.run(step2) # run worker 100 | else: 101 | step1(urls=URLS) 102 | 103 | # Worker process: 104 | # >>> 94 105 | # >>> 264 106 | 107 | ``` 108 | 109 | 110 | 111 | **Call multiple steps at once (Map)** 112 |
113 | Define Hub(list_of_next_steps) as a next step. 114 | 115 | ```python 116 | import sys 117 | import requests 118 | 119 | from stepist import Hub 120 | from stepist import App 121 | 122 | app = App() 123 | 124 | URLS = ['https://www.python.org/', 125 | 'https://wikipedia.org/wiki/Python_(programming_language)'] 126 | 127 | @app.step(None) 128 | def step3(text, **kwargs): 129 | c = text.count('python') 130 | return c 131 | 132 | @app.factory_step(step3, as_worker=True) 133 | def step2_v2(url): 134 | r = requests.get(url) 135 | return dict(url=url, 136 | text=r.text) 137 | 138 | @app.factory_step(step3, as_worker=True) 139 | def step2(url): 140 | r = requests.get(url) 141 | return dict(url=url, 142 | text=r.text) 143 | 144 | @app.step(Hub(step2, step2_v2)) 145 | def step1(urls): 146 | for url in urls: 147 | yield dict(url=url) 148 | 149 | if sys.argv[1] == "worker": 150 | app.run() # run workers 151 | else: 152 | print(step1(urls=URLS)) 153 | 154 | # print, from main process 155 | # >>> [94, 264] 156 | 157 | ``` 158 | 159 | **Сombine data from multiple steps. (Reduce)** 160 |
161 | Define @app.reducer_step and linked it with pipeline "leaves" 162 | 163 | ```python 164 | import sys 165 | import requests 166 | 167 | from stepist import Hub 168 | from stepist import App 169 | 170 | app = App() 171 | 172 | URLS = ['https://www.python.org/', 173 | 'https://wikipedia.org/wiki/Python_(programming_language)'] 174 | 175 | @app.reducer_step() 176 | def step3(job_list): 177 | return dict(c1=job_list[0].count('python'), 178 | c2=job_list[1].count('python')) 179 | 180 | @app.factory_step(step3, as_worker=True) 181 | def step2_v2(url): 182 | r = requests.get(url) 183 | return dict(url=url, 184 | text=r.text) 185 | 186 | @app.factory_step(step3, as_worker=True) 187 | def step2(url): 188 | r = requests.get(url) 189 | return dict(url=url, 190 | text=r.text) 191 | 192 | @app.step(Hub(step2, step2_v2)) 193 | def step1(urls): 194 | for url in urls: 195 | yield dict(url=url) 196 | 197 | if sys.argv[1] == "worker": 198 | app.run() # run workers 199 | else: 200 | print(step1(urls=URLS)) 201 | 202 | # print, from main process 203 | # >>> [94, 264] 204 | 205 | ``` 206 | 207 |
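A note on running the example above: worker steps and the reducer are consumed by separate loops. A minimal sketch of the moving parts, based on `run`, `just_do_it` and `run_reducer` in `stepist/app.py` (step names refer to the example above):

```python
# In a worker process, consume queued jobs for every step marked as_worker=True:
app.run()                  # or app.just_do_it(4) to spawn 4 worker processes

# In another process, aggregate the collected results:
app.run_reducer(step3)     # calls step3(job_list=[...]) once all Hub branches have delivered
```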


208 | **Celery** 209 |
210 | Stepist Campatible with Celery 211 | 212 | ```python 213 | 214 | from celery import Celery 215 | from stepist import App 216 | from stepist.flow.workers.adapters.celery_queue import CeleryAdapter 217 | 218 | app = App() 219 | 220 | celery = Celery(broker="redis://localhost:6379/0") 221 | app.worker_engine = CeleryAdapter(app, celery) 222 | 223 | 224 | @app.step(None, as_worker=True) 225 | def step3(result): 226 | return dict(result=result[:2]) 227 | 228 | @app.step(step3, as_worker=True) 229 | def step2(hello, world): 230 | return dict(result="%s %s" % (hello, world)) 231 | 232 | @app.step(step2) 233 | def step1(hello, world): 234 | return dict(hello=hello.upper(), 235 | world=world.upper()) 236 | 237 | if __name__ == "__main__": 238 | print(step1(hello='hello', 239 | world='world')) 240 | app.run() 241 | 242 | ``` 243 | 244 | **Custom streaming adapter** 245 |
246 | Just define following functions in Base adapter class and assign to app.worker_engine 247 | ```python 248 | 249 | from stepist import App 250 | from stepist.workers.worker_engine import BaseWorkerEngine 251 | 252 | 253 | class CustomWorkerEngine(BaseWorkerEngine): 254 | 255 | def add_job(self, step, data, result_reader, **kwargs): 256 | raise NotImplemented() 257 | 258 | def jobs_count(self, *steps): 259 | raise NotImplemented() 260 | 261 | def flush_queue(self, step): 262 | raise NotImplemented() 263 | 264 | def process(self, *steps): 265 | raise NotImplemented() 266 | 267 | def register_worker(self, handler): 268 | raise NotImplemented() 269 | 270 | 271 | app = App() 272 | app.worker_engine = CustomWorkerEngine() 273 | 274 | ``` 275 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/__init__.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name="stepist", 6 | version="0.1.6.1", 7 | author="Aleh Shydlouski", 8 | author_email="oleg.ivye@gmail.com", 9 | description="Data process utils", 10 | keywords=['data', 'ai', 'distribute'], 11 | packages=find_packages(), 12 | include_package_data=True, 13 | install_requires=[ 14 | 'tqdm', 15 | 'redis >= 3.0.0', 16 | 'blinker', 17 | 'click', 18 | 'ujson>=1.0', 19 | 20 | ], 21 | url='https://github.com/electronick1/stepist', 22 | download_url='https://github.com/electronick1/stepist/archive/0.1.6.1.tar.gz', 23 | classifiers=[], 24 | ) 25 | -------------------------------------------------------------------------------- /static/examples/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/static/examples/1.png -------------------------------------------------------------------------------- /static/examples/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/static/examples/2.png -------------------------------------------------------------------------------- /static/examples/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/static/examples/3.png -------------------------------------------------------------------------------- /stepist/__init__.py: -------------------------------------------------------------------------------- 1 | from .flow import * 2 | from .flow.steps import * 3 | from .app import App 4 | try: 5 | from .flow.workers.adapters.rm_queue import RQAdapter 6 | from .flow.workers.adapters.sqs_queue import SQSAdapter 7 | except ImportError: 8 | pass 9 | from .flow.workers.adapters.simple_queue import SimpleQueueAdapter as RedisAdapter 10 | 11 | -------------------------------------------------------------------------------- /stepist/app.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | from stepist.flow.steps import Step, FactoryStep 3 | 4 | from stepist.app_config import 
AppConfig 5 | from stepist.dbs import DBs 6 | from stepist.flow import workers 7 | 8 | from stepist.flow.workers import simple_multiprocessing 9 | from stepist.flow.workers.adapters import simple_queue 10 | from stepist.flow.workers import reducer_engine 11 | 12 | from stepist.flow.steps.reducer_step import ReducerStep 13 | 14 | from stepist.flow.workers.boost import sockets 15 | 16 | 17 | class App: 18 | 19 | def __init__(self, worker_engine=None, use_booster=False, booster=None, 20 | data_pickler=ujson, **config_kwargs): 21 | self.steps = dict() 22 | self.default_dbs = None 23 | self.verbose = None 24 | self.use_booster = use_booster 25 | 26 | self.data_pickler = data_pickler 27 | 28 | self.config = AppConfig(**{**AppConfig.init_default(), 29 | **config_kwargs}) 30 | self.load_config(self.config) 31 | 32 | self.worker_engine = worker_engine 33 | self.booster = booster 34 | 35 | if self.worker_engine is None: 36 | self.worker_engine = simple_queue.SimpleQueueAdapter( 37 | self.default_dbs.redis_db, 38 | data_pickler=self.data_pickler 39 | ) 40 | 41 | self.reducer_engine = reducer_engine.RedisReducerEngine( 42 | app=self, 43 | redis_db=self.default_dbs.redis_db, 44 | reducer_job_lifetime=30, # 30 sec 45 | reducer_no_job_sleep_time=1, # 1 sec 46 | ) 47 | 48 | if use_booster: 49 | if booster is not None: 50 | self.booster = booster 51 | else: 52 | self.booster = sockets.SocketBooster(self) 53 | else: 54 | self.booster = None 55 | 56 | def run(self, steps=None, die_on_error=True, die_when_empty=False): 57 | if steps is None: 58 | steps = self.get_workers_steps() 59 | 60 | return workers.process(self, 61 | *steps, 62 | die_on_error=die_on_error, 63 | die_when_empty=die_when_empty) 64 | 65 | def run_booster(self, steps=None, die_on_error=True, die_when_empty=False): 66 | if self.booster is None: 67 | raise RuntimeError("Booster is not enabled. Set use_booster=True " 68 | "in app initialization.") 69 | if steps is None: 70 | steps = self.get_workers_steps() 71 | 72 | self.booster.process(steps, 73 | die_on_error=die_on_error, 74 | die_when_empty=die_when_empty) 75 | 76 | def run_reducer(self, reducer_step): 77 | self.reducer_engine.process(reducer_step) 78 | 79 | def just_do_it(self, workers_count, *args, _warning=True, **kwargs): 80 | if _warning: 81 | print("You are using python multiprocessing for workers") 82 | return simple_multiprocessing(self, 83 | workers_count, 84 | *args, 85 | steps=self.get_workers_steps(), 86 | **kwargs) 87 | 88 | def load_config(self, config_object): 89 | self.config = config_object 90 | self.init_dbs(config_object) 91 | 92 | def init_dbs(self, config): 93 | self.default_dbs = DBs(config) 94 | 95 | def get_workers_steps(self): 96 | return list(filter(lambda step: step.as_worker, self.steps.values())) 97 | 98 | def get_reducers_steps(self): 99 | return list(filter(lambda step: isinstance(step, ReducerStep), 100 | self.steps.values())) 101 | 102 | def register_step(self, step): 103 | if str(step) in self.steps: 104 | raise RuntimeError("Step '%s' already exists!" 
% str(step)) 105 | 106 | self.steps[step.step_key()] = step 107 | if step.as_worker: 108 | self.worker_engine.register_worker(step) 109 | 110 | def add_job(self, step, data, skip_booster=False, **kwargs): 111 | if self.booster and not skip_booster: 112 | self.booster.send_job(step, data, **kwargs) 113 | else: 114 | self.worker_engine.add_job(step, data, **kwargs) 115 | 116 | def step(self, next_step, as_worker=False, wait_result=False, 117 | unique_id=None, save_result=False, name=None): 118 | """ 119 | Step decorator which initialize Step object, and register Step 120 | inside stepist 121 | 122 | 123 | :param next_step: next Step instance 124 | :param as_worker: True if it should be distribute 125 | :param wait_result: allow to return results in previous step 126 | :return: 127 | """ 128 | 129 | def _wrapper(handler): 130 | step = Step(self, 131 | handler, 132 | next_step, 133 | as_worker=as_worker, 134 | unique_id=unique_id, 135 | wait_result=wait_result, 136 | save_result=save_result, 137 | name=name) 138 | 139 | return step 140 | 141 | return _wrapper 142 | 143 | def reducer_step(self): 144 | """ 145 | ReducerStep decorator. We need this for aggregate all jobs results into one 146 | step. And also register step in global step list. 147 | 148 | In args you will get iterator which allow you go through all jobs results 149 | and process it. 150 | 151 | For example you can paste everything into AI model 152 | 153 | :return: ReducerStep instance 154 | """ 155 | 156 | def _wrapper(handler): 157 | step = ReducerStep(self, handler) 158 | 159 | self.register_step(step) 160 | return step 161 | 162 | return _wrapper 163 | 164 | def factory_step(self, next_step, as_worker=False): 165 | """ 166 | Factory step decorator. If your step decorated by this function - your 167 | step should return iterator, and each item from this iter will be added 168 | to next step. 
169 | 170 | :param next_step: Step instance 171 | :param as_worker: True if it should be distribute 172 | :return: 173 | """ 174 | 175 | def _wrapper(handler): 176 | step = Step(self, 177 | handler, 178 | next_step, 179 | as_worker=as_worker, 180 | wait_result=False) 181 | 182 | step.set_factory(FactoryStep(step)) 183 | 184 | self.register_step(step) 185 | return step 186 | 187 | return _wrapper 188 | 189 | def set_verbose(self, verbose): 190 | self.verbose = verbose 191 | -------------------------------------------------------------------------------- /stepist/app_config.py: -------------------------------------------------------------------------------- 1 | from stepist.flow.utils import AttrDict 2 | 3 | 4 | DEFAULT_REDIS_KWARGS = dict( 5 | host='localhost', 6 | port=6379 7 | ) 8 | 9 | 10 | class AppConfig(AttrDict): 11 | 12 | @classmethod 13 | def init_default(cls): 14 | return cls( 15 | redis_kwargs=DEFAULT_REDIS_KWARGS, 16 | redis_stats_kwargs=DEFAULT_REDIS_KWARGS 17 | ) 18 | 19 | -------------------------------------------------------------------------------- /stepist/dbs.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | from stepist import app_config 4 | 5 | 6 | class DBs: 7 | 8 | def __init__(self, config): 9 | self.config = config 10 | self.redis_db = init_redis(**config.redis_kwargs) 11 | self.redis_stats = init_redis(**config.redis_stats_kwargs) 12 | 13 | def reset(self): 14 | return self.__class__(self.config) 15 | 16 | 17 | def init_redis(**redis_kwargs): 18 | return redis.Redis(**redis_kwargs) 19 | -------------------------------------------------------------------------------- /stepist/flow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/__init__.py -------------------------------------------------------------------------------- /stepist/flow/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/libs/__init__.py -------------------------------------------------------------------------------- /stepist/flow/libs/simple_queue.py: -------------------------------------------------------------------------------- 1 | import random 2 | import redis 3 | import time 4 | 5 | 6 | HANDLERS = {} 7 | 8 | 9 | class SimpleQueue: 10 | 11 | def __init__(self, pickler, redis_db): 12 | self.pickler = pickler 13 | self.redis_db = redis_db 14 | 15 | def process(self, jobs, wait_time_for_job=1, die_when_empty=False, 16 | die_on_error=True, verbose=False): 17 | keys = list(jobs.keys()) 18 | jobs_processed_before_empty = 0 19 | time_started_before_empty = time.time() 20 | 21 | while True: 22 | key, data = self.reserve_jobs(keys, wait_time_for_job) 23 | 24 | if data is None: 25 | if verbose and jobs_processed_before_empty: 26 | delta_time = round(time.time() - time_started_before_empty, 3) 27 | print("No more jobs in queues. Processed %s jobs in %s sec." 
% 28 | (jobs_processed_before_empty, delta_time)) 29 | print("Waiting for a jobs ....") 30 | 31 | jobs_processed_before_empty = 0 32 | time_started_before_empty = time.time() 33 | 34 | if die_when_empty: 35 | exit() 36 | time.sleep(0.5) 37 | continue 38 | 39 | jobs_processed_before_empty += 1 40 | handler = jobs[key] 41 | 42 | try: 43 | handler.receive_job(**data) 44 | except Exception: 45 | self.add_job(key, data) 46 | if die_on_error: 47 | raise 48 | 49 | def add_job(self, job_key, data): 50 | data = self.pickler.dumps({'data': data}) 51 | self.redis_db.lpush(job_key, data) 52 | 53 | def add_jobs(self, job_key, jobs_data): 54 | pipe = self.redis_db.pipeline() 55 | 56 | for job_data in jobs_data: 57 | data = self.pickler.dumps({'data': job_data}) 58 | pipe.lpush(job_key, data) 59 | 60 | pipe.execute() 61 | 62 | def reserve_jobs(self, job_keys, wait_timeout): 63 | random.shuffle(job_keys) 64 | try: 65 | job_data = self.redis_db.brpop(job_keys, 66 | timeout=wait_timeout) 67 | 68 | except redis.exceptions.TimeoutError: 69 | return None, None 70 | 71 | if not job_data: 72 | return None, None 73 | 74 | key = job_data[0].decode('utf-8') 75 | job_data = self.pickler.loads(job_data[1]) 76 | 77 | return key, job_data['data'] 78 | 79 | def flush_jobs(self, step_key): 80 | self.redis_db.delete(step_key) 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /stepist/flow/session.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | from contextlib import contextmanager 4 | 5 | 6 | local = threading.local() 7 | 8 | 9 | def storage(): 10 | if not hasattr(local, 'storage'): 11 | local.storage = dict( 12 | steps={}, 13 | steps_workers={}, 14 | steps_listen_for_job={}, 15 | meta_data={}, 16 | flow_data={}, 17 | ) 18 | return local.storage 19 | 20 | 21 | @contextmanager 22 | def change_flow_ctx(meta_data, flow_data): 23 | old_meta = storage().get("meta_data") 24 | old_flow_data = storage().get("flow_data") 25 | 26 | try: 27 | set_meta_data(meta_data) 28 | set_flow_data(flow_data) 29 | yield 30 | finally: 31 | set_meta_data(old_meta) 32 | set_flow_data(old_flow_data) 33 | 34 | 35 | def get_flow_data(): 36 | return storage().get("flow_data", {}) 37 | 38 | 39 | def get_meta_data(): 40 | return storage().get("meta_data", {}) 41 | 42 | 43 | def get_step_by_key(key): 44 | return get_steps().get(key, None) 45 | 46 | 47 | def get_steps(): 48 | return storage().get("steps", {}) 49 | 50 | 51 | def get_steps_workers(): 52 | return storage().get("steps_workers", {}) 53 | 54 | 55 | def get_steps_to_listen(): 56 | return storage().get("steps_listen_for_job", {}) 57 | 58 | 59 | def set_meta_data(meta_data): 60 | storage()['meta_data'] = meta_data 61 | 62 | 63 | def set_flow_data(flow_data): 64 | storage()['flow_data'] = flow_data 65 | 66 | 67 | def update_flow_data(flow_data): 68 | storage()['flow_data'].update(flow_data) 69 | 70 | 71 | def update_meta_data(**meta_data): 72 | current_meta_data = dict(get_meta_data()) 73 | current_meta_data.update(meta_data) 74 | 75 | set_meta_data(current_meta_data) 76 | 77 | 78 | def flush_session(): 79 | global local 80 | local = threading.local() 81 | -------------------------------------------------------------------------------- /stepist/flow/signals.py: -------------------------------------------------------------------------------- 1 | from blinker import signal 2 | 3 | before_step = signal("before_step") 4 | after_step = signal("after_step") 5 | 6 | flow_finished = 
signal("flow_finished") -------------------------------------------------------------------------------- /stepist/flow/stats/__init__.py: -------------------------------------------------------------------------------- 1 | from . import worker 2 | -------------------------------------------------------------------------------- /stepist/flow/stats/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def concat_steps_to_str(steps): 4 | return '||'.join([step.step_key() for step in steps]) 5 | -------------------------------------------------------------------------------- /stepist/flow/stats/worker.py: -------------------------------------------------------------------------------- 1 | from stepist.flow.dbs import get_redis_stats 2 | 3 | from . import utils 4 | 5 | 6 | WORKER_CONNECTION_NAME = "STATS::process::||{steps}" 7 | JOB_ADDED = "STATS::job_added::{job_key}" 8 | 9 | 10 | def starts(steps): 11 | steps_str = utils.concat_steps_to_str(steps) 12 | get_redis_stats().client_setname(WORKER_CONNECTION_NAME.format(steps=steps_str)) 13 | 14 | 15 | def job_added(job_key, data): 16 | get_redis_stats().incr(JOB_ADDED.format(job_key=job_key)) 17 | -------------------------------------------------------------------------------- /stepist/flow/steps/__init__.py: -------------------------------------------------------------------------------- 1 | from .step import Step 2 | from .factory_step import FactoryStep 3 | from .hub import Hub 4 | from .reducer_step import ReducerStep 5 | -------------------------------------------------------------------------------- /stepist/flow/steps/factory_step.py: -------------------------------------------------------------------------------- 1 | from .next_step import init_next_worker_step, init_next_step 2 | 3 | 4 | class FactoryStep(object): 5 | """ 6 | All to add jobs by iterator. Taking care about data iteration, and result 7 | reader. 8 | 9 | Able to return iterator, which can be used for reading flow result. 
10 | 11 | """ 12 | 13 | # current Step instance 14 | step = None 15 | 16 | def __init__(self, step): 17 | self.step = step 18 | 19 | def add_data_iter(self, data_iter): 20 | """ 21 | Getting data iterator, and put each item in queue 22 | 23 | :param data_iter: any data iterator object 24 | """ 25 | 26 | for row_data in data_iter: 27 | if self.step.as_worker: 28 | init_next_worker_step(row_data, 29 | self.step) 30 | else: 31 | init_next_step(row_data, self.step) 32 | 33 | 34 | -------------------------------------------------------------------------------- /stepist/flow/steps/hub.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from stepist.flow.session import update_meta_data 3 | 4 | 5 | class Hub(object): 6 | """ 7 | Allow to push data in multiple steps 8 | """ 9 | def __init__(self, *steps): 10 | self.steps = list(steps) 11 | 12 | def update_meta(self): 13 | hub_job_id = "%s:%s" % (uuid.uuid4(), len(self.steps)) 14 | update_meta_data(hub_job_id=hub_job_id) 15 | -------------------------------------------------------------------------------- /stepist/flow/steps/next_step.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from .hub import Hub 4 | 5 | 6 | def call_next_step(data, next_step, batch_data=False, **kwargs): 7 | 8 | next_step_handler = choose_next_step_handler(next_step) 9 | return next_step_handler(data, next_step, batch_data=batch_data, **kwargs) 10 | 11 | 12 | def choose_next_step_handler(next_step): 13 | 14 | if next_step and isinstance(next_step, Hub): 15 | # WARNING! recursion here 16 | return init_next_hub_step 17 | 18 | if next_step.factory: 19 | return init_next_factory_step 20 | 21 | if next_step.as_worker: 22 | return init_next_worker_step 23 | else: 24 | return init_next_step 25 | 26 | 27 | def init_next_hub_step(data, hub_step, batch_data=False): 28 | """ 29 | WARNING: data copping happens here 30 | """ 31 | if batch_data: 32 | raise RuntimeError("It's not possible to use batch in hub step") 33 | 34 | if isinstance(data, list): 35 | if len(data) != len(hub_step.steps): 36 | raise RuntimeError("Amount of data not equal to amount of steps") 37 | data_list = data 38 | else: 39 | data_list = [data for _ in hub_step.steps] 40 | 41 | hub_step.update_meta() 42 | 43 | hub_result = None 44 | 45 | for i, next_step_item in enumerate(hub_step.steps): 46 | # WARNING! 
recursion here 47 | data = data_list[i] 48 | next_step_handler = choose_next_step_handler(next_step_item) 49 | result = next_step_handler(copy.deepcopy(data), next_step_item) 50 | 51 | if hub_result is None: 52 | hub_result = result 53 | else: 54 | hub_result.update(result) 55 | 56 | return hub_result 57 | 58 | 59 | def init_next_reducer_step(data, next_step, batch_data=False): 60 | from .reducer_step import ReducerStep 61 | 62 | if batch_data: 63 | raise RuntimeError("It's not possible to use batch in reducer step") 64 | 65 | if isinstance(next_step, ReducerStep): 66 | next_step.add_job(data) 67 | return None 68 | 69 | 70 | def init_next_worker_step(data, next_step, batch_data=False, **kwargs): 71 | if batch_data: 72 | return next_step.add_jobs(jobs_data=data) 73 | 74 | return next_step.add_job(data=data, **kwargs) 75 | 76 | 77 | def init_next_factory_step(data, next_step, batch_data=False): 78 | if batch_data: 79 | raise RuntimeError("It's not possible to use batch in factory step") 80 | 81 | next_step.factory.add_data_iter(data) 82 | 83 | return None 84 | 85 | 86 | def init_next_step(data, next_step, batch_data=False): 87 | return next_step(**data) 88 | 89 | -------------------------------------------------------------------------------- /stepist/flow/steps/reducer_step.py: -------------------------------------------------------------------------------- 1 | from stepist.flow.steps import Step 2 | 3 | 4 | class ReducerStep(Step): 5 | 6 | def __init__(self, app, handler): 7 | self.app = app 8 | self.handler = handler 9 | 10 | super(ReducerStep, self).__init__(app, 11 | handler, 12 | None, 13 | as_worker=True, 14 | wait_result=False) 15 | 16 | @property 17 | def __name__(self): 18 | return self.handler.__name__ 19 | 20 | def add_job(self, data, **kwargs): 21 | self.app.reducer_engine.add_job(self, data, **kwargs) 22 | 23 | def step_key(self): 24 | return "reducer_step::%s" % self.__name__ 25 | -------------------------------------------------------------------------------- /stepist/flow/steps/step.py: -------------------------------------------------------------------------------- 1 | import types 2 | from collections import Mapping, Generator 3 | 4 | from stepist.flow import utils, session 5 | 6 | from .next_step import call_next_step 7 | 8 | 9 | class StepData(object): 10 | 11 | flow_data = None 12 | meta_data = None 13 | 14 | def __init__(self, flow_data, meta_data=None): 15 | self.flow_data = flow_data 16 | self.meta_data = meta_data 17 | 18 | def get_dict(self): 19 | return { 20 | 'flow_data': self.flow_data, 21 | 'meta_data': self.meta_data 22 | } 23 | 24 | 25 | class FlowResult(utils.AttrDict): 26 | pass 27 | 28 | 29 | class Step(object): 30 | """ 31 | Step object. 
32 | """ 33 | 34 | # handler function which handle data 35 | handler = None 36 | 37 | # next step object which getting current handler result 38 | next_step = None 39 | 40 | # True, if we need to run current handler in distribute way (using queues) 41 | as_worker = None 42 | 43 | # True, if we need to wait result from current handler 44 | # (used in previous step) 45 | wait_result = None 46 | 47 | # Factor object for iterator handling 48 | factory = None 49 | 50 | def __init__(self, app, handler, next_step, as_worker, wait_result, 51 | unique_id=None, save_result=False, name=None): 52 | self.app = app 53 | self.handler = handler 54 | self.next_step = next_step 55 | self.as_worker = as_worker 56 | self.wait_result = wait_result 57 | 58 | self.unique_id = unique_id 59 | self.name = name 60 | 61 | if not self.name: 62 | if isinstance(self.handler.__name__, str): 63 | self.name = self.handler.__name__ 64 | else: 65 | self.name = self.handler.__name__() 66 | 67 | if not self.unique_id: 68 | self.unique_id = self.name 69 | 70 | self.save_result = save_result 71 | 72 | self.factory = None 73 | self.register_step() 74 | 75 | @property 76 | def __name__(self): 77 | return self.unique_id or self.name 78 | 79 | def __call__(self, **kwargs): 80 | """ 81 | """ 82 | try: 83 | result_data = self.execute_step(**kwargs) 84 | except utils.StopFlowFlag: 85 | return None 86 | 87 | if self.is_last_step(): 88 | return FlowResult({self.name: result_data}) 89 | 90 | if isinstance(result_data, types.GeneratorType): 91 | 92 | for row_data in result_data: 93 | try: 94 | call_next_step(row_data, next_step=self.next_step) 95 | except utils.StopFlowFlag: 96 | continue 97 | 98 | return None 99 | 100 | flow_result = call_next_step(result_data, 101 | next_step=self.next_step) 102 | if self.save_result: 103 | flow_result[self.name] = result_data 104 | 105 | return flow_result 106 | 107 | def register_step(self): 108 | self.app.register_step(self) 109 | 110 | def execute_step(self, **data): 111 | """ 112 | :param data: next step data 113 | :param last_step: Step object or step_key value 114 | :return: Flow result 115 | """ 116 | 117 | # if 'self_step' in data: 118 | # raise RuntimeError("You can't use 'self_step' var in data") 119 | handler_data = utils.validate_handler_data(self.handler, data) 120 | result_data = self.handler(**handler_data) 121 | session.set_flow_data(result_data) 122 | 123 | return result_data 124 | 125 | def add_job(self, data, skip_booster=False, **kwargs): 126 | step_data = StepData(flow_data=data, 127 | meta_data=session.get_meta_data()) 128 | 129 | result = self.app.add_job(step=self, 130 | data=step_data, 131 | skip_booster=skip_booster, 132 | **kwargs) 133 | return result 134 | 135 | def add_jobs(self, jobs_data, **kwargs): 136 | engine_jobs = [] 137 | for data in jobs_data: 138 | step_data = StepData(flow_data=data, 139 | meta_data=session.get_meta_data()) 140 | engine_jobs.append(step_data) 141 | 142 | result = self.app.add_jobs(step=self, 143 | jobs_data=engine_jobs, 144 | **kwargs) 145 | return result 146 | 147 | def receive_job(self, **data): 148 | if "flow_data" not in data: 149 | raise RuntimeError("flow_data not found in job payload") 150 | 151 | with session.change_flow_ctx(data.get('meta_data', {}), data['flow_data']): 152 | return self(**session.get_flow_data()) 153 | 154 | def is_empty(self) -> bool: 155 | return self.jobs_count() == 0 156 | 157 | def jobs_count(self) -> int: 158 | return self.app.worker_engine.jobs_count(*[self]) 159 | 160 | def set_factory(self, factory): 161 | 
self.factory = factory 162 | 163 | def flush_all(self): 164 | self.app.worker_engine.flush_queue(step=self) 165 | 166 | def is_last_step(self): 167 | if self.next_step is None: 168 | return True 169 | 170 | return False 171 | 172 | def step_key(self): 173 | return self.unique_id 174 | 175 | def get_queue_name(self): 176 | return self.app.worker_engine.get_queue_name(self) 177 | -------------------------------------------------------------------------------- /stepist/flow/utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | __all__ = ['StopFlowFlag'] 4 | 5 | handler_args = dict() 6 | 7 | 8 | class StopFlowFlag(Exception): 9 | 10 | def __init__(self, reason=''): 11 | self.reason = reason 12 | super(StopFlowFlag, self).__init__() 13 | 14 | 15 | class AttrDict(dict): 16 | 17 | def __getattr__(self, name): 18 | if name in self: 19 | return self[name] 20 | raise AttributeError('%s not found' % name) 21 | 22 | def __setattr__(self, name, value): 23 | self[name] = value 24 | 25 | @property 26 | def __members__(self): 27 | return self.keys() 28 | 29 | 30 | def validate_handler_data(handler, data): 31 | global handler_args 32 | 33 | if handler not in handler_args: 34 | spec = inspect.getfullargspec(handler) 35 | handler_args[handler] = spec 36 | else: 37 | spec = handler_args[handler] 38 | 39 | args = spec.args 40 | if spec.varkw: 41 | return data 42 | 43 | handler_data = {k:v for k,v in data.items() if k in args} 44 | 45 | return handler_data 46 | -------------------------------------------------------------------------------- /stepist/flow/workers/__init__.py: -------------------------------------------------------------------------------- 1 | from .adapters import simple_queue 2 | from . import utils 3 | 4 | 5 | def process(app, *steps, **kwargs): 6 | steps = utils.validate_steps(steps) 7 | app.worker_engine.process(*steps, **kwargs) 8 | 9 | 10 | def simple_multiprocessing(app, workers_count, steps, *args, **kwargs): 11 | from multiprocessing import Process 12 | 13 | process_list = [] 14 | for i in range(workers_count): 15 | p = Process(target=process, args=[app, *steps], kwargs=kwargs) 16 | p.start() 17 | process_list.append(p) 18 | 19 | return process_list 20 | 21 | 22 | -------------------------------------------------------------------------------- /stepist/flow/workers/adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/workers/adapters/__init__.py -------------------------------------------------------------------------------- /stepist/flow/workers/adapters/rm_queue.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pika 3 | import ujson 4 | import random 5 | 6 | from stepist.flow.workers.worker_engine import BaseWorkerEngine 7 | 8 | 9 | class RQAdapter(BaseWorkerEngine): 10 | def __init__(self, pika_params=None, data_pickler=ujson, jobs_limit=None, 11 | jobs_limit_wait_timeout=10): 12 | 13 | if not pika_params: 14 | self.params = pika.ConnectionParameters( 15 | host='localhost', 16 | port=5672, 17 | ) 18 | else: 19 | self.params = pika_params 20 | 21 | self.data_pickler = data_pickler 22 | self.jobs_limit = jobs_limit 23 | self.jobs_limit_wait_timeout = jobs_limit_wait_timeout 24 | 25 | self.pika_connection = pika.BlockingConnection(parameters=self.params) 26 | self.channel_producer = 
self.pika_connection.channel() 27 | self.channel_consumer = self.pika_connection.channel() 28 | self.queues = dict() 29 | 30 | def add_job(self, step, data, **kwargs): 31 | 32 | if self.jobs_limit: 33 | while self.jobs_count(step) >= self.jobs_limit: 34 | print("Jobs limit exceeded, waiting %s seconds" 35 | % self.jobs_limit_wait_timeout) 36 | time.sleep(self.jobs_limit_wait_timeout) 37 | 38 | queue_name = self.get_queue_name(step) 39 | json_data = self.data_pickler.dumps(data.get_dict()) 40 | self.channel_producer.basic_publish( 41 | exchange='', 42 | routing_key=queue_name, 43 | body=json_data) 44 | 45 | def add_jobs(self, step, jobs_data, **kwargs): 46 | 47 | if self.jobs_limit: 48 | while self.jobs_count(step) >= self.jobs_limit: 49 | print("Jobs limit exceeded, waiting %s seconds" 50 | % self.jobs_limit_wait_timeout) 51 | time.sleep(self.jobs_limit_wait_timeout) 52 | 53 | for job in jobs_data: 54 | self.add_job(step, job, **kwargs) 55 | 56 | def receive_job(self, step): 57 | q_name = step.get_queue_name() 58 | result = self.channel_consumer.basic_get(queue=q_name) 59 | 60 | if result and result[0] and result[2]: 61 | self.channel_consumer.basic_ack(delivery_tag=result[0].delivery_tag) 62 | return self.data_pickler.loads(result[2]) 63 | else: 64 | return None 65 | 66 | def process(self, *steps, die_when_empty=False, die_on_error=True): 67 | # Pika is not thread safe we need to create new connection per thread 68 | channel = self.channel_consumer 69 | receivers = [StepReceiver(step, channel, self.data_pickler) 70 | for step in steps] 71 | 72 | empty_count = 0 73 | 74 | while True: 75 | random.shuffle(receivers) 76 | 77 | r = receivers[0] 78 | q = r.step.get_queue_name() 79 | result = channel.basic_get(queue=q) 80 | 81 | if result and result[0] and result[2]: 82 | r(*result) 83 | empty_count = 0 84 | else: 85 | empty_count += 1 86 | if empty_count > len(receivers) * 3 and die_when_empty: 87 | exit() 88 | 89 | def flush_queue(self, step): 90 | queue_name = self.get_queue_name(step) 91 | self.channel_producer.queue_delete(queue=queue_name) 92 | self.channel_producer.queue_declare(queue=queue_name, 93 | auto_delete=False, 94 | durable=True) 95 | 96 | def jobs_count(self, *steps): 97 | sum_by_steps = 0 98 | 99 | for step in steps: 100 | queue_name = self.get_queue_name(step) 101 | sum_by_steps += self.queues[queue_name].method.message_count 102 | 103 | return sum_by_steps 104 | 105 | def register_worker(self, step): 106 | queue_name = self.get_queue_name(step) 107 | q = self.channel_producer.queue_declare(queue=queue_name, 108 | auto_delete=False, 109 | durable=True) 110 | 111 | self.queues[queue_name] = q 112 | 113 | def monitor_steps(self, step_keys, monitoring_for_sec): 114 | pass 115 | 116 | def get_queue_name(self, step): 117 | return step.step_key() 118 | 119 | 120 | class StepReceiver: 121 | def __init__(self, step, channel, data_pickler): 122 | self.step = step 123 | self.channel = channel 124 | self.data_pickler = data_pickler 125 | 126 | def __call__(self, method, properties, body): 127 | self.step.receive_job(**self.data_pickler.loads(body)) 128 | self.channel.basic_ack(delivery_tag=method.delivery_tag) 129 | -------------------------------------------------------------------------------- /stepist/flow/workers/adapters/simple_queue.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | import time 3 | 4 | from stepist.flow.libs.simple_queue import SimpleQueue 5 | 6 | from stepist.flow.workers.worker_engine import BaseWorkerEngine 
7 | from stepist.flow.workers.adapters import utils 8 | 9 | 10 | class SimpleQueueAdapter(BaseWorkerEngine): 11 | def __init__(self, redis_connection, data_pickler=ujson, verbose=True, 12 | jobs_limit=None, jobs_limit_wait_timeout=10): 13 | 14 | self.redis_connection = redis_connection 15 | self.jobs_limit = jobs_limit 16 | self.jobs_limit_wait_timeout = jobs_limit_wait_timeout 17 | self.verbose = verbose 18 | self.queue = SimpleQueue(data_pickler, 19 | self.redis_connection) 20 | 21 | def add_job(self, step, data, **kwargs): 22 | q_name = self.get_queue_name(step) 23 | 24 | if self.jobs_limit: 25 | while self.jobs_count(step) >= self.jobs_limit: 26 | print("Jobs limit exceeded, waiting %s seconds" 27 | % self.jobs_limit_wait_timeout) 28 | time.sleep(self.jobs_limit_wait_timeout) 29 | 30 | self.queue.add_job(q_name, data.get_dict()) 31 | 32 | def add_jobs(self, step, jobs_data, **kwargs): 33 | 34 | if self.jobs_limit: 35 | while self.jobs_count(step) >= self.jobs_limit: 36 | print("Jobs limit exceeded, waiting %s seconds" 37 | % self.jobs_limit_wait_timeout) 38 | time.sleep(self.jobs_limit_wait_timeout) 39 | 40 | jobs_data_dict = [data.get_dict() for data in jobs_data] 41 | self.queue.add_jobs(self.get_queue_name(step), jobs_data_dict) 42 | 43 | def receive_job(self, step, wait_timeout=3): 44 | key, data = self.queue.reserve_jobs([self.get_queue_name(step)], 45 | wait_timeout=wait_timeout) 46 | return data 47 | 48 | def process(self, *steps, die_when_empty=False, die_on_error=True): 49 | self.queue.process({self.get_queue_name(step): step for step in steps}, 50 | die_when_empty=die_when_empty, 51 | die_on_error=die_on_error, 52 | verbose=self.verbose) 53 | 54 | def flush_queue(self, step): 55 | queue_name = self.get_queue_name(step) 56 | self.queue.flush_jobs(queue_name) 57 | 58 | def jobs_count(self, *steps): 59 | sum_by_steps = 0 60 | for step in steps: 61 | q_key = step.get_queue_name() 62 | sum_by_steps += self.queue.redis_db.llen(q_key) 63 | 64 | return sum_by_steps 65 | 66 | def register_worker(self, handler): 67 | pass 68 | 69 | def monitor_steps(self, steps, monitoring_for_sec): 70 | push = dict() 71 | pop = dict() 72 | 73 | pool = self.redis_connection.connection_pool 74 | monitor = utils.RedisMonitor(pool) 75 | commands = monitor.monitor(monitoring_for_sec) 76 | 77 | for command in commands: 78 | command = command.lower() 79 | 80 | for step in steps: 81 | key = step.get_queue_name() 82 | step_key = step.step_key() 83 | if key in command and 'lpush' in command: 84 | push[step_key] = push.get(step_key, 0) + 1 85 | if key in command and 'lpop' in command: 86 | pop[step_key] = pop.get(step_key, 0) + 1 87 | 88 | return push, pop 89 | 90 | @staticmethod 91 | def get_queue_name(step): 92 | return "stepist::%s" % step.step_key() 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /stepist/flow/workers/adapters/sqs_queue.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import ujson 3 | import time 4 | import multiprocessing 5 | 6 | from stepist.flow.workers.worker_engine import BaseWorkerEngine 7 | 8 | 9 | class SQSAdapter(BaseWorkerEngine): 10 | def __init__(self, session=boto3, visibility_timeout=None, 11 | message_retention_period=None, wait_seconds=5, 12 | data_pickler=ujson): 13 | 14 | self.data_pickler = data_pickler 15 | 16 | self.session = session 17 | self.sqs_client = session.client('sqs') 18 | self.sqs_resource = session.resource('sqs') 19 | 20 | 
self.message_retention_period = message_retention_period 21 | self.visibility_timeout = visibility_timeout 22 | self.wait_seconds = wait_seconds 23 | 24 | self._queues = dict() 25 | self._steps = dict() 26 | 27 | def add_job(self, step, data, **kwargs): 28 | queue_name = self.get_queue_name(step) 29 | 30 | queue = self._queues.get(queue_name, None) 31 | if not queue: 32 | raise RuntimeError("Queue %s not found" % queue_name) 33 | 34 | kwargs = { 35 | 'MessageBody': self.data_pickler.dumps(data.get_dict()), 36 | 'MessageAttributes': {}, 37 | 'DelaySeconds': 0 38 | } 39 | 40 | queue.send_message(**kwargs) 41 | 42 | def add_jobs(self, step, jobs_data, **kwargs): 43 | for job_data in jobs_data: 44 | self.add_job(step, job_data, **kwargs) 45 | 46 | def receive_job(self, step, wait_seconds=5): 47 | q_name = self.get_queue_name(step) 48 | queue = self.session.resource('sqs').get_queue_by_name( 49 | QueueName=q_name) 50 | 51 | kwargs = { 52 | 'WaitTimeSeconds': wait_seconds, 53 | 'MaxNumberOfMessages': 1, 54 | 'MessageAttributeNames': ['All'], 55 | 'AttributeNames': ['All'], 56 | } 57 | messages = queue.receive_messages(**kwargs) 58 | if not messages: 59 | return None 60 | 61 | if len(messages) != 1: 62 | raise RuntimeError("Got more than 1 job for some reason") 63 | 64 | msg = messages[0] 65 | 66 | msg_result = { 67 | 'Id': msg.message_id, 68 | 'ReceiptHandle': msg.receipt_handle 69 | } 70 | queue.delete_messages(Entries=[msg_result]) 71 | 72 | return self.data_pickler.loads(msg.body) 73 | 74 | def process(self, *steps, die_when_empty=False, die_on_error=True): 75 | queues = [] 76 | for step in steps: 77 | queues.append(self.get_queue_name(step)) 78 | 79 | if not queues: 80 | return 81 | 82 | mng = multiprocessing.Manager() 83 | empty_queues = mng.dict({q: False for q in queues}) 84 | 85 | processes = [] 86 | for queue_name in queues: 87 | p = multiprocessing.Process( 88 | target=self.process_queue, 89 | kwargs={ 90 | 'queue_name': queue_name, 91 | 'die_on_error': die_on_error, 92 | 'empty_queues': empty_queues, 93 | 'die_when_empty': die_when_empty, 94 | }, 95 | ) 96 | p.start() 97 | processes.append(p) 98 | 99 | for p in processes: 100 | p.join() 101 | p.terminate() 102 | 103 | def process_queue(self, queue_name, die_on_error, empty_queues, 104 | die_when_empty): 105 | try: 106 | queue = self.session.resource('sqs').get_queue_by_name(QueueName=queue_name) 107 | except Exception: 108 | empty_queues[queue_name] = True 109 | raise 110 | 111 | if not queue_name or not queue: 112 | empty_queues[queue_name] = True 113 | return 114 | 115 | while True: 116 | kwargs = { 117 | 'WaitTimeSeconds': self.wait_seconds, 118 | 'MaxNumberOfMessages': 10, 119 | 'MessageAttributeNames': ['All'], 120 | 'AttributeNames': ['All'], 121 | } 122 | messages = queue.receive_messages(**kwargs) 123 | 124 | if not messages: 125 | empty_queues[queue_name] = True 126 | if all(list(empty_queues.values())) and die_when_empty: 127 | exit() 128 | 129 | time.sleep(self.wait_seconds) 130 | continue 131 | 132 | empty_queues[queue_name] = False 133 | 134 | msg_results = [] 135 | for msg in messages: 136 | data = self.data_pickler.loads(msg.body) 137 | try: 138 | self._steps[queue_name].receive_job(**data) 139 | except Exception: 140 | empty_queues[queue_name] = True 141 | if die_on_error: 142 | raise 143 | 144 | msg_results.append({ 145 | 'Id': msg.message_id, 146 | 'ReceiptHandle': msg.receipt_handle 147 | }) 148 | 149 | if msg_results: 150 | queue.delete_messages(Entries=msg_results) 151 | 152 | def flush_queue(self, step): 153 | 
raise NotImplemented("Not implemented yet. Delete queue using " 154 | "SQS dashboard") 155 | 156 | def jobs_count(self, *steps): 157 | jobs = 0 158 | 159 | for step in steps: 160 | queue_name = self.get_queue_name(step) 161 | sqs_q = self.sqs_client.get_queue_url(QueueName=queue_name) 162 | attrs = self.sqs_client.get_queue_attributes( 163 | sqs_q, ['ApproximateNumberOfMessages']) 164 | jobs += attrs.get("ApproximateNumberOfMessages", 0) 165 | 166 | return jobs 167 | 168 | def register_worker(self, step): 169 | queue_name = self.get_queue_name(step) 170 | 171 | attrs = {} 172 | kwargs = { 173 | 'QueueName': queue_name, 174 | 'Attributes': attrs, 175 | } 176 | if self.message_retention_period is not None: 177 | attrs['MessageRetentionPeriod'] = str(self.message_retention_period) 178 | if self.visibility_timeout is not None: 179 | attrs['VisibilityTimeout'] = str(self.visibility_timeout) 180 | 181 | self.sqs_client.create_queue(**kwargs) 182 | 183 | queue = self.sqs_resource.get_queue_by_name(QueueName=queue_name) 184 | 185 | self._queues[queue_name] = queue 186 | self._steps[queue_name] = step 187 | 188 | def monitor_steps(self, step_keys, monitoring_for_sec): 189 | pass 190 | 191 | def get_queue_name(self, step): 192 | return step.step_key().replace(":", "-") 193 | 194 | 195 | def _move_first_to_the_end(a): 196 | return a[1:] + [a[0]] 197 | 198 | 199 | class DieWhenEmpty: 200 | def __init__(self, active, queues): 201 | self.active = active 202 | self.queues = queues 203 | 204 | self.queus_no_jobs = set() 205 | 206 | def update_status(self, queue_name, no_job): 207 | if no_job: 208 | self.queus_no_jobs.add(queue_name) 209 | elif queue_name in self.queus_no_jobs: 210 | self.queus_no_jobs.remove(queue_name) 211 | 212 | def __bool__(self): 213 | return len(self.queus_no_jobs) >= len(self.queues) 214 | -------------------------------------------------------------------------------- /stepist/flow/workers/adapters/utils.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | 4 | class Timeout(): 5 | """Timeout class using ALARM signal.""" 6 | class Timeout(Exception): 7 | pass 8 | 9 | def __init__(self, sec): 10 | self.sec = sec 11 | 12 | def __enter__(self): 13 | signal.signal(signal.SIGALRM, self.raise_timeout) 14 | signal.alarm(self.sec) 15 | 16 | def __exit__(self, *args): 17 | signal.alarm(0) # disable alarm 18 | 19 | def raise_timeout(self, *args): 20 | raise Timeout.Timeout() 21 | 22 | 23 | 24 | class RedisMonitor(): 25 | def __init__(self, connection_pool): 26 | self.connection_pool = connection_pool 27 | self.connection = None 28 | 29 | def __del__(self): 30 | try: 31 | self.reset() 32 | except: 33 | pass 34 | 35 | def reset(self): 36 | if self.connection: 37 | self.connection_pool.release(self.connection) 38 | self.connection = None 39 | 40 | def monitor(self, sec): 41 | if self.connection is None: 42 | self.connection = self.connection_pool.get_connection( 43 | 'monitor', None) 44 | self.connection.send_command("monitor") 45 | return self.listen(sec) 46 | 47 | def parse_response(self): 48 | return self.connection.read_response() 49 | 50 | def listen(self, sec): 51 | try: 52 | with Timeout(sec): 53 | while True: 54 | yield self.parse_response() 55 | except Timeout.Timeout: 56 | return 57 | -------------------------------------------------------------------------------- /stepist/flow/workers/boost/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/workers/boost/__init__.py -------------------------------------------------------------------------------- /stepist/flow/workers/boost/shared_memory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/workers/boost/shared_memory.py -------------------------------------------------------------------------------- /stepist/flow/workers/boost/sockets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add APP NAME FOR WORKERS (!!!) 3 | 4 | """ 5 | 6 | import random 7 | import socket 8 | import redis 9 | import selectors 10 | import asyncio 11 | 12 | from threading import Thread 13 | 14 | from stepist.flow.steps.step import StepData 15 | 16 | 17 | DATA_HEADER = b'/stairs_line_separation/\n' 18 | 19 | 20 | class SocketData: 21 | def __init__(self, step, step_data): 22 | self.step = step 23 | self.step_data = step_data 24 | 25 | def to_json(self): 26 | return dict( 27 | step_key=self.step.step_key(), 28 | step_data=self.step_data.get_dict(), 29 | ) 30 | 31 | @classmethod 32 | def from_json(cls, app, json_data): 33 | return cls( 34 | step=app.steps.get(json_data['step_key']), 35 | step_data=StepData(**json_data['step_data']) 36 | ) 37 | 38 | 39 | class SocketBooster: 40 | 41 | def __init__(self, app, socket_address='/tmp/stairs', use_ipc=True, 42 | socket_port_range=(49152, 65536), buffer_size=1000): 43 | 44 | self.app = app 45 | 46 | self.buffer_size = buffer_size 47 | self.use_ipc = use_ipc 48 | 49 | self.sender = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 50 | self.socket_address = self.gen_address(socket_address, socket_port_range) 51 | self.sender.bind(self.socket_address) 52 | self.sender.listen() 53 | 54 | self.socket_redis = redis.Redis(**self.app.config.redis_kwargs) 55 | self._register_client_in_redis() 56 | 57 | self.connections = SocketConnections(self.sender, 58 | self.socket_redis, 59 | self.socket_address) 60 | 61 | self.receiver_event_loop = SocketWorkersEventLoop(self, 62 | self.connections, 63 | self.buffer_size) 64 | 65 | self.sender_event_loop = SocketProducerEventLoop(self, 66 | self.connections, 67 | self.buffer_size) 68 | self.cnt_job_sent = 0 69 | 70 | self.socket_connections_updater = \ 71 | Thread(target=self.connections.connections_updater_loop, 72 | args=(self.sender_event_loop, self.receiver_event_loop)) 73 | self.socket_connections_updater.start() 74 | 75 | def gen_address(self, socket_address, socket_port_range): 76 | if self.use_ipc: 77 | return "%s%s" % (socket_address, random.randint(*socket_port_range)) 78 | else: 79 | return "%s:%s" % (socket_address, random.randint(*socket_port_range)) 80 | 81 | def _register_client_in_redis(self): 82 | key = "socket_client:%s" % self.socket_address 83 | self.socket_redis.client_setname(key) 84 | 85 | def send_job(self, step, data, **kwargs): 86 | self.sender_event_loop.send_job(step, data) 87 | 88 | def process(self, steps, die_on_error=True, die_when_empty=False): 89 | self.receiver_event_loop.run_event_loop() 90 | 91 | def handle_job(self, data_bytes): 92 | row_data = self.app.data_pickler.loads(data_bytes) 93 | 94 | s_data = SocketData.from_json(self.app, row_data) 95 | 96 | step = s_data.step 97 | step_data = s_data.step_data 98 | 99 | if step is not None: 100 | self.forward_to_queue(step, step_data) 101 | return 102 | 103 | 
# if step.step_key() not in available_handlers: 104 | # self.forward_to_queue(step, step_data) 105 | 106 | s_data.step.receive_job(**s_data.step_data.get_dict()) 107 | 108 | def forward_to_queue(self, step, step_data): 109 | self.app.add_job(step, step_data, skip_booster=True) 110 | 111 | 112 | class SocketConnections: 113 | """ 114 | Trying to update information about whole connections which we have 115 | in redis. 116 | 117 | """ 118 | def __init__(self, socket_host, socket_redis, current_address): 119 | self.socket_host = socket_host 120 | self.socket_redis = socket_redis 121 | 122 | self.ignore_addresses = [current_address] 123 | 124 | self.current_workers_sockets = [] 125 | 126 | self.current_producers = [] 127 | self.current_producers_sockets = [] 128 | self.producer_selector = selectors.DefaultSelector() 129 | 130 | def get_producers(self): 131 | workers_addresses = [] 132 | 133 | clients_list = self.socket_redis.client_list() 134 | 135 | for client in clients_list: 136 | client_name = client['name'] 137 | 138 | if 'socket_client:' not in client_name: 139 | continue 140 | 141 | address = client_name.split("socket_client:")[1] 142 | 143 | if address not in self.ignore_addresses: 144 | workers_addresses.append(address) 145 | 146 | return workers_addresses 147 | 148 | def connections_updater_loop(self, socket_producer, socker_receiver): 149 | while True: 150 | for producer_address in self.get_producers(): 151 | if producer_address in self.current_producers: 152 | continue 153 | 154 | producer_socket = socket.socket(socket.AF_UNIX, 155 | socket.SOCK_STREAM) 156 | producer_socket.connect(producer_address) 157 | producer_socket.setblocking(False) 158 | socker_receiver.add_new_producer(producer_socket) 159 | self.current_producers.append(producer_address) 160 | 161 | conn, addr = self.socket_host.accept() 162 | #conn.setblocking(1) 163 | conn.setblocking(False) 164 | self.current_workers_sockets.append(conn) 165 | socket_producer.add_new_worker(conn) 166 | 167 | 168 | class SocketProducerEventLoop: 169 | 170 | def __init__(self, booster, socket_workers, buffer_size): 171 | self.booster = booster 172 | self.loop = asyncio.new_event_loop() 173 | self.socket_workers = socket_workers 174 | 175 | self.socket_buffer_size = [] 176 | self.buffer_size = buffer_size 177 | 178 | self.workers_selector = selectors.DefaultSelector() 179 | self.sockets = [] 180 | 181 | def add_new_worker(self, conn): 182 | self.socket_buffer_size.append(0) 183 | self.sockets.append(conn) 184 | self.workers_selector.register(conn, 185 | selectors.EVENT_READ, 186 | data=len(self.socket_buffer_size)-1) 187 | 188 | def send_job(self, step, step_data): 189 | 190 | if not self.socket_buffer_size: 191 | self.booster.forward_to_queue(step, step_data) 192 | return 193 | 194 | socket_data = SocketData(step=step, step_data=step_data) 195 | data_encoded = self.booster.app.data_pickler.dumps(socket_data.to_json()) 196 | data_encoded = data_encoded.encode("utf-8") 197 | 198 | for i in range(len(self.socket_buffer_size)): 199 | if self.socket_buffer_size[i] > 0: 200 | try: 201 | self.sockets[i].send(data_encoded + DATA_HEADER) 202 | 203 | except socket.timeout: 204 | print("Timeout error for one of the worker,") 205 | print("It will be removed from workers list") 206 | # Disable socket buffer 207 | self.socket_buffer_size[i] = -1 208 | # continue send_job logic 209 | self.booster.forward_to_queue(step, step_data) 210 | return 211 | 212 | except BlockingIOError: 213 | self.booster.forward_to_queue(step, step_data) 214 | return 215 | 216 
| except BrokenPipeError: 217 | print("BrokenPipeError for one of the worker") 218 | print("It will be removed from workers list") 219 | self.socket_buffer_size[i] = -1 220 | self.booster.forward_to_queue(step, step_data) 221 | return 222 | 223 | self.socket_buffer_size[i] = self.socket_buffer_size[i] - 1 224 | return 225 | 226 | self.booster.forward_to_queue(step, step_data) 227 | 228 | events = self.workers_selector.select(timeout=5) 229 | 230 | for key, mask in events: 231 | index = key.data 232 | sock = key.fileobj 233 | 234 | try: 235 | d = sock.recv(1024) 236 | except socket.timeout as e: 237 | continue 238 | 239 | if d: 240 | self.socket_buffer_size[index] = self.buffer_size 241 | 242 | 243 | class SocketWorkersEventLoop: 244 | 245 | def __init__(self, booster, socket_workers, buffer_size): 246 | self.booster = booster 247 | self.socket_workers = socket_workers 248 | 249 | self.socket_buffer_size = [] 250 | self.buffer_size = buffer_size 251 | self.sockets = [] 252 | 253 | self.threads_events = dict() 254 | 255 | self.producers_selector = selectors.DefaultSelector() 256 | 257 | def add_new_producer(self, producer_socket): 258 | self.sockets.append(producer_socket) 259 | self.socket_buffer_size.append(0) 260 | self.producers_selector.register(producer_socket, 261 | selectors.EVENT_READ, 262 | data=len(self.sockets)-1) 263 | 264 | def run_event_loop(self): 265 | while True: 266 | for i in range(len(self.sockets)): 267 | if self.socket_buffer_size[i] == 0: 268 | try: 269 | self.sockets[i].send(b'ready_to_consume') 270 | except (socket.timeout, BlockingIOError): 271 | continue 272 | self.socket_buffer_size[i] = self.buffer_size 273 | 274 | events = self.producers_selector.select(timeout=5) 275 | 276 | for key, mask in events: 277 | index = key.data 278 | sock = key.fileobj 279 | 280 | data = b'' 281 | try: 282 | while True: 283 | try: 284 | sock_data = sock.recv(256 * 1024 * 1024) 285 | except BlockingIOError: 286 | break 287 | 288 | data += sock_data 289 | if not sock_data: 290 | break 291 | 292 | except socket.timeout: 293 | continue 294 | 295 | if data: 296 | rows = data.split(DATA_HEADER) 297 | self.socket_buffer_size[index] -= (len(rows) - 1) 298 | -------------------------------------------------------------------------------- /stepist/flow/workers/reducer_engine.py: -------------------------------------------------------------------------------- 1 | import time 2 | from stepist.flow import session 3 | 4 | 5 | class BaseReducerEngine(object): 6 | 7 | def add_job(self, reducer_step, data): 8 | raise NotImplemented() 9 | 10 | 11 | class RedisReducerEngine(BaseReducerEngine): 12 | 13 | def __init__(self, app, redis_db, reducer_job_lifetime, 14 | reducer_no_job_sleep_time): 15 | self.app = app 16 | self.redis_db = redis_db 17 | 18 | self.reducer_job_lifetime = reducer_job_lifetime 19 | self.reducer_no_job_sleep_time = reducer_no_job_sleep_time 20 | 21 | def add_job(self, reducer_step, data, **kwargs): 22 | hub_job_id = session.get_meta_data().get("hub_job_id", None) 23 | if hub_job_id is None: 24 | raise RuntimeError("job id not found. 
Do you have 'HUB' step before" 25 | "reducer") 26 | 27 | current_amount = self.redis_db.zincrby( 28 | "%s:%s" % ("count", reducer_step.step_key()), 29 | 1, 30 | hub_job_id, 31 | 32 | ) 33 | 34 | pipe = self.redis_db.pipeline() 35 | pipe.hset( 36 | "%s:%s" % (reducer_step.step_key(), hub_job_id), 37 | current_amount, 38 | self.app.config.pickler.dumps(data), 39 | ) 40 | pipe.expire( 41 | "%s:%s" % (reducer_step.step_key(), hub_job_id), 42 | self.reducer_job_lifetime 43 | ) 44 | pipe.execute() 45 | 46 | def process(self, reducer_step): 47 | while True: 48 | max_value = self.redis_db.zpopmax( 49 | "%s:%s" % ("count", reducer_step.step_key()), 50 | ) 51 | if not max_value: 52 | time.sleep(self.reducer_no_job_sleep_time) 53 | continue 54 | 55 | hub_job_id, count = max_value[0] 56 | hub_job_id = hub_job_id.decode("utf-8") 57 | 58 | key_count = hub_job_id.split(":")[1] 59 | 60 | if int(count) < int(key_count): 61 | self.redis_db.zincrby( 62 | "%s:%s" % ("count", reducer_step.step_key()), 63 | hub_job_id, 64 | int(count) 65 | ) 66 | time.sleep(self.reducer_no_job_sleep_time) 67 | continue 68 | 69 | data = self.redis_db.hgetall( 70 | "%s:%s" % (reducer_step.step_key(), hub_job_id), 71 | ) 72 | if data: 73 | values = [] 74 | for v in data.values(): 75 | v = self.app.config.pickler.loads(v.decode('utf-8')) 76 | values.append(v) 77 | 78 | reducer_step(job_list=values) 79 | self.redis_db.delete("%s:%s" % (reducer_step.step_key(), 80 | hub_job_id)) 81 | -------------------------------------------------------------------------------- /stepist/flow/workers/utils.py: -------------------------------------------------------------------------------- 1 | from stepist.flow import session 2 | 3 | 4 | def validate_steps(steps): 5 | valid_steps = [] 6 | 7 | for step in steps: 8 | if not step.as_worker: 9 | print(step.step_key(), "is not worker") 10 | continue 11 | 12 | if isinstance(step, str): 13 | valid_steps.append(session.get_step_by_key(step)) 14 | else: 15 | valid_steps.append(step) 16 | 17 | return valid_steps 18 | -------------------------------------------------------------------------------- /stepist/flow/workers/worker_engine.py: -------------------------------------------------------------------------------- 1 | 2 | NOT_IMPLEMENTED_DESC = "Not supported yet ..." 3 | 4 | 5 | class BaseWorkerEngine(object): 6 | 7 | def add_job(self, step, data, result_reader, **kwargs): 8 | """ 9 | Add data to queue/streaming service. 10 | """ 11 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 12 | 13 | def add_jobs(self, step, data_iter, result_reader): 14 | """ 15 | Add batch of data to queue/streaming service in one transaction 16 | """ 17 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 18 | 19 | def receive_job(self, step): 20 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 21 | 22 | def jobs_count(self, *steps): 23 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 24 | 25 | def flush_queue(self, step): 26 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 27 | 28 | def process(self, *steps): 29 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 30 | 31 | def register_worker(self, handler): 32 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 33 | 34 | def monitor_steps(self, step_keys, monitoring_for_sec): 35 | raise NotImplemented(NOT_IMPLEMENTED_DESC) 36 | --------------------------------------------------------------------------------