├── .gitignore
├── MANIFEST.in
├── README.markdown
├── __init__.py
├── setup.py
├── static
│   └── examples
│       ├── 1.png
│       ├── 2.png
│       └── 3.png
└── stepist
    ├── __init__.py
    ├── app.py
    ├── app_config.py
    ├── dbs.py
    └── flow
        ├── __init__.py
        ├── libs
        │   ├── __init__.py
        │   └── simple_queue.py
        ├── session.py
        ├── signals.py
        ├── stats
        │   ├── __init__.py
        │   ├── utils.py
        │   └── worker.py
        ├── steps
        │   ├── __init__.py
        │   ├── factory_step.py
        │   ├── hub.py
        │   ├── next_step.py
        │   ├── reducer_step.py
        │   └── step.py
        ├── utils.py
        └── workers
            ├── __init__.py
            ├── adapters
            │   ├── __init__.py
            │   ├── rm_queue.py
            │   ├── simple_queue.py
            │   ├── sqs_queue.py
            │   └── utils.py
            ├── boost
            │   ├── __init__.py
            │   ├── shared_memory.py
            │   └── sockets.py
            ├── reducer_engine.py
            ├── utils.py
            └── worker_engine.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | setup.cfg
3 | *.egg-info
4 | /dist
5 | tests
6 | .idea
7 |
8 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include stepist/tests/ *
2 |
--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
1 | [![PyPI version](https://badge.fury.io/py/stepist.svg)](https://badge.fury.io/py/stepist)
2 |
3 | Stepist. A framework for data processing.
4 | 
5 | 
6 | The main goal of Stepist is to simplify working with data.
7 |
8 |
9 |
10 | **What it's for:**
11 | - Real-time distributed services
12 | - ETL tasks
13 | - Preparing data for AI models
14 |
15 |
16 |
17 |
18 |
19 | **So, what is Stepist?**
20 | It is a tool for creating sequences of functions (called steps) that represent an execution flow.
21 | The result of each step is the input for the next one, so you end up with a graph (data pipeline)
22 | that can handle data using streaming services (Celery, RQ, Redis) or batch-processing tools (Kafka).
23 |
24 |
25 |
26 | **Install**
27 |
28 | ```bash
29 | # set up Redis first: https://redis.io/topics/quickstart
30 | pip install stepist
31 |
32 | ```
33 |
34 |
35 |
36 | ###### Basic definitions:
37 | - **App** - Collects all your objects and holds the full configuration of the system.
38 | - **Step** - The basic building block. Connects multiple functions into a flow.
39 | - **Flow** - A chain of steps that starts from a simple step and ends with a step whose next_step=None.
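
A minimal sketch of how these pieces fit together (the runnable examples below cover the same ground in more detail):

```python
from stepist import App

app = App()

@app.step(None)          # last step: next_step=None ends the flow
def normalize(total):
    return dict(result=total / 2)

@app.step(normalize)     # first step, chained to `normalize`
def add(a, b):
    return dict(total=a + b)

print(add(a=1, b=3))     # runs the whole flow synchronously
```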
40 |
41 |
42 |
43 | ### Examples:
44 |
45 | **Simple step-by-step flow (the result of each step is the input for the next)**
46 |
47 |
48 | ```python
49 | from stepist import App
50 |
51 | app = App()
52 |
53 | @app.step(None)
54 | def step2(a_plus_b, a_minus_b):
55 | return dict(result=a_plus_b *
56 | a_minus_b)
57 |
58 | @app.step(step2)
59 | def step1(a, b):
60 | return dict(a_plus_b=a+b,
61 | a_minus_b=a-b)
62 |
63 | print(step1(a=5, b=5))
64 |
65 | ```
66 |
67 |
68 |
69 | **Simple step-by-step flow with workers**
70 |
71 |
72 | ```python
73 | import sys
74 | import requests
75 |
76 | from stepist import App
77 |
78 | app = App()
79 |
80 | URLS = ['https://www.python.org/',
81 | 'https://wikipedia.org/wiki/Python_(programming_language)']
82 |
83 | @app.step(None)
84 | def step3(text, **kwargs):
85 | print(text.count('python'))
86 |
87 | @app.factory_step(step3, as_worker=True)
88 | def step2(url):
89 | r = requests.get(url)
90 | return dict(url=url,
91 | text=r.text)
92 |
93 | @app.step(step2)
94 | def step1(urls):
95 | for url in urls:
96 | yield dict(url=url)
97 |
98 | if sys.argv[1] == "worker":
99 |     app.run([step2])  # run worker
100 | else:
101 | step1(urls=URLS)
102 |
103 | # Worker process:
104 | # >>> 94
105 | # >>> 264
106 |
107 | ```
108 |
109 |
110 |
111 | **Call multiple steps at once (Map)**
112 |
113 | Define Hub(*next_steps) as the next step.
114 |
115 | ```python
116 | import sys
117 | import requests
118 |
119 | from stepist import Hub
120 | from stepist import App
121 |
122 | app = App()
123 |
124 | URLS = ['https://www.python.org/',
125 | 'https://wikipedia.org/wiki/Python_(programming_language)']
126 |
127 | @app.step(None)
128 | def step3(text, **kwargs):
129 | c = text.count('python')
130 | return c
131 |
132 | @app.factory_step(step3, as_worker=True)
133 | def step2_v2(url):
134 | r = requests.get(url)
135 | return dict(url=url,
136 | text=r.text)
137 |
138 | @app.factory_step(step3, as_worker=True)
139 | def step2(url):
140 | r = requests.get(url)
141 | return dict(url=url,
142 | text=r.text)
143 |
144 | @app.step(Hub(step2, step2_v2))
145 | def step1(urls):
146 | for url in urls:
147 | yield dict(url=url)
148 |
149 | if sys.argv[1] == "worker":
150 | app.run() # run workers
151 | else:
152 | print(step1(urls=URLS))
153 |
154 | # print, from main process
155 | # >>> [94, 264]
156 |
157 | ```
158 |
159 | **Combine data from multiple steps (Reduce)**
160 | 
161 | Define a @app.reducer_step and link it to the pipeline "leaves".
162 |
163 | ```python
164 | import sys
165 | import requests
166 |
167 | from stepist import Hub
168 | from stepist import App
169 |
170 | app = App()
171 |
172 | URLS = ['https://www.python.org/',
173 | 'https://wikipedia.org/wiki/Python_(programming_language)']
174 |
175 | @app.reducer_step()
176 | def step3(job_list):
177 | return dict(c1=job_list[0].count('python'),
178 | c2=job_list[1].count('python'))
179 |
180 | @app.factory_step(step3, as_worker=True)
181 | def step2_v2(url):
182 | r = requests.get(url)
183 | return dict(url=url,
184 | text=r.text)
185 |
186 | @app.factory_step(step3, as_worker=True)
187 | def step2(url):
188 | r = requests.get(url)
189 | return dict(url=url,
190 | text=r.text)
191 |
192 | @app.step(Hub(step2, step2_v2))
193 | def step1(urls):
194 | for url in urls:
195 | yield dict(url=url)
196 |
197 | if sys.argv[1] == "worker":
198 | app.run() # run workers
199 | else:
200 | print(step1(urls=URLS))
201 |
202 | # print, from main process
203 | # >>> [94, 264]
204 |
205 | ```
206 |
207 |
208 | **Celery**
209 |
210 | Stepist is compatible with Celery.
211 |
212 | ```python
213 |
214 | from celery import Celery
215 | from stepist import App
216 | from stepist.flow.workers.adapters.celery_queue import CeleryAdapter
217 |
218 | app = App()
219 |
220 | celery = Celery(broker="redis://localhost:6379/0")
221 | app.worker_engine = CeleryAdapter(app, celery)
222 |
223 |
224 | @app.step(None, as_worker=True)
225 | def step3(result):
226 | return dict(result=result[:2])
227 |
228 | @app.step(step3, as_worker=True)
229 | def step2(hello, world):
230 | return dict(result="%s %s" % (hello, world))
231 |
232 | @app.step(step2)
233 | def step1(hello, world):
234 | return dict(hello=hello.upper(),
235 | world=world.upper())
236 |
237 | if __name__ == "__main__":
238 | print(step1(hello='hello',
239 | world='world'))
240 | app.run()
241 |
242 | ```
243 |
244 | **Custom streaming adapter**
245 |
246 | Just subclass BaseWorkerEngine, define the following methods, and assign an instance to app.worker_engine:
247 | ```python
248 |
249 | from stepist import App
250 | from stepist.flow.workers.worker_engine import BaseWorkerEngine
251 |
252 |
253 | class CustomWorkerEngine(BaseWorkerEngine):
254 |
255 |     def add_job(self, step, data, result_reader, **kwargs):
256 |         raise NotImplementedError()
257 | 
258 |     def jobs_count(self, *steps):
259 |         raise NotImplementedError()
260 | 
261 |     def flush_queue(self, step):
262 |         raise NotImplementedError()
263 | 
264 |     def process(self, *steps):
265 |         raise NotImplementedError()
266 | 
267 |     def register_worker(self, handler):
268 |         raise NotImplementedError()
269 |
270 |
271 | app = App()
272 | app.worker_engine = CustomWorkerEngine()
273 |
274 | ```
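
For illustration only, a toy in-memory engine following that interface might look like the sketch below. It ignores persistence and concurrency, which the real adapters (Redis, RabbitMQ, SQS) take care of; `InMemoryWorkerEngine` is a made-up name used just for this example.

```python
from collections import defaultdict, deque

from stepist import App
from stepist.flow.workers.worker_engine import BaseWorkerEngine


class InMemoryWorkerEngine(BaseWorkerEngine):
    """Toy engine: keeps jobs in local deques (illustration only)."""

    def __init__(self):
        self.queues = defaultdict(deque)

    def add_job(self, step, data, **kwargs):
        # `data` is a StepData object; the queue stores its dict form
        self.queues[step.step_key()].append(data.get_dict())

    def jobs_count(self, *steps):
        return sum(len(self.queues[s.step_key()]) for s in steps)

    def flush_queue(self, step):
        self.queues[step.step_key()].clear()

    def process(self, *steps, **kwargs):
        # Drain each queue once and hand every job back to its step
        for step in steps:
            while self.queues[step.step_key()]:
                step.receive_job(**self.queues[step.step_key()].popleft())

    def register_worker(self, step):
        pass  # nothing to set up for an in-memory queue


app = App()
app.worker_engine = InMemoryWorkerEngine()
```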
275 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/__init__.py
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 |
4 | setup(
5 | name="stepist",
6 | version="0.1.6.1",
7 | author="Aleh Shydlouski",
8 | author_email="oleg.ivye@gmail.com",
9 | description="Data process utils",
10 | keywords=['data', 'ai', 'distribute'],
11 | packages=find_packages(),
12 | include_package_data=True,
13 | install_requires=[
14 | 'tqdm',
15 | 'redis >= 3.0.0',
16 | 'blinker',
17 | 'click',
18 | 'ujson>=1.0',
19 |
20 | ],
21 | url='https://github.com/electronick1/stepist',
22 | download_url='https://github.com/electronick1/stepist/archive/0.1.6.1.tar.gz',
23 | classifiers=[],
24 | )
25 |
--------------------------------------------------------------------------------
/static/examples/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/static/examples/1.png
--------------------------------------------------------------------------------
/static/examples/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/static/examples/2.png
--------------------------------------------------------------------------------
/static/examples/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/static/examples/3.png
--------------------------------------------------------------------------------
/stepist/__init__.py:
--------------------------------------------------------------------------------
1 | from .flow import *
2 | from .flow.steps import *
3 | from .app import App
4 | try:
5 | from .flow.workers.adapters.rm_queue import RQAdapter
6 | from .flow.workers.adapters.sqs_queue import SQSAdapter
7 | except ImportError:
8 | pass
9 | from .flow.workers.adapters.simple_queue import SimpleQueueAdapter as RedisAdapter
10 |
11 |
--------------------------------------------------------------------------------
/stepist/app.py:
--------------------------------------------------------------------------------
1 | import ujson
2 | from stepist.flow.steps import Step, FactoryStep
3 |
4 | from stepist.app_config import AppConfig
5 | from stepist.dbs import DBs
6 | from stepist.flow import workers
7 |
8 | from stepist.flow.workers import simple_multiprocessing
9 | from stepist.flow.workers.adapters import simple_queue
10 | from stepist.flow.workers import reducer_engine
11 |
12 | from stepist.flow.steps.reducer_step import ReducerStep
13 |
14 | from stepist.flow.workers.boost import sockets
15 |
16 |
17 | class App:
18 |
19 | def __init__(self, worker_engine=None, use_booster=False, booster=None,
20 | data_pickler=ujson, **config_kwargs):
21 | self.steps = dict()
22 | self.default_dbs = None
23 | self.verbose = None
24 | self.use_booster = use_booster
25 |
26 | self.data_pickler = data_pickler
27 |
28 | self.config = AppConfig(**{**AppConfig.init_default(),
29 | **config_kwargs})
30 | self.load_config(self.config)
31 |
32 | self.worker_engine = worker_engine
33 | self.booster = booster
34 |
35 | if self.worker_engine is None:
36 | self.worker_engine = simple_queue.SimpleQueueAdapter(
37 | self.default_dbs.redis_db,
38 | data_pickler=self.data_pickler
39 | )
40 |
41 | self.reducer_engine = reducer_engine.RedisReducerEngine(
42 | app=self,
43 | redis_db=self.default_dbs.redis_db,
44 | reducer_job_lifetime=30, # 30 sec
45 | reducer_no_job_sleep_time=1, # 1 sec
46 | )
47 |
48 | if use_booster:
49 | if booster is not None:
50 | self.booster = booster
51 | else:
52 | self.booster = sockets.SocketBooster(self)
53 | else:
54 | self.booster = None
55 |
56 | def run(self, steps=None, die_on_error=True, die_when_empty=False):
57 | if steps is None:
58 | steps = self.get_workers_steps()
59 |
60 | return workers.process(self,
61 | *steps,
62 | die_on_error=die_on_error,
63 | die_when_empty=die_when_empty)
64 |
65 | def run_booster(self, steps=None, die_on_error=True, die_when_empty=False):
66 | if self.booster is None:
67 | raise RuntimeError("Booster is not enabled. Set use_booster=True "
68 | "in app initialization.")
69 | if steps is None:
70 | steps = self.get_workers_steps()
71 |
72 | self.booster.process(steps,
73 | die_on_error=die_on_error,
74 | die_when_empty=die_when_empty)
75 |
76 | def run_reducer(self, reducer_step):
77 | self.reducer_engine.process(reducer_step)
78 |
79 | def just_do_it(self, workers_count, *args, _warning=True, **kwargs):
80 | if _warning:
81 | print("You are using python multiprocessing for workers")
82 | return simple_multiprocessing(self,
83 | workers_count,
84 | *args,
85 | steps=self.get_workers_steps(),
86 | **kwargs)
87 |
88 | def load_config(self, config_object):
89 | self.config = config_object
90 | self.init_dbs(config_object)
91 |
92 | def init_dbs(self, config):
93 | self.default_dbs = DBs(config)
94 |
95 | def get_workers_steps(self):
96 | return list(filter(lambda step: step.as_worker, self.steps.values()))
97 |
98 | def get_reducers_steps(self):
99 | return list(filter(lambda step: isinstance(step, ReducerStep),
100 | self.steps.values()))
101 |
102 | def register_step(self, step):
103 | if str(step) in self.steps:
104 | raise RuntimeError("Step '%s' already exists!" % str(step))
105 |
106 | self.steps[step.step_key()] = step
107 | if step.as_worker:
108 | self.worker_engine.register_worker(step)
109 |
110 | def add_job(self, step, data, skip_booster=False, **kwargs):
111 | if self.booster and not skip_booster:
112 | self.booster.send_job(step, data, **kwargs)
113 | else:
114 | self.worker_engine.add_job(step, data, **kwargs)
115 |
116 | def step(self, next_step, as_worker=False, wait_result=False,
117 | unique_id=None, save_result=False, name=None):
118 | """
119 |         Step decorator which initializes a Step object and registers it
120 |         inside stepist.
121 | 
122 | 
123 |         :param next_step: next Step instance (or None for the last step)
124 |         :param as_worker: True if this step should be distributed (run by workers)
125 |         :param wait_result: allow returning results to the previous step
126 |         :return: Step instance
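
        Example (sketch)::

            @app.step(None)
            def last(total):
                return dict(result=total)

            @app.step(last)
            def first(a, b):
                return dict(total=a + b)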
127 | """
128 |
129 | def _wrapper(handler):
130 | step = Step(self,
131 | handler,
132 | next_step,
133 | as_worker=as_worker,
134 | unique_id=unique_id,
135 | wait_result=wait_result,
136 | save_result=save_result,
137 | name=name)
138 |
139 | return step
140 |
141 | return _wrapper
142 |
143 | def reducer_step(self):
144 | """
145 |         ReducerStep decorator. Use it to aggregate the results of all jobs into
146 |         one step. It also registers the step in the global step list.
147 | 
148 |         The handler receives an iterator which lets you go through all job results
149 |         and process them.
150 | 
151 |         For example, you can feed everything into an AI model.
152 |
153 | :return: ReducerStep instance
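
        Example (sketch, following the README examples)::

            @app.reducer_step()
            def merge(job_list):
                # job_list holds the results collected from the linked steps
                return dict(jobs_seen=len(list(job_list)))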
154 | """
155 |
156 | def _wrapper(handler):
157 | step = ReducerStep(self, handler)
158 |
159 | self.register_step(step)
160 | return step
161 |
162 | return _wrapper
163 |
164 | def factory_step(self, next_step, as_worker=False):
165 | """
166 |         Factory step decorator. If your step is decorated by this function, it
167 |         should return an iterator, and each item from that iterator will be added
168 |         to the next step.
169 | 
170 |         :param next_step: Step instance
171 |         :param as_worker: True if this step should be distributed (run by workers)
172 |         :return: Step instance (with a FactoryStep attached)
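
        Example (sketch, mirroring the README; count_words and download are
        placeholders)::

            @app.factory_step(count_words, as_worker=True)
            def fetch(url):
                return dict(url=url, text=download(url))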
173 | """
174 |
175 | def _wrapper(handler):
176 | step = Step(self,
177 | handler,
178 | next_step,
179 | as_worker=as_worker,
180 | wait_result=False)
181 |
182 | step.set_factory(FactoryStep(step))
183 |
184 | self.register_step(step)
185 | return step
186 |
187 | return _wrapper
188 |
189 | def set_verbose(self, verbose):
190 | self.verbose = verbose
191 |
--------------------------------------------------------------------------------
/stepist/app_config.py:
--------------------------------------------------------------------------------
1 | from stepist.flow.utils import AttrDict
2 |
3 |
4 | DEFAULT_REDIS_KWARGS = dict(
5 | host='localhost',
6 | port=6379
7 | )
8 |
9 |
10 | class AppConfig(AttrDict):
11 |
12 | @classmethod
13 | def init_default(cls):
14 | return cls(
15 | redis_kwargs=DEFAULT_REDIS_KWARGS,
16 | redis_stats_kwargs=DEFAULT_REDIS_KWARGS
17 | )
18 |
19 |
--------------------------------------------------------------------------------
/stepist/dbs.py:
--------------------------------------------------------------------------------
1 | import redis
2 |
3 | from stepist import app_config
4 |
5 |
6 | class DBs:
7 |
8 | def __init__(self, config):
9 | self.config = config
10 | self.redis_db = init_redis(**config.redis_kwargs)
11 | self.redis_stats = init_redis(**config.redis_stats_kwargs)
12 |
13 | def reset(self):
14 | return self.__class__(self.config)
15 |
16 |
17 | def init_redis(**redis_kwargs):
18 | return redis.Redis(**redis_kwargs)
19 |
--------------------------------------------------------------------------------
/stepist/flow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/__init__.py
--------------------------------------------------------------------------------
/stepist/flow/libs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/libs/__init__.py
--------------------------------------------------------------------------------
/stepist/flow/libs/simple_queue.py:
--------------------------------------------------------------------------------
1 | import random
2 | import redis
3 | import time
4 |
5 |
6 | HANDLERS = {}
7 |
8 |
9 | class SimpleQueue:
10 |
11 | def __init__(self, pickler, redis_db):
12 | self.pickler = pickler
13 | self.redis_db = redis_db
14 |
15 | def process(self, jobs, wait_time_for_job=1, die_when_empty=False,
16 | die_on_error=True, verbose=False):
17 | keys = list(jobs.keys())
18 | jobs_processed_before_empty = 0
19 | time_started_before_empty = time.time()
20 |
21 | while True:
22 | key, data = self.reserve_jobs(keys, wait_time_for_job)
23 |
24 | if data is None:
25 | if verbose and jobs_processed_before_empty:
26 | delta_time = round(time.time() - time_started_before_empty, 3)
27 | print("No more jobs in queues. Processed %s jobs in %s sec." %
28 | (jobs_processed_before_empty, delta_time))
29 |                     print("Waiting for new jobs ...")
30 |
31 | jobs_processed_before_empty = 0
32 | time_started_before_empty = time.time()
33 |
34 | if die_when_empty:
35 | exit()
36 | time.sleep(0.5)
37 | continue
38 |
39 | jobs_processed_before_empty += 1
40 | handler = jobs[key]
41 |
42 | try:
43 | handler.receive_job(**data)
44 | except Exception:
45 | self.add_job(key, data)
46 | if die_on_error:
47 | raise
48 |
49 | def add_job(self, job_key, data):
50 | data = self.pickler.dumps({'data': data})
51 | self.redis_db.lpush(job_key, data)
52 |
53 | def add_jobs(self, job_key, jobs_data):
54 | pipe = self.redis_db.pipeline()
55 |
56 | for job_data in jobs_data:
57 | data = self.pickler.dumps({'data': job_data})
58 | pipe.lpush(job_key, data)
59 |
60 | pipe.execute()
61 |
62 | def reserve_jobs(self, job_keys, wait_timeout):
63 | random.shuffle(job_keys)
64 | try:
65 | job_data = self.redis_db.brpop(job_keys,
66 | timeout=wait_timeout)
67 |
68 | except redis.exceptions.TimeoutError:
69 | return None, None
70 |
71 | if not job_data:
72 | return None, None
73 |
74 | key = job_data[0].decode('utf-8')
75 | job_data = self.pickler.loads(job_data[1])
76 |
77 | return key, job_data['data']
78 |
79 | def flush_jobs(self, step_key):
80 | self.redis_db.delete(step_key)
81 |
82 |
83 |
84 |
--------------------------------------------------------------------------------
/stepist/flow/session.py:
--------------------------------------------------------------------------------
1 | import threading
2 |
3 | from contextlib import contextmanager
4 |
5 |
6 | local = threading.local()
7 |
8 |
9 | def storage():
10 | if not hasattr(local, 'storage'):
11 | local.storage = dict(
12 | steps={},
13 | steps_workers={},
14 | steps_listen_for_job={},
15 | meta_data={},
16 | flow_data={},
17 | )
18 | return local.storage
19 |
20 |
21 | @contextmanager
22 | def change_flow_ctx(meta_data, flow_data):
23 | old_meta = storage().get("meta_data")
24 | old_flow_data = storage().get("flow_data")
25 |
26 | try:
27 | set_meta_data(meta_data)
28 | set_flow_data(flow_data)
29 | yield
30 | finally:
31 | set_meta_data(old_meta)
32 | set_flow_data(old_flow_data)
33 |
34 |
35 | def get_flow_data():
36 | return storage().get("flow_data", {})
37 |
38 |
39 | def get_meta_data():
40 | return storage().get("meta_data", {})
41 |
42 |
43 | def get_step_by_key(key):
44 | return get_steps().get(key, None)
45 |
46 |
47 | def get_steps():
48 | return storage().get("steps", {})
49 |
50 |
51 | def get_steps_workers():
52 | return storage().get("steps_workers", {})
53 |
54 |
55 | def get_steps_to_listen():
56 | return storage().get("steps_listen_for_job", {})
57 |
58 |
59 | def set_meta_data(meta_data):
60 | storage()['meta_data'] = meta_data
61 |
62 |
63 | def set_flow_data(flow_data):
64 | storage()['flow_data'] = flow_data
65 |
66 |
67 | def update_flow_data(flow_data):
68 | storage()['flow_data'].update(flow_data)
69 |
70 |
71 | def update_meta_data(**meta_data):
72 | current_meta_data = dict(get_meta_data())
73 | current_meta_data.update(meta_data)
74 |
75 | set_meta_data(current_meta_data)
76 |
77 |
78 | def flush_session():
79 | global local
80 | local = threading.local()
81 |
--------------------------------------------------------------------------------
/stepist/flow/signals.py:
--------------------------------------------------------------------------------
1 | from blinker import signal
2 |
3 | before_step = signal("before_step")
4 | after_step = signal("after_step")
5 |
6 | flow_finished = signal("flow_finished")
--------------------------------------------------------------------------------
/stepist/flow/stats/__init__.py:
--------------------------------------------------------------------------------
1 | from . import worker
2 |
--------------------------------------------------------------------------------
/stepist/flow/stats/utils.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | def concat_steps_to_str(steps):
4 | return '||'.join([step.step_key() for step in steps])
5 |
--------------------------------------------------------------------------------
/stepist/flow/stats/worker.py:
--------------------------------------------------------------------------------
1 | from stepist.flow.dbs import get_redis_stats
2 |
3 | from . import utils
4 |
5 |
6 | WORKER_CONNECTION_NAME = "STATS::process::||{steps}"
7 | JOB_ADDED = "STATS::job_added::{job_key}"
8 |
9 |
10 | def starts(steps):
11 | steps_str = utils.concat_steps_to_str(steps)
12 | get_redis_stats().client_setname(WORKER_CONNECTION_NAME.format(steps=steps_str))
13 |
14 |
15 | def job_added(job_key, data):
16 | get_redis_stats().incr(JOB_ADDED.format(job_key=job_key))
17 |
--------------------------------------------------------------------------------
/stepist/flow/steps/__init__.py:
--------------------------------------------------------------------------------
1 | from .step import Step
2 | from .factory_step import FactoryStep
3 | from .hub import Hub
4 | from .reducer_step import ReducerStep
5 |
--------------------------------------------------------------------------------
/stepist/flow/steps/factory_step.py:
--------------------------------------------------------------------------------
1 | from .next_step import init_next_worker_step, init_next_step
2 |
3 |
4 | class FactoryStep(object):
5 | """
6 |     Allows adding jobs from an iterator. Takes care of data iteration and the
7 |     result reader.
8 | 
9 |     Able to return an iterator which can be used for reading the flow result.
10 |
11 | """
12 |
13 | # current Step instance
14 | step = None
15 |
16 | def __init__(self, step):
17 | self.step = step
18 |
19 | def add_data_iter(self, data_iter):
20 | """
21 |         Takes a data iterator and puts each item into the queue
22 |
23 | :param data_iter: any data iterator object
24 | """
25 |
26 | for row_data in data_iter:
27 | if self.step.as_worker:
28 | init_next_worker_step(row_data,
29 | self.step)
30 | else:
31 | init_next_step(row_data, self.step)
32 |
33 |
34 |
--------------------------------------------------------------------------------
/stepist/flow/steps/hub.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | from stepist.flow.session import update_meta_data
3 |
4 |
5 | class Hub(object):
6 | """
7 |     Allows pushing data into multiple steps.
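
    Example (sketch)::

        @app.step(Hub(step_a, step_b))
        def fan_out(urls):
            for url in urls:
                yield dict(url=url)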
8 | """
9 | def __init__(self, *steps):
10 | self.steps = list(steps)
11 |
12 | def update_meta(self):
13 | hub_job_id = "%s:%s" % (uuid.uuid4(), len(self.steps))
14 | update_meta_data(hub_job_id=hub_job_id)
15 |
--------------------------------------------------------------------------------
/stepist/flow/steps/next_step.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | from .hub import Hub
4 |
5 |
6 | def call_next_step(data, next_step, batch_data=False, **kwargs):
7 |
8 | next_step_handler = choose_next_step_handler(next_step)
9 | return next_step_handler(data, next_step, batch_data=batch_data, **kwargs)
10 |
11 |
12 | def choose_next_step_handler(next_step):
13 |
14 | if next_step and isinstance(next_step, Hub):
15 | # WARNING! recursion here
16 | return init_next_hub_step
17 |
18 | if next_step.factory:
19 | return init_next_factory_step
20 |
21 | if next_step.as_worker:
22 | return init_next_worker_step
23 | else:
24 | return init_next_step
25 |
26 |
27 | def init_next_hub_step(data, hub_step, batch_data=False):
28 | """
29 |     WARNING: data copying happens here
30 | """
31 | if batch_data:
32 | raise RuntimeError("It's not possible to use batch in hub step")
33 |
34 | if isinstance(data, list):
35 | if len(data) != len(hub_step.steps):
36 | raise RuntimeError("Amount of data not equal to amount of steps")
37 | data_list = data
38 | else:
39 | data_list = [data for _ in hub_step.steps]
40 |
41 | hub_step.update_meta()
42 |
43 | hub_result = None
44 |
45 | for i, next_step_item in enumerate(hub_step.steps):
46 | # WARNING! recursion here
47 | data = data_list[i]
48 | next_step_handler = choose_next_step_handler(next_step_item)
49 | result = next_step_handler(copy.deepcopy(data), next_step_item)
50 |
51 | if hub_result is None:
52 | hub_result = result
53 | else:
54 | hub_result.update(result)
55 |
56 | return hub_result
57 |
58 |
59 | def init_next_reducer_step(data, next_step, batch_data=False):
60 | from .reducer_step import ReducerStep
61 |
62 | if batch_data:
63 | raise RuntimeError("It's not possible to use batch in reducer step")
64 |
65 | if isinstance(next_step, ReducerStep):
66 | next_step.add_job(data)
67 | return None
68 |
69 |
70 | def init_next_worker_step(data, next_step, batch_data=False, **kwargs):
71 | if batch_data:
72 | return next_step.add_jobs(jobs_data=data)
73 |
74 | return next_step.add_job(data=data, **kwargs)
75 |
76 |
77 | def init_next_factory_step(data, next_step, batch_data=False):
78 | if batch_data:
79 | raise RuntimeError("It's not possible to use batch in factory step")
80 |
81 | next_step.factory.add_data_iter(data)
82 |
83 | return None
84 |
85 |
86 | def init_next_step(data, next_step, batch_data=False):
87 | return next_step(**data)
88 |
89 |
--------------------------------------------------------------------------------
/stepist/flow/steps/reducer_step.py:
--------------------------------------------------------------------------------
1 | from stepist.flow.steps import Step
2 |
3 |
4 | class ReducerStep(Step):
5 |
6 | def __init__(self, app, handler):
7 | self.app = app
8 | self.handler = handler
9 |
10 | super(ReducerStep, self).__init__(app,
11 | handler,
12 | None,
13 | as_worker=True,
14 | wait_result=False)
15 |
16 | @property
17 | def __name__(self):
18 | return self.handler.__name__
19 |
20 | def add_job(self, data, **kwargs):
21 | self.app.reducer_engine.add_job(self, data, **kwargs)
22 |
23 | def step_key(self):
24 | return "reducer_step::%s" % self.__name__
25 |
--------------------------------------------------------------------------------
/stepist/flow/steps/step.py:
--------------------------------------------------------------------------------
1 | import types
2 | from collections.abc import Mapping, Generator
3 |
4 | from stepist.flow import utils, session
5 |
6 | from .next_step import call_next_step
7 |
8 |
9 | class StepData(object):
10 |
11 | flow_data = None
12 | meta_data = None
13 |
14 | def __init__(self, flow_data, meta_data=None):
15 | self.flow_data = flow_data
16 | self.meta_data = meta_data
17 |
18 | def get_dict(self):
19 | return {
20 | 'flow_data': self.flow_data,
21 | 'meta_data': self.meta_data
22 | }
23 |
24 |
25 | class FlowResult(utils.AttrDict):
26 | pass
27 |
28 |
29 | class Step(object):
30 | """
31 | Step object.
32 | """
33 |
34 |     # handler function which handles the data
35 |     handler = None
36 | 
37 |     # next step object which receives the current handler's result
38 |     next_step = None
39 | 
40 |     # True if we need to run the current handler in a distributed way (using queues)
41 |     as_worker = None
42 | 
43 |     # True if we need to wait for the result from the current handler
44 |     # (used in the previous step)
45 |     wait_result = None
46 | 
47 |     # Factory object for iterator handling
48 | factory = None
49 |
50 | def __init__(self, app, handler, next_step, as_worker, wait_result,
51 | unique_id=None, save_result=False, name=None):
52 | self.app = app
53 | self.handler = handler
54 | self.next_step = next_step
55 | self.as_worker = as_worker
56 | self.wait_result = wait_result
57 |
58 | self.unique_id = unique_id
59 | self.name = name
60 |
61 | if not self.name:
62 | if isinstance(self.handler.__name__, str):
63 | self.name = self.handler.__name__
64 | else:
65 | self.name = self.handler.__name__()
66 |
67 | if not self.unique_id:
68 | self.unique_id = self.name
69 |
70 | self.save_result = save_result
71 |
72 | self.factory = None
73 | self.register_step()
74 |
75 | @property
76 | def __name__(self):
77 | return self.unique_id or self.name
78 |
79 | def __call__(self, **kwargs):
80 | """
81 | """
82 | try:
83 | result_data = self.execute_step(**kwargs)
84 | except utils.StopFlowFlag:
85 | return None
86 |
87 | if self.is_last_step():
88 | return FlowResult({self.name: result_data})
89 |
90 | if isinstance(result_data, types.GeneratorType):
91 |
92 | for row_data in result_data:
93 | try:
94 | call_next_step(row_data, next_step=self.next_step)
95 | except utils.StopFlowFlag:
96 | continue
97 |
98 | return None
99 |
100 | flow_result = call_next_step(result_data,
101 | next_step=self.next_step)
102 | if self.save_result:
103 | flow_result[self.name] = result_data
104 |
105 | return flow_result
106 |
107 | def register_step(self):
108 | self.app.register_step(self)
109 |
110 | def execute_step(self, **data):
111 | """
112 | :param data: next step data
113 | :param last_step: Step object or step_key value
114 | :return: Flow result
115 | """
116 |
117 | # if 'self_step' in data:
118 | # raise RuntimeError("You can't use 'self_step' var in data")
119 | handler_data = utils.validate_handler_data(self.handler, data)
120 | result_data = self.handler(**handler_data)
121 | session.set_flow_data(result_data)
122 |
123 | return result_data
124 |
125 | def add_job(self, data, skip_booster=False, **kwargs):
126 | step_data = StepData(flow_data=data,
127 | meta_data=session.get_meta_data())
128 |
129 | result = self.app.add_job(step=self,
130 | data=step_data,
131 | skip_booster=skip_booster,
132 | **kwargs)
133 | return result
134 |
135 | def add_jobs(self, jobs_data, **kwargs):
136 | engine_jobs = []
137 | for data in jobs_data:
138 | step_data = StepData(flow_data=data,
139 | meta_data=session.get_meta_data())
140 | engine_jobs.append(step_data)
141 |
142 | result = self.app.add_jobs(step=self,
143 | jobs_data=engine_jobs,
144 | **kwargs)
145 | return result
146 |
147 | def receive_job(self, **data):
148 | if "flow_data" not in data:
149 | raise RuntimeError("flow_data not found in job payload")
150 |
151 | with session.change_flow_ctx(data.get('meta_data', {}), data['flow_data']):
152 | return self(**session.get_flow_data())
153 |
154 | def is_empty(self) -> bool:
155 | return self.jobs_count() == 0
156 |
157 | def jobs_count(self) -> int:
158 | return self.app.worker_engine.jobs_count(*[self])
159 |
160 | def set_factory(self, factory):
161 | self.factory = factory
162 |
163 | def flush_all(self):
164 | self.app.worker_engine.flush_queue(step=self)
165 |
166 | def is_last_step(self):
167 | if self.next_step is None:
168 | return True
169 |
170 | return False
171 |
172 | def step_key(self):
173 | return self.unique_id
174 |
175 | def get_queue_name(self):
176 | return self.app.worker_engine.get_queue_name(self)
177 |
--------------------------------------------------------------------------------
/stepist/flow/utils.py:
--------------------------------------------------------------------------------
1 | import inspect
2 |
3 | __all__ = ['StopFlowFlag']
4 |
5 | handler_args = dict()
6 |
7 |
8 | class StopFlowFlag(Exception):
9 |
10 | def __init__(self, reason=''):
11 | self.reason = reason
12 | super(StopFlowFlag, self).__init__()
13 |
14 |
15 | class AttrDict(dict):
16 |
17 | def __getattr__(self, name):
18 | if name in self:
19 | return self[name]
20 | raise AttributeError('%s not found' % name)
21 |
22 | def __setattr__(self, name, value):
23 | self[name] = value
24 |
25 | @property
26 | def __members__(self):
27 | return self.keys()
28 |
29 |
30 | def validate_handler_data(handler, data):
31 | global handler_args
32 |
33 | if handler not in handler_args:
34 | spec = inspect.getfullargspec(handler)
35 | handler_args[handler] = spec
36 | else:
37 | spec = handler_args[handler]
38 |
39 | args = spec.args
40 | if spec.varkw:
41 | return data
42 |
43 | handler_data = {k:v for k,v in data.items() if k in args}
44 |
45 | return handler_data
46 |
--------------------------------------------------------------------------------
/stepist/flow/workers/__init__.py:
--------------------------------------------------------------------------------
1 | from .adapters import simple_queue
2 | from . import utils
3 |
4 |
5 | def process(app, *steps, **kwargs):
6 | steps = utils.validate_steps(steps)
7 | app.worker_engine.process(*steps, **kwargs)
8 |
9 |
10 | def simple_multiprocessing(app, workers_count, steps, *args, **kwargs):
11 | from multiprocessing import Process
12 |
13 | process_list = []
14 | for i in range(workers_count):
15 | p = Process(target=process, args=[app, *steps], kwargs=kwargs)
16 | p.start()
17 | process_list.append(p)
18 |
19 | return process_list
20 |
21 |
22 |
--------------------------------------------------------------------------------
/stepist/flow/workers/adapters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/workers/adapters/__init__.py
--------------------------------------------------------------------------------
/stepist/flow/workers/adapters/rm_queue.py:
--------------------------------------------------------------------------------
1 | import time
2 | import pika
3 | import ujson
4 | import random
5 |
6 | from stepist.flow.workers.worker_engine import BaseWorkerEngine
7 |
8 |
9 | class RQAdapter(BaseWorkerEngine):
10 | def __init__(self, pika_params=None, data_pickler=ujson, jobs_limit=None,
11 | jobs_limit_wait_timeout=10):
12 |
13 | if not pika_params:
14 | self.params = pika.ConnectionParameters(
15 | host='localhost',
16 | port=5672,
17 | )
18 | else:
19 | self.params = pika_params
20 |
21 | self.data_pickler = data_pickler
22 | self.jobs_limit = jobs_limit
23 | self.jobs_limit_wait_timeout = jobs_limit_wait_timeout
24 |
25 | self.pika_connection = pika.BlockingConnection(parameters=self.params)
26 | self.channel_producer = self.pika_connection.channel()
27 | self.channel_consumer = self.pika_connection.channel()
28 | self.queues = dict()
29 |
30 | def add_job(self, step, data, **kwargs):
31 |
32 | if self.jobs_limit:
33 | while self.jobs_count(step) >= self.jobs_limit:
34 | print("Jobs limit exceeded, waiting %s seconds"
35 | % self.jobs_limit_wait_timeout)
36 | time.sleep(self.jobs_limit_wait_timeout)
37 |
38 | queue_name = self.get_queue_name(step)
39 | json_data = self.data_pickler.dumps(data.get_dict())
40 | self.channel_producer.basic_publish(
41 | exchange='',
42 | routing_key=queue_name,
43 | body=json_data)
44 |
45 | def add_jobs(self, step, jobs_data, **kwargs):
46 |
47 | if self.jobs_limit:
48 | while self.jobs_count(step) >= self.jobs_limit:
49 | print("Jobs limit exceeded, waiting %s seconds"
50 | % self.jobs_limit_wait_timeout)
51 | time.sleep(self.jobs_limit_wait_timeout)
52 |
53 | for job in jobs_data:
54 | self.add_job(step, job, **kwargs)
55 |
56 | def receive_job(self, step):
57 | q_name = step.get_queue_name()
58 | result = self.channel_consumer.basic_get(queue=q_name)
59 |
60 | if result and result[0] and result[2]:
61 | self.channel_consumer.basic_ack(delivery_tag=result[0].delivery_tag)
62 | return self.data_pickler.loads(result[2])
63 | else:
64 | return None
65 |
66 | def process(self, *steps, die_when_empty=False, die_on_error=True):
67 |         # Pika is not thread-safe; we need to create a new connection per thread
68 | channel = self.channel_consumer
69 | receivers = [StepReceiver(step, channel, self.data_pickler)
70 | for step in steps]
71 |
72 | empty_count = 0
73 |
74 | while True:
75 | random.shuffle(receivers)
76 |
77 | r = receivers[0]
78 | q = r.step.get_queue_name()
79 | result = channel.basic_get(queue=q)
80 |
81 | if result and result[0] and result[2]:
82 | r(*result)
83 | empty_count = 0
84 | else:
85 | empty_count += 1
86 | if empty_count > len(receivers) * 3 and die_when_empty:
87 | exit()
88 |
89 | def flush_queue(self, step):
90 | queue_name = self.get_queue_name(step)
91 | self.channel_producer.queue_delete(queue=queue_name)
92 | self.channel_producer.queue_declare(queue=queue_name,
93 | auto_delete=False,
94 | durable=True)
95 |
96 | def jobs_count(self, *steps):
97 | sum_by_steps = 0
98 |
99 | for step in steps:
100 | queue_name = self.get_queue_name(step)
101 | sum_by_steps += self.queues[queue_name].method.message_count
102 |
103 | return sum_by_steps
104 |
105 | def register_worker(self, step):
106 | queue_name = self.get_queue_name(step)
107 | q = self.channel_producer.queue_declare(queue=queue_name,
108 | auto_delete=False,
109 | durable=True)
110 |
111 | self.queues[queue_name] = q
112 |
113 | def monitor_steps(self, step_keys, monitoring_for_sec):
114 | pass
115 |
116 | def get_queue_name(self, step):
117 | return step.step_key()
118 |
119 |
120 | class StepReceiver:
121 | def __init__(self, step, channel, data_pickler):
122 | self.step = step
123 | self.channel = channel
124 | self.data_pickler = data_pickler
125 |
126 | def __call__(self, method, properties, body):
127 | self.step.receive_job(**self.data_pickler.loads(body))
128 | self.channel.basic_ack(delivery_tag=method.delivery_tag)
129 |
--------------------------------------------------------------------------------
/stepist/flow/workers/adapters/simple_queue.py:
--------------------------------------------------------------------------------
1 | import ujson
2 | import time
3 |
4 | from stepist.flow.libs.simple_queue import SimpleQueue
5 |
6 | from stepist.flow.workers.worker_engine import BaseWorkerEngine
7 | from stepist.flow.workers.adapters import utils
8 |
9 |
10 | class SimpleQueueAdapter(BaseWorkerEngine):
11 | def __init__(self, redis_connection, data_pickler=ujson, verbose=True,
12 | jobs_limit=None, jobs_limit_wait_timeout=10):
13 |
14 | self.redis_connection = redis_connection
15 | self.jobs_limit = jobs_limit
16 | self.jobs_limit_wait_timeout = jobs_limit_wait_timeout
17 | self.verbose = verbose
18 | self.queue = SimpleQueue(data_pickler,
19 | self.redis_connection)
20 |
21 | def add_job(self, step, data, **kwargs):
22 | q_name = self.get_queue_name(step)
23 |
24 | if self.jobs_limit:
25 | while self.jobs_count(step) >= self.jobs_limit:
26 | print("Jobs limit exceeded, waiting %s seconds"
27 | % self.jobs_limit_wait_timeout)
28 | time.sleep(self.jobs_limit_wait_timeout)
29 |
30 | self.queue.add_job(q_name, data.get_dict())
31 |
32 | def add_jobs(self, step, jobs_data, **kwargs):
33 |
34 | if self.jobs_limit:
35 | while self.jobs_count(step) >= self.jobs_limit:
36 | print("Jobs limit exceeded, waiting %s seconds"
37 | % self.jobs_limit_wait_timeout)
38 | time.sleep(self.jobs_limit_wait_timeout)
39 |
40 | jobs_data_dict = [data.get_dict() for data in jobs_data]
41 | self.queue.add_jobs(self.get_queue_name(step), jobs_data_dict)
42 |
43 | def receive_job(self, step, wait_timeout=3):
44 | key, data = self.queue.reserve_jobs([self.get_queue_name(step)],
45 | wait_timeout=wait_timeout)
46 | return data
47 |
48 | def process(self, *steps, die_when_empty=False, die_on_error=True):
49 | self.queue.process({self.get_queue_name(step): step for step in steps},
50 | die_when_empty=die_when_empty,
51 | die_on_error=die_on_error,
52 | verbose=self.verbose)
53 |
54 | def flush_queue(self, step):
55 | queue_name = self.get_queue_name(step)
56 | self.queue.flush_jobs(queue_name)
57 |
58 | def jobs_count(self, *steps):
59 | sum_by_steps = 0
60 | for step in steps:
61 | q_key = step.get_queue_name()
62 | sum_by_steps += self.queue.redis_db.llen(q_key)
63 |
64 | return sum_by_steps
65 |
66 | def register_worker(self, handler):
67 | pass
68 |
69 | def monitor_steps(self, steps, monitoring_for_sec):
70 | push = dict()
71 | pop = dict()
72 |
73 | pool = self.redis_connection.connection_pool
74 | monitor = utils.RedisMonitor(pool)
75 | commands = monitor.monitor(monitoring_for_sec)
76 |
77 | for command in commands:
78 | command = command.lower()
79 |
80 | for step in steps:
81 | key = step.get_queue_name()
82 | step_key = step.step_key()
83 | if key in command and 'lpush' in command:
84 | push[step_key] = push.get(step_key, 0) + 1
85 | if key in command and 'lpop' in command:
86 | pop[step_key] = pop.get(step_key, 0) + 1
87 |
88 | return push, pop
89 |
90 | @staticmethod
91 | def get_queue_name(step):
92 | return "stepist::%s" % step.step_key()
93 |
94 |
95 |
96 |
--------------------------------------------------------------------------------
/stepist/flow/workers/adapters/sqs_queue.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | import ujson
3 | import time
4 | import multiprocessing
5 |
6 | from stepist.flow.workers.worker_engine import BaseWorkerEngine
7 |
8 |
9 | class SQSAdapter(BaseWorkerEngine):
10 | def __init__(self, session=boto3, visibility_timeout=None,
11 | message_retention_period=None, wait_seconds=5,
12 | data_pickler=ujson):
13 |
14 | self.data_pickler = data_pickler
15 |
16 | self.session = session
17 | self.sqs_client = session.client('sqs')
18 | self.sqs_resource = session.resource('sqs')
19 |
20 | self.message_retention_period = message_retention_period
21 | self.visibility_timeout = visibility_timeout
22 | self.wait_seconds = wait_seconds
23 |
24 | self._queues = dict()
25 | self._steps = dict()
26 |
27 | def add_job(self, step, data, **kwargs):
28 | queue_name = self.get_queue_name(step)
29 |
30 | queue = self._queues.get(queue_name, None)
31 | if not queue:
32 | raise RuntimeError("Queue %s not found" % queue_name)
33 |
34 | kwargs = {
35 | 'MessageBody': self.data_pickler.dumps(data.get_dict()),
36 | 'MessageAttributes': {},
37 | 'DelaySeconds': 0
38 | }
39 |
40 | queue.send_message(**kwargs)
41 |
42 | def add_jobs(self, step, jobs_data, **kwargs):
43 | for job_data in jobs_data:
44 | self.add_job(step, job_data, **kwargs)
45 |
46 | def receive_job(self, step, wait_seconds=5):
47 | q_name = self.get_queue_name(step)
48 | queue = self.session.resource('sqs').get_queue_by_name(
49 | QueueName=q_name)
50 |
51 | kwargs = {
52 | 'WaitTimeSeconds': wait_seconds,
53 | 'MaxNumberOfMessages': 1,
54 | 'MessageAttributeNames': ['All'],
55 | 'AttributeNames': ['All'],
56 | }
57 | messages = queue.receive_messages(**kwargs)
58 | if not messages:
59 | return None
60 |
61 | if len(messages) != 1:
62 | raise RuntimeError("Got more than 1 job for some reason")
63 |
64 | msg = messages[0]
65 |
66 | msg_result = {
67 | 'Id': msg.message_id,
68 | 'ReceiptHandle': msg.receipt_handle
69 | }
70 | queue.delete_messages(Entries=[msg_result])
71 |
72 | return self.data_pickler.loads(msg.body)
73 |
74 | def process(self, *steps, die_when_empty=False, die_on_error=True):
75 | queues = []
76 | for step in steps:
77 | queues.append(self.get_queue_name(step))
78 |
79 | if not queues:
80 | return
81 |
82 | mng = multiprocessing.Manager()
83 | empty_queues = mng.dict({q: False for q in queues})
84 |
85 | processes = []
86 | for queue_name in queues:
87 | p = multiprocessing.Process(
88 | target=self.process_queue,
89 | kwargs={
90 | 'queue_name': queue_name,
91 | 'die_on_error': die_on_error,
92 | 'empty_queues': empty_queues,
93 | 'die_when_empty': die_when_empty,
94 | },
95 | )
96 | p.start()
97 | processes.append(p)
98 |
99 | for p in processes:
100 | p.join()
101 | p.terminate()
102 |
103 | def process_queue(self, queue_name, die_on_error, empty_queues,
104 | die_when_empty):
105 | try:
106 | queue = self.session.resource('sqs').get_queue_by_name(QueueName=queue_name)
107 | except Exception:
108 | empty_queues[queue_name] = True
109 | raise
110 |
111 | if not queue_name or not queue:
112 | empty_queues[queue_name] = True
113 | return
114 |
115 | while True:
116 | kwargs = {
117 | 'WaitTimeSeconds': self.wait_seconds,
118 | 'MaxNumberOfMessages': 10,
119 | 'MessageAttributeNames': ['All'],
120 | 'AttributeNames': ['All'],
121 | }
122 | messages = queue.receive_messages(**kwargs)
123 |
124 | if not messages:
125 | empty_queues[queue_name] = True
126 | if all(list(empty_queues.values())) and die_when_empty:
127 | exit()
128 |
129 | time.sleep(self.wait_seconds)
130 | continue
131 |
132 | empty_queues[queue_name] = False
133 |
134 | msg_results = []
135 | for msg in messages:
136 | data = self.data_pickler.loads(msg.body)
137 | try:
138 | self._steps[queue_name].receive_job(**data)
139 | except Exception:
140 | empty_queues[queue_name] = True
141 | if die_on_error:
142 | raise
143 |
144 | msg_results.append({
145 | 'Id': msg.message_id,
146 | 'ReceiptHandle': msg.receipt_handle
147 | })
148 |
149 | if msg_results:
150 | queue.delete_messages(Entries=msg_results)
151 |
152 | def flush_queue(self, step):
153 |         raise NotImplementedError("Not implemented yet. Delete the queue using "
154 |                                   "the SQS dashboard")
155 |
156 | def jobs_count(self, *steps):
157 | jobs = 0
158 |
159 | for step in steps:
160 | queue_name = self.get_queue_name(step)
161 | sqs_q = self.sqs_client.get_queue_url(QueueName=queue_name)
162 |             attrs = self.sqs_client.get_queue_attributes(
163 |                 QueueUrl=sqs_q['QueueUrl'], AttributeNames=['ApproximateNumberOfMessages'])
164 |             jobs += int(attrs['Attributes'].get("ApproximateNumberOfMessages", 0))
165 |
166 | return jobs
167 |
168 | def register_worker(self, step):
169 | queue_name = self.get_queue_name(step)
170 |
171 | attrs = {}
172 | kwargs = {
173 | 'QueueName': queue_name,
174 | 'Attributes': attrs,
175 | }
176 | if self.message_retention_period is not None:
177 | attrs['MessageRetentionPeriod'] = str(self.message_retention_period)
178 | if self.visibility_timeout is not None:
179 | attrs['VisibilityTimeout'] = str(self.visibility_timeout)
180 |
181 | self.sqs_client.create_queue(**kwargs)
182 |
183 | queue = self.sqs_resource.get_queue_by_name(QueueName=queue_name)
184 |
185 | self._queues[queue_name] = queue
186 | self._steps[queue_name] = step
187 |
188 | def monitor_steps(self, step_keys, monitoring_for_sec):
189 | pass
190 |
191 | def get_queue_name(self, step):
192 | return step.step_key().replace(":", "-")
193 |
194 |
195 | def _move_first_to_the_end(a):
196 | return a[1:] + [a[0]]
197 |
198 |
199 | class DieWhenEmpty:
200 | def __init__(self, active, queues):
201 | self.active = active
202 | self.queues = queues
203 |
204 | self.queus_no_jobs = set()
205 |
206 | def update_status(self, queue_name, no_job):
207 | if no_job:
208 | self.queus_no_jobs.add(queue_name)
209 | elif queue_name in self.queus_no_jobs:
210 | self.queus_no_jobs.remove(queue_name)
211 |
212 | def __bool__(self):
213 | return len(self.queus_no_jobs) >= len(self.queues)
214 |
--------------------------------------------------------------------------------
/stepist/flow/workers/adapters/utils.py:
--------------------------------------------------------------------------------
1 | import signal
2 |
3 |
4 | class Timeout():
5 | """Timeout class using ALARM signal."""
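    # Usage sketch: interrupt a blocking call after `sec` seconds, e.g.
    #
    #     with Timeout(5):
    #         response = connection.read_response()
    #
    # (RedisMonitor.listen below uses it this way)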
6 | class Timeout(Exception):
7 | pass
8 |
9 | def __init__(self, sec):
10 | self.sec = sec
11 |
12 | def __enter__(self):
13 | signal.signal(signal.SIGALRM, self.raise_timeout)
14 | signal.alarm(self.sec)
15 |
16 | def __exit__(self, *args):
17 | signal.alarm(0) # disable alarm
18 |
19 | def raise_timeout(self, *args):
20 | raise Timeout.Timeout()
21 |
22 |
23 |
24 | class RedisMonitor():
25 | def __init__(self, connection_pool):
26 | self.connection_pool = connection_pool
27 | self.connection = None
28 |
29 | def __del__(self):
30 | try:
31 | self.reset()
32 | except:
33 | pass
34 |
35 | def reset(self):
36 | if self.connection:
37 | self.connection_pool.release(self.connection)
38 | self.connection = None
39 |
40 | def monitor(self, sec):
41 | if self.connection is None:
42 | self.connection = self.connection_pool.get_connection(
43 | 'monitor', None)
44 | self.connection.send_command("monitor")
45 | return self.listen(sec)
46 |
47 | def parse_response(self):
48 | return self.connection.read_response()
49 |
50 | def listen(self, sec):
51 | try:
52 | with Timeout(sec):
53 | while True:
54 | yield self.parse_response()
55 | except Timeout.Timeout:
56 | return
57 |
--------------------------------------------------------------------------------
/stepist/flow/workers/boost/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/workers/boost/__init__.py
--------------------------------------------------------------------------------
/stepist/flow/workers/boost/shared_memory.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electronick1/stepist/2f0fcd5c12c9ae4894a31b9a3b4a7f303ece41aa/stepist/flow/workers/boost/shared_memory.py
--------------------------------------------------------------------------------
/stepist/flow/workers/boost/sockets.py:
--------------------------------------------------------------------------------
1 | """
2 | Add APP NAME FOR WORKERS (!!!)
3 |
4 | """
5 |
6 | import random
7 | import socket
8 | import redis
9 | import selectors
10 | import asyncio
11 |
12 | from threading import Thread
13 |
14 | from stepist.flow.steps.step import StepData
15 |
16 |
17 | DATA_HEADER = b'/stairs_line_separation/\n'
18 |
19 |
20 | class SocketData:
21 | def __init__(self, step, step_data):
22 | self.step = step
23 | self.step_data = step_data
24 |
25 | def to_json(self):
26 | return dict(
27 | step_key=self.step.step_key(),
28 | step_data=self.step_data.get_dict(),
29 | )
30 |
31 | @classmethod
32 | def from_json(cls, app, json_data):
33 | return cls(
34 | step=app.steps.get(json_data['step_key']),
35 | step_data=StepData(**json_data['step_data'])
36 | )
37 |
38 |
39 | class SocketBooster:
40 |
41 | def __init__(self, app, socket_address='/tmp/stairs', use_ipc=True,
42 | socket_port_range=(49152, 65536), buffer_size=1000):
43 |
44 | self.app = app
45 |
46 | self.buffer_size = buffer_size
47 | self.use_ipc = use_ipc
48 |
49 | self.sender = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
50 | self.socket_address = self.gen_address(socket_address, socket_port_range)
51 | self.sender.bind(self.socket_address)
52 | self.sender.listen()
53 |
54 | self.socket_redis = redis.Redis(**self.app.config.redis_kwargs)
55 | self._register_client_in_redis()
56 |
57 | self.connections = SocketConnections(self.sender,
58 | self.socket_redis,
59 | self.socket_address)
60 |
61 | self.receiver_event_loop = SocketWorkersEventLoop(self,
62 | self.connections,
63 | self.buffer_size)
64 |
65 | self.sender_event_loop = SocketProducerEventLoop(self,
66 | self.connections,
67 | self.buffer_size)
68 | self.cnt_job_sent = 0
69 |
70 | self.socket_connections_updater = \
71 | Thread(target=self.connections.connections_updater_loop,
72 | args=(self.sender_event_loop, self.receiver_event_loop))
73 | self.socket_connections_updater.start()
74 |
75 | def gen_address(self, socket_address, socket_port_range):
76 | if self.use_ipc:
77 | return "%s%s" % (socket_address, random.randint(*socket_port_range))
78 | else:
79 | return "%s:%s" % (socket_address, random.randint(*socket_port_range))
80 |
81 | def _register_client_in_redis(self):
82 | key = "socket_client:%s" % self.socket_address
83 | self.socket_redis.client_setname(key)
84 |
85 | def send_job(self, step, data, **kwargs):
86 | self.sender_event_loop.send_job(step, data)
87 |
88 | def process(self, steps, die_on_error=True, die_when_empty=False):
89 | self.receiver_event_loop.run_event_loop()
90 |
91 | def handle_job(self, data_bytes):
92 | row_data = self.app.data_pickler.loads(data_bytes)
93 |
94 | s_data = SocketData.from_json(self.app, row_data)
95 |
96 | step = s_data.step
97 | step_data = s_data.step_data
98 |
99 | if step is not None:
100 | self.forward_to_queue(step, step_data)
101 | return
102 |
103 | # if step.step_key() not in available_handlers:
104 | # self.forward_to_queue(step, step_data)
105 |
106 | s_data.step.receive_job(**s_data.step_data.get_dict())
107 |
108 | def forward_to_queue(self, step, step_data):
109 | self.app.add_job(step, step_data, skip_booster=True)
110 |
111 |
112 | class SocketConnections:
113 | """
114 |     Keeps the information about all connections registered in redis
115 |     up to date.
116 |
117 | """
118 | def __init__(self, socket_host, socket_redis, current_address):
119 | self.socket_host = socket_host
120 | self.socket_redis = socket_redis
121 |
122 | self.ignore_addresses = [current_address]
123 |
124 | self.current_workers_sockets = []
125 |
126 | self.current_producers = []
127 | self.current_producers_sockets = []
128 | self.producer_selector = selectors.DefaultSelector()
129 |
130 | def get_producers(self):
131 | workers_addresses = []
132 |
133 | clients_list = self.socket_redis.client_list()
134 |
135 | for client in clients_list:
136 | client_name = client['name']
137 |
138 | if 'socket_client:' not in client_name:
139 | continue
140 |
141 | address = client_name.split("socket_client:")[1]
142 |
143 | if address not in self.ignore_addresses:
144 | workers_addresses.append(address)
145 |
146 | return workers_addresses
147 |
148 |     def connections_updater_loop(self, socket_producer, socket_receiver):
149 | while True:
150 | for producer_address in self.get_producers():
151 | if producer_address in self.current_producers:
152 | continue
153 |
154 | producer_socket = socket.socket(socket.AF_UNIX,
155 | socket.SOCK_STREAM)
156 | producer_socket.connect(producer_address)
157 | producer_socket.setblocking(False)
158 |                 socket_receiver.add_new_producer(producer_socket)
159 | self.current_producers.append(producer_address)
160 |
161 | conn, addr = self.socket_host.accept()
162 | #conn.setblocking(1)
163 | conn.setblocking(False)
164 | self.current_workers_sockets.append(conn)
165 | socket_producer.add_new_worker(conn)
166 |
167 |
168 | class SocketProducerEventLoop:
169 |
170 | def __init__(self, booster, socket_workers, buffer_size):
171 | self.booster = booster
172 | self.loop = asyncio.new_event_loop()
173 | self.socket_workers = socket_workers
174 |
175 | self.socket_buffer_size = []
176 | self.buffer_size = buffer_size
177 |
178 | self.workers_selector = selectors.DefaultSelector()
179 | self.sockets = []
180 |
181 | def add_new_worker(self, conn):
182 | self.socket_buffer_size.append(0)
183 | self.sockets.append(conn)
184 | self.workers_selector.register(conn,
185 | selectors.EVENT_READ,
186 | data=len(self.socket_buffer_size)-1)
187 |
188 | def send_job(self, step, step_data):
189 |
190 | if not self.socket_buffer_size:
191 | self.booster.forward_to_queue(step, step_data)
192 | return
193 |
194 | socket_data = SocketData(step=step, step_data=step_data)
195 | data_encoded = self.booster.app.data_pickler.dumps(socket_data.to_json())
196 | data_encoded = data_encoded.encode("utf-8")
197 |
198 | for i in range(len(self.socket_buffer_size)):
199 | if self.socket_buffer_size[i] > 0:
200 | try:
201 | self.sockets[i].send(data_encoded + DATA_HEADER)
202 |
203 | except socket.timeout:
204 |                     print("Timeout error for one of the workers,")
205 |                     print("it will be removed from the workers list")
206 | # Disable socket buffer
207 | self.socket_buffer_size[i] = -1
208 | # continue send_job logic
209 | self.booster.forward_to_queue(step, step_data)
210 | return
211 |
212 | except BlockingIOError:
213 | self.booster.forward_to_queue(step, step_data)
214 | return
215 |
216 | except BrokenPipeError:
217 |                     print("BrokenPipeError for one of the workers,")
218 |                     print("it will be removed from the workers list")
219 | self.socket_buffer_size[i] = -1
220 | self.booster.forward_to_queue(step, step_data)
221 | return
222 |
223 | self.socket_buffer_size[i] = self.socket_buffer_size[i] - 1
224 | return
225 |
226 | self.booster.forward_to_queue(step, step_data)
227 |
228 | events = self.workers_selector.select(timeout=5)
229 |
230 | for key, mask in events:
231 | index = key.data
232 | sock = key.fileobj
233 |
234 | try:
235 | d = sock.recv(1024)
236 | except socket.timeout as e:
237 | continue
238 |
239 | if d:
240 | self.socket_buffer_size[index] = self.buffer_size
241 |
242 |
243 | class SocketWorkersEventLoop:
244 |
245 | def __init__(self, booster, socket_workers, buffer_size):
246 | self.booster = booster
247 | self.socket_workers = socket_workers
248 |
249 | self.socket_buffer_size = []
250 | self.buffer_size = buffer_size
251 | self.sockets = []
252 |
253 | self.threads_events = dict()
254 |
255 | self.producers_selector = selectors.DefaultSelector()
256 |
257 | def add_new_producer(self, producer_socket):
258 | self.sockets.append(producer_socket)
259 | self.socket_buffer_size.append(0)
260 | self.producers_selector.register(producer_socket,
261 | selectors.EVENT_READ,
262 | data=len(self.sockets)-1)
263 |
264 | def run_event_loop(self):
265 | while True:
266 | for i in range(len(self.sockets)):
267 | if self.socket_buffer_size[i] == 0:
268 | try:
269 | self.sockets[i].send(b'ready_to_consume')
270 | except (socket.timeout, BlockingIOError):
271 | continue
272 | self.socket_buffer_size[i] = self.buffer_size
273 |
274 | events = self.producers_selector.select(timeout=5)
275 |
276 | for key, mask in events:
277 | index = key.data
278 | sock = key.fileobj
279 |
280 | data = b''
281 | try:
282 | while True:
283 | try:
284 | sock_data = sock.recv(256 * 1024 * 1024)
285 | except BlockingIOError:
286 | break
287 |
288 | data += sock_data
289 | if not sock_data:
290 | break
291 |
292 | except socket.timeout:
293 | continue
294 |
295 | if data:
296 | rows = data.split(DATA_HEADER)
297 | self.socket_buffer_size[index] -= (len(rows) - 1)
298 |
--------------------------------------------------------------------------------
/stepist/flow/workers/reducer_engine.py:
--------------------------------------------------------------------------------
1 | import time
2 | from stepist.flow import session
3 |
4 |
5 | class BaseReducerEngine(object):
6 |
7 | def add_job(self, reducer_step, data):
8 |         raise NotImplementedError()
9 |
10 |
11 | class RedisReducerEngine(BaseReducerEngine):
12 |
13 | def __init__(self, app, redis_db, reducer_job_lifetime,
14 | reducer_no_job_sleep_time):
15 | self.app = app
16 | self.redis_db = redis_db
17 |
18 | self.reducer_job_lifetime = reducer_job_lifetime
19 | self.reducer_no_job_sleep_time = reducer_no_job_sleep_time
20 |
21 | def add_job(self, reducer_step, data, **kwargs):
22 | hub_job_id = session.get_meta_data().get("hub_job_id", None)
23 | if hub_job_id is None:
24 |             raise RuntimeError("job id not found. Do you have a 'HUB' step "
25 |                                "before the reducer?")
26 |
27 | current_amount = self.redis_db.zincrby(
28 | "%s:%s" % ("count", reducer_step.step_key()),
29 | 1,
30 | hub_job_id,
31 |
32 | )
33 |
34 | pipe = self.redis_db.pipeline()
35 | pipe.hset(
36 | "%s:%s" % (reducer_step.step_key(), hub_job_id),
37 | current_amount,
38 |             self.app.data_pickler.dumps(data),
39 | )
40 | pipe.expire(
41 | "%s:%s" % (reducer_step.step_key(), hub_job_id),
42 | self.reducer_job_lifetime
43 | )
44 | pipe.execute()
45 |
46 | def process(self, reducer_step):
47 | while True:
48 | max_value = self.redis_db.zpopmax(
49 | "%s:%s" % ("count", reducer_step.step_key()),
50 | )
51 | if not max_value:
52 | time.sleep(self.reducer_no_job_sleep_time)
53 | continue
54 |
55 | hub_job_id, count = max_value[0]
56 | hub_job_id = hub_job_id.decode("utf-8")
57 |
58 | key_count = hub_job_id.split(":")[1]
59 |
60 | if int(count) < int(key_count):
61 | self.redis_db.zincrby(
62 | "%s:%s" % ("count", reducer_step.step_key()),
63 |                     int(count),
64 |                     hub_job_id
65 | )
66 | time.sleep(self.reducer_no_job_sleep_time)
67 | continue
68 |
69 | data = self.redis_db.hgetall(
70 | "%s:%s" % (reducer_step.step_key(), hub_job_id),
71 | )
72 | if data:
73 | values = []
74 | for v in data.values():
75 |                     v = self.app.data_pickler.loads(v.decode('utf-8'))
76 | values.append(v)
77 |
78 | reducer_step(job_list=values)
79 | self.redis_db.delete("%s:%s" % (reducer_step.step_key(),
80 | hub_job_id))
81 |
--------------------------------------------------------------------------------
/stepist/flow/workers/utils.py:
--------------------------------------------------------------------------------
1 | from stepist.flow import session
2 |
3 |
4 | def validate_steps(steps):
5 | valid_steps = []
6 |
7 |     for step in steps:
8 |         if isinstance(step, str):
9 |             step = session.get_step_by_key(step)
10 | 
11 |         if not step.as_worker:
12 |             print(step.step_key(), "is not a worker")
13 |             continue
14 | 
15 |         valid_steps.append(step)
16 |
17 | return valid_steps
18 |
--------------------------------------------------------------------------------
/stepist/flow/workers/worker_engine.py:
--------------------------------------------------------------------------------
1 |
2 | NOT_IMPLEMENTED_DESC = "Not supported yet ..."
3 |
4 |
5 | class BaseWorkerEngine(object):
6 |
7 | def add_job(self, step, data, result_reader, **kwargs):
8 | """
9 | Add data to queue/streaming service.
10 | """
11 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
12 | 
13 |     def add_jobs(self, step, data_iter, result_reader):
14 |         """
15 |         Add a batch of data to the queue/streaming service in one transaction.
16 |         """
17 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
18 | 
19 |     def receive_job(self, step):
20 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
21 | 
22 |     def jobs_count(self, *steps):
23 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
24 | 
25 |     def flush_queue(self, step):
26 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
27 | 
28 |     def process(self, *steps):
29 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
30 | 
31 |     def register_worker(self, handler):
32 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
33 | 
34 |     def monitor_steps(self, step_keys, monitoring_for_sec):
35 |         raise NotImplementedError(NOT_IMPLEMENTED_DESC)
36 |
--------------------------------------------------------------------------------