├── idataapi_transform
│   ├── DataProcess
│   │   ├── __init__.py
│   │   ├── Meta
│   │   │   ├── __init__.py
│   │   │   └── BaseDataProcess.py
│   │   ├── Config
│   │   │   ├── __init__.py
│   │   │   ├── ConfigUtil
│   │   │   │   ├── __init__.py
│   │   │   │   ├── BaseConfig.py
│   │   │   │   ├── AsyncHelper.py
│   │   │   │   ├── WriterConfig.py
│   │   │   │   └── GetterConfig.py
│   │   │   ├── ConnectorConfig.py
│   │   │   ├── LogConfig.py
│   │   │   ├── DefaultValue.py
│   │   │   ├── MainConfig.py
│   │   │   └── ESConfig.py
│   │   ├── DataGetter
│   │   │   ├── __init__.py
│   │   │   ├── BaseGetter.py
│   │   │   ├── CSVGetter.py
│   │   │   ├── JsonGetter.py
│   │   │   ├── XLSXGetter.py
│   │   │   ├── MongoGetter.py
│   │   │   ├── ESGetter.py
│   │   │   ├── RedisGetter.py
│   │   │   ├── MySQLGetter.py
│   │   │   └── APIGetter.py
│   │   ├── DataWriter
│   │   │   ├── __init__.py
│   │   │   ├── BaseWriter.py
│   │   │   ├── JsonWriter.py
│   │   │   ├── TXTWriter.py
│   │   │   ├── RedisWriter.py
│   │   │   ├── KafkaWriter.py
│   │   │   ├── ESWriter.py
│   │   │   ├── MongoWriter.py
│   │   │   ├── CSVWriter.py
│   │   │   ├── XLSXWriter.py
│   │   │   └── MySQLWriter.py
│   │   ├── PersistentUtil
│   │   │   ├── __init__.py
│   │   │   └── PersistentWriter.py
│   │   └── ProcessFactory.py
│   ├── __init__.py
│   └── cli.py
├── .gitignore
├── idataapi-transform.png
├── requirements.txt
├── pyproject.toml
├── LICENSE
└── README_CN_simple.md

/idataapi_transform/DataProcess/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/Meta/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/Config/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/DataGetter/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/DataWriter/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/PersistentUtil/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/Config/ConfigUtil/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
logs/*
test*
.DS_Store
.idea/*
config.ini
*.pyc
venv/*
--------------------------------------------------------------------------------
/idataapi-transform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zpoint/idataapi-transform/HEAD/idataapi-transform.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp
openpyxl
elasticsearch==7.9.0
aioredis>=1.0
PyMySQL>=0.7.5,<0.9;python_version>="3.5.3"
aiomysql
confluent_kafka
--------------------------------------------------------------------------------
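Before the individual source files below, a minimal usage sketch of how the modules in this tree fit together: ProcessFactory builds a getter or writer from a config object, every getter is an async iterator that yields batches of dicts, and the file writers are context managers. The RCSVConfig/WJsonConfig constructor arguments and file names shown here are assumptions (GetterConfig.py and WriterConfig.py are not included in this dump); treat this as a sketch, not the package's documented API.

    import asyncio
    from idataapi_transform import ProcessFactory, GetterConfig, WriterConfig

    async def csv_to_json():
        # Assumed constructor arguments: CSVGetter reads config.filename / config.per_limit,
        # JsonWriter reads config.filename / config.new_line, so a filename is passed here.
        getter = ProcessFactory.create_getter(GetterConfig.RCSVConfig("./source.csv"))
        with ProcessFactory.create_writer(WriterConfig.WJsonConfig("./result.json")) as json_writer:
            async for items in getter:      # each batch is a list of dicts
                json_writer.write(items)    # JsonWriter.write is synchronous

    if __name__ == "__main__":
        asyncio.get_event_loop().run_until_complete(csv_to_json())

Getters yield data in batches (per_limit items at a time) rather than one item per iteration, which is why the writer receives a list on every loop pass.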
/idataapi_transform/DataProcess/Config/ConfigUtil/BaseConfig.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class BaseGetterConfig(object, metaclass=abc.ABCMeta): 5 | @abc.abstractmethod 6 | def __init__(self, *args, **kwargs): 7 | pass 8 | 9 | 10 | class BaseWriterConfig(object, metaclass=abc.ABCMeta): 11 | @abc.abstractmethod 12 | def __init__(self, *args, **kwargs): 13 | pass 14 | 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [tool.flit.metadata] 6 | module="idataapi_transform" 7 | author="zpoint" 8 | author-email="zp0int@qq.com" 9 | home-page="https://github.com/zpoint/idataapi-transform" 10 | classifiers=["License :: OSI Approved :: MIT License"] 11 | requires=["aiohttp", "openpyxl", "elasticsearch-async", "aioredis", "confluent_kafka"] 12 | requires-python=">=3.5.2" 13 | keywords="idataapi transform" 14 | dist-name="idataapi-transform" 15 | 16 | [tool.flit.scripts] 17 | transform="idataapi_transform:main" 18 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/BaseWriter.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from ..Meta.BaseDataProcess import BaseDataProcess 3 | 4 | 5 | class BaseWriter(BaseDataProcess, metaclass=abc.ABCMeta): 6 | @abc.abstractmethod 7 | def __init__(self, *args, **kwargs): 8 | """ 9 | :param config 10 | """ 11 | pass 12 | 13 | @abc.abstractmethod 14 | async def write(self, responses): 15 | pass 16 | 17 | @abc.abstractmethod 18 | def __enter__(self): 19 | pass 20 | 21 | @abc.abstractmethod 22 | def __exit__(self, exc_type, exc_val, exc_tb): 23 | pass 24 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/BaseGetter.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from ..Meta.BaseDataProcess import BaseDataProcess 3 | 4 | 5 | class BaseGetter(BaseDataProcess, metaclass=abc.ABCMeta): 6 | @abc.abstractmethod 7 | def __init__(self, *args, **kwargs): 8 | """ 9 | :param config 10 | config contains attribute: 11 | source: where to read data 12 | per_limit: return at most per_limit data each time 13 | max_limit: return at most max_limit data total 14 | """ 15 | pass 16 | 17 | @abc.abstractmethod 18 | def __aiter__(self): 19 | return self 20 | 21 | @abc.abstractmethod 22 | async def __anext__(self): 23 | pass 24 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConfigUtil/AsyncHelper.py: -------------------------------------------------------------------------------- 1 | class AsyncGenerator(object): 2 | def __init__(self, items, process_func): 3 | self.items = items 4 | if hasattr(self.items, "__aiter__"): 5 | self.is_async = True 6 | else: 7 | self.is_async = False 8 | self.items = self.to_generator(items) 9 | self.process_func = process_func 10 | 11 | def __aiter__(self): 12 | return self 13 | 14 | async def __anext__(self): 15 | if self.is_async: 16 | r = await self.items.__anext__() 17 | return self.process_func(r) 18 | else: 19 | try: 20 | r = next(self.items) 21 | return self.process_func(r) 22 | except StopIteration: 23 | raise 
StopAsyncIteration 24 | 25 | @staticmethod 26 | def to_generator(items): 27 | for i in items: 28 | yield i 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 zpoint 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /idataapi_transform/__init__.py: -------------------------------------------------------------------------------- 1 | """convert data from a format to another format, read or write from file or database, suitable for iDataAPI""" 2 | 3 | from .cli import main 4 | from .DataProcess.Config.ConfigUtil import WriterConfig 5 | from .DataProcess.Config.ConfigUtil import GetterConfig 6 | from .DataProcess.ProcessFactory import ProcessFactory 7 | 8 | 9 | class ManualConfig(object): 10 | @staticmethod 11 | def set_config(ini_path): 12 | from .DataProcess.Config.MainConfig import main_config_box 13 | from .DataProcess.Config.DefaultValue import DefaultVal 14 | main_config_box.read_config(ini_path) 15 | DefaultVal.refresh() 16 | 17 | @staticmethod 18 | def disable_log(): 19 | from .DataProcess.Config.LogConfig import remove_log 20 | remove_log() 21 | 22 | @staticmethod 23 | def set_log_path(log_path, max_log_file_bytes): 24 | """ 25 | :param log_path: directory where log stores, i.e ===> /Desktop/logs/ 26 | :param max_log_file_bytes: max log file size, in bytes, i.e: 5242880(5MB) 27 | :return: 28 | """ 29 | from .DataProcess.Config.MainConfig import main_config_box 30 | from .DataProcess.Config.DefaultValue import DefaultVal 31 | main_config_box.config_log(log_path, max_log_file_bytes) 32 | 33 | 34 | __version__ = '2.0.2' 35 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/JsonWriter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from .BaseWriter import BaseWriter 4 | 5 | 6 | class JsonWriter(BaseWriter): 7 | def __init__(self, config): 8 | super().__init__() 9 | self.config = config 10 | self.total_miss_count = 0 11 | self.success_count = 0 12 | self.f_out = open(self.config.filename, self.config.mode, encoding=self.config.encoding) 13 | 14 | def write(self, responses): 15 | miss_count = 0 16 | for each_response in responses: 17 | if 
self.config.expand: 18 | each_response = self.expand_dict(each_response, max_expand=self.config.expand) 19 | 20 | if self.config.filter: 21 | each_response = self.config.filter(each_response) 22 | if not each_response: 23 | miss_count += 1 24 | continue 25 | self.f_out.write(json.dumps(each_response) + self.config.new_line) 26 | self.success_count += 1 27 | self.total_miss_count += miss_count 28 | logging.info("%s write %d item, filtered %d item" % (self.config.filename, len(responses), miss_count)) 29 | 30 | def __exit__(self, exc_type, exc_val, exc_tb): 31 | self.f_out.close() 32 | logging.info("%s write done, total filtered %d item, total write %d item" % 33 | (self.config.filename, self.total_miss_count, self.success_count)) 34 | 35 | def __enter__(self): 36 | return self 37 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/TXTWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .BaseWriter import BaseWriter 3 | 4 | 5 | class TXTWriter(BaseWriter): 6 | def __init__(self, config): 7 | super().__init__() 8 | self.config = config 9 | self.f_out = open(config.filename, config.mode, encoding=config.encoding) 10 | self.total_miss_count = 0 11 | self.success_count = 0 12 | 13 | def write(self, responses): 14 | miss_count = 0 15 | for each_response in responses: 16 | if self.config.expand: 17 | each_response = self.expand_dict(each_response, max_expand=self.config.expand) 18 | 19 | if self.config.filter: 20 | each_response = self.config.filter(each_response) 21 | if not each_response: 22 | miss_count += 1 23 | continue 24 | 25 | self.f_out.write(self.config.join_val.join(str(value) for value in each_response.values()) + self.config.new_line) 26 | self.success_count += 1 27 | 28 | self.total_miss_count += miss_count 29 | logging.info("%s write %d item, filtered %d item" % (self.config.filename, len(responses), miss_count)) 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | self.f_out.close() 33 | logging.info("%s write done, total filtered %d item, total write %d item" % 34 | (self.config.filename, self.total_miss_count, self.success_count)) 35 | 36 | def __enter__(self): 37 | return self 38 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Meta/BaseDataProcess.py: -------------------------------------------------------------------------------- 1 | 2 | class BaseDataProcess(object): 3 | @staticmethod 4 | def expand_dict(origin_item, max_expand=0, current_expand=0, parent_key=None, parent_item=None): 5 | if max_expand == 0: 6 | return origin_item 7 | if max_expand != -1 and current_expand >= max_expand: 8 | return origin_item 9 | if parent_key: 10 | if isinstance(origin_item, dict): 11 | for sub_k, sub_v in origin_item.items(): 12 | parent_item[parent_key + "_" + sub_k] = sub_v 13 | if parent_key in parent_item: 14 | del parent_item[parent_key] 15 | elif isinstance(origin_item, list): 16 | for item in origin_item: 17 | BaseDataProcess.expand_dict(item, max_expand, current_expand + 1, parent_key, parent_item) 18 | return origin_item 19 | 20 | keys = [k for k in origin_item.keys()] 21 | has_sub_dict = False 22 | for k in keys: 23 | if isinstance(origin_item[k], dict): 24 | has_sub_dict = True 25 | sub_dict = origin_item[k] 26 | for sub_k, sub_v in sub_dict.items(): 27 | origin_item[k + "_" + sub_k] = sub_v 28 | del origin_item[k] 29 | elif isinstance(origin_item[k], list): 30 | for 
item in origin_item[k]: 31 | BaseDataProcess.expand_dict(item, max_expand, current_expand + 1, k, origin_item) 32 | 33 | if has_sub_dict: 34 | return BaseDataProcess.expand_dict(origin_item, max_expand, current_expand + 1) 35 | else: 36 | return origin_item 37 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConnectorConfig.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | import inspect 4 | from .MainConfig import main_config 5 | 6 | 7 | class _SessionManger(object): 8 | def __init__(self, concurrency_limit=None, loop=None): 9 | concurrency_limit = main_config()["main"].getint("concurrency") if concurrency_limit is None else concurrency_limit 10 | self.session = self._generate_session(concurrency_limit=concurrency_limit, loop=loop) 11 | 12 | @staticmethod 13 | def _generate_connector(limit=None, loop=None): 14 | """ 15 | https://github.com/KeepSafe/aiohttp/issues/883 16 | if connector is passed to session, it is not available anymore 17 | """ 18 | limit = main_config()["main"].getint("concurrency") if limit is None else limit 19 | if not loop: 20 | loop = asyncio.get_event_loop() 21 | return aiohttp.TCPConnector(limit=limit, loop=loop) 22 | 23 | @staticmethod 24 | def _generate_session(concurrency_limit=None, loop=None): 25 | if not loop: 26 | loop = asyncio.get_event_loop() 27 | concurrency_limit = main_config()["main"].getint("concurrency") if concurrency_limit is None else concurrency_limit 28 | return aiohttp.ClientSession(connector=_SessionManger._generate_connector(limit=concurrency_limit, loop=loop), 29 | loop=loop) 30 | 31 | def get_session(self): 32 | return self.session 33 | 34 | def __del__(self): 35 | try: 36 | if inspect.iscoroutinefunction(self.session.close): 37 | loop = asyncio.get_event_loop() 38 | loop.run_until_complete(self.session.close()) 39 | else: 40 | self.session.close() 41 | except Exception as e: 42 | pass 43 | 44 | 45 | session_manger = _SessionManger() 46 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/PersistentUtil/PersistentWriter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import hashlib 5 | import logging 6 | 7 | class PersistentWriter(object): 8 | def __init__(self, persistent_key): 9 | self.f_name = persistent_key + ".json" 10 | self.latest_record = set() 11 | self.load_last_record() 12 | self.f_out = open(self.f_name, "a+", encoding="utf8") 13 | self.prev_latest_record_num = len(self.latest_record) 14 | 15 | def load_last_record(self): 16 | if os.path.exists(self.f_name): 17 | try: 18 | with open(self.f_name, "r", encoding="utf8") as f: 19 | self.latest_record = set(json.loads(f.read())["record"]) 20 | except Exception: 21 | logging.error("Broken record file: %s, recreating file" % (self.f_name, )) 22 | self.remove_file() 23 | 24 | def write(self): 25 | if len(self.latest_record) == self.prev_latest_record_num: 26 | return 27 | else: 28 | self.prev_latest_record_num = len(self.latest_record) 29 | 30 | self.truncate() 31 | self.f_out.seek(0) 32 | ts = int(time.time()) 33 | struct_time = time.localtime(ts) 34 | dt = time.strftime('%Y-%m-%d %H:%M:%S', struct_time) 35 | record = { 36 | "record": list(self.latest_record), 37 | "record_length": len(self.latest_record), 38 | "timestamp": ts, 39 | "date": dt, 40 | "filename": self.f_name 41 | } 42 | 
self.f_out.write(json.dumps(record)) 43 | logging.info("persistent to disk, f_name: %s, total_task_num: %d" % (self.f_name, len(self.latest_record))) 44 | 45 | def add(self, key): 46 | key = hashlib.md5(key.encode("utf8")).hexdigest() 47 | self.latest_record.add(key) 48 | 49 | def __contains__(self, item): 50 | key = hashlib.md5(item.encode("utf8")).hexdigest() 51 | return key in self.latest_record 52 | 53 | def sync(self): 54 | self.f_out.flush() 55 | 56 | def remove_file(self): 57 | os.unlink(self.f_name) 58 | 59 | def truncate(self): 60 | self.f_out.truncate(0) 61 | 62 | def clear(self, start_fresh_if_done): 63 | self.latest_record = None 64 | if start_fresh_if_done: 65 | self.remove_file() 66 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/LogConfig.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from logging.handlers import RotatingFileHandler 4 | 5 | format_str = "%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s" 6 | date_formatter_str = '[%Y-%m-%d %H:%M:%S]' 7 | formatter = logging.Formatter(format_str, datefmt=date_formatter_str) 8 | 9 | 10 | class SingleLevelFilter(logging.Filter): 11 | def __init__(self, passlevel, reject): 12 | super(SingleLevelFilter, self).__init__() 13 | self.passlevel = passlevel 14 | self.reject = reject 15 | 16 | def filter(self, record): 17 | if self.reject: 18 | return record.levelno != self.passlevel 19 | else: 20 | return record.levelno == self.passlevel 21 | 22 | 23 | def init_log(log_dir, max_log_file_bytes, ini_path, manual=False): 24 | root_logger = logging.getLogger() 25 | root_logger.setLevel(logging.INFO) 26 | # console 27 | console = logging.StreamHandler() 28 | console.setFormatter(formatter) 29 | root_logger.addHandler(console) 30 | if log_dir: 31 | if not os.path.exists(log_dir): 32 | logging.error("log_dir(%s)%s not exists, I will not log to file" % (log_dir, "" if manual else " in configure file(%s)" % (ini_path, ))) 33 | return False 34 | if not max_log_file_bytes: 35 | logging.error("log_byte not set, please %s, or I will not log to file" % ("pass log_byte as parameters" if manual else "configure log_byte in configure file(%s)" % (ini_path, ))) 36 | return False 37 | # info 38 | h1 = RotatingFileHandler("%s/info.log" % (log_dir, ), mode="a", maxBytes=max_log_file_bytes, 39 | encoding="utf8", backupCount=1) 40 | h1.setFormatter(formatter) 41 | f1 = SingleLevelFilter(logging.INFO, False) 42 | h1.addFilter(f1) 43 | root_logger.addHandler(h1) 44 | 45 | # error 46 | h1 = RotatingFileHandler("%s/error.log" % (log_dir, ), mode="a", maxBytes=max_log_file_bytes, 47 | encoding="utf8", backupCount=1) 48 | h1.setFormatter(formatter) 49 | f1 = SingleLevelFilter(logging.ERROR, False) 50 | h1.addFilter(f1) 51 | root_logger.addHandler(h1) 52 | # logging.info("log dir set to: %s" % (log_dir, )) 53 | return True 54 | return False 55 | 56 | 57 | def remove_log(): 58 | root_logger = logging.getLogger() 59 | root_logger.handlers.clear() 60 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/RedisWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | import random 4 | import traceback 5 | import json 6 | import zlib 7 | from .BaseWriter import BaseWriter 8 | 9 | 10 | class RedisWriter(BaseWriter): 11 | def __init__(self, config): 12 | 
super().__init__() 13 | self.config = config 14 | self.total_miss_count = 0 15 | self.success_count = 0 16 | 17 | def encode(self, dict_object): 18 | string = json.dumps(dict_object) 19 | if self.config.compress: 20 | string = zlib.compress(string.encode(self.config.encoding)) 21 | return string 22 | 23 | async def write(self, responses): 24 | await self.config.get_redis_pool_cli() # init redis pool 25 | miss_count = 0 26 | target_responses = list() 27 | for each_response in responses: 28 | if self.config.filter: 29 | each_response = self.config.filter(each_response) 30 | if not each_response: 31 | miss_count += 1 32 | continue 33 | target_responses.append(each_response) 34 | self.success_count += 1 35 | self.total_miss_count += miss_count 36 | if target_responses: 37 | try_time = 0 38 | while try_time < self.config.max_retry: 39 | try: 40 | if self.config.is_range: 41 | await self.config.redis_write_method(self.config.key, *(self.encode(i) for i in target_responses)) 42 | else: 43 | pipe_line = self.config.redis_pool_cli.pipeline() 44 | for each in responses: 45 | pipe_line.hset(self.config.key, each["id"], self.encode(each)) 46 | await pipe_line.execute() 47 | 48 | logging.info("%s write %d item, filtered %d item" % (self.config.name, len(responses), miss_count)) 49 | break 50 | except Exception as e: 51 | try_time += 1 52 | if try_time >= self.config.max_retry: 53 | logging.error("Fail to write after try: %d times, Write 0 items to redis, " 54 | "filtered %d item before write, error: %s" % 55 | (self.config.max_retry, miss_count, str(traceback.format_exc()))) 56 | else: 57 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 58 | else: 59 | logging.info("Write 0 items to %s, filtered: %d, (all filtered, or pass empty result)" % (self.config.name, miss_count)) 60 | 61 | def __exit__(self, exc_type, exc_val, exc_tb): 62 | logging.info("%s write done, total filtered %d item, total write %d item" % 63 | (self.config.name, self.total_miss_count, self.success_count)) 64 | 65 | def __enter__(self): 66 | return self 67 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/KafkaWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .BaseWriter import BaseWriter 3 | 4 | import asyncio 5 | import confluent_kafka 6 | from confluent_kafka import KafkaException 7 | from threading import Thread 8 | import json 9 | 10 | 11 | 12 | class AIOProducer: 13 | def __init__(self, configs, loop=None): 14 | self._loop = loop or asyncio.get_event_loop() 15 | self._producer = confluent_kafka.Producer(configs) 16 | self._cancelled = False 17 | self._poll_thread = Thread(target=self._poll_loop) 18 | self._poll_thread.start() 19 | 20 | def _poll_loop(self): 21 | while not self._cancelled: 22 | self._producer.poll(0.1) 23 | 24 | def close(self): 25 | self._cancelled = True 26 | self._poll_thread.join() 27 | 28 | def produce(self, topic, value): 29 | """ 30 | An awaitable produce method. 
31 | """ 32 | result = self._loop.create_future() 33 | 34 | def ack(err, msg): 35 | if err: 36 | self._loop.call_soon_threadsafe(result.set_exception, KafkaException(err)) 37 | else: 38 | self._loop.call_soon_threadsafe(result.set_result, msg) 39 | self._producer.produce(topic, value, on_delivery=ack) 40 | return result 41 | 42 | def produce2(self, topic, value, on_delivery): 43 | """ 44 | A produce method in which delivery notifications are made available 45 | via both the returned future and on_delivery callback (if specified). 46 | """ 47 | result = self._loop.create_future() 48 | 49 | def ack(err, msg): 50 | if err: 51 | self._loop.call_soon_threadsafe( 52 | result.set_exception, KafkaException(err)) 53 | else: 54 | self._loop.call_soon_threadsafe( 55 | result.set_result, msg) 56 | if on_delivery: 57 | self._loop.call_soon_threadsafe( 58 | on_delivery, err, msg) 59 | self._producer.produce(topic, value, on_delivery=ack) 60 | return result 61 | 62 | 63 | class KafkaWriter(BaseWriter): 64 | def __init__(self, config, topic, loop=None): 65 | super().__init__() 66 | self.topic = topic 67 | self.total_miss_count = 0 68 | self.success_count = 0 69 | self.producer = AIOProducer(configs={"bootstrap.servers": config.bootstrap_servers}, loop=loop) 70 | 71 | async def write(self, responses): 72 | for each_response in responses: 73 | if isinstance(each_response, dict): 74 | each_response = json.dumps(each_response, indent=2).encode('utf-8') 75 | await self.producer.produce(self.topic, each_response) 76 | self.success_count += 1 77 | logging.info("%s write %d item" % (self.topic, len(responses))) 78 | 79 | def __exit__(self, exc_type, exc_val, exc_tb): 80 | self.producer.close() 81 | logging.info("%s write done, total write %d item" % 82 | (self.topic, self.success_count)) 83 | 84 | def __enter__(self): 85 | return self 86 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/CSVGetter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import logging 4 | from .BaseGetter import BaseGetter 5 | 6 | if sys.platform == "linux": 7 | csv.field_size_limit(sys.maxsize) 8 | 9 | 10 | class CSVGetter(BaseGetter): 11 | def __init__(self, config): 12 | super().__init__() 13 | self.config = config 14 | self.f_in = open(self.config.filename, self.config.mode, encoding=self.config.encoding) 15 | self.reader = csv.DictReader(self.f_in) 16 | 17 | self.done = False 18 | self.responses = list() 19 | self.miss_count = 0 20 | self.total_count = 0 21 | 22 | def init_val(self): 23 | self.done = False 24 | self.responses = list() 25 | self.f_in.seek(0, 0) 26 | self.miss_count = 0 27 | self.total_count = 0 28 | 29 | def __aiter__(self): 30 | return self 31 | 32 | async def __anext__(self): 33 | if self.done: 34 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 35 | (self.config.filename, self.total_count, self.miss_count)) 36 | self.init_val() 37 | raise StopAsyncIteration 38 | 39 | for row in self.reader: 40 | if self.config.max_limit and self.total_count > self.config.max_limit: 41 | self.done = True 42 | return self.clear_and_return() 43 | 44 | self.total_count += 1 45 | if self.config.filter: 46 | row = self.config.filter(row) 47 | if not row: 48 | self.miss_count += 1 49 | continue 50 | 51 | self.responses.append(row) 52 | if len(self.responses) > self.config.per_limit: 53 | return self.clear_and_return() 54 | 55 | if self.responses: 56 | 
self.done = True 57 | return self.clear_and_return() 58 | 59 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 60 | (self.config.filename, self.total_count, self.miss_count)) 61 | self.init_val() 62 | raise StopAsyncIteration 63 | 64 | def __iter__(self): 65 | for row in self.reader: 66 | if self.config.max_limit and self.total_count > self.config.max_limit: 67 | self.done = True 68 | yield self.clear_and_return() 69 | break 70 | 71 | self.total_count += 1 72 | if self.config.filter: 73 | row = self.config.filter(row) 74 | if not row: 75 | self.miss_count += 1 76 | continue 77 | 78 | self.responses.append(row) 79 | if len(self.responses) > self.config.per_limit: 80 | yield self.clear_and_return() 81 | 82 | if self.responses: 83 | yield self.responses 84 | 85 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 86 | (self.config.filename, self.total_count, self.miss_count)) 87 | self.init_val() 88 | 89 | def clear_and_return(self): 90 | resp = self.responses 91 | self.responses = list() 92 | return resp 93 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/ProcessFactory.py: -------------------------------------------------------------------------------- 1 | # config 2 | from .Config.MainConfig import main_config 3 | 4 | from .Config.ConfigUtil import GetterConfig 5 | from .Config.ConfigUtil import WriterConfig 6 | 7 | from .DataGetter.ESGetter import ESScrollGetter 8 | from .DataGetter.CSVGetter import CSVGetter 9 | from .DataGetter.APIGetter import APIGetter, APIBulkGetter 10 | from .DataGetter.JsonGetter import JsonGetter 11 | from .DataGetter.XLSXGetter import XLSXGetter 12 | from .DataGetter.RedisGetter import RedisGetter 13 | from .DataGetter.MySQLGetter import MySQLGetter 14 | from .DataGetter.MongoGetter import MongoGetter 15 | 16 | from .DataWriter.CSVWriter import CSVWriter 17 | from .DataWriter.ESWriter import ESWriter 18 | from .DataWriter.JsonWriter import JsonWriter 19 | from .DataWriter.TXTWriter import TXTWriter 20 | from .DataWriter.XLSXWriter import XLSXWriter 21 | from .DataWriter.RedisWriter import RedisWriter 22 | from .DataWriter.MySQLWriter import MySQLWriter 23 | from .DataWriter.MongoWriter import MongoWriter 24 | from .DataWriter.KafkaWriter import KafkaWriter 25 | 26 | 27 | class ProcessFactory(object): 28 | config_getter_map = { 29 | GetterConfig.RAPIConfig: APIGetter, 30 | GetterConfig.RCSVConfig: CSVGetter, 31 | GetterConfig.RESConfig: ESScrollGetter, 32 | GetterConfig.RJsonConfig: JsonGetter, 33 | GetterConfig.RXLSXConfig: XLSXGetter, 34 | GetterConfig.RAPIBulkConfig: APIBulkGetter, 35 | GetterConfig.RRedisConfig: RedisGetter, 36 | GetterConfig.RMySQLConfig: MySQLGetter, 37 | GetterConfig.RMongoConfig: MongoGetter 38 | } 39 | 40 | config_writer_map = { 41 | WriterConfig.WCSVConfig: CSVWriter, 42 | WriterConfig.WESConfig: ESWriter, 43 | WriterConfig.WJsonConfig: JsonWriter, 44 | WriterConfig.WTXTConfig: TXTWriter, 45 | WriterConfig.WXLSXConfig: XLSXWriter, 46 | WriterConfig.WRedisConfig: RedisWriter, 47 | WriterConfig.WMySQLConfig: MySQLWriter, 48 | WriterConfig.WMongoConfig: MongoWriter, 49 | WriterConfig.WKafkaConfig: KafkaWriter 50 | } 51 | 52 | @staticmethod 53 | def create_getter(config): 54 | """ 55 | create a getter based on config 56 | :return: getter 57 | """ 58 | for config_class, getter_class in ProcessFactory.config_getter_map.items(): 59 | if isinstance(config, config_class): 60 | return getter_class(config) 61 | raise 
ValueError("create_getter must pass one of the instance of [RAPIConfig, RCSVConfig, RESConfig, " 62 | "RJsonConfig, RXLSXConfig, RAPIBulkConfig, RRedisConfig, RMySQLConfig, RMongoConfig]") 63 | 64 | @staticmethod 65 | def create_writer(config, **kwargs): 66 | """ 67 | create a writer based on config 68 | :return: a writer 69 | """ 70 | for config_class, writer_class in ProcessFactory.config_writer_map.items(): 71 | if isinstance(config, config_class): 72 | return writer_class(config, **kwargs) 73 | else: 74 | raise ValueError("create_writer must pass one of the instance of [WCSVConfig, WESConfig, WJsonConfig, " 75 | "WTXTConfig, WXLSXConfig, WRedisConfig, WMySQLConfig, WMongoConfig]") 76 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/DefaultValue.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | from .MainConfig import main_config 4 | 5 | 6 | class DefaultValObject(object): 7 | def __init__(self): 8 | self.refresh() 9 | 10 | def refresh(self): 11 | self.main_config = main_config() 12 | self.per_limit = self.main_config["main"].getint("per_limit") 13 | self.max_limit = self.main_config["main"].get("max_limit") 14 | if self.max_limit != "None": 15 | self.max_limit = int(self.max_limit) 16 | else: 17 | self.max_limit = None 18 | self.max_retry = self.main_config["main"].getint("max_retry") 19 | self.random_min_sleep = self.main_config["main"].getint("random_min_sleep") 20 | self.random_max_sleep = self.main_config["main"].getint("random_max_sleep") 21 | 22 | # redis 23 | self.redis_host = self.main_config["redis"].get("host") 24 | self.redis_port = self.main_config["redis"].getint("port") 25 | self.redis_db = self.main_config["redis"].get("db") 26 | self.redis_password = self.main_config["redis"].get("password") 27 | self.redis_timeout = self.main_config["redis"].getint("timeout") 28 | self.redis_encoding = self.main_config["redis"].get("encoding") 29 | self.redis_direction = self.main_config["redis"].get("direction") 30 | self.redis_compress = self.main_config["redis"].getboolean("compress") 31 | self.redis_need_del = self.main_config["redis"].getboolean("need_del") 32 | 33 | # mysql config 34 | self.mysql_host = self.main_config["mysql"].get("host") 35 | self.mysql_port = self.main_config["mysql"].getint("port") 36 | self.mysql_user = self.main_config["mysql"].get("user") 37 | self.mysql_password = self.main_config["mysql"].get("password") 38 | self.mysql_database = self.main_config["mysql"].get("database") 39 | self.mysql_encoding = self.main_config["mysql"].get("encoding") 40 | if not self.mysql_encoding: 41 | self.mysql_encoding = self.default_encoding 42 | 43 | # mongo config 44 | self.mongo_host = self.main_config["mongo"].get("host") 45 | self.mongo_port = self.main_config["mongo"].getint("port") 46 | self.mongo_username = self.main_config["mongo"].get("username") 47 | self.mongo_password = self.main_config["mongo"].get("password") 48 | self.mongo_database = self.main_config["mongo"].get("database") 49 | self.mongo_protocol = self.main_config["mongo"].get("protocol") 50 | self.mongo_other_params = self.main_config["mongo"].get("other_params") 51 | 52 | # kafka config 53 | self.kafka_bootstrap_servers = self.main_config["kafka"].get("bootstrap.servers") 54 | 55 | default_file_mode_r = "r" 56 | default_file_mode_w = "w" 57 | default_encoding = "utf8" 58 | new_line = "\n" 59 | join_val = " " 60 | title = "example" 61 | qsn = None 62 | 63 | 
query_body = None 64 | dest_without_path = "result" 65 | dest = os.getcwd() + "/" + dest_without_path 66 | interval = 5 67 | concurrency = 50 68 | default_key_type = "LIST" 69 | default_quote_char = '"' 70 | report_interval = 10 71 | success_ret_code = ("100002", "100301", "100103") 72 | trim_to_max_limit = False 73 | exclude_filtered_to_max_limit = True 74 | 75 | @staticmethod 76 | def default_id_hash_func(item): 77 | if "appCode" in item and item["appCode"] and "id" in item and item["id"]: 78 | value = (item["appCode"] + "_" + item["id"]).encode("utf8") 79 | else: 80 | value = str(item).encode("utf8") 81 | return hashlib.md5(value).hexdigest() 82 | 83 | 84 | DefaultVal = DefaultValObject() 85 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/ESWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | import random 4 | from .BaseWriter import BaseWriter 5 | from ..Config.MainConfig import main_config 6 | 7 | 8 | class ESWriter(BaseWriter): 9 | def __init__(self, config): 10 | if not main_config.has_es_configured: 11 | raise ValueError("You must config es_hosts before using ESWriter, Please edit configure file: %s" % (main_config.ini_path, )) 12 | 13 | super().__init__() 14 | self.config = config 15 | self.total_miss_count = 0 16 | self.success_count = 0 17 | self.fail_count = 0 18 | 19 | async def write(self, responses): 20 | response = None # something to return 21 | origin_length = len(responses) 22 | if self.config.filter: 23 | responses = [self.config.filter(i) for i in responses] 24 | responses = [i for i in responses if i] 25 | miss_count = origin_length - len(responses) 26 | self.total_miss_count += miss_count 27 | if responses: 28 | if self.config.expand: 29 | responses = [self.expand_dict(i) for i in responses] 30 | try_time = 0 31 | while try_time < self.config.max_retry: 32 | success, fail, response = await self.config.es_client.add_dict_to_es( 33 | self.config.indices, self.config.doc_type, responses, 34 | self.config.id_hash_func, self.config.app_code, 35 | self.config.actions, self.config.create_date, 36 | self.config.error_if_fail, self.config.timeout, self.config.auto_insert_createDate) 37 | if response is not None: 38 | self.success_count += success 39 | self.fail_count += fail 40 | logging.info("Write %d items to index: %s, doc_type: %s, fail: %d, filtered: %d" % ( 41 | len(responses), self.config.indices, self.config.doc_type, fail, miss_count)) 42 | break 43 | else: 44 | # exception happened 45 | try_time += 1 46 | if try_time >= self.config.max_retry: 47 | logging.error("Fail to write after try: %d times, Write 0 items to index: %s, doc_type: %s" % 48 | (self.config.max_retry, self.config.indices, self.config.doc_type)) 49 | else: 50 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 51 | else: 52 | # all filtered, or pass empty result 53 | logging.info("Write 0 items to index: %s, doc_type: %s (all filtered, or pass empty result)" % (self.config.indices, self.config.doc_type)) 54 | return response 55 | 56 | async def delete_all(self, body=None): 57 | """ 58 | inefficient delete 59 | """ 60 | if not body: 61 | body = { 62 | "query": { 63 | "match_all": {} 64 | } 65 | } 66 | result = await self.config.es_client.delete_by_query(index=self.config.indices, doc_type=self.config.doc_type, 67 | body=body, params={"conflicts": "proceed"}) 68 | return result 69 | 70 | def 
__enter__(self): 71 | return self 72 | 73 | def __exit__(self, exc_type, exc_val, exc_tb): 74 | logging.info("%s->%s write done, total filtered %d item, total write %d item, total fail: %d item" % 75 | (self.config.indices, self.config.doc_type, self.total_miss_count, self.success_count, 76 | self.fail_count)) 77 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/MongoWriter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import random 4 | import logging 5 | import traceback 6 | from .BaseWriter import BaseWriter 7 | 8 | InsertOne = DeleteMany = ReplaceOne = UpdateOne = None 9 | try: 10 | from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne 11 | except Exception: 12 | pass 13 | 14 | 15 | class MongoWriter(BaseWriter): 16 | def __init__(self, config): 17 | super().__init__() 18 | self.config = config 19 | self.total_miss_count = 0 20 | self.success_count = 0 21 | self.table_checked = False 22 | self.key_fields = list() 23 | 24 | async def write(self, responses): 25 | self.config.get_mongo_cli() # init mongodb pool 26 | 27 | miss_count = 0 28 | original_length = len(responses) 29 | if self.config.filter: 30 | target_responses = list() 31 | for i in responses: 32 | i = self.config.filter(i) 33 | if i: 34 | target_responses.append(i) 35 | else: 36 | miss_count += 1 37 | responses = target_responses 38 | 39 | if not responses: 40 | self.finish_once(miss_count, original_length) 41 | return 42 | 43 | # After filtered, still have responses to write 44 | if await self.perform_write(responses): 45 | self.finish_once(miss_count, original_length) 46 | 47 | def __exit__(self, exc_type, exc_val, exc_tb): 48 | logging.info("%s write done, total filtered %d item, total write %d item" % 49 | (self.config.name, self.total_miss_count, self.success_count)) 50 | 51 | def __enter__(self): 52 | return self 53 | 54 | def finish_once(self, miss_count, original_length): 55 | self.total_miss_count += miss_count 56 | self.success_count += original_length 57 | logging.info("%s write %d item, filtered %d item" % (self.config.name, original_length - miss_count, miss_count)) 58 | 59 | async def perform_write(self, responses): 60 | try_time = 0 61 | for each in responses: 62 | if self.config.auto_insert_createDate and self.config.createDate is not None: 63 | each["createDate"] = self.config.createDate 64 | if "_id" not in each: 65 | each["_id"] = self.config.id_hash_func(each) 66 | 67 | while try_time < self.config.max_retry: 68 | try: 69 | if UpdateOne is not None: 70 | await self.config.collection_cli.bulk_write([UpdateOne({'_id': each["_id"]}, {"$set": each}, upsert=True) for each in responses]) 71 | else: 72 | bulk = self.config.collection_cli.initialize_ordered_bulk_op() 73 | for each in responses: 74 | bulk.find({"_id": each["_id"]}).upsert().replace_one(each) 75 | await bulk.execute() 76 | return True 77 | except Exception as e: 78 | try_time += 1 79 | if try_time < self.config.max_retry: 80 | logging.error("retry: %d, %s" % (try_time, str(e))) 81 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 82 | else: 83 | logging.error("Give up MongoWriter writer: %s, After retry: %d times, still fail to write, " 84 | "total write %d items, total filtered: %d items, reason: %s" % 85 | (self.config.name, self.config.max_retry, self.success_count, self.total_miss_count, 86 | str(traceback.format_exc()))) 87 | return 
False 88 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/JsonGetter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from .BaseGetter import BaseGetter 4 | 5 | 6 | class JsonGetter(BaseGetter): 7 | def __init__(self, config): 8 | super().__init__(self) 9 | self.config = config 10 | self.responses = list() 11 | self.done = False 12 | self.f_in = open(self.config.filename, self.config.mode, encoding=self.config.encoding) 13 | self.miss_count = 0 14 | self.total_count = 0 15 | 16 | def init_val(self): 17 | self.responses = list() 18 | self.done = False 19 | self.f_in.seek(0, 0) 20 | self.miss_count = 0 21 | self.total_count = 0 22 | 23 | def __aiter__(self): 24 | return self 25 | 26 | async def __anext__(self): 27 | if self.done: 28 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 29 | (self.config.filename, self.total_count, self.miss_count)) 30 | self.init_val() 31 | raise StopAsyncIteration 32 | 33 | for line in self.f_in: 34 | if self.config.max_limit and self.total_count > self.config.max_limit: 35 | self.done = True 36 | return self.clear_and_return() 37 | 38 | self.total_count += 1 39 | try: 40 | json_obj = json.loads(line) 41 | except json.decoder.JSONDecodeError: 42 | logging.error("JSONDecodeError. give up. line: %d" % (self.total_count, )) 43 | continue 44 | 45 | if self.config.filter: 46 | json_obj = self.config.filter(json_obj) 47 | if not json_obj: 48 | self.miss_count += 1 49 | continue 50 | 51 | self.responses.append(json_obj) 52 | 53 | if len(self.responses) > self.config.per_limit: 54 | return self.clear_and_return() 55 | 56 | self.done = True 57 | if self.responses: 58 | return self.clear_and_return() 59 | 60 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 61 | (self.config.filename, self.total_count, self.miss_count)) 62 | self.init_val() 63 | raise StopAsyncIteration 64 | 65 | def __iter__(self): 66 | for line in self.f_in: 67 | if self.config.max_limit and self.total_count > self.config.max_limit: 68 | self.done = True 69 | yield self.clear_and_return() 70 | break 71 | 72 | self.total_count += 1 73 | try: 74 | json_obj = json.loads(line) 75 | except json.decoder.JSONDecodeError: 76 | logging.error("JSONDecodeError. give up. 
line: %d" % (self.total_count, )) 77 | continue 78 | 79 | if self.config.filter: 80 | json_obj = self.config.filter(json_obj) 81 | if not json_obj: 82 | self.miss_count += 1 83 | continue 84 | 85 | self.responses.append(json_obj) 86 | 87 | if len(self.responses) > self.config.per_limit: 88 | yield self.clear_and_return() 89 | 90 | if self.responses: 91 | yield self.clear_and_return() 92 | 93 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 94 | (self.config.filename, self.total_count, self.miss_count)) 95 | self.init_val() 96 | 97 | def __del__(self): 98 | self.f_in.close() 99 | 100 | def clear_and_return(self): 101 | resp = self.responses 102 | self.responses = list() 103 | return resp 104 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/CSVWriter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import types 4 | import logging 5 | from .BaseWriter import BaseWriter 6 | 7 | 8 | class CSVWriter(BaseWriter): 9 | def __init__(self, config): 10 | super().__init__() 11 | self.config = config 12 | self.file_already_exists = os.path.exists(self.config.filename) and os.path.getsize(self.config.filename) 13 | self.f_out = open(self.config.filename, self.config.mode, encoding=self.config.encoding, newline="") 14 | self.f_csv = None 15 | self.headers = dict() if not self.config.headers else self.config.headers 16 | self.total_miss_count = 0 17 | self.success_count = 0 18 | # logging.info("self.config.quotechar: %s, %s", self.config.quotechar, repr(self.config.quotechar)) 19 | 20 | def write(self, responses): 21 | miss_count = 0 22 | 23 | # filter 24 | if self.config.filter: 25 | new_result = list() 26 | for each_response in responses: 27 | each_response = self.config.filter(each_response) 28 | if not each_response: 29 | miss_count += 1 30 | continue 31 | new_result.append(each_response) 32 | responses = new_result 33 | self.total_miss_count += miss_count 34 | 35 | # all filtered 36 | if not responses: 37 | logging.info("%s write 0 item, filtered %d item" % (self.config.filename, miss_count)) 38 | return 39 | 40 | # expand 41 | if self.config.expand: 42 | responses = [self.expand_dict(i, max_expand=self.config.expand) for i in responses] 43 | else: 44 | responses = [i for i in responses] if isinstance(responses, types.GeneratorType) else responses 45 | 46 | # headers 47 | if not self.f_csv: 48 | if "a" in self.config.mode and self.file_already_exists: 49 | self.headers = self.generate_headers(responses, append_mode=True) 50 | self.f_csv = csv.DictWriter(self.f_out, self.headers, quotechar=self.config.quotechar) 51 | else: 52 | if not self.headers: 53 | self.headers = self.generate_headers(responses) 54 | self.f_csv = csv.DictWriter(self.f_out, self.headers, quotechar=self.config.quotechar) 55 | self.f_csv.writeheader() 56 | 57 | # encoding process 58 | for each_response in responses: 59 | for k, v in each_response.items(): 60 | if v is None: 61 | each_response[k] = "" 62 | 63 | elif self.config.qsn and v != "" and (isinstance(v, (int, float)) or isinstance(v, str) and all(i.isdigit() for i in v)): 64 | each_response[k] = repr(str(v)) 65 | 66 | elif self.config.encoding not in ("utf8", "utf-8"): 67 | each_response[k] = str(v).encode(self.config.encoding, "ignore").decode(self.config.encoding) 68 | 69 | self.success_count += 1 70 | self.f_csv.writerow(each_response) 71 | logging.info("%s write %d item, filtered %d item" % 
(self.config.filename, len(responses), miss_count)) 72 | 73 | def generate_headers(self, responses, append_mode=False): 74 | headers = set() 75 | for r in responses: 76 | for key in r.keys(): 77 | headers.add(key) 78 | 79 | if append_mode: 80 | f_in = open(self.config.filename, "r", encoding=self.config.encoding, newline="") 81 | reader = csv.DictReader(f_in) 82 | exists_fields = reader.fieldnames 83 | if set(exists_fields) != headers: 84 | raise ValueError("append mode for csv file: %s, but header field mismatch, exist fields: %s, generated fields: %s" % (self.config.filename, repr(exists_fields), repr(headers))) 85 | return exists_fields 86 | return list(headers) 87 | 88 | def __enter__(self): 89 | return self 90 | 91 | def __exit__(self, exc_type, exc_val, exc_tb): 92 | self.f_out.close() 93 | logging.info("%s write done, total filtered %d item, total write %d item" % 94 | (self.config.filename, self.total_miss_count, self.success_count)) 95 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/XLSXWriter.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import logging 4 | from openpyxl import Workbook, load_workbook 5 | from .BaseWriter import BaseWriter 6 | from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE 7 | from ..Config.DefaultValue import DefaultVal 8 | 9 | _warning = False 10 | 11 | 12 | class XLSXWriter(BaseWriter): 13 | def __init__(self, config): 14 | global _warning 15 | super().__init__() 16 | self.config = config 17 | self.col_dict = dict() 18 | self.row = 2 19 | # headers 20 | self.header_generated = False 21 | self.file_already_exists = os.path.exists(self.config.filename) 22 | if "a" in self.config.mode and self.file_already_exists: 23 | self.wb = load_workbook(filename=self.config.filename, read_only=False) 24 | self.generate_header(from_file=True) 25 | else: 26 | self.wb = Workbook() 27 | self.ws1 = self.wb.active 28 | self.ws1.title = config.title 29 | self.total_miss_count = 0 30 | self.success_count = 0 31 | if not _warning: 32 | logging.warning("XLSXWriter will actually write to file when __exit__ of XLSXWriter called") 33 | _warning = True 34 | 35 | def write(self, responses): 36 | if not self.header_generated and self.config.headers: 37 | self.generate_header() 38 | 39 | miss_count = 0 40 | for each_response in responses: 41 | if self.config.expand: 42 | each_response = self.expand_dict(each_response, max_expand=self.config.expand) 43 | if self.config.filter: 44 | each_response = self.config.filter(each_response) 45 | if not each_response: 46 | miss_count += 1 47 | continue 48 | 49 | for key, value in each_response.items(): 50 | if key not in self.col_dict: 51 | self.col_dict[key] = len(self.col_dict) + 1 52 | self.ws1.cell(row=1, column=self.col_dict[key], value=key) 53 | value = str(value) if value is not None else "" 54 | try: 55 | self.ws1.cell(row=self.row, column=self.col_dict[key], value=value) 56 | except Exception: 57 | new_value = re.sub(ILLEGAL_CHARACTERS_RE, "", value) 58 | logging.warning("row num: %d, key: %s, value: %s contains illegal characters, " 59 | "replaced illegal characters to: %s" % (self.row, key, value, new_value)) 60 | self.ws1.cell(row=self.row, column=self.col_dict[key], value=new_value) 61 | 62 | self.row += 1 63 | self.success_count += 1 64 | logging.info("%s write %d item, filtered %d item" % (self.config.filename, len(responses), miss_count)) 65 | 66 | def __exit__(self, exc_type, exc_val, 
exc_tb): 67 | self.wb.save(filename=self.config.filename) 68 | self.wb.close() 69 | logging.info("%s write done, total filtered %d item, total write %d item" % 70 | (self.config.filename, self.total_miss_count, self.success_count)) 71 | 72 | def __enter__(self): 73 | return self 74 | 75 | def generate_header(self, from_file=False): 76 | if from_file: 77 | if not self.wb.worksheets: 78 | # empty file 79 | return 80 | sheet = self.wb.worksheets[self.config.sheet_index] 81 | row_iter = sheet.rows 82 | try: 83 | row = next(row_iter) 84 | for each in row: 85 | self.col_dict[each.value] = len(self.col_dict) + 1 86 | except StopIteration: 87 | # empty file 88 | return 89 | if len(self.col_dict) == 1 and list(self.col_dict.keys())[0] is None: 90 | # empty file 91 | self.col_dict.clear() 92 | return 93 | max_row = sheet.max_row 94 | self.row = max_row + 1 95 | else: 96 | for key in self.config.headers: 97 | self.col_dict[key] = len(self.col_dict) + 1 98 | self.ws1.cell(row=1, column=self.col_dict[key], value=key) 99 | 100 | self.header_generated = True 101 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/XLSXGetter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from openpyxl import load_workbook 3 | from .BaseGetter import BaseGetter 4 | 5 | 6 | class XLSXGetter(BaseGetter): 7 | def __init__(self, config): 8 | super().__init__() 9 | self.config = config 10 | self.wb = load_workbook(filename=self.config.filename, read_only=True) 11 | if not self.wb.worksheets: 12 | raise ValueError("Empty file: %s" % (self.config.filename, )) 13 | self.sheet = self.wb.worksheets[self.config.sheet_index] 14 | self.row_iter = self.sheet.rows 15 | self.headers = self.generate_headers() 16 | 17 | self.max_row = self.sheet.max_row 18 | if self.config.max_limit and self.config.max_limit > self.max_row: 19 | self.max_row = self.config.max_limit + 1 # add first headers 20 | 21 | self.row_num = 0 22 | self.responses = list() 23 | self.curr_size = 0 24 | self.done = False 25 | self.miss_count = 0 26 | self.total_count = 0 27 | 28 | def init_val(self): 29 | self.row_num = 0 30 | self.responses = list() 31 | self.curr_size = 0 32 | self.done = False 33 | self.miss_count = 0 34 | self.total_count = 0 35 | 36 | self.row_iter = self.sheet.rows 37 | 38 | def __aiter__(self): 39 | return self 40 | 41 | async def __anext__(self): 42 | if self.done: 43 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 44 | (self.config.filename, self.total_count, self.miss_count)) 45 | self.init_val() 46 | raise StopAsyncIteration 47 | 48 | while self.row_num < self.max_row: 49 | if self.row_num == 0: 50 | self.row_num += 1 51 | continue 52 | 53 | self.row_num += 1 54 | self.total_count += 1 55 | row = self.get_next_row() 56 | if self.config.filter: 57 | row = self.config.filter(row) 58 | if not row: 59 | self.miss_count += 1 60 | continue 61 | self.responses.append(row) 62 | if len(self.responses) > self.config.per_limit: 63 | self.curr_size += len(self.responses) 64 | return self.clear_and_return() 65 | 66 | if self.responses: 67 | self.done = True 68 | return self.clear_and_return() 69 | 70 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 71 | (self.config.filename, self.total_count, self.miss_count)) 72 | self.init_val() 73 | raise StopAsyncIteration 74 | 75 | def generate_headers(self): 76 | keys = list() 77 | try: 78 | row = 
next(self.row_iter) 79 | for each in row: 80 | keys.append(each.value) 81 | except StopIteration: 82 | pass 83 | return keys 84 | 85 | def get_next_row(self): 86 | ret_item = dict() 87 | r = next(self.row_iter) 88 | for key, cell in zip(self.headers, r): 89 | ret_item[key] = cell.value 90 | return ret_item 91 | 92 | def __iter__(self): 93 | for row_num in range(self.max_row): 94 | if row_num == 0: 95 | continue 96 | 97 | row_num += 1 98 | self.total_count += 1 99 | row = self.get_next_row() 100 | if self.config.filter: 101 | row = self.config.filter(row) 102 | if not row: 103 | self.miss_count += 1 104 | continue 105 | self.responses.append(row) 106 | if len(self.responses) > self.config.per_limit: 107 | self.curr_size += len(self.responses) 108 | yield self.clear_and_return() 109 | 110 | if self.responses: 111 | yield self.responses 112 | 113 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 114 | (self.config.filename, self.total_count, self.miss_count)) 115 | self.init_val() 116 | 117 | def __del__(self): 118 | self.wb.close() 119 | 120 | def clear_and_return(self): 121 | resp = self.responses 122 | self.responses = list() 123 | return resp 124 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/MongoGetter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import traceback 3 | import random 4 | import logging 5 | from .BaseGetter import BaseGetter 6 | 7 | 8 | class MongoGetter(BaseGetter): 9 | def __init__(self, config): 10 | super().__init__(self) 11 | self.config = config 12 | self.responses = list() 13 | self.miss_count = 0 14 | self.total_count = 0 15 | self.total_size = None 16 | self.need_finish = False 17 | 18 | def init_val(self): 19 | self.responses = list() 20 | self.miss_count = 0 21 | self.total_count = 0 22 | self.total_size = None 23 | self.need_finish = False 24 | 25 | def __aiter__(self): 26 | return self 27 | 28 | async def __anext__(self): 29 | self.config.get_mongo_cli() # init mongo pool 30 | 31 | if self.need_finish: 32 | await self.finish() 33 | 34 | if self.total_size is None: 35 | self.total_size = await self.get_total_size() 36 | 37 | if self.total_count < self.total_size: 38 | await self.fetch_per_limit() 39 | return self.clear_and_return() 40 | 41 | # reach here, means done 42 | await self.finish() 43 | 44 | def __iter__(self): 45 | raise ValueError("MongoGetter must be used with async generator, not normal generator") 46 | 47 | async def finish(self): 48 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 49 | (self.config.name, self.total_count, self.miss_count)) 50 | self.init_val() 51 | raise StopAsyncIteration 52 | 53 | async def get_total_size(self): 54 | if hasattr(self.config.cursor, "count"): 55 | size = await self.config.cursor.count() 56 | else: 57 | size = await self.config.client[self.config.database][self.config.collection].count_documents({} if not self.config.query_body else self.config.query_body) 58 | size = min(size, self.config.max_limit if self.config.max_limit is not None else size) 59 | if size == 0: 60 | await self.finish() 61 | return size 62 | 63 | async def fetch_per_limit(self): 64 | curr_size = 0 65 | try_time = 0 66 | get_all = True 67 | 68 | while try_time < self.config.max_retry: 69 | try: 70 | async for document in self.config.cursor: 71 | curr_size += 1 72 | self.responses.append(document) 73 | if curr_size >= 
self.config.per_limit: 74 | get_all = False 75 | break 76 | if get_all: 77 | # get all item 78 | if self.total_count + curr_size < self.total_size: 79 | logging.error("get all items: %d, but not reach 'total_size': %d" % (self.total_count + curr_size, self.total_size)) 80 | self.need_finish = True 81 | break 82 | except Exception as e: 83 | try_time += 1 84 | if try_time < self.config.max_retry: 85 | logging.error("retry: %d, %s" % (try_time, str(e))) 86 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 87 | else: 88 | logging.error("Give up MongoGetter getter: %s, After retry: %d times, still fail, " 89 | "total get %d items, total filtered: %d items, reason: %s" % 90 | (self.config.name, self.config.max_retry, self.total_count, self.miss_count, 91 | str(traceback.format_exc()))) 92 | self.need_finish = True 93 | 94 | self.total_count += len(self.responses) 95 | 96 | curr_miss_count = 0 97 | if self.config.filter: 98 | target_results = list() 99 | for each in self.responses: 100 | each = self.config.filter(each) 101 | if each: 102 | target_results.append(each) 103 | else: 104 | curr_miss_count += 1 105 | self.responses = target_results 106 | self.miss_count += curr_miss_count 107 | 108 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 109 | (len(self.responses), self.config.name, curr_miss_count, 110 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 111 | 112 | def clear_and_return(self): 113 | resp = self.responses 114 | self.responses = list() 115 | return resp 116 | -------------------------------------------------------------------------------- /README_CN_simple.md: -------------------------------------------------------------------------------- 1 | # idataapi-transform 2 | 3 | **idataapi-transform** 是一个纯python实现的,所有功能均支持异步化处理的工具包,你可以使用他将数据从一个位置/格式方便的转换到另一个位置/格式,提供易用的**命令行调用支持**和功能丰富的**python模块支持** 4 | 5 | 该工具现用于 [IDataAPI](http://www.idataapi.cn/) 团队作为基础工具包,以提高效率 6 | 7 | ##### idataapi 转换工具(简洁版) 8 | 9 | ------------------- 10 | 11 | 以下是简单的命令行示例,如果需要详细的命令行示例/代码调用示例: 12 | 13 | * [详细中文说明戳这里](https://github.com/zpoint/idataapi-transform/blob/master/README_CN.md) 14 | 15 | ------------------- 16 | 17 | #### 环境要求 18 | * python 版本号 >= 3.5.2 19 | * 如果你需要使用 MySQL 模块, 你的 python 版本号要 >= 3.5.3 20 | * 如果你需要使用 MongoDB 模块,你需要在非 Windows 下 21 | 22 | ------------------- 23 | 24 | #### 安装指南 25 | 26 | python3 -m pip install idataapi-transform 27 | # 安装完成后在终端跑如下命令 28 | transform --help # 解释各个参数的作用以及创建默认的配置文件 29 | # 编辑配置文件 ~/idataapi-transform.ini 配置 ElasticSearch, redis, mysql 主机, 端口, 默认并发数等参数 30 | 31 | # 如果你的 python 版本 >= 3.5.3, 并且需要安装 MySQL 模块 32 | python3 -m pip install 'PyMySQL<=0.9.2,>=0.9' 33 | python3 -m pip install aiomysql 34 | 35 | # 如果你不在 Windows 下, 并且需要安装 MongoDB 模块 36 | python3 -m pip install motor 37 | 38 | 39 | ------------------- 40 | 41 | #### 命令行支持及示例 42 | 43 | * 从以下任意一格式读数据 **[API, ES, CSV, XLSX, JSON, Redis, MySQL, MongoDB]** 44 | * 写数据至以下任意一格式 **[CSV, XLSX, JSON, TXT, ES, Redis, MySQL, MongoDB, Kafka]** 45 | 46 | ##### 从 API 读取数据 转换为 XLSX 格式 47 | 48 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.xlsx** (默认参数) 49 | 50 | transform API xlsx "http://xxx/post/dengta?kw=中国石化&apikey=xxx" 51 | 52 | ##### 从 API 读取数据 转换为 XLSX 格式 53 | 54 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入到 /Users/zpoint/Desktop/result.xlsx 中, **写入文件为可选参数, 可以不填, 默认参数是 ./result.xlsx** 55 | 56 | transform API xlsx "http://xxx/post/dengta?kw=中国石化&apikey=xxx" "/Users/zpoint/Desktop/result" 57 | 58 | ##### 从 API 读取数据 转换为 CSV 格式 59 | 60 
| 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.csv** (默认参数) 61 | 62 | transform API csv "http://xxx/post/dengta?kw=中国石化&apikey=xxx" 63 | 64 | ##### 从 API 读取数据 转换为 CSV 格式 65 | 66 | w_encoding 表示写入文件的编码,默认为 utf8 67 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.csv** (默认参数), ./result.csv 以 gbk 编码保存 68 | 69 | transform API csv "http://xxx/post/dengta?kw=中国石化&apikey=xxx" --w_encoding=gbk 70 | 71 | 72 | ##### 从 API 读取数据 转换为 JSON 格式 73 | 74 | JSON 为一行一条数据的 JSON 文件 75 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.json** (默认参数) 76 | 77 | transform API json "http://xxx/post/dengta?kw=中国石化&apikey=xxx" 78 | 79 | ##### 从 API 读取数据 转换为 JSON 格式 80 | 81 | max_limit 表示最多只获取到这么多条数据 82 | 会从提供的http请求读取所有数据(翻到最后一页或者获取到超过100条为止), 并写入 **./result.json** (默认参数) 83 | 84 | transform API json "http://xxx/post/dengta?kw=中国石化&apikey=xxx" --max_limit=100 85 | 86 | ##### 从 CSV 读取数据 转换至 xlsx 87 | 88 | 会从 ./a.csv 读取数据, 并保存至 **./result.xlsx** 89 | 90 | transform CSV xlsx "./a.csv" 91 | 92 | 93 | ##### 从 Elasticsearch 读取数据 转换至 CSV (复杂示例) 94 | * 以 "gbk" **(--w_encoding)** 编码保存 CSV 文件 95 | * 指定 ES 的 index: knowledge20170517 **(knowledge20170517)** 96 | * 指定如下过滤条件 **(--query_body)** 97 | 98 | body = { 99 | "size": 100, 100 | "_source": { 101 | "includes": ["location", "title", "city", "id"] 102 | } 103 | } 104 | 105 | * 在写入 CSV 之前, 为每一条获取到的数据增加时间戳,以及移除 "city" 字段为空的对象 **(--filter)** 106 | 107 | # 创建一个文件叫做 my_filter.py (随便什么名字都行) 108 | import time 109 | def my_filter(item): # 函数名必须为 "my_filter" 110 | # item 是一条数据,在这里是一个字段对象 111 | item["createtime"] = int(time.time()) 112 | if item["city"]: 113 | return item # item 会被写入你指定的目的地 114 | # 执行到了这里, 说明返回 None, 这一条 item 会被抛弃,不会被写入目的地 115 | 116 | * 终端: 117 | 118 | transform ES csv "knowledge20170517" --w_encoding gbk --query_body '{"size": 100, "_source": {"includes": ["location", "title", "city", "id"]}}' --filter ./my_filter.py 119 | 120 | ##### 从 API 读取数据 存储至 Redis 121 | 122 | * 键名称为 my_key 123 | * redis 存储/读取 支持 LIST, 以及 HASH 两种数据结构, 默认为 LIST, 可用参数 --key_type 指明 124 | 125 | 会从 ./a.csv 读取数据, 并保存至 **./result.xlsx** 126 | 127 | transform API redis "http://xxx/post/dengta?kw=中国石化&apikey=xxx" "/Users/zpoint/Desktop/result" 128 | 129 | ##### 从 Redis 读取数据 存储至 csv 130 | 131 | 会从 my_key 中读取至多100条数据, 并保存至 **./result.csv** 132 | 133 | transform Redis csv my_key --max_limit 100 134 | 135 | ##### 从 API 读取数据 写入 MySQL 136 | 137 | * 当表格不存在是自动创建 138 | 139 | 会至多从API获取50条数据, 写入 MySQL 表格: **my_table** 140 | 141 | transform API MYSQL 'http://xxx' my_table --max_limit=50 142 | 143 | ##### 从 MySQL 读取数据 写入 redis 144 | 145 | 会从 MySQL 表格 **table** 获取数据,每次网络请求60条数据,写入 redis LIST 结构,默认键名称为 result 146 | 147 | transform MYSQL redis my_table --per_limit=60 148 | 149 | ##### 从 MongoDB 读取数据 写入 csv 150 | 151 | * 你也可以提供 --query_body 参数进行过滤查询 152 | 153 | 会从 my_coll 中读取至多50条数据, 并保存至 **./result.csv** 154 | 155 | transform MONGO csv my_coll --max_limit=50 156 | 157 | ------------------- 158 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/ESGetter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | import logging 4 | import traceback 5 | from .BaseGetter import BaseGetter 6 | 7 | 8 | class ESScrollGetter(BaseGetter): 9 | def __init__(self, config): 10 | super().__init__(self) 11 | self.config = config 12 | self.es_client = config.es_client 13 | 14 | self.total_size = None 15 | self.result = None 16 | self.scroll_id = None 17 | self.miss_count = 0 18 | self.total_count = 0 19 | 
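    # Iteration contract (implemented by __anext__ below):
    #   - the first call issues an initial search with the "scroll" parameter and
    #     records the total hit count, capped by config.max_limit when it is set
    #   - later calls follow the returned _scroll_id, retrying failed scroll
    #     requests up to config.max_retry times with a random sleep in between
    #   - each batch is optionally reduced to the "_source" documents and passed
    #     through config.filter, with dropped items counted in miss_count
    #   - iteration stops once total_size is reached or a scroll page comes back
    #     empty; state is then reset via init_val() and StopAsyncIteration raised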
20 | def __aiter__(self): 21 | return self 22 | 23 | def init_val(self): 24 | self.total_size = None 25 | self.result = None 26 | self.scroll_id = None 27 | self.miss_count = 0 28 | self.total_count = 0 29 | 30 | async def __anext__(self, retry=1): 31 | if self.total_size is None: 32 | self.result = await self.es_client.search( 33 | index=self.config.indices, doc_type=self.config.doc_type, 34 | params={"scroll": self.config.scroll}, body=self.config.query_body 35 | ) 36 | self.total_size = self.result['hits']['total']['value'] 37 | self.total_size = self.config.max_limit if (self.config.max_limit and self.config.max_limit < self.result['hits']['total']['value']) else self.total_size 38 | self.total_count += len(self.result['hits']['hits']) 39 | logging.info("Get %d items from %s, percentage: %.2f%%" % 40 | (len(self.result['hits']['hits']), self.config.indices + "->" + str(self.config.doc_type), 41 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 42 | 43 | origin_length = len(self.result['hits']['hits']) 44 | if self.config.return_source: 45 | results = [i["_source"] for i in self.result['hits']['hits']] 46 | else: 47 | results = self.result 48 | if self.config.filter: 49 | results = [self.config.filter(i) for i in results] 50 | results = [i for i in results if i] 51 | self.miss_count += origin_length - len(results) 52 | self.get_score_id_and_clear_result() 53 | return results 54 | 55 | if self.scroll_id and self.total_count < self.total_size: 56 | try: 57 | self.result = await self.es_client.scroll(scroll_id=self.scroll_id, 58 | scroll=self.config.scroll) 59 | except Exception as e: 60 | if retry < self.config.max_retry: 61 | logging.error("retry: %d, %s" % (retry, str(e))) 62 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 63 | return await self.__anext__(retry+1) 64 | else: 65 | logging.error("Give up es getter, After retry: %d times, still fail to get result: %s, " 66 | "total get %d items, total filtered: %d items, reason: %s" % 67 | (self.config.max_retry, self.config.indices + "->" + str(self.config.doc_type), 68 | self.total_count, self.miss_count, traceback.format_exc())) 69 | raise StopAsyncIteration 70 | 71 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 72 | (len(self.result['hits']['hits']), self.config.indices + "->" + str(self.config.doc_type), 73 | self.miss_count, (self.total_count / self.total_size * 100) if self.total_size else 0)) 74 | 75 | origin_length = len(self.result['hits']['hits']) 76 | self.total_count += origin_length 77 | if self.config.return_source: 78 | results = [i["_source"] for i in self.result['hits']['hits']] 79 | else: 80 | results = self.result 81 | if self.config.filter: 82 | results = [self.config.filter(i) for i in results] 83 | results = [i for i in results if i] 84 | self.miss_count += origin_length - len(results) 85 | 86 | self.get_score_id_and_clear_result() 87 | if origin_length > 0: 88 | return results 89 | else: 90 | # if scroll empty item, means no more next page 91 | logging.info("empty result, terminating scroll, scroll id: %s" % (str(self.scroll_id), )) 92 | 93 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 94 | (self.config.indices + "->" + str(self.config.doc_type), self.total_count, self.miss_count)) 95 | self.init_val() 96 | raise StopAsyncIteration 97 | 98 | async def delete_all(self): 99 | """ 100 | inefficient delete 101 | """ 102 | body = { 103 | "query": { 104 | "match_all": {} 105 | } 
106 | } 107 | result = await self.config.es_client.delete_by_query(index=self.config.indices, doc_type=self.config.doc_type, 108 | body=body, params={"conflicts": "proceed"}) 109 | return result 110 | 111 | def __iter__(self): 112 | raise ValueError("ESGetter must be used with async generator, not normal generator") 113 | 114 | def get_score_id_and_clear_result(self): 115 | if "_scroll_id" in self.result and self.result["_scroll_id"]: 116 | self.scroll_id = self.result["_scroll_id"] 117 | else: 118 | self.scroll_id = None 119 | self.result = dict() 120 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/RedisGetter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | import logging 4 | import traceback 5 | import json 6 | import zlib 7 | from .BaseGetter import BaseGetter 8 | 9 | 10 | class RedisGetter(BaseGetter): 11 | def __init__(self, config): 12 | super().__init__(self) 13 | self.config = config 14 | self.is_range = self.config.is_range 15 | self.need_del = self.config.need_del 16 | self.responses = list() 17 | self.done = False 18 | self.total_size = None 19 | self.miss_count = 0 20 | self.total_count = 0 21 | self.redis_object_length = 0 22 | 23 | def init_val(self): 24 | self.responses = list() 25 | self.done = False 26 | self.miss_count = 0 27 | self.total_count = 0 28 | self.redis_object_length = 0 29 | self.total_size = None 30 | 31 | def decode(self, loaded_object): 32 | if self.config.compress: 33 | return zlib.decompress(loaded_object).decode(self.config.encoding) 34 | else: 35 | return json.loads(loaded_object) 36 | 37 | def __aiter__(self): 38 | return self 39 | 40 | async def __anext__(self, retry=1): 41 | await self.config.get_redis_pool_cli() # init redis pool 42 | if self.is_range and self.total_size is None: 43 | self.redis_object_length = await self.config.redis_len_method(self.config.key) 44 | self.total_size = self.config.max_limit if (self.config.max_limit and self.config.max_limit < self.redis_object_length) else self.redis_object_length 45 | 46 | if self.done: 47 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 48 | (self.config.name, self.total_count, self.miss_count)) 49 | self.init_val() 50 | raise StopAsyncIteration 51 | 52 | if self.is_range: 53 | if self.config.direction == "L": 54 | left = self.total_count 55 | right = self.total_count + self.config.per_limit - 1 56 | else: 57 | left = self.total_size - self.config.per_limit - 1 58 | if left < 0: 59 | left = 0 60 | right = left + self.config.per_limit 61 | 62 | try: 63 | self.responses = await self.config.redis_read_method(self.config.key, left, right) 64 | self.responses = [self.decode(i) for i in self.responses] 65 | except Exception as e: 66 | if retry < self.config.max_retry: 67 | logging.error("retry: %d, %s" % (retry, str(e))) 68 | await asyncio.sleep(random.randint(self.config.random_min_sleep, self.config.random_max_sleep)) 69 | return await self.__anext__(retry+1) 70 | else: 71 | logging.error("Give up redis getter, After retry: %d times, still fail to get key: %s, " 72 | "total get %d items, total filtered: %d items, error: %s" % (self.config.max_retry, self.config.key, self.total_count, self.miss_count, str(traceback.format_exc()))) 73 | raise StopAsyncIteration 74 | 75 | if len(self.responses) < self.config.per_limit or not self.responses or self.total_count + len(self.responses) >= self.total_size: 76 | self.done 
= True 77 | if self.need_del: 78 | await self.config.redis_del_method(self.config.key) 79 | else: 80 | 81 | try: 82 | self.responses = await self.config.redis_read_method(self.config.key) 83 | self.responses = [self.decode(i) for i in self.responses.values()][:self.total_size] 84 | except Exception as e: 85 | if retry < self.config.max_retry: 86 | logging.error("retry: %d, %s" % (retry, str(e))) 87 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 88 | return await self.__anext__(retry+1) 89 | else: 90 | logging.error("Give up redis getter, After retry: %d times, still fail to get key: %s, " 91 | "total get %d items, total filtered: %d items, reason: %s" % 92 | (self.config.max_retry, self.config.key, self.total_count, self.miss_count, str(traceback.format_exc()))) 93 | raise StopAsyncIteration 94 | 95 | if self.config.max_limit: 96 | self.responses = self.responses[:self.config.max_limit] 97 | self.done = True 98 | if self.need_del: 99 | await self.config.redis_del_method(self.config.key) 100 | 101 | current_response_length = len(self.responses) 102 | curr_miss_count = 0 103 | self.total_count += current_response_length 104 | if self.config.filter: 105 | target_responses = list() 106 | for i in self.responses: 107 | if self.config.filter: 108 | i = self.config.filter(i) 109 | if i: 110 | target_responses.append(i) 111 | else: 112 | curr_miss_count += 1 113 | self.responses = target_responses 114 | 115 | self.miss_count += curr_miss_count 116 | if self.is_range: 117 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 118 | (current_response_length, self.config.name, curr_miss_count, 119 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 120 | return self.clear_and_return() 121 | 122 | def __iter__(self): 123 | raise ValueError("RedisGetter must be used with async generator, not normal generator") 124 | 125 | def clear_and_return(self): 126 | resp = self.responses 127 | self.responses = list() 128 | return resp 129 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/MySQLGetter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import traceback 4 | import random 5 | import logging 6 | from .BaseGetter import BaseGetter 7 | 8 | 9 | class MySQLGetter(BaseGetter): 10 | def __init__(self, config): 11 | super().__init__(self) 12 | self.config = config 13 | self.responses = list() 14 | self.miss_count = 0 15 | self.total_count = 0 16 | self.total_size = None 17 | self.key_fields = list() 18 | self.key_fields_map = dict() 19 | self.need_finish = False 20 | 21 | def init_val(self): 22 | self.responses = list() 23 | self.miss_count = 0 24 | self.total_count = 0 25 | self.total_size = None 26 | self.key_fields = list() 27 | self.key_fields_map = dict() 28 | self.need_finish = False 29 | 30 | def __aiter__(self): 31 | return self 32 | 33 | async def __anext__(self): 34 | await self.config.get_mysql_pool_cli() # init mysql pool 35 | 36 | if self.need_finish: 37 | await self.finish() 38 | 39 | if self.total_size is None: 40 | self.total_size, self.key_fields = await self.get_total_size_and_key_field() 41 | 42 | if self.total_count < self.total_size: 43 | await self.fetch_per_limit() 44 | return self.clear_and_return() 45 | 46 | # reach here, means done 47 | await self.finish() 48 | 49 | def __iter__(self): 50 | raise ValueError("MySQLGetter must be used with 
async generator, not normal generator") 51 | 52 | async def finish(self): 53 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 54 | (self.config.name, self.total_count, self.miss_count)) 55 | self.init_val() 56 | self.config.free_resource() 57 | raise StopAsyncIteration 58 | 59 | async def get_total_size_and_key_field(self): 60 | await self.config.cursor.execute("DESC %s" % (self.config.table, )) 61 | result = await self.config.cursor.fetchall() 62 | field = result[0][0] 63 | await self.config.cursor.execute("select count(%s) from %s" % (field, self.config.table)) 64 | result = await self.config.cursor.fetchone() 65 | # key field 66 | await self.config.cursor.execute("DESC %s" % (self.config.table, )) 67 | results = await self.config.cursor.fetchall() 68 | key_fields = list() 69 | for each in results: 70 | key_fields.append(each[0]) 71 | if "tinyint" in each[1]: 72 | self.key_fields_map[each[0]] = bool 73 | elif "text" in each[1]: 74 | self.key_fields_map[each[0]] = str # or json 75 | 76 | key_fields = list(i[0] for i in results) 77 | return result[0], key_fields 78 | 79 | async def fetch_per_limit(self): 80 | results = list() 81 | try_time = 0 82 | while try_time < self.config.max_retry: 83 | try: 84 | await self.config.cursor.execute("SELECT * FROM %s LIMIT %d,%d" % 85 | (self.config.table, self.total_count, self.config.per_limit)) 86 | results = await self.config.cursor.fetchall() 87 | break 88 | except Exception as e: 89 | try_time += 1 90 | if try_time < self.config.max_retry: 91 | logging.error("retry: %d, %s" % (try_time, str(e))) 92 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 93 | else: 94 | logging.error("Give up MySQL getter: %s, After retry: %d times, still fail, " 95 | "total get %d items, total filtered: %d items, reason: %s" % 96 | (self.config.name, self.config.max_retry, self.total_count, self.miss_count, 97 | str(traceback.format_exc()))) 98 | self.need_finish = True 99 | 100 | self.responses = [self.decode(i) for i in results] 101 | curr_miss_count = 0 102 | if self.config.filter: 103 | target_results = list() 104 | for each in results: 105 | each = self.config.filter(each) 106 | if each: 107 | target_results.append(each) 108 | else: 109 | curr_miss_count += 1 110 | self.responses = target_results 111 | self.miss_count += curr_miss_count 112 | 113 | self.total_count += len(results) 114 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 115 | (len(results), self.config.name, curr_miss_count, 116 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 117 | if self.total_count >= self.total_size: 118 | self.need_finish = True 119 | return 120 | 121 | def decode(self, item): 122 | """ 123 | :param item: tuple 124 | :return: dict 125 | """ 126 | ret_dict = dict() 127 | index = 0 128 | for key in self.key_fields: 129 | if key in self.key_fields_map: 130 | if self.key_fields_map[key] is bool: 131 | ret_dict[key] = bool(item[index]) 132 | elif item[index] is None: 133 | ret_dict[key] = None 134 | elif item[index][0] in ("{", "["): 135 | try: 136 | val = json.loads(item[index]) 137 | except json.decoder.JSONDecodeError: 138 | val = item[index] 139 | ret_dict[key] = val 140 | else: 141 | ret_dict[key] = item[index] 142 | else: 143 | ret_dict[key] = item[index] 144 | index += 1 145 | return ret_dict 146 | 147 | def clear_and_return(self): 148 | resp = self.responses 149 | self.responses = list() 150 | return resp 151 | 
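# Usage sketch: MySQLGetter is an async iterator, so it must be driven from a
# coroutine with "async for", and each iteration yields one list of row dicts.
# The imports mirror the ones in cli.py; the RMySQLConfig arguments used here
# ("my_table", per_limit, max_limit) are illustrative assumptions, so check
# GetterConfig.RMySQLConfig for the authoritative signature.
if __name__ == "__main__":
    import asyncio
    from idataapi_transform.DataProcess.ProcessFactory import ProcessFactory
    from idataapi_transform.DataProcess.Config.ConfigUtil import GetterConfig

    async def dump_table():
        config = GetterConfig.RMySQLConfig("my_table", per_limit=100, max_limit=1000)
        getter = ProcessFactory.create_getter(config)
        async for rows in getter:
            # each batch is already decoded into dicts by MySQLGetter.decode
            print("fetched %d rows" % (len(rows), ))

    asyncio.get_event_loop().run_until_complete(dump_table())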
-------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/MainConfig.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import configparser 4 | from os.path import expanduser 5 | from .LogConfig import init_log, remove_log 6 | from .ESConfig import init_es 7 | 8 | 9 | default_configure_content = """ 10 | [main] 11 | # default max concurrency value for APIGetter 12 | concurrency = 50 13 | 14 | # buffer size 15 | per_limit = 100 16 | 17 | # fetch at most max_limit items 18 | max_limit = None 19 | 20 | # max retry for getter before give up if fail to get data 21 | max_retry = 3 22 | 23 | # sleep interval if fail 24 | random_min_sleep = 1 25 | random_max_sleep = 3 26 | 27 | [es] 28 | # elasticsearch host 29 | # hosts = ["localhost:9393"] 30 | 31 | # elasticsearch headers when perform http request 32 | # headers = {"Host": "localhost", "value": "value"} 33 | 34 | # request timeout, seconds 35 | # timeout = 10 36 | 37 | # http auth 38 | # http_auth = ["user", "passwd"] 39 | 40 | [log] 41 | # a directory to save log file 42 | # path = /Users/zpoint/Desktop/idataapi-transform/logs/ 43 | 44 | # max byte per log file 45 | # log_byte = 5242880 46 | """ 47 | 48 | redis_config_content = """ 49 | [redis] 50 | host = localhost 51 | port = 0 52 | db = 0 53 | password = 54 | timeout = 3 55 | encoding = utf8 56 | # whether need to del the key after get object from redis, 0 means false, 1 means true 57 | need_del = 0 58 | # default direction when read/write , "L" means lpop/lpush, "R" means rpop/rpush 59 | direction = L 60 | """ 61 | 62 | mysql_config_content = """ 63 | [mysql] 64 | host = localhost 65 | port = 0 66 | user = root 67 | password = 68 | database = 69 | # default charset 70 | encoding = utf8 71 | """ 72 | 73 | mongo_config_content = """ 74 | [mongo] 75 | protocal = mongodb # or mongodb+srv 76 | host = localhost 77 | port = 0 78 | username = 79 | password = 80 | database = test_database 81 | other_params = 82 | """ 83 | 84 | kafka_config_content = """ 85 | [kafka] 86 | bootstrap.servers = localhost:9092 87 | """ 88 | 89 | main_config_box = None 90 | 91 | 92 | class MainConfig(object): 93 | def __init__(self, ini_path=None): 94 | global main_config_box 95 | main_config_box = self 96 | # singleton 97 | if not hasattr(self, "__instance"): 98 | if not ini_path: 99 | home = expanduser("~") 100 | ini_path = home + "/idataapi-transform.ini" 101 | 102 | if not os.path.exists(ini_path): 103 | with open(ini_path, "w") as f: 104 | f.write(default_configure_content + redis_config_content + mysql_config_content + mongo_config_content) 105 | 106 | if os.path.exists("./idataapi-transform.ini"): 107 | ini_path = "./idataapi-transform.ini" 108 | 109 | self.read_config(ini_path) 110 | 111 | def read_config(self, ini_path): 112 | self.ini_path = ini_path 113 | self.__instance = configparser.ConfigParser() 114 | 115 | self.__instance.read(ini_path) 116 | MainConfig.__instance = self.__instance 117 | 118 | self.has_log_file = self.__instance.has_log_file = self.config_log() 119 | self.has_es_configured = self.__instance.has_es_configured = self.config_es() 120 | self.has_redis_configured = self.__instance.has_redis_configured = self.config_redis() 121 | self.has_mysql_configured = self.__instance.has_mysql_configured = self.config_mysql() 122 | self.has_mongo_configured = self.__instance.has_mongo_configured = self.config_mongo() 123 | self.has_kafka_configured = 
self.__instance.has_kafka_configured = self.config_kafka() 124 | 125 | self.__instance.ini_path = self.ini_path 126 | 127 | def __call__(self): 128 | return self.__instance 129 | 130 | def config_log(self, log_path=None, max_log_file_bytes=None): 131 | remove_log() 132 | if log_path: 133 | manual = True 134 | else: 135 | max_log_file_bytes = self.__instance["log"].getint("log_byte") 136 | log_path = self.__instance["log"].get("path") 137 | manual = False 138 | return init_log(log_path, max_log_file_bytes, self.ini_path, manual=manual) 139 | 140 | def config_es(self): 141 | hosts = self.__instance["es"].get("hosts") 142 | timeout = self.__instance["es"].getint("timeout") 143 | http_auth = self.__instance["es"].get("http_auth") 144 | if hosts: 145 | try: 146 | hosts = json.loads(hosts) 147 | except Exception as e: 148 | raise ValueError("es host must be json serialized") 149 | 150 | headers = self.__instance["es"].get("headers") 151 | if headers and headers != "None": 152 | try: 153 | headers = json.loads(headers) 154 | except Exception as e: 155 | raise ValueError("es headers must be json serialized") 156 | if http_auth and http_auth != "None": 157 | try: 158 | http_auth = json.loads(http_auth) 159 | except Exception as e: 160 | raise ValueError("es http_auth must be json serialized") 161 | else: 162 | headers = None 163 | return init_es(hosts, headers, timeout, http_auth) 164 | 165 | def config_redis(self): 166 | try: 167 | self.__instance["redis"].get("port") 168 | except KeyError as e: 169 | with open(self.ini_path, "a+") as f: 170 | f.write(redis_config_content) 171 | self.__instance.read(self.ini_path) 172 | 173 | port = self.__instance["redis"].getint("port") 174 | return port > 0 175 | 176 | def config_mysql(self): 177 | try: 178 | self.__instance["mysql"].get("port") 179 | except KeyError as e: 180 | with open(self.ini_path, "a+") as f: 181 | f.write(mysql_config_content) 182 | self.__instance.read(self.ini_path) 183 | 184 | port = self.__instance["mysql"].getint("port") 185 | return port > 0 186 | 187 | def config_mongo(self): 188 | try: 189 | self.__instance["mongo"].get("port") 190 | except KeyError as e: 191 | with open(self.ini_path, "a+") as f: 192 | f.write(mongo_config_content) 193 | self.__instance.read(self.ini_path) 194 | 195 | port = self.__instance["mongo"].getint("port") 196 | return port > 0 197 | 198 | def config_kafka(self): 199 | try: 200 | self.__instance["kafka"].get("bootstrap.servers") 201 | except KeyError as e: 202 | with open(self.ini_path, "a+") as f: 203 | f.write(kafka_config_content) 204 | self.__instance.read(self.ini_path) 205 | 206 | return "bootstrap.servers" in self.__instance["kafka"] 207 | 208 | 209 | main_config = MainConfig() 210 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ESConfig.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import logging 4 | from collections.abc import Iterable 5 | from elasticsearch._async.transport import AsyncTransport as OriginAsyncTransport 6 | from elasticsearch._async.client.utils import _make_path 7 | from elasticsearch import TransportError 8 | from elasticsearch.exceptions import ConnectionError, ConnectionTimeout 9 | from elasticsearch import AsyncElasticsearch 10 | 11 | es_hosts = None 12 | http_auth = None 13 | 14 | 15 | def init_es(hosts, es_headers, timeout_, http_auth_): 16 | global es_hosts, http_auth, AsyncElasticsearch, AsyncTransport 17 | es_hosts = hosts 
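    # the hosts (and, just below, the auth tuple) are kept in module-level globals
    # so that get_es_client() at the bottom of this file can lazily build
    # AsyncElasticsearch clients with these configured defaults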
18 | http_auth = tuple(http_auth_) if isinstance(http_auth_, Iterable) else None 19 | if not es_hosts: 20 | return False 21 | 22 | class MyAsyncTransport(OriginAsyncTransport): 23 | """ 24 | Override default AsyncTransport to add timeout 25 | """ 26 | async def perform_request(self, method, url, params=None, body=None, timeout=None, headers=None): 27 | await self._async_call() 28 | 29 | method, headers, params, body, ignore, __timeout = self._resolve_request_args( 30 | method, headers, params, body 31 | ) 32 | 33 | for attempt in range(self.max_retries + 1): 34 | connection = self.get_connection() 35 | 36 | try: 37 | status, headers, data = await connection.perform_request( 38 | method, 39 | url, 40 | params, 41 | body, 42 | headers=headers, 43 | ignore=ignore, 44 | timeout=timeout, 45 | ) 46 | except TransportError as e: 47 | if method == "HEAD" and e.status_code == 404: 48 | return False 49 | 50 | retry = False 51 | if isinstance(e, ConnectionTimeout): 52 | retry = self.retry_on_timeout 53 | elif isinstance(e, ConnectionError): 54 | retry = True 55 | elif e.status_code in self.retry_on_status: 56 | retry = True 57 | 58 | if retry: 59 | try: 60 | # only mark as dead if we are retrying 61 | self.mark_dead(connection) 62 | except TransportError: 63 | # If sniffing on failure, it could fail too. Catch the 64 | # exception not to interrupt the retries. 65 | pass 66 | # raise exception on last retry 67 | if attempt == self.max_retries: 68 | raise e 69 | else: 70 | raise e 71 | 72 | else: 73 | # connection didn't fail, confirm it's live status 74 | self.connection_pool.mark_live(connection) 75 | 76 | if method == "HEAD": 77 | return 200 <= status < 300 78 | 79 | if data: 80 | data = self.deserializer.loads(data, headers.get("content-type")) 81 | return data 82 | 83 | class MyAsyncElasticsearch(AsyncElasticsearch): 84 | def __init__(self, *args, **kwargs): 85 | super().__init__(*args, **kwargs) 86 | if "headers" in kwargs: 87 | self.headers = kwargs["headers"] 88 | else: 89 | self.headers = None 90 | 91 | async def add_dict_to_es(self, indices, doc_type, items, id_hash_func, app_code=None, actions=None, 92 | create_date=None, error_if_fail=True, timeout=None, auto_insert_createDate=True): 93 | if not actions: 94 | actions = "index" 95 | body = "" 96 | for item in items: 97 | if app_code: 98 | item["appCode"] = app_code 99 | if auto_insert_createDate and "createDate" not in item: 100 | if create_date: 101 | item["createDate"] = create_date 102 | else: 103 | item["createDate"] = int(time.time()) 104 | 105 | action = { 106 | actions: { 107 | "_index": indices, 108 | "_type": doc_type, 109 | "_id": id_hash_func(item) 110 | } 111 | } 112 | if actions == "update": 113 | item = {"doc": item} 114 | body += json.dumps(action) + "\n" + json.dumps(item) + "\n" 115 | try: 116 | success = fail = 0 117 | r = await self.transport.perform_request( 118 | "POST", "/_bulk?pretty", body=body, timeout=timeout or timeout_, headers=self.headers or es_headers) 119 | if r["errors"]: 120 | for item in r["items"]: 121 | for k, v in item.items(): 122 | if "error" in v: 123 | if error_if_fail: 124 | # log error 125 | logging.error(json.dumps(v["error"])) 126 | fail += 1 127 | else: 128 | success += 1 129 | else: 130 | success = len(r["items"]) 131 | return success, fail, r 132 | except Exception as e: 133 | import traceback 134 | logging.error(traceback.format_exc()) 135 | logging.error("elasticsearch Exception, give up: %s" % (str(e), )) 136 | return None, None, None 137 | 138 | async def search( 139 | self, body=None, 
index=None, doc_type=None, params=None, headers=None 140 | ): 141 | if "from_" in params: 142 | params["from"] = params.pop("from_") 143 | 144 | return await self.transport.perform_request( 145 | "POST", 146 | _make_path(index, doc_type, "_search"), 147 | params=params, 148 | headers=headers if headers else self.headers, 149 | body=body, 150 | ) 151 | 152 | OriginAsyncTransport.perform_request = MyAsyncTransport.perform_request 153 | 154 | AsyncElasticsearch = MyAsyncElasticsearch 155 | return True 156 | 157 | 158 | global_client = None 159 | 160 | 161 | def get_es_client(hosts=None, headers=None): 162 | global global_client 163 | if not hosts: 164 | if global_client is None: 165 | global_client = AsyncElasticsearch(hosts=es_hosts, headers=headers, http_auth=http_auth) 166 | return global_client 167 | else: 168 | return AsyncElasticsearch(hosts=hosts, headers=headers, http_auth=http_auth) 169 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/MySQLWriter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import random 4 | import logging 5 | import traceback 6 | from .BaseWriter import BaseWriter 7 | 8 | 9 | class MySQLWriter(BaseWriter): 10 | def __init__(self, config): 11 | super().__init__() 12 | self.config = config 13 | self.total_miss_count = 0 14 | self.success_count = 0 15 | self.table_checked = False 16 | self.key_fields = list() 17 | self.auto_increment_keys = set() 18 | 19 | async def write(self, responses): 20 | await self.config.get_mysql_pool_cli() # init mysql pool 21 | 22 | miss_count = 0 23 | original_length = len(responses) 24 | if self.config.filter: 25 | target_responses = list() 26 | for i in responses: 27 | i = self.config.filter(i) 28 | if i: 29 | target_responses.append(i) 30 | else: 31 | miss_count += 1 32 | responses = target_responses 33 | 34 | if not responses: 35 | self.finish_once(miss_count, original_length) 36 | return 37 | 38 | # After filtered, still have responses to write 39 | if not self.table_checked: 40 | await self.table_check(responses) 41 | 42 | if await self.perform_write(responses): 43 | self.finish_once(miss_count, original_length) 44 | 45 | def __exit__(self, exc_type, exc_val, exc_tb): 46 | self.config.free_resource() 47 | logging.info("%s write done, total filtered %d item, total write %d item" % 48 | (self.config.name, self.total_miss_count, self.success_count)) 49 | 50 | def __enter__(self): 51 | return self 52 | 53 | def finish_once(self, miss_count, original_length): 54 | self.total_miss_count += miss_count 55 | self.success_count += original_length 56 | logging.info("%s write %d item, filtered %d item" % (self.config.name, original_length - miss_count, miss_count)) 57 | 58 | async def table_check(self, responses): 59 | await self.config.cursor.execute("SHOW TABLES LIKE '%s'" % (self.config.table, )) 60 | result = await self.config.cursor.fetchone() 61 | if result is None: 62 | await self.create_table(responses) 63 | # check field 64 | await self.config.cursor.execute("DESC %s" % (self.config.table, )) 65 | results = await self.config.cursor.fetchall() 66 | for field in results: 67 | if "auto_increment" in field: 68 | self.auto_increment_keys.add(field[0]) 69 | 70 | fields = set(i[0] for i in results) 71 | self.key_fields = list(i[0] for i in results) 72 | real_keys = set(responses[0].keys()) 73 | difference_set = real_keys.difference(fields) 74 | if difference_set: 75 | # real keys not subset 
of fields 76 | raise ValueError("Field %s not in MySQL Table: %s" % (str(difference_set), self.config.table)) 77 | 78 | self.table_checked = True 79 | 80 | async def create_table(self, responses): 81 | test_response = dict() 82 | for response in responses[:50]: 83 | for k, v in response.items(): 84 | if k not in test_response: 85 | test_response[k] = v 86 | elif test_response[k] is None: 87 | test_response[k] = v 88 | elif isinstance(v, dict) or isinstance(v, list): 89 | if len(json.dumps(test_response[k])) < len(json.dumps(v)): 90 | test_response[k] = v 91 | elif v is not None and test_response[k] < v: 92 | test_response[k] = v 93 | 94 | sql = """ 95 | CREATE TABLE `%s` ( 96 | """ % (self.config.table, ) 97 | first_field = True 98 | for key, value in responses[0].items(): 99 | if "Count" in key: 100 | field_type = "BIGINT" 101 | elif value is None: 102 | field_type = "TEXT" 103 | elif key in ("content", ) or isinstance(value, dict) or isinstance(value, list): 104 | field_type = "TEXT" 105 | elif isinstance(value, bool): 106 | field_type = "BOOLEAN" 107 | elif isinstance(value, int): 108 | field_type = "BIGINT" 109 | elif isinstance(value, float): 110 | field_type = "DOUBLE" 111 | # varchar can store at most 65536 bytes, utf8 occupy 1-8 bytes per character, 112 | # so length should be less than 65536 / 8 = 8192 113 | # assume this field (the shortest length) * 4 <= the longest length(8192) 114 | elif len(value) > 2048: 115 | field_type = "TEXT" 116 | else: 117 | length = len(value) * 4 118 | if length < 256: 119 | length = 256 120 | field_type = "VARCHAR(%d)" % (length, ) 121 | sql += ("\t" if first_field else "\t\t") + "`%s` %s" % (key, field_type) 122 | if key == "id": 123 | sql += " NOT NULL,\n" 124 | else: 125 | sql += ",\n" 126 | if first_field: 127 | first_field = False 128 | 129 | tail_sql = """ 130 | \tPRIMARY KEY (`id`) 131 | ) ENGINE=InnoDB DEFAULT CHARSET=%s 132 | """ % (self.config.charset, ) 133 | sql += tail_sql 134 | logging.info("Creating table: %s\n%s", self.config.table, sql) 135 | await self.config.cursor.execute(sql) 136 | await self.config.connection.commit() 137 | logging.info("table created") 138 | 139 | async def perform_write(self, responses): 140 | sql = "REPLACE INTO %s VALUES " % (self.config.table, ) 141 | normal_sql = False 142 | sql_without_auto_increment_keys = list() 143 | 144 | for each in responses: 145 | need_specific_sql = False 146 | keys = list() 147 | 148 | curr_sql = '(' 149 | for field in self.key_fields: 150 | if field in self.auto_increment_keys and field not in each: 151 | need_specific_sql = True 152 | continue 153 | val = each[field] 154 | keys.append(field) 155 | if isinstance(val, dict) or isinstance(val, list): 156 | val = json.dumps(val) 157 | if val is None: 158 | curr_sql += 'NULL,' 159 | else: 160 | curr_sql += repr(val) + "," 161 | curr_sql = curr_sql[:-1] + '),\n' 162 | if need_specific_sql: 163 | sql_keys = "(" 164 | for each_sql_key in keys: 165 | sql_keys += each_sql_key + "," 166 | sql_keys = sql_keys[:-1] + ")" 167 | sql_without_auto_increment_keys.append("REPLACE INTO %s%s VALUES " % (self.config.table, sql_keys) + curr_sql[:-2]) 168 | else: 169 | normal_sql = True 170 | sql += curr_sql 171 | sql = sql[:-2] 172 | try_time = 0 173 | while try_time < self.config.max_retry: 174 | try: 175 | ret_sql = "" 176 | if normal_sql: 177 | ret_sql += sql + ";\n" 178 | if sql_without_auto_increment_keys: 179 | ret_sql += ";\n".join(sql_without_auto_increment_keys) 180 | ret_sql += ";" 181 | await self.config.cursor.execute(ret_sql) 182 | 
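                # the batched REPLACE statements are only persisted by the
                # explicit commit that follows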
await self.config.cursor.connection.commit() 183 | return True 184 | except Exception as e: 185 | try_time += 1 186 | if try_time < self.config.max_retry: 187 | logging.error("retry: %d, %s" % (try_time, str(e))) 188 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 189 | else: 190 | logging.error("Give up MySQL writer: %s, After retry: %d times, still fail to write, " 191 | "total write %d items, total filtered: %d items, reason: %s" % 192 | (self.config.name, self.config.max_retry, self.success_count, self.total_miss_count, 193 | str(traceback.format_exc()))) 194 | return False 195 | -------------------------------------------------------------------------------- /idataapi_transform/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import argparse 4 | from .DataProcess.Config.DefaultValue import DefaultVal 5 | from .DataProcess.Config.ConfigUtil import GetterConfig 6 | from .DataProcess.Config.ConfigUtil import WriterConfig 7 | from .DataProcess.ProcessFactory import ProcessFactory 8 | 9 | 10 | class Args(object): 11 | from_choices = ["API", "ES", "CSV", "XLSX", "JSON", "REDIS", "MYSQL", "MONGO"] 12 | from_desc = "argument 'from' can only set to one of 'API', 'ES', 'CSV', 'XLSX', " \ 13 | "'JSON'(means json line by line file), 'REDIS', 'MYSQL' or 'MONGO'" 14 | 15 | to_choices = ["csv", "xlsx", "json", "txt", "es", "redis", 'mysql', 'mongo', 'kafka'] 16 | to_desc = "argument 'to' can only set to one of \"csv\", \"xlsx\", \"json\", \"txt\" \"es\", \"json\", \"redis\", \"kafka\", " \ 17 | "\"mysql\", \"mongo\", \"json\" will write 'json.dumps(item)' line by line. " \ 18 | "\"txt\" will write each item line by line, each element in each line is separated by 'space' bu default" 19 | 20 | source_desc = """ 21 | argument 'source', When argument '-from' set to 'ES', source should be 'index' When 22 | argument 'from' set tp 'API', source should be 'http://... 23 | argument 'from' set tp 'REDIS', source should be key name 24 | argument 'from' set tp 'MYSQL', source should be table name 25 | argument 'from' set to others, source should be file path 26 | """ 27 | dest_desc = "argument 'dest', filename to save result, no need for suffix, " \ 28 | "ie '/Desktop/result', default: './result'\n" \ 29 | "When argument '-to' set to 'ES', dest should be 'index'" 30 | 31 | per_limit_desc = "amount of data buffered, when buffer filled, Program will write buffered data to 'dest', default 100" 32 | max_limit_desc = "write at most 'max_limit' data to 'dest', if 'max_limit' set to 0, means no limit, default to None" 33 | retry_desc = "when fetch data failed, retry at most 'retry' time, default 3" 34 | r_encoding_desc = "encoding of input file, ignore for xlsx format, default 'utf8'" 35 | w_encoding_desc = "encoding of output file, ignore for xlsx format, default 'utf8'" 36 | 37 | filter_desc = "file contains a 'my_filter(item)' function for filter" 38 | 39 | param_file_desc = """When you have many item save in id.json, --param_file './id.json::id::pid' means open './id.json 40 | ', read each json object line by line, use each_json['id'] as the parameter 'pid' and add it to the tail part of 41 | 'source'. 
--param_file can be either "filename.json::json_param::request_param" or "filename.txt::request_param" 42 | """ 43 | 44 | expand_desc = """If your item is {"a": {"b": "c"}, "b": "d"}, --expand 1 will make your item become 45 | {"a_b": "c", "b": "d"}, --expand N means expand at most N level deep of your object, --expand -1 means expand all 46 | level -- expand 0 means no expand of your item. Default 0. 47 | """ 48 | qsn_desc = """quote scientific notation, ie: 4324234234234234123123 will become 4.32423423423423E+021 in normal csv, 49 | If quote like '4324234234234234123123', it won't become scientific notation, Only work for output format 'csv' 50 | --qsn True means quote scientific notation, --qsn False means not quote scientific notation""" 51 | 52 | query_body_desc = """ElasticSearch query body, size has same function as "--limit", i.e: 53 | body = { 54 | "size": 100, 55 | "_source": { 56 | "includes": ["location", "title", "city", "id"] 57 | }, 58 | "query": { 59 | "bool": { 60 | "must": [ 61 | { 62 | "term": {"appCode": {"value": "ctrip"}} 63 | } 64 | ] 65 | } 66 | } 67 | } 68 | """ 69 | 70 | write_mode_desc = """'w' or 'a+'""" 71 | key_type_desc = """redis data type to operate, options: [LIST] or [HASH], default: [LIST]""" 72 | quote_char_desc = """csv only, default quote char is '"'""" 73 | 74 | getter_config_map = { 75 | Args.from_choices[0]: GetterConfig.RAPIConfig, 76 | Args.from_choices[1]: GetterConfig.RESConfig, 77 | Args.from_choices[2]: GetterConfig.RCSVConfig, 78 | Args.from_choices[3]: GetterConfig.RXLSXConfig, 79 | Args.from_choices[4]: GetterConfig.RJsonConfig, 80 | Args.from_choices[5]: GetterConfig.RRedisConfig, 81 | Args.from_choices[6]: GetterConfig.RMySQLConfig, 82 | Args.from_choices[7]: GetterConfig.RMongoConfig 83 | } 84 | 85 | writer_config_map = { 86 | Args.to_choices[0]: WriterConfig.WCSVConfig, 87 | Args.to_choices[1]: WriterConfig.WXLSXConfig, 88 | Args.to_choices[2]: WriterConfig.WJsonConfig, 89 | Args.to_choices[3]: WriterConfig.WJsonConfig, 90 | Args.to_choices[4]: WriterConfig.WESConfig, 91 | Args.to_choices[5]: WriterConfig.WRedisConfig, 92 | Args.to_choices[6]: WriterConfig.WMySQLConfig, 93 | Args.to_choices[7]: WriterConfig.WMongoConfig 94 | } 95 | 96 | 97 | def get_arg(): 98 | parser = argparse.ArgumentParser(prog="idataapi_transform", 99 | description='convert data from a format to another format, ' 100 | 'read/write from file or database, suitable for iDataAPI') 101 | parser.add_argument("from", choices=Args.from_choices, help=Args.from_desc, type=str.upper) 102 | parser.add_argument("to", choices=Args.to_choices, help=Args.to_desc, type=str.lower) 103 | 104 | parser.add_argument("source", help=Args.source_desc) 105 | 106 | parser.add_argument("dest", help=Args.dest_desc, default=DefaultVal.dest, nargs="?") 107 | parser.add_argument("--per_limit", default=DefaultVal.per_limit, type=int, help=Args.per_limit_desc) 108 | parser.add_argument("--max_limit", default=DefaultVal.max_limit, type=int, help=Args.max_limit_desc) 109 | parser.add_argument("--max_retry", default=DefaultVal.max_retry, type=int, help=Args.retry_desc) 110 | parser.add_argument("--r_encoding", default=DefaultVal.default_encoding, help=Args.r_encoding_desc) 111 | parser.add_argument("--w_encoding", default=DefaultVal.default_encoding, help=Args.w_encoding_desc) 112 | parser.add_argument("--filter", default=None, help=Args.filter_desc) 113 | parser.add_argument("--expand", default=None, type=int, help=Args.expand_desc) 114 | parser.add_argument("--qsn", default=None, type=bool, 
help=Args.qsn_desc) 115 | parser.add_argument("--query_body", default=DefaultVal.query_body, type=str, help=Args.query_body_desc) 116 | parser.add_argument("--write_mode", default=DefaultVal.default_file_mode_w, type=str, help=Args.write_mode_desc) 117 | parser.add_argument("--key_type", default=DefaultVal.default_key_type, type=str.upper, help=Args.key_type_desc) 118 | parser.add_argument("--quotechar", default=DefaultVal.default_quote_char, type=str, help=Args.quote_char_desc) 119 | return parser.parse_args() 120 | 121 | 122 | def get_filter(filter_file): 123 | if not filter_file: 124 | return None 125 | with open(filter_file, "r") as f: 126 | exec(f.read()) 127 | func = locals()["my_filter"] 128 | return func 129 | 130 | 131 | async def getter_to_writer(getter, writer): 132 | with writer as safe_writer: 133 | async for items in getter: 134 | if asyncio.iscoroutinefunction(safe_writer.write): 135 | await safe_writer.write(items) 136 | else: 137 | safe_writer.write(items) 138 | 139 | 140 | def clean(): 141 | from idataapi_transform.DataProcess.Config.ESConfig import global_client 142 | if global_client is not None: 143 | loop = asyncio.get_event_loop() 144 | loop.run_until_complete(global_client.close()) 145 | 146 | 147 | def main(): 148 | args = get_arg() 149 | from_ = getattr(args, "from") 150 | 151 | from_args = list() 152 | from_kwargs = dict() 153 | to_args = list() 154 | to_kwargs = dict() 155 | 156 | if from_ != Args.from_choices[0]: # not api 157 | from_args.extend(args.source.split(":")) 158 | else: 159 | from_args.extend([args.source]) 160 | 161 | from_kwargs["encoding"] = args.r_encoding 162 | from_kwargs["key_type"] = args.key_type 163 | if args.query_body: 164 | try: 165 | from_kwargs["query_body"] = json.loads(args.query_body) 166 | except Exception as e: 167 | raise SyntaxError("--query_body must be json serialized") 168 | 169 | for key in ("per_limit", "max_limit", "max_retry"): 170 | from_kwargs[key] = getattr(args, key) 171 | 172 | to_kwargs["filter_"] = get_filter(args.filter) 173 | to_kwargs["encoding"] = args.w_encoding 174 | to_kwargs["mode"] = args.write_mode 175 | to_kwargs["key_type"] = args.key_type 176 | for key in ("max_retry", "expand", "qsn", "quotechar"): 177 | to_kwargs[key] = getattr(args, key) 178 | 179 | if from_ not in getter_config_map: 180 | raise ValueError("argument from must be in %s" % (str(Args.from_choices), )) 181 | getter_config = getter_config_map[from_](*from_args, **from_kwargs) 182 | getter = ProcessFactory.create_getter(getter_config) 183 | 184 | if args.to == Args.to_choices[4]: 185 | # es 186 | to_args.extend(args.dest.split(":")) 187 | elif args.to in Args.to_choices[5:]: 188 | # redis, mysql, mongo 189 | if args.dest == DefaultVal.dest: 190 | to_args.append(DefaultVal.dest_without_path) 191 | else: 192 | to_args.append(args.dest) 193 | else: 194 | dest = args.dest + "." 
+ args.to 195 | to_args.append(dest) 196 | 197 | writer_config = writer_config_map[args.to](*to_args, **to_kwargs) 198 | writer = ProcessFactory.create_writer(writer_config) 199 | loop = asyncio.get_event_loop() 200 | loop.run_until_complete(getter_to_writer(getter, writer)) 201 | # close 202 | clean() 203 | 204 | 205 | if __name__ == "__main__": 206 | main() 207 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/APIGetter.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import hashlib 4 | import random 5 | import logging 6 | import asyncio 7 | import inspect 8 | import traceback 9 | from .BaseGetter import BaseGetter 10 | from ..Config.ConfigUtil.GetterConfig import RAPIConfig 11 | from ..Config.ConfigUtil.AsyncHelper import AsyncGenerator 12 | from ..PersistentUtil.PersistentWriter import PersistentWriter 13 | 14 | headers = { 15 | "Accept-Encoding": "gzip", 16 | # "Connection": "close" 17 | } 18 | 19 | post_headers = { 20 | "Accept-Encoding": "gzip", 21 | "Content-Type": "application/x-www-form-urlencoded" 22 | } 23 | 24 | 25 | class SourceObject(object): 26 | def __init__(self, response, tag, source, error_url, post_body): 27 | """ 28 | When error occur 29 | :param response: error response body 30 | :param tag: tag user pass in 31 | :param source: source url user pass in 32 | :param error_url: current url elicit error 33 | :param post_body: HTTP post body 34 | """ 35 | self.response = response 36 | self.tag = tag 37 | self.source = source 38 | self.error_url = error_url 39 | self.post_body = post_body 40 | 41 | 42 | class APIGetter(BaseGetter): 43 | def __init__(self, config): 44 | super().__init__() 45 | self.config = config 46 | self.base_url = self.config.source 47 | self.retry_count = 0 48 | self.responses = list() 49 | self.bad_responses = list() 50 | self.done = False 51 | self.page_token = "" 52 | self.miss_count = 0 53 | self.total_count = 0 54 | self.call_back = self.async_call_back = None 55 | if self.config.call_back is not None: 56 | if inspect.iscoroutinefunction(self.config.call_back): 57 | self.async_call_back = self.config.call_back 58 | else: 59 | self.call_back = self.config.call_back 60 | self.request_time = 0 61 | self.method = "POST" if self.config.post_body else "GET" 62 | self.give_up = False 63 | self.need_keep_fields = None 64 | self.origin_filter = None 65 | if self.config.http_headers: 66 | self.headers = self.config.http_headers 67 | elif self.config.post_body: 68 | self.headers = post_headers 69 | else: 70 | self.headers = headers 71 | 72 | def init_val(self): 73 | self.base_url = self.config.source 74 | self.retry_count = 0 75 | self.responses = list() 76 | self.bad_responses = list() 77 | self.done = False 78 | self.page_token = "" 79 | self.miss_count = 0 80 | self.total_count = 0 81 | self.call_back = self.async_call_back = None 82 | self.request_time = 0 83 | self.config.persistent_writer = None 84 | self.give_up = False 85 | self.need_keep_fields = None 86 | self.origin_filter = None 87 | 88 | def generate_sub_func(self): 89 | def sub_func(match): 90 | 91 | return match.group(1) + self.page_token + match.group(3) 92 | return sub_func 93 | 94 | def update_base_url(self, key="pageToken"): 95 | if self.base_url[-1] == "/": 96 | self.base_url = self.base_url[:-1] 97 | elif self.base_url[-1] == "?": 98 | self.base_url = self.base_url[:-1] 99 | 100 | key += "=" 101 | if key not in self.base_url: 102 | if "?" 
not in self.base_url: 103 | self.base_url = self.base_url + "?" + key + self.page_token 104 | else: 105 | self.base_url = self.base_url + "&" + key + self.page_token 106 | else: 107 | self.base_url = re.sub("(" + key + ")(.+?)($|&)", self.generate_sub_func(), self.base_url) 108 | 109 | def generate_new_filter(self, json_result): 110 | def next_filter(item): 111 | for each_key in self.need_keep_fields: 112 | if each_key not in json_result: 113 | logging.error("keep_other_field set to True, but key: %s not found in curr_page: %s" % (each_key, self.base_url)) 114 | return item 115 | item[each_key] = json_result[each_key] 116 | return item 117 | 118 | def combine(item): 119 | result = self.origin_filter(item) if self.origin_filter else item 120 | if result is not None: 121 | return next_filter(result) 122 | 123 | if self.need_keep_fields is None: 124 | # first time to generate field map 125 | self.need_keep_fields = dict() 126 | for key in self.config.keep_fields: 127 | if key not in json_result: 128 | logging.error("key: %s not in page response, not going to add this filed in the following result" % (key, )) 129 | continue 130 | self.need_keep_fields[key] = json_result[key] 131 | self.origin_filter = self.config.filter 132 | 133 | if not self.need_keep_fields or not self.config.keep_fields: 134 | return 135 | self.config.filter = combine 136 | 137 | def __aiter__(self): 138 | return self 139 | 140 | async def __anext__(self): 141 | if self.done: 142 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 143 | (self.config.source, self.total_count, self.miss_count)) 144 | if self.config.persistent_writer and (not self.give_up or self.config.persistent_to_disk_if_give_up): 145 | self.config.persistent_writer.add(self.config.source) 146 | self.init_val() 147 | raise StopAsyncIteration 148 | 149 | while True: 150 | result = None # for SourceObject 151 | try: 152 | if self.config.debug_mode: 153 | log_str = "HTTP method: %s, url: %s" % (self.method, self.base_url) 154 | logging.info(log_str) 155 | resp = await self.config.session._request(self.method, self.base_url, headers=self.headers, data=self.config.post_body, timeout=self.config.http_timeout) 156 | text = await resp.text() 157 | # print(text) 158 | result = json.loads(text) 159 | if "data" not in result: 160 | if "retcode" not in result or result["retcode"] not in self.config.success_ret_code: 161 | raise ValueError("Bad retcode: %s" % (str(result["retcode"]) if "retcode" in result else str(result), )) 162 | if self.config.keep_other_fields: 163 | self.generate_new_filter(result) 164 | 165 | except Exception as e: 166 | self.retry_count += 1 167 | logging.error("retry: %d, %s: %s" % (self.retry_count, str(e), self.base_url)) 168 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 169 | if self.retry_count < self.config.max_retry: 170 | continue 171 | else: 172 | # fail 173 | logging.error("Give up, After retry: %d times, Unable to get url: %s, total get %d items, " 174 | "total filtered: %d items, error: %s" % (self.config.max_retry, self.base_url, 175 | self.total_count, self.miss_count, 176 | str(traceback.format_exc()) if "Bad retcode" not in str(e) else str(e))) 177 | self.done = self.give_up = True 178 | if self.config.return_fail: 179 | self.bad_responses.append(SourceObject(result, self.config.tag, self.config.source, self.base_url, self.config.post_body)) 180 | return await self.clear_and_return() 181 | elif self.responses: 182 | return await 
self.clear_and_return() 183 | else: 184 | return await self.__anext__() 185 | 186 | self.request_time += 1 187 | if "data" in result: 188 | # success 189 | self.retry_count = 0 190 | origin_length = len(result["data"]) 191 | 192 | if self.config.filter: 193 | curr_response = [self.config.filter(i) for i in result["data"]] 194 | curr_response = [i for i in curr_response if i] 195 | self.miss_count += origin_length - len(curr_response) 196 | else: 197 | curr_response = result["data"] 198 | self.total_count += origin_length if self.config.exclude_filtered_to_max_limit else len(curr_response) 199 | self.responses.extend(curr_response) 200 | # trim_to_max_limit 201 | if self.config.trim_to_max_limit and self.config.max_limit and self.total_count > self.config.max_limit: 202 | need_trim_items = self.total_count - self.config.max_limit 203 | self.responses = self.responses[:-need_trim_items] 204 | logging.info("trim %d items to fit max_limit: %d" % (need_trim_items, self.config.max_limit)) 205 | self.total_count -= need_trim_items 206 | # check if done 207 | if self.config.done_if is not None and self.config.done_if(curr_response): 208 | self.done = True 209 | return await self.clear_and_return() 210 | 211 | # get next page if success, retry if fail 212 | if "pageToken" in result: 213 | if not result["pageToken"]: 214 | self.done = True 215 | if self.need_return(): 216 | return await self.clear_and_return() 217 | 218 | self.page_token = str(result["pageToken"]) 219 | self.update_base_url() 220 | 221 | elif "retcode" in result and result["retcode"] in self.config.success_ret_code: 222 | self.done = True 223 | if self.need_return(): 224 | return await self.clear_and_return() 225 | return await self.__anext__() 226 | else: 227 | self.retry_count += 1 228 | if self.retry_count >= self.config.max_retry: 229 | logging.error("Give up, After retry: %d times, Unable to get url: %s, total get %d items, " 230 | "total filtered: %d items" % (self.config.max_retry, self.base_url, 231 | self.total_count, self.miss_count)) 232 | self.done = self.give_up = True 233 | if self.need_return(): 234 | return await self.clear_and_return() 235 | 236 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 237 | return await self.__anext__() 238 | 239 | if self.config.max_limit and self.total_count >= self.config.max_limit: 240 | self.done = True 241 | return await self.clear_and_return() 242 | elif len(self.responses) >= self.config.per_limit: 243 | return await self.clear_and_return() 244 | elif self.done: 245 | # buffer has empty data, and done fetching 246 | return await self.__anext__() 247 | 248 | if self.request_time % self.config.report_interval == 0: 249 | logging.info("After request %d pages, current item count(%d) < per_limit(%d), latest request page: %s" % 250 | (self.request_time, len(self.responses), self.config.per_limit, self.base_url)) 251 | 252 | def __iter__(self): 253 | raise ValueError("APIGetter must be used with async generator, not normal generator") 254 | 255 | async def clear_and_return(self): 256 | self.request_time = 0 257 | if self.config.return_fail: 258 | resp, bad_resp = self.responses, self.bad_responses 259 | self.responses, self.bad_responses = list(), list() 260 | if self.call_back is not None: 261 | r = self.call_back(resp, bad_resp) 262 | if inspect.iscoroutine(r): 263 | # bind function for coroutine 264 | self.async_call_back = self.call_back 265 | self.call_back = None 266 | return await r 267 | return r 268 | elif self.async_call_back is 
not None: 269 | return await self.async_call_back(resp, bad_resp) 270 | else: 271 | return resp, bad_resp 272 | else: 273 | resp = self.responses 274 | self.responses = list() 275 | if self.call_back is not None: 276 | r = self.call_back(resp) 277 | if inspect.iscoroutine(r): 278 | # bind function for coroutine 279 | self.async_call_back = self.call_back 280 | self.call_back = None 281 | return await r 282 | return r 283 | elif self.async_call_back is not None: 284 | return await self.async_call_back(resp) 285 | else: 286 | return resp 287 | 288 | def need_return(self): 289 | return self.responses or (self.config.return_fail and (self.responses or self.bad_responses)) 290 | 291 | 292 | class APIBulkGetter(BaseGetter): 293 | def __init__(self, config): 294 | super().__init__() 295 | self.config = config 296 | self.async_api_configs = AsyncGenerator(self.config.sources, self.to_config) 297 | 298 | self.pending_tasks = list() 299 | self.buffers = list() 300 | self.bad_buffers = list() 301 | self.success_task = 0 302 | self.curr_size = 0 303 | self.curr_bad_size = 0 304 | self.persistent_writer = None 305 | self.skip_num = 0 306 | 307 | def to_config(self, item): 308 | if isinstance(item, RAPIConfig): 309 | r = item 310 | else: 311 | r = RAPIConfig(item, session=self.config.session, filter_=self.config.filter, 312 | return_fail=self.config.return_fail, done_if=self.config.done_if, 313 | trim_to_max_limit=self.config.trim_to_max_limit, 314 | exclude_filtered_to_max_limit=self.config.exclude_filtered_to_max_limit, 315 | persistent_to_disk_if_give_up=self.config.persistent_to_disk_if_give_up, 316 | debug_mode=self.config.debug_mode, http_headers=self.config.http_headers) 317 | # persistent 318 | if self.config.persistent: 319 | if not self.config.persistent_key: 320 | self.config.persistent_key = hashlib.md5(r.source.encode("utf8")).hexdigest() 321 | if self.persistent_writer is None: 322 | self.persistent_writer = PersistentWriter(self.config.persistent_key) 323 | r.persistent_writer = self.persistent_writer 324 | return r 325 | 326 | async def fetch_items(self, api_config): 327 | if api_config.return_fail: 328 | async for items, bad_items in APIGetter(api_config): 329 | if self.config.return_fail: 330 | self.bad_buffers.extend(bad_items) 331 | self.buffers.extend(items) 332 | else: 333 | async for items in APIGetter(api_config): 334 | self.buffers.extend(items) 335 | 336 | async def fill_tasks(self): 337 | if len(self.pending_tasks) >= self.config.concurrency: 338 | return 339 | 340 | async for api_config in self.async_api_configs: 341 | # skip already done task 342 | if self.config.persistent: 343 | if api_config.source in self.persistent_writer: 344 | self.skip_num += 1 345 | continue 346 | self.pending_tasks.append(self.fetch_items(api_config)) 347 | if len(self.pending_tasks) >= self.config.concurrency: 348 | self.persistent() 349 | return 350 | 351 | self.persistent() 352 | 353 | def __aiter__(self): 354 | return self 355 | 356 | async def __anext__(self): 357 | await self.fill_tasks() 358 | while self.pending_tasks: 359 | done, pending = await asyncio.wait(self.pending_tasks, timeout=self.config.interval) 360 | self.pending_tasks = list(pending) 361 | self.success_task += len(done) 362 | if self.buffers or (self.config.return_fail and (self.buffers or self.bad_buffers)): 363 | return self.clear_and_return() 364 | else: 365 | # after interval seconds, no item fetched 366 | await self.fill_tasks() 367 | log_str = "After %.2f seconds, no new item fetched, current done task: %d, pending 
tasks: %d" % (float(self.config.interval), self.success_task, len(self.pending_tasks)) 368 | if self.config.persistent: 369 | log_str += ", skip %d already finished tasks with persistent mode on" % (self.skip_num, ) 370 | logging.info(log_str) 371 | continue 372 | 373 | ret_log = "APIBulkGetter Done, total perform: %d tasks, fetch: %d items" % (self.success_task, self.curr_size) 374 | if self.config.return_fail: 375 | ret_log += ", fail: %d items" % (self.curr_bad_size, ) 376 | if self.config.persistent: 377 | ret_log += ", skip %d already finished tasks with persistent mode on" % (self.skip_num,) 378 | logging.info(ret_log) 379 | if self.config.persistent: 380 | self.persistent_writer.clear(self.config.persistent_start_fresh_if_done) 381 | raise StopAsyncIteration 382 | 383 | def __iter__(self): 384 | raise ValueError("APIBulkGetter must be used with async generator, not normal generator") 385 | 386 | def clear_and_return(self): 387 | if self.config.return_fail: 388 | buffers, bad_buffers = self.buffers, self.bad_buffers 389 | self.curr_size += len(self.buffers) 390 | self.curr_bad_size += len(self.bad_buffers) 391 | self.buffers, self.bad_buffers = list(), list() 392 | return buffers, bad_buffers 393 | else: 394 | buffers = self.buffers 395 | self.curr_size += len(self.buffers) 396 | self.buffers = list() 397 | return buffers 398 | 399 | def persistent(self): 400 | # persistent task to file 401 | if self.config.persistent: 402 | self.persistent_writer.write() 403 | # logging.info("persistent mode on, after sync, totally skip %d already finished tasks" % (self.skip_num,)) 404 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConfigUtil/WriterConfig.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aioredis 3 | import inspect 4 | 5 | try: 6 | import aiomysql 7 | except Exception as e: 8 | pass 9 | 10 | try: 11 | import motor.motor_asyncio 12 | except Exception as e: 13 | pass 14 | 15 | try: 16 | import confluent_kafka 17 | except Exception: 18 | pass 19 | 20 | 21 | from .BaseConfig import BaseWriterConfig 22 | from ..ESConfig import get_es_client 23 | from ..DefaultValue import DefaultVal 24 | 25 | 26 | class WCSVConfig(BaseWriterConfig): 27 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, encoding=DefaultVal.default_encoding, 28 | headers=None, filter_=None, expand=None, qsn=DefaultVal.qsn, 29 | quotechar=DefaultVal.default_quote_char, **kwargs): 30 | """ 31 | :param filename: filename to write 32 | :param mode: file open mode, i.e "w" or "a+" 33 | :param encoding: file encoding i.e "utf8" 34 | :param headers: csv headers in first row, if not set, automatically extract in first bulk of items 35 | :param filter_: run "transform --help" to see command line interface explanation for detail 36 | :param expand: run "transform --help" to see command line interface explanation for detail 37 | :param qsn: run "transform --help" to see command line interface explanation for detail 38 | :param quotechar: run "transform --help" to see command line interface explanation for detail 39 | :param kwargs: 40 | 41 | Example: 42 | ... 
43 | csv_config = WCSVConfig("./result.csv", encoding="utf8", headers=["likeCount", "id", "title"]) 44 | with ProcessFactory.create_writer(csv_config) as csv_writer: 45 | async for items in es_getter: 46 | # do whatever you want with items 47 | csv_writer.write(items) 48 | """ 49 | super().__init__() 50 | self.filename = filename 51 | self.encoding = encoding 52 | self.mode = mode 53 | self.headers = headers 54 | self.filter = filter_ 55 | self.expand = expand 56 | self.qsn = qsn 57 | self.quotechar = quotechar 58 | 59 | 60 | class WESConfig(BaseWriterConfig): 61 | def __init__(self, indices, doc_type=None, filter_=None, expand=None, id_hash_func=DefaultVal.default_id_hash_func, 62 | appCode=None, actions=None, createDate=None, error_if_fail=True, timeout=None, max_retry=None, 63 | random_min_sleep=None, random_max_sleep=None, auto_insert_createDate=True, hosts=None, headers=None, 64 | **kwargs): 65 | """ 66 | :param indices: elasticsearch indices 67 | :param doc_type: elasticsearch doc_type 68 | :param filter_: run "transform --help" to see command line interface explanation for detail 69 | :param expand: run "transform --help" to see command line interface explanation for detail 70 | :param id_hash_func: function to generate id_ for each item 71 | :param appCode: if not None, add appCode to each item before write to es 72 | :param actions: if not None, will set actions to user define actions, else default actions is 'index' 73 | :param createDate: if not None, add createDate to each item before write to es 74 | :param error_if_fail: if True, log to error if fail to insert to es, else log nothing 75 | :param timeout: http connection timeout when connect to es, seconds 76 | :param max_retry: if request fail, retry max_retry times 77 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 78 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 79 | :param auto_insert_createDate: whether insert createDate for each item automatic -> boolean 80 | :param hosts: elasticsearch hosts, list type, i.e: ["localhost:8888", "127.0.0.2:8889"] 81 | :param headers: headers when perform http requests to elasticsearch, dict type, i.e: {"Host": "aaa", "apikey": "bbb"} 82 | :param kwargs: 83 | 84 | Example: 85 | ... 
86 | es_config = WESConfig("post20170630", "news") 87 | with ProcessFactory.create_writer(es_config) as es_writer: 88 | # asyncio function must call with await 89 | await es_writer.write(items) 90 | """ 91 | super().__init__() 92 | 93 | if not random_min_sleep: 94 | random_min_sleep = DefaultVal.random_min_sleep 95 | if not random_max_sleep: 96 | random_max_sleep = DefaultVal.random_max_sleep 97 | if not max_retry: 98 | max_retry = DefaultVal.max_retry 99 | 100 | if not DefaultVal.main_config.has_es_configured: 101 | raise ValueError("You must config es_hosts before using Elasticsearch, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 102 | 103 | self.indices = indices 104 | self.doc_type = doc_type 105 | self.filter = filter_ 106 | self.expand = expand 107 | self.id_hash_func = id_hash_func 108 | self.es_client = get_es_client(hosts=hosts, headers=headers) 109 | self.app_code = appCode 110 | self.actions = actions 111 | self.create_date = createDate 112 | self.error_if_fail = error_if_fail 113 | self.timeout = timeout 114 | self.max_retry = max_retry 115 | self.random_min_sleep = random_min_sleep 116 | self.random_max_sleep = random_max_sleep 117 | self.auto_insert_createDate = auto_insert_createDate 118 | 119 | 120 | class WJsonConfig(BaseWriterConfig): 121 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, encoding=DefaultVal.default_encoding, 122 | expand=None, filter_=None, new_line=DefaultVal.new_line, **kwargs): 123 | """ 124 | :param filename: filename to write 125 | :param mode: file open mode, i.e "w" or "a+" 126 | :param encoding: file encoding i.e "utf8" 127 | :param expand: run "transform --help" to see command line interface explanation for detail 128 | :param filter_: run "transform --help" to see command line interface explanation for detail 129 | :param new_line: new_line separator for each item, default is "\n" 130 | :param kwargs: 131 | 132 | Example: 133 | ... 134 | json_config = WJsonConfig("./result.json") 135 | with ProcessFactory.create_writer(json_config) as json_writer: 136 | async for items in es_getter: 137 | json_writer.write(items) 138 | """ 139 | super().__init__() 140 | self.filename = filename 141 | self.mode = mode 142 | self.encoding = encoding 143 | self.expand = expand 144 | self.filter = filter_ 145 | self.new_line = new_line 146 | 147 | 148 | class WTXTConfig(BaseWriterConfig): 149 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, encoding=DefaultVal.default_encoding, 150 | expand=None, filter_=None, new_line=DefaultVal.new_line, join_val=DefaultVal.join_val, **kwargs): 151 | """ 152 | :param filename: filename to write 153 | :param mode: file open mode, i.e "w" or "a+" 154 | :param encoding: file encoding i.e "utf8" 155 | :param expand: run "transform --help" to see command line interface explanation for detail 156 | :param filter_: run "transform --help" to see command line interface explanation for detail 157 | :param new_line: new_line separator for each item, default is "\n" 158 | :param join_val: space separator for each key in each item, default is " " 159 | :param kwargs: 160 | 161 | Example: 162 | ...
163 | txt_config = WTXTConfig("./result.txt") 164 | with ProcessFactory.create_writer(txt_config) as txt_writer: 165 | async for items in es_getter: 166 | txt_writer.write(items) 167 | """ 168 | super().__init__() 169 | self.filename = filename 170 | self.mode = mode 171 | self.encoding = encoding 172 | self.expand = expand 173 | self.filter = filter_ 174 | self.new_line = new_line 175 | self.join_val = join_val 176 | 177 | 178 | class WXLSXConfig(BaseWriterConfig): 179 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, title=DefaultVal.title, expand=None, filter_=None, headers=None, sheet_index=0, **kwargs): 180 | """ 181 | :param filename: filename to write 182 | :param mode: file open mode, i.e "w" or "a+" 183 | :param title: sheet title 184 | :param expand: run "transform --help" to see command line interface explanation for detail 185 | :param filter_: run "transform --help" to see command line interface explanation for detail 186 | :param headers: xlsx headers in first row, if not set, automatically extract in first bulk of items 187 | :param sheet_index: which sheet to get, 0 means 0th sheet, only work for append mode 188 | :param kwargs: 189 | 190 | Example: 191 | ... 192 | xlsx_config = WXLSXConfig("./result.xlsx") 193 | with ProcessFactory.create_writer(xlsx_config) as xlsx_writer: 194 | async for items in es_getter: 195 | xlsx_writer.write(items) 196 | """ 197 | super().__init__() 198 | self.filename = filename 199 | self.mode = mode 200 | self.title = title 201 | self.expand = expand 202 | self.filter = filter_ 203 | self.headers = headers 204 | self.sheet_index = sheet_index 205 | 206 | 207 | class WRedisConfig(BaseWriterConfig): 208 | def __init__(self, key, key_type="LIST", filter_=None, host=None, port=None, db=None, password=None, timeout=None, 209 | encoding=None, direction=None, max_retry=None, random_min_sleep=None, random_max_sleep=None, 210 | compress=None, **kwargs): 211 | """ 212 | :param key: redis key to write data 213 | :param key_type: redis data type to operate, current only support LIST, HASH 214 | :param filter_: run "transform --help" to see command line interface explanation for detail 215 | :param host: redis host -> str 216 | :param port: redis port -> int 217 | :param db: redis database number -> int 218 | :param password: redis password -> int 219 | :param timeout: timeout per redis connection -> float 220 | :param encoding: redis object encoding -> str 221 | :param direction: "L" or "R", lpush or rpush 222 | :param compress: whether compress data use zlib before write to redis -> boolean 223 | :param kwargs: 224 | 225 | Example: 226 | redis_config = WRedisConfig("my_key") 227 | with ProcessFactory.create_writer(redis_config) as redis_writer: 228 | async for items in es_getter: 229 | await redis_writer.write(items) 230 | """ 231 | super().__init__() 232 | # load default value 233 | if not random_min_sleep: 234 | random_min_sleep = DefaultVal.random_min_sleep 235 | if not random_max_sleep: 236 | random_max_sleep = DefaultVal.random_max_sleep 237 | if not max_retry: 238 | max_retry = DefaultVal.max_retry 239 | if host is None: 240 | host = DefaultVal.redis_host 241 | if port is None: 242 | port = DefaultVal.redis_port 243 | if db is None: 244 | db = DefaultVal.redis_db 245 | if password is None: 246 | password = DefaultVal.redis_password 247 | if timeout is None: 248 | timeout = DefaultVal.redis_timeout 249 | if encoding is None: 250 | encoding = DefaultVal.redis_encoding 251 | if direction is None: 252 | direction = 
DefaultVal.redis_direction 253 | if compress is None: 254 | compress = DefaultVal.redis_compress 255 | 256 | # check value 257 | if not DefaultVal.main_config.has_redis_configured and port <= 0: 258 | raise ValueError("You must config redis before using Redis, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 259 | if key_type not in ("LIST", "HASH"): 260 | raise ValueError("key_type must be one of (%s)" % (str(("LIST", )), )) 261 | if not encoding: 262 | raise ValueError("You must specific encoding, since I am going to load each object in json format, " 263 | "and treat it as dictionary in python") 264 | if not password: 265 | password = None 266 | 267 | self.redis_pool_cli = None 268 | self.key = key 269 | self.host = host 270 | self.port = port 271 | self.db = db 272 | self.password = password 273 | self.encoding = encoding 274 | self.timeout = timeout 275 | 276 | self.key_type = key_type 277 | self.filter = filter_ 278 | 279 | self.name = "%s_%s->%s" % (str(host), str(port), str(key)) 280 | 281 | self.redis_write_method = None 282 | self.direction = direction 283 | self.max_retry = max_retry 284 | self.random_min_sleep = random_min_sleep 285 | self.random_max_sleep = random_max_sleep 286 | self.compress = compress 287 | 288 | if key_type == "LIST": 289 | self.is_range = True 290 | else: 291 | self.is_range = False 292 | 293 | async def get_redis_pool_cli(self): 294 | """ 295 | :return: an async redis client 296 | """ 297 | if self.redis_pool_cli is None: 298 | kwargs = { 299 | "db": int(self.db), 300 | "password": self.password, 301 | "encoding": self.encoding, 302 | "timeout": self.timeout, 303 | "minsize": 1, 304 | "maxsize": 3 305 | } 306 | if self.compress: 307 | del kwargs["encoding"] 308 | self.redis_pool_cli = await aioredis.create_redis_pool((self.host, self.port), **kwargs) 309 | if self.key_type == "LIST": 310 | if self.direction == "L": 311 | self.redis_write_method = self.redis_pool_cli.lpush 312 | else: 313 | self.redis_write_method = self.redis_pool_cli.rpush 314 | else: 315 | self.redis_write_method = self.redis_pool_cli.hset 316 | 317 | return self.redis_pool_cli 318 | 319 | 320 | class WMySQLConfig(BaseWriterConfig): 321 | def __init__(self, table, filter_=None, max_retry=None, random_min_sleep=None, random_max_sleep=None, 322 | host=None, port=None, user=None, password=None, database=None, charset=None, loop=None, **kwargs): 323 | """ 324 | :param table: mysql table 325 | :param filter_: run "transform --help" to see command line interface explanation for detail 326 | :param max_retry: if request fail, retry max_retry times 327 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 328 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 329 | :param host: mysql host -> str 330 | :param port: mysql port -> int 331 | :param user: mysql user -> str 332 | :param password: mysql password -> str 333 | :param database: mysql database -> str 334 | :param charset: default utf8 -> str 335 | :param loop: async loop instance 336 | :param kwargs: 337 | 338 | Example: 339 | mysql_config = WMySQLConfig("my_table") 340 | mysql_writer = ProcessFactory.create_writer(mysql_config) 341 | async for items in redis_getter: 342 | await mysql_writer.write(items) 343 | """ 344 | super().__init__() 345 | if not random_min_sleep: 346 | random_min_sleep = DefaultVal.random_min_sleep 347 | if not random_max_sleep: 348 | random_max_sleep = 
DefaultVal.random_max_sleep 349 | if not max_retry: 350 | max_retry = DefaultVal.max_retry 351 | if not host: 352 | host = DefaultVal.mysql_host 353 | if not port: 354 | port = DefaultVal.mysql_port 355 | if not user: 356 | user = DefaultVal.mysql_user 357 | if not password: 358 | password = DefaultVal.mysql_password 359 | if not database: 360 | database = DefaultVal.mysql_database 361 | if not charset: 362 | charset = DefaultVal.mysql_encoding 363 | 364 | if not DefaultVal.main_config.has_mysql_configured and port <= 0: 365 | raise ValueError("You must config mysql before using MySQL, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 366 | if "aiomysql" not in globals(): 367 | raise ValueError("module mysql disabled, please reinstall " 368 | "requirements with python version higher than 3.5.3 to enable it") 369 | 370 | self.table = table 371 | self.database = database 372 | 373 | self.max_retry = max_retry 374 | self.random_min_sleep = random_min_sleep 375 | self.random_max_sleep = random_max_sleep 376 | self.filter = filter_ 377 | 378 | self.name = "%s->%s" % (self.database, self.table) 379 | 380 | self.host = host 381 | self.port = port 382 | self.user = user 383 | if not password: 384 | password = '' 385 | self.password = password 386 | self.database = database 387 | self.charset = charset 388 | 389 | if not loop: 390 | loop = asyncio.get_event_loop() 391 | self.loop = loop 392 | self.mysql_pool_cli = self.connection = self.cursor = None 393 | 394 | async def get_mysql_pool_cli(self): 395 | """ 396 | :return: an async mysql client 397 | """ 398 | if self.mysql_pool_cli is None: 399 | self.mysql_pool_cli = await aiomysql.create_pool(host=self.host, port=self.port, user=self.user, 400 | password=self.password, db=self.database, loop=self.loop, 401 | minsize=1, maxsize=3, charset=self.charset) 402 | self.connection = await self.mysql_pool_cli.acquire() 403 | self.cursor = await self.connection.cursor() 404 | return self.mysql_pool_cli 405 | 406 | def free_resource(self): 407 | if self.mysql_pool_cli is not None: 408 | self.mysql_pool_cli.release(self.connection) 409 | self.mysql_pool_cli.close() 410 | self.loop.create_task(self.mysql_pool_cli.wait_closed()) 411 | self.mysql_pool_cli = self.connection = self.cursor = None 412 | 413 | 414 | class WMongoConfig(BaseWriterConfig): 415 | def __init__(self, collection, id_hash_func=DefaultVal.default_id_hash_func, max_retry=None, random_min_sleep=None, 416 | random_max_sleep=None, filter_=None, protocol=None, host=None, port=None, username=None, password=None, 417 | database=None, other_params=None, auto_insert_createDate=False, createDate=None, **kwargs): 418 | """ 419 | :param collection: collection name 420 | :param id_hash_func: function to generate id_ for each item, only if "_id" not in item will I use 'id_hash_func' to generate "_id" 421 | :param return_source: if set to True, will return [item , ..., itemN], item is the "_source" object 422 | if set to False, will return whatever elasticsearch return, i.e {"hits": {"total": ...}} 423 | :param max_retry: if request fail, retry max_retry times 424 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 425 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 426 | :param filter_: run "transform --help" to see command line interface explanation for detail 427 | :param protocol: connection url protocol 428 | :param host: mongodb host -> str 429 | :param port: 
mongodb port -> int 430 | :param user: mongodb user -> str 431 | :param password: mongodb password -> str 432 | :param database: mongodb database -> str 433 | :param other_params: connection url's params after ? 434 | :param createDate: if not None, add createDate to each item before write to mongodb 435 | :param auto_insert_createDate: whether insert createDate for each item automatic -> boolean 436 | :param kwargs: 437 | 438 | Example: 439 | data = [json_obj, json_obj, json_obj] 440 | mongo_config = WMongoConfig("my_coll") 441 | async with ProcessFactory.create_writer(mongo_config) as mongo_writer: 442 | await mongo_writer.write(data) 443 | """ 444 | super().__init__() 445 | if not random_min_sleep: 446 | random_min_sleep = DefaultVal.random_min_sleep 447 | if not random_max_sleep: 448 | random_max_sleep = DefaultVal.random_max_sleep 449 | if not max_retry: 450 | max_retry = DefaultVal.max_retry 451 | if not host: 452 | host = DefaultVal.mongo_host 453 | if not port: 454 | port = DefaultVal.mongo_port 455 | if not username: 456 | username = DefaultVal.mongo_username 457 | if not password: 458 | password = DefaultVal.mongo_password 459 | if not database: 460 | database = DefaultVal.mongo_database 461 | if not protocol: 462 | protocol = DefaultVal.mongo_protocol 463 | else: 464 | raise ValueError("Must define URI Scheme in mongo") 465 | if not other_params: 466 | other_params = DefaultVal.mongo_other_params 467 | 468 | if not DefaultVal.main_config.has_mongo_configured: 469 | raise ValueError("You must config MongoDB before using MongoDB, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 470 | if "motor" not in globals(): 471 | raise ValueError("module motor disabled, please reinstall " 472 | "requirements in linux") 473 | 474 | self.collection = collection 475 | self.max_retry = max_retry 476 | self.random_min_sleep = random_min_sleep 477 | self.random_max_sleep = random_max_sleep 478 | self.filter = filter_ 479 | if "srv" in protocol: 480 | try: 481 | import dns # required for mongodb connecting with SRV 482 | except Exception: 483 | raise ValueError("can't find dnspython, install it first!") 484 | self.protocol = protocol 485 | self.host = host 486 | self.port = port 487 | self.username = username 488 | self.password = password 489 | self.database = database 490 | self.other_params = other_params 491 | self.name = "%s->%s" % (self.database, self.collection) 492 | self.id_hash_func = id_hash_func 493 | self.auto_insert_createDate = auto_insert_createDate 494 | self.createDate = createDate 495 | 496 | self.client = self.collection_cli = None 497 | 498 | def get_mongo_cli(self): 499 | if self.client is None: 500 | kwargs = { 501 | "host": self.host, 502 | "port": self.port 503 | } 504 | if self.protocol and self.username: 505 | if "srv" in self.protocol: # mongodb+srv must not include port number 506 | self.client = motor.motor_asyncio.AsyncIOMotorClient( 507 | "%s://%s:%s@%s/%s?%s" % (self.protocol, self.username, self.password, kwargs["host"], 508 | self.database, self.other_params)) 509 | else: 510 | self.client = motor.motor_asyncio.AsyncIOMotorClient( 511 | "%s://%s:%s@%s:%s/%s?%s" % (self.protocol, self.username, self.password, kwargs["host"], 512 | str(kwargs["port"]), self.database, self.other_params)) 513 | else: 514 | self.client = motor.motor_asyncio.AsyncIOMotorClient(**kwargs) 515 | self.collection_cli = self.client[self.database][self.collection] 516 | return self.client 517 | 518 | 519 | class WKafkaConfig(BaseWriterConfig): 520 | def __init__(self, 
max_retry=None, random_min_sleep=None, 521 | random_max_sleep=None, filter_=None, bootstrap_servers=None, **kwargs): 522 | """ 523 | :param max_retry: if request fail, retry max_retry times 524 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 525 | :param random_max_sleep: if request fail, random sleep at most random_max_sleep seconds before request again 526 | :param filter_: run "transform --help" to see command line interface explanation for detail 527 | :param bootstrap_servers: kafka bootstrap.servers -> str 528 | :param kwargs: 529 | 530 | Example: 531 | data = [json_obj, json_obj, json_obj] 532 | kafka_config = WKafkaConfig() 533 | async with ProcessFactory.create_writer(kafka_config) as kafka_writer: 534 | await kafka_writer.write(data) 535 | """ 536 | super().__init__() 537 | if not random_min_sleep: 538 | random_min_sleep = DefaultVal.random_min_sleep 539 | if not random_max_sleep: 540 | random_max_sleep = DefaultVal.random_max_sleep 541 | if not max_retry: 542 | max_retry = DefaultVal.max_retry 543 | if not bootstrap_servers: 544 | bootstrap_servers = DefaultVal.kafka_bootstrap_servers 545 | else: 546 | raise ValueError("Must define bootstrap.servers in kafka") 547 | 548 | if not DefaultVal.main_config.has_kafka_configured: 549 | raise ValueError("You must config kafka before using Kafka, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 550 | if "confluent_kafka" not in globals(): 551 | raise ValueError("module confluent_kafka disabled, please reinstall " 552 | "requirements in linux") 553 | 554 | self.max_retry = max_retry 555 | self.random_min_sleep = random_min_sleep 556 | self.random_max_sleep = random_max_sleep 557 | self.filter = filter_ 558 | self.bootstrap_servers = bootstrap_servers 559 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConfigUtil/GetterConfig.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import inspect 4 | import aioredis 5 | 6 | try: 7 | import aiomysql 8 | except Exception as e: 9 | pass 10 | 11 | try: 12 | import motor.motor_asyncio 13 | except Exception as e: 14 | pass 15 | 16 | from aiohttp.client import sentinel 17 | from .BaseConfig import BaseGetterConfig 18 | 19 | from ..ESConfig import get_es_client 20 | from ..DefaultValue import DefaultVal 21 | from ..ConnectorConfig import session_manger 22 | 23 | 24 | class RAPIConfig(BaseGetterConfig): 25 | def __init__(self, source, per_limit=DefaultVal.per_limit, max_limit=DefaultVal.max_limit, 26 | max_retry=DefaultVal.max_retry, random_min_sleep=None, random_max_sleep=None, session=None, 27 | filter_=None, return_fail=False, tag=None, call_back=None, report_interval=10, success_ret_code=None, 28 | done_if=None, trim_to_max_limit=DefaultVal.trim_to_max_limit, 29 | exclude_filtered_to_max_limit=DefaultVal.exclude_filtered_to_max_limit, post_body=None, 30 | persistent_writer=None, persistent_to_disk_if_give_up=True, debug_mode=False, keep_other_fields=False, 31 | keep_fields=("dataType", "appCode"), http_headers=None, http_timeout=None, **kwargs): 32 | """ 33 | will request until no more next_page to get, or get "max_limit" items 34 | 35 | :param source: API to get, i.e. "http://..."
36 | :param per_limit: how many items to get per time (counter will add each item after filter) 37 | :param max_limit: get at most max_limit items, if not set, get all (counter will add each item before filter) 38 | :param max_retry: if request fail, retry max_retry times 39 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 40 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 41 | :param session: aiohttp session to perform request 42 | :param filter_: run "transform --help" to see command line interface explanation for detail 43 | :param return_fail: if set to True, for each iteration, will return a tuple, 44 | api_getter = ProcessFactory.create_getter(RAPIConfig("http://...")) 45 | async for items, bad_objects in getter: 46 | A = bad_objects[0] 47 | A.response: -> json object: '{"appCode": "weixinpro", "dataType": "post", "message": "param error", "retcode": "100005"}', if fail in request, response will be None 48 | A.tag: -> tag you pass to RAPIConfig 49 | A.source: -> source you pass to RAPIConfig 50 | A.post_body: -> http post body 51 | 52 | :param call_back: a function(can be async function) to call on results before each "async for" return 53 | :param report_interval: an integer value, if set to 5, after 5 request times, current response counter still 54 | less than 'per_limit', the "async for' won't return to user, there's going to be an INFO log to tell user what happen 55 | :param success_ret_code: ret_code indicate success, default is ("100002", "100301", "100103") ===> ("search no result", "account not found", "account processing") 56 | :param done_if: the APIGetter will automatically fetch next page until max_limit or no more page, if you provide a function, APIGetter will terminate fetching next page when done_if(items) return True 57 | :param trim_to_max_limit: set max_limit to the precise value, default max_limit is rough value 58 | :param exclude_filtered_to_max_limit: max_limit including filtered object or excluding filtered object 59 | :param post_body: POST with post_body instead of get 60 | :param persistent_writer: corporate with RAPIBulkConfig 61 | :param persistent_to_disk_if_give_up: corporate with RAPIBulkConfig, when retry to max_retry times, still fail to get result, whether regard this job as success and persistent to disk or not 62 | :param debug_mode: whether log every http request url 63 | :param keep_other_fields: keep field in "keep_fields" in each json_object 64 | :param http_headers: http_headers, dict object 65 | :param http_timeout: in seconds 66 | :param args: 67 | :param kwargs: 68 | 69 | Example: 70 | api_config = RAPIConfig("http://...") 71 | api_getter = ProcessFactory.create_getter(api_config) 72 | async for items in api_getter: 73 | print(items) 74 | """ 75 | super().__init__() 76 | if not random_min_sleep: 77 | random_min_sleep = DefaultVal.random_min_sleep 78 | if not random_max_sleep: 79 | random_max_sleep = DefaultVal.random_max_sleep 80 | if not success_ret_code: 81 | success_ret_code = DefaultVal.success_ret_code 82 | 83 | self.source = source 84 | self.per_limit = per_limit 85 | self.max_limit = max_limit 86 | self.max_retry = max_retry 87 | self.random_min_sleep = random_min_sleep 88 | self.random_max_sleep = random_max_sleep 89 | self.session = session_manger.get_session() if not session else session 90 | self.filter = filter_ 91 | self.return_fail = return_fail 92 | self.tag = tag 93 | self.call_back = call_back 94 
| self.report_interval = report_interval 95 | self.success_ret_code = success_ret_code 96 | self.done_if = done_if 97 | self.trim_to_max_limit = trim_to_max_limit 98 | self.exclude_filtered_to_max_limit = exclude_filtered_to_max_limit 99 | if post_body: 100 | if not isinstance(post_body, (bytes, str)): 101 | post_body = json.dumps(post_body).encode(DefaultVal.default_encoding) 102 | self.post_body = post_body 103 | self.persistent_writer = persistent_writer 104 | self.persistent_to_disk_if_give_up = persistent_to_disk_if_give_up 105 | self.debug_mode = debug_mode 106 | self.keep_other_fields = keep_other_fields 107 | self.keep_fields = keep_fields 108 | self.http_headers = http_headers 109 | self.http_timeout = http_timeout if http_timeout is not None else sentinel 110 | 111 | 112 | class RCSVConfig(BaseGetterConfig): 113 | def __init__(self, filename, mode=DefaultVal.default_file_mode_r, encoding=DefaultVal.default_encoding, 114 | per_limit=None, max_limit=None, filter_=None, **kwargs): 115 | """ 116 | :param filename: filename to read 117 | :param mode: file open mode, i.e "r" 118 | :param encoding: file encoding i.e "utf8" 119 | :param per_limit: how many items to get per time 120 | :param max_limit: get at most max_limit items, if not set, get all 121 | :param filter_: run "transform --help" to see command line interface explanation for detail 122 | :param kwargs: 123 | 124 | Example: 125 | csv_config = RCSVConfig("./result.csv", encoding="gbk") 126 | csv_getter = ProcessFactory.create_getter(csv_config) 127 | async for items in csv_getter: 128 | print(items) 129 | 130 | # both async generator and generator implemented 131 | for items in csv_getter: 132 | print(items) 133 | """ 134 | super().__init__() 135 | if not per_limit: 136 | per_limit = DefaultVal.per_limit 137 | if not max_limit: 138 | max_limit = DefaultVal.max_limit 139 | 140 | self.filename = filename 141 | self.mode = mode 142 | self.encoding = encoding 143 | self.per_limit = per_limit 144 | self.max_limit = max_limit 145 | self.filter = filter_ 146 | 147 | 148 | class RESConfig(BaseGetterConfig): 149 | def __init__(self, indices, doc_type=None, per_limit=None, max_limit=None, scroll="1m", query_body=None, 150 | return_source=True, max_retry=None, random_min_sleep=None, random_max_sleep=None, filter_=None, 151 | hosts=None, headers=None, **kwargs): 152 | """ 153 | :param indices: elasticsearch indices 154 | :param doc_type: elasticsearch doc_type 155 | :param per_limit: how many items to get per request 156 | :param max_limit: get at most max_limit items, if not set, get all 157 | :param scroll: default is "1m" 158 | :param query_body: default is '{"size": "per_limit", "query": {"match_all": {}}}' 159 | :param return_source: if set to True, will return [item , ..., itemN], item is the "_source" object 160 | if set to False, will return whatever elasticsearch return, i.e {"hits": {"total": ...}} 161 | :param max_retry: if request fail, retry max_retry times 162 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 163 | :param random_max_sleep: if request fail, random sleep at most random_max_sleep seconds before request again 164 | :param filter_: run "transform --help" to see command line interface explanation for detail, 165 | only work if return_source is False 166 | :param hosts: elasticsearch hosts, list type, i.e: ["localhost:8888", "127.0.0.2:8889"] 167 | :param headers: headers when perform http requests to elasticsearch, dict type, i.e: {"Host": "aaa",
"apikey": "bbb"} 168 | :param kwargs: 169 | 170 | Example: 171 | body = { 172 | "size": 100, 173 | "_source": { 174 | "includes": ["likeCount", "id", "title"] 175 | } 176 | } 177 | es_config = RESConfig("post20170630", "news", max_limit=1000, query_body=body) 178 | es_getter = ProcessFactory.create_getter(es_config) 179 | async for items in es_getter: 180 | print(item) 181 | """ 182 | super().__init__() 183 | 184 | if not random_min_sleep: 185 | random_min_sleep = DefaultVal.random_min_sleep 186 | if not random_max_sleep: 187 | random_max_sleep = DefaultVal.random_max_sleep 188 | if not per_limit: 189 | per_limit = DefaultVal.per_limit 190 | if not max_limit: 191 | max_limit = DefaultVal.max_limit 192 | if not max_retry: 193 | max_retry = DefaultVal.max_retry 194 | 195 | if not DefaultVal.main_config.has_es_configured: 196 | raise ValueError("You must config es_hosts before using Elasticsearch, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 197 | 198 | if not query_body: 199 | query_body = { 200 | "size": per_limit, 201 | "query": { 202 | "match_all": {} 203 | } 204 | } 205 | self.query_body = query_body 206 | self.indices = indices 207 | self.doc_type = doc_type 208 | self.per_limit = per_limit 209 | self.max_limit = max_limit 210 | self.scroll = scroll 211 | self.es_client = get_es_client(hosts=hosts, headers=headers) 212 | self.return_source = return_source 213 | self.max_retry = max_retry 214 | self.random_min_sleep = random_min_sleep 215 | self.random_max_sleep = random_max_sleep 216 | self.filter = filter_ 217 | 218 | 219 | class RJsonConfig(BaseGetterConfig): 220 | def __init__(self, filename, mode=DefaultVal.default_file_mode_r, encoding=DefaultVal.default_encoding, 221 | per_limit=None, max_limit=None, filter_=None, **kwargs): 222 | """ 223 | :param filename: line by line json file to read 224 | :param mode: file open mode, i.e "r" 225 | :param encoding: file encoding i.e "utf8" 226 | :param per_limit: how many items to get per time 227 | :param max_limit: get at most max_limit items, if not set, get all 228 | :param filter_: run "transform --help" to see command line interface explanation for detail 229 | :param kwargs: 230 | 231 | Example: 232 | json_config = RJsonConfig("./result.json") 233 | json_getter = ProcessFactory.create_getter(json_config) 234 | async for items in json_getter: 235 | print(items) 236 | 237 | # both async generator and generator implemented 238 | for items in json_getter: 239 | print(items) 240 | """ 241 | super().__init__() 242 | 243 | if not per_limit: 244 | per_limit = DefaultVal.per_limit 245 | if not max_limit: 246 | max_limit = DefaultVal.max_limit 247 | 248 | self.filename = filename 249 | self.mode = mode 250 | self.encoding = encoding 251 | self.per_limit = per_limit 252 | self.max_limit = max_limit 253 | self.filter = filter_ 254 | 255 | 256 | class RXLSXConfig(BaseGetterConfig): 257 | def __init__(self, filename, per_limit=None, max_limit=None, sheet_index=0, filter_=None, **kwargs): 258 | """ 259 | :param filename: filename to read 260 | :param per_limit: how many items to get per time 261 | :param max_limit: get at most max_limit items, if not set, get all 262 | :param sheet_index: which sheet to get, 0 means 0th sheet 263 | :param filter_: run "transform --help" to see command line interface explanation for detail 264 | :param kwargs: 265 | 266 | Example: 267 | xlsx_config = RXLSXConfig("./result.xlsx") 268 | xlsx_getter = ProcessFactory.create_getter(xlsx_config) 269 | async for items in xlsx_getter: 270 | 
print(items) 271 | 272 | # both async generator and generator implemented 273 | for items in xlsx_getter: 274 | print(items) 275 | 276 | """ 277 | super().__init__() 278 | 279 | if not per_limit: 280 | per_limit = DefaultVal.per_limit 281 | if not max_limit: 282 | max_limit = DefaultVal.max_limit 283 | 284 | self.filename = filename 285 | self.per_limit = per_limit 286 | self.max_limit = max_limit 287 | self.sheet_index = sheet_index 288 | self.filter = filter_ 289 | 290 | 291 | class RAPIBulkConfig(BaseGetterConfig): 292 | def __init__(self, sources, interval=DefaultVal.interval, concurrency=None, filter_=None, return_fail=False, 293 | done_if=None, trim_to_max_limit=DefaultVal.trim_to_max_limit, 294 | exclude_filtered_to_max_limit=DefaultVal.exclude_filtered_to_max_limit, persistent=False, 295 | persistent_key=None, persistent_start_fresh_if_done=True, persistent_to_disk_if_give_up=True, 296 | debug_mode=False, http_headers=None, **kwargs): 297 | """ 298 | :param sources: an iterable object (can be async generator), each item must be "url" or instance of RAPIConfig 299 | :param interval: integer or float, each time you call async generator, you will wait for "interval" seconds 300 | and get all items fetched during this "interval", notice if sources is an "async generator", 301 | the "interval" seconds will exclude the time processing the async generator 302 | :param concurrency: how many concurrent tasks run, default read from config file, if concurrency set, 303 | only string(url) in "sources" will work with this concurrency level, RAPIConfig instance won't 304 | :param filter_: run "transform --help" to see command line interface explanation for detail 305 | :param return_fail: if set to True, for each iteration, will return a tuple, 306 | api_getter = ProcessFactory.create_getter(RAPIBulkConfig([...])) 307 | async for items, bad_objects in api_getter: 308 | A = bad_objects[0] 309 | A.response: -> json object: '{"appCode": "weixinpro", "dataType": "post", "message": "param error", "retcode": "100005"}', if fail in request, response will be None 310 | A.tag: -> tag you pass to RAPIConfig 311 | A.source: -> source you pass to RAPIConfig 312 | :param done_if: it will only work if the source[n] is type string, if the source[n] is type RAPIConfig, it won't work, please refer to RAPIConfig for more detail 313 | :param trim_to_max_limit: set max_limit to the precise value, default max_limit is rough value 314 | :param exclude_filtered_to_max_limit: max_limit including filtered object or excluding filtered object 315 | :param persistent: whether save progress to disk, if set to true, the job progress will be persistent to disk every "interval" seconds 316 | :param persistent_key: the key to identify the task 317 | :param persistent_start_fresh_if_done: if all task done, whether remove the persistent record file, if the persistent file hasn't been removed and all of the jobs finished, 318 | next time you run the program, there will be no job to schedule 319 | :param persistent_to_disk_if_give_up: if there's a job fail after retry max_retry times, whether regard this job as success and persistent to disk or not 320 | :param debug_mode: log every http request url 321 | :param http_headers: http_headers, dict object 322 | :param kwargs: 323 | 324 | Example: 325 | sources = ["http://....", "http://....", "http://....", RAPIConfig("http://....")] 326 | bulk_config = RAPIBulkConfig(sources) 327 | bulk_getter = ProcessFactory.create_getter(bulk_config) 328 | async for items in bulk_getter: 329 | print(items)
330 | 331 | """ 332 | super().__init__() 333 | if not concurrency: 334 | concurrency = DefaultVal.main_config["main"].getint("concurrency") 335 | self.sources = sources 336 | self.interval = interval 337 | self.concurrency = concurrency 338 | self.session = session_manger._generate_session(concurrency_limit=concurrency) 339 | self.filter = filter_ 340 | self.return_fail = return_fail 341 | self.done_if = done_if 342 | self.trim_to_max_limit = trim_to_max_limit 343 | self.exclude_filtered_to_max_limit = exclude_filtered_to_max_limit 344 | self.persistent = persistent 345 | self.persistent_key = persistent_key 346 | self.persistent_start_fresh_if_done = persistent_start_fresh_if_done 347 | self.persistent_to_disk_if_give_up = persistent_to_disk_if_give_up 348 | self.debug_mode = debug_mode 349 | self.http_headers = http_headers 350 | 351 | def __del__(self): 352 | if inspect.iscoroutinefunction(self.session.close): 353 | if not self.session.closed: 354 | if self.session._connector is not None and self.session._connector_owner: 355 | self.session._connector.close() 356 | self._connector = None 357 | else: 358 | self.session.close() 359 | 360 | 361 | class RRedisConfig(BaseGetterConfig): 362 | def __init__(self, key, key_type="LIST", per_limit=None, max_limit=None, filter_=None, max_retry=None, 363 | random_min_sleep=None, random_max_sleep=None, host=None, port=None, db=None, password=None, 364 | timeout=None, encoding=None, need_del=None, direction=None, compress=None, **kwargs): 365 | """ 366 | :param key: redis key to get data 367 | :param key_type: redis data type to operate, current only support LIST, HASH 368 | :param per_limit: how many items to get per time 369 | :param max_limit: get at most max_limit items, if not set, get all 370 | :param max_retry: if request fail, retry max_retry times 371 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 372 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 373 | :param filter_: run "transform --help" to see command line interface explanation for detail 374 | :param host: redis host -> str 375 | :param port: redis port -> int 376 | :param db: redis database number -> int 377 | :param password: redis password -> int 378 | :param timeout: timeout per redis connection -> float 379 | :param encoding: redis object encoding -> str 380 | :param need_del: whether need to del the key after get object from redis -> boolean 381 | :param direction: "L" or "R", left to right or roght to left 382 | :param compress: whether compress data use zlib before write to redis -> boolean 383 | :param kwargs: 384 | 385 | Example: 386 | redis_config = RRedisConfig("my_key") 387 | redis_getter = ProcessFactory.create_getter(redis_config) 388 | async for items in redis_getter: 389 | print(items) 390 | """ 391 | super().__init__() 392 | # load default value 393 | if not random_min_sleep: 394 | random_min_sleep = DefaultVal.random_min_sleep 395 | if not random_max_sleep: 396 | random_max_sleep = DefaultVal.random_max_sleep 397 | if not per_limit: 398 | per_limit = DefaultVal.per_limit 399 | if not max_limit: 400 | max_limit = DefaultVal.max_limit 401 | if not max_retry: 402 | max_retry = DefaultVal.max_retry 403 | if host is None: 404 | host = DefaultVal.redis_host 405 | if port is None: 406 | port = DefaultVal.redis_port 407 | if db is None: 408 | db = DefaultVal.redis_db 409 | if password is None: 410 | password = DefaultVal.redis_password 411 | if 
timeout is None: 412 | timeout = DefaultVal.redis_timeout 413 | if encoding is None: 414 | encoding = DefaultVal.redis_encoding 415 | if direction is None: 416 | direction = DefaultVal.redis_direction 417 | if need_del is None: 418 | need_del = DefaultVal.redis_need_del 419 | if compress is None: 420 | compress = DefaultVal.redis_compress 421 | 422 | if not DefaultVal.main_config.has_redis_configured and port <= 0: 423 | raise ValueError("You must config redis before using Redis, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 424 | 425 | if key_type not in ("LIST", "HASH"): 426 | raise ValueError("key_type must be one of (%s)" % (str(("LIST", "HASH")), )) 427 | if not encoding: 428 | raise ValueError("You must specific encoding, since I am going to load each object in json format, " 429 | "and treat it as dictionary in python") 430 | if not password: 431 | password = None 432 | 433 | self.redis_pool_cli = None 434 | self.key = key 435 | self.host = host 436 | self.port = port 437 | self.db = db 438 | self.password = password 439 | self.encoding = encoding 440 | self.timeout = timeout 441 | 442 | self.key_type = key_type 443 | self.per_limit = per_limit 444 | self.max_limit = max_limit 445 | self.filter = filter_ 446 | self.max_retry = max_retry 447 | self.random_min_sleep = random_min_sleep 448 | self.random_max_sleep = random_max_sleep 449 | self.need_del = need_del 450 | 451 | self.name = "%s_%s->%s" % (str(host), str(port), str(key)) 452 | 453 | self.redis_read_method = self.redis_len_method = self.redis_del_method = None 454 | self.direction = direction 455 | self.compress = compress 456 | 457 | if key_type == "LIST": 458 | self.is_range = True 459 | else: 460 | self.is_range = False 461 | 462 | async def get_redis_pool_cli(self): 463 | """ 464 | :return: an async redis client 465 | """ 466 | if self.redis_pool_cli is None: 467 | kwargs = { 468 | "db": int(self.db), 469 | "password": self.password, 470 | "encoding": self.encoding, 471 | "timeout": self.timeout, 472 | "minsize": 1, 473 | "maxsize": 3 474 | } 475 | if self.compress: 476 | del kwargs["encoding"] 477 | self.redis_pool_cli = await aioredis.create_redis_pool((self.host, self.port), **kwargs) 478 | if self.key_type == "LIST": 479 | self.redis_read_method = self.redis_pool_cli.lrange 480 | self.redis_len_method = self.redis_pool_cli.llen 481 | self.redis_del_method = self.redis_pool_cli.delete 482 | else: 483 | self.redis_read_method = self.redis_pool_cli.hgetall 484 | self.redis_len_method = self.redis_pool_cli.hlen 485 | self.redis_del_method = self.redis_pool_cli.delete 486 | 487 | return self.redis_pool_cli 488 | 489 | 490 | class RMySQLConfig(BaseGetterConfig): 491 | def __init__(self, table, per_limit=None, max_limit=None, filter_=None, max_retry=None, random_min_sleep=None, 492 | random_max_sleep=None, host=None, port=None, user=None, password=None, database=None, 493 | charset=None, loop=None, **kwargs): 494 | """ 495 | :param table: mysql table 496 | :param per_limit: how many items to get per time 497 | :param max_limit: get at most max_limit items, if not set, get all 498 | :param filter_: run "transform --help" to see command line interface explanation for detail 499 | :param max_retry: if request fail, retry max_retry times 500 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 501 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 502 | :param host: mysql host -> str 503 
| :param port: mysql port -> int 504 | :param user: mysql user -> str 505 | :param password: mysql password -> str 506 | :param database: mysql database -> str 507 | :param charset: default utf8 -> str 508 | :param loop: async loop instance 509 | :param kwargs: 510 | 511 | Example: 512 | mysql_config = RMySQLConfig("my_table") 513 | mysql_getter = ProcessFactory.create_getter(mysql_config) 514 | async for items in mysql_getter: 515 | print(items) 516 | """ 517 | super().__init__() 518 | 519 | if not random_min_sleep: 520 | random_min_sleep = DefaultVal.random_min_sleep 521 | if not random_max_sleep: 522 | random_max_sleep = DefaultVal.random_max_sleep 523 | if not per_limit: 524 | per_limit = DefaultVal.per_limit 525 | if not max_limit: 526 | max_limit = DefaultVal.max_limit 527 | if not max_retry: 528 | max_retry = DefaultVal.max_retry 529 | if not host: 530 | host = DefaultVal.mysql_host 531 | if not port: 532 | port = DefaultVal.mysql_port 533 | if not user: 534 | user = DefaultVal.mysql_user 535 | if not password: 536 | password = DefaultVal.mysql_password 537 | if not database: 538 | database = DefaultVal.mysql_database 539 | if not charset: 540 | charset = DefaultVal.mysql_encoding 541 | 542 | if not DefaultVal.main_config.has_mysql_configured and port <= 0: 543 | raise ValueError("You must config mysql before using MySQL, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 544 | if "aiomysql" not in globals(): 545 | raise ValueError("module mysql disabled, please reinstall " 546 | "requirements with python version higher than 3.5.3 to enable it") 547 | 548 | self.table = table 549 | self.database = database 550 | 551 | self.max_limit = max_limit 552 | self.per_limit = per_limit 553 | self.max_retry = max_retry 554 | self.random_min_sleep = random_min_sleep 555 | self.random_max_sleep = random_max_sleep 556 | self.filter = filter_ 557 | 558 | self.name = "%s->%s" % (self.database, self.table) 559 | 560 | self.host = host 561 | self.port = port 562 | self.user = user 563 | if not password: 564 | password = '' 565 | self.password = password 566 | self.database = database 567 | self.charset = charset 568 | 569 | if not loop: 570 | loop = asyncio.get_event_loop() 571 | self.loop = loop 572 | self.mysql_pool_cli = self.connection = self.cursor = None 573 | 574 | async def get_mysql_pool_cli(self): 575 | """ 576 | :return: an async mysql client 577 | """ 578 | if self.mysql_pool_cli is None: 579 | self.mysql_pool_cli = await aiomysql.create_pool(host=self.host, port=self.port, user=self.user, 580 | password=self.password, db=self.database, loop=self.loop, 581 | minsize=1, maxsize=3, charset=self.charset) 582 | self.connection = await self.mysql_pool_cli.acquire() 583 | self.cursor = await self.connection.cursor() 584 | return self.mysql_pool_cli 585 | 586 | def free_resource(self): 587 | if self.mysql_pool_cli is not None: 588 | self.mysql_pool_cli.release(self.connection) 589 | self.mysql_pool_cli.close() 590 | self.loop.create_task(self.mysql_pool_cli.wait_closed()) 591 | self.mysql_pool_cli = self.connection = self.cursor = None 592 | 593 | 594 | class RMongoConfig(BaseGetterConfig): 595 | def __init__(self, collection, per_limit=None, max_limit=None, query_body=None, max_retry=None, 596 | random_min_sleep=None, random_max_sleep=None, filter_=None, host=None, port=None, username=None, 597 | password=None, database=None, **kwargs): 598 | """ 599 | :param collection: collection name 600 | :param per_limit: how many items to get per request 601 | :param max_limit: get at
most max_limit items, if not set, get all 602 | :param query_body: search query, default None, i.e: {'i': {'$lt': 5}} 603 | :param return_source: if set to True, will return [item , ..., itemN], item is the "_source" object 604 | if set to False, will return whatever elasticsearch return, i.e {"hits": {"total": ...}} 605 | :param max_retry: if request fail, retry max_retry times 606 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 607 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 608 | :param filter_: run "transform --help" to see command line interface explanation for detail 609 | :param kwargs: 610 | 611 | Example: 612 | mongo_config = RMongoConfig("my_coll") 613 | mongo_getter = ProcessFactory.create_getter(mongo_config) 614 | async for items in mongo_getter: 615 | print(item) 616 | """ 617 | super().__init__() 618 | 619 | if not random_min_sleep: 620 | random_min_sleep = DefaultVal.random_min_sleep 621 | if not random_max_sleep: 622 | random_max_sleep = DefaultVal.random_max_sleep 623 | if not per_limit: 624 | per_limit = DefaultVal.per_limit 625 | if not max_limit: 626 | max_limit = DefaultVal.max_limit 627 | if not max_retry: 628 | max_retry = DefaultVal.max_retry 629 | if not host: 630 | host = DefaultVal.mongo_host 631 | if not port: 632 | port = DefaultVal.mongo_port 633 | if not username: 634 | username = DefaultVal.mongo_username 635 | if not password: 636 | password = DefaultVal.mongo_password 637 | if not database: 638 | database = DefaultVal.mongo_database 639 | 640 | if not DefaultVal.main_config.has_mongo_configured: 641 | raise ValueError("You must config MongoDB before using MongoDB, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 642 | if "motor" not in globals(): 643 | raise ValueError("module motor disabled, please reinstall " 644 | "requirements in linux") 645 | 646 | self.collection = collection 647 | self.query_body = query_body 648 | self.per_limit = per_limit 649 | self.max_limit = max_limit 650 | self.max_retry = max_retry 651 | self.random_min_sleep = random_min_sleep 652 | self.random_max_sleep = random_max_sleep 653 | self.filter = filter_ 654 | self.host = host 655 | self.port = port 656 | self.username = username 657 | self.password = password 658 | self.database = database 659 | self.name = "%s->%s" % (self.database, self.collection) 660 | 661 | self.client = self.cursor = None 662 | 663 | def get_mongo_cli(self): 664 | if self.client is None: 665 | kwargs = { 666 | "host": self.host, 667 | "port": self.port 668 | } 669 | if self.username: 670 | address = "mongodb://%s:%s@%s:%s/%s" % (self.username, self.password, kwargs["host"], str(kwargs["port"]), self.database) 671 | self.client = motor.motor_asyncio.AsyncIOMotorClient(address) 672 | else: 673 | self.client = motor.motor_asyncio.AsyncIOMotorClient(**kwargs) 674 | 675 | if self.query_body: 676 | self.cursor = self.client[self.database][self.collection].find(self.query_body) 677 | else: 678 | self.cursor = self.client[self.database][self.collection].find() 679 | return self.client 680 | --------------------------------------------------------------------------------
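The getter and writer configs above are meant to be paired through ProcessFactory: build a getter config, build a writer config, then stream each batch the getter yields into the writer. A minimal end-to-end sketch, assuming the package exposes ProcessFactory, GetterConfig and WriterConfig at the top level (the import path and the URL are placeholders, not taken from this source tree):

import asyncio
# assumed top-level exports; adjust the import if the package layout differs
from idataapi_transform import ProcessFactory, GetterConfig, WriterConfig


async def api_to_csv():
    # placeholder URL; RAPIConfig keeps requesting next pages until max_limit items or no more pageToken
    api_config = GetterConfig.RAPIConfig("http://example.com/api?kw=test", max_limit=100)
    api_getter = ProcessFactory.create_getter(api_config)
    csv_config = WriterConfig.WCSVConfig("./result.csv", mode="w", encoding="utf8")
    with ProcessFactory.create_writer(csv_config) as csv_writer:
        async for items in api_getter:
            # each iteration yields a batch of (already filtered) items
            csv_writer.write(items)


asyncio.get_event_loop().run_until_complete(api_to_csv())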
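For many sources at once, RAPIBulkConfig wraps a collection of URLs or RAPIConfig instances and fetches them concurrently, yielding whatever arrived during each interval. A sketch under the same assumptions (placeholder URLs), with return_fail=True so failed sources are reported next to the data:

import asyncio
from idataapi_transform import ProcessFactory, GetterConfig  # assumed top-level exports


async def bulk_fetch():
    # plain URL strings and explicit RAPIConfig instances can be mixed in "sources"
    sources = ["http://example.com/a", "http://example.com/b",
               GetterConfig.RAPIConfig("http://example.com/c", max_limit=50)]
    bulk_config = GetterConfig.RAPIBulkConfig(sources, interval=2, return_fail=True)
    bulk_getter = ProcessFactory.create_getter(bulk_config)
    async for items, bad_objects in bulk_getter:
        # each bad_object carries .response / .tag / .source for the failed request
        print("fetched %d items, %d failed sources" % (len(items), len(bad_objects)))


asyncio.get_event_loop().run_until_complete(bulk_fetch())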