├── idataapi_transform
│   ├── DataProcess
│   │   ├── __init__.py
│   │   ├── Meta
│   │   │   ├── __init__.py
│   │   │   └── BaseDataProcess.py
│   │   ├── Config
│   │   │   ├── __init__.py
│   │   │   ├── ConfigUtil
│   │   │   │   ├── __init__.py
│   │   │   │   ├── BaseConfig.py
│   │   │   │   ├── AsyncHelper.py
│   │   │   │   ├── WriterConfig.py
│   │   │   │   └── GetterConfig.py
│   │   │   ├── ConnectorConfig.py
│   │   │   ├── LogConfig.py
│   │   │   ├── DefaultValue.py
│   │   │   ├── MainConfig.py
│   │   │   └── ESConfig.py
│   │   ├── DataGetter
│   │   │   ├── __init__.py
│   │   │   ├── BaseGetter.py
│   │   │   ├── CSVGetter.py
│   │   │   ├── JsonGetter.py
│   │   │   ├── XLSXGetter.py
│   │   │   ├── MongoGetter.py
│   │   │   ├── ESGetter.py
│   │   │   ├── RedisGetter.py
│   │   │   ├── MySQLGetter.py
│   │   │   └── APIGetter.py
│   │   ├── DataWriter
│   │   │   ├── __init__.py
│   │   │   ├── BaseWriter.py
│   │   │   ├── JsonWriter.py
│   │   │   ├── TXTWriter.py
│   │   │   ├── RedisWriter.py
│   │   │   ├── KafkaWriter.py
│   │   │   ├── ESWriter.py
│   │   │   ├── MongoWriter.py
│   │   │   ├── CSVWriter.py
│   │   │   ├── XLSXWriter.py
│   │   │   └── MySQLWriter.py
│   │   ├── PersistentUtil
│   │   │   ├── __init__.py
│   │   │   └── PersistentWriter.py
│   │   └── ProcessFactory.py
│   ├── __init__.py
│   └── cli.py
├── .gitignore
├── idataapi-transform.png
├── requirements.txt
├── pyproject.toml
├── LICENSE
└── README_CN_simple.md

/idataapi_transform/DataProcess/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/Meta/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/Config/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/DataGetter/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/DataWriter/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/PersistentUtil/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/idataapi_transform/DataProcess/Config/ConfigUtil/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
logs/*
test*
.DS_Store
.idea/*
config.ini
*.pyc
venv/*
--------------------------------------------------------------------------------
/idataapi-transform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zpoint/idataapi-transform/HEAD/idataapi-transform.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp
openpyxl
elasticsearch==7.9.0
aioredis>=1.0
PyMySQL>=0.7.5,<0.9;python_version>="3.5.3"
aiomysql
confluent_kafka
--------------------------------------------------------------------------------
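Before the individual source files below, a minimal usage sketch of how the modules in this tree fit together: ProcessFactory builds a getter or writer from a config object, every getter is an async iterator that yields batches of dicts, and the file writers are context managers. The RCSVConfig/WJsonConfig constructor arguments and file names shown here are assumptions (GetterConfig.py and WriterConfig.py are not included in this dump); treat this as a sketch, not the package's documented API.

    import asyncio
    from idataapi_transform import ProcessFactory, GetterConfig, WriterConfig

    async def csv_to_json():
        # Assumed constructor arguments: CSVGetter reads config.filename / config.per_limit,
        # JsonWriter reads config.filename / config.new_line, so a filename is passed here.
        getter = ProcessFactory.create_getter(GetterConfig.RCSVConfig("./source.csv"))
        with ProcessFactory.create_writer(WriterConfig.WJsonConfig("./result.json")) as json_writer:
            async for items in getter:      # each batch is a list of dicts
                json_writer.write(items)    # JsonWriter.write is synchronous

    if __name__ == "__main__":
        asyncio.get_event_loop().run_until_complete(csv_to_json())

Getters yield data in batches (per_limit items at a time) rather than one item per iteration, which is why the writer receives a list on every loop pass.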
/idataapi_transform/DataProcess/Config/ConfigUtil/BaseConfig.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class BaseGetterConfig(object, metaclass=abc.ABCMeta): 5 | @abc.abstractmethod 6 | def __init__(self, *args, **kwargs): 7 | pass 8 | 9 | 10 | class BaseWriterConfig(object, metaclass=abc.ABCMeta): 11 | @abc.abstractmethod 12 | def __init__(self, *args, **kwargs): 13 | pass 14 | 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [tool.flit.metadata] 6 | module="idataapi_transform" 7 | author="zpoint" 8 | author-email="zp0int@qq.com" 9 | home-page="https://github.com/zpoint/idataapi-transform" 10 | classifiers=["License :: OSI Approved :: MIT License"] 11 | requires=["aiohttp", "openpyxl", "elasticsearch-async", "aioredis", "confluent_kafka"] 12 | requires-python=">=3.5.2" 13 | keywords="idataapi transform" 14 | dist-name="idataapi-transform" 15 | 16 | [tool.flit.scripts] 17 | transform="idataapi_transform:main" 18 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/BaseWriter.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from ..Meta.BaseDataProcess import BaseDataProcess 3 | 4 | 5 | class BaseWriter(BaseDataProcess, metaclass=abc.ABCMeta): 6 | @abc.abstractmethod 7 | def __init__(self, *args, **kwargs): 8 | """ 9 | :param config 10 | """ 11 | pass 12 | 13 | @abc.abstractmethod 14 | async def write(self, responses): 15 | pass 16 | 17 | @abc.abstractmethod 18 | def __enter__(self): 19 | pass 20 | 21 | @abc.abstractmethod 22 | def __exit__(self, exc_type, exc_val, exc_tb): 23 | pass 24 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/BaseGetter.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from ..Meta.BaseDataProcess import BaseDataProcess 3 | 4 | 5 | class BaseGetter(BaseDataProcess, metaclass=abc.ABCMeta): 6 | @abc.abstractmethod 7 | def __init__(self, *args, **kwargs): 8 | """ 9 | :param config 10 | config contains attribute: 11 | source: where to read data 12 | per_limit: return at most per_limit data each time 13 | max_limit: return at most max_limit data total 14 | """ 15 | pass 16 | 17 | @abc.abstractmethod 18 | def __aiter__(self): 19 | return self 20 | 21 | @abc.abstractmethod 22 | async def __anext__(self): 23 | pass 24 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConfigUtil/AsyncHelper.py: -------------------------------------------------------------------------------- 1 | class AsyncGenerator(object): 2 | def __init__(self, items, process_func): 3 | self.items = items 4 | if hasattr(self.items, "__aiter__"): 5 | self.is_async = True 6 | else: 7 | self.is_async = False 8 | self.items = self.to_generator(items) 9 | self.process_func = process_func 10 | 11 | def __aiter__(self): 12 | return self 13 | 14 | async def __anext__(self): 15 | if self.is_async: 16 | r = await self.items.__anext__() 17 | return self.process_func(r) 18 | else: 19 | try: 20 | r = next(self.items) 21 | return self.process_func(r) 22 | except StopIteration: 23 | raise 
StopAsyncIteration 24 | 25 | @staticmethod 26 | def to_generator(items): 27 | for i in items: 28 | yield i 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 zpoint 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /idataapi_transform/__init__.py: -------------------------------------------------------------------------------- 1 | """convert data from a format to another format, read or write from file or database, suitable for iDataAPI""" 2 | 3 | from .cli import main 4 | from .DataProcess.Config.ConfigUtil import WriterConfig 5 | from .DataProcess.Config.ConfigUtil import GetterConfig 6 | from .DataProcess.ProcessFactory import ProcessFactory 7 | 8 | 9 | class ManualConfig(object): 10 | @staticmethod 11 | def set_config(ini_path): 12 | from .DataProcess.Config.MainConfig import main_config_box 13 | from .DataProcess.Config.DefaultValue import DefaultVal 14 | main_config_box.read_config(ini_path) 15 | DefaultVal.refresh() 16 | 17 | @staticmethod 18 | def disable_log(): 19 | from .DataProcess.Config.LogConfig import remove_log 20 | remove_log() 21 | 22 | @staticmethod 23 | def set_log_path(log_path, max_log_file_bytes): 24 | """ 25 | :param log_path: directory where log stores, i.e ===> /Desktop/logs/ 26 | :param max_log_file_bytes: max log file size, in bytes, i.e: 5242880(5MB) 27 | :return: 28 | """ 29 | from .DataProcess.Config.MainConfig import main_config_box 30 | from .DataProcess.Config.DefaultValue import DefaultVal 31 | main_config_box.config_log(log_path, max_log_file_bytes) 32 | 33 | 34 | __version__ = '2.0.2' 35 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/JsonWriter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from .BaseWriter import BaseWriter 4 | 5 | 6 | class JsonWriter(BaseWriter): 7 | def __init__(self, config): 8 | super().__init__() 9 | self.config = config 10 | self.total_miss_count = 0 11 | self.success_count = 0 12 | self.f_out = open(self.config.filename, self.config.mode, encoding=self.config.encoding) 13 | 14 | def write(self, responses): 15 | miss_count = 0 16 | for each_response in responses: 17 | if 
self.config.expand: 18 | each_response = self.expand_dict(each_response, max_expand=self.config.expand) 19 | 20 | if self.config.filter: 21 | each_response = self.config.filter(each_response) 22 | if not each_response: 23 | miss_count += 1 24 | continue 25 | self.f_out.write(json.dumps(each_response) + self.config.new_line) 26 | self.success_count += 1 27 | self.total_miss_count += miss_count 28 | logging.info("%s write %d item, filtered %d item" % (self.config.filename, len(responses), miss_count)) 29 | 30 | def __exit__(self, exc_type, exc_val, exc_tb): 31 | self.f_out.close() 32 | logging.info("%s write done, total filtered %d item, total write %d item" % 33 | (self.config.filename, self.total_miss_count, self.success_count)) 34 | 35 | def __enter__(self): 36 | return self 37 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/TXTWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .BaseWriter import BaseWriter 3 | 4 | 5 | class TXTWriter(BaseWriter): 6 | def __init__(self, config): 7 | super().__init__() 8 | self.config = config 9 | self.f_out = open(config.filename, config.mode, encoding=config.encoding) 10 | self.total_miss_count = 0 11 | self.success_count = 0 12 | 13 | def write(self, responses): 14 | miss_count = 0 15 | for each_response in responses: 16 | if self.config.expand: 17 | each_response = self.expand_dict(each_response, max_expand=self.config.expand) 18 | 19 | if self.config.filter: 20 | each_response = self.config.filter(each_response) 21 | if not each_response: 22 | miss_count += 1 23 | continue 24 | 25 | self.f_out.write(self.config.join_val.join(str(value) for value in each_response.values()) + self.config.new_line) 26 | self.success_count += 1 27 | 28 | self.total_miss_count += miss_count 29 | logging.info("%s write %d item, filtered %d item" % (self.config.filename, len(responses), miss_count)) 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | self.f_out.close() 33 | logging.info("%s write done, total filtered %d item, total write %d item" % 34 | (self.config.filename, self.total_miss_count, self.success_count)) 35 | 36 | def __enter__(self): 37 | return self 38 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Meta/BaseDataProcess.py: -------------------------------------------------------------------------------- 1 | 2 | class BaseDataProcess(object): 3 | @staticmethod 4 | def expand_dict(origin_item, max_expand=0, current_expand=0, parent_key=None, parent_item=None): 5 | if max_expand == 0: 6 | return origin_item 7 | if max_expand != -1 and current_expand >= max_expand: 8 | return origin_item 9 | if parent_key: 10 | if isinstance(origin_item, dict): 11 | for sub_k, sub_v in origin_item.items(): 12 | parent_item[parent_key + "_" + sub_k] = sub_v 13 | if parent_key in parent_item: 14 | del parent_item[parent_key] 15 | elif isinstance(origin_item, list): 16 | for item in origin_item: 17 | BaseDataProcess.expand_dict(item, max_expand, current_expand + 1, parent_key, parent_item) 18 | return origin_item 19 | 20 | keys = [k for k in origin_item.keys()] 21 | has_sub_dict = False 22 | for k in keys: 23 | if isinstance(origin_item[k], dict): 24 | has_sub_dict = True 25 | sub_dict = origin_item[k] 26 | for sub_k, sub_v in sub_dict.items(): 27 | origin_item[k + "_" + sub_k] = sub_v 28 | del origin_item[k] 29 | elif isinstance(origin_item[k], list): 30 | for 
item in origin_item[k]: 31 | BaseDataProcess.expand_dict(item, max_expand, current_expand + 1, k, origin_item) 32 | 33 | if has_sub_dict: 34 | return BaseDataProcess.expand_dict(origin_item, max_expand, current_expand + 1) 35 | else: 36 | return origin_item 37 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConnectorConfig.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | import inspect 4 | from .MainConfig import main_config 5 | 6 | 7 | class _SessionManger(object): 8 | def __init__(self, concurrency_limit=None, loop=None): 9 | concurrency_limit = main_config()["main"].getint("concurrency") if concurrency_limit is None else concurrency_limit 10 | self.session = self._generate_session(concurrency_limit=concurrency_limit, loop=loop) 11 | 12 | @staticmethod 13 | def _generate_connector(limit=None, loop=None): 14 | """ 15 | https://github.com/KeepSafe/aiohttp/issues/883 16 | if connector is passed to session, it is not available anymore 17 | """ 18 | limit = main_config()["main"].getint("concurrency") if limit is None else limit 19 | if not loop: 20 | loop = asyncio.get_event_loop() 21 | return aiohttp.TCPConnector(limit=limit, loop=loop) 22 | 23 | @staticmethod 24 | def _generate_session(concurrency_limit=None, loop=None): 25 | if not loop: 26 | loop = asyncio.get_event_loop() 27 | concurrency_limit = main_config()["main"].getint("concurrency") if concurrency_limit is None else concurrency_limit 28 | return aiohttp.ClientSession(connector=_SessionManger._generate_connector(limit=concurrency_limit, loop=loop), 29 | loop=loop) 30 | 31 | def get_session(self): 32 | return self.session 33 | 34 | def __del__(self): 35 | try: 36 | if inspect.iscoroutinefunction(self.session.close): 37 | loop = asyncio.get_event_loop() 38 | loop.run_until_complete(self.session.close()) 39 | else: 40 | self.session.close() 41 | except Exception as e: 42 | pass 43 | 44 | 45 | session_manger = _SessionManger() 46 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/PersistentUtil/PersistentWriter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import hashlib 5 | import logging 6 | 7 | class PersistentWriter(object): 8 | def __init__(self, persistent_key): 9 | self.f_name = persistent_key + ".json" 10 | self.latest_record = set() 11 | self.load_last_record() 12 | self.f_out = open(self.f_name, "a+", encoding="utf8") 13 | self.prev_latest_record_num = len(self.latest_record) 14 | 15 | def load_last_record(self): 16 | if os.path.exists(self.f_name): 17 | try: 18 | with open(self.f_name, "r", encoding="utf8") as f: 19 | self.latest_record = set(json.loads(f.read())["record"]) 20 | except Exception: 21 | logging.error("Broken record file: %s, recreating file" % (self.f_name, )) 22 | self.remove_file() 23 | 24 | def write(self): 25 | if len(self.latest_record) == self.prev_latest_record_num: 26 | return 27 | else: 28 | self.prev_latest_record_num = len(self.latest_record) 29 | 30 | self.truncate() 31 | self.f_out.seek(0) 32 | ts = int(time.time()) 33 | struct_time = time.localtime(ts) 34 | dt = time.strftime('%Y-%m-%d %H:%M:%S', struct_time) 35 | record = { 36 | "record": list(self.latest_record), 37 | "record_length": len(self.latest_record), 38 | "timestamp": ts, 39 | "date": dt, 40 | "filename": self.f_name 41 | } 42 | 
self.f_out.write(json.dumps(record)) 43 | logging.info("persistent to disk, f_name: %s, total_task_num: %d" % (self.f_name, len(self.latest_record))) 44 | 45 | def add(self, key): 46 | key = hashlib.md5(key.encode("utf8")).hexdigest() 47 | self.latest_record.add(key) 48 | 49 | def __contains__(self, item): 50 | key = hashlib.md5(item.encode("utf8")).hexdigest() 51 | return key in self.latest_record 52 | 53 | def sync(self): 54 | self.f_out.flush() 55 | 56 | def remove_file(self): 57 | os.unlink(self.f_name) 58 | 59 | def truncate(self): 60 | self.f_out.truncate(0) 61 | 62 | def clear(self, start_fresh_if_done): 63 | self.latest_record = None 64 | if start_fresh_if_done: 65 | self.remove_file() 66 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/LogConfig.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from logging.handlers import RotatingFileHandler 4 | 5 | format_str = "%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s" 6 | date_formatter_str = '[%Y-%m-%d %H:%M:%S]' 7 | formatter = logging.Formatter(format_str, datefmt=date_formatter_str) 8 | 9 | 10 | class SingleLevelFilter(logging.Filter): 11 | def __init__(self, passlevel, reject): 12 | super(SingleLevelFilter, self).__init__() 13 | self.passlevel = passlevel 14 | self.reject = reject 15 | 16 | def filter(self, record): 17 | if self.reject: 18 | return record.levelno != self.passlevel 19 | else: 20 | return record.levelno == self.passlevel 21 | 22 | 23 | def init_log(log_dir, max_log_file_bytes, ini_path, manual=False): 24 | root_logger = logging.getLogger() 25 | root_logger.setLevel(logging.INFO) 26 | # console 27 | console = logging.StreamHandler() 28 | console.setFormatter(formatter) 29 | root_logger.addHandler(console) 30 | if log_dir: 31 | if not os.path.exists(log_dir): 32 | logging.error("log_dir(%s)%s not exists, I will not log to file" % (log_dir, "" if manual else " in configure file(%s)" % (ini_path, ))) 33 | return False 34 | if not max_log_file_bytes: 35 | logging.error("log_byte not set, please %s, or I will not log to file" % ("pass log_byte as parameters" if manual else "configure log_byte in configure file(%s)" % (ini_path, ))) 36 | return False 37 | # info 38 | h1 = RotatingFileHandler("%s/info.log" % (log_dir, ), mode="a", maxBytes=max_log_file_bytes, 39 | encoding="utf8", backupCount=1) 40 | h1.setFormatter(formatter) 41 | f1 = SingleLevelFilter(logging.INFO, False) 42 | h1.addFilter(f1) 43 | root_logger.addHandler(h1) 44 | 45 | # error 46 | h1 = RotatingFileHandler("%s/error.log" % (log_dir, ), mode="a", maxBytes=max_log_file_bytes, 47 | encoding="utf8", backupCount=1) 48 | h1.setFormatter(formatter) 49 | f1 = SingleLevelFilter(logging.ERROR, False) 50 | h1.addFilter(f1) 51 | root_logger.addHandler(h1) 52 | # logging.info("log dir set to: %s" % (log_dir, )) 53 | return True 54 | return False 55 | 56 | 57 | def remove_log(): 58 | root_logger = logging.getLogger() 59 | root_logger.handlers.clear() 60 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/RedisWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | import random 4 | import traceback 5 | import json 6 | import zlib 7 | from .BaseWriter import BaseWriter 8 | 9 | 10 | class RedisWriter(BaseWriter): 11 | def __init__(self, config): 12 | 
super().__init__() 13 | self.config = config 14 | self.total_miss_count = 0 15 | self.success_count = 0 16 | 17 | def encode(self, dict_object): 18 | string = json.dumps(dict_object) 19 | if self.config.compress: 20 | string = zlib.compress(string.encode(self.config.encoding)) 21 | return string 22 | 23 | async def write(self, responses): 24 | await self.config.get_redis_pool_cli() # init redis pool 25 | miss_count = 0 26 | target_responses = list() 27 | for each_response in responses: 28 | if self.config.filter: 29 | each_response = self.config.filter(each_response) 30 | if not each_response: 31 | miss_count += 1 32 | continue 33 | target_responses.append(each_response) 34 | self.success_count += 1 35 | self.total_miss_count += miss_count 36 | if target_responses: 37 | try_time = 0 38 | while try_time < self.config.max_retry: 39 | try: 40 | if self.config.is_range: 41 | await self.config.redis_write_method(self.config.key, *(self.encode(i) for i in target_responses)) 42 | else: 43 | pipe_line = self.config.redis_pool_cli.pipeline() 44 | for each in responses: 45 | pipe_line.hset(self.config.key, each["id"], self.encode(each)) 46 | await pipe_line.execute() 47 | 48 | logging.info("%s write %d item, filtered %d item" % (self.config.name, len(responses), miss_count)) 49 | break 50 | except Exception as e: 51 | try_time += 1 52 | if try_time >= self.config.max_retry: 53 | logging.error("Fail to write after try: %d times, Write 0 items to redis, " 54 | "filtered %d item before write, error: %s" % 55 | (self.config.max_retry, miss_count, str(traceback.format_exc()))) 56 | else: 57 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 58 | else: 59 | logging.info("Write 0 items to %s, filtered: %d, (all filtered, or pass empty result)" % (self.config.name, miss_count)) 60 | 61 | def __exit__(self, exc_type, exc_val, exc_tb): 62 | logging.info("%s write done, total filtered %d item, total write %d item" % 63 | (self.config.name, self.total_miss_count, self.success_count)) 64 | 65 | def __enter__(self): 66 | return self 67 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/KafkaWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .BaseWriter import BaseWriter 3 | 4 | import asyncio 5 | import confluent_kafka 6 | from confluent_kafka import KafkaException 7 | from threading import Thread 8 | import json 9 | 10 | 11 | 12 | class AIOProducer: 13 | def __init__(self, configs, loop=None): 14 | self._loop = loop or asyncio.get_event_loop() 15 | self._producer = confluent_kafka.Producer(configs) 16 | self._cancelled = False 17 | self._poll_thread = Thread(target=self._poll_loop) 18 | self._poll_thread.start() 19 | 20 | def _poll_loop(self): 21 | while not self._cancelled: 22 | self._producer.poll(0.1) 23 | 24 | def close(self): 25 | self._cancelled = True 26 | self._poll_thread.join() 27 | 28 | def produce(self, topic, value): 29 | """ 30 | An awaitable produce method. 
31 | """ 32 | result = self._loop.create_future() 33 | 34 | def ack(err, msg): 35 | if err: 36 | self._loop.call_soon_threadsafe(result.set_exception, KafkaException(err)) 37 | else: 38 | self._loop.call_soon_threadsafe(result.set_result, msg) 39 | self._producer.produce(topic, value, on_delivery=ack) 40 | return result 41 | 42 | def produce2(self, topic, value, on_delivery): 43 | """ 44 | A produce method in which delivery notifications are made available 45 | via both the returned future and on_delivery callback (if specified). 46 | """ 47 | result = self._loop.create_future() 48 | 49 | def ack(err, msg): 50 | if err: 51 | self._loop.call_soon_threadsafe( 52 | result.set_exception, KafkaException(err)) 53 | else: 54 | self._loop.call_soon_threadsafe( 55 | result.set_result, msg) 56 | if on_delivery: 57 | self._loop.call_soon_threadsafe( 58 | on_delivery, err, msg) 59 | self._producer.produce(topic, value, on_delivery=ack) 60 | return result 61 | 62 | 63 | class KafkaWriter(BaseWriter): 64 | def __init__(self, config, topic, loop=None): 65 | super().__init__() 66 | self.topic = topic 67 | self.total_miss_count = 0 68 | self.success_count = 0 69 | self.producer = AIOProducer(configs={"bootstrap.servers": config.bootstrap_servers}, loop=loop) 70 | 71 | async def write(self, responses): 72 | for each_response in responses: 73 | if isinstance(each_response, dict): 74 | each_response = json.dumps(each_response, indent=2).encode('utf-8') 75 | await self.producer.produce(self.topic, each_response) 76 | self.success_count += 1 77 | logging.info("%s write %d item" % (self.topic, len(responses))) 78 | 79 | def __exit__(self, exc_type, exc_val, exc_tb): 80 | self.producer.close() 81 | logging.info("%s write done, total write %d item" % 82 | (self.topic, self.success_count)) 83 | 84 | def __enter__(self): 85 | return self 86 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/CSVGetter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import logging 4 | from .BaseGetter import BaseGetter 5 | 6 | if sys.platform == "linux": 7 | csv.field_size_limit(sys.maxsize) 8 | 9 | 10 | class CSVGetter(BaseGetter): 11 | def __init__(self, config): 12 | super().__init__() 13 | self.config = config 14 | self.f_in = open(self.config.filename, self.config.mode, encoding=self.config.encoding) 15 | self.reader = csv.DictReader(self.f_in) 16 | 17 | self.done = False 18 | self.responses = list() 19 | self.miss_count = 0 20 | self.total_count = 0 21 | 22 | def init_val(self): 23 | self.done = False 24 | self.responses = list() 25 | self.f_in.seek(0, 0) 26 | self.miss_count = 0 27 | self.total_count = 0 28 | 29 | def __aiter__(self): 30 | return self 31 | 32 | async def __anext__(self): 33 | if self.done: 34 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 35 | (self.config.filename, self.total_count, self.miss_count)) 36 | self.init_val() 37 | raise StopAsyncIteration 38 | 39 | for row in self.reader: 40 | if self.config.max_limit and self.total_count > self.config.max_limit: 41 | self.done = True 42 | return self.clear_and_return() 43 | 44 | self.total_count += 1 45 | if self.config.filter: 46 | row = self.config.filter(row) 47 | if not row: 48 | self.miss_count += 1 49 | continue 50 | 51 | self.responses.append(row) 52 | if len(self.responses) > self.config.per_limit: 53 | return self.clear_and_return() 54 | 55 | if self.responses: 56 | 
self.done = True 57 | return self.clear_and_return() 58 | 59 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 60 | (self.config.filename, self.total_count, self.miss_count)) 61 | self.init_val() 62 | raise StopAsyncIteration 63 | 64 | def __iter__(self): 65 | for row in self.reader: 66 | if self.config.max_limit and self.total_count > self.config.max_limit: 67 | self.done = True 68 | yield self.clear_and_return() 69 | break 70 | 71 | self.total_count += 1 72 | if self.config.filter: 73 | row = self.config.filter(row) 74 | if not row: 75 | self.miss_count += 1 76 | continue 77 | 78 | self.responses.append(row) 79 | if len(self.responses) > self.config.per_limit: 80 | yield self.clear_and_return() 81 | 82 | if self.responses: 83 | yield self.responses 84 | 85 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 86 | (self.config.filename, self.total_count, self.miss_count)) 87 | self.init_val() 88 | 89 | def clear_and_return(self): 90 | resp = self.responses 91 | self.responses = list() 92 | return resp 93 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/ProcessFactory.py: -------------------------------------------------------------------------------- 1 | # config 2 | from .Config.MainConfig import main_config 3 | 4 | from .Config.ConfigUtil import GetterConfig 5 | from .Config.ConfigUtil import WriterConfig 6 | 7 | from .DataGetter.ESGetter import ESScrollGetter 8 | from .DataGetter.CSVGetter import CSVGetter 9 | from .DataGetter.APIGetter import APIGetter, APIBulkGetter 10 | from .DataGetter.JsonGetter import JsonGetter 11 | from .DataGetter.XLSXGetter import XLSXGetter 12 | from .DataGetter.RedisGetter import RedisGetter 13 | from .DataGetter.MySQLGetter import MySQLGetter 14 | from .DataGetter.MongoGetter import MongoGetter 15 | 16 | from .DataWriter.CSVWriter import CSVWriter 17 | from .DataWriter.ESWriter import ESWriter 18 | from .DataWriter.JsonWriter import JsonWriter 19 | from .DataWriter.TXTWriter import TXTWriter 20 | from .DataWriter.XLSXWriter import XLSXWriter 21 | from .DataWriter.RedisWriter import RedisWriter 22 | from .DataWriter.MySQLWriter import MySQLWriter 23 | from .DataWriter.MongoWriter import MongoWriter 24 | from .DataWriter.KafkaWriter import KafkaWriter 25 | 26 | 27 | class ProcessFactory(object): 28 | config_getter_map = { 29 | GetterConfig.RAPIConfig: APIGetter, 30 | GetterConfig.RCSVConfig: CSVGetter, 31 | GetterConfig.RESConfig: ESScrollGetter, 32 | GetterConfig.RJsonConfig: JsonGetter, 33 | GetterConfig.RXLSXConfig: XLSXGetter, 34 | GetterConfig.RAPIBulkConfig: APIBulkGetter, 35 | GetterConfig.RRedisConfig: RedisGetter, 36 | GetterConfig.RMySQLConfig: MySQLGetter, 37 | GetterConfig.RMongoConfig: MongoGetter 38 | } 39 | 40 | config_writer_map = { 41 | WriterConfig.WCSVConfig: CSVWriter, 42 | WriterConfig.WESConfig: ESWriter, 43 | WriterConfig.WJsonConfig: JsonWriter, 44 | WriterConfig.WTXTConfig: TXTWriter, 45 | WriterConfig.WXLSXConfig: XLSXWriter, 46 | WriterConfig.WRedisConfig: RedisWriter, 47 | WriterConfig.WMySQLConfig: MySQLWriter, 48 | WriterConfig.WMongoConfig: MongoWriter, 49 | WriterConfig.WKafkaConfig: KafkaWriter 50 | } 51 | 52 | @staticmethod 53 | def create_getter(config): 54 | """ 55 | create a getter based on config 56 | :return: getter 57 | """ 58 | for config_class, getter_class in ProcessFactory.config_getter_map.items(): 59 | if isinstance(config, config_class): 60 | return getter_class(config) 61 | raise 
ValueError("create_getter must pass one of the instance of [RAPIConfig, RCSVConfig, RESConfig, " 62 | "RJsonConfig, RXLSXConfig, RAPIBulkConfig, RRedisConfig, RMySQLConfig, RMongoConfig]") 63 | 64 | @staticmethod 65 | def create_writer(config, **kwargs): 66 | """ 67 | create a writer based on config 68 | :return: a writer 69 | """ 70 | for config_class, writer_class in ProcessFactory.config_writer_map.items(): 71 | if isinstance(config, config_class): 72 | return writer_class(config, **kwargs) 73 | else: 74 | raise ValueError("create_writer must pass one of the instance of [WCSVConfig, WESConfig, WJsonConfig, " 75 | "WTXTConfig, WXLSXConfig, WRedisConfig, WMySQLConfig, WMongoConfig]") 76 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/DefaultValue.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | from .MainConfig import main_config 4 | 5 | 6 | class DefaultValObject(object): 7 | def __init__(self): 8 | self.refresh() 9 | 10 | def refresh(self): 11 | self.main_config = main_config() 12 | self.per_limit = self.main_config["main"].getint("per_limit") 13 | self.max_limit = self.main_config["main"].get("max_limit") 14 | if self.max_limit != "None": 15 | self.max_limit = int(self.max_limit) 16 | else: 17 | self.max_limit = None 18 | self.max_retry = self.main_config["main"].getint("max_retry") 19 | self.random_min_sleep = self.main_config["main"].getint("random_min_sleep") 20 | self.random_max_sleep = self.main_config["main"].getint("random_max_sleep") 21 | 22 | # redis 23 | self.redis_host = self.main_config["redis"].get("host") 24 | self.redis_port = self.main_config["redis"].getint("port") 25 | self.redis_db = self.main_config["redis"].get("db") 26 | self.redis_password = self.main_config["redis"].get("password") 27 | self.redis_timeout = self.main_config["redis"].getint("timeout") 28 | self.redis_encoding = self.main_config["redis"].get("encoding") 29 | self.redis_direction = self.main_config["redis"].get("direction") 30 | self.redis_compress = self.main_config["redis"].getboolean("compress") 31 | self.redis_need_del = self.main_config["redis"].getboolean("need_del") 32 | 33 | # mysql config 34 | self.mysql_host = self.main_config["mysql"].get("host") 35 | self.mysql_port = self.main_config["mysql"].getint("port") 36 | self.mysql_user = self.main_config["mysql"].get("user") 37 | self.mysql_password = self.main_config["mysql"].get("password") 38 | self.mysql_database = self.main_config["mysql"].get("database") 39 | self.mysql_encoding = self.main_config["mysql"].get("encoding") 40 | if not self.mysql_encoding: 41 | self.mysql_encoding = self.default_encoding 42 | 43 | # mongo config 44 | self.mongo_host = self.main_config["mongo"].get("host") 45 | self.mongo_port = self.main_config["mongo"].getint("port") 46 | self.mongo_username = self.main_config["mongo"].get("username") 47 | self.mongo_password = self.main_config["mongo"].get("password") 48 | self.mongo_database = self.main_config["mongo"].get("database") 49 | self.mongo_protocol = self.main_config["mongo"].get("protocol") 50 | self.mongo_other_params = self.main_config["mongo"].get("other_params") 51 | 52 | # kafka config 53 | self.kafka_bootstrap_servers = self.main_config["kafka"].get("bootstrap.servers") 54 | 55 | default_file_mode_r = "r" 56 | default_file_mode_w = "w" 57 | default_encoding = "utf8" 58 | new_line = "\n" 59 | join_val = " " 60 | title = "example" 61 | qsn = None 62 | 63 | 
query_body = None 64 | dest_without_path = "result" 65 | dest = os.getcwd() + "/" + dest_without_path 66 | interval = 5 67 | concurrency = 50 68 | default_key_type = "LIST" 69 | default_quote_char = '"' 70 | report_interval = 10 71 | success_ret_code = ("100002", "100301", "100103") 72 | trim_to_max_limit = False 73 | exclude_filtered_to_max_limit = True 74 | 75 | @staticmethod 76 | def default_id_hash_func(item): 77 | if "appCode" in item and item["appCode"] and "id" in item and item["id"]: 78 | value = (item["appCode"] + "_" + item["id"]).encode("utf8") 79 | else: 80 | value = str(item).encode("utf8") 81 | return hashlib.md5(value).hexdigest() 82 | 83 | 84 | DefaultVal = DefaultValObject() 85 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/ESWriter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | import random 4 | from .BaseWriter import BaseWriter 5 | from ..Config.MainConfig import main_config 6 | 7 | 8 | class ESWriter(BaseWriter): 9 | def __init__(self, config): 10 | if not main_config.has_es_configured: 11 | raise ValueError("You must config es_hosts before using ESWriter, Please edit configure file: %s" % (main_config.ini_path, )) 12 | 13 | super().__init__() 14 | self.config = config 15 | self.total_miss_count = 0 16 | self.success_count = 0 17 | self.fail_count = 0 18 | 19 | async def write(self, responses): 20 | response = None # something to return 21 | origin_length = len(responses) 22 | if self.config.filter: 23 | responses = [self.config.filter(i) for i in responses] 24 | responses = [i for i in responses if i] 25 | miss_count = origin_length - len(responses) 26 | self.total_miss_count += miss_count 27 | if responses: 28 | if self.config.expand: 29 | responses = [self.expand_dict(i) for i in responses] 30 | try_time = 0 31 | while try_time < self.config.max_retry: 32 | success, fail, response = await self.config.es_client.add_dict_to_es( 33 | self.config.indices, self.config.doc_type, responses, 34 | self.config.id_hash_func, self.config.app_code, 35 | self.config.actions, self.config.create_date, 36 | self.config.error_if_fail, self.config.timeout, self.config.auto_insert_createDate) 37 | if response is not None: 38 | self.success_count += success 39 | self.fail_count += fail 40 | logging.info("Write %d items to index: %s, doc_type: %s, fail: %d, filtered: %d" % ( 41 | len(responses), self.config.indices, self.config.doc_type, fail, miss_count)) 42 | break 43 | else: 44 | # exception happened 45 | try_time += 1 46 | if try_time >= self.config.max_retry: 47 | logging.error("Fail to write after try: %d times, Write 0 items to index: %s, doc_type: %s" % 48 | (self.config.max_retry, self.config.indices, self.config.doc_type)) 49 | else: 50 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 51 | else: 52 | # all filtered, or pass empty result 53 | logging.info("Write 0 items to index: %s, doc_type: %s (all filtered, or pass empty result)" % (self.config.indices, self.config.doc_type)) 54 | return response 55 | 56 | async def delete_all(self, body=None): 57 | """ 58 | inefficient delete 59 | """ 60 | if not body: 61 | body = { 62 | "query": { 63 | "match_all": {} 64 | } 65 | } 66 | result = await self.config.es_client.delete_by_query(index=self.config.indices, doc_type=self.config.doc_type, 67 | body=body, params={"conflicts": "proceed"}) 68 | return result 69 | 70 | def 
__enter__(self): 71 | return self 72 | 73 | def __exit__(self, exc_type, exc_val, exc_tb): 74 | logging.info("%s->%s write done, total filtered %d item, total write %d item, total fail: %d item" % 75 | (self.config.indices, self.config.doc_type, self.total_miss_count, self.success_count, 76 | self.fail_count)) 77 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/MongoWriter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import random 4 | import logging 5 | import traceback 6 | from .BaseWriter import BaseWriter 7 | 8 | InsertOne = DeleteMany = ReplaceOne = UpdateOne = None 9 | try: 10 | from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne 11 | except Exception: 12 | pass 13 | 14 | 15 | class MongoWriter(BaseWriter): 16 | def __init__(self, config): 17 | super().__init__() 18 | self.config = config 19 | self.total_miss_count = 0 20 | self.success_count = 0 21 | self.table_checked = False 22 | self.key_fields = list() 23 | 24 | async def write(self, responses): 25 | self.config.get_mongo_cli() # init mongodb pool 26 | 27 | miss_count = 0 28 | original_length = len(responses) 29 | if self.config.filter: 30 | target_responses = list() 31 | for i in responses: 32 | i = self.config.filter(i) 33 | if i: 34 | target_responses.append(i) 35 | else: 36 | miss_count += 1 37 | responses = target_responses 38 | 39 | if not responses: 40 | self.finish_once(miss_count, original_length) 41 | return 42 | 43 | # After filtered, still have responses to write 44 | if await self.perform_write(responses): 45 | self.finish_once(miss_count, original_length) 46 | 47 | def __exit__(self, exc_type, exc_val, exc_tb): 48 | logging.info("%s write done, total filtered %d item, total write %d item" % 49 | (self.config.name, self.total_miss_count, self.success_count)) 50 | 51 | def __enter__(self): 52 | return self 53 | 54 | def finish_once(self, miss_count, original_length): 55 | self.total_miss_count += miss_count 56 | self.success_count += original_length 57 | logging.info("%s write %d item, filtered %d item" % (self.config.name, original_length - miss_count, miss_count)) 58 | 59 | async def perform_write(self, responses): 60 | try_time = 0 61 | for each in responses: 62 | if self.config.auto_insert_createDate and self.config.createDate is not None: 63 | each["createDate"] = self.config.createDate 64 | if "_id" not in each: 65 | each["_id"] = self.config.id_hash_func(each) 66 | 67 | while try_time < self.config.max_retry: 68 | try: 69 | if UpdateOne is not None: 70 | await self.config.collection_cli.bulk_write([UpdateOne({'_id': each["_id"]}, {"$set": each}, upsert=True) for each in responses]) 71 | else: 72 | bulk = self.config.collection_cli.initialize_ordered_bulk_op() 73 | for each in responses: 74 | bulk.find({"_id": each["_id"]}).upsert().replace_one(each) 75 | await bulk.execute() 76 | return True 77 | except Exception as e: 78 | try_time += 1 79 | if try_time < self.config.max_retry: 80 | logging.error("retry: %d, %s" % (try_time, str(e))) 81 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 82 | else: 83 | logging.error("Give up MongoWriter writer: %s, After retry: %d times, still fail to write, " 84 | "total write %d items, total filtered: %d items, reason: %s" % 85 | (self.config.name, self.config.max_retry, self.success_count, self.total_miss_count, 86 | str(traceback.format_exc()))) 87 | return 
False 88 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/JsonGetter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from .BaseGetter import BaseGetter 4 | 5 | 6 | class JsonGetter(BaseGetter): 7 | def __init__(self, config): 8 | super().__init__(self) 9 | self.config = config 10 | self.responses = list() 11 | self.done = False 12 | self.f_in = open(self.config.filename, self.config.mode, encoding=self.config.encoding) 13 | self.miss_count = 0 14 | self.total_count = 0 15 | 16 | def init_val(self): 17 | self.responses = list() 18 | self.done = False 19 | self.f_in.seek(0, 0) 20 | self.miss_count = 0 21 | self.total_count = 0 22 | 23 | def __aiter__(self): 24 | return self 25 | 26 | async def __anext__(self): 27 | if self.done: 28 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 29 | (self.config.filename, self.total_count, self.miss_count)) 30 | self.init_val() 31 | raise StopAsyncIteration 32 | 33 | for line in self.f_in: 34 | if self.config.max_limit and self.total_count > self.config.max_limit: 35 | self.done = True 36 | return self.clear_and_return() 37 | 38 | self.total_count += 1 39 | try: 40 | json_obj = json.loads(line) 41 | except json.decoder.JSONDecodeError: 42 | logging.error("JSONDecodeError. give up. line: %d" % (self.total_count, )) 43 | continue 44 | 45 | if self.config.filter: 46 | json_obj = self.config.filter(json_obj) 47 | if not json_obj: 48 | self.miss_count += 1 49 | continue 50 | 51 | self.responses.append(json_obj) 52 | 53 | if len(self.responses) > self.config.per_limit: 54 | return self.clear_and_return() 55 | 56 | self.done = True 57 | if self.responses: 58 | return self.clear_and_return() 59 | 60 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 61 | (self.config.filename, self.total_count, self.miss_count)) 62 | self.init_val() 63 | raise StopAsyncIteration 64 | 65 | def __iter__(self): 66 | for line in self.f_in: 67 | if self.config.max_limit and self.total_count > self.config.max_limit: 68 | self.done = True 69 | yield self.clear_and_return() 70 | break 71 | 72 | self.total_count += 1 73 | try: 74 | json_obj = json.loads(line) 75 | except json.decoder.JSONDecodeError: 76 | logging.error("JSONDecodeError. give up. 
line: %d" % (self.total_count, )) 77 | continue 78 | 79 | if self.config.filter: 80 | json_obj = self.config.filter(json_obj) 81 | if not json_obj: 82 | self.miss_count += 1 83 | continue 84 | 85 | self.responses.append(json_obj) 86 | 87 | if len(self.responses) > self.config.per_limit: 88 | yield self.clear_and_return() 89 | 90 | if self.responses: 91 | yield self.clear_and_return() 92 | 93 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 94 | (self.config.filename, self.total_count, self.miss_count)) 95 | self.init_val() 96 | 97 | def __del__(self): 98 | self.f_in.close() 99 | 100 | def clear_and_return(self): 101 | resp = self.responses 102 | self.responses = list() 103 | return resp 104 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/CSVWriter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import types 4 | import logging 5 | from .BaseWriter import BaseWriter 6 | 7 | 8 | class CSVWriter(BaseWriter): 9 | def __init__(self, config): 10 | super().__init__() 11 | self.config = config 12 | self.file_already_exists = os.path.exists(self.config.filename) and os.path.getsize(self.config.filename) 13 | self.f_out = open(self.config.filename, self.config.mode, encoding=self.config.encoding, newline="") 14 | self.f_csv = None 15 | self.headers = dict() if not self.config.headers else self.config.headers 16 | self.total_miss_count = 0 17 | self.success_count = 0 18 | # logging.info("self.config.quotechar: %s, %s", self.config.quotechar, repr(self.config.quotechar)) 19 | 20 | def write(self, responses): 21 | miss_count = 0 22 | 23 | # filter 24 | if self.config.filter: 25 | new_result = list() 26 | for each_response in responses: 27 | each_response = self.config.filter(each_response) 28 | if not each_response: 29 | miss_count += 1 30 | continue 31 | new_result.append(each_response) 32 | responses = new_result 33 | self.total_miss_count += miss_count 34 | 35 | # all filtered 36 | if not responses: 37 | logging.info("%s write 0 item, filtered %d item" % (self.config.filename, miss_count)) 38 | return 39 | 40 | # expand 41 | if self.config.expand: 42 | responses = [self.expand_dict(i, max_expand=self.config.expand) for i in responses] 43 | else: 44 | responses = [i for i in responses] if isinstance(responses, types.GeneratorType) else responses 45 | 46 | # headers 47 | if not self.f_csv: 48 | if "a" in self.config.mode and self.file_already_exists: 49 | self.headers = self.generate_headers(responses, append_mode=True) 50 | self.f_csv = csv.DictWriter(self.f_out, self.headers, quotechar=self.config.quotechar) 51 | else: 52 | if not self.headers: 53 | self.headers = self.generate_headers(responses) 54 | self.f_csv = csv.DictWriter(self.f_out, self.headers, quotechar=self.config.quotechar) 55 | self.f_csv.writeheader() 56 | 57 | # encoding process 58 | for each_response in responses: 59 | for k, v in each_response.items(): 60 | if v is None: 61 | each_response[k] = "" 62 | 63 | elif self.config.qsn and v != "" and (isinstance(v, (int, float)) or isinstance(v, str) and all(i.isdigit() for i in v)): 64 | each_response[k] = repr(str(v)) 65 | 66 | elif self.config.encoding not in ("utf8", "utf-8"): 67 | each_response[k] = str(v).encode(self.config.encoding, "ignore").decode(self.config.encoding) 68 | 69 | self.success_count += 1 70 | self.f_csv.writerow(each_response) 71 | logging.info("%s write %d item, filtered %d item" % 
(self.config.filename, len(responses), miss_count)) 72 | 73 | def generate_headers(self, responses, append_mode=False): 74 | headers = set() 75 | for r in responses: 76 | for key in r.keys(): 77 | headers.add(key) 78 | 79 | if append_mode: 80 | f_in = open(self.config.filename, "r", encoding=self.config.encoding, newline="") 81 | reader = csv.DictReader(f_in) 82 | exists_fields = reader.fieldnames 83 | if set(exists_fields) != headers: 84 | raise ValueError("append mode for csv file: %s, but header field mismatch, exist fields: %s, generated fields: %s" % (self.config.filename, repr(exists_fields), repr(headers))) 85 | return exists_fields 86 | return list(headers) 87 | 88 | def __enter__(self): 89 | return self 90 | 91 | def __exit__(self, exc_type, exc_val, exc_tb): 92 | self.f_out.close() 93 | logging.info("%s write done, total filtered %d item, total write %d item" % 94 | (self.config.filename, self.total_miss_count, self.success_count)) 95 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/XLSXWriter.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import logging 4 | from openpyxl import Workbook, load_workbook 5 | from .BaseWriter import BaseWriter 6 | from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE 7 | from ..Config.DefaultValue import DefaultVal 8 | 9 | _warning = False 10 | 11 | 12 | class XLSXWriter(BaseWriter): 13 | def __init__(self, config): 14 | global _warning 15 | super().__init__() 16 | self.config = config 17 | self.col_dict = dict() 18 | self.row = 2 19 | # headers 20 | self.header_generated = False 21 | self.file_already_exists = os.path.exists(self.config.filename) 22 | if "a" in self.config.mode and self.file_already_exists: 23 | self.wb = load_workbook(filename=self.config.filename, read_only=False) 24 | self.generate_header(from_file=True) 25 | else: 26 | self.wb = Workbook() 27 | self.ws1 = self.wb.active 28 | self.ws1.title = config.title 29 | self.total_miss_count = 0 30 | self.success_count = 0 31 | if not _warning: 32 | logging.warning("XLSXWriter will actually write to file when __exit__ of XLSXWriter called") 33 | _warning = True 34 | 35 | def write(self, responses): 36 | if not self.header_generated and self.config.headers: 37 | self.generate_header() 38 | 39 | miss_count = 0 40 | for each_response in responses: 41 | if self.config.expand: 42 | each_response = self.expand_dict(each_response, max_expand=self.config.expand) 43 | if self.config.filter: 44 | each_response = self.config.filter(each_response) 45 | if not each_response: 46 | miss_count += 1 47 | continue 48 | 49 | for key, value in each_response.items(): 50 | if key not in self.col_dict: 51 | self.col_dict[key] = len(self.col_dict) + 1 52 | self.ws1.cell(row=1, column=self.col_dict[key], value=key) 53 | value = str(value) if value is not None else "" 54 | try: 55 | self.ws1.cell(row=self.row, column=self.col_dict[key], value=value) 56 | except Exception: 57 | new_value = re.sub(ILLEGAL_CHARACTERS_RE, "", value) 58 | logging.warning("row num: %d, key: %s, value: %s contains illegal characters, " 59 | "replaced illegal characters to: %s" % (self.row, key, value, new_value)) 60 | self.ws1.cell(row=self.row, column=self.col_dict[key], value=new_value) 61 | 62 | self.row += 1 63 | self.success_count += 1 64 | logging.info("%s write %d item, filtered %d item" % (self.config.filename, len(responses), miss_count)) 65 | 66 | def __exit__(self, exc_type, exc_val, 
exc_tb): 67 | self.wb.save(filename=self.config.filename) 68 | self.wb.close() 69 | logging.info("%s write done, total filtered %d item, total write %d item" % 70 | (self.config.filename, self.total_miss_count, self.success_count)) 71 | 72 | def __enter__(self): 73 | return self 74 | 75 | def generate_header(self, from_file=False): 76 | if from_file: 77 | if not self.wb.worksheets: 78 | # empty file 79 | return 80 | sheet = self.wb.worksheets[self.config.sheet_index] 81 | row_iter = sheet.rows 82 | try: 83 | row = next(row_iter) 84 | for each in row: 85 | self.col_dict[each.value] = len(self.col_dict) + 1 86 | except StopIteration: 87 | # empty file 88 | return 89 | if len(self.col_dict) == 1 and list(self.col_dict.keys())[0] is None: 90 | # empty file 91 | self.col_dict.clear() 92 | return 93 | max_row = sheet.max_row 94 | self.row = max_row + 1 95 | else: 96 | for key in self.config.headers: 97 | self.col_dict[key] = len(self.col_dict) + 1 98 | self.ws1.cell(row=1, column=self.col_dict[key], value=key) 99 | 100 | self.header_generated = True 101 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/XLSXGetter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from openpyxl import load_workbook 3 | from .BaseGetter import BaseGetter 4 | 5 | 6 | class XLSXGetter(BaseGetter): 7 | def __init__(self, config): 8 | super().__init__() 9 | self.config = config 10 | self.wb = load_workbook(filename=self.config.filename, read_only=True) 11 | if not self.wb.worksheets: 12 | raise ValueError("Empty file: %s" % (self.config.filename, )) 13 | self.sheet = self.wb.worksheets[self.config.sheet_index] 14 | self.row_iter = self.sheet.rows 15 | self.headers = self.generate_headers() 16 | 17 | self.max_row = self.sheet.max_row 18 | if self.config.max_limit and self.config.max_limit > self.max_row: 19 | self.max_row = self.config.max_limit + 1 # add first headers 20 | 21 | self.row_num = 0 22 | self.responses = list() 23 | self.curr_size = 0 24 | self.done = False 25 | self.miss_count = 0 26 | self.total_count = 0 27 | 28 | def init_val(self): 29 | self.row_num = 0 30 | self.responses = list() 31 | self.curr_size = 0 32 | self.done = False 33 | self.miss_count = 0 34 | self.total_count = 0 35 | 36 | self.row_iter = self.sheet.rows 37 | 38 | def __aiter__(self): 39 | return self 40 | 41 | async def __anext__(self): 42 | if self.done: 43 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 44 | (self.config.filename, self.total_count, self.miss_count)) 45 | self.init_val() 46 | raise StopAsyncIteration 47 | 48 | while self.row_num < self.max_row: 49 | if self.row_num == 0: 50 | self.row_num += 1 51 | continue 52 | 53 | self.row_num += 1 54 | self.total_count += 1 55 | row = self.get_next_row() 56 | if self.config.filter: 57 | row = self.config.filter(row) 58 | if not row: 59 | self.miss_count += 1 60 | continue 61 | self.responses.append(row) 62 | if len(self.responses) > self.config.per_limit: 63 | self.curr_size += len(self.responses) 64 | return self.clear_and_return() 65 | 66 | if self.responses: 67 | self.done = True 68 | return self.clear_and_return() 69 | 70 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 71 | (self.config.filename, self.total_count, self.miss_count)) 72 | self.init_val() 73 | raise StopAsyncIteration 74 | 75 | def generate_headers(self): 76 | keys = list() 77 | try: 78 | row = 
next(self.row_iter) 79 | for each in row: 80 | keys.append(each.value) 81 | except StopIteration: 82 | pass 83 | return keys 84 | 85 | def get_next_row(self): 86 | ret_item = dict() 87 | r = next(self.row_iter) 88 | for key, cell in zip(self.headers, r): 89 | ret_item[key] = cell.value 90 | return ret_item 91 | 92 | def __iter__(self): 93 | for row_num in range(self.max_row): 94 | if row_num == 0: 95 | continue 96 | 97 | row_num += 1 98 | self.total_count += 1 99 | row = self.get_next_row() 100 | if self.config.filter: 101 | row = self.config.filter(row) 102 | if not row: 103 | self.miss_count += 1 104 | continue 105 | self.responses.append(row) 106 | if len(self.responses) > self.config.per_limit: 107 | self.curr_size += len(self.responses) 108 | yield self.clear_and_return() 109 | 110 | if self.responses: 111 | yield self.responses 112 | 113 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 114 | (self.config.filename, self.total_count, self.miss_count)) 115 | self.init_val() 116 | 117 | def __del__(self): 118 | self.wb.close() 119 | 120 | def clear_and_return(self): 121 | resp = self.responses 122 | self.responses = list() 123 | return resp 124 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/MongoGetter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import traceback 3 | import random 4 | import logging 5 | from .BaseGetter import BaseGetter 6 | 7 | 8 | class MongoGetter(BaseGetter): 9 | def __init__(self, config): 10 | super().__init__(self) 11 | self.config = config 12 | self.responses = list() 13 | self.miss_count = 0 14 | self.total_count = 0 15 | self.total_size = None 16 | self.need_finish = False 17 | 18 | def init_val(self): 19 | self.responses = list() 20 | self.miss_count = 0 21 | self.total_count = 0 22 | self.total_size = None 23 | self.need_finish = False 24 | 25 | def __aiter__(self): 26 | return self 27 | 28 | async def __anext__(self): 29 | self.config.get_mongo_cli() # init mongo pool 30 | 31 | if self.need_finish: 32 | await self.finish() 33 | 34 | if self.total_size is None: 35 | self.total_size = await self.get_total_size() 36 | 37 | if self.total_count < self.total_size: 38 | await self.fetch_per_limit() 39 | return self.clear_and_return() 40 | 41 | # reach here, means done 42 | await self.finish() 43 | 44 | def __iter__(self): 45 | raise ValueError("MongoGetter must be used with async generator, not normal generator") 46 | 47 | async def finish(self): 48 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 49 | (self.config.name, self.total_count, self.miss_count)) 50 | self.init_val() 51 | raise StopAsyncIteration 52 | 53 | async def get_total_size(self): 54 | if hasattr(self.config.cursor, "count"): 55 | size = await self.config.cursor.count() 56 | else: 57 | size = await self.config.client[self.config.database][self.config.collection].count_documents({} if not self.config.query_body else self.config.query_body) 58 | size = min(size, self.config.max_limit if self.config.max_limit is not None else size) 59 | if size == 0: 60 | await self.finish() 61 | return size 62 | 63 | async def fetch_per_limit(self): 64 | curr_size = 0 65 | try_time = 0 66 | get_all = True 67 | 68 | while try_time < self.config.max_retry: 69 | try: 70 | async for document in self.config.cursor: 71 | curr_size += 1 72 | self.responses.append(document) 73 | if curr_size >= 
self.config.per_limit: 74 | get_all = False 75 | break 76 | if get_all: 77 | # get all item 78 | if self.total_count + curr_size < self.total_size: 79 | logging.error("get all items: %d, but not reach 'total_size': %d" % (self.total_count + curr_size, self.total_size)) 80 | self.need_finish = True 81 | break 82 | except Exception as e: 83 | try_time += 1 84 | if try_time < self.config.max_retry: 85 | logging.error("retry: %d, %s" % (try_time, str(e))) 86 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 87 | else: 88 | logging.error("Give up MongoGetter getter: %s, After retry: %d times, still fail, " 89 | "total get %d items, total filtered: %d items, reason: %s" % 90 | (self.config.name, self.config.max_retry, self.total_count, self.miss_count, 91 | str(traceback.format_exc()))) 92 | self.need_finish = True 93 | 94 | self.total_count += len(self.responses) 95 | 96 | curr_miss_count = 0 97 | if self.config.filter: 98 | target_results = list() 99 | for each in self.responses: 100 | each = self.config.filter(each) 101 | if each: 102 | target_results.append(each) 103 | else: 104 | curr_miss_count += 1 105 | self.responses = target_results 106 | self.miss_count += curr_miss_count 107 | 108 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 109 | (len(self.responses), self.config.name, curr_miss_count, 110 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 111 | 112 | def clear_and_return(self): 113 | resp = self.responses 114 | self.responses = list() 115 | return resp 116 | -------------------------------------------------------------------------------- /README_CN_simple.md: -------------------------------------------------------------------------------- 1 | # idataapi-transform 2 | 3 | **idataapi-transform** 是一个纯python实现的,所有功能均支持异步化处理的工具包,你可以使用他将数据从一个位置/格式方便的转换到另一个位置/格式,提供易用的**命令行调用支持**和功能丰富的**python模块支持** 4 | 5 | 该工具现用于 [IDataAPI](http://www.idataapi.cn/) 团队作为基础工具包,以提高效率 6 | 7 | ##### idataapi 转换工具(简洁版) 8 | 9 | ------------------- 10 | 11 | 以下是简单的命令行示例,如果需要详细的命令行示例/代码调用示例: 12 | 13 | * [详细中文说明戳这里](https://github.com/zpoint/idataapi-transform/blob/master/README_CN.md) 14 | 15 | ------------------- 16 | 17 | #### 环境要求 18 | * python 版本号 >= 3.5.2 19 | * 如果你需要使用 MySQL 模块, 你的 python 版本号要 >= 3.5.3 20 | * 如果你需要使用 MongoDB 模块,你需要在非 Windows 下 21 | 22 | ------------------- 23 | 24 | #### 安装指南 25 | 26 | python3 -m pip install idataapi-transform 27 | # 安装完成后在终端跑如下命令 28 | transform --help # 解释各个参数的作用以及创建默认的配置文件 29 | # 编辑配置文件 ~/idataapi-transform.ini 配置 ElasticSearch, redis, mysql 主机, 端口, 默认并发数等参数 30 | 31 | # 如果你的 python 版本 >= 3.5.3, 并且需要安装 MySQL 模块 32 | python3 -m pip install 'PyMySQL<=0.9.2,>=0.9' 33 | python3 -m pip install aiomysql 34 | 35 | # 如果你不在 Windows 下, 并且需要安装 MongoDB 模块 36 | python3 -m pip install motor 37 | 38 | 39 | ------------------- 40 | 41 | #### 命令行支持及示例 42 | 43 | * 从以下任意一格式读数据 **[API, ES, CSV, XLSX, JSON, Redis, MySQL, MongoDB]** 44 | * 写数据至以下任意一格式 **[CSV, XLSX, JSON, TXT, ES, Redis, MySQL, MongoDB, Kafka]** 45 | 46 | ##### 从 API 读取数据 转换为 XLSX 格式 47 | 48 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.xlsx** (默认参数) 49 | 50 | transform API xlsx "http://xxx/post/dengta?kw=中国石化&apikey=xxx" 51 | 52 | ##### 从 API 读取数据 转换为 XLSX 格式 53 | 54 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入到 /Users/zpoint/Desktop/result.xlsx 中, **写入文件为可选参数, 可以不填, 默认参数是 ./result.xlsx** 55 | 56 | transform API xlsx "http://xxx/post/dengta?kw=中国石化&apikey=xxx" "/Users/zpoint/Desktop/result" 57 | 58 | ##### 从 API 读取数据 转换为 CSV 格式 59 | 60 
| 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.csv** (默认参数) 61 | 62 | transform API csv "http://xxx/post/dengta?kw=中国石化&apikey=xxx" 63 | 64 | ##### 从 API 读取数据 转换为 CSV 格式 65 | 66 | w_encoding 表示写入文件的编码,默认为 utf8 67 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.csv** (默认参数), ./result.csv 以 gbk 编码保存 68 | 69 | transform API csv "http://xxx/post/dengta?kw=中国石化&apikey=xxx" --w_encoding=gbk 70 | 71 | 72 | ##### 从 API 读取数据 转换为 JSON 格式 73 | 74 | JSON 为一行一条数据的 JSON 文件 75 | 会从提供的http请求读取所有数据(翻到最后一页为止), 并写入 **./result.json** (默认参数) 76 | 77 | transform API json "http://xxx/post/dengta?kw=中国石化&apikey=xxx" 78 | 79 | ##### 从 API 读取数据 转换为 JSON 格式 80 | 81 | max_limit 表示最多只获取到这么多条数据 82 | 会从提供的http请求读取所有数据(翻到最后一页或者获取到超过100条为止), 并写入 **./result.json** (默认参数) 83 | 84 | transform API json "http://xxx/post/dengta?kw=中国石化&apikey=xxx" --max_limit=100 85 | 86 | ##### 从 CSV 读取数据 转换至 xlsx 87 | 88 | 会从 ./a.csv 读取数据, 并保存至 **./result.xlsx** 89 | 90 | transform CSV xlsx "./a.csv" 91 | 92 | 93 | ##### 从 Elasticsearch 读取数据 转换至 CSV (复杂示例) 94 | * 以 "gbk" **(--w_encoding)** 编码保存 CSV 文件 95 | * 指定 ES 的 index: knowledge20170517 **(knowledge20170517)** 96 | * 指定如下过滤条件 **(--query_body)** 97 | 98 | body = { 99 | "size": 100, 100 | "_source": { 101 | "includes": ["location", "title", "city", "id"] 102 | } 103 | } 104 | 105 | * 在写入 CSV 之前, 为每一条获取到的数据增加时间戳,以及移除 "city" 字段为空的对象 **(--filter)** 106 | 107 | # 创建一个文件叫做 my_filter.py (随便什么名字都行) 108 | import time 109 | def my_filter(item): # 函数名必须为 "my_filter" 110 | # item 是一条数据,在这里是一个字段对象 111 | item["createtime"] = int(time.time()) 112 | if item["city"]: 113 | return item # item 会被写入你指定的目的地 114 | # 执行到了这里, 说明返回 None, 这一条 item 会被抛弃,不会被写入目的地 115 | 116 | * 终端: 117 | 118 | transform ES csv "knowledge20170517" --w_encoding gbk --query_body '{"size": 100, "_source": {"includes": ["location", "title", "city", "id"]}}' --filter ./my_filter.py 119 | 120 | ##### 从 API 读取数据 存储至 Redis 121 | 122 | * 键名称为 my_key 123 | * redis 存储/读取 支持 LIST, 以及 HASH 两种数据结构, 默认为 LIST, 可用参数 --key_type 指明 124 | 125 | 会从 ./a.csv 读取数据, 并保存至 **./result.xlsx** 126 | 127 | transform API redis "http://xxx/post/dengta?kw=中国石化&apikey=xxx" "/Users/zpoint/Desktop/result" 128 | 129 | ##### 从 Redis 读取数据 存储至 csv 130 | 131 | 会从 my_key 中读取至多100条数据, 并保存至 **./result.csv** 132 | 133 | transform Redis csv my_key --max_limit 100 134 | 135 | ##### 从 API 读取数据 写入 MySQL 136 | 137 | * 当表格不存在是自动创建 138 | 139 | 会至多从API获取50条数据, 写入 MySQL 表格: **my_table** 140 | 141 | transform API MYSQL 'http://xxx' my_table --max_limit=50 142 | 143 | ##### 从 MySQL 读取数据 写入 redis 144 | 145 | 会从 MySQL 表格 **table** 获取数据,每次网络请求60条数据,写入 redis LIST 结构,默认键名称为 result 146 | 147 | transform MYSQL redis my_table --per_limit=60 148 | 149 | ##### 从 MongoDB 读取数据 写入 csv 150 | 151 | * 你也可以提供 --query_body 参数进行过滤查询 152 | 153 | 会从 my_coll 中读取至多50条数据, 并保存至 **./result.csv** 154 | 155 | transform MONGO csv my_coll --max_limit=50 156 | 157 | ------------------- 158 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/ESGetter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | import logging 4 | import traceback 5 | from .BaseGetter import BaseGetter 6 | 7 | 8 | class ESScrollGetter(BaseGetter): 9 | def __init__(self, config): 10 | super().__init__(self) 11 | self.config = config 12 | self.es_client = config.es_client 13 | 14 | self.total_size = None 15 | self.result = None 16 | self.scroll_id = None 17 | self.miss_count = 0 18 | self.total_count = 0 19 | 
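    # Iteration contract (implemented by __anext__ below):
    #   - the first call issues an initial search with the "scroll" parameter and
    #     records the total hit count, capped by config.max_limit when it is set
    #   - later calls follow the returned _scroll_id, retrying failed scroll
    #     requests up to config.max_retry times with a random sleep in between
    #   - each batch is optionally reduced to the "_source" documents and passed
    #     through config.filter, with dropped items counted in miss_count
    #   - iteration stops once total_size is reached or a scroll page comes back
    #     empty; state is then reset via init_val() and StopAsyncIteration raised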
20 | def __aiter__(self): 21 | return self 22 | 23 | def init_val(self): 24 | self.total_size = None 25 | self.result = None 26 | self.scroll_id = None 27 | self.miss_count = 0 28 | self.total_count = 0 29 | 30 | async def __anext__(self, retry=1): 31 | if self.total_size is None: 32 | self.result = await self.es_client.search( 33 | index=self.config.indices, doc_type=self.config.doc_type, 34 | params={"scroll": self.config.scroll}, body=self.config.query_body 35 | ) 36 | self.total_size = self.result['hits']['total']['value'] 37 | self.total_size = self.config.max_limit if (self.config.max_limit and self.config.max_limit < self.result['hits']['total']['value']) else self.total_size 38 | self.total_count += len(self.result['hits']['hits']) 39 | logging.info("Get %d items from %s, percentage: %.2f%%" % 40 | (len(self.result['hits']['hits']), self.config.indices + "->" + str(self.config.doc_type), 41 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 42 | 43 | origin_length = len(self.result['hits']['hits']) 44 | if self.config.return_source: 45 | results = [i["_source"] for i in self.result['hits']['hits']] 46 | else: 47 | results = self.result 48 | if self.config.filter: 49 | results = [self.config.filter(i) for i in results] 50 | results = [i for i in results if i] 51 | self.miss_count += origin_length - len(results) 52 | self.get_score_id_and_clear_result() 53 | return results 54 | 55 | if self.scroll_id and self.total_count < self.total_size: 56 | try: 57 | self.result = await self.es_client.scroll(scroll_id=self.scroll_id, 58 | scroll=self.config.scroll) 59 | except Exception as e: 60 | if retry < self.config.max_retry: 61 | logging.error("retry: %d, %s" % (retry, str(e))) 62 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 63 | return await self.__anext__(retry+1) 64 | else: 65 | logging.error("Give up es getter, After retry: %d times, still fail to get result: %s, " 66 | "total get %d items, total filtered: %d items, reason: %s" % 67 | (self.config.max_retry, self.config.indices + "->" + str(self.config.doc_type), 68 | self.total_count, self.miss_count, traceback.format_exc())) 69 | raise StopAsyncIteration 70 | 71 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 72 | (len(self.result['hits']['hits']), self.config.indices + "->" + str(self.config.doc_type), 73 | self.miss_count, (self.total_count / self.total_size * 100) if self.total_size else 0)) 74 | 75 | origin_length = len(self.result['hits']['hits']) 76 | self.total_count += origin_length 77 | if self.config.return_source: 78 | results = [i["_source"] for i in self.result['hits']['hits']] 79 | else: 80 | results = self.result 81 | if self.config.filter: 82 | results = [self.config.filter(i) for i in results] 83 | results = [i for i in results if i] 84 | self.miss_count += origin_length - len(results) 85 | 86 | self.get_score_id_and_clear_result() 87 | if origin_length > 0: 88 | return results 89 | else: 90 | # if scroll empty item, means no more next page 91 | logging.info("empty result, terminating scroll, scroll id: %s" % (str(self.scroll_id), )) 92 | 93 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 94 | (self.config.indices + "->" + str(self.config.doc_type), self.total_count, self.miss_count)) 95 | self.init_val() 96 | raise StopAsyncIteration 97 | 98 | async def delete_all(self): 99 | """ 100 | inefficient delete 101 | """ 102 | body = { 103 | "query": { 104 | "match_all": {} 105 | } 
106 | } 107 | result = await self.config.es_client.delete_by_query(index=self.config.indices, doc_type=self.config.doc_type, 108 | body=body, params={"conflicts": "proceed"}) 109 | return result 110 | 111 | def __iter__(self): 112 | raise ValueError("ESGetter must be used with async generator, not normal generator") 113 | 114 | def get_score_id_and_clear_result(self): 115 | if "_scroll_id" in self.result and self.result["_scroll_id"]: 116 | self.scroll_id = self.result["_scroll_id"] 117 | else: 118 | self.scroll_id = None 119 | self.result = dict() 120 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/RedisGetter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | import logging 4 | import traceback 5 | import json 6 | import zlib 7 | from .BaseGetter import BaseGetter 8 | 9 | 10 | class RedisGetter(BaseGetter): 11 | def __init__(self, config): 12 | super().__init__(self) 13 | self.config = config 14 | self.is_range = self.config.is_range 15 | self.need_del = self.config.need_del 16 | self.responses = list() 17 | self.done = False 18 | self.total_size = None 19 | self.miss_count = 0 20 | self.total_count = 0 21 | self.redis_object_length = 0 22 | 23 | def init_val(self): 24 | self.responses = list() 25 | self.done = False 26 | self.miss_count = 0 27 | self.total_count = 0 28 | self.redis_object_length = 0 29 | self.total_size = None 30 | 31 | def decode(self, loaded_object): 32 | if self.config.compress: 33 | return zlib.decompress(loaded_object).decode(self.config.encoding) 34 | else: 35 | return json.loads(loaded_object) 36 | 37 | def __aiter__(self): 38 | return self 39 | 40 | async def __anext__(self, retry=1): 41 | await self.config.get_redis_pool_cli() # init redis pool 42 | if self.is_range and self.total_size is None: 43 | self.redis_object_length = await self.config.redis_len_method(self.config.key) 44 | self.total_size = self.config.max_limit if (self.config.max_limit and self.config.max_limit < self.redis_object_length) else self.redis_object_length 45 | 46 | if self.done: 47 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 48 | (self.config.name, self.total_count, self.miss_count)) 49 | self.init_val() 50 | raise StopAsyncIteration 51 | 52 | if self.is_range: 53 | if self.config.direction == "L": 54 | left = self.total_count 55 | right = self.total_count + self.config.per_limit - 1 56 | else: 57 | left = self.total_size - self.config.per_limit - 1 58 | if left < 0: 59 | left = 0 60 | right = left + self.config.per_limit 61 | 62 | try: 63 | self.responses = await self.config.redis_read_method(self.config.key, left, right) 64 | self.responses = [self.decode(i) for i in self.responses] 65 | except Exception as e: 66 | if retry < self.config.max_retry: 67 | logging.error("retry: %d, %s" % (retry, str(e))) 68 | await asyncio.sleep(random.randint(self.config.random_min_sleep, self.config.random_max_sleep)) 69 | return await self.__anext__(retry+1) 70 | else: 71 | logging.error("Give up redis getter, After retry: %d times, still fail to get key: %s, " 72 | "total get %d items, total filtered: %d items, error: %s" % (self.config.max_retry, self.config.key, self.total_count, self.miss_count, str(traceback.format_exc()))) 73 | raise StopAsyncIteration 74 | 75 | if len(self.responses) < self.config.per_limit or not self.responses or self.total_count + len(self.responses) >= self.total_size: 76 | self.done 
= True 77 | if self.need_del: 78 | await self.config.redis_del_method(self.config.key) 79 | else: 80 | 81 | try: 82 | self.responses = await self.config.redis_read_method(self.config.key) 83 | self.responses = [self.decode(i) for i in self.responses.values()][:self.total_size] 84 | except Exception as e: 85 | if retry < self.config.max_retry: 86 | logging.error("retry: %d, %s" % (retry, str(e))) 87 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 88 | return await self.__anext__(retry+1) 89 | else: 90 | logging.error("Give up redis getter, After retry: %d times, still fail to get key: %s, " 91 | "total get %d items, total filtered: %d items, reason: %s" % 92 | (self.config.max_retry, self.config.key, self.total_count, self.miss_count, str(traceback.format_exc()))) 93 | raise StopAsyncIteration 94 | 95 | if self.config.max_limit: 96 | self.responses = self.responses[:self.config.max_limit] 97 | self.done = True 98 | if self.need_del: 99 | await self.config.redis_del_method(self.config.key) 100 | 101 | current_response_length = len(self.responses) 102 | curr_miss_count = 0 103 | self.total_count += current_response_length 104 | if self.config.filter: 105 | target_responses = list() 106 | for i in self.responses: 107 | if self.config.filter: 108 | i = self.config.filter(i) 109 | if i: 110 | target_responses.append(i) 111 | else: 112 | curr_miss_count += 1 113 | self.responses = target_responses 114 | 115 | self.miss_count += curr_miss_count 116 | if self.is_range: 117 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 118 | (current_response_length, self.config.name, curr_miss_count, 119 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 120 | return self.clear_and_return() 121 | 122 | def __iter__(self): 123 | raise ValueError("RedisGetter must be used with async generator, not normal generator") 124 | 125 | def clear_and_return(self): 126 | resp = self.responses 127 | self.responses = list() 128 | return resp 129 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/MySQLGetter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import traceback 4 | import random 5 | import logging 6 | from .BaseGetter import BaseGetter 7 | 8 | 9 | class MySQLGetter(BaseGetter): 10 | def __init__(self, config): 11 | super().__init__(self) 12 | self.config = config 13 | self.responses = list() 14 | self.miss_count = 0 15 | self.total_count = 0 16 | self.total_size = None 17 | self.key_fields = list() 18 | self.key_fields_map = dict() 19 | self.need_finish = False 20 | 21 | def init_val(self): 22 | self.responses = list() 23 | self.miss_count = 0 24 | self.total_count = 0 25 | self.total_size = None 26 | self.key_fields = list() 27 | self.key_fields_map = dict() 28 | self.need_finish = False 29 | 30 | def __aiter__(self): 31 | return self 32 | 33 | async def __anext__(self): 34 | await self.config.get_mysql_pool_cli() # init mysql pool 35 | 36 | if self.need_finish: 37 | await self.finish() 38 | 39 | if self.total_size is None: 40 | self.total_size, self.key_fields = await self.get_total_size_and_key_field() 41 | 42 | if self.total_count < self.total_size: 43 | await self.fetch_per_limit() 44 | return self.clear_and_return() 45 | 46 | # reach here, means done 47 | await self.finish() 48 | 49 | def __iter__(self): 50 | raise ValueError("MySQLGetter must be used with 
async generator, not normal generator") 51 | 52 | async def finish(self): 53 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 54 | (self.config.name, self.total_count, self.miss_count)) 55 | self.init_val() 56 | self.config.free_resource() 57 | raise StopAsyncIteration 58 | 59 | async def get_total_size_and_key_field(self): 60 | await self.config.cursor.execute("DESC %s" % (self.config.table, )) 61 | result = await self.config.cursor.fetchall() 62 | field = result[0][0] 63 | await self.config.cursor.execute("select count(%s) from %s" % (field, self.config.table)) 64 | result = await self.config.cursor.fetchone() 65 | # key field 66 | await self.config.cursor.execute("DESC %s" % (self.config.table, )) 67 | results = await self.config.cursor.fetchall() 68 | key_fields = list() 69 | for each in results: 70 | key_fields.append(each[0]) 71 | if "tinyint" in each[1]: 72 | self.key_fields_map[each[0]] = bool 73 | elif "text" in each[1]: 74 | self.key_fields_map[each[0]] = str # or json 75 | 76 | key_fields = list(i[0] for i in results) 77 | return result[0], key_fields 78 | 79 | async def fetch_per_limit(self): 80 | results = list() 81 | try_time = 0 82 | while try_time < self.config.max_retry: 83 | try: 84 | await self.config.cursor.execute("SELECT * FROM %s LIMIT %d,%d" % 85 | (self.config.table, self.total_count, self.config.per_limit)) 86 | results = await self.config.cursor.fetchall() 87 | break 88 | except Exception as e: 89 | try_time += 1 90 | if try_time < self.config.max_retry: 91 | logging.error("retry: %d, %s" % (try_time, str(e))) 92 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 93 | else: 94 | logging.error("Give up MySQL getter: %s, After retry: %d times, still fail, " 95 | "total get %d items, total filtered: %d items, reason: %s" % 96 | (self.config.name, self.config.max_retry, self.total_count, self.miss_count, 97 | str(traceback.format_exc()))) 98 | self.need_finish = True 99 | 100 | self.responses = [self.decode(i) for i in results] 101 | curr_miss_count = 0 102 | if self.config.filter: 103 | target_results = list() 104 | for each in results: 105 | each = self.config.filter(each) 106 | if each: 107 | target_results.append(each) 108 | else: 109 | curr_miss_count += 1 110 | self.responses = target_results 111 | self.miss_count += curr_miss_count 112 | 113 | self.total_count += len(results) 114 | logging.info("Get %d items from %s, filtered: %d items, percentage: %.2f%%" % 115 | (len(results), self.config.name, curr_miss_count, 116 | (self.total_count / self.total_size * 100) if self.total_size else 0)) 117 | if self.total_count >= self.total_size: 118 | self.need_finish = True 119 | return 120 | 121 | def decode(self, item): 122 | """ 123 | :param item: tuple 124 | :return: dict 125 | """ 126 | ret_dict = dict() 127 | index = 0 128 | for key in self.key_fields: 129 | if key in self.key_fields_map: 130 | if self.key_fields_map[key] is bool: 131 | ret_dict[key] = bool(item[index]) 132 | elif item[index] is None: 133 | ret_dict[key] = None 134 | elif item[index][0] in ("{", "["): 135 | try: 136 | val = json.loads(item[index]) 137 | except json.decoder.JSONDecodeError: 138 | val = item[index] 139 | ret_dict[key] = val 140 | else: 141 | ret_dict[key] = item[index] 142 | else: 143 | ret_dict[key] = item[index] 144 | index += 1 145 | return ret_dict 146 | 147 | def clear_and_return(self): 148 | resp = self.responses 149 | self.responses = list() 150 | return resp 151 | 
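# Usage sketch: MySQLGetter is an async iterator, so it must be driven from a
# coroutine with "async for", and each iteration yields one list of row dicts.
# The imports mirror the ones in cli.py; the RMySQLConfig arguments used here
# ("my_table", per_limit, max_limit) are illustrative assumptions, so check
# GetterConfig.RMySQLConfig for the authoritative signature.
if __name__ == "__main__":
    import asyncio
    from idataapi_transform.DataProcess.ProcessFactory import ProcessFactory
    from idataapi_transform.DataProcess.Config.ConfigUtil import GetterConfig

    async def dump_table():
        config = GetterConfig.RMySQLConfig("my_table", per_limit=100, max_limit=1000)
        getter = ProcessFactory.create_getter(config)
        async for rows in getter:
            # each batch is already decoded into dicts by MySQLGetter.decode
            print("fetched %d rows" % (len(rows), ))

    asyncio.get_event_loop().run_until_complete(dump_table())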
-------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/MainConfig.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import configparser 4 | from os.path import expanduser 5 | from .LogConfig import init_log, remove_log 6 | from .ESConfig import init_es 7 | 8 | 9 | default_configure_content = """ 10 | [main] 11 | # default max concurrency value for APIGetter 12 | concurrency = 50 13 | 14 | # buffer size 15 | per_limit = 100 16 | 17 | # fetch at most max_limit items 18 | max_limit = None 19 | 20 | # max retry for getter before give up if fail to get data 21 | max_retry = 3 22 | 23 | # sleep interval if fail 24 | random_min_sleep = 1 25 | random_max_sleep = 3 26 | 27 | [es] 28 | # elasticsearch host 29 | # hosts = ["localhost:9393"] 30 | 31 | # elasticsearch headers when perform http request 32 | # headers = {"Host": "localhost", "value": "value"} 33 | 34 | # request timeout, seconds 35 | # timeout = 10 36 | 37 | # http auth 38 | # http_auth = ["user", "passwd"] 39 | 40 | [log] 41 | # a directory to save log file 42 | # path = /Users/zpoint/Desktop/idataapi-transform/logs/ 43 | 44 | # max byte per log file 45 | # log_byte = 5242880 46 | """ 47 | 48 | redis_config_content = """ 49 | [redis] 50 | host = localhost 51 | port = 0 52 | db = 0 53 | password = 54 | timeout = 3 55 | encoding = utf8 56 | # whether need to del the key after get object from redis, 0 means false, 1 means true 57 | need_del = 0 58 | # default direction when read/write , "L" means lpop/lpush, "R" means rpop/rpush 59 | direction = L 60 | """ 61 | 62 | mysql_config_content = """ 63 | [mysql] 64 | host = localhost 65 | port = 0 66 | user = root 67 | password = 68 | database = 69 | # default charset 70 | encoding = utf8 71 | """ 72 | 73 | mongo_config_content = """ 74 | [mongo] 75 | protocal = mongodb # or mongodb+srv 76 | host = localhost 77 | port = 0 78 | username = 79 | password = 80 | database = test_database 81 | other_params = 82 | """ 83 | 84 | kafka_config_content = """ 85 | [kafka] 86 | bootstrap.servers = localhost:9092 87 | """ 88 | 89 | main_config_box = None 90 | 91 | 92 | class MainConfig(object): 93 | def __init__(self, ini_path=None): 94 | global main_config_box 95 | main_config_box = self 96 | # singleton 97 | if not hasattr(self, "__instance"): 98 | if not ini_path: 99 | home = expanduser("~") 100 | ini_path = home + "/idataapi-transform.ini" 101 | 102 | if not os.path.exists(ini_path): 103 | with open(ini_path, "w") as f: 104 | f.write(default_configure_content + redis_config_content + mysql_config_content + mongo_config_content) 105 | 106 | if os.path.exists("./idataapi-transform.ini"): 107 | ini_path = "./idataapi-transform.ini" 108 | 109 | self.read_config(ini_path) 110 | 111 | def read_config(self, ini_path): 112 | self.ini_path = ini_path 113 | self.__instance = configparser.ConfigParser() 114 | 115 | self.__instance.read(ini_path) 116 | MainConfig.__instance = self.__instance 117 | 118 | self.has_log_file = self.__instance.has_log_file = self.config_log() 119 | self.has_es_configured = self.__instance.has_es_configured = self.config_es() 120 | self.has_redis_configured = self.__instance.has_redis_configured = self.config_redis() 121 | self.has_mysql_configured = self.__instance.has_mysql_configured = self.config_mysql() 122 | self.has_mongo_configured = self.__instance.has_mongo_configured = self.config_mongo() 123 | self.has_kafka_configured = 
self.__instance.has_kafka_configured = self.config_kafka() 124 | 125 | self.__instance.ini_path = self.ini_path 126 | 127 | def __call__(self): 128 | return self.__instance 129 | 130 | def config_log(self, log_path=None, max_log_file_bytes=None): 131 | remove_log() 132 | if log_path: 133 | manual = True 134 | else: 135 | max_log_file_bytes = self.__instance["log"].getint("log_byte") 136 | log_path = self.__instance["log"].get("path") 137 | manual = False 138 | return init_log(log_path, max_log_file_bytes, self.ini_path, manual=manual) 139 | 140 | def config_es(self): 141 | hosts = self.__instance["es"].get("hosts") 142 | timeout = self.__instance["es"].getint("timeout") 143 | http_auth = self.__instance["es"].get("http_auth") 144 | if hosts: 145 | try: 146 | hosts = json.loads(hosts) 147 | except Exception as e: 148 | raise ValueError("es host must be json serialized") 149 | 150 | headers = self.__instance["es"].get("headers") 151 | if headers and headers != "None": 152 | try: 153 | headers = json.loads(headers) 154 | except Exception as e: 155 | raise ValueError("es headers must be json serialized") 156 | if http_auth and http_auth != "None": 157 | try: 158 | http_auth = json.loads(http_auth) 159 | except Exception as e: 160 | raise ValueError("es http_auth must be json serialized") 161 | else: 162 | headers = None 163 | return init_es(hosts, headers, timeout, http_auth) 164 | 165 | def config_redis(self): 166 | try: 167 | self.__instance["redis"].get("port") 168 | except KeyError as e: 169 | with open(self.ini_path, "a+") as f: 170 | f.write(redis_config_content) 171 | self.__instance.read(self.ini_path) 172 | 173 | port = self.__instance["redis"].getint("port") 174 | return port > 0 175 | 176 | def config_mysql(self): 177 | try: 178 | self.__instance["mysql"].get("port") 179 | except KeyError as e: 180 | with open(self.ini_path, "a+") as f: 181 | f.write(mysql_config_content) 182 | self.__instance.read(self.ini_path) 183 | 184 | port = self.__instance["mysql"].getint("port") 185 | return port > 0 186 | 187 | def config_mongo(self): 188 | try: 189 | self.__instance["mongo"].get("port") 190 | except KeyError as e: 191 | with open(self.ini_path, "a+") as f: 192 | f.write(mongo_config_content) 193 | self.__instance.read(self.ini_path) 194 | 195 | port = self.__instance["mongo"].getint("port") 196 | return port > 0 197 | 198 | def config_kafka(self): 199 | try: 200 | self.__instance["kafka"].get("bootstrap.servers") 201 | except KeyError as e: 202 | with open(self.ini_path, "a+") as f: 203 | f.write(kafka_config_content) 204 | self.__instance.read(self.ini_path) 205 | 206 | return "bootstrap.servers" in self.__instance["kafka"] 207 | 208 | 209 | main_config = MainConfig() 210 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ESConfig.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import logging 4 | from collections.abc import Iterable 5 | from elasticsearch._async.transport import AsyncTransport as OriginAsyncTransport 6 | from elasticsearch._async.client.utils import _make_path 7 | from elasticsearch import TransportError 8 | from elasticsearch.exceptions import ConnectionError, ConnectionTimeout 9 | from elasticsearch import AsyncElasticsearch 10 | 11 | es_hosts = None 12 | http_auth = None 13 | 14 | 15 | def init_es(hosts, es_headers, timeout_, http_auth_): 16 | global es_hosts, http_auth, AsyncElasticsearch, AsyncTransport 17 | es_hosts = hosts 
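    # the hosts (and, just below, the auth tuple) are kept in module-level globals
    # so that get_es_client() at the bottom of this file can lazily build
    # AsyncElasticsearch clients with these configured defaults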
18 | http_auth = tuple(http_auth_) if isinstance(http_auth_, Iterable) else None 19 | if not es_hosts: 20 | return False 21 | 22 | class MyAsyncTransport(OriginAsyncTransport): 23 | """ 24 | Override default AsyncTransport to add timeout 25 | """ 26 | async def perform_request(self, method, url, params=None, body=None, timeout=None, headers=None): 27 | await self._async_call() 28 | 29 | method, headers, params, body, ignore, __timeout = self._resolve_request_args( 30 | method, headers, params, body 31 | ) 32 | 33 | for attempt in range(self.max_retries + 1): 34 | connection = self.get_connection() 35 | 36 | try: 37 | status, headers, data = await connection.perform_request( 38 | method, 39 | url, 40 | params, 41 | body, 42 | headers=headers, 43 | ignore=ignore, 44 | timeout=timeout, 45 | ) 46 | except TransportError as e: 47 | if method == "HEAD" and e.status_code == 404: 48 | return False 49 | 50 | retry = False 51 | if isinstance(e, ConnectionTimeout): 52 | retry = self.retry_on_timeout 53 | elif isinstance(e, ConnectionError): 54 | retry = True 55 | elif e.status_code in self.retry_on_status: 56 | retry = True 57 | 58 | if retry: 59 | try: 60 | # only mark as dead if we are retrying 61 | self.mark_dead(connection) 62 | except TransportError: 63 | # If sniffing on failure, it could fail too. Catch the 64 | # exception not to interrupt the retries. 65 | pass 66 | # raise exception on last retry 67 | if attempt == self.max_retries: 68 | raise e 69 | else: 70 | raise e 71 | 72 | else: 73 | # connection didn't fail, confirm it's live status 74 | self.connection_pool.mark_live(connection) 75 | 76 | if method == "HEAD": 77 | return 200 <= status < 300 78 | 79 | if data: 80 | data = self.deserializer.loads(data, headers.get("content-type")) 81 | return data 82 | 83 | class MyAsyncElasticsearch(AsyncElasticsearch): 84 | def __init__(self, *args, **kwargs): 85 | super().__init__(*args, **kwargs) 86 | if "headers" in kwargs: 87 | self.headers = kwargs["headers"] 88 | else: 89 | self.headers = None 90 | 91 | async def add_dict_to_es(self, indices, doc_type, items, id_hash_func, app_code=None, actions=None, 92 | create_date=None, error_if_fail=True, timeout=None, auto_insert_createDate=True): 93 | if not actions: 94 | actions = "index" 95 | body = "" 96 | for item in items: 97 | if app_code: 98 | item["appCode"] = app_code 99 | if auto_insert_createDate and "createDate" not in item: 100 | if create_date: 101 | item["createDate"] = create_date 102 | else: 103 | item["createDate"] = int(time.time()) 104 | 105 | action = { 106 | actions: { 107 | "_index": indices, 108 | "_type": doc_type, 109 | "_id": id_hash_func(item) 110 | } 111 | } 112 | if actions == "update": 113 | item = {"doc": item} 114 | body += json.dumps(action) + "\n" + json.dumps(item) + "\n" 115 | try: 116 | success = fail = 0 117 | r = await self.transport.perform_request( 118 | "POST", "/_bulk?pretty", body=body, timeout=timeout or timeout_, headers=self.headers or es_headers) 119 | if r["errors"]: 120 | for item in r["items"]: 121 | for k, v in item.items(): 122 | if "error" in v: 123 | if error_if_fail: 124 | # log error 125 | logging.error(json.dumps(v["error"])) 126 | fail += 1 127 | else: 128 | success += 1 129 | else: 130 | success = len(r["items"]) 131 | return success, fail, r 132 | except Exception as e: 133 | import traceback 134 | logging.error(traceback.format_exc()) 135 | logging.error("elasticsearch Exception, give up: %s" % (str(e), )) 136 | return None, None, None 137 | 138 | async def search( 139 | self, body=None, 
index=None, doc_type=None, params=None, headers=None 140 | ): 141 | if "from_" in params: 142 | params["from"] = params.pop("from_") 143 | 144 | return await self.transport.perform_request( 145 | "POST", 146 | _make_path(index, doc_type, "_search"), 147 | params=params, 148 | headers=headers if headers else self.headers, 149 | body=body, 150 | ) 151 | 152 | OriginAsyncTransport.perform_request = MyAsyncTransport.perform_request 153 | 154 | AsyncElasticsearch = MyAsyncElasticsearch 155 | return True 156 | 157 | 158 | global_client = None 159 | 160 | 161 | def get_es_client(hosts=None, headers=None): 162 | global global_client 163 | if not hosts: 164 | if global_client is None: 165 | global_client = AsyncElasticsearch(hosts=es_hosts, headers=headers, http_auth=http_auth) 166 | return global_client 167 | else: 168 | return AsyncElasticsearch(hosts=hosts, headers=headers, http_auth=http_auth) 169 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataWriter/MySQLWriter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import random 4 | import logging 5 | import traceback 6 | from .BaseWriter import BaseWriter 7 | 8 | 9 | class MySQLWriter(BaseWriter): 10 | def __init__(self, config): 11 | super().__init__() 12 | self.config = config 13 | self.total_miss_count = 0 14 | self.success_count = 0 15 | self.table_checked = False 16 | self.key_fields = list() 17 | self.auto_increment_keys = set() 18 | 19 | async def write(self, responses): 20 | await self.config.get_mysql_pool_cli() # init mysql pool 21 | 22 | miss_count = 0 23 | original_length = len(responses) 24 | if self.config.filter: 25 | target_responses = list() 26 | for i in responses: 27 | i = self.config.filter(i) 28 | if i: 29 | target_responses.append(i) 30 | else: 31 | miss_count += 1 32 | responses = target_responses 33 | 34 | if not responses: 35 | self.finish_once(miss_count, original_length) 36 | return 37 | 38 | # After filtered, still have responses to write 39 | if not self.table_checked: 40 | await self.table_check(responses) 41 | 42 | if await self.perform_write(responses): 43 | self.finish_once(miss_count, original_length) 44 | 45 | def __exit__(self, exc_type, exc_val, exc_tb): 46 | self.config.free_resource() 47 | logging.info("%s write done, total filtered %d item, total write %d item" % 48 | (self.config.name, self.total_miss_count, self.success_count)) 49 | 50 | def __enter__(self): 51 | return self 52 | 53 | def finish_once(self, miss_count, original_length): 54 | self.total_miss_count += miss_count 55 | self.success_count += original_length 56 | logging.info("%s write %d item, filtered %d item" % (self.config.name, original_length - miss_count, miss_count)) 57 | 58 | async def table_check(self, responses): 59 | await self.config.cursor.execute("SHOW TABLES LIKE '%s'" % (self.config.table, )) 60 | result = await self.config.cursor.fetchone() 61 | if result is None: 62 | await self.create_table(responses) 63 | # check field 64 | await self.config.cursor.execute("DESC %s" % (self.config.table, )) 65 | results = await self.config.cursor.fetchall() 66 | for field in results: 67 | if "auto_increment" in field: 68 | self.auto_increment_keys.add(field[0]) 69 | 70 | fields = set(i[0] for i in results) 71 | self.key_fields = list(i[0] for i in results) 72 | real_keys = set(responses[0].keys()) 73 | difference_set = real_keys.difference(fields) 74 | if difference_set: 75 | # real keys not subset 
of fields 76 | raise ValueError("Field %s not in MySQL Table: %s" % (str(difference_set), self.config.table)) 77 | 78 | self.table_checked = True 79 | 80 | async def create_table(self, responses): 81 | test_response = dict() 82 | for response in responses[:50]: 83 | for k, v in response.items(): 84 | if k not in test_response: 85 | test_response[k] = v 86 | elif test_response[k] is None: 87 | test_response[k] = v 88 | elif isinstance(v, dict) or isinstance(v, list): 89 | if len(json.dumps(test_response[k])) < len(json.dumps(v)): 90 | test_response[k] = v 91 | elif v is not None and test_response[k] < v: 92 | test_response[k] = v 93 | 94 | sql = """ 95 | CREATE TABLE `%s` ( 96 | """ % (self.config.table, ) 97 | first_field = True 98 | for key, value in responses[0].items(): 99 | if "Count" in key: 100 | field_type = "BIGINT" 101 | elif value is None: 102 | field_type = "TEXT" 103 | elif key in ("content", ) or isinstance(value, dict) or isinstance(value, list): 104 | field_type = "TEXT" 105 | elif isinstance(value, bool): 106 | field_type = "BOOLEAN" 107 | elif isinstance(value, int): 108 | field_type = "BIGINT" 109 | elif isinstance(value, float): 110 | field_type = "DOUBLE" 111 | # varchar can store at most 65536 bytes, utf8 occupy 1-8 bytes per character, 112 | # so length should be less than 65536 / 8 = 8192 113 | # assume this field (the shortest length) * 4 <= the longest length(8192) 114 | elif len(value) > 2048: 115 | field_type = "TEXT" 116 | else: 117 | length = len(value) * 4 118 | if length < 256: 119 | length = 256 120 | field_type = "VARCHAR(%d)" % (length, ) 121 | sql += ("\t" if first_field else "\t\t") + "`%s` %s" % (key, field_type) 122 | if key == "id": 123 | sql += " NOT NULL,\n" 124 | else: 125 | sql += ",\n" 126 | if first_field: 127 | first_field = False 128 | 129 | tail_sql = """ 130 | \tPRIMARY KEY (`id`) 131 | ) ENGINE=InnoDB DEFAULT CHARSET=%s 132 | """ % (self.config.charset, ) 133 | sql += tail_sql 134 | logging.info("Creating table: %s\n%s", self.config.table, sql) 135 | await self.config.cursor.execute(sql) 136 | await self.config.connection.commit() 137 | logging.info("table created") 138 | 139 | async def perform_write(self, responses): 140 | sql = "REPLACE INTO %s VALUES " % (self.config.table, ) 141 | normal_sql = False 142 | sql_without_auto_increment_keys = list() 143 | 144 | for each in responses: 145 | need_specific_sql = False 146 | keys = list() 147 | 148 | curr_sql = '(' 149 | for field in self.key_fields: 150 | if field in self.auto_increment_keys and field not in each: 151 | need_specific_sql = True 152 | continue 153 | val = each[field] 154 | keys.append(field) 155 | if isinstance(val, dict) or isinstance(val, list): 156 | val = json.dumps(val) 157 | if val is None: 158 | curr_sql += 'NULL,' 159 | else: 160 | curr_sql += repr(val) + "," 161 | curr_sql = curr_sql[:-1] + '),\n' 162 | if need_specific_sql: 163 | sql_keys = "(" 164 | for each_sql_key in keys: 165 | sql_keys += each_sql_key + "," 166 | sql_keys = sql_keys[:-1] + ")" 167 | sql_without_auto_increment_keys.append("REPLACE INTO %s%s VALUES " % (self.config.table, sql_keys) + curr_sql[:-2]) 168 | else: 169 | normal_sql = True 170 | sql += curr_sql 171 | sql = sql[:-2] 172 | try_time = 0 173 | while try_time < self.config.max_retry: 174 | try: 175 | ret_sql = "" 176 | if normal_sql: 177 | ret_sql += sql + ";\n" 178 | if sql_without_auto_increment_keys: 179 | ret_sql += ";\n".join(sql_without_auto_increment_keys) 180 | ret_sql += ";" 181 | await self.config.cursor.execute(ret_sql) 182 | 
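                # the batched REPLACE statements are only persisted by the
                # explicit commit that follows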
await self.config.cursor.connection.commit() 183 | return True 184 | except Exception as e: 185 | try_time += 1 186 | if try_time < self.config.max_retry: 187 | logging.error("retry: %d, %s" % (try_time, str(e))) 188 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 189 | else: 190 | logging.error("Give up MySQL writer: %s, After retry: %d times, still fail to write, " 191 | "total write %d items, total filtered: %d items, reason: %s" % 192 | (self.config.name, self.config.max_retry, self.success_count, self.total_miss_count, 193 | str(traceback.format_exc()))) 194 | return False 195 | -------------------------------------------------------------------------------- /idataapi_transform/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import argparse 4 | from .DataProcess.Config.DefaultValue import DefaultVal 5 | from .DataProcess.Config.ConfigUtil import GetterConfig 6 | from .DataProcess.Config.ConfigUtil import WriterConfig 7 | from .DataProcess.ProcessFactory import ProcessFactory 8 | 9 | 10 | class Args(object): 11 | from_choices = ["API", "ES", "CSV", "XLSX", "JSON", "REDIS", "MYSQL", "MONGO"] 12 | from_desc = "argument 'from' can only set to one of 'API', 'ES', 'CSV', 'XLSX', " \ 13 | "'JSON'(means json line by line file), 'REDIS', 'MYSQL' or 'MONGO'" 14 | 15 | to_choices = ["csv", "xlsx", "json", "txt", "es", "redis", 'mysql', 'mongo', 'kafka'] 16 | to_desc = "argument 'to' can only set to one of \"csv\", \"xlsx\", \"json\", \"txt\" \"es\", \"json\", \"redis\", \"kafka\", " \ 17 | "\"mysql\", \"mongo\", \"json\" will write 'json.dumps(item)' line by line. " \ 18 | "\"txt\" will write each item line by line, each element in each line is separated by 'space' bu default" 19 | 20 | source_desc = """ 21 | argument 'source', When argument '-from' set to 'ES', source should be 'index' When 22 | argument 'from' set tp 'API', source should be 'http://... 23 | argument 'from' set tp 'REDIS', source should be key name 24 | argument 'from' set tp 'MYSQL', source should be table name 25 | argument 'from' set to others, source should be file path 26 | """ 27 | dest_desc = "argument 'dest', filename to save result, no need for suffix, " \ 28 | "ie '/Desktop/result', default: './result'\n" \ 29 | "When argument '-to' set to 'ES', dest should be 'index'" 30 | 31 | per_limit_desc = "amount of data buffered, when buffer filled, Program will write buffered data to 'dest', default 100" 32 | max_limit_desc = "write at most 'max_limit' data to 'dest', if 'max_limit' set to 0, means no limit, default to None" 33 | retry_desc = "when fetch data failed, retry at most 'retry' time, default 3" 34 | r_encoding_desc = "encoding of input file, ignore for xlsx format, default 'utf8'" 35 | w_encoding_desc = "encoding of output file, ignore for xlsx format, default 'utf8'" 36 | 37 | filter_desc = "file contains a 'my_filter(item)' function for filter" 38 | 39 | param_file_desc = """When you have many item save in id.json, --param_file './id.json::id::pid' means open './id.json 40 | ', read each json object line by line, use each_json['id'] as the parameter 'pid' and add it to the tail part of 41 | 'source'. 
--param_file can be either "filename.json::json_param::request_param" or "filename.txt::request_param" 42 | """ 43 | 44 | expand_desc = """If your item is {"a": {"b": "c"}, "b": "d"}, --expand 1 will make your item become 45 | {"a_b": "c", "b": "d"}, --expand N means expand at most N level deep of your object, --expand -1 means expand all 46 | level -- expand 0 means no expand of your item. Default 0. 47 | """ 48 | qsn_desc = """quote scientific notation, ie: 4324234234234234123123 will become 4.32423423423423E+021 in normal csv, 49 | If quote like '4324234234234234123123', it won't become scientific notation, Only work for output format 'csv' 50 | --qsn True means quote scientific notation, --qsn False means not quote scientific notation""" 51 | 52 | query_body_desc = """ElasticSearch query body, size has same function as "--limit", i.e: 53 | body = { 54 | "size": 100, 55 | "_source": { 56 | "includes": ["location", "title", "city", "id"] 57 | }, 58 | "query": { 59 | "bool": { 60 | "must": [ 61 | { 62 | "term": {"appCode": {"value": "ctrip"}} 63 | } 64 | ] 65 | } 66 | } 67 | } 68 | """ 69 | 70 | write_mode_desc = """'w' or 'a+'""" 71 | key_type_desc = """redis data type to operate, options: [LIST] or [HASH], default: [LIST]""" 72 | quote_char_desc = """csv only, default quote char is '"'""" 73 | 74 | getter_config_map = { 75 | Args.from_choices[0]: GetterConfig.RAPIConfig, 76 | Args.from_choices[1]: GetterConfig.RESConfig, 77 | Args.from_choices[2]: GetterConfig.RCSVConfig, 78 | Args.from_choices[3]: GetterConfig.RXLSXConfig, 79 | Args.from_choices[4]: GetterConfig.RJsonConfig, 80 | Args.from_choices[5]: GetterConfig.RRedisConfig, 81 | Args.from_choices[6]: GetterConfig.RMySQLConfig, 82 | Args.from_choices[7]: GetterConfig.RMongoConfig 83 | } 84 | 85 | writer_config_map = { 86 | Args.to_choices[0]: WriterConfig.WCSVConfig, 87 | Args.to_choices[1]: WriterConfig.WXLSXConfig, 88 | Args.to_choices[2]: WriterConfig.WJsonConfig, 89 | Args.to_choices[3]: WriterConfig.WJsonConfig, 90 | Args.to_choices[4]: WriterConfig.WESConfig, 91 | Args.to_choices[5]: WriterConfig.WRedisConfig, 92 | Args.to_choices[6]: WriterConfig.WMySQLConfig, 93 | Args.to_choices[7]: WriterConfig.WMongoConfig 94 | } 95 | 96 | 97 | def get_arg(): 98 | parser = argparse.ArgumentParser(prog="idataapi_transform", 99 | description='convert data from a format to another format, ' 100 | 'read/write from file or database, suitable for iDataAPI') 101 | parser.add_argument("from", choices=Args.from_choices, help=Args.from_desc, type=str.upper) 102 | parser.add_argument("to", choices=Args.to_choices, help=Args.to_desc, type=str.lower) 103 | 104 | parser.add_argument("source", help=Args.source_desc) 105 | 106 | parser.add_argument("dest", help=Args.dest_desc, default=DefaultVal.dest, nargs="?") 107 | parser.add_argument("--per_limit", default=DefaultVal.per_limit, type=int, help=Args.per_limit_desc) 108 | parser.add_argument("--max_limit", default=DefaultVal.max_limit, type=int, help=Args.max_limit_desc) 109 | parser.add_argument("--max_retry", default=DefaultVal.max_retry, type=int, help=Args.retry_desc) 110 | parser.add_argument("--r_encoding", default=DefaultVal.default_encoding, help=Args.r_encoding_desc) 111 | parser.add_argument("--w_encoding", default=DefaultVal.default_encoding, help=Args.w_encoding_desc) 112 | parser.add_argument("--filter", default=None, help=Args.filter_desc) 113 | parser.add_argument("--expand", default=None, type=int, help=Args.expand_desc) 114 | parser.add_argument("--qsn", default=None, type=bool, 
help=Args.qsn_desc) 115 | parser.add_argument("--query_body", default=DefaultVal.query_body, type=str, help=Args.query_body_desc) 116 | parser.add_argument("--write_mode", default=DefaultVal.default_file_mode_w, type=str, help=Args.write_mode_desc) 117 | parser.add_argument("--key_type", default=DefaultVal.default_key_type, type=str.upper, help=Args.key_type_desc) 118 | parser.add_argument("--quotechar", default=DefaultVal.default_quote_char, type=str, help=Args.quote_char_desc) 119 | return parser.parse_args() 120 | 121 | 122 | def get_filter(filter_file): 123 | if not filter_file: 124 | return None 125 | with open(filter_file, "r") as f: 126 | exec(f.read()) 127 | func = locals()["my_filter"] 128 | return func 129 | 130 | 131 | async def getter_to_writer(getter, writer): 132 | with writer as safe_writer: 133 | async for items in getter: 134 | if asyncio.iscoroutinefunction(safe_writer.write): 135 | await safe_writer.write(items) 136 | else: 137 | safe_writer.write(items) 138 | 139 | 140 | def clean(): 141 | from idataapi_transform.DataProcess.Config.ESConfig import global_client 142 | if global_client is not None: 143 | loop = asyncio.get_event_loop() 144 | loop.run_until_complete(global_client.close()) 145 | 146 | 147 | def main(): 148 | args = get_arg() 149 | from_ = getattr(args, "from") 150 | 151 | from_args = list() 152 | from_kwargs = dict() 153 | to_args = list() 154 | to_kwargs = dict() 155 | 156 | if from_ != Args.from_choices[0]: # not api 157 | from_args.extend(args.source.split(":")) 158 | else: 159 | from_args.extend([args.source]) 160 | 161 | from_kwargs["encoding"] = args.r_encoding 162 | from_kwargs["key_type"] = args.key_type 163 | if args.query_body: 164 | try: 165 | from_kwargs["query_body"] = json.loads(args.query_body) 166 | except Exception as e: 167 | raise SyntaxError("--query_body must be json serialized") 168 | 169 | for key in ("per_limit", "max_limit", "max_retry"): 170 | from_kwargs[key] = getattr(args, key) 171 | 172 | to_kwargs["filter_"] = get_filter(args.filter) 173 | to_kwargs["encoding"] = args.w_encoding 174 | to_kwargs["mode"] = args.write_mode 175 | to_kwargs["key_type"] = args.key_type 176 | for key in ("max_retry", "expand", "qsn", "quotechar"): 177 | to_kwargs[key] = getattr(args, key) 178 | 179 | if from_ not in getter_config_map: 180 | raise ValueError("argument from must be in %s" % (str(Args.from_choices), )) 181 | getter_config = getter_config_map[from_](*from_args, **from_kwargs) 182 | getter = ProcessFactory.create_getter(getter_config) 183 | 184 | if args.to == Args.to_choices[4]: 185 | # es 186 | to_args.extend(args.dest.split(":")) 187 | elif args.to in Args.to_choices[5:]: 188 | # redis, mysql, mongo 189 | if args.dest == DefaultVal.dest: 190 | to_args.append(DefaultVal.dest_without_path) 191 | else: 192 | to_args.append(args.dest) 193 | else: 194 | dest = args.dest + "." 
+ args.to 195 | to_args.append(dest) 196 | 197 | writer_config = writer_config_map[args.to](*to_args, **to_kwargs) 198 | writer = ProcessFactory.create_writer(writer_config) 199 | loop = asyncio.get_event_loop() 200 | loop.run_until_complete(getter_to_writer(getter, writer)) 201 | # close 202 | clean() 203 | 204 | 205 | if __name__ == "__main__": 206 | main() 207 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/DataGetter/APIGetter.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import hashlib 4 | import random 5 | import logging 6 | import asyncio 7 | import inspect 8 | import traceback 9 | from .BaseGetter import BaseGetter 10 | from ..Config.ConfigUtil.GetterConfig import RAPIConfig 11 | from ..Config.ConfigUtil.AsyncHelper import AsyncGenerator 12 | from ..PersistentUtil.PersistentWriter import PersistentWriter 13 | 14 | headers = { 15 | "Accept-Encoding": "gzip", 16 | # "Connection": "close" 17 | } 18 | 19 | post_headers = { 20 | "Accept-Encoding": "gzip", 21 | "Content-Type": "application/x-www-form-urlencoded" 22 | } 23 | 24 | 25 | class SourceObject(object): 26 | def __init__(self, response, tag, source, error_url, post_body): 27 | """ 28 | When error occur 29 | :param response: error response body 30 | :param tag: tag user pass in 31 | :param source: source url user pass in 32 | :param error_url: current url elicit error 33 | :param post_body: HTTP post body 34 | """ 35 | self.response = response 36 | self.tag = tag 37 | self.source = source 38 | self.error_url = error_url 39 | self.post_body = post_body 40 | 41 | 42 | class APIGetter(BaseGetter): 43 | def __init__(self, config): 44 | super().__init__() 45 | self.config = config 46 | self.base_url = self.config.source 47 | self.retry_count = 0 48 | self.responses = list() 49 | self.bad_responses = list() 50 | self.done = False 51 | self.page_token = "" 52 | self.miss_count = 0 53 | self.total_count = 0 54 | self.call_back = self.async_call_back = None 55 | if self.config.call_back is not None: 56 | if inspect.iscoroutinefunction(self.config.call_back): 57 | self.async_call_back = self.config.call_back 58 | else: 59 | self.call_back = self.config.call_back 60 | self.request_time = 0 61 | self.method = "POST" if self.config.post_body else "GET" 62 | self.give_up = False 63 | self.need_keep_fields = None 64 | self.origin_filter = None 65 | if self.config.http_headers: 66 | self.headers = self.config.http_headers 67 | elif self.config.post_body: 68 | self.headers = post_headers 69 | else: 70 | self.headers = headers 71 | 72 | def init_val(self): 73 | self.base_url = self.config.source 74 | self.retry_count = 0 75 | self.responses = list() 76 | self.bad_responses = list() 77 | self.done = False 78 | self.page_token = "" 79 | self.miss_count = 0 80 | self.total_count = 0 81 | self.call_back = self.async_call_back = None 82 | self.request_time = 0 83 | self.config.persistent_writer = None 84 | self.give_up = False 85 | self.need_keep_fields = None 86 | self.origin_filter = None 87 | 88 | def generate_sub_func(self): 89 | def sub_func(match): 90 | 91 | return match.group(1) + self.page_token + match.group(3) 92 | return sub_func 93 | 94 | def update_base_url(self, key="pageToken"): 95 | if self.base_url[-1] == "/": 96 | self.base_url = self.base_url[:-1] 97 | elif self.base_url[-1] == "?": 98 | self.base_url = self.base_url[:-1] 99 | 100 | key += "=" 101 | if key not in self.base_url: 102 | if "?" 
not in self.base_url: 103 | self.base_url = self.base_url + "?" + key + self.page_token 104 | else: 105 | self.base_url = self.base_url + "&" + key + self.page_token 106 | else: 107 | self.base_url = re.sub("(" + key + ")(.+?)($|&)", self.generate_sub_func(), self.base_url) 108 | 109 | def generate_new_filter(self, json_result): 110 | def next_filter(item): 111 | for each_key in self.need_keep_fields: 112 | if each_key not in json_result: 113 | logging.error("keep_other_field set to True, but key: %s not found in curr_page: %s" % (each_key, self.base_url)) 114 | return item 115 | item[each_key] = json_result[each_key] 116 | return item 117 | 118 | def combine(item): 119 | result = self.origin_filter(item) if self.origin_filter else item 120 | if result is not None: 121 | return next_filter(result) 122 | 123 | if self.need_keep_fields is None: 124 | # first time to generate field map 125 | self.need_keep_fields = dict() 126 | for key in self.config.keep_fields: 127 | if key not in json_result: 128 | logging.error("key: %s not in page response, not going to add this filed in the following result" % (key, )) 129 | continue 130 | self.need_keep_fields[key] = json_result[key] 131 | self.origin_filter = self.config.filter 132 | 133 | if not self.need_keep_fields or not self.config.keep_fields: 134 | return 135 | self.config.filter = combine 136 | 137 | def __aiter__(self): 138 | return self 139 | 140 | async def __anext__(self): 141 | if self.done: 142 | logging.info("get source done: %s, total get %d items, total filtered: %d items" % 143 | (self.config.source, self.total_count, self.miss_count)) 144 | if self.config.persistent_writer and (not self.give_up or self.config.persistent_to_disk_if_give_up): 145 | self.config.persistent_writer.add(self.config.source) 146 | self.init_val() 147 | raise StopAsyncIteration 148 | 149 | while True: 150 | result = None # for SourceObject 151 | try: 152 | if self.config.debug_mode: 153 | log_str = "HTTP method: %s, url: %s" % (self.method, self.base_url) 154 | logging.info(log_str) 155 | resp = await self.config.session._request(self.method, self.base_url, headers=self.headers, data=self.config.post_body, timeout=self.config.http_timeout) 156 | text = await resp.text() 157 | # print(text) 158 | result = json.loads(text) 159 | if "data" not in result: 160 | if "retcode" not in result or result["retcode"] not in self.config.success_ret_code: 161 | raise ValueError("Bad retcode: %s" % (str(result["retcode"]) if "retcode" in result else str(result), )) 162 | if self.config.keep_other_fields: 163 | self.generate_new_filter(result) 164 | 165 | except Exception as e: 166 | self.retry_count += 1 167 | logging.error("retry: %d, %s: %s" % (self.retry_count, str(e), self.base_url)) 168 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 169 | if self.retry_count < self.config.max_retry: 170 | continue 171 | else: 172 | # fail 173 | logging.error("Give up, After retry: %d times, Unable to get url: %s, total get %d items, " 174 | "total filtered: %d items, error: %s" % (self.config.max_retry, self.base_url, 175 | self.total_count, self.miss_count, 176 | str(traceback.format_exc()) if "Bad retcode" not in str(e) else str(e))) 177 | self.done = self.give_up = True 178 | if self.config.return_fail: 179 | self.bad_responses.append(SourceObject(result, self.config.tag, self.config.source, self.base_url, self.config.post_body)) 180 | return await self.clear_and_return() 181 | elif self.responses: 182 | return await 
self.clear_and_return() 183 | else: 184 | return await self.__anext__() 185 | 186 | self.request_time += 1 187 | if "data" in result: 188 | # success 189 | self.retry_count = 0 190 | origin_length = len(result["data"]) 191 | 192 | if self.config.filter: 193 | curr_response = [self.config.filter(i) for i in result["data"]] 194 | curr_response = [i for i in curr_response if i] 195 | self.miss_count += origin_length - len(curr_response) 196 | else: 197 | curr_response = result["data"] 198 | self.total_count += origin_length if self.config.exclude_filtered_to_max_limit else len(curr_response) 199 | self.responses.extend(curr_response) 200 | # trim_to_max_limit 201 | if self.config.trim_to_max_limit and self.config.max_limit and self.total_count > self.config.max_limit: 202 | need_trim_items = self.total_count - self.config.max_limit 203 | self.responses = self.responses[:-need_trim_items] 204 | logging.info("trim %d items to fit max_limit: %d" % (need_trim_items, self.config.max_limit)) 205 | self.total_count -= need_trim_items 206 | # check if done 207 | if self.config.done_if is not None and self.config.done_if(curr_response): 208 | self.done = True 209 | return await self.clear_and_return() 210 | 211 | # get next page if success, retry if fail 212 | if "pageToken" in result: 213 | if not result["pageToken"]: 214 | self.done = True 215 | if self.need_return(): 216 | return await self.clear_and_return() 217 | 218 | self.page_token = str(result["pageToken"]) 219 | self.update_base_url() 220 | 221 | elif "retcode" in result and result["retcode"] in self.config.success_ret_code: 222 | self.done = True 223 | if self.need_return(): 224 | return await self.clear_and_return() 225 | return await self.__anext__() 226 | else: 227 | self.retry_count += 1 228 | if self.retry_count >= self.config.max_retry: 229 | logging.error("Give up, After retry: %d times, Unable to get url: %s, total get %d items, " 230 | "total filtered: %d items" % (self.config.max_retry, self.base_url, 231 | self.total_count, self.miss_count)) 232 | self.done = self.give_up = True 233 | if self.need_return(): 234 | return await self.clear_and_return() 235 | 236 | await asyncio.sleep(random.uniform(self.config.random_min_sleep, self.config.random_max_sleep)) 237 | return await self.__anext__() 238 | 239 | if self.config.max_limit and self.total_count >= self.config.max_limit: 240 | self.done = True 241 | return await self.clear_and_return() 242 | elif len(self.responses) >= self.config.per_limit: 243 | return await self.clear_and_return() 244 | elif self.done: 245 | # buffer has empty data, and done fetching 246 | return await self.__anext__() 247 | 248 | if self.request_time % self.config.report_interval == 0: 249 | logging.info("After request %d pages, current item count(%d) < per_limit(%d), latest request page: %s" % 250 | (self.request_time, len(self.responses), self.config.per_limit, self.base_url)) 251 | 252 | def __iter__(self): 253 | raise ValueError("APIGetter must be used with async generator, not normal generator") 254 | 255 | async def clear_and_return(self): 256 | self.request_time = 0 257 | if self.config.return_fail: 258 | resp, bad_resp = self.responses, self.bad_responses 259 | self.responses, self.bad_responses = list(), list() 260 | if self.call_back is not None: 261 | r = self.call_back(resp, bad_resp) 262 | if inspect.iscoroutine(r): 263 | # bind function for coroutine 264 | self.async_call_back = self.call_back 265 | self.call_back = None 266 | return await r 267 | return r 268 | elif self.async_call_back is 
not None: 269 | return await self.async_call_back(resp, bad_resp) 270 | else: 271 | return resp, bad_resp 272 | else: 273 | resp = self.responses 274 | self.responses = list() 275 | if self.call_back is not None: 276 | r = self.call_back(resp) 277 | if inspect.iscoroutine(r): 278 | # bind function for coroutine 279 | self.async_call_back = self.call_back 280 | self.call_back = None 281 | return await r 282 | return r 283 | elif self.async_call_back is not None: 284 | return await self.async_call_back(resp) 285 | else: 286 | return resp 287 | 288 | def need_return(self): 289 | return self.responses or (self.config.return_fail and (self.responses or self.bad_responses)) 290 | 291 | 292 | class APIBulkGetter(BaseGetter): 293 | def __init__(self, config): 294 | super().__init__() 295 | self.config = config 296 | self.async_api_configs = AsyncGenerator(self.config.sources, self.to_config) 297 | 298 | self.pending_tasks = list() 299 | self.buffers = list() 300 | self.bad_buffers = list() 301 | self.success_task = 0 302 | self.curr_size = 0 303 | self.curr_bad_size = 0 304 | self.persistent_writer = None 305 | self.skip_num = 0 306 | 307 | def to_config(self, item): 308 | if isinstance(item, RAPIConfig): 309 | r = item 310 | else: 311 | r = RAPIConfig(item, session=self.config.session, filter_=self.config.filter, 312 | return_fail=self.config.return_fail, done_if=self.config.done_if, 313 | trim_to_max_limit=self.config.trim_to_max_limit, 314 | exclude_filtered_to_max_limit=self.config.exclude_filtered_to_max_limit, 315 | persistent_to_disk_if_give_up=self.config.persistent_to_disk_if_give_up, 316 | debug_mode=self.config.debug_mode, http_headers=self.config.http_headers) 317 | # persistent 318 | if self.config.persistent: 319 | if not self.config.persistent_key: 320 | self.config.persistent_key = hashlib.md5(r.source.encode("utf8")).hexdigest() 321 | if self.persistent_writer is None: 322 | self.persistent_writer = PersistentWriter(self.config.persistent_key) 323 | r.persistent_writer = self.persistent_writer 324 | return r 325 | 326 | async def fetch_items(self, api_config): 327 | if api_config.return_fail: 328 | async for items, bad_items in APIGetter(api_config): 329 | if self.config.return_fail: 330 | self.bad_buffers.extend(bad_items) 331 | self.buffers.extend(items) 332 | else: 333 | async for items in APIGetter(api_config): 334 | self.buffers.extend(items) 335 | 336 | async def fill_tasks(self): 337 | if len(self.pending_tasks) >= self.config.concurrency: 338 | return 339 | 340 | async for api_config in self.async_api_configs: 341 | # skip already done task 342 | if self.config.persistent: 343 | if api_config.source in self.persistent_writer: 344 | self.skip_num += 1 345 | continue 346 | self.pending_tasks.append(self.fetch_items(api_config)) 347 | if len(self.pending_tasks) >= self.config.concurrency: 348 | self.persistent() 349 | return 350 | 351 | self.persistent() 352 | 353 | def __aiter__(self): 354 | return self 355 | 356 | async def __anext__(self): 357 | await self.fill_tasks() 358 | while self.pending_tasks: 359 | done, pending = await asyncio.wait(self.pending_tasks, timeout=self.config.interval) 360 | self.pending_tasks = list(pending) 361 | self.success_task += len(done) 362 | if self.buffers or (self.config.return_fail and (self.buffers or self.bad_buffers)): 363 | return self.clear_and_return() 364 | else: 365 | # after interval seconds, no item fetched 366 | await self.fill_tasks() 367 | log_str = "After %.2f seconds, no new item fetched, current done task: %d, pending 
tasks: %d" % (float(self.config.interval), self.success_task, len(self.pending_tasks)) 368 | if self.config.persistent: 369 | log_str += ", skip %d already finished tasks with persistent mode on" % (self.skip_num, ) 370 | logging.info(log_str) 371 | continue 372 | 373 | ret_log = "APIBulkGetter Done, total perform: %d tasks, fetch: %d items" % (self.success_task, self.curr_size) 374 | if self.config.return_fail: 375 | ret_log += ", fail: %d items" % (self.curr_bad_size, ) 376 | if self.config.persistent: 377 | ret_log += ", skip %d already finished tasks with persistent mode on" % (self.skip_num,) 378 | logging.info(ret_log) 379 | if self.config.persistent: 380 | self.persistent_writer.clear(self.config.persistent_start_fresh_if_done) 381 | raise StopAsyncIteration 382 | 383 | def __iter__(self): 384 | raise ValueError("APIBulkGetter must be used with async generator, not normal generator") 385 | 386 | def clear_and_return(self): 387 | if self.config.return_fail: 388 | buffers, bad_buffers = self.buffers, self.bad_buffers 389 | self.curr_size += len(self.buffers) 390 | self.curr_bad_size += len(self.bad_buffers) 391 | self.buffers, self.bad_buffers = list(), list() 392 | return buffers, bad_buffers 393 | else: 394 | buffers = self.buffers 395 | self.curr_size += len(self.buffers) 396 | self.buffers = list() 397 | return buffers 398 | 399 | def persistent(self): 400 | # persistent task to file 401 | if self.config.persistent: 402 | self.persistent_writer.write() 403 | # logging.info("persistent mode on, after sync, totally skip %d already finished tasks" % (self.skip_num,)) 404 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConfigUtil/WriterConfig.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aioredis 3 | import inspect 4 | 5 | try: 6 | import aiomysql 7 | except Exception as e: 8 | pass 9 | 10 | try: 11 | import motor.motor_asyncio 12 | except Exception as e: 13 | pass 14 | 15 | try: 16 | import confluent_kafka 17 | except Exception: 18 | pass 19 | 20 | 21 | from .BaseConfig import BaseWriterConfig 22 | from ..ESConfig import get_es_client 23 | from ..DefaultValue import DefaultVal 24 | 25 | 26 | class WCSVConfig(BaseWriterConfig): 27 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, encoding=DefaultVal.default_encoding, 28 | headers=None, filter_=None, expand=None, qsn=DefaultVal.qsn, 29 | quotechar=DefaultVal.default_quote_char, **kwargs): 30 | """ 31 | :param filename: filename to write 32 | :param mode: file open mode, i.e "w" or "a+" 33 | :param encoding: file encoding i.e "utf8" 34 | :param headers: csv headers in first row, if not set, automatically extract in first bulk of items 35 | :param filter_: run "transform --help" to see command line interface explanation for detail 36 | :param expand: run "transform --help" to see command line interface explanation for detail 37 | :param qsn: run "transform --help" to see command line interface explanation for detail 38 | :param quotechar: run "transform --help" to see command line interface explanation for detail 39 | :param kwargs: 40 | 41 | Example: 42 | ... 
43 | csv_config = WCSVConfig("./result.csv", encoding="utf8", headers=["likeCount", "id", "title"]) 44 | with ProcessFactory.create_writer(csv_config) as csv_writer: 45 | async for items in es_getter: 46 | # do whatever you want with items 47 | csv_writer.write(items) 48 | """ 49 | super().__init__() 50 | self.filename = filename 51 | self.encoding = encoding 52 | self.mode = mode 53 | self.headers = headers 54 | self.filter = filter_ 55 | self.expand = expand 56 | self.qsn = qsn 57 | self.quotechar = quotechar 58 | 59 | 60 | class WESConfig(BaseWriterConfig): 61 | def __init__(self, indices, doc_type=None, filter_=None, expand=None, id_hash_func=DefaultVal.default_id_hash_func, 62 | appCode=None, actions=None, createDate=None, error_if_fail=True, timeout=None, max_retry=None, 63 | random_min_sleep=None, random_max_sleep=None, auto_insert_createDate=True, hosts=None, headers=None, 64 | **kwargs): 65 | """ 66 | :param indices: elasticsearch indices 67 | :param doc_type: elasticsearch doc_type 68 | :param filter_: run "transform --help" to see command line interface explanation for detail 69 | :param expand: run "transform --help" to see command line interface explanation for detail 70 | :param id_hash_func: function to generate id_ for each item 71 | :param appCode: if not None, add appCode to each item before write to es 72 | :param actions: if not None, will set actions to user define actions, else default actions is 'index' 73 | :param createDate: if not None, add createDate to each item before write to es 74 | :param error_if_fail: if True, log to error if fail to insert to es, else log nothing 75 | :param timeout: http connection timeout when connect to es, seconds 76 | :param max_retry: if request fail, retry max_retry times 77 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 78 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 79 | :param auto_insert_createDate: whether insert createDate for each item automatic -> boolean 80 | :param hosts: elasticsearch hosts, list type, i.e: ["localhost:8888", "127.0.0.2:8889"] 81 | :param headers: headers when perform http requests to elasticsearch, dict type, i.e: {"Host": "aaa", "apikey": "bbb"} 82 | :param kwargs: 83 | 84 | Example: 85 | ... 
86 | es_config = WESConfig("post20170630", "news") 87 | with ProcessFactory.create_writer(es_config) as es_writer: 88 | # asyncio function must call with await 89 | await es_writer.write(items) 90 | """ 91 | super().__init__() 92 | 93 | if not random_min_sleep: 94 | random_min_sleep = DefaultVal.random_min_sleep 95 | if not random_max_sleep: 96 | random_max_sleep = DefaultVal.random_max_sleep 97 | if not max_retry: 98 | max_retry = DefaultVal.max_retry 99 | 100 | if not DefaultVal.main_config.has_es_configured: 101 | raise ValueError("You must config es_hosts before using Elasticsearch, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 102 | 103 | self.indices = indices 104 | self.doc_type = doc_type 105 | self.filter = filter_ 106 | self.expand = expand 107 | self.id_hash_func = id_hash_func 108 | self.es_client = get_es_client(hosts=hosts, headers=headers) 109 | self.app_code = appCode 110 | self.actions = actions 111 | self.create_date = createDate 112 | self.error_if_fail = error_if_fail 113 | self.timeout = timeout 114 | self.max_retry = max_retry 115 | self.random_min_sleep = random_min_sleep 116 | self.random_max_sleep = random_max_sleep 117 | self.auto_insert_createDate = auto_insert_createDate 118 | 119 | 120 | class WJsonConfig(BaseWriterConfig): 121 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, encoding=DefaultVal.default_encoding, 122 | expand=None, filter_=None, new_line=DefaultVal.new_line, **kwargs): 123 | """ 124 | :param filename: filename to write 125 | :param mode: file open mode, i.e "w" or "a+" 126 | :param encoding: file encoding i.e "utf8" 127 | :param expand: run "transform --help" to see command line interface explanation for detail 128 | :param filter_: run "transform --help" to see command line interface explanation for detail 129 | :param new_line: new_line separator for each item, default is "\n" 130 | :param kwargs: 131 | 132 | Example: 133 | ... 134 | json_config = WJsonConfig("./result.json") 135 | with ProcessFactory.create_writer(json_config) as json_writer: 136 | async for items in es_getter: 137 | json_writer.write(items) 138 | """ 139 | super().__init__() 140 | self.filename = filename 141 | self.mode = mode 142 | self.encoding = encoding 143 | self.expand = expand 144 | self.filter = filter_ 145 | self.new_line = new_line 146 | 147 | 148 | class WTXTConfig(BaseWriterConfig): 149 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, encoding=DefaultVal.default_encoding, 150 | expand=None, filter_=None, new_line=DefaultVal.new_line, join_val=DefaultVal.join_val, **kwargs): 151 | """ 152 | :param filename: filename to write 153 | :param mode: file open mode, i.e "w" or "a+" 154 | :param encoding: file encoding i.e "utf8" 155 | :param expand: run "transform --help" to see command line interface explanation for detail 156 | :param filter_: run "transform --help" to see command line interface explanation for detail 157 | :param new_line: new_line separator for each item, default is "\n" 158 | :param join_val: space separator for each key in each item, default is " " 159 | :param kwargs: 160 | 161 | Example: 162 | ...
163 | txt_config = WTXTConfig("./result.txt") 164 | with ProcessFactory.create_writer(txt_config) as txt_writer: 165 | async for items in es_getter: 166 | txt_writer.write(items) 167 | """ 168 | super().__init__() 169 | self.filename = filename 170 | self.mode = mode 171 | self.encoding = encoding 172 | self.expand = expand 173 | self.filter = filter_ 174 | self.new_line = new_line 175 | self.join_val = join_val 176 | 177 | 178 | class WXLSXConfig(BaseWriterConfig): 179 | def __init__(self, filename, mode=DefaultVal.default_file_mode_w, title=DefaultVal.title, expand=None, filter_=None, headers=None, sheet_index=0, **kwargs): 180 | """ 181 | :param filename: filename to write 182 | :param mode: file open mode, i.e "w" or "a+" 183 | :param title: sheet title 184 | :param expand: run "transform --help" to see command line interface explanation for detail 185 | :param filter_: run "transform --help" to see command line interface explanation for detail 186 | :param headers: xlsx headers in first row, if not set, automatically extract in first bulk of items 187 | :param sheet_index: which sheet to get, 0 means 0th sheet, only work for append mode 188 | :param kwargs: 189 | 190 | Example: 191 | ... 192 | xlsx_config = WXLSXConfig("./result.xlsx") 193 | with ProcessFactory.create_writer(xlsx_config) as xlsx_writer: 194 | async for items in es_getter: 195 | xlsx_writer.write(items) 196 | """ 197 | super().__init__() 198 | self.filename = filename 199 | self.mode = mode 200 | self.title = title 201 | self.expand = expand 202 | self.filter = filter_ 203 | self.headers = headers 204 | self.sheet_index = sheet_index 205 | 206 | 207 | class WRedisConfig(BaseWriterConfig): 208 | def __init__(self, key, key_type="LIST", filter_=None, host=None, port=None, db=None, password=None, timeout=None, 209 | encoding=None, direction=None, max_retry=None, random_min_sleep=None, random_max_sleep=None, 210 | compress=None, **kwargs): 211 | """ 212 | :param key: redis key to write data 213 | :param key_type: redis data type to operate, current only support LIST, HASH 214 | :param filter_: run "transform --help" to see command line interface explanation for detail 215 | :param host: redis host -> str 216 | :param port: redis port -> int 217 | :param db: redis database number -> int 218 | :param password: redis password -> int 219 | :param timeout: timeout per redis connection -> float 220 | :param encoding: redis object encoding -> str 221 | :param direction: "L" or "R", lpush or rpush 222 | :param compress: whether compress data use zlib before write to redis -> boolean 223 | :param kwargs: 224 | 225 | Example: 226 | redis_config = WRedisConfig("my_key") 227 | with ProcessFactory.create_writer(redis_config) as redis_writer: 228 | async for items in es_getter: 229 | await redis_writer.write(items) 230 | """ 231 | super().__init__() 232 | # load default value 233 | if not random_min_sleep: 234 | random_min_sleep = DefaultVal.random_min_sleep 235 | if not random_max_sleep: 236 | random_max_sleep = DefaultVal.random_max_sleep 237 | if not max_retry: 238 | max_retry = DefaultVal.max_retry 239 | if host is None: 240 | host = DefaultVal.redis_host 241 | if port is None: 242 | port = DefaultVal.redis_port 243 | if db is None: 244 | db = DefaultVal.redis_db 245 | if password is None: 246 | password = DefaultVal.redis_password 247 | if timeout is None: 248 | timeout = DefaultVal.redis_timeout 249 | if encoding is None: 250 | encoding = DefaultVal.redis_encoding 251 | if direction is None: 252 | direction = 
DefaultVal.redis_direction 253 | if compress is None: 254 | compress = DefaultVal.redis_compress 255 | 256 | # check value 257 | if not DefaultVal.main_config.has_redis_configured and port <= 0: 258 | raise ValueError("You must config redis before using Redis, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 259 | if key_type not in ("LIST", "HASH"): 260 | raise ValueError("key_type must be one of (%s)" % (str(("LIST", )), )) 261 | if not encoding: 262 | raise ValueError("You must specific encoding, since I am going to load each object in json format, " 263 | "and treat it as dictionary in python") 264 | if not password: 265 | password = None 266 | 267 | self.redis_pool_cli = None 268 | self.key = key 269 | self.host = host 270 | self.port = port 271 | self.db = db 272 | self.password = password 273 | self.encoding = encoding 274 | self.timeout = timeout 275 | 276 | self.key_type = key_type 277 | self.filter = filter_ 278 | 279 | self.name = "%s_%s->%s" % (str(host), str(port), str(key)) 280 | 281 | self.redis_write_method = None 282 | self.direction = direction 283 | self.max_retry = max_retry 284 | self.random_min_sleep = random_min_sleep 285 | self.random_max_sleep = random_max_sleep 286 | self.compress = compress 287 | 288 | if key_type == "LIST": 289 | self.is_range = True 290 | else: 291 | self.is_range = False 292 | 293 | async def get_redis_pool_cli(self): 294 | """ 295 | :return: an async redis client 296 | """ 297 | if self.redis_pool_cli is None: 298 | kwargs = { 299 | "db": int(self.db), 300 | "password": self.password, 301 | "encoding": self.encoding, 302 | "timeout": self.timeout, 303 | "minsize": 1, 304 | "maxsize": 3 305 | } 306 | if self.compress: 307 | del kwargs["encoding"] 308 | self.redis_pool_cli = await aioredis.create_redis_pool((self.host, self.port), **kwargs) 309 | if self.key_type == "LIST": 310 | if self.direction == "L": 311 | self.redis_write_method = self.redis_pool_cli.lpush 312 | else: 313 | self.redis_write_method = self.redis_pool_cli.rpush 314 | else: 315 | self.redis_write_method = self.redis_pool_cli.hset 316 | 317 | return self.redis_pool_cli 318 | 319 | 320 | class WMySQLConfig(BaseWriterConfig): 321 | def __init__(self, table, filter_=None, max_retry=None, random_min_sleep=None, random_max_sleep=None, 322 | host=None, port=None, user=None, password=None, database=None, charset=None, loop=None, **kwargs): 323 | """ 324 | :param table: mysql table 325 | :param filter_: run "transform --help" to see command line interface explanation for detail 326 | :param max_retry: if request fail, retry max_retry times 327 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 328 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 329 | :param host: mysql host -> str 330 | :param port: mysql port -> int 331 | :param user: mysql user -> str 332 | :param password: mysql password -> str 333 | :param database: mysql database -> str 334 | :param charset: default utf8 -> str 335 | :param loop: async loop instance 336 | :param kwargs: 337 | 338 | Example: 339 | mysql_config = WMySQLConfig("my_table") 340 | mysql_writer = ProcessFactory.create_writer(mysql_config) 341 | async for items in redis_getter: 342 | await mysql_writer.write(items) 343 | """ 344 | super().__init__() 345 | if not random_min_sleep: 346 | random_min_sleep = DefaultVal.random_min_sleep 347 | if not random_max_sleep: 348 | random_max_sleep = 
DefaultVal.random_max_sleep 349 | if not max_retry: 350 | max_retry = DefaultVal.max_retry 351 | if not host: 352 | host = DefaultVal.mysql_host 353 | if not port: 354 | port = DefaultVal.mysql_port 355 | if not user: 356 | user = DefaultVal.mysql_user 357 | if not password: 358 | password = DefaultVal.mysql_password 359 | if not database: 360 | database = DefaultVal.mysql_database 361 | if not charset: 362 | charset = DefaultVal.mysql_encoding 363 | 364 | if not DefaultVal.main_config.has_mysql_configured and port <= 0: 365 | raise ValueError("You must config mysql before using MySQL, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 366 | if "aiomysql" not in globals(): 367 | raise ValueError("module mysql disabled, please reinstall " 368 | "requirements with python version higher than 3.5.3 to enable it") 369 | 370 | self.table = table 371 | self.database = database 372 | 373 | self.max_retry = max_retry 374 | self.random_min_sleep = random_min_sleep 375 | self.random_max_sleep = random_max_sleep 376 | self.filter = filter_ 377 | 378 | self.name = "%s->%s" % (self.database, self.table) 379 | 380 | self.host = host 381 | self.port = port 382 | self.user = user 383 | if not password: 384 | password = '' 385 | self.password = password 386 | self.database = database 387 | self.charset = charset 388 | 389 | if not loop: 390 | loop = asyncio.get_event_loop() 391 | self.loop = loop 392 | self.mysql_pool_cli = self.connection = self.cursor = None 393 | 394 | async def get_mysql_pool_cli(self): 395 | """ 396 | :return: an async mysql client 397 | """ 398 | if self.mysql_pool_cli is None: 399 | self.mysql_pool_cli = await aiomysql.create_pool(host=self.host, port=self.port, user=self.user, 400 | password=self.password, db=self.database, loop=self.loop, 401 | minsize=1, maxsize=3, charset=self.charset) 402 | self.connection = await self.mysql_pool_cli.acquire() 403 | self.cursor = await self.connection.cursor() 404 | return self.mysql_pool_cli 405 | 406 | def free_resource(self): 407 | if self.mysql_pool_cli is not None: 408 | self.mysql_pool_cli.release(self.connection) 409 | self.mysql_pool_cli.close() 410 | self.loop.create_task(self.mysql_pool_cli.wait_closed()) 411 | self.mysql_pool_cli = self.connection = self.cursor = None 412 | 413 | 414 | class WMongoConfig(BaseWriterConfig): 415 | def __init__(self, collection, id_hash_func=DefaultVal.default_id_hash_func, max_retry=None, random_min_sleep=None, 416 | random_max_sleep=None, filter_=None, protocol=None, host=None, port=None, username=None, password=None, 417 | database=None, other_params=None, auto_insert_createDate=False, createDate=None, **kwargs): 418 | """ 419 | :param collection: collection name 420 | :param id_hash_func: function to generate id_ for each item, only if "_id" not in item will I use 'id_hash_func' to generate "_id" 421 | :param return_source: if set to True, will return [item , ..., itemN], item is the "_source" object 422 | if set to False, will return whatever elasticsearch return, i.e {"hits": {"total": ...}} 423 | :param max_retry: if request fail, retry max_retry times 424 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 425 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 426 | :param filter_: run "transform --help" to see command line interface explanation for detail 427 | :param protocol: connection url protocol 428 | :param host: mongodb host -> str 429 | :param port: 
mongodb port -> int 430 | :param user: mongodb user -> str 431 | :param password: mongodb password -> str 432 | :param database: mongodb database -> str 433 | :param other_params: connection url's params after ? 434 | :param createDate: if not None, add createDate to each item before write to mongodb 435 | :param auto_insert_createDate: whether insert createDate for each item automatic -> boolean 436 | :param kwargs: 437 | 438 | Example: 439 | data = [json_obj, json_obj, json_obj] 440 | mongo_config = WMongoConfig("my_coll") 441 | async with ProcessFactory.create_writer(mongo_config) as mongo_writer: 442 | await mongo_writer.write(data) 443 | """ 444 | super().__init__() 445 | if not random_min_sleep: 446 | random_min_sleep = DefaultVal.random_min_sleep 447 | if not random_max_sleep: 448 | random_max_sleep = DefaultVal.random_max_sleep 449 | if not max_retry: 450 | max_retry = DefaultVal.max_retry 451 | if not host: 452 | host = DefaultVal.mongo_host 453 | if not port: 454 | port = DefaultVal.mongo_port 455 | if not username: 456 | username = DefaultVal.mongo_username 457 | if not password: 458 | password = DefaultVal.mongo_password 459 | if not database: 460 | database = DefaultVal.mongo_database 461 | if not protocol: 462 | protocol = DefaultVal.mongo_protocol 463 | else: 464 | raise ValueError("Must define URI Scheme in mongo") 465 | if not other_params: 466 | other_params = DefaultVal.mongo_other_params 467 | 468 | if not DefaultVal.main_config.has_mongo_configured: 469 | raise ValueError("You must config MongoDB before using MongoDB, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 470 | if "motor" not in globals(): 471 | raise ValueError("module motor disabled, please reinstall " 472 | "requirements in linux") 473 | 474 | self.collection = collection 475 | self.max_retry = max_retry 476 | self.random_min_sleep = random_min_sleep 477 | self.random_max_sleep = random_max_sleep 478 | self.filter = filter_ 479 | if "srv" in protocol: 480 | try: 481 | import dns # required for mongodb connecting with SRV 482 | except Exception: 483 | raise ValueError("can't find dnspython, install it first!") 484 | self.protocol = protocol 485 | self.host = host 486 | self.port = port 487 | self.username = username 488 | self.password = password 489 | self.database = database 490 | self.other_params = other_params 491 | self.name = "%s->%s" % (self.database, self.collection) 492 | self.id_hash_func = id_hash_func 493 | self.auto_insert_createDate = auto_insert_createDate 494 | self.createDate = createDate 495 | 496 | self.client = self.collection_cli = None 497 | 498 | def get_mongo_cli(self): 499 | if self.client is None: 500 | kwargs = { 501 | "host": self.host, 502 | "port": self.port 503 | } 504 | if self.protocol and self.username: 505 | if "srv" in self.protocol: # mongodb+srv must not include port number 506 | self.client = motor.motor_asyncio.AsyncIOMotorClient( 507 | "%s://%s:%s@%s/%s?%s" % (self.protocol, self.username, self.password, kwargs["host"], 508 | self.database, self.other_params)) 509 | else: 510 | self.client = motor.motor_asyncio.AsyncIOMotorClient( 511 | "%s://%s:%s@%s:%s/%s?%s" % (self.protocol, self.username, self.password, kwargs["host"], 512 | str(kwargs["port"]), self.database, self.other_params)) 513 | else: 514 | self.client = motor.motor_asyncio.AsyncIOMotorClient(**kwargs) 515 | self.collection_cli = self.client[self.database][self.collection] 516 | return self.client 517 | 518 | 519 | class WKafkaConfig(BaseWriterConfig): 520 | def __init__(self, 
max_retry=None, random_min_sleep=None, 521 | random_max_sleep=None, filter_=None, bootstrap_servers=None, **kwargs): 522 | """ 523 | :param max_retry: if request fail, retry max_retry times 524 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 525 | :param random_max_sleep: if request fail, random sleep at most random_max_sleep seconds before request again 526 | :param filter_: run "transform --help" to see command line interface explanation for detail 527 | :param bootstrap_servers: kafka bootstrap.servers -> str 528 | :param kwargs: 529 | 530 | Example: 531 | data = [json_obj, json_obj, json_obj] 532 | kafka_config = WKafkaConfig() 533 | async with ProcessFactory.create_writer(kafka_config) as kafka_writer: 534 | await kafka_writer.write(data) 535 | """ 536 | super().__init__() 537 | if not random_min_sleep: 538 | random_min_sleep = DefaultVal.random_min_sleep 539 | if not random_max_sleep: 540 | random_max_sleep = DefaultVal.random_max_sleep 541 | if not max_retry: 542 | max_retry = DefaultVal.max_retry 543 | if not bootstrap_servers: 544 | bootstrap_servers = DefaultVal.kafka_bootstrap_servers 545 | else: 546 | raise ValueError("Must define bootstrap.servers in kafka") 547 | 548 | if not DefaultVal.main_config.has_kafka_configured: 549 | raise ValueError("You must config kafka before using Kafka, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 550 | if "confluent_kafka" not in globals(): 551 | raise ValueError("module confluent_kafka disabled, please reinstall " 552 | "requirements in linux") 553 | 554 | self.max_retry = max_retry 555 | self.random_min_sleep = random_min_sleep 556 | self.random_max_sleep = random_max_sleep 557 | self.filter = filter_ 558 | self.bootstrap_servers = bootstrap_servers 559 | -------------------------------------------------------------------------------- /idataapi_transform/DataProcess/Config/ConfigUtil/GetterConfig.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | import inspect 4 | import aioredis 5 | 6 | try: 7 | import aiomysql 8 | except Exception as e: 9 | pass 10 | 11 | try: 12 | import motor.motor_asyncio 13 | except Exception as e: 14 | pass 15 | 16 | from aiohttp.client import sentinel 17 | from .BaseConfig import BaseGetterConfig 18 | 19 | from ..ESConfig import get_es_client 20 | from ..DefaultValue import DefaultVal 21 | from ..ConnectorConfig import session_manger 22 | 23 | 24 | class RAPIConfig(BaseGetterConfig): 25 | def __init__(self, source, per_limit=DefaultVal.per_limit, max_limit=DefaultVal.max_limit, 26 | max_retry=DefaultVal.max_retry, random_min_sleep=None, random_max_sleep=None, session=None, 27 | filter_=None, return_fail=False, tag=None, call_back=None, report_interval=10, success_ret_code=None, 28 | done_if=None, trim_to_max_limit=DefaultVal.trim_to_max_limit, 29 | exclude_filtered_to_max_limit=DefaultVal.exclude_filtered_to_max_limit, post_body=None, 30 | persistent_writer=None, persistent_to_disk_if_give_up=True, debug_mode=False, keep_other_fields=False, 31 | keep_fields=("dataType", "appCode"), http_headers=None, http_timeout=None, **kwargs): 32 | """ 33 | will request until no more next_page to get, or get "max_limit" items 34 | 35 | :param source: API to get, i.e. "http://..."
36 | :param per_limit: how many items to get per time (counter will add each item after filter) 37 | :param max_limit: get at most max_limit items, if not set, get all (counter will add each item before filter) 38 | :param max_retry: if request fail, retry max_retry times 39 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 40 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 41 | :param session: aiohttp session to perform request 42 | :param filter_: run "transform --help" to see command line interface explanation for detail 43 | :param return_fail: if set to True, for each iteration, will return a tuple, 44 | api_getter = ProcessFactory.create_getter(RAPIConfig("http://...")) 45 | async for items, bad_objects in getter: 46 | A = bad_objects[0] 47 | A.response: -> json object: '{"appCode": "weixinpro", "dataType": "post", "message": "param error", "retcode": "100005"}', if fail in request, response will be None 48 | A.tag: -> tag you pass to RAPIConfig 49 | A.source: -> source you pass to RAPIConfig 50 | A.post_body: -> http post body 51 | 52 | :param call_back: a function(can be async function) to call on results before each "async for" return 53 | :param report_interval: an integer value, if set to 5, after 5 request times, current response counter still 54 | less than 'per_limit', the "async for' won't return to user, there's going to be an INFO log to tell user what happen 55 | :param success_ret_code: ret_code indicate success, default is ("100002", "100301", "100103") ===> ("search no result", "account not found", "account processing") 56 | :param done_if: the APIGetter will automatically fetch next page until max_limit or no more page, if you provide a function, APIGetter will terminate fetching next page when done_if(items) return True 57 | :param trim_to_max_limit: set max_limit to the precise value, default max_limit is rough value 58 | :param exclude_filtered_to_max_limit: max_limit including filtered object or excluding filtered object 59 | :param post_body: POST with post_body instead of get 60 | :param persistent_writer: corporate with RAPIBulkConfig 61 | :param persistent_to_disk_if_give_up: corporate with RAPIBulkConfig, when retry to max_retry times, still fail to get result, whether regard this job as success and persistent to disk or not 62 | :param debug_mode: whether log every http request url 63 | :param keep_other_fields: keep field in "keep_fields" in each json_object 64 | :param http_headers: http_headers, dict object 65 | :param http_timeout: in seconds 66 | :param args: 67 | :param kwargs: 68 | 69 | Example: 70 | api_config = RAPIConfig("http://...") 71 | api_getter = ProcessFactory.create_getter(api_config) 72 | async for items in api_getter: 73 | print(items) 74 | """ 75 | super().__init__() 76 | if not random_min_sleep: 77 | random_min_sleep = DefaultVal.random_min_sleep 78 | if not random_max_sleep: 79 | random_max_sleep = DefaultVal.random_max_sleep 80 | if not success_ret_code: 81 | success_ret_code = DefaultVal.success_ret_code 82 | 83 | self.source = source 84 | self.per_limit = per_limit 85 | self.max_limit = max_limit 86 | self.max_retry = max_retry 87 | self.random_min_sleep = random_min_sleep 88 | self.random_max_sleep = random_max_sleep 89 | self.session = session_manger.get_session() if not session else session 90 | self.filter = filter_ 91 | self.return_fail = return_fail 92 | self.tag = tag 93 | self.call_back = call_back 94 
| self.report_interval = report_interval 95 | self.success_ret_code = success_ret_code 96 | self.done_if = done_if 97 | self.trim_to_max_limit = trim_to_max_limit 98 | self.exclude_filtered_to_max_limit = exclude_filtered_to_max_limit 99 | if post_body: 100 | if not isinstance(post_body, (bytes, str)): 101 | post_body = json.dumps(post_body).encode(DefaultVal.default_encoding) 102 | self.post_body = post_body 103 | self.persistent_writer = persistent_writer 104 | self.persistent_to_disk_if_give_up = persistent_to_disk_if_give_up 105 | self.debug_mode = debug_mode 106 | self.keep_other_fields = keep_other_fields 107 | self.keep_fields = keep_fields 108 | self.http_headers = http_headers 109 | self.http_timeout = http_timeout if http_timeout is not None else sentinel 110 | 111 | 112 | class RCSVConfig(BaseGetterConfig): 113 | def __init__(self, filename, mode=DefaultVal.default_file_mode_r, encoding=DefaultVal.default_encoding, 114 | per_limit=None, max_limit=None, filter_=None, **kwargs): 115 | """ 116 | :param filename: filename to read 117 | :param mode: file open mode, i.e "r" 118 | :param encoding: file encoding i.e "utf8" 119 | :param per_limit: how many items to get per time 120 | :param max_limit: get at most max_limit items, if not set, get all 121 | :param filter_: run "transform --help" to see command line interface explanation for detail 122 | :param kwargs: 123 | 124 | Example: 125 | csv_config = RCSVConfig("./result.csv", encoding="gbk") 126 | csv_getter = ProcessFactory.create_getter(csv_config) 127 | async for items in csv_getter: 128 | print(items) 129 | 130 | # both async generator and generator implemented 131 | for items in csv_getter: 132 | print(items) 133 | """ 134 | super().__init__() 135 | if not per_limit: 136 | per_limit = DefaultVal.per_limit 137 | if not max_limit: 138 | max_limit = DefaultVal.max_limit 139 | 140 | self.filename = filename 141 | self.mode = mode 142 | self.encoding = encoding 143 | self.per_limit = per_limit 144 | self.max_limit = max_limit 145 | self.filter = filter_ 146 | 147 | 148 | class RESConfig(BaseGetterConfig): 149 | def __init__(self, indices, doc_type=None, per_limit=None, max_limit=None, scroll="1m", query_body=None, 150 | return_source=True, max_retry=None, random_min_sleep=None, random_max_sleep=None, filter_=None, 151 | hosts=None, headers=None, **kwargs): 152 | """ 153 | :param indices: elasticsearch indices 154 | :param doc_type: elasticsearch doc_type 155 | :param per_limit: how many items to get per request 156 | :param max_limit: get at most max_limit items, if not set, get all 157 | :param scroll: default is "1m" 158 | :param query_body: default is '{"size": "per_limit", "query": {"match_all": {}}}' 159 | :param return_source: if set to True, will return [item , ..., itemN], item is the "_source" object 160 | if set to False, will return whatever elasticsearch return, i.e {"hits": {"total": ...}} 161 | :param max_retry: if request fail, retry max_retry times 162 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 163 | :param random_max_sleep: if request fail, random sleep at most random_max_sleep seconds before request again 164 | :param filter_: run "transform --help" to see command line interface explanation for detail, 165 | only work if return_source is False 166 | :param hosts: elasticsearch hosts, list type, i.e: ["localhost:8888", "127.0.0.2:8889"] 167 | :param headers: headers when perform http requests to elasticsearch, dict type, i.e: {"Host": "aaa",
"apikey": "bbb"} 168 | :param kwargs: 169 | 170 | Example: 171 | body = { 172 | "size": 100, 173 | "_source": { 174 | "includes": ["likeCount", "id", "title"] 175 | } 176 | } 177 | es_config = RESConfig("post20170630", "news", max_limit=1000, query_body=body) 178 | es_getter = ProcessFactory.create_getter(es_config) 179 | async for items in es_getter: 180 | print(item) 181 | """ 182 | super().__init__() 183 | 184 | if not random_min_sleep: 185 | random_min_sleep = DefaultVal.random_min_sleep 186 | if not random_max_sleep: 187 | random_max_sleep = DefaultVal.random_max_sleep 188 | if not per_limit: 189 | per_limit = DefaultVal.per_limit 190 | if not max_limit: 191 | max_limit = DefaultVal.max_limit 192 | if not max_retry: 193 | max_retry = DefaultVal.max_retry 194 | 195 | if not DefaultVal.main_config.has_es_configured: 196 | raise ValueError("You must config es_hosts before using Elasticsearch, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 197 | 198 | if not query_body: 199 | query_body = { 200 | "size": per_limit, 201 | "query": { 202 | "match_all": {} 203 | } 204 | } 205 | self.query_body = query_body 206 | self.indices = indices 207 | self.doc_type = doc_type 208 | self.per_limit = per_limit 209 | self.max_limit = max_limit 210 | self.scroll = scroll 211 | self.es_client = get_es_client(hosts=hosts, headers=headers) 212 | self.return_source = return_source 213 | self.max_retry = max_retry 214 | self.random_min_sleep = random_min_sleep 215 | self.random_max_sleep = random_max_sleep 216 | self.filter = filter_ 217 | 218 | 219 | class RJsonConfig(BaseGetterConfig): 220 | def __init__(self, filename, mode=DefaultVal.default_file_mode_r, encoding=DefaultVal.default_encoding, 221 | per_limit=None, max_limit=None, filter_=None, **kwargs): 222 | """ 223 | :param filename: line by line json file to read 224 | :param mode: file open mode, i.e "r" 225 | :param encoding: file encoding i.e "utf8" 226 | :param per_limit: how many items to get per time 227 | :param max_limit: get at most max_limit items, if not set, get all 228 | :param filter_: run "transform --help" to see command line interface explanation for detail 229 | :param kwargs: 230 | 231 | Example: 232 | json_config = RJsonConfig("./result.json") 233 | json_getter = ProcessFactory.create_getter(json_config) 234 | async for items in json_getter: 235 | print(items) 236 | 237 | # both async generator and generator implemented 238 | for items in json_getter: 239 | print(items) 240 | """ 241 | super().__init__() 242 | 243 | if not per_limit: 244 | per_limit = DefaultVal.per_limit 245 | if not max_limit: 246 | max_limit = DefaultVal.max_limit 247 | 248 | self.filename = filename 249 | self.mode = mode 250 | self.encoding = encoding 251 | self.per_limit = per_limit 252 | self.max_limit = max_limit 253 | self.filter = filter_ 254 | 255 | 256 | class RXLSXConfig(BaseGetterConfig): 257 | def __init__(self, filename, per_limit=None, max_limit=None, sheet_index=0, filter_=None, **kwargs): 258 | """ 259 | :param filename: filename to read 260 | :param per_limit: how many items to get per time 261 | :param max_limit: get at most max_limit items, if not set, get all 262 | :param sheet_index: which sheet to get, 0 means 0th sheet 263 | :param filter_: run "transform --help" to see command line interface explanation for detail 264 | :param kwargs: 265 | 266 | Example: 267 | xlsx_config = RXLSXConfig("./result.xlsx") 268 | xlsx_getter = ProcessFactory.create_getter(xlsx_config) 269 | async for items in xlsx_getter: 270 | 
print(items) 271 | 272 | # both async generator and generator implemented 273 | for items in xlsx_getter: 274 | print(items) 275 | 276 | """ 277 | super().__init__() 278 | 279 | if not per_limit: 280 | per_limit = DefaultVal.per_limit 281 | if not max_limit: 282 | max_limit = DefaultVal.max_limit 283 | 284 | self.filename = filename 285 | self.per_limit = per_limit 286 | self.max_limit = max_limit 287 | self.sheet_index = sheet_index 288 | self.filter = filter_ 289 | 290 | 291 | class RAPIBulkConfig(BaseGetterConfig): 292 | def __init__(self, sources, interval=DefaultVal.interval, concurrency=None, filter_=None, return_fail=False, 293 | done_if=None, trim_to_max_limit=DefaultVal.trim_to_max_limit, 294 | exclude_filtered_to_max_limit=DefaultVal.exclude_filtered_to_max_limit, persistent=False, 295 | persistent_key=None, persistent_start_fresh_if_done=True, persistent_to_disk_if_give_up=True, 296 | debug_mode=False, http_headers=None, **kwargs): 297 | """ 298 | :param sources: an iterable object (can be async generator), each item must be "url" or instance of RAPIConfig 299 | :param interval: integer or float, each time you call async generator, you will wait for "interval" seconds 300 | and get all items fetched during this "interval", notice if sources is an "async generator", 301 | the "interval" seconds will exclude the time processing the async generator 302 | :param concurrency: how many concurrent tasks run, default read from config file, if concurrency set, 303 | only string(url) in "sources" will work with this concurrency level, RAPIConfig instance won't 304 | :param filter_: run "transform --help" to see command line interface explanation for detail 305 | :param return_fail: if set to True, for each iteration, will return a tuple, 306 | api_getter = ProcessFactory.create_getter(RAPIBulkConfig([...])) 307 | async for items, bad_objects in api_getter: 308 | A = bad_objects[0] 309 | A.response: -> json object: '{"appCode": "weixinpro", "dataType": "post", "message": "param error", "retcode": "100005"}', if fail in request, response will be None 310 | A.tag: -> tag you pass to RAPIConfig 311 | A.source: -> source you pass to RAPIConfig 312 | :param done_if: it will only work if the source[n] is type string, if the source[n] is type RAPIConfig, it won't work, please refer to RAPIConfig for more detail 313 | :param trim_to_max_limit: set max_limit to the precise value, default max_limit is rough value 314 | :param exclude_filtered_to_max_limit: max_limit including filtered object or excluding filtered object 315 | :param persistent: whether save progress to disk, if set to true, the job progress will be persistent to disk every "interval" seconds 316 | :param persistent_key: the key to identify the task 317 | :param persistent_start_fresh_if_done: if all task done, whether remove the persistent record file, if the persistent file hasn't been removed and all of the jobs finished, 318 | next time you run the program, there will be no job to schedule 319 | :param persistent_to_disk_if_give_up: if there's a job fail after retry max_retry times, whether regard this job as success and persistent to disk or not 320 | :param debug_mode: log every http request url 321 | :param http_headers: http_headers, dict object 322 | :param kwargs: 323 | 324 | Example: 325 | sources = ["http://....", "http://....", "http://....", RAPIConfig("http://....")] 326 | bulk_config = RAPIBulkConfig(sources) 327 | bulk_getter = ProcessFactory.create_getter(bulk_config) 328 | async for items in bulk_getter: 329 | print(items)
330 | 331 | """ 332 | super().__init__() 333 | if not concurrency: 334 | concurrency = DefaultVal.main_config["main"].getint("concurrency") 335 | self.sources = sources 336 | self.interval = interval 337 | self.concurrency = concurrency 338 | self.session = session_manger._generate_session(concurrency_limit=concurrency) 339 | self.filter = filter_ 340 | self.return_fail = return_fail 341 | self.done_if = done_if 342 | self.trim_to_max_limit = trim_to_max_limit 343 | self.exclude_filtered_to_max_limit = exclude_filtered_to_max_limit 344 | self.persistent = persistent 345 | self.persistent_key = persistent_key 346 | self.persistent_start_fresh_if_done = persistent_start_fresh_if_done 347 | self.persistent_to_disk_if_give_up = persistent_to_disk_if_give_up 348 | self.debug_mode = debug_mode 349 | self.http_headers = http_headers 350 | 351 | def __del__(self): 352 | if inspect.iscoroutinefunction(self.session.close): 353 | if not self.session.closed: 354 | if self.session._connector is not None and self.session._connector_owner: 355 | self.session._connector.close() 356 | self._connector = None 357 | else: 358 | self.session.close() 359 | 360 | 361 | class RRedisConfig(BaseGetterConfig): 362 | def __init__(self, key, key_type="LIST", per_limit=None, max_limit=None, filter_=None, max_retry=None, 363 | random_min_sleep=None, random_max_sleep=None, host=None, port=None, db=None, password=None, 364 | timeout=None, encoding=None, need_del=None, direction=None, compress=None, **kwargs): 365 | """ 366 | :param key: redis key to get data 367 | :param key_type: redis data type to operate, current only support LIST, HASH 368 | :param per_limit: how many items to get per time 369 | :param max_limit: get at most max_limit items, if not set, get all 370 | :param max_retry: if request fail, retry max_retry times 371 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 372 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 373 | :param filter_: run "transform --help" to see command line interface explanation for detail 374 | :param host: redis host -> str 375 | :param port: redis port -> int 376 | :param db: redis database number -> int 377 | :param password: redis password -> int 378 | :param timeout: timeout per redis connection -> float 379 | :param encoding: redis object encoding -> str 380 | :param need_del: whether need to del the key after get object from redis -> boolean 381 | :param direction: "L" or "R", left to right or roght to left 382 | :param compress: whether compress data use zlib before write to redis -> boolean 383 | :param kwargs: 384 | 385 | Example: 386 | redis_config = RRedisConfig("my_key") 387 | redis_getter = ProcessFactory.create_getter(redis_config) 388 | async for items in redis_getter: 389 | print(items) 390 | """ 391 | super().__init__() 392 | # load default value 393 | if not random_min_sleep: 394 | random_min_sleep = DefaultVal.random_min_sleep 395 | if not random_max_sleep: 396 | random_max_sleep = DefaultVal.random_max_sleep 397 | if not per_limit: 398 | per_limit = DefaultVal.per_limit 399 | if not max_limit: 400 | max_limit = DefaultVal.max_limit 401 | if not max_retry: 402 | max_retry = DefaultVal.max_retry 403 | if host is None: 404 | host = DefaultVal.redis_host 405 | if port is None: 406 | port = DefaultVal.redis_port 407 | if db is None: 408 | db = DefaultVal.redis_db 409 | if password is None: 410 | password = DefaultVal.redis_password 411 | if 
timeout is None: 412 | timeout = DefaultVal.redis_timeout 413 | if encoding is None: 414 | encoding = DefaultVal.redis_encoding 415 | if direction is None: 416 | direction = DefaultVal.redis_direction 417 | if need_del is None: 418 | need_del = DefaultVal.redis_need_del 419 | if compress is None: 420 | compress = DefaultVal.redis_compress 421 | 422 | if not DefaultVal.main_config.has_redis_configured and port <= 0: 423 | raise ValueError("You must config redis before using Redis, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 424 | 425 | if key_type not in ("LIST", "HASH"): 426 | raise ValueError("key_type must be one of (%s)" % (str(("LIST", "HASH")), )) 427 | if not encoding: 428 | raise ValueError("You must specific encoding, since I am going to load each object in json format, " 429 | "and treat it as dictionary in python") 430 | if not password: 431 | password = None 432 | 433 | self.redis_pool_cli = None 434 | self.key = key 435 | self.host = host 436 | self.port = port 437 | self.db = db 438 | self.password = password 439 | self.encoding = encoding 440 | self.timeout = timeout 441 | 442 | self.key_type = key_type 443 | self.per_limit = per_limit 444 | self.max_limit = max_limit 445 | self.filter = filter_ 446 | self.max_retry = max_retry 447 | self.random_min_sleep = random_min_sleep 448 | self.random_max_sleep = random_max_sleep 449 | self.need_del = need_del 450 | 451 | self.name = "%s_%s->%s" % (str(host), str(port), str(key)) 452 | 453 | self.redis_read_method = self.redis_len_method = self.redis_del_method = None 454 | self.direction = direction 455 | self.compress = compress 456 | 457 | if key_type == "LIST": 458 | self.is_range = True 459 | else: 460 | self.is_range = False 461 | 462 | async def get_redis_pool_cli(self): 463 | """ 464 | :return: an async redis client 465 | """ 466 | if self.redis_pool_cli is None: 467 | kwargs = { 468 | "db": int(self.db), 469 | "password": self.password, 470 | "encoding": self.encoding, 471 | "timeout": self.timeout, 472 | "minsize": 1, 473 | "maxsize": 3 474 | } 475 | if self.compress: 476 | del kwargs["encoding"] 477 | self.redis_pool_cli = await aioredis.create_redis_pool((self.host, self.port), **kwargs) 478 | if self.key_type == "LIST": 479 | self.redis_read_method = self.redis_pool_cli.lrange 480 | self.redis_len_method = self.redis_pool_cli.llen 481 | self.redis_del_method = self.redis_pool_cli.delete 482 | else: 483 | self.redis_read_method = self.redis_pool_cli.hgetall 484 | self.redis_len_method = self.redis_pool_cli.hlen 485 | self.redis_del_method = self.redis_pool_cli.delete 486 | 487 | return self.redis_pool_cli 488 | 489 | 490 | class RMySQLConfig(BaseGetterConfig): 491 | def __init__(self, table, per_limit=None, max_limit=None, filter_=None, max_retry=None, random_min_sleep=None, 492 | random_max_sleep=None, host=None, port=None, user=None, password=None, database=None, 493 | charset=None, loop=None, **kwargs): 494 | """ 495 | :param table: mysql table 496 | :param per_limit: how many items to get per time 497 | :param max_limit: get at most max_limit items, if not set, get all 498 | :param filter_: run "transform --help" to see command line interface explanation for detail 499 | :param max_retry: if request fail, retry max_retry times 500 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 501 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 502 | :param host: mysql host -> str 503 
| :param port: mysql port -> int 504 | :param user: mysql user -> str 505 | :param password: mysql password -> str 506 | :param database: mysql database -> str 507 | :param charset: default utf8 -> str 508 | :param loop: async loop instance 509 | :param kwargs: 510 | 511 | Example: 512 | mysql_config = RMySQLConfig("my_table") 513 | mysql_getter = ProcessFactory.create_getter(mysql_config) 514 | async for items in mysql_getter: 515 | print(items) 516 | """ 517 | super().__init__() 518 | 519 | if not random_min_sleep: 520 | random_min_sleep = DefaultVal.random_min_sleep 521 | if not random_max_sleep: 522 | random_max_sleep = DefaultVal.random_max_sleep 523 | if not per_limit: 524 | per_limit = DefaultVal.per_limit 525 | if not max_limit: 526 | max_limit = DefaultVal.max_limit 527 | if not max_retry: 528 | max_retry = DefaultVal.max_retry 529 | if not host: 530 | host = DefaultVal.mysql_host 531 | if not port: 532 | port = DefaultVal.mysql_port 533 | if not user: 534 | user = DefaultVal.mysql_user 535 | if not password: 536 | password = DefaultVal.mysql_password 537 | if not database: 538 | database = DefaultVal.mysql_database 539 | if not charset: 540 | charset = DefaultVal.mysql_encoding 541 | 542 | if not DefaultVal.main_config.has_mysql_configured and port <= 0: 543 | raise ValueError("You must config mysql before using MySQL, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 544 | if "aiomysql" not in globals(): 545 | raise ValueError("module mysql disabled, please reinstall " 546 | "requirements with python version higher than 3.5.3 to enable it") 547 | 548 | self.table = table 549 | self.database = database 550 | 551 | self.max_limit = max_limit 552 | self.per_limit = per_limit 553 | self.max_retry = max_retry 554 | self.random_min_sleep = random_min_sleep 555 | self.random_max_sleep = random_max_sleep 556 | self.filter = filter_ 557 | 558 | self.name = "%s->%s" % (self.database, self.table) 559 | 560 | self.host = host 561 | self.port = port 562 | self.user = user 563 | if not password: 564 | password = '' 565 | self.password = password 566 | self.database = database 567 | self.charset = charset 568 | 569 | if not loop: 570 | loop = asyncio.get_event_loop() 571 | self.loop = loop 572 | self.mysql_pool_cli = self.connection = self.cursor = None 573 | 574 | async def get_mysql_pool_cli(self): 575 | """ 576 | :return: an async mysql client 577 | """ 578 | if self.mysql_pool_cli is None: 579 | self.mysql_pool_cli = await aiomysql.create_pool(host=self.host, port=self.port, user=self.user, 580 | password=self.password, db=self.database, loop=self.loop, 581 | minsize=1, maxsize=3, charset=self.charset) 582 | self.connection = await self.mysql_pool_cli.acquire() 583 | self.cursor = await self.connection.cursor() 584 | return self.mysql_pool_cli 585 | 586 | def free_resource(self): 587 | if self.mysql_pool_cli is not None: 588 | self.mysql_pool_cli.release(self.connection) 589 | self.mysql_pool_cli.close() 590 | self.loop.create_task(self.mysql_pool_cli.wait_closed()) 591 | self.mysql_pool_cli = self.connection = self.cursor = None 592 | 593 | 594 | class RMongoConfig(BaseGetterConfig): 595 | def __init__(self, collection, per_limit=None, max_limit=None, query_body=None, max_retry=None, 596 | random_min_sleep=None, random_max_sleep=None, filter_=None, host=None, port=None, username=None, 597 | password=None, database=None, **kwargs): 598 | """ 599 | :param collection: collection name 600 | :param per_limit: how many items to get per request 601 | :param max_limit: get at
most max_limit items, if not set, get all 602 | :param query_body: search query, default None, i.e: {'i': {'$lt': 5}} 603 | :param return_source: if set to True, will return [item , ..., itemN], item is the "_source" object 604 | if set to False, will return whatever elasticsearch return, i.e {"hits": {"total": ...}} 605 | :param max_retry: if request fail, retry max_retry times 606 | :param random_min_sleep: if request fail, random sleep at least random_min_sleep seconds before request again 607 | :param random_max_sleep: if request fail, random sleep at most random_min_sleep seconds before request again 608 | :param filter_: run "transform --help" to see command line interface explanation for detail 609 | :param kwargs: 610 | 611 | Example: 612 | mongo_config = RMongoConfig("my_coll") 613 | mongo_getter = ProcessFactory.create_getter(mongo_config) 614 | async for items in mongo_getter: 615 | print(item) 616 | """ 617 | super().__init__() 618 | 619 | if not random_min_sleep: 620 | random_min_sleep = DefaultVal.random_min_sleep 621 | if not random_max_sleep: 622 | random_max_sleep = DefaultVal.random_max_sleep 623 | if not per_limit: 624 | per_limit = DefaultVal.per_limit 625 | if not max_limit: 626 | max_limit = DefaultVal.max_limit 627 | if not max_retry: 628 | max_retry = DefaultVal.max_retry 629 | if not host: 630 | host = DefaultVal.mongo_host 631 | if not port: 632 | port = DefaultVal.mongo_port 633 | if not username: 634 | username = DefaultVal.mongo_username 635 | if not password: 636 | password = DefaultVal.mongo_password 637 | if not database: 638 | database = DefaultVal.mongo_database 639 | 640 | if not DefaultVal.main_config.has_mongo_configured: 641 | raise ValueError("You must config MongoDB before using MongoDB, Please edit configure file: %s" % (DefaultVal.main_config.ini_path, )) 642 | if "motor" not in globals(): 643 | raise ValueError("module motor disabled, please reinstall " 644 | "requirements in linux") 645 | 646 | self.collection = collection 647 | self.query_body = query_body 648 | self.per_limit = per_limit 649 | self.max_limit = max_limit 650 | self.max_retry = max_retry 651 | self.random_min_sleep = random_min_sleep 652 | self.random_max_sleep = random_max_sleep 653 | self.filter = filter_ 654 | self.host = host 655 | self.port = port 656 | self.username = username 657 | self.password = password 658 | self.database = database 659 | self.name = "%s->%s" % (self.database, self.collection) 660 | 661 | self.client = self.cursor = None 662 | 663 | def get_mongo_cli(self): 664 | if self.client is None: 665 | kwargs = { 666 | "host": self.host, 667 | "port": self.port 668 | } 669 | if self.username: 670 | address = "mongodb://%s:%s@%s:%s/%s" % (self.username, self.password, kwargs["host"], str(kwargs["port"]), self.database) 671 | self.client = motor.motor_asyncio.AsyncIOMotorClient(address) 672 | else: 673 | self.client = motor.motor_asyncio.AsyncIOMotorClient(**kwargs) 674 | 675 | if self.query_body: 676 | self.cursor = self.client[self.database][self.collection].find(self.query_body) 677 | else: 678 | self.cursor = self.client[self.database][self.collection].find() 679 | return self.client 680 | --------------------------------------------------------------------------------
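The getter and writer configs above are meant to be paired through ProcessFactory: build a getter config, build a writer config, then stream each batch the getter yields into the writer. A minimal end-to-end sketch, assuming the package exposes ProcessFactory, GetterConfig and WriterConfig at the top level (the import path and the URL are placeholders, not taken from this source tree):

import asyncio
# assumed top-level exports; adjust the import if the package layout differs
from idataapi_transform import ProcessFactory, GetterConfig, WriterConfig


async def api_to_csv():
    # placeholder URL; RAPIConfig keeps requesting next pages until max_limit items or no more pageToken
    api_config = GetterConfig.RAPIConfig("http://example.com/api?kw=test", max_limit=100)
    api_getter = ProcessFactory.create_getter(api_config)
    csv_config = WriterConfig.WCSVConfig("./result.csv", mode="w", encoding="utf8")
    with ProcessFactory.create_writer(csv_config) as csv_writer:
        async for items in api_getter:
            # each iteration yields a batch of (already filtered) items
            csv_writer.write(items)


asyncio.get_event_loop().run_until_complete(api_to_csv())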
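For many sources at once, RAPIBulkConfig wraps a collection of URLs or RAPIConfig instances and fetches them concurrently, yielding whatever arrived during each interval. A sketch under the same assumptions (placeholder URLs), with return_fail=True so failed sources are reported next to the data:

import asyncio
from idataapi_transform import ProcessFactory, GetterConfig  # assumed top-level exports


async def bulk_fetch():
    # plain URL strings and explicit RAPIConfig instances can be mixed in "sources"
    sources = ["http://example.com/a", "http://example.com/b",
               GetterConfig.RAPIConfig("http://example.com/c", max_limit=50)]
    bulk_config = GetterConfig.RAPIBulkConfig(sources, interval=2, return_fail=True)
    bulk_getter = ProcessFactory.create_getter(bulk_config)
    async for items, bad_objects in bulk_getter:
        # each bad_object carries .response / .tag / .source for the failed request
        print("fetched %d items, %d failed sources" % (len(items), len(bad_objects)))


asyncio.get_event_loop().run_until_complete(bulk_fetch())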