├── .bumpversion.cfg ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── benchmarks ├── 1k ├── 2k ├── default ├── perf_get_records.py └── perf_put_records.py ├── changelog.rst ├── datahub ├── __init__.py ├── auth │ ├── __init__.py │ ├── aliyun_account.py │ └── core.py ├── batch │ ├── __init__.py │ ├── batch_binary_record.py │ ├── batch_header.py │ ├── batch_serializer.py │ ├── binary_record.py │ ├── record_header.py │ ├── schema_registry_client.py │ └── utils.py ├── client │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── config.py │ │ ├── constant.py │ │ ├── datahub_factory.py │ │ ├── library_factory.py │ │ ├── meta_data.py │ │ ├── offset_meta.py │ │ ├── shard_coordinator.py │ │ ├── thread_pool.py │ │ └── timer.py │ ├── consumer │ │ ├── __init__.py │ │ ├── consumer_coordinator.py │ │ ├── consumer_heartbeat.py │ │ ├── datahub_consumer.py │ │ ├── message_key.py │ │ ├── message_reader.py │ │ ├── offset_coordinator.py │ │ ├── offset_manager.py │ │ ├── offset_select_strategy.py │ │ ├── shard_group_reader.py │ │ ├── shard_reader.py │ │ └── sync_group_meta.py │ └── producer │ │ ├── __init__.py │ │ ├── datahub_producer.py │ │ ├── message_writer.py │ │ ├── record_pack.py │ │ ├── record_pack_queue.py │ │ ├── shard_group_writer.py │ │ ├── shard_writer.py │ │ └── write_result.py ├── core.py ├── exceptions.py ├── implement.py ├── models │ ├── __init__.py │ ├── compress.py │ ├── connector.py │ ├── cursor.py │ ├── params.py │ ├── record.py │ ├── results.py │ ├── schema.py │ ├── shard.py │ ├── subscription.py │ └── types.py ├── proto │ ├── __init__.py │ ├── datahub_record_proto_pb.py │ └── record.proto ├── rest.py ├── utils │ ├── __init__.py │ ├── codec.py │ ├── constants.py │ ├── converters.py │ └── validator.py └── version.py ├── dependency ├── certifi-2018.4.16-py2.py3-none-any.whl ├── chardet-3.0.4-py2.py3-none-any.whl ├── cprotobuf-0.1.9.tar.gz ├── crcmod-1.7.tar.gz ├── enum34-1.1.6.tar.gz ├── funcsigs-1.0.2-py2.py3-none-any.whl ├── future-0.16.0.tar.gz ├── idna-2.6-py2.py3-none-any.whl ├── install_dependency.sh ├── lz4-2.0.0.tar.gz ├── pip-10.0.1.tar.gz ├── pkgconfig-1.3.1.tar.gz ├── pytest-runner-4.2.tar.gz ├── readme.txt ├── requests-2.18.4-py2.py3-none-any.whl ├── setuptools-39.2.0.zip ├── setuptools_scm-2.1.0.tar.gz ├── simplejson-3.15.0.tar.gz ├── six-1.11.0-py2.py3-none-any.whl └── urllib3-1.22-py2.py3-none-any.whl ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── PyDatahub.png │ └── theme_override.css │ ├── api.rst │ ├── conf.py │ ├── index.rst │ ├── installation.rst │ ├── tutorial-client.rst │ ├── tutorial-connector.rst │ ├── tutorial-meter.rst │ ├── tutorial-offset.rst │ ├── tutorial-project.rst │ ├── tutorial-record.rst │ ├── tutorial-schema.rst │ ├── tutorial-shard.rst │ ├── tutorial-subscription.rst │ ├── tutorial-topic.rst │ └── tutorial.rst ├── examples ├── client │ ├── datahub.config.template │ ├── example_async_producer.py │ ├── example_collaborative_consumer.py │ ├── example_general_consumer.py │ └── example_general_producer.py ├── datahub │ ├── basic_example.py │ ├── blob │ │ ├── blob_topic_pub.py │ │ └── blob_topic_sub.py │ ├── consume_example.py │ └── tuple │ │ ├── tuple_topic_pub.py │ │ └── tuple_topic_sub.py └── resources │ └── datahub.png ├── readthedocs.yaml ├── requirements.txt ├── setup.py ├── test-requirements.txt ├── tests ├── datahub.ini.template ├── fixtures │ ├── projects.cursor.topics.invalid_param.shards.0.json │ ├── projects.cursor.topics.success.shards.0.json │ ├── projects.existed.json │ ├── 
projects.existed.topics.existed.json │ ├── projects.get.topics.blob.shards.0.bin │ ├── projects.get.topics.blob.shards.0.json │ ├── projects.get.topics.blob_batch.shards.0.bin │ ├── projects.get.topics.invalid_cursor.shards.0.json │ ├── projects.get.topics.invalid_cursor_batch.shards.0.bin │ ├── projects.get.topics.tuple.shards.0.bin │ ├── projects.get.topics.tuple.shards.0.json │ ├── projects.get.topics.tuple_batch.shards.0.bin │ ├── projects.json │ ├── projects.merge.topics.invalid_state.shards.json │ ├── projects.merge.topics.limit_exceeded.shards.json │ ├── projects.merge.topics.shards_not_adjacent.shards.json │ ├── projects.merge.topics.success.shards.json │ ├── projects.meter.topics.success.shards.0.json │ ├── projects.put.topics.invalid_state.shards.json │ ├── projects.put.topics.invalid_state_batch.shards.0.bin │ ├── projects.put.topics.limit_exceeded.shards.json │ ├── projects.put.topics.limit_exceeded_batch.shards.0.bin │ ├── projects.put.topics.malformed.shards.json │ ├── projects.put.topics.malformed_batch.shards.0.bin │ ├── projects.put.topics.success.shards.bin │ ├── projects.put.topics.success.shards.json │ ├── projects.schema.topics.delete.json │ ├── projects.schema.topics.get.json │ ├── projects.schema.topics.list.json │ ├── projects.schema.topics.register.json │ ├── projects.split.topics.default.shards.json │ ├── projects.split.topics.invalid_key.shards.json │ ├── projects.split.topics.invalid_state.shards.json │ ├── projects.split.topics.limit_exceeded.shards.json │ ├── projects.split.topics.success.shards.json │ ├── projects.success.json │ ├── projects.success.topics.blob.json │ ├── projects.success.topics.json │ ├── projects.success.topics.success.connectors.json │ ├── projects.success.topics.success.shards.json │ ├── projects.success.topics.tuple.json │ ├── projects.unexisted.json │ ├── projects.unexisted.topics.json │ ├── projects.unexisted.topics.valid.connectors.sink_odps.json │ ├── projects.unexisted.topics.valid.json │ ├── projects.unexisted.topics.valid.shards.0.json │ ├── projects.unexisted.topics.valid.shards.json │ ├── projects.unexisted.topics.valid_batch.shards.0.bin │ ├── projects.valid.topics.unexisted.connectors.sink_odps.json │ ├── projects.valid.topics.unexisted.json │ ├── projects.valid.topics.unexisted.shards.0.json │ ├── projects.valid.topics.unexisted.shards.json │ ├── projects.valid.topics.valid.connectors.sink_odps.json │ ├── projects.valid.topics.valid.shards.0.json │ ├── projects.valid.topics.valid.shards.json │ ├── projects.valid_batch.topics.unexisted.shards.0.bin │ ├── projects.valid_batch.topics.valid_batch.shards.0.bin │ ├── projects.wait.topics.ready.shards.json │ └── projects.wait.topics.unready.shards.json ├── function │ ├── test_connector.py │ ├── test_cursor.py │ ├── test_meter.py │ ├── test_offset.py │ ├── test_project.py │ ├── test_record.py │ ├── test_record_batch.py │ ├── test_schema.py │ ├── test_shard.py │ ├── test_subscription.py │ └── test_topic.py ├── resources │ └── datahub.png └── unit │ ├── __init__.py │ ├── test_batch_unit.py │ ├── test_connector_unit.py │ ├── test_cursor_unit.py │ ├── test_meter_unit.py │ ├── test_project_unit.py │ ├── test_record_unit.py │ ├── test_schema_register_unit.py │ ├── test_schema_unit.py │ ├── test_shard_unit.py │ ├── test_topic_unit.py │ └── unittest_util.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | files = datahub/version.py 3 | commit = True 4 | tag = True 5 | current_version = 2.25.4 6 | 7 | 
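# With this configuration, running `bumpversion patch` / `minor` / `major` is
# expected to rewrite the version string in datahub/version.py and, because
# commit and tag are enabled, also create a git commit and tag for the new version.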
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_store 3 | build 4 | dist 5 | pydatahub.egg-info 6 | docs/build 7 | .idea 8 | .pytest_cache 9 | venv* 10 | .tox 11 | .coverage 12 | .eggs 13 | tests/datahub.ini 14 | pydatahub_beta.egg-info 15 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include Introduction.md 4 | include requirements.txt 5 | -------------------------------------------------------------------------------- /benchmarks/1k: -------------------------------------------------------------------------------- 1 | import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="aaaaaa", type=int) 2 | -------------------------------------------------------------------------------- /benchmarks/2k: -------------------------------------------------------------------------------- 1 | import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="aaaaaa", type=int) 2 | import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import 
argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="echo the string use here", type=int) parser.add_argument("-v", help="version", type=int) args = parser.parse_args() print args.echo print args.v import argparse parser = argparse.ArgumentParser() parser.add_argument("echo", help="aaaaaa", type=int) 3 | -------------------------------------------------------------------------------- /benchmarks/default: -------------------------------------------------------------------------------- 1 | [2016-05-23 00:33:07.283175] [INFO] [29141] [/apsarapangu/disk12/ying.cai/auto_datahub_build/datahub/frontend_server/xstream_request_dispatcher.cpp:717] Received request: RequestId:2016052300330799d6650a00000002 RemoteAddress:10.101.214.153 RemotePort:42711 Method:GET URI:/projects/test_project QueryString: Headers:{ 2 | -------------------------------------------------------------------------------- /benchmarks/perf_get_records.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | import argparse 21 | import time 22 | 23 | from datahub import DataHub 24 | from datahub.models import CursorType 25 | 26 | 27 | class Timer(object): 28 | def __init__(self, verbose=False): 29 | self.verbose = verbose 30 | 31 | def __enter__(self): 32 | self.start = time.time() 33 | return self 34 | 35 | def __exit__(self, *args): 36 | self.end = time.time() 37 | self.secs = self.end - self.start 38 | self.msecs = self.secs * 1000 # millisecs 39 | if self.verbose: 40 | print('elapsed time: %f ms' % self.msecs) 41 | 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('access_id', help='account access id') 46 | parser.add_argument('access_key', help='account access key') 47 | parser.add_argument('endpoint', help='datahub server endpoint') 48 | parser.add_argument('--batch', help='batch record num', type=int, default=100) 49 | parser.add_argument('--round', help='round num', type=int, default=10000) 50 | parser.add_argument('--project', help='project name', default='py_perf_test_project') 51 | parser.add_argument('--topic', help='topic name', default='py_perf_test_topic') 52 | parser.add_argument('--retry_times', help='request retry nums', type=int, default=3) 53 | parser.add_argument('--conn_timeout', help='connect timeout', type=int, default=5) 54 | parser.add_argument('--read_timeout', help='read timeout', type=int, default=120) 55 | parser.add_argument('--stream', help='read timeout', action="store_true") 56 | parser.add_argument('--protobuf', help='protobuf mode', type=bool, default=False) 57 | args = parser.parse_args() 58 | print("=============configuration=============") 59 | print("access_id:%s" % args.access_id) 60 | print("access_key:%s" % args.access_key) 61 | print("endpoint:%s" % args.endpoint) 62 | print("project:%s" % args.project) 63 | print("topic:%s" % args.topic) 64 | print("retry_times:%d" % args.retry_times) 65 | print("conn_timeout:%d" % args.conn_timeout) 66 | print("read_timeout:%d" % args.read_timeout) 67 | print("batch record num:%d" % args.batch) 68 | print("round num:%d" % args.round) 69 | print("stream:%s" % args.stream) 70 | print("protobuf:%s" % args.protobuf) 71 | print("=======================================\n\n") 72 | 73 | dh = DataHub(args.access_id, args.access_key, args.endpoint, retry_times=args.retry_times, 74 | conn_timeout=args.conn_timeout, read_timeout=args.read_timeout) 75 | # project = Project(name=args.project, comment='perf project for python sdk') 76 | # dh.create_project(project) 77 | # print "create project %s success!" % args.project 78 | # print "=======================================\n\n" 79 | 80 | topic_result = dh.get_topic(args.project, args.topic) 81 | print("get topic %s success! detail:\n%s" % (args.topic, str(topic_result))) 82 | print("=======================================\n\n") 83 | 84 | cursor_result = dh.get_cursor(args.project, args.topic, '0', CursorType.OLDEST) 85 | print("get topic %s oldest cursor success! 
detail:\n%s" % (args.topic, cursor_result.cursor)) 86 | print("=======================================\n\n") 87 | 88 | read_request_count = 0 89 | read_suc_reord_count = 0 90 | cursor = cursor_result.cursor 91 | 92 | with Timer() as t: 93 | for i in range(0, args.round): 94 | record_result = dh.get_tuple_records(topic_result.project_name, topic_result.topic_name, '0', 95 | topic_result.record_schema, cursor, args.batch) 96 | read_request_count += 1 97 | read_suc_reord_count += record_result.record_count 98 | if record_result.record_count == 0: 99 | break 100 | cursor = record_result.next_cursor 101 | 102 | print("===============result==================") 103 | print("read_request_count:%d, %f/s" % (read_request_count, (1000.0 * read_request_count) / t.msecs)) 104 | print("read_suc_reord_count:%d, %f/s" % (read_suc_reord_count, (1000.0 * read_suc_reord_count) / t.msecs)) 105 | print("=> elapsed time: %fs" % t.secs) 106 | 107 | -------------------------------------------------------------------------------- /changelog.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ================ 3 | 4 | Version 2.17.2 5 | ----------------- 6 | 7 | + support to create topic with extend mode 8 | 9 | Version 2.17.1 10 | ----------------- 11 | 12 | + fix invalid shard status 13 | 14 | Version 2.17.0 15 | ----------------- 16 | 17 | + fix field length longer than schema specified 18 | 19 | Version 2.16.0 20 | ----------------- 21 | 22 | + fix python3 blob_data type error in pb mode 23 | + support new date type 24 | 25 | Version 2.15.2 26 | ----------------- 27 | 28 | + fix bool cast 29 | 30 | Version 2.15.1 31 | ----------------- 32 | 33 | + remove string length check 34 | 35 | Version 2.15.0 36 | ----------------- 37 | 38 | + add update project api 39 | + validate param type 40 | + list_connector will return connector_ids 41 | + create_connector will return connector_id 42 | + param connector_type change to connector_id 43 | + deprecated shard_contexts in GetConnectorResult 44 | + deprecated ConnectorState `CONNECTOR_CREATED`, `CONNECTOR_PAUSED`, add `CONNECTOR_STOPPED` 45 | + member variable in GetConnectorShardStatusResult is changed, not compatible 46 | + remove max_commit_size in DatabaseConnectorConfig,EsConnectorConfig, not compatible 47 | + remove invocation_role,batch_size,start_position,start_timestamp in FcConnectorConfig, not compatible 48 | + add start_time in CreateConnectorParams 49 | 50 | Version 2.12.7 51 | ----------------- 52 | 53 | + specify cprotobuf version to 0.1.9 54 | 55 | Version 2.12.6 56 | ----------------- 57 | 58 | + add update connector offset 59 | 60 | Version 2.12.5 61 | ----------------- 62 | 63 | + fix double field convert precision 64 | 65 | Version 2.12.4 66 | ----------------- 67 | 68 | + fix user define mode connector config param 69 | 70 | Version 2.12.3 71 | ----------------- 72 | 73 | + fix user define mode get connector key not found 74 | 75 | Version 2.12.2 76 | ----------------- 77 | 78 | + fix None value for bool field in record 79 | 80 | Version 2.12.1 81 | ----------------- 82 | 83 | + fix implement bug 84 | 85 | Version 2.12.0 86 | ----------------- 87 | 88 | + update connector 89 | + put record by shard id 90 | 91 | Version 2.11.5 92 | ----------------- 93 | 94 | + some compatibility 95 | 96 | Version 2.11.4 97 | ----------------- 98 | 99 | + fix response without request-id 100 | 101 | Version 2.11.3 102 | ----------------- 103 | 104 | + fix record repr bug 105 | 106 | Version 2.11.2 107 | 
----------------- 108 | 109 | + add security token 110 | 111 | Version 2.11.1 112 | ----------------- 113 | 114 | + fix signature bug 115 | + fix null value parse error in pb mode 116 | + fix ClosedTime parse error in ListShardResult 117 | + fix failed_record_count in FailedRecordCount 118 | + add example 119 | + correct doc 120 | 121 | Version 2.11.0 122 | ----------------- 123 | 124 | + complement and refactor datahub service api 125 | + support python 2.7, 3.4, 3.5, 3.6, pypy 126 | + support protobuf 127 | -------------------------------------------------------------------------------- /datahub/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | from __future__ import absolute_import 21 | 22 | from .core import DataHub, DatahubProtocolType 23 | from .version import __version__, __datahub_client_version__ 24 | 25 | """ author 26 | """ 27 | __author__ = 'panjinxing.pjx' 28 | 29 | __all__ = ['DataHub', '__author__', '__version__', '__datahub_client_version__'] 30 | -------------------------------------------------------------------------------- /datahub/auth/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
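# Authentication package: re-exports the abstract Account / AccountType interfaces
# defined in core.py and the AliyunAccount implementation in aliyun_account.py,
# which signs each request with an HMAC-SHA1 signature over a canonical string
# built from the method, content type, date, x-datahub-* headers and query parameters.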
19 | 20 | from .aliyun_account import AliyunAccount 21 | from .core import AccountType, Account 22 | 23 | __all__ = ['AccountType', 'Account', 'AliyunAccount'] 24 | -------------------------------------------------------------------------------- /datahub/auth/aliyun_account.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | from __future__ import absolute_import 21 | 22 | from collections import OrderedDict 23 | 24 | import six 25 | from six.moves.urllib.parse import urlparse, unquote 26 | 27 | from .core import Account, AccountType 28 | from ..rest import Headers 29 | from ..utils import hmac_sha1, to_str 30 | 31 | try: 32 | from urllib.parse import parse_qsl 33 | except ImportError: 34 | from urlparse import parse_qsl 35 | 36 | # ============================ log ============================ 37 | import logging 38 | 39 | logger = logging.getLogger('datahub.account') 40 | logger.setLevel(logging.INFO) 41 | if not logger.handlers: 42 | logger.addHandler(logging.NullHandler()) 43 | 44 | 45 | class AliyunAccount(Account): 46 | """ 47 | Aliyun account implement base from :class:`datahub.auth.Account` 48 | """ 49 | 50 | __slots__ = '_access_id', '_access_key', '_security_token' 51 | 52 | def __init__(self, *args, **kwargs): 53 | super(AliyunAccount, self).__init__(*args, **kwargs) 54 | self._access_id = kwargs.get('access_id', '').strip() 55 | self._access_key = kwargs.get('access_key', '').strip() 56 | self._security_token = kwargs.get('security_token', '').strip() 57 | 58 | @property 59 | def access_id(self): 60 | return self._access_id 61 | 62 | @access_id.setter 63 | def access_id(self, value): 64 | self._access_id = value 65 | 66 | @property 67 | def access_key(self): 68 | return self._access_key 69 | 70 | @access_key.setter 71 | def access_key(self, value): 72 | self._access_key = value 73 | 74 | @property 75 | def security_token(self): 76 | return self._security_token 77 | 78 | @security_token.setter 79 | def security_token(self, value): 80 | self._security_token = value 81 | 82 | def get_type(self): 83 | """ 84 | Get account type. 
85 | 86 | :return: the account type 87 | :rtype: :class:`datahub.auth.AccountType` 88 | """ 89 | return AccountType.ALIYUN 90 | 91 | @staticmethod 92 | def _build_canonical_query(query): 93 | param_pairs = sorted(parse_qsl(query, True), key=lambda it: it[0]) 94 | param_parts = map(lambda p: p[0] + '=' + p[1] if p[1] else p[0], param_pairs) 95 | return '&'.join(param_parts) 96 | 97 | @staticmethod 98 | def _build_canonical_str(url_components, req): 99 | # Build signing string 100 | lines = [req.method, req.headers[Headers.CONTENT_TYPE], req.headers[Headers.DATE], ] 101 | 102 | headers_to_sign = dict() 103 | 104 | # req headers 105 | headers = req.headers 106 | for k, v in six.iteritems(headers): 107 | k = k.lower() 108 | if k.startswith('x-datahub-'): 109 | headers_to_sign[k] = v 110 | 111 | # url params 112 | canonical_query = AliyunAccount._build_canonical_query(url_components.query) 113 | 114 | headers_to_sign = OrderedDict([(k, headers_to_sign[k]) 115 | for k in sorted(headers_to_sign)]) 116 | logger.debug('headers to sign: %s' % headers_to_sign) 117 | 118 | for k, v in six.iteritems(headers_to_sign): 119 | lines.append('%s:%s' % (k, v)) 120 | 121 | lines.append(url_components.path + '?' + canonical_query if canonical_query else url_components.path) 122 | return '\n'.join(lines) 123 | 124 | def sign_request(self, request): 125 | """ 126 | Generator signature for request. 127 | 128 | :param request: request object 129 | :return: none 130 | """ 131 | url = request.path_url 132 | url_components = urlparse(unquote(url)) 133 | canonical_str = self._build_canonical_str(url_components, request) 134 | logger.debug('canonical string: ' + canonical_str) 135 | 136 | sign = to_str(hmac_sha1(self._access_key, canonical_str)) 137 | 138 | auth_str = 'DATAHUB %s:%s' % (self._access_id, sign) 139 | request.headers[Headers.AUTHORIZATION] = auth_str 140 | -------------------------------------------------------------------------------- /datahub/auth/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | from __future__ import absolute_import 21 | 22 | 23 | class AccountType(object): 24 | """ 25 | Account type. 26 | 27 | Only Support 'aliyun' type now. 28 | """ 29 | ALIYUN = 'aliyun' 30 | 31 | 32 | class Account(object): 33 | """ 34 | Base Account Class. 35 | 36 | .. seealso:: :class:`datahub.auth.AliyunAccount` 37 | """ 38 | 39 | def __init__(self, *args, **kwargs): 40 | pass 41 | 42 | def get_type(self): 43 | """ 44 | Get account type, subclass must be provided. 
45 | 46 | :return: the account type 47 | :rtype: :class:`datahub.auth.AccountType` 48 | """ 49 | raise NotImplementedError("subclass must provide getType method") 50 | 51 | def sign_request(self, request): 52 | """ 53 | Generator signature for request, subclass must be provided. 54 | 55 | :param request: request object 56 | :return: none 57 | """ 58 | raise NotImplementedError("subclass must provide getType method") 59 | -------------------------------------------------------------------------------- /datahub/batch/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | -------------------------------------------------------------------------------- /datahub/batch/batch_binary_record.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | import crcmod.predefined 22 | from .binary_record import BinaryRecord 23 | from .batch_header import BatchHeader, BATCH_HEAD_SIZE 24 | from ..models.compress import * 25 | from ..exceptions import DatahubException, InvalidParameterException 26 | 27 | 28 | class BatchBinaryRecord: 29 | """ 30 | Batch binary record 31 | """ 32 | def __init__(self, records=None): 33 | self._version = None 34 | self._length = None 35 | self._raw_size = None 36 | self._crc32 = None 37 | self._attributes = None 38 | self._record_count = None 39 | 40 | self._records = records if records else [] # list of BinaryRecord 41 | self._buffer = bytes() 42 | 43 | def add_record(self, record): 44 | if not record or not isinstance(record, BinaryRecord): 45 | raise InvalidParameterException("Add record fail. 
record must be a valid BinaryRecord instance") 46 | self._records.append(record) 47 | 48 | def serialize(self, compress_type=None): 49 | try: 50 | # Add BinaryRecord list 51 | for record in self._records: 52 | record_byte = record.serialize() 53 | self._buffer += record_byte 54 | 55 | # compress 56 | self.__compress(compress_type) 57 | 58 | crc32c = crcmod.predefined.mkCrcFun('crc-32c') 59 | self._crc32 = crc32c(self._buffer) & 0xffffffff 60 | self._version = 0 61 | self._record_count = len(self._records) 62 | 63 | # Add Batch header 64 | header_byte = BatchHeader.serialize( 65 | self._version, 66 | self._length, 67 | self._raw_size, 68 | self._crc32, 69 | self._attributes, 70 | self._record_count 71 | ) 72 | if len(header_byte) != BATCH_HEAD_SIZE: 73 | raise DatahubException("Batch header size should be {}, it is {}".format(BATCH_HEAD_SIZE, len(header_byte))) 74 | return header_byte + self._buffer 75 | except Exception as e: 76 | raise DatahubException("Serialize batch record fail. {}".format(e)) 77 | 78 | def __compress(self, compress_type=None): 79 | self._raw_size = len(self._buffer) 80 | self._length = self._raw_size + BATCH_HEAD_SIZE 81 | 82 | try: 83 | data_compressor = get_compressor(compress_type) 84 | compress_data = data_compressor.compress(self._buffer) 85 | 86 | if len(compress_data) < self._raw_size: 87 | self._attributes = compress_type.get_index() | 8 88 | self._buffer = compress_data 89 | self._length = BATCH_HEAD_SIZE + len(compress_data) 90 | else: 91 | self._attributes = CompressFormat.NONE.get_index() | 8 92 | except Exception as e: 93 | raise DatahubException("Compress data fail. {}".format(e)) 94 | 95 | @property 96 | def records(self): 97 | return self._records 98 | 99 | @property 100 | def buffer(self): 101 | return self._buffer 102 | 103 | @buffer.setter 104 | def buffer(self, buffer): 105 | self._buffer = buffer 106 | -------------------------------------------------------------------------------- /datahub/batch/batch_header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
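# Wire layout of the 26-byte batch header produced by serialize() below
# (all integers little-endian):
#   magic "DHUB" (4 bytes) | version int32 | length uint32 | raw_size uint32 |
#   crc32 uint32 | attributes uint16 | record_count uint32
# `length` covers the header plus the (possibly compressed) record payload,
# `raw_size` is the uncompressed payload size, and `crc32` is a CRC-32C over the
# payload as written.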
19 | 20 | 21 | from .utils import * 22 | from ..utils.converters import to_binary, to_text 23 | from ..exceptions import DatahubException, InvalidParameterException 24 | 25 | MAGIC_NUMBER = "DHUB" 26 | BATCH_HEAD_SIZE = 26 27 | 28 | 29 | class BatchHeader: 30 | """ 31 | Batch binary header 32 | """ 33 | 34 | def __init__(self, version=None, length=None, raw_size=None, crc32=None, attributes=None, record_count=None): 35 | self._magic = MAGIC_NUMBER 36 | self._version = version 37 | self._length = length 38 | self._raw_size = raw_size 39 | self._crc32 = crc32 40 | self._attributes = attributes 41 | self._record_count = record_count 42 | 43 | @staticmethod 44 | def serialize(version, length, raw_size, crc32, attributes, record_count): 45 | header = bytes() 46 | header += to_binary(MAGIC_NUMBER) 47 | header += int2byte(version, size=4) 48 | header += int2byte(length, size=4, unsigned=True) 49 | header += int2byte(raw_size, size=4, unsigned=True) 50 | header += int2byte(crc32, size=4, unsigned=True) 51 | header += int2byte(attributes, size=2, unsigned=True) # short 52 | header += int2byte(record_count, size=4, unsigned=True) 53 | return header 54 | 55 | @staticmethod 56 | def deserialize(header): 57 | if len(header) != BATCH_HEAD_SIZE: 58 | raise DatahubException("Batch header length is {}, should be {}".format(len(header), BATCH_HEAD_SIZE)) 59 | if to_text(header[:4]) != MAGIC_NUMBER: 60 | raise InvalidParameterException("Error. Batch header should start with ", MAGIC_NUMBER) 61 | 62 | return BatchHeader( 63 | byte2int(header[4:8]), 64 | byte2int(header[8:12], unsigned=True), 65 | byte2int(header[12:16], unsigned=True), 66 | byte2int(header[16:20], unsigned=True), 67 | byte2int(header[20:22], size=2, unsigned=True), 68 | byte2int(header[22:26], unsigned=True) 69 | ) 70 | 71 | @property 72 | def magic(self): 73 | return self._magic 74 | 75 | @property 76 | def version(self): 77 | return self._version 78 | 79 | @version.setter 80 | def version(self, version): 81 | self._version = version 82 | 83 | @property 84 | def length(self): 85 | return self._length 86 | 87 | @length.setter 88 | def length(self, length): 89 | self._length = length 90 | 91 | @property 92 | def raw_size(self): 93 | return self._raw_size 94 | 95 | @raw_size.setter 96 | def raw_size(self, raw_size): 97 | self._raw_size = raw_size 98 | 99 | @property 100 | def crc32(self): 101 | return self._crc32 102 | 103 | @crc32.setter 104 | def crc32(self, crc32): 105 | self._crc32 = crc32 106 | 107 | @property 108 | def attributes(self): 109 | return self._attributes 110 | 111 | @attributes.setter 112 | def attributes(self, attributes): 113 | self._attributes = attributes 114 | 115 | @property 116 | def record_count(self): 117 | return self._record_count 118 | 119 | @record_count.setter 120 | def record_count(self, record_count): 121 | self._record_count = record_count 122 | -------------------------------------------------------------------------------- /datahub/batch/record_header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | from .utils import * 22 | from ..exceptions import DatahubException 23 | 24 | RECORD_HEADER_SIZE = 16 25 | 26 | 27 | class RecordHeader: 28 | def __init__(self, encode_type=None, schema_version=None, total_size=None, attr_offset=None): 29 | self._encode_type = encode_type 30 | self._schema_version = schema_version 31 | self._total_size = total_size 32 | self._attr_offset = attr_offset 33 | 34 | @staticmethod 35 | def serialize(encode_type, schema_version, total_size, attr_offset): 36 | header = bytes() 37 | header += int2byte(encode_type) 38 | header += int2byte(schema_version) 39 | header += int2byte(total_size, unsigned=True) 40 | header += int2byte(attr_offset, unsigned=True) 41 | return header 42 | 43 | @staticmethod 44 | def deserialize(header): 45 | if len(header) != RECORD_HEADER_SIZE: 46 | raise DatahubException("Record header length should be {}".format(RECORD_HEADER_SIZE)) 47 | return RecordHeader( 48 | byte2int(header[:4]), 49 | byte2int(header[4:8]), 50 | byte2int(header[8:12]), 51 | byte2int(header[12:16]) 52 | ) 53 | 54 | @property 55 | def encode_type(self): 56 | return self._encode_type 57 | 58 | @encode_type.setter 59 | def encode_type(self, encode_type): 60 | self._encode_type = encode_type 61 | 62 | @property 63 | def schema_version(self): 64 | return self._schema_version 65 | 66 | @schema_version.setter 67 | def schema_version(self, schema_version): 68 | self._schema_version = schema_version 69 | 70 | @property 71 | def total_size(self): 72 | return self._total_size 73 | 74 | @total_size.setter 75 | def total_size(self, total_size): 76 | self._total_size = total_size 77 | 78 | @property 79 | def attr_offset(self): 80 | return self._attr_offset 81 | 82 | @attr_offset.setter 83 | def attr_offset(self, attr_offset): 84 | self._attr_offset = attr_offset 85 | -------------------------------------------------------------------------------- /datahub/batch/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
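# Fixed-width (de)serialization helpers used by the batch and record header code:
# integers are packed little-endian ("<" struct prefix), with `unsigned` selecting
# the B/H/I/Q format codes instead of b/h/i/q.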
19 | 20 | 21 | import struct 22 | 23 | PADDING_BYTES = b'\x00' 24 | 25 | def int2byte(input_int, size=4, unsigned=False): 26 | if size == 1: 27 | return struct.pack("<{}".format("B" if unsigned else "b"), input_int) 28 | if size == 2: 29 | return struct.pack("<{}".format("H" if unsigned else "h"), input_int) 30 | if size == 4: 31 | return struct.pack("<{}".format("I" if unsigned else "i"), input_int) 32 | if size == 8: 33 | return struct.pack("<{}".format("Q" if unsigned else "q"), input_int) 34 | return None 35 | 36 | 37 | def byte2int(input_byte, size=4, unsigned=False): 38 | if size == 1: 39 | return struct.unpack("<{}".format("B" if unsigned else "b"), input_byte)[0] 40 | if size == 2: 41 | return struct.unpack("<{}".format("H" if unsigned else "h"), input_byte)[0] 42 | if size == 4: 43 | return struct.unpack("<{}".format("I" if unsigned else "i"), input_byte)[0] 44 | if size == 8: 45 | return struct.unpack("<{}".format("Q" if unsigned else "q"), input_byte)[0] 46 | return None 47 | 48 | 49 | def float2byte(input_float): 50 | return struct.pack(" client 28 | _datahub_lock = threading.Lock() 29 | 30 | @staticmethod 31 | def create_datahub_client(datahub_config): 32 | key = "{}:{}:{}:{}:{}".format(datahub_config.endpoint, datahub_config.access_id, datahub_config.access_key, 33 | datahub_config.protocol_type.value, datahub_config.compress_format.value) 34 | if key not in DatahubFactory._datahub_client_pool: 35 | with DatahubFactory._datahub_lock: 36 | if key not in DatahubFactory._datahub_client_pool: 37 | DatahubFactory._datahub_client_pool[key] = DataHub( 38 | access_id=datahub_config.access_id, 39 | access_key=datahub_config.access_key, 40 | endpoint=datahub_config.endpoint, 41 | protocol_type=datahub_config.protocol_type, 42 | compress_format=datahub_config.compress_format, 43 | use_client=True 44 | ) 45 | return DatahubFactory._datahub_client_pool.get(key) 46 | -------------------------------------------------------------------------------- /datahub/client/common/library_factory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
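# Process-wide cache of MetaData objects keyed by endpoint:project:topic:sub_id.
# get_meta_data() uses double-checked locking so that coordinators for the same
# subscription share a single MetaData instance; remove_meta_data() closes and
# evicts it once the last coordinator has unregistered.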
19 | 20 | 21 | import abc 22 | import threading 23 | from .meta_data import MetaData 24 | 25 | 26 | class LibraryFactory(metaclass=abc.ABCMeta): 27 | 28 | _meta_data_pool = dict() 29 | _meta_data_lock = threading.Lock() 30 | 31 | @staticmethod 32 | def get_meta_data(coordinator, common_config): 33 | key = "{}:{}:{}:{}".format(coordinator.endpoint, coordinator.project_name, 34 | coordinator.topic_name, coordinator.sub_id) 35 | if key not in LibraryFactory._meta_data_pool: 36 | with LibraryFactory._meta_data_lock: 37 | if key not in LibraryFactory._meta_data_pool: 38 | LibraryFactory._meta_data_pool[key] = MetaData(key, coordinator.project_name, coordinator.topic_name, coordinator.sub_id, common_config) 39 | meta_data = LibraryFactory._meta_data_pool.get(key) 40 | meta_data.register(coordinator) 41 | return meta_data 42 | 43 | @staticmethod 44 | def remove_meta_data(coordinator): 45 | meta_data = coordinator.meta_data 46 | with LibraryFactory._meta_data_lock: 47 | if meta_data.unregister(coordinator) <= 0: 48 | meta_data.close() 49 | LibraryFactory._meta_data_pool.pop(meta_data.class_key) 50 | -------------------------------------------------------------------------------- /datahub/client/common/offset_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
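# OffsetMeta carries the subscription session state (version_id / session_id);
# ConsumeOffset extends it with the per-shard read position (sequence, timestamp,
# batch_index) and the next cursor to fetch from.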
19 | 20 | 21 | class OffsetMeta: 22 | 23 | def __init__(self, version_id, session_id): 24 | self._version_id = version_id 25 | self._session_id = session_id 26 | 27 | @property 28 | def version_id(self): 29 | return self._version_id 30 | 31 | @version_id.setter 32 | def version_id(self, value): 33 | self._version_id = value 34 | 35 | @property 36 | def session_id(self): 37 | return self._session_id 38 | 39 | @session_id.setter 40 | def session_id(self, value): 41 | self._session_id = value 42 | 43 | 44 | class ConsumeOffset(OffsetMeta): 45 | def __init__(self, sequence, timestamp, batch_index=0, version_id=-1, session_id=""): 46 | super().__init__(version_id, session_id) 47 | self._sequence = sequence 48 | self._timestamp = timestamp 49 | self._batch_index = batch_index 50 | self._next_cursor = "" 51 | 52 | def reset_timestamp(self, timestamp): 53 | self._next_cursor = None 54 | self._sequence = -1 55 | self._batch_index = 0 56 | self._timestamp = timestamp 57 | 58 | def to_string(self): 59 | return "({}:{}:{})".format(self._sequence, self._timestamp, self._batch_index) 60 | 61 | @property 62 | def next_cursor(self): 63 | return self._next_cursor 64 | 65 | @next_cursor.setter 66 | def next_cursor(self, value): 67 | self._next_cursor = value 68 | 69 | @property 70 | def sequence(self): 71 | return self._sequence 72 | 73 | @property 74 | def timestamp(self): 75 | return self._timestamp 76 | 77 | @property 78 | def batch_index(self): 79 | return self._batch_index 80 | -------------------------------------------------------------------------------- /datahub/client/common/shard_coordinator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
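# Base shard coordinator: it registers itself with the pooled MetaData obtained
# from LibraryFactory, tracks any user-assigned shard list, and forwards shard
# add/remove events to the callbacks registered via register_shard_change() and
# register_remove_all_shards().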
19 | 20 | 21 | import logging 22 | from datahub.exceptions import DatahubException 23 | from .library_factory import LibraryFactory 24 | 25 | 26 | class ShardCoordinator: 27 | 28 | def __init__(self, project_name, topic_name, sub_id, common_config): 29 | self._closed = False 30 | self._logger = logging.getLogger(ShardCoordinator.__name__) 31 | 32 | self._endpoint = common_config.endpoint 33 | self._project_name = project_name 34 | self._topic_name = topic_name 35 | self._sub_id = sub_id 36 | self._uniq_key = None 37 | self._gen_uniq_key() 38 | 39 | self._meta_data = LibraryFactory.get_meta_data(self, common_config) 40 | self._assign_shard_list = [] 41 | self._shard_change = None 42 | self._remove_all_shards = None 43 | 44 | def close(self): 45 | self._closed = True 46 | LibraryFactory.remove_meta_data(self) 47 | 48 | def update_shard_info(self): 49 | if self._closed: 50 | self._logger.warning("ShardCoordinator closed when update shard info. key: {}".format(self._uniq_key)) 51 | raise DatahubException("ShardCoordinator closed when update shard info") 52 | 53 | self._meta_data.update_shard_meta() 54 | 55 | def register_shard_change(self, shard_change_callback): 56 | self._shard_change = shard_change_callback 57 | 58 | def register_remove_all_shards(self, remove_all_shards_callback): 59 | self._remove_all_shards = remove_all_shards_callback 60 | 61 | def on_shard_meta_change(self, add_shards, del_shards): 62 | if not self.is_user_shard_assign(): 63 | self._do_shard_change(add_shards, del_shards) 64 | 65 | def is_user_shard_assign(self): 66 | return len(self._assign_shard_list) > 0 67 | 68 | @property 69 | def assign_shard_list(self): 70 | return self._assign_shard_list 71 | 72 | @assign_shard_list.setter 73 | def assign_shard_list(self, value): 74 | self._assign_shard_list = value 75 | 76 | @property 77 | def endpoint(self): 78 | return self._endpoint 79 | 80 | @property 81 | def project_name(self): 82 | return self._project_name 83 | 84 | @property 85 | def topic_name(self): 86 | return self._topic_name 87 | 88 | @property 89 | def sub_id(self): 90 | return self._sub_id 91 | 92 | @property 93 | def meta_data(self): 94 | return self._meta_data 95 | 96 | @property 97 | def uniq_key(self): 98 | return self._uniq_key 99 | 100 | def _do_shard_change(self, add_shards, del_shards): 101 | if self._closed: 102 | self._logger.warning("ShardCoordinator closed when shard change. key: {}".format(self._uniq_key)) 103 | raise DatahubException("ShardCoordinator closed when shard change") 104 | 105 | if self._shard_change and ((add_shards and len(add_shards) != 0) or (del_shards and len(del_shards) != 0)): 106 | self._shard_change(add_shards, del_shards) 107 | 108 | def _do_remove_all_shards(self): 109 | if self._closed: 110 | self._logger.warning("ShardCoordinator closed when remove all shards. 
key: {}".format(self._uniq_key)) 111 | raise DatahubException("ShardCoordinator closed when remove all shards") 112 | 113 | if self._remove_all_shards: 114 | self._remove_all_shards() 115 | 116 | def _gen_uniq_key(self, suffix=None): 117 | if not self._uniq_key: 118 | self._uniq_key = "{}:{}".format(self._project_name, self._topic_name) 119 | if self._sub_id: 120 | self._uniq_key += (":" + self._sub_id) 121 | 122 | if suffix: 123 | self._uniq_key += (":" + suffix) 124 | -------------------------------------------------------------------------------- /datahub/client/common/timer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | import time 22 | import threading 23 | 24 | 25 | class Timer: 26 | 27 | def __init__(self, timeout): 28 | self._timeout = 0 29 | self._start_time = 0 30 | self._deadline_time = 0 31 | self._condition = threading.Condition() 32 | self.reset(timeout) 33 | 34 | def reset(self, timeout=None): 35 | if timeout: 36 | self._timeout = timeout 37 | self._start_time = Timer.get_curr_time() 38 | self._deadline_time = self._start_time + self._timeout 39 | 40 | def reset_deadline(self): 41 | self._deadline_time = 0 42 | 43 | def is_expired(self, curr_time=None): 44 | if curr_time is None: 45 | return Timer.get_curr_time() > self._deadline_time 46 | return curr_time > self._deadline_time 47 | 48 | def elapse(self): 49 | return Timer.get_curr_time() - self._start_time 50 | 51 | def wait_expire(self, diff=None): 52 | if diff is None: 53 | diff = self._deadline_time - Timer.get_curr_time() 54 | else: 55 | diff = min(diff, self._deadline_time - Timer.get_curr_time()) 56 | if diff > 0: 57 | with self._condition: 58 | self._condition.wait(diff) 59 | 60 | def notify_all(self): 61 | with self._condition: 62 | self._condition.notify_all() 63 | 64 | def notify_one(self): 65 | with self._condition: 66 | self._condition.notify() 67 | 68 | @property 69 | def timeout(self): 70 | return self._timeout 71 | 72 | @property 73 | def start_time(self): 74 | return self._start_time 75 | 76 | @property 77 | def deadline_time(self): 78 | return self._deadline_time 79 | 80 | @staticmethod 81 | def get_curr_time(): 82 | return time.time() 83 | -------------------------------------------------------------------------------- /datahub/client/consumer/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | -------------------------------------------------------------------------------- /datahub/client/consumer/datahub_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | import logging 22 | from .offset_coordinator import OffsetCoordinator 23 | from .consumer_coordinator import ConsumerCoordinator 24 | from .shard_group_reader import ShardGroupReader 25 | from ..common.config import Utils 26 | 27 | 28 | class DatahubConsumer: 29 | """ 30 | Consumer client for datahub 31 | 32 | Members: 33 | project_name (:class:`string`): project name 34 | 35 | topic_name (:class:`string`): topic name 36 | 37 | sub_id (:class:`string`): subscription id for consume 38 | 39 | consumer_config (:class:`datahub.client.common.ConsumerConfig`): config for consumer client 40 | 41 | shard_ids (:class:`list`): list of `string`: shard list you want to consume. 42 | default is None, means allocated automatically by datahub server 43 | 44 | timestamp (:class:`int`): set the start timestamp for consume. 
45 | default is -1, means start with the subscription offset 46 | """ 47 | 48 | def __init__(self, project_name, topic_name, sub_id, consumer_config, shard_ids=None, timestamp=-1): 49 | logging.basicConfig(filename=consumer_config.logging_filename, filemode="a", 50 | level=consumer_config.logging_level, format=Utils.FORMAT, datefmt=Utils.DATE_FMT) 51 | 52 | if shard_ids: 53 | # 指定shard消费 54 | self._coordinator = OffsetCoordinator(project_name, topic_name, sub_id, consumer_config) 55 | else: 56 | # 协同消费 57 | self._coordinator = ConsumerCoordinator(project_name, topic_name, sub_id, consumer_config) 58 | 59 | try: 60 | self._group_reader = ShardGroupReader(self._coordinator, shard_ids, timestamp) 61 | except Exception as e: 62 | self._coordinator.close() 63 | raise e 64 | 65 | def close(self): 66 | self._group_reader.close() 67 | self._coordinator.close() 68 | 69 | def read(self, shard_id=None, timeout=60): 70 | return self._group_reader.read(shard_id, timeout) 71 | -------------------------------------------------------------------------------- /datahub/client/consumer/message_key.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | import atomic 22 | 23 | 24 | class MessageKey: 25 | 26 | def __init__(self, shard_id, offset): 27 | self._ready = atomic.AtomicLong(0) 28 | self._shard_id = shard_id 29 | self._offset = offset 30 | 31 | def ack(self): 32 | self._ready.get_and_set(1) 33 | 34 | def is_ready(self): 35 | return self._ready.value != 0 36 | 37 | def to_string(self): 38 | return "({}@{}:{}:{})".format(self._shard_id, self._offset.sequence, self._offset.timestamp, self._offset.batch_index) 39 | 40 | @property 41 | def offset(self): 42 | return self._offset 43 | 44 | @property 45 | def shard_id(self): 46 | return self._shard_id 47 | -------------------------------------------------------------------------------- /datahub/client/consumer/offset_coordinator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | import atomic 22 | from datahub.exceptions import DatahubException 23 | from .offset_manager import OffsetManager 24 | from ..common.offset_meta import ConsumeOffset 25 | from ..common.shard_coordinator import ShardCoordinator 26 | 27 | 28 | class OffsetCoordinator(ShardCoordinator): 29 | 30 | def __init__(self, project_name, topic_name, sub_id, consumer_config): 31 | super().__init__(project_name, topic_name, sub_id, consumer_config) 32 | self._sub_session_changed = False 33 | self._sub_offline = False 34 | self._sub_deleted = False 35 | self._offset_not_ack = False 36 | self._offset_reset = atomic.AtomicLong(0) 37 | 38 | self._auto_ack_offset = consumer_config.auto_ack_offset 39 | self._max_record_buffer_size = consumer_config.max_record_buffer_size 40 | self._fetch_limit = consumer_config.fetch_limit 41 | 42 | self._offset_manager = OffsetManager(self) 43 | 44 | def close(self): 45 | super().close() 46 | self._offset_manager.close() 47 | self._logger.info("OffsetCoordinator close success. key: {}".format(self._uniq_key)) 48 | 49 | def update_shard_info(self): 50 | if self._sub_deleted: 51 | raise DatahubException("Subscription has been deleted. key: {}".format(self._uniq_key)) 52 | if self._sub_session_changed: 53 | raise DatahubException("Subscription session has changed. key: {}".format(self._uniq_key)) 54 | if self._sub_offline: 55 | raise DatahubException("Subscription offline. key: {}".format(self._uniq_key)) 56 | if self._offset_not_ack: 57 | raise DatahubException("Offset has not been updated for a long time. key: {}".format(self._uniq_key)) 58 | super().update_shard_info() 59 | 60 | def on_shard_read_end(self, shard_ids): 61 | self._do_shard_change(None, shard_ids) 62 | 63 | def on_offset_reset(self): 64 | if self._offset_reset.compare_and_set(0, 1): 65 | if self.is_user_shard_assign(): 66 | self._do_shard_change(self._assign_shard_list, self._assign_shard_list) 67 | self._offset_reset.compare_and_set(1, 0) 68 | 69 | def waiting_shard_assign(self): 70 | return False 71 | 72 | def init_and_get_offset(self, shard_ids): 73 | client = self._meta_data.datahub_client 74 | try: 75 | init_result = client.init_and_get_subscription_offset(self._project_name, self._topic_name, self._sub_id, shard_ids) 76 | consume_offset_map = dict() 77 | for shard_id, offset in init_result.offsets.items(): 78 | consume_offset_map[shard_id] = ConsumeOffset( 79 | sequence=offset.sequence if offset.sequence < 0 else offset.sequence + 1, 80 | timestamp=offset.timestamp, 81 | batch_index=offset.batch_index, 82 | version_id=offset.version, 83 | session_id=offset.session_id 84 | ) 85 | self._logger.info("Init and get offset once success. key: {}, shard_id: {}, offset: {}".format(self._uniq_key, shard_id, offset)) 86 | self._offset_manager.set_offset_meta(consume_offset_map) 87 | return consume_offset_map 88 | except DatahubException as e: 89 | self._logger.warning("Init and get subscription offset fail. 
key: {}, {}".format(self._uniq_key, e)) 90 | raise e 91 | 92 | def send_record_offset(self, message_key): 93 | self._offset_manager.send_record_offset(message_key) 94 | 95 | def on_sub_offline(self): 96 | self._sub_offline = True 97 | 98 | def on_sub_session_changed(self): 99 | self._sub_session_changed = True 100 | 101 | def on_sub_deleted(self): 102 | self._sub_deleted = True 103 | 104 | def on_offset_not_ack(self): 105 | self._offset_not_ack = True 106 | 107 | @property 108 | def auto_ack_offset(self): 109 | return self._auto_ack_offset 110 | 111 | @property 112 | def fetch_limit(self): 113 | return self._fetch_limit 114 | 115 | @property 116 | def max_record_buffer_size(self): 117 | return self._max_record_buffer_size 118 | -------------------------------------------------------------------------------- /datahub/client/consumer/offset_select_strategy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | from ..common.timer import Timer 22 | from ..common.constant import Constant 23 | 24 | 25 | class OffsetSelectStrategy: 26 | 27 | def __init__(self): 28 | self._timer = Timer(Constant.READER_SELECT_EMPTY_SHARD_TIMEOUT) 29 | self._offset_map = dict() 30 | self._empty_shards = set() 31 | 32 | def add_shard(self, shard_id): 33 | self._offset_map[shard_id] = -1 34 | if shard_id in self._empty_shards: 35 | self._empty_shards.remove(shard_id) 36 | 37 | def remove_shard(self, shard_id): 38 | if shard_id in set(self._offset_map.keys()): 39 | self._offset_map.pop(shard_id) 40 | if shard_id in set(self._empty_shards): 41 | self._empty_shards.add(shard_id) 42 | 43 | def after_read(self, shard_id, record): 44 | if not record: 45 | self._empty_shards.add(shard_id) 46 | else: 47 | self._offset_map[shard_id] = record.system_time 48 | 49 | def get_next_shard(self): 50 | shard_id = self.__find_oldest_shard() 51 | 52 | if shard_id is None or self._timer.is_expired(): 53 | self._empty_shards.clear() 54 | self._timer.reset() 55 | return shard_id 56 | 57 | def __find_oldest_shard(self): 58 | if len(self._offset_map) == 0: 59 | return None 60 | oldest_timestamp = -1 61 | oldest_shard_id = None 62 | for shard_id, timestamp in self._offset_map.items(): 63 | if shard_id in self._empty_shards: 64 | continue 65 | if oldest_shard_id is None or timestamp < oldest_timestamp: 66 | oldest_timestamp = timestamp 67 | oldest_shard_id = shard_id 68 | return oldest_shard_id 69 | -------------------------------------------------------------------------------- /datahub/client/consumer/sync_group_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | class SyncGroupMeta: 22 | 23 | def __init__(self): 24 | self._release_shards = set() 25 | self._read_end_shards = set() 26 | self._active_shards = set() 27 | 28 | def on_shard_release(self, shards): 29 | for shard in shards: 30 | if shard in self._read_end_shards: 31 | self._read_end_shards.remove(shard) 32 | self._active_shards.remove(shard) 33 | if shard not in self._release_shards: 34 | self._release_shards.add(shard) 35 | 36 | def on_shard_read_end(self, shards): 37 | for shard in shards: 38 | if shard not in self._read_end_shards: 39 | self._read_end_shards.add(shard) 40 | if shard in self._release_shards: 41 | self._release_shards.remove(shard) 42 | 43 | def need_sync_group(self): 44 | return len(self._release_shards) != 0 or len(self._read_end_shards) != 0 45 | 46 | def clear_shard_release(self): 47 | self._release_shards.clear() 48 | 49 | def on_heartbeat_done(self, shardIds): 50 | for shard in shardIds: 51 | self._active_shards.add(shard) 52 | 53 | def get_valid_shards(self): 54 | return self._active_shards.difference(self._read_end_shards) 55 | 56 | @property 57 | def release_shards(self): 58 | return self._release_shards 59 | 60 | @property 61 | def read_end_shards(self): 62 | return self._read_end_shards 63 | -------------------------------------------------------------------------------- /datahub/client/producer/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | -------------------------------------------------------------------------------- /datahub/client/producer/datahub_producer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | import logging 22 | from .shard_group_writer import ShardGroupWriter 23 | from ..common.config import Utils 24 | from ..common.shard_coordinator import ShardCoordinator 25 | 26 | 27 | class DatahubProducer: 28 | """ 29 | Producer client for datahub 30 | 31 | Members: 32 | project_name (:class:`string`): project name 33 | 34 | topic_name (:class:`string`): topic name 35 | 36 | producer_config (:class:`datahub.client.common.ProducerConfig`): config for producer client 37 | 38 | shard_ids (:class:`list`): list of `string`: shard list you want to producer. 39 | default is None, means write to all shards evenly 40 | """ 41 | 42 | def __init__(self, project_name, topic_name, producer_config, shard_ids=None): 43 | logging.basicConfig(filename=producer_config.logging_filename, filemode="a", 44 | level=producer_config.logging_level, format=Utils.FORMAT, datefmt=Utils.DATE_FMT) 45 | 46 | self._coordinator = ShardCoordinator(project_name, topic_name, "", producer_config) 47 | 48 | try: 49 | self._group_writer = ShardGroupWriter(self._coordinator, shard_ids, producer_config) 50 | except Exception as e: 51 | self._coordinator.close() 52 | raise e 53 | 54 | def close(self): 55 | self._group_writer.close() 56 | self._coordinator.close() 57 | 58 | def write(self, records): 59 | return self._group_writer.write(records) 60 | 61 | def write_async(self, records): 62 | return self._group_writer.write_async(records) 63 | 64 | def flush(self): 65 | self._group_writer.flush() 66 | 67 | @property 68 | def topic_meta(self): 69 | return self._coordinator.meta_data.topic_meta 70 | -------------------------------------------------------------------------------- /datahub/client/producer/message_writer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | import logging 22 | from datahub.exceptions import DatahubException 23 | from ..common.thread_pool import HashThreadPool 24 | 25 | 26 | class MessageWriter: 27 | 28 | def __init__(self, meta_data, queue_limit_num, threads_num): 29 | self._meta_data = meta_data 30 | self._logger = logging.getLogger(MessageWriter.__name__) 31 | self._executor = HashThreadPool(queue_limit_num, threads_num, "MessageWriter") 32 | 33 | def close(self): 34 | self._executor.shutdown() 35 | 36 | def empty(self, key): 37 | return self._executor.empty(key) 38 | 39 | def send_task(self, key, task, *args, **kwargs): 40 | return self._executor.submit(key, task, *args, **kwargs) 41 | 42 | def put_record(self, records): 43 | topic_meta = self._meta_data.topic_meta 44 | datahub_client = self._meta_data.datahub_client 45 | 46 | try: 47 | datahub_client.put_records(topic_meta.project_name, topic_meta.topic_name, records) 48 | except DatahubException as e: 49 | self._logger.warning("Put records fail. records count: {}, DatahubException: {}".format(len(records), e)) 50 | raise e 51 | except Exception as e: 52 | self._logger.warning("Put records fail. records count: {}, {}".format(len(records), e)) 53 | raise e 54 | 55 | def put_record_by_shard(self, shard_id, records): 56 | topic_meta = self._meta_data.topic_meta 57 | datahub_client = self._meta_data.datahub_client 58 | 59 | try: 60 | datahub_client.put_records_by_shard(topic_meta.project_name, topic_meta.topic_name, shard_id, records) 61 | except DatahubException as e: 62 | self._logger.warning("Put records by shard fail. shard_id: {}, records count: {}, DatahubException: {}".format(shard_id, len(records), e)) 63 | raise e 64 | except Exception as e: 65 | self._logger.warning("Put records by shard fail. shard_id: {}, records count: {}, {}".format(shard_id, len(records), e)) 66 | raise e 67 | -------------------------------------------------------------------------------- /datahub/client/producer/record_pack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | import time 22 | from concurrent.futures import Future 23 | 24 | 25 | class RecordPack: 26 | 27 | def __init__(self, max_buffer_size, max_buffer_record_count, max_buffer_time): 28 | self._is_ready = False 29 | self._init_time = time.time() 30 | self._curr_size = 0 31 | self._curr_count = 0 32 | 33 | self._max_buffer_size = max_buffer_size 34 | self._max_buffer_record_count = max_buffer_record_count 35 | self._max_buffer_time = max_buffer_time 36 | 37 | self._records = [] 38 | self._write_result_futures = [] 39 | 40 | def is_ready(self): 41 | return self._is_ready or time.time() - self._init_time >= self._max_buffer_time 42 | 43 | def try_append(self, records): 44 | size = self.__get_total_records_size(records) 45 | if (self._curr_size + size < self._max_buffer_size and self._curr_count + len(records) <= self._max_buffer_record_count) or self._curr_count == 0: 46 | return self.__append_records(records, size) 47 | else: 48 | self._is_ready = True 49 | return None 50 | 51 | def __append_records(self, records, size): 52 | self._records += records 53 | self._curr_size += size 54 | self._curr_count += len(records) 55 | 56 | write_future = Future() 57 | self._write_result_futures.append(write_future) 58 | return write_future 59 | 60 | @property 61 | def init_time(self): 62 | return self._init_time 63 | 64 | @property 65 | def curr_size(self): 66 | return self._curr_size 67 | 68 | @property 69 | def curr_count(self): 70 | return self._curr_count 71 | 72 | @property 73 | def records(self): 74 | return self._records 75 | 76 | @property 77 | def write_result_futures(self): 78 | return self._write_result_futures 79 | 80 | def __get_total_records_size(self, records): 81 | return sum([len(record.encode_values()) for record in records]) 82 | -------------------------------------------------------------------------------- /datahub/client/producer/record_pack_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | import time 22 | import queue 23 | import threading 24 | from .record_pack import RecordPack 25 | 26 | 27 | class RecordPackQueue: 28 | 29 | def __init__(self, max_buffer_size, max_buffer_record_count, max_buffer_time, max_record_pack_queue_limit): 30 | self._lock = threading.Lock() 31 | self._last_obtain_time = time.time() 32 | self._max_buffer_size = max_buffer_size 33 | self._max_buffer_record_count = max_buffer_record_count 34 | self._max_buffer_time = max_buffer_time 35 | self._ready_record_packs = queue.Queue(max_record_pack_queue_limit) 36 | self._current_record_pack = None 37 | 38 | def flush(self): 39 | self.__merge_current_pack(True) 40 | 41 | def obtain_ready_record_pack(self): 42 | try: 43 | return self._ready_record_packs.get_nowait() 44 | except queue.Empty: 45 | return None 46 | 47 | def append_record(self, records): 48 | result = self.__try_append(records) 49 | 50 | with self._lock: 51 | self.__merge_current_pack(False) 52 | 53 | if result is None: 54 | with self._lock: 55 | self._current_record_pack = RecordPack(self._max_buffer_size, self._max_buffer_record_count, self._max_buffer_time) 56 | result = self._current_record_pack.try_append(records) 57 | return result 58 | 59 | def __try_append(self, records): 60 | with self._lock: 61 | return None if self._current_record_pack is None else self._current_record_pack.try_append(records) 62 | 63 | def __merge_current_pack(self, force): 64 | if self._current_record_pack is not None and (force or self._current_record_pack.is_ready()): 65 | self._ready_record_packs.put(self._current_record_pack) 66 | self._current_record_pack = None 67 | -------------------------------------------------------------------------------- /datahub/client/producer/write_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | class WriteResult: 22 | 23 | def __init__(self, shard_id, elapsed_time, send_time): 24 | self._shard_id = shard_id 25 | self._elapsed_time = elapsed_time 26 | self._send_time = send_time 27 | 28 | @property 29 | def shard_id(self): 30 | return self._shard_id 31 | 32 | @property 33 | def elapsed_time(self): 34 | return self._elapsed_time 35 | 36 | @property 37 | def send_time(self): 38 | return self._send_time 39 | -------------------------------------------------------------------------------- /datahub/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | from .schema import Field, RecordSchema, FieldType 21 | from .record import RecordType, Record, BlobRecord, TupleRecord, FailedRecord 22 | from .cursor import CursorType 23 | from .compress import CompressFormat 24 | from .shard import ShardState, Shard, ShardContext, ShardBase 25 | from .connector import ConnectorConfig, ConnectorShardStatus, AuthMode, ConnectorState, PartitionMode, \ 26 | OdpsConnectorConfig, DatabaseConnectorConfig, EsConnectorConfig, FcConnectorConfig, OssConnectorConfig, \ 27 | OtsConnectorConfig, HologresConnectorConfig, ConnectorType, ConnectorOffset 28 | from .subscription import OffsetBase, OffsetWithVersion, OffsetWithSession, SubscriptionState 29 | from .params import * 30 | from .results import * 31 | -------------------------------------------------------------------------------- /datahub/models/compress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | from __future__ import absolute_import 22 | 23 | import abc 24 | import struct 25 | import zlib 26 | from enum import Enum 27 | 28 | import lz4.block 29 | import six 30 | 31 | from ..exceptions import DatahubException 32 | 33 | 34 | class CompressFormat(Enum): 35 | """ 36 | CompressFormat enum class, there are: ``NONE``, ``LZ4``, ``ZLIB``, ``DEFLATE`` 37 | """ 38 | NONE = '' 39 | DEFLATE = 'deflate' 40 | LZ4 = 'lz4' 41 | ZLIB = 'zlib' 42 | 43 | def get_index(self): 44 | return { 45 | CompressFormat.NONE: 0, 46 | CompressFormat.DEFLATE: 1, 47 | CompressFormat.LZ4: 2, 48 | CompressFormat.ZLIB: 3 49 | }[self] 50 | 51 | @staticmethod 52 | def get_compress(index): 53 | return { 54 | 0: CompressFormat.NONE, 55 | 1: CompressFormat.DEFLATE, 56 | 2: CompressFormat.LZ4, 57 | 3: CompressFormat.ZLIB 58 | }[index] 59 | 60 | 61 | @six.add_metaclass(abc.ABCMeta) 62 | class Compressor(object): 63 | """ 64 | Abstract Compressor class 65 | """ 66 | 67 | @abc.abstractmethod 68 | def compress(self, data): 69 | pass 70 | 71 | @abc.abstractmethod 72 | def decompress(self, data, raw_size=-1): 73 | pass 74 | 75 | @abc.abstractmethod 76 | def compress_format(self): 77 | pass 78 | 79 | 80 | class NoneCompressor(Compressor): 81 | """ 82 | None compressor 83 | """ 84 | 85 | def compress(self, data): 86 | return data 87 | 88 | def decompress(self, data, raw_size=-1): 89 | return data 90 | 91 | def compress_format(self): 92 | return CompressFormat.NONE 93 | 94 | 95 | class Lz4Compressor(Compressor): 96 | """ 97 | Lz4 compressor 98 | """ 99 | 100 | def compress(self, data): 101 | return lz4.block.compress(data, store_size=False) 102 | 103 | def decompress(self, data, raw_size=-1): 104 | size_header = struct.pack('I', crc) + struct.pack('>I', len(pb_data)) + pb_data 46 | 47 | 48 | def unwrap_pb_frame(pb_frame): 49 | crc32c = crcmod.predefined.mkCrcFun('crc-32c') 50 | binary = to_binary(pb_frame) 51 | crc = binary[4:8] 52 | pb_str = pb_frame[12:] if six.PY3 else to_str(pb_frame[12:]) 53 | compute_crc = struct.pack('>I', crc32c(pb_str) & 0xffffffff) 54 | return crc, compute_crc, pb_str 55 | -------------------------------------------------------------------------------- /datahub/utils/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | 21 | class ErrorMessage(object): 22 | INVALID_PROJECT_NAME = 'project name should start with letter, only contains [a-zA-Z0-9_], 3 < length < 32' 23 | INVALID_TOPIC_NAME = 'topic name should start with letter, only contains [a-zA-Z0-9_], 1 < length < 128' 24 | MISSING_RECORD_SCHEMA = 'missing record schema for tuple record type' 25 | INVALID_RECORD_SCHEMA_TYPE = 'record schema parameter must be type of RecordSchema' 26 | MISSING_BLOB_RECORD_DATA = 'Blob Record blob data or values is missing' 27 | MISSING_TUPLE_RECORD_SCHEMA = 'TUPLE Record fields or schema is missing' 28 | MISSING_SYSTEM_TIME = 'get SYSTEM_TIME cursor must provide invalid system_time parameter' 29 | MISSING_SEQUENCE = 'get SEQUENCE cursor must provide invalid sequence parameter' 30 | WAIT_SHARD_TIMEOUT = 'wait shards ready timeout' 31 | PARAMETER_EMPTY = '%s could not be empty' 32 | PARAMETER_NOT_POSITIVE = '%s must be positive' 33 | PARAMETER_NEGATIVE = '%s could not be negative' 34 | INVALID_TYPE = '%s must be type of %s' 35 | -------------------------------------------------------------------------------- /datahub/utils/converters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | from __future__ import absolute_import, print_function 21 | 22 | import datetime 23 | 24 | import six 25 | 26 | GMT_FORMAT = '%a, %d %b %Y %H:%M:%S GMT' 27 | 28 | 29 | def indent(text, n_spaces): 30 | if n_spaces <= 0: 31 | return text 32 | block = ' ' * n_spaces 33 | return '\n'.join((block + it) if not it else it 34 | for it in text.split('\n')) 35 | 36 | 37 | def gen_rfc822_date(): 38 | date_str = datetime.datetime.utcnow().strftime(GMT_FORMAT) 39 | return date_str 40 | 41 | 42 | def to_binary(text, encoding='utf-8'): 43 | if text is None: 44 | return text 45 | if isinstance(text, six.text_type): 46 | return text.encode(encoding) 47 | elif isinstance(text, (six.binary_type, bytearray)): 48 | return bytes(text) 49 | return str(text).encode(encoding) if six.PY3 else str(text) 50 | 51 | 52 | def to_text(binary, encoding='utf-8'): 53 | if binary is None: 54 | return binary 55 | if isinstance(binary, (six.binary_type, bytearray)): 56 | return binary.decode(encoding) 57 | elif isinstance(binary, six.text_type): 58 | return binary 59 | return str(binary) if six.PY3 else str(binary).decode(encoding) 60 | 61 | 62 | def to_str(text, encoding='utf-8'): 63 | return to_text(text, encoding=encoding) if six.PY3 else to_binary(text, encoding=encoding) 64 | 65 | 66 | def bool_to_str(value): 67 | return to_str(None) if value is None else to_str(value).lower() 68 | -------------------------------------------------------------------------------- /datahub/utils/validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | from __future__ import absolute_import, print_function 21 | 22 | import re 23 | from functools import wraps 24 | 25 | import six 26 | from funcsigs import signature 27 | 28 | from . 
import ErrorMessage 29 | from ..exceptions import InvalidParameterException 30 | 31 | PROJECT_NAME_REGULAR_EXPRESSION = r'^[a-zA-Z]+[a-zA-Z0-9_]*' 32 | TOPIC_NAME_REGULAR_EXPRESSION = r'^[a-zA-Z]+[a-zA-Z0-9_]*' 33 | 34 | PROJECT_NAME_MIN_LENGTH = 3 35 | PROJECT_NAME_MAX_LENGTH = 32 36 | TOPIC_NAME_MIN_LENGTH = 1 37 | TOPIC_NAME_MAX_LENGTH = 128 38 | 39 | 40 | def is_valid_str(text, regular_expression, min_length, max_length): 41 | if not text or not isinstance(text, six.string_types): 42 | return False 43 | if len(text) < min_length or len(text) > max_length: 44 | return False 45 | pattern = re.compile(regular_expression) 46 | result = pattern.match(text) 47 | return result is not None and len(result.group()) == len(text) 48 | 49 | 50 | def check_project_name_valid(project_name): 51 | return is_valid_str(project_name, PROJECT_NAME_REGULAR_EXPRESSION, PROJECT_NAME_MIN_LENGTH, 52 | PROJECT_NAME_MAX_LENGTH) 53 | 54 | 55 | def check_topic_name_valid(topic_name): 56 | return is_valid_str(topic_name, TOPIC_NAME_REGULAR_EXPRESSION, TOPIC_NAME_MIN_LENGTH, TOPIC_NAME_MAX_LENGTH) 57 | 58 | 59 | def check_empty(variable): 60 | return variable is None or not variable 61 | 62 | 63 | def check_type(variable, *args): 64 | for data_type in args: 65 | if isinstance(variable, data_type): 66 | return True 67 | return False 68 | 69 | 70 | def check_negative(variable): 71 | return variable < 0 72 | 73 | 74 | def check_positive(variable): 75 | return variable > 0 76 | 77 | 78 | def type_assert(*type_args, **type_kwargs): 79 | def decorate(func): 80 | signature_ = signature(func) 81 | bound_types = signature_.bind_partial(*type_args, **type_kwargs).arguments 82 | 83 | @wraps(func) 84 | def wrapper(*args, **kwargs): 85 | bound_values = signature_.bind(*args, **kwargs) 86 | # Enforce type assertions across supplied arguments 87 | for name, value in bound_values.arguments.items(): 88 | if name in bound_types: 89 | # pre check for special argument 90 | if name == 'cursor' and type(value).__name__ == 'GetCursorResult': 91 | raise InvalidParameterException('param cursor should be type of str, get the cursor from ' 92 | 'getCursorResult') 93 | 94 | if not isinstance(value, bound_types[name]): 95 | # for python 2.7 96 | if not (type(value).__name__ == "unicode" and (str in bound_types[name] if isinstance(bound_types[name], tuple) else bound_types[name] == str)): 97 | raise InvalidParameterException(ErrorMessage.INVALID_TYPE % 98 | (name, bound_types[name].__name__) 99 | + ", input is: " + type(value).__name__) 100 | return func(*args, **kwargs) 101 | 102 | return wrapper 103 | 104 | return decorate 105 | -------------------------------------------------------------------------------- /datahub/version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | """ version 21 | """ 22 | 23 | __version__ = '2.25.4' 24 | __datahub_client_version__ = '1.1' 25 | -------------------------------------------------------------------------------- /dependency/certifi-2018.4.16-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/certifi-2018.4.16-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dependency/chardet-3.0.4-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/chardet-3.0.4-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dependency/cprotobuf-0.1.9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/cprotobuf-0.1.9.tar.gz -------------------------------------------------------------------------------- /dependency/crcmod-1.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/crcmod-1.7.tar.gz -------------------------------------------------------------------------------- /dependency/enum34-1.1.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/enum34-1.1.6.tar.gz -------------------------------------------------------------------------------- /dependency/funcsigs-1.0.2-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/funcsigs-1.0.2-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dependency/future-0.16.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/future-0.16.0.tar.gz -------------------------------------------------------------------------------- /dependency/idna-2.6-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/idna-2.6-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dependency/install_dependency.sh: -------------------------------------------------------------------------------- 
1 | pip install ./certifi-2018.4.16-py2.py3-none-any.whl 2 | pip install ./chardet-3.0.4-py2.py3-none-any.whl 3 | pip install ./crcmod-1.7.tar.gz 4 | pip install ./idna-2.6-py2.py3-none-any.whl 5 | pip install ./simplejson-3.15.0.tar.gz 6 | pip install ./six-1.11.0-py2.py3-none-any.whl 7 | pip install ./urllib3-1.22-py2.py3-none-any.whl 8 | pip install ./pkgconfig-1.3.1.tar.gz 9 | pip install ./setuptools_scm-2.1.0.tar.gz 10 | pip install ./future-0.16.0.tar.gz 11 | pip install ./enum34-1.1.6.tar.gz; python_version < '3.4' 12 | pip install ./cprotobuf-0.1.9.tar.gz 13 | pip install ./requests-2.18.4-py2.py3-none-any.whl 14 | pip install ./pytest-runner-4.2.tar.gz 15 | pip install ./lz4-2.0.0.tar.gz 16 | pip install ./funcsigs-1.0.2-py2.py3-none-any.whl 17 | -------------------------------------------------------------------------------- /dependency/lz4-2.0.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/lz4-2.0.0.tar.gz -------------------------------------------------------------------------------- /dependency/pip-10.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/pip-10.0.1.tar.gz -------------------------------------------------------------------------------- /dependency/pkgconfig-1.3.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/pkgconfig-1.3.1.tar.gz -------------------------------------------------------------------------------- /dependency/pytest-runner-4.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/pytest-runner-4.2.tar.gz -------------------------------------------------------------------------------- /dependency/readme.txt: -------------------------------------------------------------------------------- 1 | # if without pip 2 | 0. unzip setuptools-39.2.0.zip 3 | 1. cd setuptools-39.2.0 4 | 2. python setup.py install 5 | 3. cd .. 6 | 4. tar zxvf pip-10.0.1.tar.gz 7 | 5. cd pip-10.0.1 8 | 6. python setup.py install 9 | 7. cd .. 10 | # end 11 | 12 | # install dependency 13 | 0. 
sudo ./install_dependency.sh -------------------------------------------------------------------------------- /dependency/requests-2.18.4-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/requests-2.18.4-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dependency/setuptools-39.2.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/setuptools-39.2.0.zip -------------------------------------------------------------------------------- /dependency/setuptools_scm-2.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/setuptools_scm-2.1.0.tar.gz -------------------------------------------------------------------------------- /dependency/simplejson-3.15.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/simplejson-3.15.0.tar.gz -------------------------------------------------------------------------------- /dependency/six-1.11.0-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/six-1.11.0-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dependency/urllib3-1.22-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/dependency/urllib3-1.22-py2.py3-none-any.whl -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = PyDatahub 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=PyDatahub 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | -------------------------------------------------------------------------------- /docs/source/_static/PyDatahub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/docs/source/_static/PyDatahub.png -------------------------------------------------------------------------------- /docs/source/_static/theme_override.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: 1050px 3 | } 4 | 5 | .wy-table-responsive table td, .wy-table-responsive table th { 6 | white-space: normal !important; 7 | } 8 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PyDatahub documentation master file, created by 2 | sphinx-quickstart on Sat Feb 4 14:54:19 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyDatahub: Datahub Python SDK 7 | ===================================== 8 | 9 | `PyDatahub `_ 是Datahub的Python版本的SDK,它对Datahub服务提供的各个RESTful API接口进行了封装,提供了简单方便的Python编程接口。有关Datahub服务的详细介绍请参见 `阿里云官网介绍 `_ 。 10 | 11 | Requirements: 12 | * setuptools (>=39.2.0) 13 | * requests (>=2.4.0) 14 | * simplejson (>=3.3.0) 15 | * six (>=1.1.0) 16 | * enum34 (>=1.1.5 for python_version < '3.4') 17 | * crcmod (>=1.7) 18 | * lz4 (>=2.0.0) 19 | * cprotobuf (==0.1.9) 20 | * funcsigs (>=1.0.2) 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | 25 | installation 26 | tutorial 27 | api 28 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | ************** 4 | 安装指南 5 | ************** 6 | 7 | 8 | 基础环境准备 9 | ============ 10 | 11 | 安装pip,可以参考 `地址 `_ 。 12 | 13 | 安装PyDatahub 14 | ============= 15 | 16 | 快速安装 17 | -------- 18 | 19 | .. code-block:: sh 20 | 21 | $ pip install pydatahub 22 | 23 | **注:** 这里PyDatahub的相关依赖包如果没有安装的话会自动安装。 24 | 25 | 源码安装 26 | -------- 27 | 28 | .. code-block:: sh 29 | 30 | $ git clone https://github.com/aliyun/aliyun-datahub-sdk-python.git 31 | $ cd aliyun-datahub-sdk-python 32 | $ python setup.py install 33 | 34 | **注:** 没有网络的情况下可以通过如下方式安装依赖: 35 | 36 | .. code-block:: sh 37 | 38 | $ cd dependency 39 | $ pip install -r first.txt 40 | $ pip install -r second.txt 41 | 42 | 安装验证 43 | ======== 44 | 45 | .. code-block:: sh 46 | 47 | python -c "from datahub import DataHub" 48 | 49 | 如果上述命令执行成功,恭喜你安装Datahub Python版本SDK成功! 
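As a further optional check once you have real credentials, you can make one round-trip call against the service. The access_id, access_key and endpoint below are placeholders you must supply yourself; this is only a sketch of a connectivity test, not part of the installation itself:

.. code-block:: python

    from datahub import DataHub

    # Placeholders -- replace with your own Aliyun credentials and DataHub endpoint.
    dh = DataHub('<access_id>', '<access_key>', '<endpoint>')

    # list_project() issues a real request; an empty project list is still a successful response.
    print(dh.list_project().project_names)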
50 | 51 | 常见问题 52 | ========== 53 | 54 | 如果安装过程中出现错误信息'Python.h: No such file or directory',常用的操作系统安装方式如下: 55 | 56 | .. code-block:: sh 57 | 58 | $ sudo apt-get install python-dev # for python2.x installs 59 | $ sudo apt-get install python3-dev # for python3.x installs 60 | 61 | $ sudo yum install python-devel # for python2.x installs 62 | $ sudo yum install python34-devel # for python3.4 installs 63 | 64 | 如果使用windows操作系统,根据提示信息可到 `此处 `__ 下载对应版本的 Visual C++ SDK 65 | 66 | -------------------------------------------------------------------------------- /docs/source/tutorial-client.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial-client: 2 | 3 | ********************* 4 | high-level client操作 5 | ********************* 6 | 7 | High-level client提供了更便利的数据发布和订阅功能。 8 | 9 | 发布数据 10 | --------------------- 11 | 12 | * 创建DatahubProducer 13 | 14 | .. code-block:: python 15 | 16 | producer_config = ProducerConfig(access_id, access_key, endpoint) 17 | datahub_producer = DatahubProducer(project_name, topic_name, producer_config) 18 | 19 | 详细定义: 20 | :ref:`producer` 21 | 22 | * 数据发布方式 23 | 24 | 同步发布 25 | 26 | .. code-block:: python 27 | 28 | shard_id = datahub_producer.write(records) 29 | 30 | 接口传入的records是用户希望写入的数据,返回的是当前records写入的shard_id。 31 | 32 | 异步发布 33 | 34 | .. code-block:: python 35 | 36 | result = datahub_producer.write_async(records) 37 | 38 | 接口传入的records是用户发布的数据,返回的result是异步写入的结果。 39 | 40 | 订阅数据 41 | -------------------- 42 | 43 | * 创建DatahubConsumer 44 | 45 | .. code-block:: python 46 | 47 | consumer_config = ConsumerConfig(access_id, access_key, endpoint) 48 | datahub_consumer = DatahubConsumer(project_name, topic_name, sub_id, consumer_config) 49 | 50 | 详细定义: 51 | :ref:`consumer` 52 | 53 | * 数据订阅方式 54 | 55 | .. code-block:: python 56 | 57 | record = datahub_consumer.read(timeout=60) 58 | 59 | 接口传入的参数timeout是订阅数据超时时间,返回的record是消费的数据。 60 | -------------------------------------------------------------------------------- /docs/source/tutorial-meter.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial-meter: 2 | 3 | ************* 4 | meter操作 5 | ************* 6 | 7 | metering info是对shard的资源占用情况的统计信息,一小时更新一次 8 | 9 | 获取metering info 10 | ====================== 11 | 12 | * get_metering_info接口获取指定shard的统计信息 13 | 14 | .. code-block:: python 15 | 16 | result = dh.get_metering_info(project_name, topic_name, shard_id) 17 | 18 | get_metering_info返回的结果是GetMeteringInfoResult对象,包含active_time, storage两个成员。 19 | 20 | 详细定义: 21 | :ref:`Results` -------------------------------------------------------------------------------- /docs/source/tutorial-offset.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial-offset: 2 | 3 | ************* 4 | offset操作 5 | ************* 6 | 7 | 一个subscription创建后,初始状态是未消费的,要使用subscription服务提供的点位存储功能,需要进行一些offset操作 8 | 9 | 初始化offset 10 | -------------- 11 | 12 | * init_and_get_subscription_offset接口初始化offset,是开始消费的第一步 13 | 14 | .. 
code-block:: python 15 | 16 | init_result = dh.init_and_get_subscription_offset(project_name, topic_name, sub_id, shard_id) 17 | init_result = dh.init_and_get_subscription_offset(project_name, topic_name, sub_id, shard_ids) 18 | 19 | 最后一个参数可以是一个shard_id,也可以是shard_id的list,指定要初始化的shard 20 | init_and_get_subscription_offset返回的是InitAndGetSubscriptionOffsetResult对象,包含offsets成员,是一个OffsetWithSession对象, 21 | 其中包含成员sequence, timestamp, version, session_id。 22 | sequence和timestamp就是点位信息,第一次初始化的session_id为0,之后每次初始化都会+1 23 | 24 | 详细定义: 25 | :ref:`Results` 26 | 27 | 获取offset 28 | ----------- 29 | 30 | * get_subscription_offset能够获取订阅的offset信息 31 | 32 | .. code-block:: python 33 | 34 | offset_result = dh.get_subscription_offset(project_name, topic_name, sub_id, shard_id) 35 | offset_result = dh.get_subscription_offset(project_name, topic_name, sub_id, shard_ids) 36 | 37 | 最后一个参数可以是一个shard_id,也可以是shard_id的list,指定要初始化的shard 38 | get_subscription_offset返回的是GetSubscriptionOffsetResult对象,包含OffsetWithVersion对象的list。 39 | OffsetWithVersion类是OffsetWithSession的父类,只包含sequence, timestamp, version 40 | 41 | 详细定义: 42 | :ref:`offset`, :ref:`Results` 43 | 44 | 更新offset 45 | ----------- 46 | 47 | * update_subscription_offset接口能够更新offset 48 | 49 | .. code-block:: python 50 | 51 | offsets1 = { 52 | '0': OffsetWithSession(sequence0, timestamp0, version0, session_id0), 53 | '1': OffsetWithSession(sequence1, timestamp1, version1, session_id1) 54 | } 55 | 56 | offsets2 = { 57 | '0': { 58 | 'Sequence': 0, 59 | 'Timestamp': 0, 60 | 'Version': 0, 61 | 'SessionId': 0 62 | }, 63 | '1': { 64 | 'Sequence': 1, 65 | 'Timestamp': 1, 66 | 'Version': 1, 67 | 'SessionId': 1 68 | } 69 | } 70 | 71 | dh.update_subscription_offset(project_name, topic_name, sub_id, offsets1) 72 | dh.update_subscription_offset(project_name, topic_name, sub_id, offsets2) 73 | 74 | 75 | 参数offsets是一个dict,其中的key是shard_id,value可以是OffsetWithSession对象,也可以是一个dict,如果version和session_id发生变化,就会更新失败。 76 | 当错误信息指出version发生变化,可以通过get_subscription_offset接口获取最新的version信息,继续消费。 77 | 当错误信息指出session_id发生变化,就只能再次使用init_and_get_subscription_offset初始化offset信息,再继续消费。 78 | 79 | 详细定义: 80 | :ref:`offset` 81 | 82 | 重置offset 83 | ----------- 84 | 85 | * reset_subscription_offset接口能够重置offset信息并更新version 86 | 87 | .. 
code-block:: python 88 | 89 | offsets1 = { 90 | '0': OffsetWithSession(sequence0, timestamp0, version0, session_id0), 91 | '1': OffsetWithSession(sequence1, timestamp1, version1, session_id1) 92 | } 93 | 94 | offsets2 = { 95 | '0': { 96 | 'Sequence': 0, 97 | 'Timestamp': 0, 98 | 'Version': 0, 99 | 'SessionId': 0 100 | }, 101 | '1': { 102 | 'Sequence': 1, 103 | 'Timestamp': 1, 104 | 'Version': 1, 105 | 'SessionId': 1 106 | } 107 | } 108 | 109 | offsets3 = { 110 | '0': OffsetBase(sequence0, timestamp0), 111 | '1': OffsetBase(sequence1, timestamp1) 112 | } 113 | 114 | offsets4 = { 115 | '0': { 116 | 'Sequence': 0, 117 | 'Timestamp': 0 118 | }, 119 | '1': { 120 | 'Sequence': 1, 121 | 'Timestamp': 1 122 | } 123 | } 124 | 125 | dh.reset_subscription_offset(project_name, topic_name, sub_id, offsets1) 126 | dh.reset_subscription_offset(project_name, topic_name, sub_id, offsets2) 127 | dh.reset_subscription_offset(project_name, topic_name, sub_id, offsets3) 128 | dh.reset_subscription_offset(project_name, topic_name, sub_id, offsets4) 129 | 130 | 131 | 参数offsets是一个dict,其中的key是shard_id,value可以是OffsetBase对象以及其子类对象,也可以是一个dict。 132 | OffsetBase是OffsetWithVersion的父类,只包含sequence, timestamp 133 | 134 | 详细定义: 135 | :ref:`offset` -------------------------------------------------------------------------------- /docs/source/tutorial-project.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial-project: 2 | 3 | ************* 4 | project操作 5 | ************* 6 | 7 | 项目(Project)是DataHub数据的基本组织单元,下面包含多个Topic。值得注意的是,DataHub的项目空间与MaxCompute的项目空间是相互独立的。用户在MaxCompute中创建的项目不能复用于DataHub,需要独立创建。 8 | 9 | 创建Project 10 | ----------- 11 | 12 | * create_project接口创建新的Project 13 | 14 | .. code-block:: python 15 | 16 | dh.create_project(project_name, comment) 17 | 18 | 创建Project需要提供Project的名字和描述,Project的名字长度限制为[3,32],必须以英文字母开头,仅允许英文字母、数字及“_”,大小写不敏感。 19 | 20 | 删除Project 21 | ----------- 22 | 23 | * delete_project接口删除Project 24 | 25 | .. code-block:: python 26 | 27 | dh.delete_project(project_name) 28 | 29 | 要删除Project,必须保证Project内没有Topic。 30 | 31 | 列出Project 32 | ----------- 33 | 34 | * list_project接口能够获取datahub服务下的所有Project的名字 35 | 36 | .. code-block:: python 37 | 38 | projects_result = dh.list_project() 39 | 40 | list_project返回的结果是ListProjectResult对象,其中包含成员project_names,是一个包含Project名字的list。 41 | 42 | 查询Project 43 | ----------- 44 | 45 | * get_project接口获取一个Project的详细信息 46 | 47 | .. code-block:: python 48 | 49 | project_result = dh.get_project(project_name) 50 | 51 | get_project返回的结果是GetProjectResult对象,其中包含project_name, comment, create_time, last_modify_time这四个成员。 52 | -------------------------------------------------------------------------------- /docs/source/tutorial-schema.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial-schema: 2 | 3 | ************* 4 | schema类型 5 | ************* 6 | 7 | schema是用来标明数据存储的名称和对应类型的,在创建tuple topic 和 读写 record 的时候用到。因为网络传输中,数据都是以字符串的形式发送,需要schema来转换成对应的类型。 8 | 9 | 获取schema 10 | =========== 11 | 12 | * 对于已创建的topic,可以使用get_topic接口来获取schema信息 13 | 14 | .. code-block:: python 15 | 16 | topic_result = dh.get_topic(project_name, topic_name) 17 | record_schema = topic_result.record_schema 18 | 19 | 20 | 详细定义: 21 | :ref:`schema` 22 | 23 | 定义schema 24 | =========== 25 | 26 | 要创建新的tuple topic,需要自己定义schema,schema可以通过以下方式进行初始化 27 | 28 | 详细定义: 29 | :ref:`schema` 30 | 31 | * 通过lists定义schema 32 | 33 | .. 
code-block:: python 34 | 35 | from datahub.models import RecordSchema, FieldType, Field 36 | 37 | record_schema1 = RecordSchema.from_lists( 38 | ['bigint_field' , 'string_field' , 'double_field' , 'bool_field' , 'event_time1' ], 39 | [FieldType.BIGINT, FieldType.STRING, FieldType.DOUBLE, FieldType.BOOLEAN, FieldType.TIMESTAMP] 40 | ) 41 | 42 | record_schema2 = RecordSchema.from_lists( 43 | ['bigint_field' , 'string_field' , 'double_field' , 'bool_field' , 'event_time1' ], 44 | [FieldType.BIGINT, FieldType.STRING, FieldType.DOUBLE, FieldType.BOOLEAN, FieldType.TIMESTAMP], 45 | [True , False , True , False , True ] 46 | ) 47 | 48 | 必须的参数为2个list,第一个list是对应field的名称,第二个list是对应field的类型,第三个list可选,True为对应feild允许为None, False为对应field不能为None,不传第三个list则默认所有field都为True,即可以为None 49 | 50 | * 通过json字符串定义schema 51 | 52 | .. code-block:: python 53 | 54 | record_schema_1 = RecordSchema.from_json_str(json_str) 55 | 56 | json字符串的格式如下: 57 | 58 | "{\"fields\":[{\"type\":\"BIGINT\",\"name\":\"a\"},{\"type\":\"STRING\",\"name\":\"b\"}]}" 59 | 60 | * 逐个对schema进行set 61 | 62 | .. code-block:: python 63 | 64 | record_schema = RecordSchema() 65 | record_schema.add_field(Field('bigint_field', FieldType.BIGINT)) 66 | record_schema.add_field(Field('string_field', FieldType.STRING), False) 67 | record_schema.add_field(Field('double_field', FieldType.DOUBLE)) 68 | record_schema.add_field(Field('bool_field', FieldType.BOOLEAN)) 69 | record_schema.add_field(Field('event_time1', FieldType.TIMESTAMP)) 70 | 71 | 参数为Field对象,Field构造函数第一个参数是field的名称,第二个是field的类型,第三个参数可选,True表示field的值允许为None, False表示field的值不能为None,True,即可以为None -------------------------------------------------------------------------------- /docs/source/tutorial-shard.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial-shard: 2 | 3 | ************* 4 | shard操作 5 | ************* 6 | 7 | Shard表示对一个Topic进行数据传输的并发通道,每个Shard会有对应的ID。每个Shard会有多种状态: Opening - 启动中,Active - 启动完成可服务。每个Shard启用以后会占用一定的服务端资源,建议按需申请Shard数量。 8 | 9 | 列出shard 10 | ----------- 11 | 12 | * list_shard接口列出topic中所有的shard信息 13 | 14 | .. code-block:: python 15 | 16 | shards_result = dh.list_shard(project_name, topic_name) 17 | 18 | list_shard返回的结果是ListShardResult对象,其中包含shards成员,是Shard对象的list,Shard对象包含shard_id, begin_hash_key, end_hash_key, state等多个信息。 19 | 20 | 详细定义: 21 | :ref:`shard`, :ref:`Results` 22 | 23 | 合并shard 24 | ----------- 25 | 26 | * merge_shard接口合并两个相邻的shard 27 | 28 | .. code-block:: python 29 | 30 | merge_result = dh.merge_shard(project_name, topic_name, shard_id, adj_shard_id) 31 | 32 | 传入两个相邻的shard id,合并成功返回MergeShardResult对象,其中包含新生成的shard_id, begin_hash_key, end_hash_key三个成员。 33 | 34 | 详细定义: 35 | :ref:`shard`, :ref:`Results` 36 | 37 | 分裂shard 38 | ----------- 39 | 40 | * split_shard接口根据所给的split key将指定的shard分裂为2个相邻的shard 41 | 42 | .. code-block:: python 43 | 44 | split_result = dh.split_shard(project_name, topic_name, shard_id) 45 | split_result = dh.split_shard(project_name, topic_name, shard_id, split_key) 46 | 47 | split_shard返回的结果是SplitShardResult对象,其中包含成员new_shards,是一个ShardBase对象的list,ShardBase对象只包含shard_id, begin_hash_key, end_hash_key三个信息。 48 | 如果不指定split_key,会自动查询该shard的hash key的范围,为split_key指定一个中间值进行分裂。 49 | 50 | 详细定义: 51 | :ref:`shard`, :ref:`Results` -------------------------------------------------------------------------------- /docs/source/tutorial-subscription.rst: -------------------------------------------------------------------------------- 1 | .. 
_tutorial-subscription: 2 | 3 | ******************** 4 | subscription操作 5 | ******************** 6 | 7 | 订阅服务提供了服务端保存用户消费点位的功能,只需要通过简单配置和处理,就可以实现高可用的点位存储服务。 8 | 9 | 创建subscription 10 | --------------------- 11 | 12 | * create_subscription能够创建一个新的订阅 13 | 14 | .. code-block:: python 15 | 16 | create_result = dh.create_subscription(project_name, topic_name, 'comment') 17 | 18 | create_subscription返回的结果是CreateSubscriptionResult对象,其中包含sub_id成员,即创建的订阅id 19 | 20 | 详细定义: 21 | :ref:`Results` 22 | 23 | 删除subscription 24 | ------------------- 25 | 26 | * delete_subscription接口删除一个订阅 27 | 28 | .. code-block:: python 29 | 30 | dh.delete_subscription(project_name, topic_name, sub_id) 31 | 32 | 传入需要删除的sub_id来删除指定的订阅 33 | 34 | 查询subscription 35 | -------------------- 36 | 37 | * get_subscription接口能够查询subscription的详细信息 38 | 39 | .. code-block:: python 40 | 41 | subscription_result = dh.get_subscription(project_name, topic_name, create_result.sub_id) 42 | 43 | get_subscription返回的是GetSubscriptionResult对象,其中包含成员comment, create_time, is_owner, last_modify_time, state, sub_id, topic_name, type。 44 | 其中state是SubscriptionState枚举类的对象,分为ACTIVE和INACTIVE。 45 | 46 | 详细定义: 47 | :ref:`subscription`, :ref:`Results` 48 | 49 | 更新subscription 50 | -------------------- 51 | 52 | * update_subscription接口能够更新subscription 53 | 54 | .. code-block:: python 55 | 56 | dh.update_subscription(project_name, topic_name, sub_id, new_comment) 57 | 58 | update_subscription更新对应sub_id的subscription的comment 59 | 60 | 更新subscription状态 61 | ------------------------ 62 | 63 | * update_subscription_state接口更新subscription的状态 64 | 65 | .. code-block:: python 66 | 67 | dh.update_subscription_state(project_name, topic_name, sub_id, state) 68 | 69 | update_subscription_state更新对应sub_id的subscription状态,state是SubscriptionState枚举类的对象,分为ACTIVE和INACTIVE。 70 | 71 | 列出subscription 72 | ------------------- 73 | 74 | * list_subscription接口列出topic下的所有subscription 75 | 76 | .. code-block:: python 77 | 78 | subscriptions_result = dh.list_subscription(project_name, topic_name, query_key, page_index, page_size) 79 | 80 | 传入query_key作为搜索条件,可以传空字符串,通过page_index和page_size获取指定范围的subscription信息,如page_index=1, page_size=10,获取1-10个subscription; 81 | page_index=2, page_size=5则获取6-10的subscription。 82 | list_subscription返回的是ListSubscriptionResult对象,其中包含total_count和subscriptions两个成员。 83 | total_count是topic下总共包含的subscription数量,subscriptions是Subscription对象的list。 84 | Subscription对象包含成员comment, create_time, is_owner, last_modify_time, state, sub_id, topic_name, type。 85 | 其中state是SubscriptionState枚举类的对象,分为ACTIVE和INACTIVE。 86 | 87 | 详细定义: 88 | :ref:`subscription`, :ref:`Results` -------------------------------------------------------------------------------- /docs/source/tutorial-topic.rst: -------------------------------------------------------------------------------- 1 | .. 
_tutorial-topic: 2 | 3 | ************* 4 | topic操作 5 | ************* 6 | 7 | Topic是 DataHub 订阅和发布的最小单位,用户可以用Topic来表示一类或者一种流数据。目前支持Tuple与Blob两种类型。Tuple类型的Topic支持类似于数据库的记录的数据,每条记录包含多个列。Blob类型的Topic仅支持写入一块二进制数据。 8 | 9 | Tuple Topic 10 | =========== 11 | 12 | Tuple类型Topic写入的数据是有格式的,需要指定Record Schema,目前支持以下几种数据类型: 13 | 14 | +-----------+------------------------------------+---------------------------------------------------+ 15 | | 类型 | 含义 | 值域 | 16 | +===========+====================================+===================================================+ 17 | | Bigint | 8字节有符号整型。 | -9223372036854775807 ~ 9223372036854775807 | 18 | +-----------+------------------------------------+---------------------------------------------------+ 19 | | String | 字符串,只支持UTF-8编码。 | 单个String列最长允许1MB。 | 20 | +-----------+------------------------------------+---------------------------------------------------+ 21 | | Boolean | 布尔类型 | True/False或true/false或0/1 | 22 | +-----------+------------------------------------+---------------------------------------------------+ 23 | | Double | 8字节双精度浮点数 | -1.0 * 10^308 ~ 1.0 * 10^308 | 24 | +-----------+------------------------------------+---------------------------------------------------+ 25 | | TimeStamp | 时间戳类型 | 表示到微秒的时间戳类型 | 26 | +-----------+------------------------------------+---------------------------------------------------+ 27 | 28 | 创建示例 29 | -------- 30 | 31 | .. code-block:: python 32 | 33 | project_name = 'topic_test_project' 34 | topic_name = 'tuple_topic_test_topic' 35 | shard_count = 3 36 | life_cycle = 7 37 | 38 | record_schema = RecordSchema.from_lists( 39 | ['bigint_field', 'string_field', 'double_field', 'bool_field', 'time_field' ], 40 | [FieldType.BIGINT, FieldType.STRING, FieldType.DOUBLE, FieldType.BOOLEAN, FieldType.TIMESTAMP] 41 | ) 42 | 43 | try: 44 | dh.create_tuple_topic(project_name, topic_name, shard_count, life_cycle, record_schema, 'comment') 45 | print("create topic success!") 46 | print("=======================================\n\n") 47 | except InvalidParameterException as e: 48 | print(e) 49 | print("=======================================\n\n") 50 | except ResourceExistException as e: 51 | print("topic already exist!") 52 | print("=======================================\n\n") 53 | except Exception as e: 54 | print(traceback.format_exc()) 55 | sys.exit(-1) 56 | 57 | 新增field 58 | ------------- 59 | 60 | .. code-block:: python 61 | 62 | dh.append_field(project_name, topic_name, field_name, field_type) 63 | 64 | 新增field必须是allow_null为True的,给出field_name和field_type作为参数即可,field_type为FieldType枚举类型。 65 | 66 | Blob Topic 67 | ========== 68 | 69 | Blob类型Topic支持写入一块二进制数据作为一个Record,数据将会以BASE64编码传输。 70 | 71 | 创建示例 72 | -------- 73 | 74 | .. 
code-block:: python 75 | 76 | project_name = 'topic_test_project' 77 | topic_name = 'blob_topic_test_topic' 78 | shard_count = 3 79 | life_cycle = 7 80 | 81 | 82 | try: 83 | dh.create_blob_topic(project_name, topic_name, shard_count, life_cycle, 'comment') 84 | print("create topic success!") 85 | print("=======================================\n\n") 86 | except InvalidParameterException as e: 87 | print(e) 88 | print("=======================================\n\n") 89 | except ResourceExistException as e: 90 | print("topic already exist!") 91 | print("=======================================\n\n") 92 | except Exception as e: 93 | print(traceback.format_exc()) 94 | sys.exit(-1) 95 | 96 | -------------------------------------------------------------------------------- /docs/source/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | ************* 4 | 快速上手 5 | ************* 6 | 7 | 8 | Datahub相关的基本概念 9 | ===================== 10 | 11 | 详情参见 `DataHub基本概念 `_ 。 12 | 13 | 准备工作 14 | ======== 15 | 16 | * 访问DataHub服务需要使用阿里云认证账号,需要提供阿里云accessId及accessKey。 同时需要提供可访问的DataHub服务地址。 17 | * 登陆 `Datahub WebConsole页面 `_ ,创建Project 18 | 19 | 日志信息 20 | ========= 21 | 22 | 可以在自己的代码中设置日志的输出和打印级别,sdk中主要包含一些debug日志和error日志,以下是将sdk的DEBUG日志打印到控制台的配置样例 23 | 24 | .. code-block:: python 25 | 26 | import logging 27 | 28 | logger = logging.getLogger('datahub') 29 | sh = logging.StreamHandler() 30 | sh.setLevel(logging.DEBUG) 31 | logger.addHandler(sh) 32 | 33 | 34 | 初始化DataHub对象 35 | ================= 36 | 37 | Datahub Python SDK提供的所有API接口均由 ``datahub.DataHub`` 类实现,所以第一步就是初始化一个DataHub对象。 38 | 可选项: 39 | 40 | * Datahub支持三种协议,主要在put/get record时,分别为Json、Protobuf和Batch协议。可在初始化Datahub对象时指定协议类型,默认为Json类型。 41 | * Datahub支持三种压缩类型,分别为DEFLATE、LZ4和ZLIB,默认为LZ4类型。 42 | 43 | .. code-block:: python 44 | 45 | from datahub import DataHub, DatahubProtocolType 46 | from datahub.models import CompressFormat 47 | 48 | access_id = ***your access id*** 49 | access_key = ***your access key*** 50 | endpoint = ***your datahub server endpoint*** 51 | dh = DataHub(access_id, access_key, endpoint) 52 | dh = DataHub(access_id, access_key, endpoint, protocol_type=DatahubProtocolType.PB) 53 | dh = DataHub(access_id, access_key, endpoint, protocol_type=DatahubProtocolType.BATCH) 54 | dh = DataHub(access_id, access_key, endpoint) # use lz4 compression when put/get record 55 | dh = DataHub(access_id, access_key, endpoint, compress_format=CompressFormat.ZLIB) # use zlib compression when put/get record 56 | 57 | 更多详细定义: 58 | :ref:`datahub_client` 59 | 60 | 接口示例 61 | ================= 62 | 针对常用接口分别给出以下示例: 63 | 64 | .. 
toctree:: 65 | :maxdepth: 1 66 | 67 | tutorial-project 68 | tutorial-topic 69 | tutorial-schema 70 | tutorial-record 71 | tutorial-shard 72 | tutorial-meter 73 | tutorial-connector 74 | tutorial-subscription 75 | tutorial-offset 76 | tutorial-client 77 | -------------------------------------------------------------------------------- /examples/client/datahub.config.template: -------------------------------------------------------------------------------- 1 | [datahub] 2 | endpoint = 3 | access_id = 4 | access_key = 5 | 6 | project_name = 7 | topic_name = 8 | sub_id = 9 | 10 | [common] 11 | retry_times = 12 | async_thread_limit = 13 | thread_queue_limit = 14 | 15 | [consumer] 16 | auto_ack_offset = 17 | session_timeout = 18 | fetch_limit = 19 | max_record_buffer_size = 20 | 21 | [producer] 22 | max_async_buffer_records = 23 | max_async_buffer_size = 24 | max_async_buffer_time = 25 | max_record_pack_queue_limit = 26 | -------------------------------------------------------------------------------- /examples/client/example_collaborative_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
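# What this example demonstrates (see the code below):
#   * connection settings (endpoint, access_id, access_key, project_name,
#     topic_name, sub_id) are loaded from ./datahub.config.template with
#     configparser;
#   * a ConsumerConfig is built, and the retry/thread/offset options are only
#     overridden when a value is present in the config file;
#   * the DatahubConsumer is created WITHOUT an explicit shard list, so shards
#     are assigned automatically (collaborative consumption), in contrast to
#     example_general_consumer.py which passes shard_ids;
#   * records are fetched one at a time with read(timeout=60) until None is
#     returned, and each record is acked manually via record.record_key.ack()
#     when auto_ack_offset is disabled;
#   * the consumer is always closed in the finally block.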
19 | 20 | 21 | import os.path 22 | import configparser 23 | from datahub.core import DatahubProtocolType 24 | from datahub.exceptions import DatahubException 25 | from datahub.models.compress import CompressFormat 26 | from datahub.client import DatahubConsumer, ConsumerConfig 27 | 28 | 29 | parser = configparser.ConfigParser() 30 | parser.read(filenames=os.path.join("./datahub.config.template")) 31 | 32 | endpoint = parser.get("datahub", "endpoint") 33 | access_id = parser.get("datahub", "access_id") 34 | access_key = parser.get("datahub", "access_key") 35 | project_name = parser.get("datahub", "project_name") 36 | topic_name = parser.get("datahub", "topic_name") 37 | sub_id = parser.get("datahub", "sub_id") 38 | protocol_type = DatahubProtocolType.PB 39 | compress_format = CompressFormat.LZ4 40 | 41 | retry_times = parser.get("common", "retry_times") 42 | retry_times = int(retry_times) if len(retry_times) > 0 else -1 43 | async_thread_limit = parser.get("common", "async_thread_limit") 44 | async_thread_limit = int(async_thread_limit) if len(async_thread_limit) > 0 else -1 45 | thread_queue_limit = parser.get("common", "thread_queue_limit") 46 | thread_queue_limit = int(thread_queue_limit) if len(thread_queue_limit) > 0 else -1 47 | 48 | auto_ack_offset = parser.get("consumer", "auto_ack_offset") 49 | auto_ack_offset = bool(int(auto_ack_offset)) if len(auto_ack_offset) > 0 else True 50 | session_timeout = parser.get("consumer", "session_timeout") 51 | session_timeout = int(session_timeout) if len(session_timeout) > 0 else -1 52 | max_record_buffer_size = parser.get("consumer", "max_record_buffer_size") 53 | max_record_buffer_size = int(max_record_buffer_size) if len(max_record_buffer_size) > 0 else -1 54 | fetch_limit = parser.get("consumer", "fetch_limit") 55 | fetch_limit = int(fetch_limit) if len(fetch_limit) > 0 else -1 56 | 57 | consumer_config = ConsumerConfig(access_id, access_key, endpoint) 58 | 59 | if retry_times > 0: 60 | consumer_config.retry_times = retry_times 61 | if async_thread_limit > 0: 62 | consumer_config.async_thread_limit = async_thread_limit 63 | if thread_queue_limit > 0: 64 | consumer_config.thread_queue_limit = thread_queue_limit 65 | if auto_ack_offset is False: 66 | consumer_config.auto_ack_offset = auto_ack_offset 67 | if session_timeout > 0: 68 | consumer_config.session_timeout = session_timeout 69 | if max_record_buffer_size > 0: 70 | consumer_config.max_record_buffer_size = max_record_buffer_size 71 | if fetch_limit > 0: 72 | consumer_config.fetch_limit = fetch_limit 73 | 74 | 75 | def process_result(record): 76 | pass 77 | 78 | 79 | def collaborative_consume(): 80 | datahub_consumer = DatahubConsumer(project_name, topic_name, sub_id, consumer_config) 81 | 82 | record_cnt = 0 83 | try: 84 | while True: 85 | record = datahub_consumer.read(timeout=60) 86 | if record is None: 87 | break 88 | # TODO: deal with record data 89 | process_result(record) 90 | record_cnt += 1 91 | if not consumer_config.auto_ack_offset: # ack the record manually if auto_ack_offset is False 92 | record.record_key.ack() 93 | except DatahubException as e: 94 | print("Read record fail. 
DatahubException: ", e) 95 | finally: 96 | datahub_consumer.close() 97 | 98 | print("Read {} records total".format(record_cnt)) 99 | 100 | 101 | if __name__ == "__main__": 102 | collaborative_consume() 103 | 104 | -------------------------------------------------------------------------------- /examples/client/example_general_consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | import os.path 22 | import configparser 23 | from datahub.core import DatahubProtocolType 24 | from datahub.exceptions import DatahubException 25 | from datahub.models.compress import CompressFormat 26 | from datahub.client import DatahubConsumer, ConsumerConfig 27 | 28 | 29 | parser = configparser.ConfigParser() 30 | parser.read(filenames=os.path.join("./datahub.config.template")) 31 | 32 | endpoint = parser.get("datahub", "endpoint") 33 | access_id = parser.get("datahub", "access_id") 34 | access_key = parser.get("datahub", "access_key") 35 | project_name = parser.get("datahub", "project_name") 36 | topic_name = parser.get("datahub", "topic_name") 37 | sub_id = parser.get("datahub", "sub_id") 38 | protocol_type = DatahubProtocolType.PB 39 | compress_format = CompressFormat.LZ4 40 | 41 | retry_times = parser.get("common", "retry_times") 42 | retry_times = int(retry_times) if len(retry_times) > 0 else -1 43 | async_thread_limit = parser.get("common", "async_thread_limit") 44 | async_thread_limit = int(async_thread_limit) if len(async_thread_limit) > 0 else -1 45 | thread_queue_limit = parser.get("common", "thread_queue_limit") 46 | thread_queue_limit = int(thread_queue_limit) if len(thread_queue_limit) > 0 else -1 47 | 48 | auto_ack_offset = parser.get("consumer", "auto_ack_offset") 49 | auto_ack_offset = bool(int(auto_ack_offset)) if len(auto_ack_offset) > 0 else True 50 | session_timeout = parser.get("consumer", "session_timeout") 51 | session_timeout = int(session_timeout) if len(session_timeout) > 0 else -1 52 | max_record_buffer_size = parser.get("consumer", "max_record_buffer_size") 53 | max_record_buffer_size = int(max_record_buffer_size) if len(max_record_buffer_size) > 0 else -1 54 | fetch_limit = parser.get("consumer", "fetch_limit") 55 | fetch_limit = int(fetch_limit) if len(fetch_limit) > 0 else -1 56 | 57 | consumer_config = ConsumerConfig(access_id, access_key, endpoint) 58 | 59 | if retry_times > 0: 60 | consumer_config.retry_times = retry_times 61 | if async_thread_limit > 0: 62 | consumer_config.async_thread_limit = async_thread_limit 63 | if thread_queue_limit > 0: 64 | consumer_config.thread_queue_limit = thread_queue_limit 65 | if auto_ack_offset is 
False: 66 | consumer_config.auto_ack_offset = auto_ack_offset 67 | if session_timeout > 0: 68 | consumer_config.session_timeout = session_timeout 69 | if max_record_buffer_size > 0: 70 | consumer_config.max_record_buffer_size = max_record_buffer_size 71 | if fetch_limit > 0: 72 | consumer_config.fetch_limit = fetch_limit 73 | 74 | 75 | def process_result(record): 76 | pass 77 | 78 | 79 | def general_consume(): 80 | shard_ids = ["0", "1", "2"] 81 | datahub_consumer = DatahubConsumer(project_name, topic_name, sub_id, consumer_config, shard_ids) 82 | 83 | record_cnt = 0 84 | try: 85 | while True: 86 | record = datahub_consumer.read(timeout=5) 87 | if record is None: 88 | break 89 | # TODO: deal with record data 90 | process_result(record) 91 | record_cnt += 1 92 | if not consumer_config.auto_ack_offset: # ack the record manually if auto_ack_offset is False 93 | record.record_key.ack() 94 | except DatahubException as e: 95 | print("Read record fail. DatahubException: ", e) 96 | finally: 97 | datahub_consumer.close() 98 | 99 | print("Read {} records total".format(record_cnt)) 100 | 101 | 102 | if __name__ == "__main__": 103 | general_consume() 104 | -------------------------------------------------------------------------------- /examples/client/example_general_producer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
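# What this example demonstrates (see the code below):
#   * connection settings are loaded from ./datahub.config.template, and a
#     ProducerConfig is built with retry/thread options overridden only when
#     values are present in the config file;
#   * gen_records() inspects datahub_producer.topic_meta and builds either
#     BlobRecord or TupleRecord instances, filling each tuple field with a
#     sample value that matches its FieldType;
#   * EPOCH_NUM batches of RECORD_NUM records are written synchronously with
#     datahub_producer.write(records), which returns the shard id that
#     received the batch (the loop stops if None is returned);
#   * per-shard record counts are accumulated and printed after the producer
#     is closed in the finally block.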
19 | 20 | 21 | import os.path 22 | import configparser 23 | from datahub.core import DatahubProtocolType 24 | from datahub.exceptions import DatahubException 25 | from datahub.models import BlobRecord, FieldType, TupleRecord, CompressFormat, RecordType 26 | from datahub.client import DatahubProducer, ProducerConfig 27 | 28 | 29 | RECORD_NUM = 5 30 | EPOCH_NUM = 10 31 | 32 | parser = configparser.ConfigParser() 33 | parser.read(filenames=os.path.join("./datahub.config.template")) 34 | 35 | endpoint = parser.get("datahub", "endpoint") 36 | access_id = parser.get("datahub", "access_id") 37 | access_key = parser.get("datahub", "access_key") 38 | project_name = parser.get("datahub", "project_name") 39 | topic_name = parser.get("datahub", "topic_name") 40 | protocol_type = DatahubProtocolType.PB 41 | compress_format = CompressFormat.LZ4 42 | 43 | retry_times = parser.get("common", "retry_times") 44 | retry_times = int(retry_times) if len(retry_times) > 0 else -1 45 | async_thread_limit = parser.get("common", "async_thread_limit") 46 | async_thread_limit = int(async_thread_limit) if len(async_thread_limit) > 0 else -1 47 | thread_queue_limit = parser.get("common", "thread_queue_limit") 48 | thread_queue_limit = int(thread_queue_limit) if len(thread_queue_limit) > 0 else -1 49 | 50 | producer_config = ProducerConfig(access_id, access_key, endpoint) 51 | 52 | if retry_times > 0: 53 | producer_config.retry_times = retry_times 54 | if async_thread_limit > 0: 55 | producer_config.async_thread_limit = async_thread_limit 56 | if thread_queue_limit > 0: 57 | producer_config.thread_queue_limit = thread_queue_limit 58 | 59 | 60 | def gen_blob_record(data): 61 | record = BlobRecord(data) 62 | record.put_attribute("key", "value") 63 | return record 64 | 65 | 66 | def gen_tuple_record(schema): 67 | record = TupleRecord(schema=schema) 68 | for id, field in enumerate(schema.field_list): 69 | if field.type in (FieldType.BOOLEAN, ): 70 | record.set_value(id, True) 71 | elif field.type in (FieldType.DOUBLE, FieldType.FLOAT, ): 72 | record.set_value(id, 1.23) 73 | elif field.type in (FieldType.BIGINT, FieldType.INTEGER, FieldType.SMALLINT, FieldType.TINYINT, ): 74 | record.set_value(id, 123) 75 | elif field.type in (FieldType.STRING, ): 76 | record.set_value(id, "123") 77 | elif field.type in (FieldType.TIMESTAMP, ): 78 | record.set_value(id, 123456789) 79 | elif field.type in (FieldType.DECIMAL, ): 80 | record.set_value(id, 123) 81 | record.put_attribute("key", "value") 82 | return record 83 | 84 | 85 | def gen_records(topic_meta, record_num): 86 | records = [] 87 | if topic_meta.record_type == RecordType.BLOB: 88 | for i in range(record_num): 89 | data = "test_record_{}".format(i) 90 | records.append(gen_blob_record(data)) 91 | else: 92 | for i in range(record_num): 93 | records.append(gen_tuple_record(topic_meta.record_schema)) 94 | return records 95 | 96 | 97 | def general_produce(): 98 | datahub_producer = DatahubProducer(project_name, topic_name, producer_config) 99 | records = gen_records(datahub_producer.topic_meta, RECORD_NUM) 100 | 101 | shard_records = dict() 102 | try: 103 | for i in range(EPOCH_NUM): 104 | shard_id = datahub_producer.write(records) 105 | if shard_id is None: 106 | break 107 | 108 | if shard_id not in shard_records: 109 | shard_records[shard_id] = 0 110 | shard_records[shard_id] += RECORD_NUM 111 | except DatahubException as e: 112 | print("Write record fail. 
DatahubException: ", e) 113 | finally: 114 | datahub_producer.close() 115 | 116 | for shard_id, cnt in shard_records.items(): 117 | print("Write {} records to shard {}".format(cnt, shard_id)) 118 | 119 | 120 | if __name__ == "__main__": 121 | general_produce() 122 | -------------------------------------------------------------------------------- /examples/datahub/blob/blob_topic_pub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | import sys 21 | import traceback 22 | 23 | from datahub import DataHub 24 | from datahub.exceptions import DatahubException, ResourceExistException 25 | from datahub.models import RecordType, BlobRecord 26 | 27 | access_id = '******* your access id *******' 28 | access_key = '******* your access key *******' 29 | endpoint = '******* your endpoint *******' 30 | 31 | dh = DataHub(access_id, access_key, endpoint) 32 | 33 | project_name = 'blob_record_test' 34 | topic_name = 'blob_record_test' 35 | shard_count = 3 36 | life_cycle = 7 37 | 38 | try: 39 | dh.create_project(project_name, 'comment') 40 | print("create project success!") 41 | print("=======================================\n\n") 42 | except ResourceExistException as e: 43 | print("project already exist!") 44 | print("=======================================\n\n") 45 | except Exception: 46 | print(traceback.format_exc()) 47 | sys.exit(-1) 48 | 49 | try: 50 | dh.create_blob_topic(project_name, topic_name, shard_count, life_cycle, 'comment') 51 | print("create topic success!") 52 | print("=======================================\n\n") 53 | except ResourceExistException as e: 54 | print("topic already exist!") 55 | print("=======================================\n\n") 56 | except Exception: 57 | print(traceback.format_exc()) 58 | sys.exit(-1) 59 | 60 | try: 61 | dh.wait_shards_ready(project_name, topic_name) 62 | 63 | topic = dh.get_topic(project_name, topic_name) 64 | print("get topic suc! 
topic=%s" % str(topic)) 65 | if topic.record_type != RecordType.BLOB: 66 | print("topic type illegal!") 67 | sys.exit(-1) 68 | print("=======================================\n\n") 69 | 70 | shards_result = dh.list_shard(project_name, topic_name) 71 | for shard in shards_result.shards: 72 | print(shard) 73 | print("=======================================\n\n") 74 | 75 | records = [] 76 | 77 | data = 'iVBORw0KGgoAAAANSUhEUgAAB5FrTVeMB4wHjAeMBD3nAgEU' 78 | 79 | record0 = BlobRecord(blob_data=data) 80 | record0.shard_id = '0' 81 | records.append(record0) 82 | 83 | record1 = BlobRecord(blob_data=data) 84 | record1.shard_id = '1' 85 | records.append(record1) 86 | 87 | record2 = BlobRecord(blob_data=data) 88 | record2.shard_id = '2' 89 | records.append(record2) 90 | 91 | failed_indices = dh.put_records(project_name, topic_name, records) 92 | print("put blob %d records, failed list: %s" % (len(records), failed_indices)) 93 | print("=======================================\n\n") 94 | 95 | except DatahubException as e: 96 | print(traceback.format_exc()) 97 | sys.exit(-1) 98 | else: 99 | sys.exit(-1) 100 | -------------------------------------------------------------------------------- /examples/datahub/blob/blob_topic_sub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | import sys 21 | import time 22 | import traceback 23 | 24 | from datahub import DataHub 25 | from datahub.exceptions import DatahubException, ResourceExistException 26 | from datahub.models import RecordType, CursorType 27 | 28 | access_id = '******* your access id *******' 29 | access_key = '******* your access key *******' 30 | endpoint = '******* your endpoint *******' 31 | 32 | dh = DataHub(access_id, access_key, endpoint) 33 | 34 | project_name = 'blob_record_test' 35 | topic_name = 'blob_record_test' 36 | shard_count = 3 37 | life_cycle = 7 38 | record_type = RecordType.BLOB 39 | 40 | try: 41 | dh.create_project(project_name, 'comment') 42 | print("create project success!") 43 | print("=======================================\n\n") 44 | except ResourceExistException as e: 45 | print("project already exist!") 46 | print("=======================================\n\n") 47 | except Exception: 48 | print(traceback.format_exc()) 49 | sys.exit(-1) 50 | 51 | try: 52 | topic_result = dh.get_topic(project_name, topic_name) 53 | print("get topic suc! 
topic=%s" % str(topic_result)) 54 | if topic_result.record_type != RecordType.BLOB: 55 | print("topic type illegal!") 56 | sys.exit(-1) 57 | print("=======================================\n\n") 58 | 59 | cursor = dh.get_cursor(project_name, topic_name, '0', CursorType.OLDEST).cursor 60 | index = 0 61 | while True: 62 | get_result = dh.get_blob_records(project_name, topic_name, '0', cursor, 3) 63 | for record in get_result.records: 64 | print("blob data (%d): %s" % (index, record.blob_data)) 65 | index += 1 66 | if 0 == get_result.record_count: 67 | time.sleep(1) 68 | cursor = get_result.next_cursor 69 | 70 | except DatahubException as e: 71 | print(traceback.format_exc()) 72 | sys.exit(-1) 73 | -------------------------------------------------------------------------------- /examples/datahub/consume_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing,r 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
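# What this example demonstrates (see the code below): consuming a TUPLE topic
# with the low-level client while storing progress in a subscription.
#   * init_and_get_subscription_offset() fetches the stored offset (sequence,
#     timestamp, version, session_id) for the shard;
#   * a cursor is obtained with CursorType.SEQUENCE when a valid sequence is
#     stored, otherwise with CursorType.OLDEST;
#   * get_tuple_records() is polled in a loop, the in-memory offset is advanced
#     per record, and every 1000 records the progress is committed back with
#     update_subscription_offset();
#   * OffsetResetException is handled by re-reading the latest offset/version
#     with get_subscription_offset() and requesting a new cursor, while other
#     DatahubException errors terminate (or could be retried).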
19 | 20 | import time 21 | 22 | from datahub import DataHub 23 | from datahub.exceptions import ResourceNotFoundException, InvalidParameterException, DatahubException, \ 24 | InvalidOperationException, OffsetResetException 25 | from datahub.models import CursorType, OffsetWithSession 26 | 27 | endpoint = '' 28 | access_id = '' 29 | access_key = '' 30 | project_name = '' 31 | topic_name = '' 32 | sub_id = '' 33 | shard_id = '0' 34 | shards = [shard_id] 35 | 36 | dh = DataHub(access_id, access_key, endpoint) 37 | 38 | try: 39 | offset_result = dh.init_and_get_subscription_offset(project_name, topic_name, sub_id, shards) 40 | offsets = offset_result.offsets 41 | except ResourceNotFoundException as e: 42 | print(e) 43 | exit(-1) 44 | except InvalidParameterException as e: 45 | print(e) 46 | exit(-1) 47 | except DatahubException as e: 48 | print(e) 49 | exit(-1) # or retry 50 | 51 | offset = offsets.get(shard_id) 52 | try: 53 | if offset.sequence >= 0: 54 | # sequence is valid 55 | sequence = offset.sequence 56 | cursor = dh.get_cursor(project_name, topic_name, shard_id, CursorType.SEQUENCE, sequence).cursor 57 | else: 58 | cursor = dh.get_cursor(project_name, topic_name, shard_id, CursorType.OLDEST).cursor 59 | except DatahubException as e: 60 | print(e) 61 | exit(-1) 62 | 63 | fetch_num = 10 64 | record_count = 0 65 | 66 | try: 67 | schema = dh.get_topic(project_name, topic_name).record_schema 68 | except DatahubException as e: 69 | print(e) 70 | exit(-1) 71 | 72 | while True: 73 | try: 74 | record_result = dh.get_tuple_records(project_name, topic_name, shard_id, schema, cursor, fetch_num) 75 | if record_result.record_count <= 0: 76 | time.sleep(1) 77 | continue 78 | 79 | for record in record_result.records: 80 | record_count += 1 81 | offset.sequence = record.sequence 82 | offset.timestamp = record.system_time 83 | 84 | if record_count % 1000 == 0: 85 | offsets_to_commit = { 86 | shard_id: OffsetWithSession( 87 | offset.sequence, 88 | offset.timestamp, 89 | offset.version, 90 | offset.session_id 91 | ) 92 | } 93 | print(offsets_to_commit) 94 | dh.update_subscription_offset(project_name, topic_name, sub_id, offsets_to_commit) 95 | 96 | cursor = record_result.next_cursor 97 | except OffsetResetException as e: 98 | new_offsets = dh.get_subscription_offset(project_name, topic_name, sub_id).offsets 99 | next_sequence = new_offsets.get(shard_id).sequence 100 | offset.version = new_offsets.get(shard_id).version 101 | cursor = dh.get_cursor(project_name, topic_name, shard_id, CursorType.SEQUENCE, next_sequence).cursor 102 | except InvalidOperationException as e: 103 | print(e) 104 | exit(-1) 105 | except DatahubException as e: 106 | print(e) 107 | exit(-1) # or retry 108 | -------------------------------------------------------------------------------- /examples/datahub/tuple/tuple_topic_pub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing,r 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | import sys 21 | import traceback 22 | 23 | from datahub import DataHub 24 | from datahub.exceptions import DatahubException, ResourceExistException 25 | from datahub.models import RecordType, FieldType, RecordSchema, TupleRecord 26 | 27 | access_id = '******* your access id *******' 28 | access_key = '******* your access key *******' 29 | endpoint = '******* your endpoint *******' 30 | 31 | dh = DataHub(access_id, access_key, endpoint, read_timeout=10) 32 | 33 | project_name = 'tuple_record_test' 34 | topic_name = 'tuple_record_test' 35 | shard_count = 3 36 | life_cycle = 7 37 | record_type = RecordType.TUPLE 38 | record_schema = RecordSchema.from_lists( 39 | ['bigint_field', 'string_field', 'double_field', 'bool_field', 'time_field'], 40 | [FieldType.BIGINT, FieldType.STRING, FieldType.DOUBLE, FieldType.BOOLEAN, FieldType.TIMESTAMP], 41 | ["bigint_comment", "string_comment", "double_comment", "boolean_comment", "timestamp_comment"]) 42 | try: 43 | dh.create_project(project_name, 'comment') 44 | print("create project success!") 45 | print("=======================================\n\n") 46 | except ResourceExistException as e: 47 | print("project already exist!") 48 | print("=======================================\n\n") 49 | except Exception: 50 | print(traceback.format_exc()) 51 | sys.exit(-1) 52 | 53 | try: 54 | dh.create_tuple_topic(project_name, topic_name, shard_count, life_cycle, record_schema, 'comment') 55 | print("create topic success!") 56 | print("=======================================\n\n") 57 | except ResourceExistException as e: 58 | print("topic already exist!") 59 | print("=======================================\n\n") 60 | except Exception: 61 | print(traceback.format_exc()) 62 | sys.exit(-1) 63 | 64 | try: 65 | # block等待所有shard状态ready 66 | dh.wait_shards_ready(project_name, topic_name) 67 | 68 | topic = dh.get_topic(project_name, topic_name) 69 | print("get topic suc! 
topic=%s" % str(topic)) 70 | if topic.record_type != RecordType.TUPLE: 71 | print("topic type illegal!") 72 | sys.exit(-1) 73 | print("=======================================\n\n") 74 | 75 | shards_result = dh.list_shard(project_name, topic_name) 76 | shards = shards_result.shards 77 | for shard in shards: 78 | print(shard) 79 | print("=======================================\n\n") 80 | 81 | while True: 82 | records = [] 83 | 84 | record0 = TupleRecord(schema=topic.record_schema, values=[1, 'yc1', 10.01, True, 1455869335000000]) 85 | record0.shard_id = shards[0].shard_id 86 | record0.put_attribute('AK', '47') 87 | records.append(record0) 88 | 89 | record1 = TupleRecord(schema=topic.record_schema) 90 | record1.values = [1, 'yc1', 10.01, True, 1455869335000000] 91 | record1.shard_id = shards[1].shard_id 92 | records.append(record1) 93 | 94 | record2 = TupleRecord(schema=topic.record_schema) 95 | record2.set_value(0, 3) 96 | record2.set_value(1, 'yc3') 97 | record2.set_value('double_field', 10.03) 98 | record2.set_value('bool_field', False) 99 | record2.set_value('time_field', 1455869335000013) 100 | record2.shard_id = shards[2].shard_id 101 | records.append(record2) 102 | 103 | failed_indexs = dh.put_records(project_name, topic_name, records) 104 | print("put tuple %d records, failed list: %s" % (len(records), failed_indexs)) 105 | # failed_indexs如果非空最好对failed record再进行重试 106 | print("=======================================\n\n") 107 | 108 | 109 | except DatahubException as e: 110 | print(traceback.format_exc()) 111 | sys.exit(-1) 112 | else: 113 | sys.exit(-1) 114 | -------------------------------------------------------------------------------- /examples/datahub/tuple/tuple_topic_sub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
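# What this example demonstrates (see the code below):
#   * the project is created if it does not already exist
#     (ResourceExistException is tolerated);
#   * get_topic() is called and the script exits unless the topic's record
#     type is RecordType.TUPLE;
#   * an OLDEST cursor is fetched for shard '0', then get_tuple_records() is
#     polled in an endless loop, printing each record, sleeping one second
#     whenever no records are returned, and advancing the cursor from
#     next_cursor.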
19 | 20 | import sys 21 | import time 22 | import traceback 23 | 24 | from datahub import DataHub 25 | from datahub.exceptions import DatahubException, ResourceExistException 26 | from datahub.models import RecordType, CursorType 27 | 28 | access_id = '******* your access id *******' 29 | access_key = '******* your access key *******' 30 | endpoint = '******* your endpoint *******' 31 | 32 | dh = DataHub(access_id, access_key, endpoint) 33 | 34 | project_name = 'tuple_record_test' 35 | topic_name = 'tuple_record_test' 36 | 37 | try: 38 | dh.create_project(project_name, 'comment') 39 | print("create project success!") 40 | print("=======================================\n\n") 41 | except ResourceExistException as e: 42 | print("project already exist!") 43 | print("=======================================\n\n") 44 | except Exception: 45 | print(traceback.format_exc()) 46 | sys.exit(-1) 47 | 48 | try: 49 | topic_result = dh.get_topic(project_name, topic_name) 50 | print("get topic suc! topic=%s" % str(topic_result)) 51 | if topic_result.record_type != RecordType.TUPLE: 52 | print("topic type illegal!") 53 | sys.exit(-1) 54 | print("=======================================\n\n") 55 | 56 | cursor = dh.get_cursor(project_name, topic_name, '0', CursorType.OLDEST).cursor 57 | while True: 58 | get_result = dh.get_tuple_records(project_name, topic_name, '0', topic_result.record_schema, cursor, 10) 59 | for record in get_result.records: 60 | print(record) 61 | if 0 == get_result.record_count: 62 | time.sleep(1) 63 | cursor = get_result.next_cursor 64 | 65 | except DatahubException as e: 66 | print(traceback.format_exc()) 67 | sys.exit(-1) 68 | -------------------------------------------------------------------------------- /examples/resources/datahub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/examples/resources/datahub.png -------------------------------------------------------------------------------- /readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # readthedocs.yaml 2 | version: 2 3 | 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.10" 8 | 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | python: 13 | install: 14 | - requirements: requirements.txt 15 | - requirements: docs/requirements.txt 16 | - method: pip 17 | path: . 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | funcsigs>=1.0.2 2 | requests>=2.4.0 3 | simplejson>=3.3.0 4 | six>=1.1.0 5 | enum34>=1.1.5; python_version < '3.4' 6 | crcmod>=1.7 7 | lz4>=2.0.0 8 | cprotobuf>=0.1.9 9 | atomic>=0.7.0 10 | rwlock>=0.0.6 11 | urllib3>=1.26.10 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | import os 20 | import sys 21 | 22 | import setuptools 23 | 24 | repo_root = os.path.dirname(os.path.abspath(__file__)) 25 | 26 | try: 27 | execfile 28 | except NameError: 29 | def execfile(fname, globs, locs=None): 30 | locs = locs or globs 31 | exec(compile(open(fname).read(), fname, "exec"), globs, locs) 32 | 33 | version = sys.version_info 34 | PY2 = version[0] == 2 35 | PY3 = version[0] == 3 36 | 37 | if PY2 and version[:2] < (2, 7): 38 | raise Exception('Datahub Python SDK supports Python 2.7+ (including Python 3.3+).') 39 | 40 | version_ns = {} 41 | execfile(os.path.join(repo_root, 'datahub', 'version.py'), version_ns) 42 | 43 | requirements = [] 44 | with open('requirements.txt') as f: 45 | requirements.extend(f.read().splitlines()) 46 | 47 | 48 | long_description = None 49 | if os.path.exists('README.rst'): 50 | with open('README.rst') as f: 51 | long_description = f.read() 52 | 53 | setuptools.setup( 54 | name='pydatahub', 55 | version=version_ns['__version__'], 56 | keywords='pydatahub, python, aliyun, datahub, sdk', 57 | description='Datahub Python SDK', 58 | long_description=long_description, 59 | author='panjinxing.pjx', 60 | author_email='panjinxing.pjx@alibaba-inc.com', 61 | url='https://github.com/aliyun/aliyun-datahub-sdk-python', 62 | packages=setuptools.find_packages(exclude=('tests', 'examples')), 63 | install_requires=requirements, 64 | license='Apache License 2.0' 65 | ) 66 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest 3 | pytest-cov 4 | coverage 5 | httmock -------------------------------------------------------------------------------- /tests/datahub.ini.template: -------------------------------------------------------------------------------- 1 | [datahub] 2 | access_id = 3 | access_key = 4 | endpoint = 5 | 6 | meter_test_project_name = 7 | meter_test_topic_name = 8 | meter_test_shard_id = 9 | 10 | connector_test_project_name = 11 | connector_test_topic_name = 12 | 13 | [connector] 14 | project_name = 15 | table_name = 16 | odps_endpoint = 17 | tunnel_endpoint = 18 | access_id = 19 | access_key = 20 | type = 21 | column_fields = -------------------------------------------------------------------------------- /tests/fixtures/projects.cursor.topics.invalid_param.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidParameter", 3 | "ErrorMessage": "Invalid time value: ErrorTime" 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.cursor.topics.success.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "Cursor": "20000000000000000000000000000000", 3 | "RecordTime": 0, 4 | "Sequence": 0 5 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.existed.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "ProjectAlreadyExist", 3 | "ErrorMessage": "The specified project already exists." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.existed.topics.existed.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "TopicAlreadyExist", 3 | "ErrorMessage": "The specified topic already exists." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.blob.shards.0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/tests/fixtures/projects.get.topics.blob.shards.0.bin -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.blob.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "NextCursor": "20000000000000000000000000140001", 3 | "RecordCount": 1, 4 | "StartSeq": 0, 5 | "Records": [ 6 | { 7 | "SystemTime": 1526292424292, 8 | "Sequence": 0, 9 | "NextCursor": "20000000000000000000000000140001", 10 | "Data": "iVBORw0KGgoAAAANSUhEUgAAB5FrTVeMB4wHjAeMBD3nAgEU" 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.blob_batch.shards.0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/tests/fixtures/projects.get.topics.blob_batch.shards.0.bin -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.invalid_cursor.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidCursor", 3 | "ErrorMessage": "The cursor is invalid." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.invalid_cursor_batch.shards.0.bin: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidCursor", 3 | "ErrorMessage": "The cursor is invalid." 
4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.tuple.shards.0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/tests/fixtures/projects.get.topics.tuple.shards.0.bin -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.tuple.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "NextCursor": "20000000000000000000000000830010", 3 | "RecordCount": 1, 4 | "StartSeq": 0, 5 | "Records": [ 6 | { 7 | "SystemTime": 1526293795168, 8 | "Sequence": 0, 9 | "NextCursor": "20000000000000000000000000830010", 10 | "Data": [ 11 | "1", 12 | "yc1", 13 | "10.01", 14 | "false", 15 | "1455869335000000" 16 | ], 17 | "Attributes": { 18 | "string": "string" 19 | } 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.get.topics.tuple_batch.shards.0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/tests/fixtures/projects.get.topics.tuple_batch.shards.0.bin -------------------------------------------------------------------------------- /tests/fixtures/projects.json: -------------------------------------------------------------------------------- 1 | { 2 | "ProjectNames": [ 3 | "project_name_1", 4 | "project_name_2", 5 | "project_name_3" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/fixtures/projects.merge.topics.invalid_state.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidShardOperation", 3 | "ErrorMessage": "The specified shard is not active." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.merge.topics.limit_exceeded.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "LimitExceeded", 3 | "ErrorMessage": "Split/Merge shard operation limit exceeded." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.merge.topics.shards_not_adjacent.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidParameter", 3 | "ErrorMessage": "The two shards are not adjacent." 
4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.merge.topics.success.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "BeginHashKey": "00000000000000000000000000000000", 3 | "EndHashKey": "55555555555555555555555555555555", 4 | "ShardId": "2" 5 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.meter.topics.success.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "ActiveTime": 1590206, 3 | "Storage": 0 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.invalid_state.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidShardOperation", 3 | "ErrorMessage": "The specified shard is not active." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.invalid_state_batch.shards.0.bin: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidShardOperation", 3 | "ErrorMessage": "The specified shard is not active." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.limit_exceeded.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "LimitExceeded", 3 | "ErrorMessage": "The query rate or throughput rate is exceeded." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.limit_exceeded_batch.shards.0.bin: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "LimitExceeded", 3 | "ErrorMessage": "The query rate or throughput rate is exceeded." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.malformed.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "MalformedRecord", 3 | "ErrorMessage": "The record is not well-formed." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.malformed_batch.shards.0.bin: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "MalformedRecord", 3 | "ErrorMessage": "The record is not well-formed." 
4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.success.shards.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/tests/fixtures/projects.put.topics.success.shards.bin -------------------------------------------------------------------------------- /tests/fixtures/projects.put.topics.success.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "FailedRecordCount": 0, 3 | "FailedRecords": [] 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.schema.topics.delete.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.schema.topics.get.json: -------------------------------------------------------------------------------- 1 | { 2 | "VersionId": 1, 3 | "CreateTime": 160212718, 4 | "Creator": "", 5 | "RecordSchema": "{\"fields\": [{\"name\": \"bigint_field\", \"type\": \"bigint\"}, {\"name\": \"string_field\", \"type\": \"string\"}, {\"name\": \"double_field\", \"type\": \"double\"}, {\"name\": \"bool_field\", \"type\": \"boolean\"}, {\"name\": \"time_field\", \"type\": \"timestamp\"}]}" 6 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.schema.topics.list.json: -------------------------------------------------------------------------------- 1 | { 2 | "PageCount": 1, 3 | "PageNumber": 1, 4 | "PageSize": 1, 5 | "RecordSchemaList": [ 6 | { 7 | "CreateTime": 1658132195, 8 | "Creator": "", 9 | "RecordSchema": "{\"fields\": [{\"name\": \"bigint_field\", \"type\": \"bigint\"}, {\"name\": \"string_field\", \"type\": \"string\"}, {\"name\": \"double_field\", \"type\": \"double\"}, {\"name\": \"bool_field\", \"type\": \"boolean\"}, {\"name\": \"time_field\", \"type\": \"timestamp\"}]}", 10 | "VersionId": 1 11 | } 12 | ], 13 | "TotalCount": 1 14 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.schema.topics.register.json: -------------------------------------------------------------------------------- 1 | { 2 | "VersionId": 0 3 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.split.topics.default.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "Protocol": "http1.1", 3 | "Interval": 300000, 4 | "Shards": [ 5 | { 6 | "BeginHashKey": "00000000000000000000000000000000", 7 | "BeginKey": "00000000000000000000000000000000", 8 | "Cluster": "AT-ODPS-TEST", 9 | "CreateTime": 1525344044, 10 | "EndHashKey": "55555555555555555555555555555555", 11 | "EndKey": "55555555555555555555555555555555", 12 | "LeftShardId": "4294967295", 13 | "ParentShardIds": [ 14 | "10", 15 | "11" 16 | ], 17 | "RightShardId": "1", 18 | "ShardId": "0", 19 | "State": "ACTIVE", 20 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 21 | } 22 | ], 23 | "NewShards": [ 24 | { 25 | "BeginHashKey": "00000000000000000000000000000000", 26 | "EndHashKey": "16666666666666666666666666666666", 27 | "ShardId": "3" 28 | }, 29 | { 30 | "BeginHashKey": "16666666666666666666666666666666", 31 | "EndHashKey": 
"55555555555555555555555555555555", 32 | "ShardId": "4" 33 | } 34 | ] 35 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.split.topics.invalid_key.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidParameter", 3 | "ErrorMessage": "The key range is invalid." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.split.topics.invalid_state.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidShardOperation", 3 | "ErrorMessage": "The specified shard is not active." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.split.topics.limit_exceeded.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "LimitExceeded", 3 | "ErrorMessage": "Split/Merge shard operation limit exceeded." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.split.topics.success.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "NewShards": [ 3 | { 4 | "BeginHashKey": "00000000000000000000000000000000", 5 | "EndHashKey": "16666666666666666666666666666666", 6 | "ShardId": "3" 7 | }, 8 | { 9 | "BeginHashKey": "16666666666666666666666666666666", 10 | "EndHashKey": "55555555555555555555555555555555", 11 | "ShardId": "4" 12 | } 13 | ] 14 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.success.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "get project", 3 | "CreateTime": 1525312757, 4 | "LastModifyTime": 1525312757 5 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.success.topics.blob.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "blob", 3 | "CreateTime": 1525344044, 4 | "LastModifyTime": 1525344044, 5 | "Lifecycle": 7, 6 | "RecordType": "BLOB", 7 | "ShardCount": 3 8 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.success.topics.json: -------------------------------------------------------------------------------- 1 | { 2 | "TopicNames": [ 3 | "topic_name_1", 4 | "topic_name_2", 5 | "topic_name_3" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/fixtures/projects.success.topics.success.connectors.json: -------------------------------------------------------------------------------- 1 | { 2 | "Connectors": [ 3 | "connector_1", 4 | "connector_2" 5 | ] 6 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.success.topics.success.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "Protocol": "http1.1", 3 | "Interval": 300000, 4 | "Shards": [ 5 | { 6 | "BeginHashKey": "00000000000000000000000000000000", 7 | "BeginKey": "00000000000000000000000000000000", 8 | "Cluster": "AT-ODPS-TEST", 9 | "CreateTime": 1525344044, 10 | "EndHashKey": "55555555555555555555555555555555", 11 | "EndKey": "55555555555555555555555555555555", 12 | "LeftShardId": "4294967295", 13 | "ParentShardIds": [ 
14 | "10", 15 | "11" 16 | ], 17 | "RightShardId": "1", 18 | "ShardId": "0", 19 | "State": "ACTIVE", 20 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 21 | }, 22 | { 23 | "BeginHashKey": "55555555555555555555555555555555", 24 | "BeginKey": "55555555555555555555555555555555", 25 | "Cluster": "AT-ODPS-TEST", 26 | "CreateTime": 1525344044, 27 | "EndHashKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 28 | "EndKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 29 | "LeftShardId": "0", 30 | "ParentShardIds": [], 31 | "RightShardId": "2", 32 | "ShardId": "1", 33 | "State": "CLOSED", 34 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 35 | }, 36 | { 37 | "BeginHashKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 38 | "BeginKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 39 | "Cluster": "AT-ODPS-TEST", 40 | "CreateTime": 1525344044, 41 | "EndHashKey": "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 42 | "EndKey": "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 43 | "LeftShardId": "1", 44 | "ParentShardIds": [], 45 | "RightShardId": "4294967295", 46 | "ShardId": "2", 47 | "State": "OPENING", 48 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 49 | }, 50 | { 51 | "BeginHashKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 52 | "BeginKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 53 | "Cluster": "AT-ODPS-TEST", 54 | "CreateTime": 1525344044, 55 | "EndHashKey": "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 56 | "EndKey": "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 57 | "LeftShardId": "1", 58 | "ParentShardIds": [], 59 | "RightShardId": "4294967295", 60 | "ShardId": "3", 61 | "State": "CLOSING", 62 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 63 | } 64 | ] 65 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.success.topics.tuple.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "tuple", 3 | "CreateTime": 1525312823, 4 | "LastModifyTime": 1525312823, 5 | "Lifecycle": 7, 6 | "RecordSchema": "{\"fields\":[{\"name\":\"bigint_field\",\"type\":\"BIGINT\"},{\"name\":\"string_field\",\"type\":\"STRING\"},{\"name\":\"double_field\",\"type\":\"DOUBLE\"},{\"name\":\"bool_field\",\"type\":\"BOOLEAN\"},{\"name\":\"event_time1\",\"type\":\"TIMESTAMP\"}]}", 7 | "RecordType": "TUPLE", 8 | "ShardCount": 3 9 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.unexisted.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchProject", 3 | "ErrorMessage": "The specified project name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.unexisted.topics.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchProject", 3 | "ErrorMessage": "The specified project name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.unexisted.topics.valid.connectors.sink_odps.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchProject", 3 | "ErrorMessage": "The specified project name does not exist." 
4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.unexisted.topics.valid.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchProject", 3 | "ErrorMessage": "The specified project name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.unexisted.topics.valid.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchProject", 3 | "ErrorMessage": "The specified project name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.unexisted.topics.valid.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchProject", 3 | "ErrorMessage": "The specified project name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.unexisted.topics.valid_batch.shards.0.bin: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchTopic", 3 | "ErrorMessage": "The specified topic does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid.topics.unexisted.connectors.sink_odps.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchTopic", 3 | "ErrorMessage": "The specified topic name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid.topics.unexisted.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchTopic", 3 | "ErrorMessage": "The specified topic name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid.topics.unexisted.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchTopic", 3 | "ErrorMessage": "The specified topic name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid.topics.unexisted.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchTopic", 3 | "ErrorMessage": "The specified topic name does not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid.topics.valid.connectors.sink_odps.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "InvalidParameter", 3 | "ErrorMessage": "The specified parameter is invalid." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid.topics.valid.shards.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchShard", 3 | "ErrorMessage": "ShardId Not Exist. 
Invalid shard id:shard_test_p1524491527_1/meter_test/0" 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid.topics.valid.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchShard", 3 | "ErrorMessage": "Shard meta not exist." 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.valid_batch.topics.unexisted.shards.0.bin: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchTopic", 3 | "ErrorMessage": "The specified topic does not exist."} -------------------------------------------------------------------------------- /tests/fixtures/projects.valid_batch.topics.valid_batch.shards.0.bin: -------------------------------------------------------------------------------- 1 | { 2 | "ErrorCode": "NoSuchShard", 3 | "ErrorMessage": "ShardId Not Exist. Invalid shard id:shard_test_p1524491527_1/meter_test/0" 4 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.wait.topics.ready.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "Protocol": "http1.1", 3 | "Interval": 300000, 4 | "Shards": [ 5 | { 6 | "BeginHashKey": "00000000000000000000000000000000", 7 | "BeginKey": "00000000000000000000000000000000", 8 | "Cluster": "AT-ODPS-TEST", 9 | "CreateTime": 1525344044, 10 | "EndHashKey": "55555555555555555555555555555555", 11 | "EndKey": "55555555555555555555555555555555", 12 | "LeftShardId": "4294967295", 13 | "ParentShardIds": [ 14 | "10", 15 | "11" 16 | ], 17 | "RightShardId": "1", 18 | "ShardId": "0", 19 | "State": "ACTIVE", 20 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 21 | }, 22 | { 23 | "BeginHashKey": "55555555555555555555555555555555", 24 | "BeginKey": "55555555555555555555555555555555", 25 | "Cluster": "AT-ODPS-TEST", 26 | "CreateTime": 1525344044, 27 | "EndHashKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 28 | "EndKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 29 | "LeftShardId": "0", 30 | "ParentShardIds": [], 31 | "RightShardId": "2", 32 | "ShardId": "1", 33 | "State": "ACTIVE", 34 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 35 | } 36 | ] 37 | } -------------------------------------------------------------------------------- /tests/fixtures/projects.wait.topics.unready.shards.json: -------------------------------------------------------------------------------- 1 | { 2 | "Protocol": "http1.1", 3 | "Interval": 300000, 4 | "Shards": [ 5 | { 6 | "BeginHashKey": "00000000000000000000000000000000", 7 | "BeginKey": "00000000000000000000000000000000", 8 | "Cluster": "AT-ODPS-TEST", 9 | "CreateTime": 1525344044, 10 | "EndHashKey": "55555555555555555555555555555555", 11 | "EndKey": "55555555555555555555555555555555", 12 | "LeftShardId": "4294967295", 13 | "ParentShardIds": [ 14 | "10", 15 | "11" 16 | ], 17 | "RightShardId": "1", 18 | "ShardId": "0", 19 | "State": "OPENING", 20 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 21 | }, 22 | { 23 | "BeginHashKey": "55555555555555555555555555555555", 24 | "BeginKey": "55555555555555555555555555555555", 25 | "Cluster": "AT-ODPS-TEST", 26 | "CreateTime": 1525344044, 27 | "EndHashKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 28 | "EndKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 29 | "LeftShardId": "0", 30 | "ParentShardIds": [], 31 
| "RightShardId": "2", 32 | "ShardId": "1", 33 | "State": "CLOSING", 34 | "Worker": "Datahub/XStreamServicexishao4/XStreamBroker@rs3b12026.et2sqa" 35 | } 36 | ] 37 | } -------------------------------------------------------------------------------- /tests/function/test_meter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | import os 20 | import sys 21 | 22 | import pytest 23 | from six.moves import configparser 24 | 25 | from datahub import DataHub 26 | from datahub.exceptions import InvalidOperationException 27 | 28 | current_path = os.path.split(os.path.realpath(__file__))[0] 29 | root_path = os.path.join(current_path, '../..') 30 | 31 | configer = configparser.ConfigParser() 32 | configer.read(os.path.join(current_path, '../datahub.ini')) 33 | access_id = configer.get('datahub', 'access_id') 34 | access_key = configer.get('datahub', 'access_key') 35 | endpoint = configer.get('datahub', 'endpoint') 36 | 37 | project_name = configer.get('datahub', 'meter_test_project_name') 38 | topic_name = configer.get('datahub', 'meter_test_topic_name') 39 | shard_id = configer.get('datahub', 'meter_test_shard_id') 40 | 41 | print("=======================================") 42 | print("access_id: %s" % access_id) 43 | print("access_key: %s" % access_key) 44 | print("endpoint: %s" % endpoint) 45 | print("=======================================\n\n") 46 | 47 | if not access_id or not access_key or not endpoint: 48 | print("[access_id, access_key, endpoint] must be set in datahub.ini!") 49 | sys.exit(-1) 50 | 51 | dh = DataHub(access_id, access_key, endpoint) 52 | 53 | 54 | def clean_topic(datahub_client, project_name, force=False): 55 | topic_names = datahub_client.list_topic(project_name).topic_names 56 | for topic_name in topic_names: 57 | if force: 58 | clean_subscription(datahub_client, project_name, topic_name) 59 | datahub_client.delete_topic(project_name, topic_name) 60 | 61 | 62 | def clean_project(datahub_client, force=False): 63 | project_names = datahub_client.list_project().project_names 64 | for project_name in project_names: 65 | if force: 66 | clean_topic(datahub_client, project_name) 67 | try: 68 | datahub_client.delete_project(project_name) 69 | except InvalidOperationException: 70 | pass 71 | 72 | 73 | def clean_subscription(datahub_client, project_name, topic_name): 74 | subscriptions = datahub_client.list_subscription(project_name, topic_name, '', 1, 100).subscriptions 75 | for subscription in subscriptions: 76 | datahub_client.delete_subscription(project_name, topic_name, subscription.sub_id) 77 | 78 | 79 | @pytest.mark.skipif(project_name == '' or 
topic_name == '' or shard_id == '', reason="meter test shard isn\'t set") 80 | class TestMeter: 81 | 82 | def test_get_metering_info(self): 83 | result = dh.get_metering_info(project_name, topic_name, shard_id) 84 | print(result) 85 | assert result.active_time > 0 86 | assert result.storage >= 0 87 | 88 | 89 | # run directly 90 | if __name__ == '__main__': 91 | test = TestMeter() 92 | test.test_get_metering_info() 93 | -------------------------------------------------------------------------------- /tests/resources/datahub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-datahub-sdk-python/c900aceeb6d921da1d7e38dd00abd0b4b2d3acf3/tests/resources/datahub.png -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | from __future__ import absolute_import 21 | 22 | from .unittest_util import * 23 | -------------------------------------------------------------------------------- /tests/unit/test_schema_unit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # 'License'); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | 20 | from datahub.exceptions import InvalidParameterException 21 | from datahub.models import RecordSchema, FieldType, Field 22 | 23 | 24 | class TestSchema: 25 | 26 | def test_build_schema_success(self): 27 | record_schema_0 = RecordSchema.from_lists( 28 | ['tinyint_field', 'smallint_field', 'integer_field', 'bigint_field', 'string_field', 29 | 'float_field', 'double_field', 'bool_field', 'event_time1'], 30 | [FieldType.TINYINT, FieldType.SMALLINT, FieldType.INTEGER, FieldType.BIGINT, FieldType.STRING, 31 | FieldType.FLOAT, FieldType.DOUBLE, FieldType.BOOLEAN, FieldType.TIMESTAMP], 32 | [False, True, True, True, True, True, True, True, True] 33 | ) 34 | 35 | record_schema_1 = RecordSchema([]) 36 | record_schema_1.add_field(Field('tinyint_field', FieldType.TINYINT, False)) 37 | record_schema_1.add_field(Field('smallint_field', FieldType.SMALLINT)) 38 | record_schema_1.add_field(Field('integer_field', FieldType.INTEGER)) 39 | record_schema_1.add_field(Field('bigint_field', FieldType.BIGINT)) 40 | record_schema_1.add_field(Field('string_field', FieldType.STRING)) 41 | record_schema_1.add_field(Field('float_field', FieldType.FLOAT)) 42 | record_schema_1.add_field(Field('double_field', FieldType.DOUBLE)) 43 | record_schema_1.add_field(Field('bool_field', FieldType.BOOLEAN)) 44 | record_schema_1.add_field(Field('event_time1', FieldType.TIMESTAMP)) 45 | 46 | fields = [] 47 | fields.append(Field('tinyint_field', FieldType.TINYINT, False)) 48 | fields.append(Field('smallint_field', FieldType.SMALLINT)) 49 | fields.append(Field('integer_field', FieldType.INTEGER)) 50 | fields.append(Field('bigint_field', FieldType.BIGINT)) 51 | fields.append(Field('string_field', FieldType.STRING)) 52 | fields.append(Field('float_field', FieldType.FLOAT)) 53 | fields.append(Field('double_field', FieldType.DOUBLE)) 54 | fields.append(Field('bool_field', FieldType.BOOLEAN)) 55 | fields.append(Field('event_time1', FieldType.TIMESTAMP)) 56 | 57 | record_schema_2 = RecordSchema(fields) 58 | 59 | for index in range(0, len(record_schema_0.field_list)): 60 | assert record_schema_0.field_list[index].name == record_schema_1.field_list[index].name 61 | assert record_schema_0.field_list[index].type == record_schema_1.field_list[index].type 62 | assert record_schema_0.field_list[index].allow_null == record_schema_1.field_list[index].allow_null 63 | 64 | assert record_schema_0.field_list[index].name == record_schema_2.field_list[index].name 65 | assert record_schema_0.field_list[index].type == record_schema_2.field_list[index].type 66 | assert record_schema_0.field_list[index].allow_null == record_schema_2.field_list[index].allow_null 67 | 68 | def test_build_schema_with_invalid_type(self): 69 | try: 70 | record_schema_0 = RecordSchema.from_lists( 71 | ['bigint_field', 'string_field', 'double_field', 'bool_field', 'event_time1'], 72 | ['int', FieldType.STRING, FieldType.DOUBLE, FieldType.BOOLEAN, FieldType.TIMESTAMP]) 73 | except InvalidParameterException: 74 | pass 75 | else: 76 | raise Exception('build schema success with wrong field type!') 77 | 78 | try: 79 | record_schema_1 = RecordSchema() 80 | record_schema_1.add_field(Field('string_field', 'str')) 81 | except InvalidParameterException: 82 | pass 83 | else: 84 | raise Exception('build schema success with wrong field type!') 85 | 86 | try: 87 | fields = [] 88 | fields.append(Field('bigint_field', FieldType.BIGINT)) 89 | fields.append(Field('string_field', FieldType.STRING)) 90 | fields.append(Field('double_field', FieldType.DOUBLE)) 91 |
fields.append(Field('bool_field', FieldType.BOOLEAN)) 92 | fields.append(Field('event_time1', 'time')) 93 | except InvalidParameterException: 94 | pass 95 | else: 96 | raise Exception('build schema success with wrong field type!') 97 | 98 | 99 | if __name__ == '__main__': 100 | test = TestSchema() 101 | test.test_build_schema_success() 102 | test.test_build_schema_with_invalid_type() 103 | -------------------------------------------------------------------------------- /tests/unit/unittest_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | import json 21 | import os 22 | 23 | from httmock import urlmatch, response 24 | 25 | from datahub.exceptions import InvalidParameterException 26 | 27 | _TESTS_PATH = os.path.abspath(os.path.dirname(__file__)) 28 | _FIXTURE_PATH = os.path.join(_TESTS_PATH, '../fixtures') 29 | 30 | 31 | def gen_mock_api(check): 32 | @urlmatch(netloc=r'(.*\.)?endpoint') 33 | def datahub_api_mock(url, request): 34 | check(request) 35 | path = url.path.replace('/', '.')[1:] 36 | res_file = os.path.join(_FIXTURE_PATH, '%s.json' % path) 37 | status_code = 200 38 | content = { 39 | } 40 | headers = { 41 | 'Content-Type': 'application/json', 42 | 'x-datahub-request-id': 0 43 | } 44 | try: 45 | with open(res_file, 'rb') as f: 46 | content = json.loads(f.read().decode('utf-8')) 47 | if 'ErrorCode' in content: 48 | status_code = 500 49 | except (IOError, ValueError) as e: 50 | content['ErrorMessage'] = 'Loads fixture %s failed, error: %s' % (res_file, e) 51 | return response(status_code, content, headers, request=request) 52 | 53 | return datahub_api_mock 54 | 55 | 56 | def gen_pb_mock_api(check): 57 | @urlmatch(netloc=r'(.*\.)?endpoint') 58 | def datahub_pb_api_mock(url, request): 59 | check(request) 60 | path = url.path.replace('/', '.')[1:] 61 | res_file = os.path.join(_FIXTURE_PATH, '%s.bin' % path) 62 | status_code = 200 63 | content = { 64 | } 65 | headers = { 66 | 'Content-Type': 'application/x-protobuf', 67 | 'x-datahub-request-id': 0 68 | } 69 | try: 70 | with open(res_file, 'rb') as f: 71 | content = f.read() 72 | except (IOError, InvalidParameterException) as e: 73 | content['ErrorMessage'] = 'Loads fixture %s failed, error: %s' % (res_file, e) 74 | return response(status_code, content, headers, request=request) 75 | 76 | return datahub_pb_api_mock 77 | 78 | 79 | def gen_batch_mock_api(check): 80 | @urlmatch(netloc=r'(.*\.)?endpoint') 81 | def datahub_batch_api_mock(url, request): 82 | check(request) 83 | path = url.path.replace('/', '.')[1:] 84 | res_file = os.path.join(_FIXTURE_PATH, '%s.bin' % path) 85 | status_code = 200 86 |
content = { 87 | } 88 | headers = { 89 | 'Content-Type': 'application/x-binary', 90 | 'x-datahub-request-id': 0 91 | } 92 | # ===> For schema register 93 | if not isinstance(request.body, bytes): 94 | res_file = os.path.join(_FIXTURE_PATH, 'projects.schema.topics.list.json') 95 | headers = { 96 | 'Content-Type': 'application/json', 97 | 'x-datahub-request-id': 0 98 | } 99 | # <=== 100 | try: 101 | with open(res_file, 'rb') as f: 102 | content = f.read() 103 | if b"ErrorCode" in content: 104 | status_code = 404 105 | except (IOError, InvalidParameterException) as e: 106 | content['ErrorMessage'] = 'Loads fixture %s failed, error: %s' % (res_file, e) 107 | return response(status_code, content, headers, request=request) 108 | 109 | return datahub_batch_api_mock 110 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # tox (https://tox.readthedocs.io/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py27, py33, py34, py35, py36, pypy, pypy3 8 | 9 | [testenv] 10 | usedevelop = True 11 | commands = 12 | {envpython} setup.py test 13 | pytest -v --cov 14 | deps = -rtest-requirements.txt 15 | install_command = pip install {opts} {packages} 16 | 17 | [testenv:lint] 18 | deps = pylint 19 | commands = pylint datahub 20 | 21 | [testenv:pep8] 22 | commands = flake8 23 | 24 | [flake8] 25 | # H405: multi line docstring summary not separated with an empty line 26 | ignore = H405 27 | show-source = True 28 | exclude = .venv,.tox,dist,doc,*egg,build, 29 | --------------------------------------------------------------------------------
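A minimal usage sketch (not a file in the repository) showing how the gen_mock_api helper from tests/unit/unittest_util.py can drive a unit test. It assumes the client is constructed against the literal endpoint 'http://endpoint' so that requests match the urlmatch(netloc=r'(.*\.)?endpoint') pattern above, that the helper is importable as shown, and that GET /projects is answered from tests/fixtures/projects.json (which lists three project names).

# Hypothetical sketch, not part of the repository; import path and endpoint are assumptions.
from httmock import HTTMock

from datahub import DataHub
from tests.unit.unittest_util import gen_mock_api

# The mock only intercepts hosts matching "endpoint", so point the client there.
dh = DataHub('access_id', 'access_key', 'http://endpoint')


def check(request):
    # Called on the outgoing request before the fixture response is returned.
    assert request.method == 'GET'


with HTTMock(gen_mock_api(check)):
    # "/projects" maps to the fixture tests/fixtures/projects.json, which lists three names.
    result = dh.list_project()
    assert len(result.project_names) == 3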