├── ps_stream
│   ├── __init__.py
│   ├── cli
│   │   ├── __init__.py
│   │   └── main.py
│   ├── publisher.yml
│   ├── utils.py
│   ├── collector.py
│   └── publisher.py
├── requirements.txt
├── NOTICE
├── Dockerfile
├── setup.py
├── .gitignore
├── docker-compose.yml
└── README.md

/ps_stream/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/ps_stream/cli/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-e .
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
This product includes derivations of software developed by
Docker, Inc., used under the Apache License Version 2.0.
--------------------------------------------------------------------------------
/ps_stream/publisher.yml:
--------------------------------------------------------------------------------
message_keys:
  PROJECT: '{PROJECT_ID}'
  PROJ_ACTIVITY: '{PROJECT_ID}'
  CURRENCY_CD_TBL: '{CURRENCY_CD}'
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ucalgary/python-librdkafka:3.7.0-0.11.6

RUN mkdir -p /usr/src/app
WORKDIR /usr/src/app

COPY setup.py /usr/src/app
COPY ps_stream /usr/src/app/ps_stream
RUN apk add --no-cache --virtual .build-deps \
        gcc \
        git \
        musl-dev && \
    python setup.py install && \
    apk del .build-deps

ENTRYPOINT ["/usr/local/bin/ps-stream"]
CMD ["--help"]

LABEL maintainer="King Chung Huang" \
      org.label-schema.vcs-url="https://github.com/ucalgary/ps-stream"
--------------------------------------------------------------------------------
/ps_stream/utils.py:
--------------------------------------------------------------------------------
def element_text(element):
    value = element.text
    if value:
        value = value.strip()  # str.strip() returns a new string; assign it back
    return value


def element_to_obj(element, map_class=dict, value_f=element_text, wrap_value=True):
    value = None

    if len(element) > 0:
        child_values = map(lambda e: (e.tag.split('}', 1)[-1], element_to_obj(
            e, map_class=map_class, value_f=value_f, wrap_value=False)), element)
        value = map_class(child_values)
    else:
        value = value_f(element)

    if wrap_value:
        value = {element.tag.split('}', 1)[-1]: value}
    return value
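

# Illustrative example (not part of the original module): element_to_obj turns a
# rowset fragment into nested dicts, dropping any XML namespace prefix from tags.
#
#   >>> from xml.etree import ElementTree
#   >>> e = ElementTree.fromstring('<PROJECT><PROJECT_ID> 42 </PROJECT_ID></PROJECT>')
#   >>> element_to_obj(e)
#   {'PROJECT': {'PROJECT_ID': '42'}}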
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pkg_resources
from setuptools import find_packages
from setuptools import setup


install_requires = [
    'docopt==0.6.2',
    'Twisted==16.6.0',
    'PyYAML==3.12',
    'pytz==2016.10',
    'confluent-kafka==0.11.5',
    'ujson==1.35',
    'docopt_utils==0.0.0'
]

dependency_links = [
    'https://github.com/ucalgary/docopt-utils/archive/master.zip#egg=docopt_utils-0.0.0',
]


setup(
    name='ps_stream',
    description='Process PeopleSoft sync messages into logical streams',
    author='King Chung Huang',
    packages=find_packages(),
    package_data={
        '': ['*.yml']
    },
    install_requires=install_requires,
    dependency_links=dependency_links,
    entry_points="""
    [console_scripts]
    ps-stream=ps_stream.cli.main:main
    """,
    zip_safe=True
)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3'

services:
  zookeeper:
    image: confluentinc/cp-zookeeper:3.3.0
    hostname: zookeeper
    environment:
      - ZOOKEEPER_CLIENT_PORT=2181
    networks:
      - streaming

  kafka:
    image: confluentinc/cp-kafka:3.3.0
    hostname: kafka
    environment:
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
    networks:
      - streaming
    depends_on:
      - zookeeper

  kafka-rest:
    image: confluentinc/cp-kafka-rest:3.3.0
    hostname: kafka-rest
    environment:
      - ACCESS_CONTROL_ALLOW_ORIGIN_DEFAULT="*"
      - KAFKA_REST_SCHEMA_REGISTRY_URL=http://schema-registry:8081
      - KAFKA_REST_ZOOKEEPER_CONNECT=zookeeper:2181
      - KAFKA_REST_HOST_NAME=kafka-rest
      - KAFKA_REST_LISTENERS=http://kafka-rest:8082
      - KAFKA_REST_BOOTSTRAP_SERVERS=PLAINTEXT://kafka:9092
    networks:
      - streaming
    depends_on:
      - zookeeper
      - kafka

  schema-registry:
    image: confluentinc/cp-schema-registry:3.3.0
    hostname: schema-registry
    environment:
      - SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:2181
      - SCHEMA_REGISTRY_HOST_NAME=schema-registry
      - SCHEMA_REGISTRY_LISTENERS=http://schema-registry:8081
    networks:
      - streaming
    depends_on:
      - zookeeper
      - kafka

  ps-collector:
    image: ucalgary/ps-stream
    command:
      - collect
    ports:
      - 8000:8000
    networks:
      - streaming

  ps-publisher:
    image: ucalgary/ps-stream
    command:
      - publish
    networks:
      - streaming

  kafka-topics-ui:
    image: landoop/kafka-topics-ui:0.9.2
    environment:
      - KAFKA_REST_PROXY_URL=http://kafka-rest:8082
      - PROXY=true
    ports:
      - 8001:8000
    networks:
      - streaming

networks:
  streaming:
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Process PeopleSoft Sync Messages into Kafka Topics

[![](https://images.microbadger.com/badges/image/ucalgary/ps-stream.svg)](https://microbadger.com/images/ucalgary/ps-stream) [![Anchore Image Overview](https://anchore.io/service/badges/image/a26f2562a708b063d8bf1e0f685f0b2bc75bde1725a787588d1be531b23f06ff)](https://anchore.io/image/dockerhub/a26f2562a708b063d8bf1e0f685f0b2bc75bde1725a787588d1be531b23f06ff?repo=ucalgary%2Fps-stream&tag=latest)

`ps-stream` is a Python utility that collects and parses [PeopleSoft rowset-based messages](http://docs.oracle.com/cd/E66686_01/pt855pbr1/eng/pt/tibr/concept_PeopleSoftRowset-BasedMessageFormat-0764fb.html) generated by sync and fullsync services into Kafka messages and topics. PeopleSoft sync processes are normally used to synchronize data between PeopleSoft applications, but they can also be used to generate an externalized stream of PeopleSoft objects for streaming data pipelines.

There are two major commands in `ps-stream`.

**`collect`** accepts PeopleSoft rowset-based messages over HTTP or HTTPS and produces a Kafka message for each transaction in the PeopleSoft message, stored in one or more Kafka topics.

**`publish`** consumes the transaction messages stored by `collect` and produces Kafka messages in topics with `KTable` semantics: each record in the resulting stream is keyed by the primary key or identifier of the underlying PeopleSoft record.

## Running a ps-stream container

Collect PeopleSoft sync messages.

```
$ docker run -p 8000:8000 -d ucalgary/ps-stream collect
```

## Test Drive

You can quickly deploy ps-stream on Play with Docker, a community-run Docker playground, by clicking the following button.

[![Try in PWD](https://cdn.rawgit.com/play-with-docker/stacks/cff22438/assets/images/button.png)](http://play-with-docker.com?stack=https://raw.githubusercontent.com/ucalgary/ps-stream/master/docker-compose.yml&stack_name=ps-stream)

After starting the demo, try `POST`ing a PeopleSoft rowset-based message to the collector on port 8000, then use kafka-topics-ui on port 8001 to see the message in a Kafka topic.
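
The sketch below shows one way to exercise the collector locally. The payload is a deliberately minimal, made-up rowset-style document (real PeopleSoft messages carry more fields and PSCAMA metadata), and it assumes the `requests` package is installed and a collector is listening on port 8000.

```python
import requests

body = """<?xml version="1.0"?>
<PROJECT_FULLSYNC>
  <FieldTypes>
    <PROJECT class="R">
      <PROJECT_ID type="CHAR"/>
    </PROJECT>
  </FieldTypes>
  <MsgData>
    <Transaction>
      <PROJECT class="R">
        <PROJECT_ID>000000000042</PROJECT_ID>
      </PROJECT>
      <PSCAMA class="R">
        <AUDIT_ACTN>A</AUDIT_ACTN>
      </PSCAMA>
    </Transaction>
  </MsgData>
</PROJECT_FULLSYNC>"""

response = requests.post(
    'http://localhost:8000/',
    data=body.encode('utf-8'),
    headers={
        'TransactionID': 'demo-0001',
        'OrigTimeStamp': '2017-09-01T12:00:00.000000-0600',
        'DataChunk': '1',
        'DataChunkCount': '1',
        'MessageName': 'PROJECT_FULLSYNC',
    })
print(response.text)  # {"status":"POST ok"}
```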

## Maintenance

This repository and image are currently maintained by the Research Management Systems project at the [University of Calgary](http://www.ucalgary.ca/).
--------------------------------------------------------------------------------
/ps_stream/cli/main.py:
--------------------------------------------------------------------------------
import logging
import sys

from .. import collector
from .. import publisher

from docopt_utils.dispatcher import dispatch


log = logging.getLogger(__name__)


def main():
    def set_logging_level(handler, options):
        logging.basicConfig(level=logging.DEBUG if options['--verbose'] else logging.INFO)

    command_classes = {'__root__': PSStreamCommand}
    dispatch(command_classes, env='PSSTREAM', before_f=set_logging_level)


class PSStreamCommand(object):
    """Process PeopleSoft sync messages into Kafka topics.

    Usage:
      ps-stream [--kafka=HOSTS]...
                [--verbose]
                [COMMAND] [ARGS...]
      ps-stream -h|--help

    Options:
      -k, --kafka HOSTS    Kafka bootstrap hosts [default: kafka:9092]
      --verbose            Show more output

    Commands:
      collect    Collect PeopleSoft sync messages
      config     Validate and view the collector config
      publish    Parse transaction messages into record streams
    """

    def collect(self, options):
        """Collect PeopleSoft sync and fullsync messages.

        Usage: collect [--port=PORT] [--target-prefix=PREFIX] [--target-topic=TOPIC]
                       [--accept-from=NAMES]...
                       [--accept-to=NAMES]...
                       [--accept-messagename=NAMES]...

        Options:
          --port PORT                  Port to listen to messages on [default: 8000]
          --accept-from NAMES          Accepted values for the From header
          --accept-to NAMES            Accepted values for the To header
          --accept-messagename NAMES   Accepted values for the MessageName header
          --target-prefix PREFIX       Prefix name for target topic [default: ps]
          --target-topic TOPIC         Topic to write transactions to [default: transactions]
        """
        config = kafka_config_from_options(options)

        collector.collect(
            config,
            topic=prefix_topics(options['--target-prefix'], options['--target-topic']),
            port=int(options['--port']),
            senders=options['--accept-from'],
            recipients=options['--accept-to'],
            message_names=options['--accept-messagename'])
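
    # Illustrative invocation (all flags are defined in the docstrings above):
    #
    #   ps-stream --kafka broker1:9092 collect --accept-messagename PROJECT_FULLSYNC
    #
    # accepts only messages whose MessageName header is PROJECT_FULLSYNC and writes
    # each transaction to the ps.transactions topic on broker1.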

    def config(self, options):
        """Validate and view the collector config.

        Usage: config
        """
        pass

    def publish(self, options):
        """Parse transaction messages into record streams.

        Usage: publish [--source-prefix=PREFIX] [--source-topic=NAME]...
                       [--target-prefix=PREFIX] [--target-topic=NAME]
                       [options]

        Options:
          --source-prefix PREFIX     Prefix string for source topics [default: ps]
          --source-topic NAME        Topics to consume transactions from [default: transactions]
          --target-prefix PREFIX     Prefix name for target topics [default: ps]
          --target-topic NAME        Topic to write records to, defaults to the record type
          --consumer-group GROUP     Kafka consumer group name [default: ps-stream]
        """
        config = kafka_config_from_options(options)

        publisher.publish(
            config,
            source_topics=prefix_topics(options['--source-prefix'], options['--source-topic']),
            target_topic=prefix_topics(options['--target-prefix'], options['--target-topic']),
            target_prefix=options['--target-prefix'])


def kafka_config_from_options(options):
    config = dict()

    if '--kafka' in options:
        config['bootstrap.servers'] = ','.join(options['--kafka'])
    if '--consumer-group' in options:
        config['group.id'] = options['--consumer-group']

    return config


def prefix_topics(prefix, topics):
    if not topics:
        return topics
    if prefix:
        if not isinstance(topics, str):
            return [f'{prefix}.{topic}' for topic in topics]
        else:
            return f'{prefix}.{topics}'
    return topics
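

# Illustrative behaviour of prefix_topics (not part of the original module):
#
#   prefix_topics('ps', 'transactions')    -> 'ps.transactions'
#   prefix_topics('ps', ['transactions'])  -> ['ps.transactions']
#   prefix_topics(None, 'transactions')    -> 'transactions'
#   prefix_topics('ps', None)              -> None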
--------------------------------------------------------------------------------
/ps_stream/collector.py:
--------------------------------------------------------------------------------
import logging
import pytz
from datetime import datetime
from xml.etree import ElementTree

import ujson as json
from confluent_kafka import Producer
from twisted.internet import endpoints, reactor
from twisted.web import resource, server

from .utils import element_to_obj


log = logging.getLogger(__name__)


class PSStreamCollector(resource.Resource):

    isLeaf = True

    def __init__(self, producer, topic=None, authorize_f=None):
        super().__init__()
        self.producer = producer
        self.topic = topic
        self.authorize_f = authorize_f

    def render_GET(self, request):
        return '{"status":"GET ok"}'.encode('utf-8')

    def render_POST(self, request):
        """Decode PeopleSoft rowset-based messages into transactions, and produce Kafka
        messages for each transaction. PeopleSoft is expected to POST messages as events
        occur via SYNC and FULLSYNC services.

        The following URL describes the PeopleSoft Rowset-Based Message Format.
        http://docs.oracle.com/cd/E66686_01/pt855pbr1/eng/pt/tibr/concept_PeopleSoftRowset-BasedMessageFormat-0764fb.html
        """
        if self.authorize_f and not self.authorize_f(request):
            request.setResponseCode(403, message='Forbidden')
            log.info('Unauthorized message received')
            log.debug('To: {}, From: {}, MessageName: {}'.format(
                request.getHeader('To'),
                request.getHeader('From'),
                request.getHeader('MessageName')))
            return 'Message not accepted by collector.'.encode('utf-8')

        assert(request.getHeader('DataChunk') == '1')
        assert(request.getHeader('DataChunkCount') == '1')

        psft_message_name = None
        field_types = None

        transaction_id = request.getHeader('TransactionID')
        orig_time_stamp = request.getHeader('OrigTimeStamp')

        # Parse the root element for the PeopleSoft message name and FieldTypes
        request.content.seek(0, 0)
        for event, e in ElementTree.iterparse(request.content, events=('start', 'end')):
            if event == 'start' and psft_message_name is None:
                psft_message_name = e.tag.split('}', 1)[-1]
            elif event == 'end' and e.tag.split('}', 1)[-1] == 'FieldTypes':
                field_types = element_to_obj(e, value_f=field_type)
                break

        # Rescan for transactions, removing read elements to reduce memory usage
        transaction_index = 1
        request.content.seek(0, 0)
        for event, e in ElementTree.iterparse(request.content, events=('end',)):
            if e.tag.split('}', 1)[-1] == 'Transaction':
                transaction = ElementTree.tostring(e, encoding='unicode')
                message = {
                    'TransactionID': transaction_id,
                    'TransactionIndex': transaction_index,
                    'OrigTimeStamp': orig_time_stamp,
                    'CollectTimeStamp': datetime.now(pytz.utc).astimezone().isoformat(),
                    'Transaction': transaction
                }
                message_str = json.dumps(message)
                self.producer.produce(self.topic, message_str, transaction_id)
                e.clear()
                transaction_index += 1
        self.producer.flush()

        return '{"status":"POST ok"}'.encode('utf-8')


def collect(config, topic=None, port=8000, senders=None, recipients=None, message_names=None):
    def authorize_request(request):
        # Each accept list guards its own header: senders filters From, recipients
        # filters To, and message_names filters MessageName.
        if senders and request.getHeader('From') not in senders:
            return False
        if recipients and request.getHeader('To') not in recipients:
            return False
        if message_names and request.getHeader('MessageName') not in message_names:
            return False
        return True

    producer = Producer(config)
    collector = PSStreamCollector(producer, topic=topic, authorize_f=authorize_request)
    site = server.Site(collector)
    endpoint = endpoints.TCP4ServerEndpoint(reactor, int(port))
    endpoint.listen(site)
    log.info(f'Listening for connections on port {port}')
    reactor.run()


def field_type(element):
    assert('type' in element.attrib)
    return element.attrib.get('type')
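

# Shape of the JSON value produced to Kafka for each transaction, with illustrative
# values; the message key is the TransactionID header and the Transaction field
# carries the raw XML fragment:
#
#   {
#     "TransactionID": "demo-0001",
#     "TransactionIndex": 1,
#     "OrigTimeStamp": "2017-09-01T12:00:00.000000-0600",
#     "CollectTimeStamp": "2017-09-01T12:00:05-06:00",
#     "Transaction": "<Transaction>...</Transaction>"
#   }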
--------------------------------------------------------------------------------
/ps_stream/publisher.py:
--------------------------------------------------------------------------------
import logging
import pkg_resources
import signal
import sys
from difflib import SequenceMatcher
from xml.etree import ElementTree

import ujson as json
import yaml
from confluent_kafka import Consumer, Producer
from confluent_kafka import KafkaError

from .utils import element_to_obj


log = logging.getLogger(__name__)


key_formats_by_record_type = yaml.load(
    pkg_resources.resource_stream(__name__, 'publisher.yml'))['message_keys']


class PSStreamPublisher(object):

    def __init__(self, consumer, producer,
                 source_topics=None, target_topic=None, target_prefix=None):
        super().__init__()
        self.consumer = consumer
        self.producer = producer
        self.source_topics = source_topics
        self.target_topic = target_topic
        self.target_prefix = target_prefix
        self.running = True

    def run(self):
        '''Process transactions from the source topics and publish
        messages representing a stream of PeopleSoft rows organized
        by record name.
        '''
        signal.signal(signal.SIGINT, self.terminate)
        signal.signal(signal.SIGTERM, self.terminate)

        self.consumer.subscribe(self.source_topics)

        while self.running:
            message = self.consumer.poll(timeout=5)

            if not message:
                continue
            elif not message.error():
                transaction = json.loads(message.value().decode('utf-8'))

                for topic, key, value in self.messages_from_transaction(transaction):
                    self.producer.produce(topic, value, key)
            elif message.error().code() != KafkaError._PARTITION_EOF:
                print(message.error())
                self.running = False

        self.terminate()

    def terminate(self, signum=None, frame=None):
        # Accept the (signum, frame) arguments passed by the signal module so this
        # method can double as a signal handler.
        log.info('Terminating')
        self.consumer.close()
        self.producer.flush()
        sys.exit(0)

    def messages_from_transaction(self, transaction, key_serde=json.dumps, value_serde=json.dumps):
        transaction['Transaction'] = element_to_obj(
            ElementTree.fromstring(transaction['Transaction']), wrap_value=False)

        audit_actn = transaction['Transaction']['PSCAMA']['AUDIT_ACTN']
        if audit_actn is not None and audit_actn not in ('A', 'C', 'D', 'K', 'N', 'O'):
            log.info('Invalid AUDIT_ACTN received')
            log.debug(transaction)
            return

        for record_type, record_data in transaction['Transaction'].items():
            if record_type == 'PSCAMA':
                continue
            topic = self.topic_for_record(record_type, record_data)
            key = self.key_for_record(record_type, record_data)
            value = audit_actn in (None, 'A', 'C') and record_data or None
            log.debug(f'Producing to topic {topic} with key {key}')
            if key and key_serde:
                key = key_serde(key)
            if value and value_serde:
                value = value_serde(value)
            yield topic, key, value

    def topic_for_record(self, record_type, record_data):
        if self.target_topic:
            return self.target_topic
        elif self.target_prefix:
            return f'{self.target_prefix}.{record_type}'
        return record_type

    def key_for_record(self, record_type, record_data, guess=False):
        key_format = key_formats_by_record_type.get(record_type, None)
        if not key_format and guess:
            keys = record_data.keys()
            keys = sorted(keys,
                          key=lambda x: SequenceMatcher(a=record_type, b=x).ratio(),
                          reverse=True)
            key_attribute = keys[0]
            key_format = '{%s}' % key_attribute
            key_formats_by_record_type[record_type] = key_format
        return key_format and key_format.format(**record_data)
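

# Illustrative key construction (record type and values are made up):
#
#   key_formats_by_record_type['PROJECT'] == '{PROJECT_ID}'
#   key_for_record('PROJECT', {'PROJECT_ID': '000001', 'DESCR': 'Example'}) -> '000001'
#
# For record types missing from publisher.yml, key_for_record(..., guess=True) falls
# back to the field whose name is most similar to the record type (ranked with
# difflib.SequenceMatcher) and caches a new format string for it.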


def publish(config, source_topics=None, target_topic=None, target_prefix=None):
    consumer_config = {**config, **{
        'default.topic.config': {
            'auto.offset.reset': 'smallest',
            'auto.commit.interval.ms': 5000
        }
    }}
    producer_config = config
    consumer = Consumer(consumer_config)
    producer = Producer(producer_config)
    publisher = PSStreamPublisher(
        consumer, producer,
        source_topics=source_topics, target_topic=target_topic, target_prefix=target_prefix)
    log.info(f'Reading transactions from {source_topics}')
    publisher.run()
--------------------------------------------------------------------------------