├── logs └── .keep ├── ecosystem ├── tests ├── __init__.py ├── tools │ ├── __init__.py │ ├── introspector │ │ └── __init__.py │ ├── _glob_util_test.py │ ├── schema_ref_test.py │ ├── meteorite_wrappers_test.py │ ├── sensu_ttl_alerter_test.py │ ├── meteorite_gauge_manager_test.py │ └── sensu_alert_manager_test.py ├── benchmarks │ ├── __init__.py │ ├── _fast_uuid_test.py │ ├── envelope_test.py │ ├── producer_test.py │ ├── logging_test.py │ └── message_test.py ├── consumer │ └── __init__.py ├── factories │ ├── __init__.py │ └── base_factory.py ├── helpers │ ├── __init__.py │ ├── mock_utils.py │ ├── config.py │ └── decorators_test.py ├── schematizer_clientlib │ └── __init__.py ├── team_test.py ├── initialization_vector_test.py ├── _fast_uuid_test.py ├── client_test.py ├── _retry_util_test.py └── envelope_test.py ├── .ruby-version ├── debian ├── compat ├── .gitignore ├── control ├── data-pipeline-tools.links └── rules ├── data_pipeline ├── tools │ ├── .ipython │ ├── __init__.py │ ├── introspector │ │ ├── __init__.py │ │ ├── info │ │ │ ├── __init__.py │ │ │ ├── topic.py │ │ │ ├── namespace.py │ │ │ └── source.py │ │ ├── register │ │ │ ├── __init__.py │ │ │ ├── base_command.py │ │ │ ├── mysql_command.py │ │ │ └── avro_command.py │ │ ├── list_command │ │ │ ├── __init__.py │ │ │ ├── base_command.py │ │ │ ├── topics.py │ │ │ ├── namespaces.py │ │ │ └── sources.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── schema.py │ │ │ ├── source.py │ │ │ ├── namespace.py │ │ │ ├── base.py │ │ │ └── topic.py │ │ ├── register_command_parser.py │ │ ├── info_command_parser.py │ │ ├── list_command_parser.py │ │ ├── main.py │ │ └── schema_check_command.py │ ├── _glob_util.py │ ├── sensu_ttl_alerter.py │ ├── meteorite_gauge_manager.py │ ├── heartbeat_periodic_processor.py │ └── binlog_analyzer.py ├── data_pipeline.py ├── helpers │ ├── __init__.py │ ├── lists.py │ ├── log.py │ ├── singleton.py │ ├── frozendict_json_encoder.py │ ├── decorators.py │ └── yelp_avro_store.py ├── servlib │ └── 
__init__.py ├── testing_helpers │ ├── __init__.py │ ├── docker-compose.yml │ ├── docker-compose-opensource.yml │ └── kafka_docker.py ├── schematizer_clientlib │ ├── __init__.py │ └── models │ │ ├── __init__.py │ │ ├── model_base.py │ │ ├── data_source_type_enum.py │ │ ├── target_schema_type_enum.py │ │ ├── namespace.py │ │ ├── meta_attr_source_mapping.py │ │ ├── meta_attr_namespace_mapping.py │ │ ├── data_target.py │ │ ├── consumer_group_data_source.py │ │ ├── source.py │ │ ├── consumer_group.py │ │ └── avro_schema_element.py ├── schemas │ ├── initialization_vector_v1.avsc │ ├── monitoring_message_v1.avsc │ ├── registration_message_v1.avsc │ └── envelope_v1.avsc ├── __init__.py ├── environment_configs.py ├── publish_guarantee.py ├── _clog_writer.py ├── _consumer_tick.py ├── initialization_vector.py ├── team.py ├── expected_frequency.py ├── message_type.py └── _kafka_util.py ├── setup.cfg ├── docs ├── history.rst ├── usage.rst ├── contributing.rst ├── installation.rst └── index.rst ├── requirements.d ├── pre_commit.txt ├── tools.txt ├── dev.txt └── dev-internal.txt ├── zookeeper_discoverydevc.yaml ├── Gemfile ├── teams.yaml ├── bin ├── data_pipeline_tailer ├── data_pipeline_introspector ├── data_pipeline_compaction_setter ├── data_pipeline_refresh_manager ├── data_pipeline_refresh_requester └── data_pipeline_refresh_runner ├── docker-compose.yml ├── .rat-excludes ├── .dockerignore ├── NOTICE ├── travis.yml ├── MANIFEST.in ├── .travis.yml ├── Procfile ├── HISTORY.rst ├── USAGE.rst ├── yelp_package ├── dockerfiles │ ├── lucid │ │ └── Dockerfile │ └── trusty │ │ └── Dockerfile ├── Makefile └── itest │ └── ubuntu.sh ├── .coveragerc ├── run_guard.sh ├── .pre-commit-config.yaml ├── setup_bundles.sh ├── .gitignore ├── Gemfile.lock ├── Dockerfile ├── Makefile-opensource ├── tox-opensource.ini ├── Guardfile ├── requirements.txt ├── tox.ini ├── Makefile ├── key-1.key └── CONTRIBUTING.rst /logs/.keep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ecosystem: -------------------------------------------------------------------------------- 1 | devc 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.2.2 2 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 7 2 | -------------------------------------------------------------------------------- /tests/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/tools/.ipython: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/consumer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/factories/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/helpers/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/data_pipeline.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/servlib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 -------------------------------------------------------------------------------- /tests/tools/introspector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/testing_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/schematizer_clientlib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /data_pipeline/tools/introspector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../USAGE.rst 2 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/info/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/register/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.d/pre_commit.txt: -------------------------------------------------------------------------------- 1 | pre-commit>=0.4.2 2 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/list_command/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /zookeeper_discoverydevc.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | - - zk 3 | - 2181 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'foreman' 4 | gem 'guard' 5 | -------------------------------------------------------------------------------- /debian/.gitignore: -------------------------------------------------------------------------------- 1 | data-pipeline-tools.substvars 2 | data-pipeline-tools/ 3 | files 4 | -------------------------------------------------------------------------------- /requirements.d/tools.txt: -------------------------------------------------------------------------------- 1 | psutil==4.2.0 2 | sqlparse 3 | yelp_batch 4 | yelp_clog==2.5.2 5 | yelp_conn 6 | -------------------------------------------------------------------------------- /teams.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | teams: 3 | bam: 4 | email: team1@yelp.com 5 | team2: 6 | email: team2@yelp.com 7 | -------------------------------------------------------------------------------- /bin/data_pipeline_tailer: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from data_pipeline.tools.tailer import Tailer 3 | 4 | Tailer().start() 5 | -------------------------------------------------------------------------------- /bin/data_pipeline_introspector: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from data_pipeline.tools.introspector.main import run 3 | 4 | run() 5 | -------------------------------------------------------------------------------- 
/docker-compose.yml: -------------------------------------------------------------------------------- 1 | pypy: 2 | build: . 3 | links: 4 | - zookeeper:zk 5 | - kafka:kafka 6 | - schematizer:schematizer 7 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | At the command line either via pip:: 6 | 7 | $ pip install data_pipeline 8 | -------------------------------------------------------------------------------- /bin/data_pipeline_compaction_setter: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from data_pipeline.tools.compaction_setter import CompactionSetter 3 | 4 | CompactionSetter().start() 5 | -------------------------------------------------------------------------------- /bin/data_pipeline_refresh_manager: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from data_pipeline.tools.refresh_manager import FullRefreshManager 3 | 4 | FullRefreshManager().start() 5 | -------------------------------------------------------------------------------- /.rat-excludes: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | venv 4 | .tox 5 | data_pipeline.egg-info/ 6 | __pycache__ 7 | .cache/ 8 | logs 9 | debian 10 | .pip 11 | .distlib 12 | bin 13 | -------------------------------------------------------------------------------- /bin/data_pipeline_refresh_requester: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from data_pipeline.tools.refresh_requester import FullRefreshRequester 3 | 4 | FullRefreshRequester().start() 5 | -------------------------------------------------------------------------------- /bin/data_pipeline_refresh_runner: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from data_pipeline.tools.copy_table_to_blackhole_table import FullRefreshRunner 3 | 4 | FullRefreshRunner().start() 5 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | .tox 4 | venv 5 | virtualenv_run 6 | build 7 | dist 8 | *.pyc 9 | *.log 10 | *.swp 11 | Dockerfile 12 | docker-compose.yml 13 | .docker 14 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Apache Data Pipeline 2 | Copyright 2016 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /data_pipeline/schemas/initialization_vector_v1.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "fixed", 3 | "size": 16, 4 | "namespace": "yelp.data_pipeline", 5 | "name": "initialization_vector", 6 | "doc": "Serializes an initalization vector for encrypting PII." 
7 | } 8 | -------------------------------------------------------------------------------- /travis.yml: -------------------------------------------------------------------------------- 1 | branches: 2 | only: 3 | - master 4 | - "/^v[0-9.]+$/" 5 | language: python 6 | python: 7 | - '2.7' 8 | install: pip install coveralls tox 9 | script: make -f Makefile-opensource test 10 | after_success: 11 | - coveralls 12 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include data_pipeline/schemas/envelope_v1.avsc 2 | include data_pipeline/schemas/monitoring_message_v1.avsc 3 | include data_pipeline/testing_helpers/docker-compose.yml 4 | include CONTRIBUTING.rst 5 | include HISTORY.rst 6 | include README.md 7 | include USAGE.rst 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | branches: 2 | only: 3 | - master 4 | - /^v[0-9.]+$/ 5 | language: python 6 | services: 7 | - docker 8 | python: 9 | - '2.7' 10 | install: pip install coveralls tox 11 | script: 12 | - make -f Makefile-opensource test 13 | after_success: 14 | - coveralls 15 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | # docs: python -m SimpleHTTPServer 8001 2 | docs: twistd -no web -p 8001 --path=. 3 | kafka: tox -e devenv-command "$(make compose-prefix) kill" && tox -e devenv-command "$(make compose-prefix) rm --force" && tox -e devenv-command "$(make compose-prefix) up kafka schematizer" 4 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. 
:changelog: 2 | 3 | History 4 | ------- 5 | 6 | 0.1.4 (2015-08-12) 7 | ++++++++++++++++++ 8 | 9 | * Defined consumer/producer registration API 10 | 11 | 0.1.3 (2015-08-10) 12 | ++++++++++++++++++ 13 | 14 | * Added keys kwargs to data pipeline messages 15 | 16 | 0.1.0 (2015-03-01) 17 | ++++++++++++++++++ 18 | 19 | * First release. 20 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: data-pipeline-tools 2 | Section: python 3 | Priority: optional 4 | Maintainer: Justin Cunningham 5 | Build-Depends: debhelper (>= 7), python (>= 2.7), dh-virtualenv, libffi-dev, libssl0.9.8 6 | Standards-Version: 3.8.3 7 | 8 | Package: data-pipeline-tools 9 | Architecture: any 10 | Depends: ${shlibs:Depends}, ${misc:Depends}, libssl0.9.8 11 | Description: Data pipeline clientlib tools 12 | -------------------------------------------------------------------------------- /requirements.d/dev.txt: -------------------------------------------------------------------------------- 1 | # Installs the package with the testing_helpers extras 2 | -e .[testing_helpers] 3 | # TODO(DATAPIPE-167): Add the packages that don't exist yet to the internal Pypi 4 | # and remove this extra-index-url 5 | --extra-index-url https://pypi.python.org/simple/ 6 | flake8 7 | honcho 8 | ipdb 9 | ipython 10 | mock 11 | pre-commit>=0.4.2 12 | pytest<2.8.2 13 | pytest-cov==1.8.1 14 | pytest-raisesregexp 15 | Sphinx==1.3.1 16 | SQLAlchemy==1.0.8 17 | terminaltables>=1.0.2 18 | wheel>=0.22 19 | factory_boy==2.5.2 20 | -------------------------------------------------------------------------------- /debian/data-pipeline-tools.links: -------------------------------------------------------------------------------- 1 | opt/venvs/data-pipeline-tools/bin/data_pipeline_tailer usr/bin/data_pipeline_tailer 2 | opt/venvs/data-pipeline-tools/bin/data_pipeline_refresh_runner 
usr/bin/data_pipeline_refresh_runner 3 | opt/venvs/data-pipeline-tools/bin/data_pipeline_refresh_manager usr/bin/data_pipeline_refresh_manager 4 | opt/venvs/data-pipeline-tools/bin/data_pipeline_refresh_requester usr/bin/data_pipeline_refresh_requester 5 | opt/venvs/data-pipeline-tools/bin/data_pipeline_compaction_setter usr/bin/data_pipeline_compaction_setter 6 | opt/venvs/data-pipeline-tools/bin/data_pipeline_introspector usr/bin/data_pipeline_introspector 7 | -------------------------------------------------------------------------------- /requirements.d/dev-internal.txt: -------------------------------------------------------------------------------- 1 | # Installs the package with the testing_helpers, tools and internal extras 2 | -e .[testing_helpers, tools, internal] 3 | # TODO(DATAPIPE-167): Add the packages that don't exist yet to the internal Pypi 4 | # and remove this extra-index-url 5 | --extra-index-url https://pypi.python.org/simple/ 6 | flake8 7 | honcho 8 | ipdb 9 | ipython 10 | mock 11 | pre-commit>=0.4.2 12 | # Later versions should be ok once INFRA-3779 is fixed 13 | pytest<2.8.2 14 | pytest-cov==1.8.1 15 | pytest-raisesregexp 16 | Sphinx==1.3.1 17 | SQLAlchemy==1.0.8 18 | terminaltables>=1.0.2 19 | wheel>=0.22 20 | factory_boy==2.5.2 21 | pytest-benchmark==3.0.0 22 | -------------------------------------------------------------------------------- /USAGE.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Usage 3 | ======== 4 | 5 | To use Data Pipeline Clientlib in a project:: 6 | 7 | >>> import data_pipeline 8 | 9 | To use a Consumer:: 10 | 11 | >>> from data_pipeline.consumer import Consumer 12 | >>> from data_pipeline.expected_frequency import ExpectedFrequency 13 | >>> Consumer( 14 | ... 'test', 15 | ... 'bam', 16 | ... ExpectedFrequency.constantly, 17 | ... {'topic_name': None} 18 | ... 
) # doctest: +ELLIPSIS 19 | 20 | 21 | And another thing:: 22 | 23 | >>> from data_pipeline.envelope import Envelope 24 | >>> envelope = Envelope() 25 | -------------------------------------------------------------------------------- /data_pipeline/testing_helpers/docker-compose.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: wurstmeister/zookeeper 3 | kafka: 4 | image: jcnnghm/kafka:latest 5 | links: 6 | - zookeeper:zk 7 | environment: 8 | KAFKA_BROKER_ID: 1 9 | KAFKA_ADVERTISED_PORT: 9092 10 | schematizerdatabase: 11 | image: docker-dev.yelpcorp.com/schematizer_database:latest 12 | schematizerconfigs: 13 | image: docker-dev.yelpcorp.com/schematizer_configs:latest 14 | schematizer: 15 | image: docker-dev.yelpcorp.com/schematizer_service:latest 16 | links: 17 | - schematizerdatabase 18 | volumes_from: 19 | - schematizerconfigs 20 | command: "/code/virtualenv_run/bin/python /code/serviceinitd/internal_schematizer start-dev" 21 | -------------------------------------------------------------------------------- /yelp_package/dockerfiles/lucid/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker-dev.yelpcorp.com/lucid_yelp 2 | MAINTAINER Justin Cunningham 3 | # Heavily based on kwa's work for paasta-tools 4 | 5 | # Make sure we get a package suitable for building this package correctly. 
6 | # Per dnephin we need https://github.com/spotify/dh-virtualenv/pull/20 7 | # Which at this time is in this package 8 | RUN apt-get update && apt-get -y install dpkg-dev python-tox python-setuptools \ 9 | python-dev debhelper dh-virtualenv python-yaml python-pytest \ 10 | pyflakes python2.7 python2.7-dev help2man libffi-dev uuid-dev libuuid1 git \ 11 | libmysqlclient-dev 12 | 13 | ENV HOME /work 14 | ENV PWD /work 15 | WORKDIR /work 16 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | . 5 | omit = 6 | .tox/* 7 | /usr/* 8 | */tmp* 9 | setup.py 10 | # Don't complain if non-runnable code isn't run 11 | */__main__.py 12 | 13 | [report] 14 | exclude_lines = 15 | # Have to re-enable the standard pragma 16 | \#\s*pragma: no cover 17 | 18 | # Don't complain if tests don't hit defensive assertion code: 19 | ^\s*raise AssertionError\b 20 | ^\s*raise NotImplementedError\b 21 | ^\s*return NotImplemented\b 22 | ^\s*raise$ 23 | 24 | # Don't complain if non-runnable code isn't run: 25 | ^if __name__ == ['"]__main__['"]:$ 26 | 27 | [html] 28 | directory = coverage-html 29 | 30 | # vim:ft=dosini 31 | -------------------------------------------------------------------------------- /yelp_package/dockerfiles/trusty/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker-dev.yelpcorp.com/trusty_yelp 2 | MAINTAINER Justin Cunningham 3 | # Heavily based on kwa's work for paasta-tools 4 | 5 | # Make sure we get a package suitable for building this package correctly. 
6 | # Per dnephin we need https://github.com/spotify/dh-virtualenv/pull/20 7 | # Which at this time is in this package 8 | RUN apt-get update && apt-get -y install dpkg-dev python-tox python-setuptools \ 9 | python-dev debhelper dh-virtualenv python-yaml python-pytest \ 10 | pyflakes python2.7 python2.7-dev help2man libffi-dev uuid-dev libuuid1 \ 11 | libssl0.9.8 git libmysqlclient-dev libssl-dev 12 | 13 | ENV HOME /work 14 | ENV PWD /work 15 | WORKDIR /work 16 | -------------------------------------------------------------------------------- /run_guard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | 17 | ./setup_bundles.sh 18 | export RBENV_ROOT=$HOME/.rbenv 19 | eval "$(rbenv init -)" 20 | bundle exec guard 21 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. complexity documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Contents: 7 | ========= 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | 12 | installation 13 | usage 14 | contributing 15 | history 16 | code/data_pipeline 17 | 18 | Feedback 19 | ======== 20 | 21 | If you have any suggestions or questions about **Data Pipeline Clientlib** feel free to email me 22 | at justinc@yelp.com. 23 | 24 | If you encounter any errors or problems with **Data Pipeline Clientlib**, please let me know! 25 | Create an issue on jira under the **DATAPIPE** project at 26 | https://jira.yelpcorp.com/browse/DATAPIPE. 27 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | 4 | # Uncomment this to turn on verbose mode. 5 | export DH_VERBOSE=1 6 | 7 | # This has to be exported to make some magic below work. 8 | export DH_OPTIONS 9 | export DH_VIRTUALENV_INSTALL_ROOT=/opt/venvs 10 | 11 | %: 12 | dh $@ --with python-virtualenv 13 | 14 | # do not call `make clean` as part of packaging 15 | override_dh_auto_clean: 16 | true 17 | 18 | # Don't let debian build stuff, but we do hook in here to make man pages 19 | override_dh_auto_build: 20 | true 21 | 22 | # do not call `make test` as part of packaging 23 | override_dh_auto_test: 24 | true 25 | 26 | override_dh_virtualenv: 27 | dh_virtualenv -v --pypi-url='https://pypi.yelpcorp.com/simple' --extra-index-url='https://pypi.python.org/simple/' --python=/usr/bin/python2.7 --extra-pip-arg '--no-use-wheel' 28 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/models/model_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | 20 | class BaseModel(object): 21 | pass 22 | -------------------------------------------------------------------------------- /data_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | __author__ = 'Justin Cunningham' 20 | __email__ = 'bam@yelp.com' 21 | __version__ = '0.9.13' 22 | -------------------------------------------------------------------------------- /data_pipeline/environment_configs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import os 20 | 21 | IS_OPEN_SOURCE_MODE = os.getenv('OPEN_SOURCE_MODE', 'false').lower() in ['t', 'true', 'y', 'yes'] 22 | -------------------------------------------------------------------------------- /data_pipeline/testing_helpers/docker-compose-opensource.yml: -------------------------------------------------------------------------------- 1 | zookeeper: 2 | image: wurstmeister/zookeeper 3 | kafka: 4 | image: jcnnghm/kafka:latest 5 | links: 6 | - zookeeper:zk 7 | environment: 8 | KAFKA_BROKER_ID: 1 9 | KAFKA_ADVERTISED_PORT: 9092 10 | 11 | # TODO (DATAPIPE-1858|abrar): change all the yelpcorp url's to docker hub once we 12 | # push our docker images to docker hub. 
13 | schematizerdatabase: 14 | image: docker.io/yelp/schematizer_database:latest 15 | schematizerconfigs: 16 | image: docker.io/yelp/schematizer_configs:latest 17 | schematizer: 18 | image: docker.io/yelp/schematizer_service:latest 19 | links: 20 | - schematizerdatabase 21 | volumes_from: 22 | - schematizerconfigs 23 | command: "/code/virtualenv_run/bin/python -m serviceinitd.schematizer" 24 | environment: 25 | - SERVICE_CONFIG_PATH=config-open-source.yaml 26 | - FORCE_AVOID_INTERNAL_PACKAGES=True 27 | -------------------------------------------------------------------------------- /tests/helpers/mock_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import mock 20 | 21 | 22 | def attach_spy_on_func(target, attribute): 23 | orig_func = getattr(target, attribute) 24 | return mock.patch.object(target, attribute, side_effect=orig_func) 25 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/models/data_source_type_enum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from enum import Enum 20 | 21 | 22 | class DataSourceTypeEnum(Enum): 23 | """Eligible data source types.""" 24 | 25 | Namespace = 1 26 | Source = 2 27 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/models/target_schema_type_enum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from enum import Enum 20 | 21 | 22 | class TargetSchemaTypeEnum(Enum): 23 | """Eligible target schema types.""" 24 | 25 | unsupported = 0 26 | redshift = 1 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | - repo: https://github.com/pre-commit/pre-commit-hooks 2 | sha: v0.6.0 3 | hooks: 4 | - id: trailing-whitespace 5 | - id: end-of-file-fixer 6 | - id: autopep8-wrapper 7 | - id: check-yaml 8 | - id: debug-statements 9 | language_version: python2.7 10 | - id: flake8 11 | language_version: python2.7 12 | - id: check-yaml 13 | - id: check-json 14 | - id: check-merge-conflict 15 | - id: name-tests-test 16 | exclude: (tests/helpers/(.+).py)|(tests/factories/(.+).py) 17 | - id: fix-encoding-pragma 18 | - id: check-added-large-files 19 | - id: check-byte-order-marker 20 | - repo: https://github.com/asottile/reorder_python_imports 21 | sha: v0.3.0 22 | hooks: 23 | - id: reorder-python-imports 24 | language_version: python2.7 25 | args: 26 | - --add-import 27 | - from __future__ import absolute_import 28 | - --add-import 29 | - from __future__ import unicode_literals 30 | -------------------------------------------------------------------------------- /setup_bundles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | 17 | export RBENV_ROOT=$HOME/.rbenv 18 | if [ ! -d "$HOME/.rbenv/plugins/ruby-build" ]; then 19 | git clone https://github.com/sstephenson/ruby-build.git $HOME/.rbenv/plugins/ruby-build 20 | fi 21 | rbenv install 2.2.2 -s 22 | rbenv local 2.2.2 23 | rbenv rehash 24 | eval "$(rbenv init -)" 25 | gem install bundler 26 | rbenv rehash 27 | eval "$(rbenv init -)" 28 | bundle install --path=.bundle 29 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | # flake8: noqa 17 | from __future__ import absolute_import 18 | from __future__ import unicode_literals 19 | 20 | from data_pipeline.tools.introspector.models.namespace import * 21 | from data_pipeline.tools.introspector.models.schema import * 22 | from data_pipeline.tools.introspector.models.source import * 23 | from data_pipeline.tools.introspector.models.topic import * 24 | -------------------------------------------------------------------------------- /tests/team_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import pytest 20 | 21 | from data_pipeline.team import Team 22 | 23 | 24 | @pytest.mark.usefixtures('configure_teams') 25 | class TestTeam(object): 26 | def test_team_exists(self): 27 | assert Team.exists('bam') 28 | 29 | def test_team_does_not_exist(self): 30 | assert not Team.exists('fake_team') 31 | -------------------------------------------------------------------------------- /data_pipeline/helpers/lists.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | """ 17 | Utility methods for manipulating lists. 18 | """ 19 | from __future__ import absolute_import 20 | from __future__ import unicode_literals 21 | 22 | 23 | def unlist(a_list): 24 | """Convert the (possibly) single item list into a single item""" 25 | if len(a_list) > 1: 26 | raise ValueError(len(a_list)) 27 | 28 | if len(a_list) == 0: 29 | return None 30 | else: 31 | return a_list[0] 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *~ 3 | .#* 4 | ._* 5 | \#*\# 6 | .cache 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Development Environment 12 | virtualenv_run 13 | venv 14 | 15 | # Packages 16 | *.egg 17 | *.egg-info 18 | dist 19 | build 20 | eggs 21 | parts 22 | var 23 | sdist 24 | develop-eggs 25 | .installed.cfg 26 | lib 27 | lib64 28 | *.iml 29 | *.log 30 | /.venv.touch 31 | /.venv.docs.touch 32 | /virtualenv_run 33 | 34 | # Installer logs 35 | pip-log.txt 36 | 37 | # Unit test / coverage reports 38 | .coverage* 39 | /coverage-html 40 | .tox 41 | nosetests.xml 42 | 43 | # Translations 44 | *.mo 45 | 46 | # Mr Developer 47 | .mr.developer.cfg 48 | .project 49 | .pydevproject 50 | 51 | # Complexity 52 | output/*.html 53 | output/*/index.html 54 | 55 | # Sphinx 56 | docs/build 57 | docs/code 58 | 59 | # Vim 60 | 
*.sw[nop] 61 | .ropeproject 62 | 63 | # idea 64 | .idea/ 65 | 66 | # Cache 67 | __pycache__ 68 | 69 | # tmp 70 | tmp 71 | out.txt 72 | scratch 73 | 74 | # Sublime 75 | *.sublime-* 76 | 77 | # Artifacts 78 | twistd.pid 79 | logs/*.log 80 | .bundle 81 | 82 | # OS X 83 | .DS_Store 84 | 85 | #vim tags 86 | tags 87 | -------------------------------------------------------------------------------- /data_pipeline/helpers/log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import logging 20 | 21 | from data_pipeline.config import get_config 22 | 23 | 24 | def debug_log(line_lambda, exc_info=None): 25 | """This avoids unnecessary formatting of debug log string. 26 | More info in DATAPIPE-979 27 | """ 28 | if get_config().logger.isEnabledFor(logging.DEBUG): 29 | get_config().logger.debug(line_lambda(), exc_info=exc_info) 30 | -------------------------------------------------------------------------------- /data_pipeline/helpers/singleton.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | 20 | # This metaclass is from http://stackoverflow.com/questions/6760685/creating-a-singleton-in-python 21 | class Singleton(type): 22 | _instances = {} 23 | 24 | def __call__(cls, *args, **kwargs): 25 | if cls not in cls._instances: 26 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 27 | return cls._instances[cls] 28 | -------------------------------------------------------------------------------- /data_pipeline/helpers/frozendict_json_encoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import json 20 | 21 | from frozendict import frozendict 22 | 23 | 24 | class FrozenDictEncoder(json.JSONEncoder): 25 | """Custom json encoder for encoding frozendict objects 26 | """ 27 | 28 | def default(self, obj): 29 | if isinstance(obj, frozendict): 30 | return dict(obj) 31 | return json.JSONEncoder.default(self, obj) 32 | -------------------------------------------------------------------------------- /data_pipeline/tools/_glob_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import glob 20 | 21 | 22 | def get_file_paths_from_glob_patterns(glob_patterns): 23 | """ Return a set of files matching the given list of glob patterns 24 | (for example ["./test.sql", "./other_tables/*.sql"]) 25 | """ 26 | file_paths = set() 27 | for glob_pattern in glob_patterns: 28 | file_paths |= set(glob.glob(glob_pattern)) 29 | return file_paths 30 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | celluloid (0.16.0) 5 | timers (~> 4.0.0) 6 | coderay (1.1.0) 7 | ffi (1.9.8) 8 | foreman (0.78.0) 9 | thor (~> 0.19.1) 10 | formatador (0.2.5) 11 | guard (2.12.5) 12 | formatador (>= 0.2.4) 13 | listen (~> 2.7) 14 | lumberjack (~> 1.0) 15 | nenv (~> 0.1) 16 | notiffany (~> 0.0) 17 | pry (>= 0.9.12) 18 | shellany (~> 0.0) 19 | thor (>= 0.18.1) 20 | hitimes (1.2.2) 21 | listen (2.10.0) 22 | celluloid (~> 0.16.0) 23 | rb-fsevent (>= 0.9.3) 24 | rb-inotify (>= 0.9) 25 | lumberjack (1.0.9) 26 | method_source (0.8.2) 27 | nenv (0.2.0) 28 | notiffany (0.0.6) 29 | nenv (~> 0.1) 30 | shellany (~> 0.0) 31 | pry (0.10.1) 32 | coderay (~> 1.1.0) 33 | method_source (~> 0.8.1) 34 | slop (~> 3.4) 35 | rb-fsevent (0.9.4) 36 | rb-inotify (0.9.5) 37 | ffi (>= 0.5.0) 38 | shellany (0.0.1) 39 | slop (3.6.0) 40 | thor (0.19.1) 41 | timers (4.0.1) 42 | hitimes 43 | 44 | PLATFORMS 45 | ruby 46 | 47 | DEPENDENCIES 48 | foreman 49 | guard 50 | -------------------------------------------------------------------------------- /data_pipeline/publish_guarantee.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from enum import Enum 20 | 21 | 22 | class PublishGuaranteeEnum(Enum): 23 | """Enum that specifies what kind of message publishing guarantee provided 24 | by the producer. 25 | 26 | Attributes: 27 | exact_once: message will be successfully published exactly once. 28 | at_least_once: message will be successfully published at least once. 29 | """ 30 | 31 | exact_once = 0 32 | at_least_once = 1 33 | -------------------------------------------------------------------------------- /data_pipeline/_clog_writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import clog 20 | 21 | from data_pipeline.config import get_config 22 | from data_pipeline.envelope import Envelope 23 | 24 | logger = get_config().logger 25 | 26 | 27 | class ClogWriter(object): 28 | 29 | def __init__(self): 30 | self.envelope = Envelope() 31 | 32 | def publish(self, message): 33 | try: 34 | clog.log_line(message.topic, self.envelope.pack(message, ascii_encoded=True)) 35 | except: 36 | logger.error("Failed to scribe message - {}".format(str(message))) 37 | -------------------------------------------------------------------------------- /tests/tools/_glob_util_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import mock 20 | 21 | from data_pipeline.tools._glob_util import get_file_paths_from_glob_patterns 22 | 23 | 24 | def test_get_file_paths_from_glob_patterns(): 25 | with mock.patch('data_pipeline.tools._glob_util.glob') as mock_glob: 26 | mock_glob.glob = mock.Mock(return_value=['test']) 27 | paths = get_file_paths_from_glob_patterns(['*.sql', 'some/dir/*.avsc']) 28 | assert paths == {'test'} 29 | assert mock_glob.glob.mock_calls == [ 30 | mock.call('*.sql'), 31 | mock.call('some/dir/*.avsc'), 32 | ] 33 | -------------------------------------------------------------------------------- /yelp_package/Makefile: -------------------------------------------------------------------------------- 1 | UID:=`id -u` 2 | GID:=`id -g` 3 | PACKAGE:=data_pipeline 4 | DOCKER_RUN_LUCID:=docker run -t -v $(CURDIR)/../:/work:rw $(PACKAGE)_lucid_container 5 | DOCKER_RUN_TRUSTY:=docker run -t -v $(CURDIR)/../:/work:rw $(PACKAGE)_trusty_container 6 | DOCKER_QUICK_START:=docker run -t -i -v $(CURDIR)/../:/work:rw $(PACKAGE)_lucid_container 7 | 8 | build_lucid_docker: 9 | [ -d ../dist ] || mkdir ../dist 10 | cd dockerfiles/lucid/ && docker build -t "$(PACKAGE)_lucid_container" . 11 | package_lucid: build_lucid_docker 12 | $(DOCKER_RUN_LUCID) /bin/bash -c "dpkg-buildpackage -d && mv ../*.deb dist/" 13 | $(DOCKER_RUN_LUCID) chown -R $(UID):$(GID) /work 14 | itest_lucid: package_lucid 15 | $(DOCKER_RUN_LUCID) /work/yelp_package/itest/ubuntu.sh 16 | 17 | 18 | build_trusty_docker: 19 | [ -d ../dist ] || mkdir ../dist 20 | cd dockerfiles/trusty/ && docker build -t "$(PACKAGE)_trusty_container" . 
21 | package_trusty: build_trusty_docker 22 | $(DOCKER_RUN_TRUSTY) /bin/bash -c "dpkg-buildpackage -d && mv ../*.deb dist/" 23 | $(DOCKER_RUN_TRUSTY) chown -R $(UID):$(GID) /work 24 | itest_trusty: package_trusty 25 | $(DOCKER_RUN_TRUSTY) /work/yelp_package/itest/ubuntu.sh 26 | 27 | quick_start: 28 | $(DOCKER_QUICK_START) /bin/bash 29 | 30 | clean: 31 | rm -rf dist/ 32 | find . -name '*.pyc' -delete 33 | find . -name '__pycache__' -delete 34 | -------------------------------------------------------------------------------- /yelp_package/itest/ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | set -e 17 | 18 | # To add additional scripts, just add lines here 19 | SCRIPTS="data_pipeline_tailer 20 | data_pipeline_refresh_runner 21 | data_pipeline_refresh_manager 22 | data_pipeline_refresh_requester 23 | data_pipeline_compaction_setter 24 | data_pipeline_introspector" 25 | 26 | if dpkg -i /work/dist/*.deb; then 27 | echo "Looks like it installed correctly" 28 | else 29 | echo "Dpkg install failed" 30 | exit 1 31 | fi 32 | 33 | for scr in $SCRIPTS 34 | do 35 | which $scr >/dev/null || (echo "$scr failed to install!"; exit 1) 36 | echo "Running '$scr -h' to make sure it works" 37 | $scr -h >/dev/null || (echo "$scr failed to execute!"; exit 1) 38 | done 39 | 40 | echo "Everything worked!" 
41 | -------------------------------------------------------------------------------- /tests/benchmarks/_fast_uuid_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import pytest 20 | 21 | from data_pipeline._fast_uuid import FastUUID 22 | 23 | 24 | @pytest.mark.usefixtures( 25 | "config_benchmark_containers_connections" 26 | ) 27 | @pytest.mark.benchmark 28 | class TestBenchFastUUID(object): 29 | 30 | @pytest.fixture 31 | def fuuid(self): 32 | return FastUUID() 33 | 34 | def test_uuid1(self, benchmark, fuuid): 35 | 36 | @benchmark 37 | def create(): 38 | fuuid.uuid1() 39 | 40 | def test_uuid4(self, benchmark, fuuid): 41 | 42 | @benchmark 43 | def create(): 44 | fuuid.uuid4() 45 | -------------------------------------------------------------------------------- /data_pipeline/_consumer_tick.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import time 20 | 21 | 22 | class _ConsumerTick(object): 23 | """ This class manages state related to ticks and triggers ticks on the 24 | attached Consumer every refresh_time_seconds. It can be used by 25 | refresh_new_topics and other methods requiring tick functionality. 26 | """ 27 | 28 | def __init__(self, refresh_time_seconds): 29 | self.refresh_time_seconds = refresh_time_seconds 30 | self._reset() 31 | 32 | def _reset(self): 33 | self.next_tick_time = time.time() + self.refresh_time_seconds 34 | 35 | def should_tick(self): 36 | return time.time() >= self.next_tick_time 37 | -------------------------------------------------------------------------------- /data_pipeline/schemas/monitoring_message_v1.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "namespace": "yelp.data_pipeline", 4 | "name": "monitoring_message", 5 | "doc": "Monitoring message that counts the number of messages that are produced/consumed by specific client in a given time frame", 6 | "fields": [ 7 | { 8 | "name": "topic", 9 | "type": "string", 10 | "doc": "topic associated with the messages produced/consumed by client" 11 | }, 12 | { 13 | "name": "client_name", 14 | "type": "string", 15 | "doc": "name of the client that is publishing/consuming messages" 16 | }, 17 | { 18 | "name": "client_type", 19 | "type": { 20 | "name": "client_type", 21 | "doc": "ENUM of Client Types", 22 | 
"type": "enum", 23 | "symbols": [ 24 | "producer", 25 | "consumer" 26 | ] 27 | }, 28 | "doc": "identifies the type of client: a 'consumer' or a 'producer'" 29 | }, 30 | { 31 | "name": "message_count", 32 | "type": "int", 33 | "doc": "number of messages published/consumed by the client" 34 | }, 35 | { 36 | "name": "start_timestamp", 37 | "type": "int", 38 | "doc": "Time the monitoring system started counting messages" 39 | }, 40 | { 41 | "name": "host_info", 42 | "type": "string", 43 | "doc": "Host information of client" 44 | } 45 | ] 46 | } 47 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/models/schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from data_pipeline.tools.introspector.models.base import BaseIntrospectorModel 20 | from data_pipeline.tools.introspector.models.topic import IntrospectorTopic 21 | 22 | 23 | class IntrospectorSchema(BaseIntrospectorModel): 24 | def __init__(self, schema_obj, include_topic_info=False): 25 | super(IntrospectorSchema, self).__init__( 26 | schema_obj 27 | ) 28 | self._fields = [ 29 | 'schema_id', 'base_schema_id', 'status', 30 | 'primary_keys', 'created_at', 'note', 'schema_json' 31 | ] 32 | if include_topic_info: 33 | self._fields.append('topic') 34 | self.topic = IntrospectorTopic(self.topic).to_ordered_dict() 35 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/register_command_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from data_pipeline.tools.introspector.register.avro_command import RegisterAvroCommand 20 | from data_pipeline.tools.introspector.register.mysql_command import RegisterMysqlCommand 21 | 22 | 23 | class RegisterCommandParser(object): 24 | 25 | @classmethod 26 | def add_parser(cls, subparsers): 27 | register_command_parser = subparsers.add_parser( 28 | "register", 29 | description="Register a given schema to the schematizer." 30 | ) 31 | 32 | register_command_subparsers = register_command_parser.add_subparsers() 33 | RegisterAvroCommand.add_parser(register_command_subparsers) 34 | RegisterMysqlCommand.add_parser(register_command_subparsers) 35 | -------------------------------------------------------------------------------- /data_pipeline/initialization_vector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import os 20 | 21 | from Crypto.Cipher import AES 22 | 23 | from data_pipeline.meta_attribute import MetaAttribute 24 | 25 | 26 | def get_initialization_vector(schema_id, initialization_vector_array=None): 27 | if initialization_vector_array is None: 28 | initialization_vector_array = os.urandom(AES.block_size) 29 | _verify_initialization_vector_params(initialization_vector_array) 30 | return MetaAttribute( 31 | schema_id=schema_id, 32 | payload_data=initialization_vector_array 33 | ) 34 | 35 | 36 | def _verify_initialization_vector_params(vector_array): 37 | if not isinstance(vector_array, bytes) or not len(vector_array) == 16: 38 | raise TypeError('Initialization Vector must be a 16-byte array') 39 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04.1 2 | MAINTAINER justinc@yelp.com 3 | 4 | run apt-get update && apt-get upgrade -y 5 | run apt-get install -y wget language-pack-en-base 6 | 7 | run locale-gen en_US en_US.UTF-8 && dpkg-reconfigure locales 8 | 9 | run mkdir /src 10 | 11 | workdir /src 12 | 13 | run wget https://bitbucket.org/pypy/pypy/downloads/pypy-5.1.1-linux64.tar.bz2 14 | run bunzip2 pypy-5.1.1-linux64.tar.bz2 15 | run tar xvf pypy-5.1.1-linux64.tar 16 | 17 | ENV PATH $PATH:/src/pypy-5.1.1-linux64/bin/ 18 | 19 | run wget https://bootstrap.pypa.io/get-pip.py 20 | run pypy get-pip.py 21 | 22 | run apt-get update && apt-get install -y build-essential git vim libpq5 libpq-dev docker \ 23 | libmysqlclient-dev libsnappy-dev 24 | 25 | 26 | run ln -s /usr/bin/gcc /usr/local/bin/cc 27 | 28 | run pip install virtualenv tox 29 | 30 | # Setup clientlib 31 | WORKDIR /data_pipeline 32 | add requirements.d/dev.txt /data_pipeline/requirements.d/dev.txt 33 | add requirements.d/tools.txt 
/data_pipeline/requirements.d/tools.txt 34 | add requirements.txt /data_pipeline/requirements.txt 35 | add setup.py /data_pipeline/setup.py 36 | add data_pipeline/__init__.py /data_pipeline/data_pipeline/__init__.py 37 | add README.rst /data_pipeline/README.rst 38 | add HISTORY.rst /data_pipeline/HISTORY.rst 39 | add bin/ /data_pipeline/bin 40 | 41 | # Install dependencies 42 | run mkdir /dp_reqs 43 | run virtualenv /dp_reqs/venv 44 | run /dp_reqs/venv/bin/pip install -i https://pypi.yelpcorp.com/simple/ -r /data_pipeline/requirements.d/dev.txt 45 | 46 | ADD . /data_pipeline 47 | 48 | VOLUME ["/data_pipeline"] 49 | -------------------------------------------------------------------------------- /tests/tools/schema_ref_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
class TestSchemaRef(object):
    """Checks the source lookups exposed by the ``schema_ref`` fixture."""

    def test_source_to_ref_map_is_complete(
        self,
        schema_ref,
        good_source_ref,
        bad_source_ref,
        source
    ):
        # The ``source`` fixture must map to ``good_source_ref`` and the
        # literal 'bad_source' to ``bad_source_ref``.
        ref_for_source = schema_ref.get_source_ref(source)
        assert ref_for_source == good_source_ref
        ref_for_bad_source = schema_ref.get_source_ref('bad_source')
        assert ref_for_bad_source == bad_source_ref
        ref_map = schema_ref._source_to_ref_map
        assert len(ref_map) == 2

    def test_source_to_ref_map_can_be_empty(self, schema_ref):
        # Replacing the underlying schema_ref with an empty dict must leave
        # the derived map empty as well.
        schema_ref.schema_ref = {}
        ref_map = schema_ref._source_to_ref_map
        assert len(ref_map) == 0

    def test_defaults_are_respected(self, schema_ref, schema_ref_defaults):
        # Every default must be what 'bad_source' reports for that key.
        for default_key, default_value in schema_ref_defaults.items():
            looked_up = schema_ref.get_source_val('bad_source', default_key)
            assert looked_up == default_value
class InfoCommandParser(object):
    """Attaches the ``info`` sub-command to the introspector CLI."""

    @classmethod
    def add_parser(cls, subparsers):
        """Add the ``info`` parser and its topic/source/namespace commands.

        Args:
            subparsers: object exposing ``add_parser``; the new ``info``
                parser is created on it.
        """
        info_parser = subparsers.add_parser(
            "info",
            description="Get information on a specific data pipeline item."
        )
        item_subparsers = info_parser.add_subparsers()
        # Registration order is kept: topic, source, namespace.
        item_commands = (
            TopicInfoCommand,
            SourceInfoCommand,
            NamespaceInfoCommand,
        )
        for item_command in item_commands:
            item_command.add_parser(item_subparsers)
16 | }, 17 | { 18 | "name": "client_type", 19 | "type": { 20 | "name": "client_type", 21 | "doc": "ENUM of Client Types", 22 | "type": "enum", 23 | "symbols": [ 24 | "producer", 25 | "consumer" 26 | ] 27 | }, 28 | "doc": "identifies the type of client: a 'consumer' or a 'producer'" 29 | }, 30 | { 31 | "name": "timestamp", 32 | "type": ["null", "long"], 33 | "logicalType": "timestamp-millis", 34 | "doc": "The most recent time that the Client read/wrote a message with the schema_id" 35 | }, 36 | { 37 | "name": "expected_frequency_seconds", 38 | "type": "int", 39 | "doc": "How often(seconds) on average the client publishes/receives a messsage to/from the data pipeline" 40 | }, 41 | { 42 | "name": "schema_id", 43 | "type": "int", 44 | "doc": "Schema Id for which this message contains information about last time used" 45 | } 46 | ] 47 | } -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/models/source.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
class IntrospectorSource(BaseIntrospectorModel):
    """Introspector model for a source.

    Args:
        source_obj: object handed to ``BaseIntrospectorModel``; its
            ``namespace.name`` and ``source_id`` are also read here.
        active_sources: optional object with a dict-style ``get`` keyed by
            source id. Each hit is indexed with ``'active_topic_count'``.
    """

    def __init__(self, source_obj, active_sources=None):
        super(IntrospectorSource, self).__init__(source_obj)
        self._fields = ['name', 'source_id', 'owner_email', 'namespace']
        # Expose the namespace by name rather than as the nested object.
        self.namespace = source_obj.namespace.name

        # Only None means "no activity data given"; an empty container
        # still adds the field, with a count of 0.
        if active_sources is None:
            return
        self._fields.append('active_topic_count')
        active_source = active_sources.get(source_obj.source_id, None)
        if active_source:
            self.active_topic_count = active_source['active_topic_count']
        else:
            self.active_topic_count = 0
class ListCommandParser(object):
    """Attaches the ``list`` sub-command to the introspector CLI."""

    @classmethod
    def add_parser(cls, subparsers):
        """Add the ``list`` parser and its topics/sources/namespaces commands.

        Args:
            subparsers: object exposing ``add_parser``; the new ``list``
                parser is created on it.
        """
        description = "Get a list of specified items as a JSON array of objects."
        list_parser = subparsers.add_parser("list", description=description)
        listing_subparsers = list_parser.add_subparsers()
        # Registration order is kept: topics, sources, namespaces.
        for listing_command in (
            TopicsListCommand,
            SourcesListCommand,
            NamespacesListCommand,
        ):
            listing_command.add_parser(listing_subparsers)
@pytest.mark.usefixtures(
    "config_benchmark_containers_connections"
)
@pytest.mark.benchmark
class TestBenchEnvelope(object):
    """Benchmarks ``Envelope.pack`` and ``Envelope.unpack``."""

    @pytest.fixture
    def envelope(self):
        return Envelope()

    def test_pack(self, benchmark, envelope):

        def fresh_message_args():
            # Returns the (args, kwargs) pair handed to ``envelope.pack``.
            message = MessageFactory.create_message_with_payload_data()
            return [message], {}

        benchmark.pedantic(
            envelope.pack,
            setup=fresh_message_args,
            rounds=1000
        )

    def test_unpack(self, benchmark, envelope):

        def fresh_packed_args():
            # Packing happens in setup, so only ``unpack`` is the target.
            message = MessageFactory.create_message_with_payload_data()
            return [envelope.pack(message)], {}

        benchmark.pedantic(
            envelope.unpack,
            setup=fresh_packed_args,
            rounds=1000
        )
def memoized(func):
    """Decorator that caches a function's return value each time it is called.
    If called later with the same arguments, the cached value is returned, and
    the function is not re-evaluated.

    The cache key is the pickled form of the positional arguments plus the
    keyword arguments sorted by name, so ``f(a=1, b=2)`` and ``f(b=2, a=1)``
    share a single cache entry. All arguments must therefore be picklable.

    Based upon http://wiki.python.org/moin/PythonDecoratorLibrary#Memoize
    Nota bene: this decorator memoizes /all/ calls to the function.
    For a memoization decorator with limited cache size, consider:
    http://code.activestate.com/recipes/496879-memoize-decorator-function-with-cache-size-limit/
    """
    cache = {}

    @wraps(func)
    def func_wrapper(*args, **kwargs):
        # Pickling the kwargs dict directly would bake its iteration order
        # into the key; sorting the items makes the key order-independent.
        key = cPickle.dumps((args, sorted(kwargs.items())))
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]
    return func_wrapper
@contextmanager
def reconfigure(**kwargs):
    """Apply ``kwargs`` as configuration for the duration of the context.

    On exit the keys named in ``kwargs`` get back the values they had when
    the context was entered (keys that did not exist are dropped), while
    every other key keeps whatever value it has at exit time.
    """
    conf_namespace = staticconf.config.get_namespace(namespace)

    def snapshot(overridden):
        # Current config values, limited to keys that are (overridden=True)
        # or are not (overridden=False) among the reconfigured kwargs.
        return dict(
            (key, value)
            for key, value in conf_namespace.get_config_values().iteritems()
            if (key in kwargs) == overridden
        )

    values_before = snapshot(True)
    configure_from_dict(kwargs)
    try:
        yield
    finally:
        restored_config = snapshot(False)
        restored_config.update(values_before)
        # Clear first so keys introduced only by ``kwargs`` do not survive.
        staticconf.config.get_namespace(namespace).clear()
        configure_from_dict(restored_config)
class IntrospectorNamespace(BaseIntrospectorModel):
    """Introspector model for a namespace.

    Args:
        namespace_obj: object handed to ``BaseIntrospectorModel``.
        active_namespaces: optional object with a dict-style ``get`` keyed
            by namespace name. Each hit is indexed with
            ``'active_source_count'`` and ``'active_topic_count'``.
    """

    def __init__(self, namespace_obj, active_namespaces=None):
        super(IntrospectorNamespace, self).__init__(namespace_obj)
        self._fields = ['name', 'namespace_id']

        # Only None means "no activity data given"; an empty container
        # still adds both count fields, each set to 0.
        if active_namespaces is None:
            return
        self._fields.extend(['active_source_count', 'active_topic_count'])
        active_namespace = active_namespaces.get(self.name, None)
        if active_namespace:
            source_count = active_namespace['active_source_count']
            topic_count = active_namespace['active_topic_count']
        else:
            source_count = 0
            topic_count = 0
        self.active_source_count = source_count
        self.active_topic_count = topic_count
"""
Represent the data of a namespace. Namespace is a group which the avro schemas
belong to. It is the highest grouping level of schemas. For example,
`yelp_main` could be a namespace.

Args:
    namespace_id (int): The id of the namespace.
    name (str): The name of the namespace.
"""
Namespace = namedtuple('Namespace', ['namespace_id', 'name'])


class _Namespace(BaseModel):
    """Internal model that turns a response object into a ``Namespace``."""

    def __init__(self, namespace_id, name):
        self.namespace_id = namespace_id
        self.name = name

    @classmethod
    def from_response(cls, response):
        """Build an instance from ``response.namespace_id`` / ``response.name``."""
        init_kwargs = {
            'namespace_id': response.namespace_id,
            'name': response.name,
        }
        return cls(**init_kwargs)

    def to_result(self):
        """Return the public, immutable ``Namespace`` tuple for this model."""
        return Namespace(self.namespace_id, self.name)
class TestStatsCounter(object):
    """Tests how ``StatsCounter`` forwards increments to the meteorite counter."""

    @mock.patch('yelp_meteorite.metrics.Counter.count', autospec=True)
    def test_stats_counter(self, mock_count):
        stats_counter = StatsCounter(
            'test_stat',
            message_count_timer=0,
            stat_type='test_type'
        )
        stats_counter.increment('test_type')
        assert mock_count.call_count == 1

    @mock.patch('yelp_meteorite.metrics.Counter.count', autospec=True)
    def test_batched_counter(self, mock_count):
        # Successive values returned by the patched ``time.time()``.
        clock_readings = [2, 3, 8, 9]
        time_patch = mock.patch('data_pipeline.tools.meteorite_wrappers.time')
        with time_patch as mock_time:
            mock_time.time.side_effect = clock_readings
            stats_counter = StatsCounter(
                'test_stat',
                message_count_timer=4,
                stat_type='test_type'
            )
            for _ in range(2):
                stats_counter.increment('test_type')
            # Two increments are batched into 1 call.
            assert mock_count.call_count == 1
'/[0-9]+\./{$$NF+=1;OFS=".";print}') 3 | 4 | REBUILD_FLAG = 5 | 6 | .PHONY: help all production clean clean-pyc clean-build clean-docs clean-vim lint test docs coverage install-hooks compose-prefix 7 | 8 | help: 9 | @echo "clean-build - remove build artifacts" 10 | @echo "clean-pyc - remove Python file artifacts" 11 | @echo "clean-docs - remove doc creation artifacts" 12 | @echo "clean-vim - remove vim swap file artifacts" 13 | @echo "test - run tests quickly with the default Python" 14 | @echo "coverage - check code coverage" 15 | @echo "docs - generates Sphinx HTML documentation, including API docs" 16 | @echo "compose-prefix - generates a preconfigured docker-compose command" 17 | 18 | all: production install-hooks 19 | 20 | production: 21 | @true 22 | 23 | clean: clean-build clean-pyc clean-docs 24 | 25 | clean-build: 26 | rm -fr build/ 27 | rm -fr dist/ 28 | rm -fr *.egg-info 29 | 30 | clean-pyc: 31 | find . -name '*.pyc' -exec rm -f {} + 32 | find . -name '*.pyo' -exec rm -f {} + 33 | find . -name '*~' -exec rm -f {} + 34 | 35 | clean-docs: 36 | rm -rf docs/build/* 37 | rm -rf docs/code/* 38 | 39 | clean-vim: 40 | find . -name '*.swp' -exec rm -f {} + 41 | find . 
-name '*.swo' -exec rm -f {} + 42 | 43 | test: 44 | OPEN_SOURCE_MODE=true PULL_CONTAINERS=true FORCE_FRESH_CONTAINERS=true tox -c tox-opensource.ini $(REBUILD_FLAG) 45 | 46 | docs: clean-docs 47 | tox -c tox-opensource.ini -e docs $(REBUILD_FLAG) 48 | 49 | coverage: test 50 | 51 | install-hooks: 52 | tox -c tox-opensource.ini -e pre-commit -- install -f --install-hooks 53 | 54 | compose-prefix: 55 | @OPEN_SOURCE_MODE=true python -c "from data_pipeline.testing_helpers.containers import Containers; print Containers.compose_prefix()" 56 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/models/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
class BaseIntrospectorModel(object):
    """Base for introspector models: copies fields off a source object.

    Args:
        model_obj: object with a ``_fields`` iterable of attribute names
            (for example a namedtuple).
        excluded_fields: optional names from ``_fields`` that are not copied.
    """

    def __init__(self, model_obj, excluded_fields=None):
        skipped = excluded_fields or []
        for field_name in model_obj._fields:
            if field_name in skipped:
                continue
            field_value = getattr(model_obj, field_name)
            # datetime objects are not json serializable
            if isinstance(field_value, datetime.datetime):
                field_value = str(field_value)
            setattr(self, field_name, field_value)

    def to_ordered_dict(self):
        """Return the model's values in the order given by ``self._fields``.

        Raises:
            NotImplementedError: if the instance has no ``_fields``
                attribute; derived classes are expected to define it.
        """
        if not hasattr(self, '_fields'):
            raise NotImplementedError(
                "Derived class does not have a defined _fields "
                "attribute to define order of fields for dict"
            )
        return OrderedDict(
            (field_name, getattr(self, field_name))
            for field_name in self._fields
        )
tox-opensource.ini --doctest-modules -m "not pending" {posargs} 29 | 30 | [testenv:docs] 31 | envdir = venv/py27 32 | commands = 33 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev.txt 34 | sphinx-apidoc -f -e -o docs/code data_pipeline 35 | sphinx-build -b html -d docs/build/doctrees docs/ docs/build/html 36 | 37 | [testenv:devenv] 38 | envdir = venv/py27 39 | commands = 40 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev.txt 41 | 42 | [testenv:devenv-command] 43 | envdir = venv/py27 44 | commands = 45 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev.txt 46 | {posargs} 47 | 48 | [flake8] 49 | ignore = 50 | exclude = .git,.tox,docs,virtualenv_run,venv,__pycache__,.ropeproject,debian,dist 51 | filename = *.py,*.wsgi 52 | max-line-length = 131 53 | 54 | [pytest] 55 | norecursedirs = tests/consumer tests/tools tests/benchmarks 56 | addopts = -m"not benchmark" -m"not skip" --ignore=setup.py --doctest-glob=*.rst -vv 57 | 58 | [pep8] 59 | # E265 deals with spacing inside of comments - breaks human formatting 60 | # E309 puts a blank line after class declarations - doesn't work well with docstrings 61 | # E501 reformats lines to fit in --max-line-length poorly 62 | ignore = E265,E309,E501 63 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. 
def parse_args():
    """Build the introspector argument parser and parse the command line.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    description = (
        "data_pipeline_introspector provides ability to view the current "
        "state of the data pipeline from a top-down view of namespaces."
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '--version',
        action='version',
        version="data_pipeline {}".format(__version__)
    )

    subparsers = parser.add_subparsers()
    # Registration order is kept: list, info, register, schema check.
    for command_parser in (
        ListCommandParser,
        InfoCommandParser,
        RegisterCommandParser,
        SchemaCheckCommand,
    ):
        command_parser.add_parser(subparsers)
    return parser.parse_args()


def run():
    """Parse the command line and call the selected command with the args."""
    # NOTE(review): ``command`` is presumably set by the chosen sub-command's
    # parser; that wiring is not visible here -- confirm.
    parsed_args = parse_args()
    parsed_args.command(parsed_args)


if __name__ == "__main__":
    run()
class SchemaFactory(object):
    """Test helper that registers a fixed one-field avro schema."""

    SOURCE_SCHEMA = '''
    {
        "type": "record",
        "namespace": "test_namespace",
        "doc": "test_doc",
        "name": "source_schema",
        "fields": [
            {"type": "int","name": "original", "doc": "test_doc"}
        ]
    }
    '''

    @classmethod
    def get_schema_json(cls):
        """Register ``SOURCE_SCHEMA`` and return the schematizer's result.

        The source name gets a random ``0..100`` suffix, so repeated calls
        may register under different sources.
        """
        return get_schematizer().register_schema(
            schema_str=cls.SOURCE_SCHEMA,
            namespace='test_namespace',
            source="test_source_{}".format(randint(0, 100)),
            source_owner_email='test@yelp.com',
            contains_pii=False
        )

    @classmethod
    def get_payload_data(cls):
        """Return a payload dict matching ``SOURCE_SCHEMA`` with a random int."""
        return {"original": randint(0, 1000000)}


class MessageFactory(object):
    """Test helper that builds messages against ``SchemaFactory``'s schema."""

    @classmethod
    def create_message_with_payload_data(cls):
        """Return a ``CreateMessage`` for a freshly registered schema.

        The first parameter is named ``cls`` because this is a classmethod
        and receives the class, not an instance.
        """
        return CreateMessage(
            schema_id=SchemaFactory.get_schema_json().schema_id,
            payload_data=SchemaFactory.get_payload_data()
        )
class SensuTTLAlerter(object):
    """ This class triggers a sensu alert if the producer dies. If the sensu_event_info is
    not sent within the ttl specified the prior time it was sent then sensu will alert.

    Args:
        sensu_event_info(dict): dictionary with sensu parameters; it is passed
            as keyword arguments to ``pysensu_yelp.send_event``. For details see
            http://pysensu-yelp.readthedocs.io/en/latest/index.html?highlight=send_event
        enable(bool): enable this ttl alert manager
    """

    def __init__(self, sensu_event_info, enable=True):
        self._sensu_event_info = sensu_event_info
        self._enable = enable

    def process(self):
        """Send the configured event; does nothing while disabled."""
        if self.enable:
            pysensu_yelp.send_event(**self._sensu_event_info)

    @property
    def enable(self):
        """bool: whether ``process`` currently sends events."""
        return self._enable

    @enable.setter
    def enable(self, new_enable_value):
        if self._enable and not new_enable_value:
            # send final message without ttl. The copy keeps the stored
            # event info intact in case the alerter is enabled again.
            final_sensu_info = copy.deepcopy(self._sensu_event_info)
            # Tolerate event info that never carried a ttl instead of
            # failing with KeyError while switching the alerter off.
            final_sensu_info.pop('ttl', None)
            pysensu_yelp.send_event(**final_sensu_info)
        self._enable = new_enable_value
@pytest.mark.usefixtures('containers')
class TestInitializationVector(object):
    """Checks for building the initialization vector meta attribute."""

    def test_create_vector_fails_with_bad_arg_values(self):
        iv_schema_id = _AVSCStore().get_schema_id(initialization_vector_info)
        bad_payload = bytes(10)

        # An invalid payload is expected to be rejected with a TypeError.
        with pytest.raises(TypeError):
            get_initialization_vector(iv_schema_id, bad_payload)

    def test_initialization_vector_creation(self):
        iv_schema_id = _AVSCStore().get_schema_id(initialization_vector_info)

        # Exercise both an explicit payload and None.
        for candidate_payload in (b'0000000000000000', None):
            vector = get_initialization_vector(iv_schema_id, candidate_payload)
            assert isinstance(vector, MetaAttribute)
            assert isinstance(vector.avro_repr['payload'], bytes)
# Represent the data of meta attribute mapping store. Meta attribute source
# mapping should represent a mapping of a source and the corresponding meta
# attribute schema id.
MetaAttributeSourceMapping = namedtuple(
    'MetaAttributeSourceMapping',
    ['source_id', 'meta_attribute_schema_id']
)


class _MetaAttributeSourceMapping(BaseModel):
    """Internal helper that holds a (source id, meta attribute schema id)
    pair and converts it into the public ``MetaAttributeSourceMapping``
    tuple returned by schematizer functions.
    """

    def __init__(self, source_id, meta_attribute_schema_id):
        self.source_id = source_id
        self.meta_attribute_schema_id = meta_attribute_schema_id

    @classmethod
    def from_response(cls, source_id, meta_attribute_schema_id):
        """Build an instance from the two ids taken from a response."""
        mapping = cls(
            source_id=source_id,
            meta_attribute_schema_id=meta_attribute_schema_id
        )
        return mapping

    def to_result(self):
        """Return the public namedtuple form of this mapping."""
        return MetaAttributeSourceMapping(
            self.source_id,
            self.meta_attribute_schema_id
        )
# Public result type: a namespace id paired with the id of the meta attribute
# schema mapped to that namespace.
MetaAttributeNamespaceMapping = namedtuple(
    'MetaAttributeNamespaceMapping',
    ['namespace_id', 'meta_attribute_schema_id']
)


class _MetaAttributeNamespaceMapping(BaseModel):
    """Internal helper that holds a (namespace id, meta attribute schema id)
    pair and converts it into the public ``MetaAttributeNamespaceMapping``
    tuple returned by schematizer functions.
    """

    def __init__(self, namespace_id, meta_attribute_schema_id):
        self.namespace_id = namespace_id
        self.meta_attribute_schema_id = meta_attribute_schema_id

    @classmethod
    def from_response(cls, namespace_id, meta_attribute_schema_id):
        """Build an instance from the two ids taken from a response."""
        mapping = cls(
            namespace_id=namespace_id,
            meta_attribute_schema_id=meta_attribute_schema_id
        )
        return mapping

    def to_result(self):
        """Return the public namedtuple form of this mapping."""
        return MetaAttributeNamespaceMapping(
            self.namespace_id,
            self.meta_attribute_schema_id
        )
class Team(object):
    """Contains some helper methods for dealing with the data pipeline team
    configuration. Eventually this class should encapsulate information about
    each team. That's overkill for the current use-case, so it's not
    implemented yet.

    """

    @classmethod
    def config(cls):
        """Loads and decodes the
        :attr:`data_pipeline.config.Config.data_pipeline_teams_config_file_path`.

        TODO(justinc|DATAPIPE-348): Cache team config, dealing with invalidation
        when configuration changes.

        Returns:
            dict: team configuration
        """
        config_path = get_config().data_pipeline_teams_config_file_path
        # Close the file deterministically instead of leaking the handle
        # until garbage collection.
        with open(config_path) as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python objects.
            # If this file can ever come from an untrusted source, switch to
            # yaml.safe_load.
            return yaml.load(config_file.read())

    @classmethod
    def team_names(cls):
        """Lists all data pipeline teams

        Returns:
            list of str: all valid data pipeline team names
        """
        # list() keeps the documented return type on Python 3, where
        # dict.keys() is a view rather than a list.
        return list(cls.config()['teams'])

    @classmethod
    def exists(cls, team_name):
        """Determines if a team exists, by name.

        Args:
            team_name(str): name of the team to look up

        Returns:
            bool: True if team_name exists for a valid team, false otherwise
        """
        return team_name in cls.team_names()
# More info at https://github.com/guard/guard#readme

module ::Guard
  # Rebuilds the docs with `make docs` for every kind of file-change event.
  class MakeGuard < Plugin
    def make_docs(paths=nil)
      UI.info 'Rebuilding docs...'
      out = `make docs 2>&1`
      if $?.exitstatus == 0
        UI.info 'Docs built'
      else
        UI.error '`make docs` exited with non-zero status'
        UI.debug out
        throw :task_has_failed
      end
    end

    # Every guard callback does the same thing: rebuild the docs.
    [:run_all, :run_on_additions, :run_on_modifications, :run_on_removals].each do |method|
      alias_method method, :make_docs
    end
  end

  # Watchers should return :all to test all files, or the path of files to test
  class PyTest < Plugin
    # Runs the given test files through the `guard` tox environment.
    def run_tests(test_files=[])
      # File.exists? is deprecated and was removed in Ruby 3.2;
      # File.exist? is the supported spelling with the same behavior.
      test_files = test_files.select{|f| File.exist?(f)}
      UI.info "Running test files: #{test_files.join(', ')}"
      # Leave some room so tox output is not wrapped by the terminal.
      cols = ENV['COLUMNS'].to_i - 10
      out = `COLUMNS=#{cols} tox -e guard "#{test_files.join(' ')}"`
      UI.info "Test Output:\n#{out}"
    end

    def run_all(paths=nil)
      run_tests
    end
    alias_method :run_on_removals, :run_all

    def run_on_additions(paths)
      if paths == ['all']
        run_all
      else
        run_tests(paths)
      end
    end
    alias_method :run_on_modifications, :run_on_additions
  end
end

guard :pytest do
  # Test the changed file and the corresponding test
  watch(%r{^data_pipeline/(.+)\.py$}) {|m| [m[0], "tests/#{m[1]}_test.py"] }
  watch(%r{^data_pipeline/(async_producer|position_data|_kafka_producer|_position_data_builder)\.py$}) { 'tests/producer_test.py' }
  watch(%r{^tests/(.+)\_test.py$})
  watch(%r{^tests/helpers/(.+).py$}) { :all }
  watch(%r{^tests/conftest.py$}) { :all }
  watch('tox.ini') { :all }
  # rst files in root and docs directories only
  watch(%r{^([^/]+|docs/[^/]+)\.rst$})
end

guard :make_guard do
  watch(%r{^data_pipeline/(.+)\.py$})
  # rst files in root and docs directories only
  watch(%r{^([^/]+|docs/[^/]+)\.rst$})
  watch('docs/conf.py')
  watch('tox.ini')
end
class IntrospectorTopic(BaseIntrospectorModel):
    """Introspector view of a topic, flattened with its source and namespace
    and optionally annotated with kafka presence and message count.
    """

    def __init__(self, topic_obj, kafka_topics=None, topics_to_range_map=None):
        super(IntrospectorTopic, self).__init__(
            topic_obj,
            excluded_fields=['source']
        )
        self._fields = [
            'name', 'topic_id', 'source_name', 'source_id', 'namespace',
            'primary_keys', 'contains_pii', 'cluster_type'
        ]

        # Flatten the nested source onto the topic itself.
        source = topic_obj.source
        self.source_name = source.name
        self.source_id = source.source_id
        self.namespace = source.namespace.name

        if kafka_topics is not None:
            self.in_kafka = self.name in kafka_topics
            self._fields += ['in_kafka']

        if topics_to_range_map is not None:
            count = self._get_topic_message_count(topics_to_range_map)
            self.message_count = count
            if count:
                # A topic that has messages is in kafka.
                self.in_kafka = True
            self._fields += ['message_count']

        self._fields += ['created_at', 'updated_at']

    def _get_topic_message_count(self, topics_to_range_map):
        """Sum the values of this topic's entry in the map; 0 if absent."""
        if self.name not in topics_to_range_map:
            return 0
        return sum(topics_to_range_map[self.name].values())
class _BaseRegisterCommand(IntrospectorCommand):
    """Shared argument handling and schema printing for register commands."""

    @classmethod
    def add_base_arguments(cls, parser):
        """Add the source/namespace, owner email and pii arguments."""
        super(_BaseRegisterCommand, cls).add_base_arguments(parser)
        cls.add_source_and_namespace_arguments(parser)

        parser.add_argument(
            "--source_owner_email",
            type=str,
            required=True,
            help="The email of the owner of the given source."
        )

        parser.add_argument(
            "--contains-pii",
            dest="pii",
            default=False,
            action="store_true",
            help="Flag indicating if schema contains pii. More info at y/pii"
        )

    def process_args(self, args, parser):
        """Copy the parsed owner email and pii flag onto the command."""
        super(_BaseRegisterCommand, self).process_args(args, parser)
        self.process_source_and_namespace_args(args, parser)
        self.source_owner_email = args.source_owner_email
        self.pii = args.pii

    def print_schema(self, schema):
        """Print the schema, including its topic info, as JSON on stdout."""
        schema_dict = IntrospectorSchema(
            schema,
            include_topic_info=True
        ).to_ordered_dict()
        # Parenthesised single argument: valid and identical both as a
        # Python 2 print statement and as a Python 3 print() call.
        print(simplejson.dumps(schema_dict))
class ExpectedFrequency(Enum):
    """Convenience constants, in seconds, for how often a client expects to
    produce or consume messages. Any positive integer number of seconds can
    be used; these are provided for convenience only. Expected frequency
    will be used to infer schema deprecation.

    For example, if a client registers to produce messages constantly, and a
    few months later we observe that the client hasn't published messages
    using an older schema version in a month, but does regularly publish
    using a newer version, we can infer that the older schema version is
    deprecated and send out a deprecation/migration notice.

    Attributes:
      constantly: client expects to always and continuously be producing and
        consuming messages.
      hourly: client expects to come online to produce or consume messages
        approximately every hour.
      daily: client expects to come online to produce or consume messages
        about once a day.
      weekly: client expects to come online to produce or consume messages
        about once a week.
      monthly: client expects to come online to produce or consume messages
        about once a month.
      yearly: client expects to come online to produce or consume messages
        about once a year.
    """
    constantly = 0
    hourly = 3600
    daily = 24 * hourly
    weekly = 7 * daily
    monthly = 30 * daily
    yearly = 365 * daily
class MeteoriteGaugeManager(BasePeriodicProcessor):
    """
    Reports to meteorite/signalfx how far behind real-time the producer is.

    Args:
        interval_in_seconds(int): the time interval between two events.
        stats_gauge_name(str): name of the stats gauge
        container_name(str): paasta container name
        container_env(str): paasta cluster name
        disable(bool): whether this gauge is disabled or not
        kwargs(dict): any additional keyword args for the Meteorite StatsGauge class
    """

    def __init__(
        self,
        interval_in_seconds,
        stats_gauge_name=None,
        container_name=None,
        container_env=None,
        disable=False,
        **kwargs
    ):
        super(MeteoriteGaugeManager, self).__init__(interval_in_seconds)
        gauge = StatGauge(
            stats_gauge_name,
            container_name=container_name,
            container_env=container_env,
            **kwargs
        )
        self.gauge = gauge
        self.disable = disable

    def process(self, timestamp):
        """Set the gauge to the seconds elapsed between ``timestamp`` and
        the current UTC time; does nothing when the gauge is disabled.
        """
        if not self.disable:
            lag = self._utc_now - timestamp
            self.gauge.set(lag.total_seconds())
@pytest.mark.usefixtures(
    "configure_teams",
    "config_benchmark_containers_connections"
)
@pytest.mark.benchmark
class TestBenchProducer(object):
    """Benchmark for publishing messages through the Producer."""

    @pytest.yield_fixture
    def patch_monitor_init_start_time_to_now(self):
        now_seconds = int(time.time())
        with mock.patch(
            'data_pipeline.client._Monitor.get_monitor_window_start_timestamp',
            return_value=now_seconds
        ) as start_time_patch:
            yield start_time_patch

    @pytest.yield_fixture
    def dp_producer(self, team_name):
        producer_context = Producer(
            producer_name='producer_1',
            team_name=team_name,
            expected_frequency_seconds=ExpectedFrequency.constantly,
            use_work_pool=False
        )
        with producer_context as producer:
            yield producer

    def test_publish(self, benchmark, dp_producer):

        def build_publish_args():
            # (args, kwargs) handed to dp_producer.publish for each round
            return [MessageFactory.create_message_with_payload_data()], {}

        # Publishing a message takes 1ms on average.
        # Messages are flushed every 100ms.
        # config::kafka_producer_flush_time_limit_seconds
        #
        # Perform 2000 rounds to ensure 20 flushes.
        benchmark.pedantic(
            dp_producer.publish,
            setup=build_publish_args,
            rounds=2000
        )
class _BaseListCommand(IntrospectorCommand):
    """Shared sorting arguments and description for the list commands.

    Subclasses provide the ``list_type`` and ``fields`` class attributes.
    """

    @classmethod
    def add_base_arguments(cls, parser):
        super(_BaseListCommand, cls).add_base_arguments(parser)

        sort_by_help = (
            "Sort the listing by a particular field of the object "
            "in ascending order (by default)"
        )
        parser.add_argument(
            "-s",
            "--sort-by",
            type=str,
            default=None,
            help=sort_by_help
        )

        descending_help = (
            "Use --sort-by with descending order (Will be ignored if --sort-by is not set)"
        )
        parser.add_argument(
            "--descending-order", "--desc",
            action="store_true",
            default=False,
            help=descending_help
        )

    @classmethod
    def get_description(cls):
        """Return the parser description naming the list type and fields."""
        template = "List {}, as a JSON array of formatted {}. Fields: {}"
        return template.format(cls.list_type, cls.list_type, cls.fields)

    def process_args(self, args, parser):
        """Store the sort options, rejecting an unknown sort field."""
        super(_BaseListCommand, self).process_args(args, parser)
        self.sort_by = args.sort_by
        self.descending_order = args.descending_order

        sort_field_is_unknown = bool(self.sort_by) and self.sort_by not in self.fields
        if sort_field_is_unknown:
            template = "You can not sort_by by {} for list type {}. Possible fields are: {}"
            raise parser.error(
                template.format(self.sort_by, self.list_type, self.fields)
            )
class TopicsListCommand(_BaseListCommand):
    """`topics` list sub-command: prints the matching topics as JSON."""

    list_type = 'topics'
    fields = [
        'name', 'topic_id', 'contains_pii',
        'primary_keys',
        'in_kafka', 'message_count',
        'source_name', 'source_id',
        'namespace',
        'created_at', 'updated_at'
    ]

    @classmethod
    def add_parser(cls, subparsers):
        """Register the `topics` sub-parser and bind it to this command."""
        list_command_parser = subparsers.add_parser(
            "topics",
            description=cls.get_description(),
            add_help=False
        )

        cls.add_base_arguments(list_command_parser)
        cls.add_source_and_namespace_arguments(list_command_parser)

        list_command_parser.set_defaults(
            command=lambda args:
                cls("data_pipeline_introspector_list").run(args, list_command_parser)
        )

    def process_args(self, args, parser):
        super(TopicsListCommand, self).process_args(args, parser)
        self.process_source_and_namespace_args(args, parser)

    def run(self, args, parser):
        """Process the arguments and print the topic listing as JSON."""
        self.process_args(args, parser)
        topics = self.list_topics(
            source_id=self.source_id,
            namespace_name=self.namespace,
            source_name=self.source_name,
            sort_by=self.sort_by,
            descending_order=self.descending_order
        )
        # Parenthesised single argument: valid and identical both as a
        # Python 2 print statement and as a Python 3 print() call.
        print(simplejson.dumps(topics))
5 | # This list was originally constructed by running 6 | # `pip install -i https://pypi.yelpcorp.com/simple/ ".[tools]"` 7 | # and `pip freeze > requirements.txt` in a virtualenv. 8 | # 9 | # Specifically, run: 10 | # 11 | # virtualenv tools 12 | # source tools/bin/activate 13 | # pip install --upgrade pip 14 | # pip install -b /nail/tmp -i https://pypi.yelpcorp.com/simple/ ".[tools]" 15 | # pip freeze > requirements.tmp 16 | # deactivate 17 | # rm -rf tools 18 | # 19 | # Note that the data_pipeline package should not appear here, and that cffi 20 | # should be at least 1.2.1 for trusty compatibility. 21 | 22 | argparse==1.4.0 23 | boto==2.38.0 24 | bravado==8.3.0 25 | bravado-core==4.5.0 26 | bravado-decorators==0.6.0 27 | cached-property==1.2.0 28 | cffi==1.6.0 29 | characteristic==14.1.0 30 | contextdecorator==0.10.0 31 | crochet==1.4.0 32 | cryptography==1.3.4 33 | dateglob==0.1 34 | enum34==1.0.4 35 | fido==4.0.1 36 | functools32==3.2.3-2 37 | future==0.14.3 38 | futures==3.0.3 39 | geogrid==1.0.9 40 | ipaddress==1.0.14 41 | iso8601==0.1.10 42 | jsonschema==2.5.1 43 | kafka-python==0.9.5.post6 44 | kafka-utils==0.4.2 45 | kazoo==2.2 46 | mysqlclient==1.3.6 47 | ply==3.4 48 | psutil==4.2.0 49 | pycparser==2.14 50 | pycrypto==2.6.1 51 | pyOpenSSL==0.14.0 52 | pysensu_yelp==0.2.3 53 | PyStaticConfiguration==0.9.0 54 | pysubnettree==0.23 55 | python-dateutil==2.4.2 56 | pytz==2014.10 57 | PyYAML==3.11 58 | replication-delay-client==1.1.1 59 | repoze.lru==0.6 60 | requests==2.8.1 61 | retrying==1.3.3 62 | send-nsca==0.1.4.1 63 | setproctitle==1.1.8 64 | simplejson==3.6.5 65 | six==1.9.0 66 | SQLAlchemy==0.9.8 67 | subprocess32==3.2.6 68 | swagger-spec-validator==2.0.2 69 | thriftpy==0.1.15 70 | tornado==2.4.1 71 | Twisted==15.4.0 72 | data-pipeline-avro-util==0.2.1 73 | yelp-avro==1.9.2 74 | yelp-batch==0.19.8 75 | yelp-bytes==0.2.0 76 | yelp-cgeom==1.3.1 77 | yelp-clog==2.5.2 78 | yelp-conn==7.1.3 79 | yelp-encodings==0.1.3 80 | yelp-kafka==5.1.1 81 | 
class BasePeriodicProcessor(object):
    """ Interface for periodic work that is driven by a heartbeat event,
    like sensu alert and data event checkpoint. It serves as a base class
    and must be subclassed: subclasses implement :meth:`process`.

    Args:
        interval_in_seconds(int): the time interval between two events.
    """

    def __init__(self, interval_in_seconds):
        self.interval_in_seconds = interval_in_seconds
        # The first periodic_process call is due immediately.
        self._next_process_time = self._utc_now

    def periodic_process(self, timestamp=None):
        """ This method remains because it's called by the replication handler;
        if / when we start calling the process method below directly from the
        replication handler we can remove it (DATAPIPE-1435)
        Args:
            timestamp(datetime.datetime): the datetime of the event with utc
        """
        if not self._should_process():
            return
        self.process(timestamp)
        self._next_process_time = self._compute_next_process_time()

    def process(self, timestamp=None):
        raise NotImplementedError

    def _should_process(self):
        now = self._utc_now
        return now >= self._next_process_time

    def _compute_next_process_time(self):
        now = self._utc_now
        return now + timedelta(seconds=self.interval_in_seconds)

    @property
    def _utc_now(self):
        return datetime.now(tzutc())
class NamespacesListCommand(_BaseListCommand):
    """Introspector sub-command that prints namespaces as a JSON document.

    Registered under the ``namespaces`` sub-parser name. The helpers used
    here (``get_description``, ``add_base_arguments``, ``process_args``,
    ``list_namespaces``) are not defined in this class; they are presumably
    inherited from ``_BaseListCommand``.
    """

    # Not read in this class; presumably consumed by the base list command.
    list_type = 'namespaces'
    fields = [
        'name', 'namespace_id',
        'active_topic_count', 'active_source_count'
    ]

    @classmethod
    def add_parser(cls, subparsers):
        """Register the ``namespaces`` sub-parser and its arguments.

        Args:
            subparsers: the argparse sub-parsers object to add this
                command's parser to.
        """
        list_command_parser = subparsers.add_parser(
            "namespaces",
            description=cls.get_description(),
            add_help=False
        )

        list_command_parser.add_argument(
            '--active-namespaces',
            default=False,
            action='store_true',
            help=(
                'If set, this command will also return information about active '
                'sources and topics within each namespace. '
                'This is a time expensive operation.'
            )
        )

        cls.add_base_arguments(list_command_parser)

        # The command object is only instantiated when the parsed args'
        # ``command`` callable is actually invoked.
        list_command_parser.set_defaults(
            command=lambda args:
                cls("data_pipeline_introspector_list_namespaces").run(
                    args,
                    list_command_parser
                )
        )

    def run(self, args, parser):
        """Process the parsed arguments and print the namespaces as JSON.

        Args:
            args: parsed argparse namespace; ``active_namespaces`` is read
                here directly.
            parser: the parser that produced ``args``, forwarded to
                ``process_args``.
        """
        self.process_args(args, parser)
        # Call form with a single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(simplejson.dumps(self.list_namespaces(
            sort_by=self.sort_by,
            descending_order=self.descending_order,
            active_namespaces=args.active_namespaces
        )))
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | import mock 20 | import pytest 21 | 22 | from data_pipeline.config import get_config 23 | from data_pipeline.helpers.log import debug_log 24 | from tests.factories.base_factory import MessageFactory 25 | 26 | 27 | @pytest.mark.usefixtures( 28 | "config_benchmark_containers_connections" 29 | ) 30 | @pytest.mark.benchmark 31 | class TestBenchLogging(object): 32 | 33 | @pytest.fixture( 34 | params=[True, False], 35 | ids=['logger enabled', 'logger disabled'] 36 | ) 37 | def logger_enabled(self, request): 38 | return request.param 39 | 40 | @pytest.yield_fixture 41 | def patch_logger_enabled(self, logger_enabled): 42 | with mock.patch( 43 | 'data_pipeline.config.logging.Logger' 44 | '.isEnabledFor', 45 | return_value=logger_enabled 46 | ): 47 | yield 48 | 49 | @pytest.fixture 50 | def message(self): 51 | return MessageFactory.create_message_with_payload_data() 52 | 53 | def test_debug_log(self, message, benchmark, patch_logger_enabled): 54 | @benchmark 55 | def log_command(): 56 | debug_log(lambda: "Message buffered: {}".format(repr(message))) 57 | 58 | def test_logger(self, benchmark, message, patch_logger_enabled): 59 | @benchmark 60 | def log_command(): 61 | get_config().logger.debug("Message buffered: {}".format(repr(message))) 62 | 63 | def test_repr_message(self, benchmark, message): 64 | @benchmark 65 | def log_command(): 66 | repr(message) 67 | 68 | def test_pass(self, benchmark, message): 69 | @benchmark 70 | def 
class _EnumRepr(Enum):
    """Enum base whose repr reads ``ClassName.member(value)``."""

    def __repr__(self):
        return '{}.{}({})'.format(type(self).__name__, self.name, self.value)


class MessageType(_EnumRepr):
    """Messages should be published primarily using the create, update, and
    delete types. Refresh messages should only be produced if you know what
    you're doing, if in doubt, ask please.

    Attributes:
        log: value 0. NOTE(review): its intended use is not described
            anywhere in this module; confirm before relying on it.
        create: when new data is created, the payload contains the contents of
            the new row
        update: when data is updated, payload contains the new content and
            previous_payload contains the old content of the row
        delete: when rows are removed, the payload contains the content of the
            row before removal
        refresh: refresh messages are used to initially populate a topic, they
            do not correspond to any particular data change
    """
    log = 0
    create = 1
    update = 2
    delete = 3
    refresh = 4


class _ProtectedMessageType(_EnumRepr):
    """Protected message types should generally be avoided. The clientlib
    won't expose these messages to users, they're used internally only.

    Attributes:
        heartbeat: emitted periodically on low volume topics so auditing
            processes can differentiate between slow or stalled topics and
            topics without messages.
        monitor: monitor messages are used to count the number of messages
            produced/consumed by client in a given time frame
        registration: value 7. NOTE(review): not described in this module;
            presumably used for client registration messages - confirm.
    """
    # Values continue the numbering started by MessageType (0-4).
    heartbeat = 5
    monitor = 6
    registration = 7
class TestSensuTTLAlerter(object):
    """Tests for SensuTTLAlerter with ``pysensu_yelp.send_event`` mocked out."""

    @pytest.fixture
    def sensu_ttl_alerter(self):
        """An enabled alerter built from a minimal sensu event dict."""
        test_dict = {
            "name": "datapipeline_ttl_alerter_test",
            "output": "this is only a test of the datapipeline test alerter",
            "irc_channels": "#bam",
            "check_every": 60,
            "ttl": "300s",
            "runbook": "y/datapipeline",
            "status": 0,
            "team": "bam"
        }
        return SensuTTLAlerter(test_dict, enable=True)

    @pytest.yield_fixture
    def mocked_send_event(self, sensu_ttl_alerter):
        """Patch ``pysensu_yelp.send_event`` so no real event is emitted.

        Depends on ``sensu_ttl_alerter`` so the alerter is constructed before
        the patch is installed; construction itself is therefore not counted
        in ``call_count``.
        """
        with mock.patch.object(
            pysensu_yelp,
            'send_event',
            autospec=True
        ) as mocked_send_event:
            yield mocked_send_event

    def test_send_event_while_enabled(self, sensu_ttl_alerter, mocked_send_event):
        sensu_ttl_alerter.process()
        assert mocked_send_event.call_count == 1

    def test_toggling_enable_to_false(self, sensu_ttl_alerter, mocked_send_event):
        sensu_ttl_alerter.enable = False
        assert mocked_send_event.call_count == 1
        # ``call_args`` is an (args, kwargs) pair, so ``'ttl' not in call_args``
        # could never fail. Inspect the actual payload instead, whether the
        # event fields were passed as keywords or as a positional dict.
        args, kwargs = mocked_send_event.call_args
        assert 'ttl' not in kwargs
        assert not any(isinstance(arg, dict) and 'ttl' in arg for arg in args)

    def test_no_send_event_while_disabled(self, sensu_ttl_alerter, mocked_send_event):
        # there's one call when we toggle from True to False
        sensu_ttl_alerter.enable = False
        assert mocked_send_event.call_count == 1
        # there should be no further calls
        sensu_ttl_alerter.process()
        assert mocked_send_event.call_count == 1
coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from datetime import datetime 20 | from datetime import timedelta 21 | 22 | import mock 23 | from dateutil.tz import tzutc 24 | 25 | from data_pipeline.tools.meteorite_gauge_manager import MeteoriteGaugeManager 26 | 27 | 28 | class TestMeteoriteGaugeManager(object): 29 | 30 | @mock.patch('yelp_meteorite.metrics.Gauge.set', autospec=True) 31 | def test_gauge_manager_call_count(self, mock_set): 32 | gauge = MeteoriteGaugeManager( 33 | interval_in_seconds=10, 34 | stats_gauge_name='test_gauge' 35 | ) 36 | ts = datetime.now(tzutc()) 37 | gauge.process(ts) 38 | assert mock_set.call_count == 1 39 | 40 | @mock.patch('yelp_meteorite.metrics.Gauge.set', autospec=True) 41 | def test_gauge_manager_process_args(self, mock_set): 42 | with mock.patch( 43 | 'data_pipeline.tools.meteorite_gauge_manager.MeteoriteGaugeManager._utc_now', 44 | new_callable=mock.PropertyMock 45 | ) as utc_now: 46 | fake_time = datetime(year=2016, month=1, day=1) 47 | utc_now.return_value = fake_time 48 | gauge = MeteoriteGaugeManager( 49 | interval_in_seconds=10, 50 | stats_gauge_name='test_gauge' 51 | ) 52 | ts = fake_time - timedelta(seconds=60) 53 | gauge.process(ts) 54 | assert mock_set.call_args[0][1] == 60.0 55 | 56 | 
def get_actual_published_messages_count(
    kafka_client,
    topics,
    topic_tracked_offset_map,
    raise_on_error=True,
):
    """Get the actual number of published messages of specified topics.

    The count for a topic is the high watermark of its entry ``0`` in the
    watermarks returned by ``get_topics_watermarks`` minus the tracked offset
    for that topic. Only that single entry is consulted; other partitions,
    if any, are not included in the count.

    Args:
        kafka_client (kafka.client.KafkaClient): kafka client
        topics ([str]): List of topic names to get message count
        topic_tracked_offset_map (dict(str, int)): dictionary which
            contains each topic and its current stored offset value.
            Topics absent from the map are treated as having offset 0.
        raise_on_error (Optional[bool]): if False, the function ignores
            missing topics and missing partitions. It still may fail on
            the request send. Default to True.

    Returns:
        dict(str, int): Each topic and its actual published messages count
        since last offset. If a topic or partition is missing when
        `raise_on_error` is False, the returned dict will not contain
        the missing topic.

    Raises:
        :class:`~yelp_kafka.error.UnknownTopic`: upon missing topics and
            raise_on_error=True
        :class:`~yelp_kafka.error.UnknownPartition`: upon missing partitions
            and raise_on_error=True
        FailedPayloadsError: upon send request error.
    """
    topic_watermarks = get_topics_watermarks(
        kafka_client,
        topics,
        raise_on_error=raise_on_error
    )

    topic_to_published_msgs_count = {}
    # ``items()`` is available on both Python 2 and 3; ``iteritems()`` is
    # Python 2 only.
    for topic, partition_offsets in topic_watermarks.items():
        high_watermark = partition_offsets[0].highmark
        offset = topic_tracked_offset_map.get(topic, 0)
        topic_to_published_msgs_count[topic] = high_watermark - offset

    return topic_to_published_msgs_count
class SchemaCheckCommand(IntrospectorCommand):
    """Introspector sub-command that checks avro schema compatibility.

    Registered under the ``schema-check`` sub-parser name. Asks the
    schematizer whether a given avro schema is compatible within a source
    and namespace, and prints the answer.
    """

    @classmethod
    def add_parser(cls, subparsers):
        """Register the ``schema-check`` sub-parser and its arguments.

        Args:
            subparsers: the argparse sub-parsers object to add this
                command's parser to.
        """
        schema_check_command_parser = subparsers.add_parser(
            "schema-check",
            description="Checks the compatibility of an avro schema and all"
                        " given avro_schemas within the given namespace"
                        " and source. Compatibility means that the schema can"
                        " deserialize data serialized by existing schemas within"
                        " all topics and vice-versa.",
            add_help=False
        )

        cls.add_base_arguments(schema_check_command_parser)
        cls.add_source_and_namespace_arguments(schema_check_command_parser)

        schema_check_command_parser.add_argument(
            "schema",
            type=str,
            help="The avro schema to check."
        )

        # NOTE(review): "instropsector" looks like a misspelling of
        # "introspector". It is left unchanged because the string is passed
        # to the command constructor and may be relied on elsewhere.
        schema_check_command_parser.set_defaults(
            command=lambda args: cls("data_pipeline_instropsector_schema_check").run(
                args, schema_check_command_parser
            )
        )

    def process_args(self, args, parser):
        """Run base and source/namespace argument handling, then store the schema."""
        super(SchemaCheckCommand, self).process_args(args, parser)
        self.process_source_and_namespace_args(args, parser)
        self.schema = args.schema

    def is_compatible(self):
        """Return the schematizer's compatibility verdict for ``self.schema``.

        Uses ``self.source_name`` and ``self.namespace``, which are not set
        in this class; presumably ``process_source_and_namespace_args``
        populates them.
        """
        is_compatible = self.schematizer.is_avro_schema_compatible(
            avro_schema_str=self.schema,
            source_name=self.source_name,
            namespace_name=self.namespace
        )
        return is_compatible

    def run(self, args, parser):
        """Process the parsed arguments and print ``{"is_compatible": ...}``."""
        self.process_args(args, parser)
        # Call form with a single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print({"is_compatible": self.is_compatible()})
default = https://pypi.yelpcorp.com/simple/ 8 | 9 | [testenv] 10 | basepython = python2.7 11 | envdir = venv/py27 12 | setenv = 13 | PIP_INDEX_URL = https://pypi.yelpcorp.com/simple 14 | venv_update = {toxinidir}/bin/venv-update venv= {envdir} install= 15 | commands = 16 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev-internal.txt 17 | py.test -c tox.ini --cov=data_pipeline --maxfail=3 --benchmark-skip tests/ 18 | pre-commit run --all-files 19 | 20 | [testenv:pre-commit] 21 | envdir = venv/pre_commit 22 | commands = 23 | {[testenv]venv_update} -r {toxinidir}/requirements.d/pre_commit.txt 24 | pre-commit {posargs} 25 | 26 | [testenv:guard] 27 | envlist = py27 28 | envdir = venv/py27 29 | commands = 30 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev-internal.txt 31 | py.test -c tox.ini --doctest-modules -m "not pending" {posargs} 32 | 33 | [testenv:docs] 34 | envdir = venv/py27 35 | commands = 36 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev-internal.txt 37 | sphinx-apidoc -f -e -o docs/code data_pipeline 38 | sphinx-build -b html -d docs/build/doctrees docs/ docs/build/html 39 | 40 | [testenv:devenv] 41 | envdir = venv/py27 42 | commands = 43 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev-internal.txt 44 | 45 | [testenv:devenv-command] 46 | envdir = venv/py27 47 | commands = 48 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev-internal.txt 49 | {posargs} 50 | 51 | [testenv:benchmark] 52 | envdir = venv/py27 53 | commands = 54 | {[testenv]venv_update} -r {toxinidir}/requirements.d/dev-internal.txt 55 | docker-compose --file=docker-compose.yml --file=data_pipeline/testing_helpers/docker-compose.yml kill pypy 56 | docker-compose --file=docker-compose.yml --file=data_pipeline/testing_helpers/docker-compose.yml rm -v --force pypy 57 | docker-compose --file=docker-compose.yml --file=data_pipeline/testing_helpers/docker-compose.yml build pypy 58 | docker-compose --file=docker-compose.yml 
--file=data_pipeline/testing_helpers/docker-compose.yml run pypy /dp_reqs/venv/bin/py.test -m "benchmark" --benchmark-verbose {posargs} 59 | 60 | [flake8] 61 | ignore = 62 | exclude = .git,.tox,docs,virtualenv_run,venv,__pycache__,.ropeproject,debian,dist 63 | filename = *.py,*.wsgi 64 | max-line-length = 131 65 | 66 | [pytest] 67 | addopts = -m"not benchmark" -m"not skip" --ignore=setup.py --doctest-glob=*.rst -vv 68 | 69 | [pep8] 70 | # E265 deals with spacing inside of comments - breaks human formatting 71 | # E309 puts a blank line after class declarations - doesn't work well with docstrings 72 | # E501 reformats lines to fit in --max-line-length poorly 73 | ignore = E265,E309,E501 74 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/list_command/sources.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
class SourcesListCommand(_BaseListCommand):
    """Introspector sub-command that prints sources as a JSON document.

    Registered under the ``sources`` sub-parser name. The helpers used here
    (``get_description``, ``add_base_arguments``, ``list_sources``) are not
    defined in this class; they are presumably inherited from
    ``_BaseListCommand``.
    """

    # Not read in this class; presumably consumed by the base list command.
    list_type = 'sources'
    fields = [
        'name', 'source_id', 'owner_email',
        'namespace', 'active_topic_count'
    ]

    @classmethod
    def add_parser(cls, subparsers):
        """Register the ``sources`` sub-parser and its arguments.

        Args:
            subparsers: the argparse sub-parsers object to add this
                command's parser to.
        """
        list_command_parser = subparsers.add_parser(
            "sources",
            description=cls.get_description(),
            add_help=False
        )

        list_command_parser.add_argument(
            '--active-sources',
            default=False,
            action='store_true',
            help=(
                'If set, this command will also return information about active '
                'topics within each source. This is a time expensive operation.'
            )
        )

        cls.add_base_arguments(list_command_parser)

        list_command_parser.add_argument(
            "--namespace",
            type=str,
            default=None,
            help="Namespace name that contains a source of source name given. "
                 "If --source-id is given, then this will be ignored."
        )

        # The command object is only instantiated when the parsed args'
        # ``command`` callable is actually invoked.
        list_command_parser.set_defaults(
            command=lambda args:
                cls("data_pipeline_introspector_list").run(args, list_command_parser)
        )

    def process_args(self, args, parser):
        """Run the base argument handling, then store ``--namespace``."""
        super(SourcesListCommand, self).process_args(args, parser)
        self.namespace = args.namespace

    def run(self, args, parser):
        """Process the parsed arguments and print the sources as JSON.

        Args:
            args: parsed argparse namespace; ``active_sources`` is read here
                directly.
            parser: the parser that produced ``args``, forwarded to
                ``process_args``.
        """
        self.process_args(args, parser)
        # Call form with a single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(simplejson.dumps(self.list_sources(
            namespace_name=self.namespace,
            sort_by=self.sort_by,
            descending_order=self.descending_order,
            active_sources=args.active_sources
        )))
class TestFastUUID(object):
    """Tests for FastUUID, run both with and without libuuid available."""

    @pytest.fixture(params=[True, False])
    def libuuid_available(self, request):
        """Parametrize every dependent test over libuuid present/absent."""
        return request.param

    @pytest.yield_fixture
    def fast_uuid(self, libuuid_available):
        """Yield a FastUUID; when libuuid is "unavailable", FFI is made to fail."""
        if libuuid_available:
            yield FastUUID()
        else:
            with mock.patch.object(
                data_pipeline._fast_uuid,
                'FFI',
                side_effect=Exception
            ):
                # Save and restore the existing state; this will allow already
                # instantiated FastUUID instances to keep working.
                original_ffi = data_pipeline._fast_uuid._LibUUID._ffi
                data_pipeline._fast_uuid._LibUUID._ffi = None
                try:
                    yield FastUUID()
                finally:
                    data_pipeline._fast_uuid._LibUUID._ffi = original_ffi

    def test_uuid1(self, fast_uuid):
        assert self._is_valid_uuid(fast_uuid.uuid1())

    def test_uuid1_does_not_repeat(self, fast_uuid):
        assert fast_uuid.uuid1() != fast_uuid.uuid1()

    def test_uuid4(self, fast_uuid):
        # Previously this called uuid1(), so the format of uuid4() output
        # was never actually validated.
        assert self._is_valid_uuid(fast_uuid.uuid4())

    def test_uuid4_does_not_repeat(self, fast_uuid):
        assert fast_uuid.uuid4() != fast_uuid.uuid4()

    def _is_valid_uuid(self, uuid_val):
        """A valid value here is a native ``str`` of exactly 16 characters."""
        return isinstance(uuid_val, str) and len(uuid_val) == 16

    def test_use_libuuid_when_available(self, fast_uuid, libuuid_available):
        # Generate once before inspecting which backend is in use.
        fast_uuid.uuid1()
        if libuuid_available:
            assert isinstance(fast_uuid._uuid_in_use, _LibUUID)
        else:
            assert isinstance(fast_uuid._uuid_in_use, _DefaultUUID)
-------------------------------------------------------------------------------- 1 | CURRENT_VERSION=$(strip $(shell sed -n -r "s/__version__ = '(.+)'/\1/p" $(CURDIR)/data_pipeline/__init__.py)) 2 | NEXT_VERSION=$(shell echo $(CURRENT_VERSION) | awk -F. '/[0-9]+\./{$$NF+=1;OFS=".";print}') 3 | 4 | REBUILD_FLAG = 5 | 6 | .PHONY: help all production clean clean-pyc clean-build clean-docs clean-vim lint test docs coverage install-hooks release prepare-release compose-prefix 7 | 8 | help: 9 | @echo "clean-build - remove build artifacts" 10 | @echo "clean-pyc - remove Python file artifacts" 11 | @echo "clean-docs - remove doc creation artifacts" 12 | @echo "clean-vim - remove vim swap file artifacts" 13 | @echo "test - run tests quickly with the default Python" 14 | @echo "coverage - check code coverage" 15 | @echo "docs - generates Sphinx HTML documentation, including API docs" 16 | @echo "compose-prefix - generates a preconfigured docker-compose command" 17 | @echo "prepare-release - Bump the version number and add a changelog entry (pushmasters only)" 18 | @echo "release - Commit the latest version, tag the commit, and push it (pushmasters only)" 19 | 20 | all: production install-hooks 21 | 22 | production: 23 | @true 24 | 25 | clean: clean-build clean-pyc clean-docs 26 | 27 | clean-build: 28 | rm -fr build/ 29 | rm -fr dist/ 30 | rm -fr *.egg-info 31 | 32 | clean-pyc: 33 | find . -name '*.pyc' -exec rm -f {} + 34 | find . -name '*.pyo' -exec rm -f {} + 35 | find . -name '*~' -exec rm -f {} + 36 | 37 | clean-docs: 38 | rm -rf docs/build/* 39 | rm -rf docs/code/* 40 | 41 | clean-vim: 42 | find . -name '*.swp' -exec rm -f {} + 43 | find . 
-name '*.swo' -exec rm -f {} + 44 | 45 | test: 46 | # This will timeout after 30 minutes, in case there is a hang on jenkins 47 | PULL_CONTAINERS=true FORCE_FRESH_CONTAINERS=true timeout -9 1800 tox -c tox.ini $(REBUILD_FLAG) 48 | 49 | docs: clean-docs 50 | tox -c tox.ini -e docs $(REBUILD_FLAG) 51 | 52 | coverage: test 53 | 54 | install-hooks: 55 | tox -c tox.ini -e pre-commit -- install -f --install-hooks 56 | 57 | # See the makefile in yelp_package/Makefile for packaging stuff 58 | itest_%: 59 | make -C yelp_package $@ 60 | 61 | # Steps to release (Don't do this if you are not a pushmaster - see "Pushing Code" 62 | # on y/datapipeline) 63 | # 1. `make prepare-release` 64 | # 2. `make release` 65 | LAST_COMMIT_MSG = $(shell git log -1 --pretty=%B ) 66 | prepare-release: 67 | dch -v $(NEXT_VERSION) --changelog debian/changelog "Commit: $(LAST_COMMIT_MSG)" 68 | sed -i -r "s/__version__ = '(.+)'/__version__ = '$(NEXT_VERSION)'/" data_pipeline/__init__.py 69 | @git diff 70 | 71 | release: 72 | git commit -a -m "Released $(CURRENT_VERSION) via make release" 73 | git tag v$(CURRENT_VERSION) 74 | git push --tags origin master && git push origin master 75 | 76 | compose-prefix: 77 | @python -c "from data_pipeline.testing_helpers.containers import Containers; print Containers.compose_prefix()" 78 | -------------------------------------------------------------------------------- /tests/tools/sensu_alert_manager_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
class TestSensuAlertManager(object):
    """Tests for SensuAlertManager with ``log_and_send_event`` patched out."""

    @pytest.fixture
    def sensu_alert_manager(self):
        """An enabled manager: 30s interval, 120s maximum allowed delay."""
        manager_kwargs = dict(
            interval_in_seconds=30,
            service_name="test_service",
            result_dict={"check_every": 60, "ttl": "300s"},
            max_delay_seconds=120,
            disable=False,
        )
        return SensuAlertManager(**manager_kwargs)

    @pytest.yield_fixture
    def mocked_log_and_send(self, sensu_alert_manager):
        """Replace the manager's ``log_and_send_event`` with an autospec mock."""
        patcher = mock.patch.object(
            sensu_alert_manager,
            'log_and_send_event',
            autospec=True
        )
        with patcher as patched_method:
            yield patched_method

    def test_process_no_timestamp(self, sensu_alert_manager, mocked_log_and_send):
        sensu_alert_manager.process()
        assert mocked_log_and_send.call_count == 0

    def test_process_with_recent_timestamp(self, sensu_alert_manager, mocked_log_and_send):
        right_now = datetime.now(tzutc())
        sensu_alert_manager.process(right_now)
        assert mocked_log_and_send.call_count == 1
        # First positional argument of the recorded call is the event dict.
        sent_event = mocked_log_and_send.call_args[0][0]
        assert sent_event['output'] == "test_service has caught up to real time"

    def test_process_with_old_timestamp(self, sensu_alert_manager, mocked_log_and_send):
        a_day_ago = datetime.now(tzutc()) - timedelta(hours=24)
        sensu_alert_manager.process(a_day_ago)
        assert mocked_log_and_send.call_count == 1
        sent_event = mocked_log_and_send.call_args[0][0]
        assert "min behind real time" in sent_event['output']

    def test_toggling_disable_to_true(self, sensu_alert_manager, mocked_log_and_send):
        sensu_alert_manager.disable = True
        assert mocked_log_and_send.call_count == 1
        sent_event = mocked_log_and_send.call_args[0][0]
        assert sent_event['output'] == "disabling sensu alert for test_service"
class TopicInfoCommand(IntrospectorCommand):
    """Introspector sub-command that prints details of one topic as JSON.

    Registered under the ``topic`` sub-parser name. The output is the
    topic's ordered-dict representation with an added ``schemas`` list.
    """

    @classmethod
    def add_parser(cls, subparsers):
        """Register the ``topic`` sub-parser and its arguments.

        Args:
            subparsers: the argparse sub-parsers object to add this
                command's parser to.
        """
        info_command_parser = subparsers.add_parser(
            "topic",
            description="Get information on a specific data pipeline topic.",
            add_help=False
        )

        cls.add_base_arguments(info_command_parser)

        info_command_parser.add_argument(
            "topic_name",
            type=str,
            help="Name of topic to retrieve information on."
        )

        # NOTE(review): "instropsector" looks like a misspelling of
        # "introspector". It is left unchanged because the string is passed
        # to the command constructor and may be relied on elsewhere.
        info_command_parser.set_defaults(
            command=lambda args: cls("data_pipeline_instropsector_info_topic").run(
                args,
                info_command_parser
            )
        )

    def info_topic(self, name):
        """Return the ordered-dict description of topic ``name``.

        The topic is fetched from the schematizer, wrapped in
        ``IntrospectorTopic`` and extended with a ``schemas`` entry from
        :meth:`list_schemas`. ``self._kafka_topics`` and
        ``self._topics_with_messages_to_range_map`` are not defined in this
        class; presumably the base command provides them.
        """
        topic = self.schematizer.get_topic_by_name(name)
        topic = IntrospectorTopic(
            topic,
            kafka_topics=self._kafka_topics,
            topics_to_range_map=self._topics_with_messages_to_range_map
        ).to_ordered_dict()
        topic['schemas'] = self.list_schemas(name)
        return topic

    def list_schemas(
        self,
        topic_name
    ):
        """Return the topic's schemas as ordered dicts, newest first.

        Ordering is by each schema's ``created_at`` value, descending.
        """
        schemas = self.schematizer.get_schemas_by_topic(topic_name)
        schemas = [IntrospectorSchema(schema).to_ordered_dict() for schema in schemas]
        schemas.sort(key=lambda schema: schema['created_at'], reverse=True)
        return schemas

    def process_args(self, args, parser):
        """Run the base argument handling, then store the topic name."""
        super(TopicInfoCommand, self).process_args(args, parser)
        self.topic_name = args.topic_name

    def run(self, args, parser):
        """Process the parsed arguments and print the topic info as JSON."""
        self.process_args(args, parser)
        # Call form with a single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(simplejson.dumps(
            self.info_topic(self.topic_name)
        ))
@pytest.mark.usefixtures(
    "config_benchmark_containers_connections"
)
@pytest.mark.benchmark
class TestBenchMessage(object):
    """Benchmarks for message creation and avro payload encode/decode.

    Each benchmark runs 1000 rounds via ``benchmark.pedantic``; inputs are
    rebuilt by the ``setup`` callable, which returns ``(args, kwargs)`` for
    the benchmarked function.
    """

    @staticmethod
    def _schema_id_and_payload_data():
        """Return ``(schema_id, payload_data)`` from the schema factory."""
        return (
            SchemaFactory.get_schema_json().schema_id,
            SchemaFactory.get_payload_data(),
        )

    def test_create_message(self, benchmark):

        def build_inputs():
            schema_id, payload_data = self._schema_id_and_payload_data()
            return [schema_id, payload_data], {}

        def construct(schema_id, payload_data):
            CreateMessage(
                schema_id=schema_id,
                payload_data=payload_data
            )

        benchmark.pedantic(construct, setup=build_inputs, rounds=1000)

    def test_encode_message(self, benchmark):

        def build_inputs():
            schema_id, payload_data = self._schema_id_and_payload_data()
            return [schema_id, payload_data], {}

        def encode(schema_id, payload_data):
            writer = _AvroStringStore().get_writer(schema_id)
            writer.encode(message_avro_representation=payload_data)

        benchmark.pedantic(encode, setup=build_inputs, rounds=1000)

    def test_decode_message(self, benchmark):

        def build_inputs():
            schema_id, payload_data = self._schema_id_and_payload_data()
            # Encoding happens in setup so only decoding is benchmarked.
            encoded = _AvroStringStore().get_writer(schema_id).encode(
                message_avro_representation=payload_data
            )
            return [schema_id, encoded], {}

        def decode(schema_id, payload):
            reader = _AvroStringStore().get_reader(
                reader_id_key=schema_id,
                writer_id_key=schema_id
            )
            reader.decode(encoded_message=payload)

        benchmark.pedantic(decode, setup=build_inputs, rounds=1000)
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from collections import namedtuple 20 | 21 | from data_pipeline.schematizer_clientlib.models.model_base import BaseModel 22 | 23 | 24 | """ 25 | Represent the data of a data target. A data target represents a destination 26 | where the data(messages) are eventually sent to, such as a Redshift cluster. 27 | 28 | Args: 29 | data_target_id (int): The id of the data target. 30 | target_type (str): The target type, such as Redshift, etc. 31 | destination (str): The actual location of the data target, such as the Url 32 | of a Redshift cluster. 33 | """ 34 | DataTarget = namedtuple( 35 | 'DataTarget', 36 | ['data_target_id', 'name', 'target_type', 'destination'] 37 | ) 38 | 39 | 40 | class _DataTarget(BaseModel): 41 | """Internal class used to convert from/to various data structure and 42 | facilitate constructing the return value of schematizer functions. 43 | """ 44 | 45 | def __init__(self, data_target_id, name, target_type, destination): 46 | self.data_target_id = data_target_id 47 | self.name = name 48 | self.target_type = target_type 49 | self.destination = destination 50 | 51 | @classmethod 52 | def from_response(cls, response): 53 | return cls( 54 | data_target_id=response.data_target_id, 55 | name=response.name, 56 | target_type=response.target_type, 57 | destination=response.destination 58 | ) 59 | 60 | def to_cache_value(self): 61 | return { 62 | 'data_target_id': self.data_target_id, 63 | 'name': self.name, 64 | 'target_type': self.target_type, 65 | 'destination': self.destination 66 | } 67 | 68 | @classmethod 69 | def from_cache_value(cls, cache_value): 70 | return cls( 71 | data_target_id=cache_value['data_target_id'], 72 | name=cache_value['name'], 73 | target_type=cache_value['target_type'], 74 | destination=cache_value['destination'] 75 | ) 76 | 77 | def to_result(self): 78 | return DataTarget( 79 | data_target_id=self.data_target_id, 80 | name=self.name, 
81 | target_type=self.target_type, 82 | destination=self.destination 83 | ) 84 | -------------------------------------------------------------------------------- /data_pipeline/schemas/envelope_v1.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "namespace": "yelp.data_pipeline", 4 | "name": "envelope", 5 | "doc": "Wraps message with schema format for decoding and pipeline metadata.", 6 | "fields": [ 7 | { 8 | "name": "uuid", 9 | "type": { 10 | "name": "uuid", 11 | "type": "fixed", 12 | "size": 16 13 | }, 14 | "doc": "Uniquely identifies the message." 15 | }, 16 | { 17 | "name": "message_type", 18 | "type": { 19 | "name": "message_type", 20 | "type": "enum", 21 | "doc": "Enumeration for possible types of message, these are used as values for data_pipeline.message_type.MessageType.", 22 | "symbols": [ 23 | "create", 24 | "update", 25 | "delete", 26 | "refresh", 27 | "heartbeat", 28 | "monitor", 29 | "registration", 30 | "log" 31 | ] 32 | }, 33 | "doc": "Identifies the type of message. Refresh messages are used to bootstrap topics and can be safely ignored by consumers that only care about data updates. Heartbeats will be emitted periodically on topics without other messages to facilitate auditing - they will not be passed to the consumer." 34 | }, 35 | { 36 | "name": "schema_id", 37 | "type": "int", 38 | "doc": "Schema identifier for the payload." 39 | }, 40 | { 41 | "name": "payload", 42 | "type": "bytes", 43 | "doc": "Avro-encoded data encoded with the schema corresponding to schema_id" 44 | }, 45 | { 46 | "name": "previous_payload", 47 | "type": [ 48 | "null", 49 | "bytes" 50 | ], 51 | "doc": "Avro-encoded state of the row prior to the update for update messages." 
52 | }, 53 | { 54 | "name": "meta", 55 | "type": [ 56 | "null", 57 | { 58 | "type": "array", 59 | "items": { 60 | "type": "record", 61 | "name": "meta_envelope", 62 | "doc": "Record to encode and pipeline each meta_attribute in data_pipeline message.", 63 | "fields": [ 64 | { 65 | "name": "schema_id", 66 | "type": "int", 67 | "doc": "Schema identifier for the meta-attribute payload." 68 | }, 69 | { 70 | "name": "payload", 71 | "type": "bytes", 72 | "doc": "Avro-encoded data encoded with the schema corresponding to schema_id" 73 | } 74 | ] 75 | } 76 | } 77 | ], 78 | "doc": "List of record containing schema_id and encoded payloads for meta-attributes." 79 | }, 80 | { 81 | "name": "encryption_type", 82 | "type": [ 83 | "null", 84 | "string" 85 | ], 86 | "doc": "If encryption is used, identifies the type and key used to encrypt the payload." 87 | }, 88 | { 89 | "name": "timestamp", 90 | "type": "int", 91 | "doc": "Time the message was produced." 92 | } 93 | ] 94 | } 95 | -------------------------------------------------------------------------------- /data_pipeline/testing_helpers/kafka_docker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from contextlib import contextmanager 20 | 21 | from kafka import KafkaClient 22 | from kafka import SimpleConsumer 23 | 24 | from data_pipeline.config import get_config 25 | from data_pipeline.message import create_from_offset_and_message 26 | 27 | 28 | _ONE_MEGABYTE = 1024 * 1024 29 | logger = get_config().logger 30 | 31 | 32 | @contextmanager 33 | def capture_new_data_pipeline_messages(topic): 34 | """contextmanager that moves to the tail of the given topic, and waits to 35 | receive new messages, returning a function that can be called zero or more 36 | times which will retrieve decoded data pipeline messages from the topic. 37 | 38 | Returns: 39 | Callable[[int], List[Message]]: Function that takes a single 40 | optional argument, count, and returns up to count decoded data pipeline 41 | messages. This function does not block, and will return however many 42 | messages are available immediately. Default count is 100. 43 | """ 44 | with capture_new_messages(topic) as get_kafka_messages: 45 | def get_data_pipeline_messages(count=100): 46 | kafka_messages = get_kafka_messages(count) 47 | return [ 48 | create_from_offset_and_message(kafka_message) 49 | for kafka_message in kafka_messages 50 | ] 51 | 52 | yield get_data_pipeline_messages 53 | 54 | 55 | @contextmanager 56 | def capture_new_messages(topic): 57 | """Seeks to the tail of the topic then returns a function that can 58 | consume messages from that point. 59 | """ 60 | with setup_capture_new_messages_consumer(topic) as consumer: 61 | def get_messages(count=100): 62 | return consumer.get_messages(count=count) 63 | 64 | yield get_messages 65 | 66 | 67 | @contextmanager 68 | def setup_capture_new_messages_consumer(topic): 69 | """Seeks to the tail of the topic then yields the SimpleConsumer that can 70 | consume messages from that point.
71 | """ 72 | kafka = KafkaClient(get_config().cluster_config.broker_list) 73 | group = str('data_pipeline_clientlib_test') 74 | consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=_ONE_MEGABYTE) 75 | consumer.seek(0, 2) # seek to tail: offset 0 from whence 2 (the end) 76 | 77 | yield consumer 78 | 79 | kafka.close() # NOTE(review): not reached if the with-body raises 80 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/register/mysql_command.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License.
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from data_pipeline.tools.introspector.register.base_command import _BaseRegisterCommand 20 | 21 | 22 | class RegisterMysqlCommand(_BaseRegisterCommand): 23 | @classmethod 24 | def add_parser(cls, subparsers): 25 | register_mysql_command_parser = subparsers.add_parser( 26 | "mysql", 27 | description="Register the given mysql statements " 28 | "as schemas to the schematizer.", 29 | add_help=False 30 | ) 31 | 32 | cls.add_base_arguments(register_mysql_command_parser) 33 | 34 | register_mysql_command_parser.add_argument( 35 | "--create-table", 36 | type=str, 37 | required=True, 38 | help="The mysql statement of creating new table" 39 | ) 40 | 41 | register_mysql_command_parser.add_argument( 42 | "--old-create-table", 43 | type=str, 44 | default=None, 45 | help="The mysql statement of creating old table. " 46 | ) 47 | 48 | register_mysql_command_parser.add_argument( 49 | "--alter-table", 50 | type=str, 51 | default=None, 52 | help="The mysql statement of altering table schema. 
" 53 | ) 54 | 55 | register_mysql_command_parser.set_defaults( 56 | command=lambda args: cls("data_pipeline_instropsector_register_mysql").run( 57 | args, register_mysql_command_parser 58 | ) 59 | ) 60 | 61 | def process_args(self, args, parser): 62 | super(RegisterMysqlCommand, self).process_args(args, parser) 63 | self.create_table = args.create_table 64 | self.old_create_table = args.old_create_table 65 | self.alter_table = args.alter_table 66 | 67 | def run(self, args, parser): 68 | self.process_args(args, parser) 69 | schema = self.schematizer.register_schema_from_mysql_stmts( 70 | namespace=self.namespace, 71 | source=self.source_name, 72 | source_owner_email=self.source_owner_email, 73 | contains_pii=self.pii, 74 | new_create_table_stmt=self.create_table, 75 | old_create_table_stmt=self.old_create_table, 76 | alter_table_stmt=self.alter_table 77 | ) 78 | self.print_schema(schema) 79 | -------------------------------------------------------------------------------- /data_pipeline/helpers/yelp_avro_store.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from data_pipeline_avro_util.avro_string_reader import AvroStringReader 20 | from data_pipeline_avro_util.avro_string_writer import AvroStringWriter 21 | 22 | from data_pipeline.helpers.singleton import Singleton 23 | from data_pipeline.schematizer_clientlib.schematizer import get_schematizer 24 | 25 | 26 | class _AvroStringStore(object): 27 | """Singleton instance of store that caches 28 | AvroStringWriter and AvroStringReader objects particularly 29 | used by message class to encode and decode messages respectively. 30 | 31 | This class was added for performance enhancements 32 | w store : pb/199453 33 | w/o store : pb/199448 34 | """ 35 | __metaclass__ = Singleton 36 | 37 | def __init__(self): 38 | self._writer_cache = {} 39 | self._reader_cache = {} 40 | 41 | @property 42 | def _schematizer(self): 43 | return get_schematizer() 44 | 45 | def _get_avro_schema(self, schema_id): 46 | return self._schematizer.get_schema_by_id( 47 | schema_id 48 | ).schema_json 49 | 50 | def get_writer(self, id_key, avro_schema=None): 51 | key = id_key 52 | avro_string_writer = self._writer_cache.get(key) 53 | if avro_string_writer: # cache hit: passed avro_schema is unused 54 | return avro_string_writer 55 | 56 | avro_schema = avro_schema or self._get_avro_schema(id_key) 57 | avro_string_writer = AvroStringWriter(schema=avro_schema) 58 | self._writer_cache[key] = avro_string_writer 59 | return avro_string_writer 60 | 61 | def get_reader( 62 | self, 63 | reader_id_key, 64 | writer_id_key, 65 | reader_avro_schema=None, 66 | writer_avro_schema=None 67 | ): 68 | key = reader_id_key, writer_id_key # cached per (reader, writer) pair 69 | avro_string_reader = self._reader_cache.get(key) 70 | if avro_string_reader: # cache hit: passed schemas are unused 71 | return avro_string_reader 72 | 73 | reader_schema = ( 74 | reader_avro_schema or self._get_avro_schema(reader_id_key) 75 | ) 76 | writer_schema = ( 77 | writer_avro_schema or self._get_avro_schema(writer_id_key) 78 | ) 79 | avro_string_reader =
AvroStringReader( 80 | reader_schema=reader_schema, 81 | writer_schema=writer_schema 82 | ) 83 | self._reader_cache[key] = avro_string_reader 84 | return avro_string_reader 85 | -------------------------------------------------------------------------------- /data_pipeline/tools/introspector/register/avro_command.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from data_pipeline.tools.introspector.register.base_command import _BaseRegisterCommand 20 | 21 | 22 | class RegisterAvroCommand(_BaseRegisterCommand): 23 | @classmethod 24 | def add_parser(cls, subparsers): 25 | register_avro_command_parser = subparsers.add_parser( 26 | "avro", 27 | description="Register the given avro schema to the schematizer.", 28 | add_help=False 29 | ) 30 | 31 | cls.add_base_arguments(register_avro_command_parser) 32 | 33 | register_avro_command_parser.add_argument( 34 | "--avro-schema", 35 | type=str, 36 | required=True, 37 | help="The json of the avro schema." 38 | ) 39 | 40 | register_avro_command_parser.add_argument( 41 | "--cluster-type", 42 | dest="cluster_type", 43 | default='datapipe', 44 | help="Kafka cluster type to connect. Defaults to datapipe. 
" 45 | "Currently only 'datapipe' and 'scribe' cluster types are " 46 | "supported." 47 | ) 48 | 49 | register_avro_command_parser.add_argument( 50 | "--base-schema-id", 51 | type=int, 52 | default=None, 53 | help="The id of the original schema the new avro schema was built upon." 54 | ) 55 | 56 | register_avro_command_parser.set_defaults( 57 | command=lambda args: cls("data_pipeline_instropsector_register_avro").run( 58 | args, register_avro_command_parser 59 | ) 60 | ) 61 | 62 | def process_args(self, args, parser): 63 | super(RegisterAvroCommand, self).process_args(args, parser) 64 | self.avro_schema = args.avro_schema 65 | self.cluster_type = args.cluster_type 66 | self.base_schema_id = args.base_schema_id 67 | 68 | def run(self, args, parser): 69 | self.process_args(args, parser) 70 | schema = self.schematizer.register_schema( 71 | namespace=self.namespace, 72 | source=self.source_name, 73 | schema_str=self.avro_schema, 74 | source_owner_email=self.source_owner_email, 75 | contains_pii=self.pii, 76 | cluster_type=self.cluster_type, 77 | base_schema_id=self.base_schema_id 78 | ) 79 | self.print_schema(schema) 80 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/models/consumer_group_data_source.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. 
See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from collections import namedtuple 20 | 21 | from data_pipeline.schematizer_clientlib.models.data_source_type_enum import DataSourceTypeEnum 22 | from data_pipeline.schematizer_clientlib.models.model_base import BaseModel 23 | 24 | 25 | """ 26 | Represent the data of the mapping between a data source and a consumer group. 27 | A data source represents a namespace or a source. 28 | 29 | Args: 30 | consumer_group_data_source_id (int): The id of the mapping between consumer 31 | group and data source. 32 | consumer_group_id (str): The id of the consumer group. 33 | data_source_type 34 | (data_pipeline.schematizer_clientlib.models.data_source_type_enum.DataSourceTypeEnum): 35 | The type of the data_source. 36 | data_source_id: The id of the data source. Depending on the data source 37 | type, it may be a namespace id or source id. 38 | """ 39 | ConsumerGroupDataSource = namedtuple( 40 | 'ConsumerGroupDataSource', 41 | ['consumer_group_data_source_id', 'consumer_group_id', 'data_source_type', 42 | 'data_source_id'] 43 | ) 44 | 45 | 46 | class _ConsumerGroupDataSource(BaseModel): 47 | """Internal class used to convert from/to various data structure and 48 | facilitate constructing the return value of schematizer functions.
49 | """ 50 | 51 | def __init__(self, consumer_group_data_source_id, consumer_group_id, 52 | data_source_type, data_source_id): 53 | self.consumer_group_data_source_id = consumer_group_data_source_id 54 | self.consumer_group_id = consumer_group_id 55 | self.data_source_type = data_source_type 56 | self.data_source_id = data_source_id 57 | 58 | @classmethod 59 | def from_response(cls, response): 60 | return cls( 61 | consumer_group_data_source_id=response.consumer_group_data_source_id, 62 | consumer_group_id=response.consumer_group_id, 63 | data_source_type=DataSourceTypeEnum[response.data_source_type], 64 | data_source_id=response.data_source_id 65 | ) 66 | 67 | def to_result(self): 68 | return ConsumerGroupDataSource( 69 | consumer_group_data_source_id=self.consumer_group_data_source_id, 70 | consumer_group_id=self.consumer_group_id, 71 | data_source_type=self.data_source_type, 72 | data_source_id=self.data_source_id 73 | ) 74 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/models/source.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from collections import namedtuple 20 | 21 | from data_pipeline.schematizer_clientlib.models.model_base import BaseModel 22 | from data_pipeline.schematizer_clientlib.models.namespace import _Namespace 23 | 24 | 25 | """ 26 | Represent the data of a source. Source is a sub-group under namespaces which 27 | an avro schema is created for. For example, `user` (table) could be a source. 28 | 29 | Args: 30 | source_id (int): The id of the source. 31 | name (str): The name of the source. 32 | owner_email (str): The email of the source owner. 33 | namespace (data_pipeline.schematizer_clientlib.models.namespace.Namespace): 34 | The namespace of the source. 35 | category (str): The category of the source. (e.g. Content, Deals etc.) 36 | """ 37 | Source = namedtuple( 38 | 'Source', 39 | ['source_id', 'name', 'owner_email', 'namespace', 'category'] 40 | ) 41 | 42 | 43 | class _Source(BaseModel): 44 | 45 | def __init__(self, source_id, name, owner_email, namespace, category): 46 | self.source_id = source_id 47 | self.name = name 48 | self.owner_email = owner_email 49 | self.namespace = namespace 50 | self.category = category 51 | 52 | @classmethod 53 | def from_response(cls, response): 54 | return cls( 55 | source_id=response.source_id, 56 | name=response.name, 57 | owner_email=response.owner_email, 58 | namespace=_Namespace.from_response(response.namespace), 59 | category=response.category 60 | ) 61 | 62 | def to_cache_value(self): 63 | return { 64 | 'source_id': self.source_id, 65 | 'name': self.name, 66 | 'owner_email': self.owner_email, 67 | 'namespace': self.namespace, 68 | 'category': self.category 69 | } 70 | 71 | @classmethod 72 | def from_cache_value(cls, cache_value): 73 | return cls( 74 | source_id=cache_value['source_id'], 75 | name=cache_value['name'], 76 | owner_email=cache_value['owner_email'], 77 | namespace=cache_value['namespace'], 78 | 
category=cache_value['category'] 79 | ) 80 | 81 | def to_result(self): 82 | return Source( 83 | source_id=self.source_id, 84 | name=self.name, 85 | owner_email=self.owner_email, 86 | namespace=self.namespace.to_result(), 87 | category=self.category 88 | ) 89 | -------------------------------------------------------------------------------- /data_pipeline/schematizer_clientlib/models/consumer_group.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 16 | from __future__ import absolute_import 17 | from __future__ import unicode_literals 18 | 19 | from collections import namedtuple 20 | 21 | from data_pipeline.schematizer_clientlib.models.data_target import _DataTarget 22 | from data_pipeline.schematizer_clientlib.models.model_base import BaseModel 23 | 24 | 25 | """ 26 | Represent the data of a consumer group. A consumer group represents a group of 27 | the consumers that send all the messages to the same destination, defined as a 28 | "data target". 29 | 30 | Args: 31 | consumer_group_id (int): The id of the consumer group. 32 | group_name (str): The name of the consumer group. 33 | data_target (data_pipeline.schematizer_clientlib.models.data_target.DataTarget): 34 | The data_target this consumer group associates to. 
35 | """ 36 | ConsumerGroup = namedtuple( 37 | 'ConsumerGroup', 38 | ['consumer_group_id', 'group_name', 'data_target'] 39 | ) 40 | 41 | 42 | class _ConsumerGroup(BaseModel): 43 | """Internal class used to convert from/to various data structure and 44 | facilitate constructing the return value of schematizer functions. 45 | """ 46 | 47 | def __init__(self, consumer_group_id, group_name, data_target): 48 | self.consumer_group_id = consumer_group_id 49 | self.group_name = group_name 50 | self.data_target = data_target 51 | 52 | @classmethod 53 | def from_response(cls, response): 54 | return cls( 55 | consumer_group_id=response.consumer_group_id, 56 | group_name=response.group_name, 57 | data_target=_DataTarget.from_response(response.data_target) 58 | ) 59 | 60 | def to_cache_value(self): 61 | return { 62 | 'consumer_group_id': self.consumer_group_id, 63 | 'group_name': self.group_name, 64 | 'data_target_id': self.data_target.data_target_id 65 | } 66 | 67 | @classmethod 68 | def from_cache_value(cls, cache_value): 69 | id_only_data_target = _DataTarget( 70 | data_target_id=cache_value['data_target_id'], 71 | name=None, 72 | target_type=None, 73 | destination=None 74 | ) 75 | return cls( 76 | consumer_group_id=cache_value['consumer_group_id'], 77 | group_name=cache_value['group_name'], 78 | data_target=id_only_data_target 79 | ) 80 | 81 | def to_result(self): 82 | return ConsumerGroup( 83 | consumer_group_id=self.consumer_group_id, 84 | group_name=self.group_name, 85 | data_target=self.data_target.to_result() 86 | ) 87 | -------------------------------------------------------------------------------- /key-1.key: -------------------------------------------------------------------------------- 1 | MIIJKQIBAAKCAgEAgtOJ7j/3h+fynqWCV0C9EV0PMl7x6ht/QX9Ool6aGXuG7IOb 2 | +PHhmpcPrSmoRaC2m3dJooLxzxAtIJ1JbNb9c55lvCg2ft/3d0CwwD/HMmCSQ3V7 3 | QykcgDgqW3W9RWlUa6mb4ExxFowwx5TLa0snCVhPv8FjwLC5z7NqispjuSVdwgoQ 4 | nY4fg4wpvH/ziHT2qkNBLP/y1xHmhzsaWWLxdFT4IBmHCWorQEmfB9WHwxEO510u 5 | 
cjkOjSZ66ba6yLgsgY1VMsCYj/rQpQeZYgjk7ZwelGJyvz/lrH87piSRE70zReAj 6 | GuGKEULYVPxsFT7xEhgwDbH6+8ZKo1Sz49DTgB9MtSOtI5UkSct1/Hv3SJoZXmDS 7 | Avn0MlyOxAYhlX9FVS2placyksM6lPfRHUH9SndOK4OOLVhnLvKyY/F9pFXx6QP1 8 | Ni0J+s5ylNBNl8dXbPBRtFRFgHnyZ85XDMQ0tDWULydkQEqbfuy0SWIgfS9ClBPo 9 | 1nQNTNvlUp6re0CSz+I0siQ/ctl6F172/8m3TbNVf6FL7VKte6gksVWjeqDH66ww 10 | EyEQC7dLSGFiD/CFv/6k6zv5VZRrpjBISjvXg89h5BhWQ8bXqlbPdEGN0vgx9Kwm 11 | DV8eW3uQZenbepNdLOVIkDzKQZZ+q5C7FNiFM4dyzXQ3uMPcOw+p9nTXUwkCAwEA 12 | AQKCAgBbn174FqB68/y9Etjn1pUTNgNQmaH+/ObwrgIaQPrXYeWqermfbXc81xLJ 13 | iWUp7lY1JyPQxrgm9gNE1iL4MNEsSgCh99/Cl4QuZoXi/ke5hlypFfwPmxk0yyEP 14 | ELRPimbvR8e9HIwO6Hu5KwIT0Hw0VT704rHD34Eib1rsfLW+kubI4FemoER90ong 15 | VBPwu0mNELDynQYrjYPPrZrUgzDq8h8cDATAmTRVP7o6uFbuTGINsvq2UDNas1pd 16 | l/XUYPf6e0rx0mSGYXgCPAI14FjD+BSKgzAttMhbOHhD8VQRaO4hv8mWJ+KUGCZ9 17 | zpK7hms66umsjbti3QqkvMuHf1a/V9NwJ6Dsz41vAzqL4tGvbBrCd24nXeTMbMi8 18 | 0QDhzAMt+z2sB2E087pLyt4AOS9AGSVbR2jBzac0Fkw3J+MAxcKWLbMDiI6NxUoC 19 | olHFDsISpefxqYDkP/+tO4fQbyzwpeArQ3MhmYwpkGfKez8zIFZPzxwSdz6KvEso 20 | HniYA2uR8muXshC+CV0H2aERa8Fuw88RFdHqAB7uS8wiNWTGy1XhByvVBwtgSu2R 21 | usO7r6C3+RMNcj3+i1tBFMLVatzn6Vqo+z2jsJ3NtWExl1Q0UsIeeEu8iwURmopv 22 | 87INvTOoUssX1lS7fTxSCBmmMw6JN5kEGhL92CR4/E7DxEAMyQKCAQEA8cuAJ//e 23 | U8TFbCS6hr30qYDOx0vWZe2Ju1uVz6Sk5sKiKw5PuLnya46SIOY1zGVpT2IJCjut 24 | D2b+IgPateO50kClen7/DM49sBYp7KuFz3XZ7Oc4hq470mQ/wwmMc2nqY5DU4+wm 25 | 6LL0quGK6nBIWE52MYqXFdCXc23ymv5eb4J0qzu5OwLzSAGZtMBXyCH9sKtzcGHJ 26 | E8eOfYNhE8O0/O2dpjpOJJTBuPhQztPKEBLJ5mnlOKoFmzQbJJBo4uwufjkIzD/p 27 | z1P4T/YJheroJMrZOAAVDUZpYDcPK1wtleYINvPUvdru6lxA10wHPysBLHsnSlvu 28 | PJRDTgk+x1XuZwKCAQEAioMdVuAf3ylgvTDNEQCxFjAW7uQkDi9zjkNblRXn/Xh1 29 | dvqtH0SnDXuq0ZkE5Ce1Bh03DQnw7YL21VfL6B10Q+7c7hu04WsI6zoX8AB9s7Si 30 | Jqm5hD4YWjSshwH7Hgojufr1IgYTiIpAGCmJlhOnFRRxj/oLAnJgUqtx8Vr/JNG5 31 | OwZB6ZRmhjhlzODvIIAo0LwRw4KiPj729OpkGAaL3eaHYJdLlGP0xClr60acEK6I 32 | OoEyoFr8ywNsewOOUjO522y75V32YhrbeQdQpjVdQuVuxPSJaSj2VjnKDewLZASr 33 | 
7HrNLhQ5qaXcwuUpkn+7eVSYey9Pcna8Mc467H/tDwKCAQEA5vDKV+n/7Jx9P5wY 34 | 98cInrjjRuD9Rt9I9MU14lxMEAeIIn2J0VLw4qAKnBek/LNDkUl+fuKemZ5yTgTG 35 | aYG3IsUSCFyxuAZoTv3sTxtIM+1JFweFKsXRZ1LOv7coCbf44d4cKBGO3CTIoUnI 36 | h8B0qLtQEmNfiLtxXBOAa19WzUSZsE7bzSw1NuooZ8ZqUUF977biDE2pYxXv15ub 37 | Jeh/vjApqfH4XuIh4Uxyygjo7s8zPbqY77Uco69x2UuAknSot3A8IfFNnPODsMO7 38 | Nsg1r6Z7MAyMDuDYlsgf2fcxpwFNiUb8Q9WO6hLw73GzXjbx6Q07ar9BXWTyn98e 39 | sf82MwKCAQBj8FJc+n1EkSBhoTJ00CD1t2D0oNt6+LXRsbwEh7fJZP6sCBaddvVp 40 | fw8zB8tFq3irnqy31bJmTJY4PI69PXNECLaTfp41/vqCbnoTCuenn+9XIiLezcbY 41 | XyUUjFNZy0sXx4DTObsiS04PoPjKtgdZ6FQ/49PJwijJvTYLRPLO6BtUASHRnOeU 42 | dGDPQISI2K+aW/gdLsZNTzT0ZaIuy5pjUw/em4tG2BAk8RYSvfGg/6z/OXUDymKX 43 | QDMnVtt7aCLztTKlPfSluGceni6MnfhaahxQM3hpzcpfUHb/MI7Pbad2iuw8EQ/B 44 | 3hC49ovWTh9AXc9fIjWaix0ieoB9X5f1AoIBAQCp9ALAlnj4rra9SRou7uxCLYSW 45 | IYeqLEkBhnOJJHHekEwFimRwRm9LxSMoYHtbuZByuZBo7bwp2wUj6wrtn1xzsB/r 46 | 3KYX0Z+hDksLhBUQpTeiVAej6BbQUaOiNJ0M++EskpcM9J34nNBbWxk/zB/bFrRA 47 | MC7GGGQ6lXlj/e8YWHfUuLijGF7q9FUfLRAcCDwkoDdfnO/fEKRcKrDBc8ezGTa5 48 | WNLlA7a4nhw9JZ+rTK9wL4PH4GXxSTVkhk/NP/YMSwriGVfeLnWBpW+T/HNarHIk 49 | 3j2M4/b72cIAtQtZfwRR8WZ5JXs8ZIn5UN4kM0MvoVTDdYh0ieLevuIfqA08 50 | -------------------------------------------------------------------------------- /tests/helpers/decorators_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. 
def fibonacci(n):
    """Return the n-th Fibonacci number using plain, un-memoized recursion.

    Acts as the reference implementation that ``fast_fibonacci`` is checked
    against. Any ``n <= 0`` yields 0.
    """
    if n <= 0:
        return 0
    if n == 1:
        return 1
    return fibonacci(n - 1) + fibonacci(n - 2)


@memoized
def fast_fibonacci(n):
    """Return the n-th Fibonacci number; same recursion, wrapped in ``memoized``."""
    if n <= 0:
        return 0
    if n == 1:
        return 1
    return fast_fibonacci(n - 1) + fast_fibonacci(n - 2)


@memoized
def identity(x):
    """Return ``x`` unchanged; used to probe which argument types ``memoized`` accepts."""
    return x


class TestMemoized(object):
    """Ensure memoization decorator behaves per its specification"""

    def test_basic(self):
        """Basic correctness tests"""
        assert identity((1,)) == (1,)
        # The memoized and plain implementations must agree.
        for n in (1, 2, 3, 10):
            assert fibonacci(n) == fast_fibonacci(n)

    def test_unhashable_args(self):
        """The memoization decorator should even work with
        common unhashable arguments ..."""
        assert identity([1]) == [1]
        assert identity(set([1])) == set([1])
        assert identity({'a': 1}) == {'a': 1}

    def test_uncacheable_args(self):
        """... but might not work with all unhashable objects."""
        # The context manager closes (and thereby removes) the temporary file
        # even if the assertion fails; the bare call used before leaked it.
        with tempfile.NamedTemporaryFile() as f:
            with pytest.raises(TypeError):
                identity(f)

    def test_performance(self):
        """Ensure that the memoization decorator actually saves
        function calls"""

        @memoized
        def my_identity(x, sheep=False):
            # Counts real invocations; cache hits must not reach this line.
            my_identity.num_calls += 1
            if sheep:
                return "sheep"
            else:
                return x

        my_identity.num_calls = 0

        assert my_identity(1) == 1
        assert my_identity(1) == 1
        assert my_identity(1) == 1
        assert my_identity(2) == 2
        assert my_identity(2) == 2
        assert my_identity(2) == 2

        # Two distinct argument sets -> exactly two real calls.
        assert my_identity.num_calls == 2

        # Ensure kwargs work
        assert my_identity(1, sheep=True) == "sheep"
        assert my_identity(1, sheep=True) == "sheep"
        assert my_identity(2, sheep=True) == "sheep"
        assert my_identity(2, sheep=True) == "sheep"

        assert my_identity.num_calls == 4
class NamespaceInfoCommand(IntrospectorCommand):
    """Introspector sub-command that prints one namespace's details as JSON."""

    @classmethod
    def add_parser(cls, subparsers):
        """Register the ``namespace`` sub-parser and its arguments.

        Args:
            subparsers: The argparse sub-parser collection to attach to.
        """
        info_command_parser = subparsers.add_parser(
            "namespace",
            description="Get information on a specific data pipeline namespace.",
            add_help=False
        )

        info_command_parser.add_argument(
            '--active-namespaces',
            default=False,
            action='store_true',
            help=(
                'If set, this command will also return information about active '
                'sources and topics for this namespace. '
                'This is a time expensive operation.'
            )
        )

        cls.add_base_arguments(info_command_parser)

        info_command_parser.add_argument(
            "namespace_name",
            type=str,
            help="Name of namespace to retrieve information on."
        )

        # The command instance is created lazily, only when this sub-command
        # is the one actually selected on the command line.
        info_command_parser.set_defaults(
            command=lambda args: cls("data_pipeline_instropsector_info_namespace").run(
                args,
                info_command_parser
            )
        )

    def info_namespace(self, name, active_namespaces=False):
        """Return an ordered dict describing the namespace called ``name``.

        Args:
            name: Exact namespace name to look up.
            active_namespaces: When true, ``self.active_namespaces`` is passed
                on to ``IntrospectorNamespace``; otherwise ``None`` is passed.

        Returns:
            The namespace's ``to_ordered_dict()`` output with an added
            ``'sources'`` entry holding ``self.list_sources(...)`` for it.

        Raises:
            ValueError: If no namespace with that name is found.
        """
        matching_namespace = next(
            (
                candidate
                for candidate in self.schematizer.get_namespaces()
                if candidate.name == name
            ),
            None
        )
        if not matching_namespace:
            raise ValueError("Given namespace doesn't exist")

        namespace_info = IntrospectorNamespace(
            matching_namespace,
            active_namespaces=(self.active_namespaces if active_namespaces else None)
        ).to_ordered_dict()
        namespace_info['sources'] = self.list_sources(
            namespace_name=namespace_info['name']
        )
        return namespace_info

    def process_args(self, args, parser):
        """Run the base argument processing, then record the namespace name."""
        super(NamespaceInfoCommand, self).process_args(args, parser)
        self.namespace_name = args.namespace_name

    def run(self, args, parser):
        """Process the arguments and print the namespace info as JSON."""
        self.process_args(args, parser)
        # Single-argument print(...) behaves identically as a Python 2 print
        # statement and as the Python 3 function.
        print(simplejson.dumps(
            self.info_namespace(
                self.namespace_name,
                active_namespaces=args.active_namespaces
            )
        ))
"""
Represent the data of an Avro schema element.

Args:
    id (int): The element id.
    schema_id (int): The id of the avro schema.
    element_type (): The data type of the element
    element_name (str): The column corresponding to the key of the AvroSchemaElement
    doc ():
    note (Optional[data_pipeline.schematizer_clientlib.models.note.Note]): Information specified by users about the schema.
    created_at (str): The timestamp when the schema is created in ISO-8601
        format.
    updated_at (str): The timestamp when the schema is last updated in ISO-8601
        format.
"""

AvroSchemaElement = namedtuple(
    'AvroSchemaElement',
    [
        'id',
        'schema_id',
        'element_type',
        'element_name',
        'doc',
        'note',
        'created_at',
        'updated_at',
    ]
)

_SCHEMA_KEY_DELIMITER = '|'


class _AvroSchemaElement(BaseModel):
    """Internal helper that converts schematizer responses into
    ``AvroSchemaElement`` results.
    """

    def __init__(self, id, schema_id, element_type, key, doc, note,
                 created_at, updated_at):
        """Store the element fields and derive ``element_name`` from ``key``.

        ``key`` is split on ``_SCHEMA_KEY_DELIMITER``; the second piece becomes
        ``element_name``. A key with fewer than two pieces leaves it ``None``.
        """
        key_parts = key.split(_SCHEMA_KEY_DELIMITER)
        self.id = id
        self.schema_id = schema_id
        self.element_type = element_type
        self.element_name = key_parts[1] if len(key_parts) >= 2 else None
        self.doc = doc
        self.note = note
        self.created_at = created_at
        self.updated_at = updated_at

    @classmethod
    def from_response(cls, response_lst):
        """Build one instance per item of ``response_lst``, preserving order."""
        return [
            cls(
                id=item.id,
                schema_id=item.schema_id,
                element_type=item.element_type,
                key=item.key,
                doc=item.doc,
                note=_Note.from_response(item.note),
                created_at=item.created_at,
                updated_at=item.updated_at
            )
            for item in response_lst
        ]

    def to_result(self):
        """Return the public ``AvroSchemaElement`` tuple for this element.

        The note is converted with its own ``to_result()``; a missing note
        stays ``None``.
        """
        note_result = None
        if self.note is not None:
            note_result = self.note.to_result()
        return AvroSchemaElement(
            id=self.id,
            schema_id=self.schema_id,
            element_type=self.element_type,
            element_name=self.element_name,
            doc=self.doc,
            note=note_result,
            created_at=self.created_at,
            updated_at=self.updated_at
        )
class BinlogParser(object):
    """Turn verbose ``mysqlbinlog`` output into one JSON object per row statement.

    Lines are read via ``fileinput`` (stdin or files named on the command
    line). The parser remembers the time of the most recent row-event header
    and prints a JSON record for each ``### INSERT INTO`` / ``### UPDATE`` /
    ``### DELETE FROM`` line, stamped with that remembered time.
    """

    statement_to_type = {'INSERT INTO': 'insert', 'UPDATE': 'update', 'DELETE FROM': 'delete'}

    # Patterns are compiled once at class creation instead of on every line.
    _HEADER_TIME_RE = re.compile(r"#(\d+)\s+(\d+:\d+:\d+)\s+server\s+id\s+\d+")
    _ROW_EVENT_HEADER_RE = re.compile(
        _HEADER_TIME_RE.pattern + r".+(Update_rows|Write_rows|Delete_rows)"
    )
    _SET_TIMESTAMP_RE = re.compile(r"SET TIMESTAMP=(\d+)/\*!\*/;")
    _ROW_STATEMENT_RE = re.compile(r"### (DELETE FROM|INSERT INTO|UPDATE) (.+)")
    _ROW_STATEMENT_PREFIXES = ("### INSERT INTO ", "### UPDATE ", "### DELETE FROM ")

    def __init__(self):
        # Value of the most recent ``SET TIMESTAMP=`` line (recorded only).
        self.timestamp = None
        # Epoch seconds parsed from the most recent row-event header line.
        self.header_timestamp = None

    def run(self):
        """Parse all input, stopping quietly if the output pipe is closed."""
        try:
            self._parse_binlog()
        except IOError as e:
            if e.errno == errno.EPIPE:
                # just stop if the pipe breaks
                pass
            else:
                raise

    def _parse_binlog(self):
        """Feed every input line, stripped of surrounding whitespace, to the parser."""
        for line in fileinput.input():
            self._process_line(line.strip())

    def _process_line(self, line):
        """Dispatch one stripped line to the matching handler, if any."""
        if self._is_setting_timestamp(line):
            self._handle_timestamp_line(line)
        if self._is_header_line(line):
            self._handle_header_line(line)
        elif self._is_updating(line):
            self._handle_update_line(line)

    def _is_setting_timestamp(self, line):
        """Return True for ``SET TIMESTAMP=...`` statement lines."""
        return line.startswith("SET TIMESTAMP=") and line.endswith("/*!*/;")

    def _handle_header_line(self, line):
        """Store the header's date/time as epoch seconds in ``header_timestamp``.

        The header carries ``yymmdd hh:mm:ss``; it is converted with
        ``time.mktime``, i.e. interpreted in the local timezone.
        """
        m = self._HEADER_TIME_RE.search(line)
        datetime_str = "%s %s" % (m.group(1), m.group(2))
        dt = datetime.datetime.strptime(datetime_str, '%y%m%d %H:%M:%S')
        self.header_timestamp = int(time.mktime(dt.timetuple()))

    def _is_header_line(self, line):
        """Return True for event headers of Update/Write/Delete row events."""
        return self._ROW_EVENT_HEADER_RE.search(line) is not None

    def _handle_timestamp_line(self, line):
        """Store the integer value of a ``SET TIMESTAMP=`` line in ``timestamp``."""
        m = self._SET_TIMESTAMP_RE.search(line)
        self.timestamp = int(m.group(1))

    def _is_updating(self, line):
        """Return True for ``### INSERT INTO|UPDATE|DELETE FROM <table>`` lines."""
        return line.startswith(self._ROW_STATEMENT_PREFIXES)

    def _handle_update_line(self, line):
        """Print a JSON record for one row statement line."""
        m = self._ROW_STATEMENT_RE.search(line)
        statement_type = self.statement_to_type[m.group(1)]
        table = m.group(2)

        # Single-argument print(...) works both as a Python 2 print statement
        # and as the Python 3 function.
        print(json.dumps({
            'timestamp': self.header_timestamp,
            'statement_type': statement_type,
            'table': table
        }))


if __name__ == "__main__":
    BinlogParser().run()
class ClientTester(Client):
    """Minimal concrete ``Client`` used only by these tests."""

    @property
    def client_type(self):
        return 'tester'


@pytest.mark.usefixtures('configure_teams')
class TestClient(object):
    """Validation and monitoring-toggle tests for the ``Client`` base class."""

    @property
    def client_name(self):
        return 'test_client'

    @property
    def team_name(self):
        return 'bam'

    @property
    def expected_frequency_seconds(self):
        return 0

    def _build_client(self, **override_kwargs):
        """Create a ``ClientTester`` from the defaults above.

        Keyword arguments replace individual defaults; monitoring is disabled
        unless overridden.
        """
        args = dict(
            client_name=self.client_name,
            team_name=self.team_name,
            expected_frequency_seconds=self.expected_frequency_seconds,
            monitoring_enabled=False
        )
        args.update(override_kwargs)
        return ClientTester(**args)

    def test_default_client_is_valid(self):
        self._assert_valid(self._build_client())

    def test_string_client_name_is_valid(self):
        # str(...) forces a native str even though unicode_literals is on.
        name = str("test_client")
        assert self._build_client(client_name=name).client_name == name

    def test_non_string_client_name(self):
        self._assert_invalid(client_name=1)

    def test_empty_client_name(self):
        self._assert_invalid(client_name='')

    def test_invalid_team_name(self):
        self._assert_invalid(team_name='bogus_team')

    def test_negative_expected_frequency_seconds(self):
        self._assert_invalid(expected_frequency_seconds=-1)

    def test_expected_frequency_seconds_constant_is_valid(self):
        client = self._build_client(
            expected_frequency_seconds=ExpectedFrequency.constantly
        )
        assert client.expected_frequency_seconds == 0

    def _assert_invalid(self, **client_kwargs):
        """Assert that building a client with these overrides raises ValueError."""
        with pytest.raises(ValueError):
            self._build_client(**client_kwargs)

    def _assert_valid(self, client):
        """Assert that the client exposes the default name, team and frequency."""
        assert client.client_name == self.client_name
        assert client.team_name == self.team_name
        assert client.expected_frequency_seconds == self.expected_frequency_seconds

    @pytest.mark.parametrize("method, skipped_method, kwargs", [
        ('record_message', '_get_record', {'message': None}),
        ('close', 'flush_buffered_info', {}),
    ])
    def test_method_call_with_disabled_monitoring(self, method, skipped_method, kwargs):
        """With monitoring disabled, ``method`` must not reach ``skipped_method``."""
        client = self._build_client(
            expected_frequency_seconds=ExpectedFrequency.constantly
        )
        with mock.patch.object(client.monitor, skipped_method) as uncalled_method:
            getattr(client.monitor, method)(**kwargs)
            # ``called`` is a boolean; state the expectation as one rather
            # than comparing it with the integer 0.
            assert not uncalled_method.called
class SourceInfoCommand(IntrospectorCommand):
    """Introspector sub-command that prints one source's details as JSON."""

    @classmethod
    def add_parser(cls, subparsers):
        """Register the ``source`` sub-parser and its arguments.

        Args:
            subparsers: The argparse sub-parser collection to attach to.
        """
        info_command_parser = subparsers.add_parser(
            "source",
            description="Get information on a specific data pipeline source.",
            add_help=False
        )

        info_command_parser.add_argument(
            '--active-sources',
            default=False,
            action='store_true',
            help=(
                'If set, this command will also return information about active '
                'topics for this source. '
                'This is a time expensive operation.'
            )
        )

        cls.add_base_arguments(info_command_parser)
        cls.add_source_and_namespace_arguments(info_command_parser)

        # The command instance is created lazily, only when this sub-command
        # is the one actually selected on the command line.
        info_command_parser.set_defaults(
            command=lambda args: cls("data_pipeline_instropsector_info_source").run(
                args,
                info_command_parser
            )
        )

    def process_args(self, args, parser):
        """Run the base processing, then the source/namespace argument handling."""
        super(SourceInfoCommand, self).process_args(args, parser)
        self.process_source_and_namespace_args(args, parser)

    def info_source(
        self,
        source_id=None,
        source_name=None,
        namespace_name=None,
        active_sources=False
    ):
        """Return an ordered dict describing one source and its topics.

        The source is fetched by ``source_id`` when that is truthy; otherwise
        it is searched by ``source_name`` among the sources of
        ``namespace_name``.

        Args:
            source_id: Id of the source, if known.
            source_name: Name of the source (used when no id is given).
            namespace_name: Namespace to search (used when no id is given).
            active_sources: When true, add ``'active_topic_count'``, the number
                of topics with a truthy ``'message_count'``.

        Returns:
            The source's ``to_ordered_dict()`` output with a ``'topics'``
            entry (and optionally ``'active_topic_count'``).

        Raises:
            ValueError: If the name lookup finds no matching source.
        """
        if source_id:
            info_source = self.schematizer.get_source_by_id(source_id)
        else:
            sources = self.schematizer.get_sources_by_namespace(namespace_name)
            info_source = next(
                (source for source in sources if source.name == source_name),
                None
            )
            # NOTE(review): this check is kept on the name-lookup branch, which
            # is what its message describes - confirm against the original
            # indentation.
            if not info_source:
                raise ValueError("Given SOURCE_NAME|NAMESPACE_NAME doesn't exist")

        info_source = IntrospectorSource(
            info_source
        ).to_ordered_dict()
        topics = self.list_topics(
            source_id=info_source["source_id"]
        )
        if active_sources:
            info_source['active_topic_count'] = len(
                [topic for topic in topics if topic['message_count']]
            )
        info_source['topics'] = topics
        return info_source

    def run(self, args, parser):
        """Process the arguments and print the source info as JSON."""
        self.process_args(args, parser)
        # Single-argument print(...) behaves identically as a Python 2 print
        # statement and as the Python 3 function.
        print(simplejson.dumps(
            self.info_source(
                source_id=self.source_id,
                source_name=self.source_name,
                namespace_name=self.namespace,
                active_sources=args.active_sources
            )
        ))
46 | * Keep the scope as narrow as possible, to make it easier to implement. 47 | * Remember that contributions are welcome :) 48 | 49 | Get Started! 50 | ------------ 51 | 52 | Ready to contribute? Here's how to set up `data_pipeline` for 53 | local development. 54 | 55 | 1. Clone the `data_pipeline` repo:: 56 | 57 | $ git clone git@git.yelpcorp.com:clientlibs/data_pipeline 58 | 59 | 2. Create a branch for local development:: 60 | 61 | $ git checkout -b name-of-your-bugfix-or-feature 62 | 63 | Now you can make your changes locally. 64 | 65 | See :doc:`index` for information about setting up TDD tools. 66 | 67 | 3. When you're done making changes, check that your changes pass style and unit 68 | tests, including testing other Python versions with tox:: 69 | 70 | $ tox 71 | 72 | To get tox, just pip install it. 73 | 74 | 4. Commit your changes and push your branch:: 75 | 76 | $ git add . 77 | $ git commit -m "Your detailed description of your changes." 78 | $ git push origin name-of-your-bugfix-or-feature 79 | 80 | 5. Get a code review:: 81 | 82 | $ review-branch 83 | 84 | Contribution Guidelines 85 | ----------------------- 86 | 87 | Before you submit a pull request, check that it meets these guidelines: 88 | 89 | 1. The change should include tests. 90 | 2. If the change adds functionality, the docs should be updated. Put 91 | your new functionality into a function with a docstring, and add the 92 | feature to the list in README.rst. 93 | 3. The pull request should work for Python 2.6, 2.7, and 3.3, and for PyPy. 94 | Run the ``tox`` command and make sure that the tests pass for all supported 95 | Python versions. 96 | 97 | Building Tools 98 | -------------- 99 | 100 | Before you submit a pull request, make sure that any new tool meets these guidelines: 101 | 102 | 1. Your tool is runnable from the commandline 103 | (with python or on its own) in the data_pipeline/tools folder 104 | 2. 
Any Python libraries your tool requires are in both 105 | requirements-tools.txt and in setup.py in extras_require["tools"] 106 | 3. In bin, create a script that can be run from the commandline that runs your tool. 107 | See the others in the folder as an example of what to do with a Python batch. 108 | Put the path to this new script in setup.py under scripts 109 | 4. In debian/data-pipeline-tools.links create a new link. Use the others there as an example. 110 | 5. In yelp_package/itest/ubuntu.sh, add a line to the string SCRIPTS 111 | with the name of the script you put in bin 112 | 113 | Tips 114 | ---- 115 | 116 | To run a subset of tests:: 117 | 118 | $ py.test tests/data_pipeline_test.py 119 | -------------------------------------------------------------------------------- /tests/_retry_util_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016 Yelp Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an 12 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, either express or implied. See the License for the 14 | # specific language governing permissions and limitations 15 | # under the License. 
# TODO(DATAPIPE-368|clin): add unit tests for rest of the retry module

class TestRetryOnCondition(object):
    """Tests for ``retry_on_condition`` under a bounded exponential backoff policy."""

    def always_true(self):
        return True

    def always_false(self):
        return False

    @pytest.fixture
    def return_true_then_false_func(self):
        # First call returns True, second call returns False.
        return mock.Mock(side_effect=(True, False))

    @pytest.fixture
    def number_sequence_func(self):
        # Successive calls return 1, 2, ..., 10. ``range`` is used instead of
        # ``xrange`` so the fixture also works on Python 3.
        return mock.Mock(side_effect=range(1, 11))

    @property
    def max_retry_count(self):
        return 3

    @pytest.fixture
    def exp_backoff_with_max_retry_count_policy(self):
        return RetryPolicy(
            ExpBackoffPolicy(
                initial_delay_secs=0.1,
                max_delay_secs=0.5,
                backoff_factor=2
            ),
            max_retry_count=self.max_retry_count,
        )

    def test_no_retry(
        self,
        number_sequence_func,
        exp_backoff_with_max_retry_count_policy
    ):
        """A condition that is never true means exactly one call, no retry."""
        actual = retry_on_condition(
            retry_policy=exp_backoff_with_max_retry_count_policy,
            retry_conditions=[Predicate(self.always_false)],
            func_to_retry=number_sequence_func
        )
        assert actual == 1
        assert number_sequence_func.call_count == 1

    def test_exceed_max_retry_count(
        self,
        number_sequence_func,
        exp_backoff_with_max_retry_count_policy
    ):
        """A condition that is always true exhausts the retries and raises."""
        with pytest.raises(MaxRetryError) as e, mock.patch.object(
            exp_backoff_with_max_retry_count_policy.backoff_policy,
            'next_backoff_delay',
            return_value=0.1
        ) as next_backoff_delay_spy:
            retry_on_condition(
                retry_policy=exp_backoff_with_max_retry_count_policy,
                retry_conditions=[Predicate(self.always_true)],
                func_to_retry=number_sequence_func
            )
        # These checks sit outside the ``with`` block: anything placed after
        # the raising call inside it would never execute.
        assert number_sequence_func.call_count == self.max_retry_count + 1
        assert e.value.last_result == 4
        assert next_backoff_delay_spy.call_count == self.max_retry_count

    def test_use_previous_result_as_params_in_retry(
        self,
        return_true_then_false_func,
        exp_backoff_with_max_retry_count_policy
    ):
        """With one retry the function is applied twice: 1 -> 3 -> 9."""
        actual = retry_on_condition(
            retry_policy=exp_backoff_with_max_retry_count_policy,
            retry_conditions=[Predicate(return_true_then_false_func)],
            func_to_retry=lambda i: i + i + i,
            use_previous_result_as_param=True,
            i=1
        )
        assert actual == 9
class TestEnvelope(object):
    """Pack/unpack round-trip tests for ``Envelope`` across message types.

    NOTE(review): ``registered_schema``, ``payload`` and
    ``registered_meta_attribute_schema`` are fixtures defined outside this
    class - presumably in a shared conftest.
    """

    @pytest.fixture
    def envelope(self):
        return Envelope()

    @pytest.fixture(params=[
        None,
        {'good_payload': 26}
    ])
    def meta_attr_payload_data(self, request):
        return request.param

    @pytest.fixture
    def valid_meta(
        self,
        meta_attr_payload_data,
        registered_meta_attribute_schema
    ):
        """Return a one-element ``MetaAttribute`` list, or None without payload data."""
        if meta_attr_payload_data is None:
            return None
        meta_attr = MetaAttribute(
            schema_id=registered_meta_attribute_schema.schema_id,
            payload_data=meta_attr_payload_data
        )
        return [meta_attr]

    @pytest.fixture
    def meta_attr_param(self, valid_meta):
        return {'meta': valid_meta}

    @pytest.fixture(params=[
        (dp_message.CreateMessage, {}),
        (dp_message.RefreshMessage, {}),
        (dp_message.DeleteMessage, {}),
        (dp_message.UpdateMessage, {'previous_payload': bytes(20)})
    ])
    def message(self, request, registered_schema, payload, meta_attr_param):
        """Build a message of the parametrized class with optional meta attributes."""
        message_class, additional_params = request.param
        # Work on a copy: the dicts in ``params`` are created once and shared
        # by every parametrized run, so updating them in place would leak the
        # ``meta`` entry from one test into the next.
        message_params = dict(additional_params)
        if meta_attr_param:
            message_params.update(meta_attr_param)
        return message_class(
            schema_id=registered_schema.schema_id,
            payload=payload,
            **message_params
        )

    @pytest.fixture
    def expected_unpacked_message(self, message):
        """Return the dict that unpacking the packed ``message`` should produce."""
        previous_payload = None
        if isinstance(message, dp_message.UpdateMessage):
            previous_payload = message.previous_payload
        return dict(
            encryption_type=message.encryption_type,
            message_type=message.message_type.name,
            meta=[
                meta_attr.avro_repr
                for meta_attr in message.meta
            ] if message.meta else None,
            payload=message.payload,
            previous_payload=previous_payload,
            schema_id=message.schema_id,
            timestamp=message.timestamp,
            uuid=message.uuid
        )

    def test_pack_create_bytes(self, message, envelope):
        assert isinstance(envelope.pack(message), bytes)

    def test_pack_create_str(self, message, envelope):
        assert isinstance(envelope.pack(message, ascii_encoded=True), str)

    def test_pack_unpack(self, message, envelope, expected_unpacked_message):
        unpacked = envelope.unpack(envelope.pack(message))
        assert unpacked == expected_unpacked_message

    def test_pack_unpack_ascii(self, message, envelope, expected_unpacked_message):
        unpacked = envelope.unpack(envelope.pack(message, ascii_encoded=True))
        assert unpacked == expected_unpacked_message