├── .dockerignore ├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── .readthedocs.yml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── bin └── sparkly-testing ├── docker-compose.yml ├── docs └── source │ ├── catalog.rst │ ├── conf.py │ ├── functions.rst │ ├── index.rst │ ├── license.rst │ ├── reader_and_writer.rst │ ├── session.rst │ ├── testing.rst │ └── utils.rst ├── requirements.txt ├── requirements_dev.txt ├── requirements_extras.txt ├── setup.cfg ├── setup.py ├── sparkly ├── __init__.py ├── catalog.py ├── exceptions.py ├── functions.py ├── instant_testing.py ├── reader.py ├── session.py ├── testing.py ├── utils.py └── writer.py ├── tests ├── __init__.py ├── integration │ ├── __init__.py │ ├── base.py │ ├── fake_modules │ │ ├── __init__.py │ │ └── testing.py │ ├── resources │ │ ├── brickhouse-0.7.1.jar │ │ ├── test_fixtures │ │ │ ├── cassandra_setup.cql │ │ │ ├── cassandra_teardown.cql │ │ │ ├── data.json │ │ │ ├── data_for_es7.json │ │ │ ├── kafka.json │ │ │ ├── mapping.json │ │ │ ├── mysql_setup.sql │ │ │ └── mysql_teardown.sql │ │ ├── test_read │ │ │ ├── cassandra_setup.cql │ │ │ ├── cassandra_teardown.cql │ │ │ ├── elastic7_setup.json │ │ │ ├── elastic_setup.json │ │ │ ├── kafka_setup.json │ │ │ ├── mysql_setup.sql │ │ │ └── mysql_teardown.sql │ │ ├── test_testing │ │ │ ├── kafka_watcher_1.json │ │ │ └── kafka_watcher_2.json │ │ └── test_write │ │ │ ├── cassandra_setup.cql │ │ │ ├── cassandra_teardown.cql │ │ │ ├── elastic7_setup.json │ │ │ ├── elastic_setup.json │ │ │ ├── kafka_setup.json │ │ │ ├── mysql_setup.sql │ │ │ └── mysql_teardown.sql │ ├── test_catalog.py │ ├── test_functions.py │ ├── test_instant_testing.py │ ├── test_reader.py │ ├── test_session.py │ ├── test_testing.py │ └── test_writer.py ├── no_extras │ ├── __init__.py │ └── test_testing.py └── unit │ ├── __init__.py │ ├── test_instant_testing.py │ ├── test_reader.py │ ├── test_session.py │ ├── test_testing.py │ ├── test_utils.py │ └── test_writer.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | .git/ 18 | .idea/ -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - run: make test 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *,cover 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Virtualenvs 75 | venv* 76 | 77 | # idea 78 | .idea/ 79 | 80 | .DS_Store 81 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.7" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | formats: all 20 | 21 | # Optionally set the version of Python and requirements required to build your docs 22 | python: 23 | install: 24 | - requirements: requirements_dev.txt 25 | - requirements: requirements_extras.txt 26 | - requirements: requirements.txt 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 3.0.0 2 | * Improved performance of `catalog_ext.has_table` function by trying to execute a dummy SQL rather than listing the entire database, noticable mostly with databases with many tables. 3 | * Some minor changes to help in a spark-on-kubernetes environment: 4 | * In addition to setting `PYSPARK_SUBMIT_ARGS`, also explicitly set config params so they are picked up by an already-running JVM 5 | * Register a handler to stop spark session on python termination to deal with [SPARK-27927](https://issues.apache.org/jira/browse/SPARK-27927) 6 | * Removed `has_package` and `has_jar` functions, which are incomplete checks (resulting in false negatives) and are merely syntactic sugar. 
7 | * Added options (class variables) `name` and `app_id_template` to autogenerate a unique value for 8 | spark option `spark.app.id`, which can help to preserve spark history data for all sessions across restarts. 9 | This functionality can be disabled by setting `app_id_template` to `None` or `''`. 10 | * Drop support of Python 2.7 11 | * Run integration tests using Python 3.7 12 | * Drop tests for Elastic 6.x 13 | * Use Kafka 2.8.0 for integration tests 14 | 15 | ## 2.8.2 16 | * Support 0.9.x `pymysql` in `sparkly.testing.MysqlFixture` 17 | 18 | ## 2.8.1 19 | * Fix support for using multiple sparkly sessions during tests 20 | * SparklySession does not persist modifications to os.environ 21 | * Support ElasticSearch 7 by making type optional. 22 | 23 | ## 2.8.0 24 | * Extend `SparklyCatalog` to work with database properties: 25 | - `spark.catalog_ext.set_database_property` 26 | - `spark.catalog_ext.get_database_property` 27 | - `spark.catalog_ext.get_database_properties` 28 | 29 | ## 2.7.1 30 | * Allow newer versions of `six` package (avoid depednecy hell) 31 | 32 | ## 2.7.0 33 | * Migrate to spark 2.4.0 34 | * Fix testing.DataType to use new convention to get field type 35 | 36 | ## 2.6.0 37 | * Add argmax function to sparkly.functions 38 | 39 | ## 2.5.1 40 | * Fix port issue with reading and writing `by_url`. `urlparse` return `netloc` with port, which breaks read and write from MySQL and Cassandra. 41 | 42 | ## 2.5.0 43 | * Add `port` argument to `CassandraFixture` and `MysqlFixture` 44 | * Add `Content-Type` header to `ElasticFixture` to support ElasticSearch `6.x` 45 | * Update `elasticsearch-hadoop` connector to `6.5.4` 46 | * Update image tag for elasticsearch to `6.5.4` 47 | 48 | ## 2.4.1 49 | * Fix write_ext.kafka: run foreachPartition instead of mapPartitions because returned value can cause spark.driver.maxResultSize excess 50 | 51 | ## 2.4.0 52 | * Respect PYSPARK_SUBMIT_ARGS if it is already set by appending SparklySession related options at the end instead of overwriting. 53 | * Fix additional_options to always override SparklySession.options when a session is initialized 54 | * Fix ujson dependency on environments where redis-py is already installed 55 | * Access or initialize SparklySession through get_or_create classmethod 56 | * Ammend `sparkly.functions.switch_case` to accept a user defined function for 57 | deciding whether the switch column matches a specific case 58 | 59 | ## 2.3.0 60 | * Overwrite existing tables in the metastore 61 | * Add functions module and provide switch_case column generation and multijoin 62 | * Add implicit test target import and extended assertEqual variation 63 | * Support writing to redis:// and rediss:// URLs 64 | * Add LRU cache that persists DataFrames under the hood 65 | * Add ability to check whether a complex type defines specific fields 66 | 67 | # 2.2.1 68 | * `spark.sql.shuffle.partitions` in `SparklyTest` should be set to string, 69 | because `int` value breaks integration testing in Spark 2.0.2. 70 | 71 | # 2.2.0 72 | * Add instant iterative development mode. `sparkly-testing --help` for more details. 73 | * Use in-memory db for Hive Metastore in `SparklyTest` (faster tests). 74 | * `spark.sql.shuffle.partitions = 4` for `SparklyTest` (faster tests). 75 | * `spark.sql.warehouse.dir = ` for `SparklyTest` (no side effects) 76 | 77 | ## 2.1.1 78 | * Fix: remove backtick quoting from catalog utils to ease work with different databases. 79 | 80 | ## 2.1.0 81 | * Add ability to specify custom maven repositories. 
82 | 83 | ## 2.0.4 84 | * Make it possible to override default value of spark.sql.catalogImplementation 85 | 86 | ## 2.0.3 87 | * Add KafkaWatcher to facilitate testing of writing to Kafka 88 | * Fix a few minor pyflakes warnings and typos 89 | 90 | ## 2.0.2 91 | * Fix: #40 write_ext.kafka ignores errors. 92 | 93 | ## 2.0.1 94 | * Migrate to Spark 2, Spark 1.6.x isn't supported by sparkly 2.x. 95 | * Rename `SparklyContext` to `SparklySession` and derive it from `SparkSession`. 96 | * Use built-in csv reader. 97 | * Replace `hms` with `catalog_ext`. 98 | * `parse_schema` is now consistent with `DataType.simpleString` method. 99 | 100 | ## 1.1.1 101 | * Fix: kafka import error. 102 | 103 | ## 1.1.0 104 | * Kafka reader and writer. 105 | * Kafka fixtures. 106 | 107 | ## 1.0.0 108 | * Initial open-source release. 109 | * Features: 110 | - Declarative definition of application dependencies (spark packages, jars, UDFs) 111 | - Readers and writers for ElasticSearch, Cassandra, MySQL 112 | - DSL for interaction with Apache Hive Metastore 113 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | FROM python:3.10 18 | 19 | LABEL maintainer="dev@tubularlabs.com" 20 | 21 | # Install Java 8 22 | RUN apt-get update && apt-get install -y software-properties-common 23 | RUN apt-add-repository 'deb http://security.debian.org/debian-security stretch/updates main' 24 | RUN apt-add-repository 'deb http://deb.debian.org/debian/ sid main' 25 | RUN apt-get update && apt-get install -y openjdk-8-jdk 26 | 27 | # Python env 28 | ENV CASS_DRIVER_NO_EXTENSIONS=1 29 | COPY requirements.txt /tmp/requirements.txt 30 | COPY requirements_dev.txt /tmp/requirements_dev.txt 31 | COPY requirements_extras.txt /tmp/requirements_extras.txt 32 | RUN python -m pip install -r /tmp/requirements.txt 33 | RUN python -m pip install -r /tmp/requirements_dev.txt 34 | RUN python -m pip install -r /tmp/requirements_extras.txt 35 | 36 | # Provision Sparkly 37 | ADD . /opt/sparkly/ 38 | WORKDIR /opt/sparkly/ 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Tubular Labs, Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | 204 | ======================================================================= 205 | Sparkly Subcomponents: 206 | 207 | The Sparkly project contains subcomponents with separate copyright 208 | notices and license terms. Your use of the source code for the these 209 | subcomponents is subject to the terms and conditions of the following 210 | licenses. 211 | 212 | ======================================================================== 213 | Apache licenses 214 | ======================================================================== 215 | 216 | The following dependencies are provided under a Apache license. See project link for details. 217 | 218 | (Apache License 2.0) Spark (https://github.com/apache/spark) 219 | (Apache License 2.0) cassandra-driver (https://github.com/datastax/python-driver) 220 | 221 | ======================================================================== 222 | BSD-style licenses 223 | ======================================================================== 224 | 225 | The following dependencies are provided under a BSD-style license. See project link for details. 226 | 227 | (BSD License) mock (https://github.com/testing-cabal/mock) 228 | (PSF License) Sphinx (https://github.com/sphinx-doc/sphinx) 229 | 230 | ======================================================================== 231 | MIT licenses 232 | ======================================================================== 233 | 234 | The following dependencies are provided under the MIT License. See project link for details. 235 | 236 | (MIT License) sphinx_rtd_theme (https://github.com/snide/sphinx_rtd_theme) 237 | (MIT License) pytest (https://github.com/pytest-dev/pytest) 238 | (MIT License) pytest-cov (https://github.com/pytest-dev/pytest-cov) 239 | (MIT License) PyMySQL (https://github.com/PyMySQL/PyMySQL) 240 | 241 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include requirements.txt README.md 18 | recursive-include sparkly/resources * 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | dev: 18 | docker-compose build dev 19 | docker-compose run dev bash 20 | docker-compose down -v 21 | 22 | dist: 23 | docker-compose build dev 24 | docker-compose run --no-deps dev python setup.py bdist_wheel ; retcode="$$?" ; docker-compose down -v ; exit $$retcode 25 | 26 | docs: 27 | docker-compose build dev 28 | docker-compose run --no-deps dev python -m sphinx -b html docs/source docs/build 29 | 30 | test: 31 | docker-compose build test 32 | docker-compose run test tox ; retcode="$$?" ; docker-compose down -v ; exit $$retcode 33 | 34 | .PHONY: docs dist 35 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Sparkly 2 | ======= 3 | 4 | |Sparkly PyPi Version| |Documentation Status| 5 | 6 | Helpers & syntax sugar for PySpark. There are several features to make your life easier: 7 | 8 | - Definition of spark packages, external jars, UDFs and spark options within your code; 9 | - Simplified reader/writer api for Cassandra, Elastic, MySQL, Kafka; 10 | - Testing framework for spark applications. 11 | 12 | More details can be found in `the official 13 | documentation `__. 14 | 15 | Installation 16 | ------------ 17 | 18 | Sparkly itself is easy to install:: 19 | 20 | pip install pyspark # pick your version 21 | pip install sparkly # compatible with spark >= 2.4 22 | 23 | 24 | Getting Started 25 | --------------- 26 | 27 | Here is a small code snippet showing how to read a Cassandra table 28 | and write its content to an ElasticSearch index:: 29 | 30 | from sparkly import SparklySession 31 | 32 | 33 | class MySession(SparklySession): 34 | packages = [ 35 | 'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11', 36 | 'org.elasticsearch:elasticsearch-spark-20_2.11:6.5.4', 37 | ] 38 | 39 | 40 | if __name__ == '__main__': 41 | spark = MySession() 42 | df = spark.read_ext.cassandra('localhost', 'my_keyspace', 'my_table') 43 | df.write_ext.elastic('localhost', 'my_index', 'my_type') 44 | 45 | See `the online documentation `__ for 46 | more details. 47 | 48 | Testing 49 | ------- 50 | 51 | To run the tests, you need `docker `__ and 52 | `docker-compose `__ installed on your 53 | system. If you are working on macOS, we highly recommend using 54 | `docker-machine `__. 
As soon as the 55 | tools mentioned above have been installed, all you need is to run:: 56 | 57 | make test 58 | 59 | Supported Spark Versions 60 | ------------------------ 61 | 62 | At the moment we support: 63 | 64 | +---------------------------------------------------------------------------+ 65 | | sparkly >= 2.7 | Spark 2.4.x | 66 | +---------------------------------------------------------------------------+ 67 | | sparkly 2.x | Spark 2.0.x and Spark 2.1.x and Spark 2.2.x | 68 | +---------------------------------------------------------------------------+ 69 | | sparkly 1.x | Spark 1.6.x | 70 | +---------------------------------------------------------------------------+ 71 | 72 | .. |Sparkly PyPi Version| image:: http://img.shields.io/pypi/v/sparkly.svg 73 | :target: https://pypi.python.org/pypi/sparkly 74 | .. |Sparkly Build Status| image:: https://app.travis-ci.com/tubular/sparkly.svg?branch=master 75 | :target: https://app.travis-ci.com/github/tubular/sparkly 76 | .. |Documentation Status| image:: https://readthedocs.org/projects/sparkly/badge/?version=latest 77 | :target: http://sparkly.readthedocs.io/en/latest/?badge=latest 78 | -------------------------------------------------------------------------------- /bin/sparkly-testing: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright 2017 Tubular Labs, Inc. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import argparse 20 | import logging 21 | import sys 22 | import textwrap 23 | 24 | from sparkly.instant_testing import InstantTesting 25 | 26 | 27 | logging.basicConfig( 28 | stream=sys.stderr, 29 | level=logging.INFO, 30 | format='%(levelname)s %(message)s', 31 | ) 32 | 33 | 34 | if __name__ == '__main__': 35 | parser = argparse.ArgumentParser( 36 | formatter_class=argparse.RawDescriptionHelpFormatter, 37 | description=textwrap.dedent( 38 | """\ 39 | Sparkly Instant Testing. 40 | 41 | The tool speeds up iterative development on spark-based tests. 42 | It keeps JVM with initialised SparkContext running between multiple test sessions. 43 | 44 | Usage: 45 | sparkly-testing up 46 | py.test path/to/test_integration_with_pyspark.py # slow (first run) 47 | py.test path/to/test_integration_with_pyspark.py # fast (next runs) 48 | sparkly-testing down 49 | 50 | To change SparkContext options or to add new jars/packages call: 51 | sparkly-testing refresh 52 | """, 53 | ) 54 | ) 55 | 56 | sub_commands = parser.add_subparsers() 57 | 58 | # Instant testing mode. 
59 | sub_commands.add_parser( 60 | name='up', 61 | help='Activate instant testing mode.', 62 | ).set_defaults(func=lambda _: InstantTesting.activate()) 63 | 64 | sub_commands.add_parser( 65 | name='down', 66 | help='Deactivate instant testing mode.', 67 | ).set_defaults(func=lambda _: InstantTesting.deactivate()) 68 | 69 | sub_commands.add_parser( 70 | name='refresh', 71 | help='Refresh SparkContext options or add new jars/packages.', 72 | ).set_defaults(func=lambda _: InstantTesting.deactivate() or InstantTesting.activate()) 73 | 74 | args = parser.parse_args() 75 | 76 | if hasattr(args, 'func'): 77 | args.func(args) 78 | else: 79 | parser.print_help() 80 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | version: '2.1' 18 | services: 19 | dev: 20 | build: . 21 | depends_on: 22 | cassandra.docker: 23 | condition: service_healthy 24 | elastic.docker: 25 | condition: service_healthy 26 | kafka.docker: 27 | condition: service_healthy 28 | mysql.docker: 29 | condition: service_healthy 30 | redis.docker: 31 | condition: service_healthy 32 | volumes: 33 | - .:/opt/sparkly/ 34 | 35 | test: 36 | build: . 
37 | depends_on: 38 | cassandra.docker: 39 | condition: service_healthy 40 | elastic.docker: 41 | condition: service_healthy 42 | kafka.docker: 43 | condition: service_healthy 44 | mysql.docker: 45 | condition: service_healthy 46 | redis.docker: 47 | condition: service_healthy 48 | 49 | cassandra.docker: 50 | image: cassandra:4.1 51 | healthcheck: 52 | test: ["CMD-SHELL", "[ $$(nodetool statusgossip) = running ]"] 53 | 54 | elastic.docker: 55 | image: docker.elastic.co/elasticsearch/elasticsearch:7.17.8 56 | environment: 57 | - xpack.security.enabled=false 58 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 59 | - discovery.type=single-node 60 | healthcheck: 61 | test: "curl -f http://localhost:9200/_cat/health | grep green" 62 | interval: 5s 63 | timeout: 5s 64 | retries: 20 65 | 66 | mysql.docker: 67 | image: mysql:8.0 68 | environment: 69 | MYSQL_DATABASE: sparkly_test 70 | MYSQL_ALLOW_EMPTY_PASSWORD: "yes" 71 | healthcheck: 72 | test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"] 73 | 74 | kafka.docker: 75 | image: confluentinc/cp-kafka:7.3.0 76 | depends_on: 77 | zookeeper.docker: 78 | condition: service_healthy 79 | expose: 80 | - "9092" 81 | environment: 82 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://:9092 83 | KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true" 84 | KAFKA_ZOOKEEPER_CONNECT: zookeeper.docker:2181 85 | KAFKA_NUM_PARTITIONS: 3 86 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 87 | healthcheck: 88 | test: ps ax | grep kafka 89 | 90 | redis.docker: 91 | image: redis:7.0 92 | expose: 93 | - "6379" 94 | healthcheck: 95 | test: ["CMD", "redis-cli", "ping"] 96 | 97 | zookeeper.docker: 98 | image: confluent/zookeeper 99 | expose: 100 | - "2181" 101 | healthcheck: 102 | test: ps ax | grep zookeeper 103 | -------------------------------------------------------------------------------- /docs/source/catalog.rst: -------------------------------------------------------------------------------- 1 | Hive Metastore Utils 2 | ==================== 3 | 4 | About Hive Metastore 5 | -------------------- 6 | 7 | The Hive Metastore is a database with metadata for Hive tables. 8 | 9 | To configure ``SparklySession`` to work with external Hive Metastore, you need to set ``hive.metastore.uris`` option. 10 | You can do this via ``hive-site.xml`` file in spark config ($SPARK_HOME/conf/hive-site.xml): 11 | 12 | .. code-block:: xml 13 | 14 | 15 | hive.metastore.uris 16 | thrift://:9083 17 | IP address (or fully-qualified domain name) and port of the metastore host 18 | 19 | 20 | 21 | or set it dynamically via ``SparklySession`` options: 22 | 23 | .. code-block:: python 24 | 25 | class MySession(SparklySession): 26 | options = { 27 | 'hive.metastore.uris': 'thrift://:9083', 28 | } 29 | 30 | 31 | Tables management 32 | ----------------- 33 | 34 | **Why:** you need to check if tables exist, rename them, drop them, or even overwrite existing aliases in your catalog. 35 | 36 | .. code-block:: python 37 | 38 | from sparkly import SparklySession 39 | 40 | 41 | spark = SparklySession() 42 | 43 | assert spark.catalog_ext.has_table('my_table') in {True, False} 44 | spark.catalog_ext.rename_table('my_table', 'my_new_table') 45 | spark.catalog_ext.create_table('my_new_table', path='s3://my/parquet/data', source='parquet', mode='overwrite') 46 | spark.catalog_ext.drop_table('my_new_table') 47 | 48 | Table properties management 49 | --------------------------- 50 | 51 | **Why:** sometimes you want to assign custom attributes for your table, e.g. creation time, last update, purpose, data source. 
52 | The only way to interact with table properties in Spark is through raw SQL queries. 53 | We implemented a more convenient interface to make your code cleaner. 54 | 55 | .. code-block:: python 56 | 57 | from sparkly import SparklySession 58 | 59 | 60 | spark = SparklySession() 61 | spark.catalog_ext.set_table_property('my_table', 'foo', 'bar') 62 | assert spark.catalog_ext.get_table_property('my_table', 'foo') == 'bar' 63 | assert spark.catalog_ext.get_table_properties('my_table') == {'foo': 'bar'} 64 | 65 | *Note* properties are stored as strings. 66 | If you need other types, consider using a serialisation format, e.g. JSON. 67 | 68 | 69 | Using non-default database 70 | -------------------------- 71 | 72 | **Why:** to split your warehouse into logical groups (for example, by system components). 73 | In all catalog_ext.* methods you can specify fully qualified table names (e.g. ``my_database.my_table``) and 74 | they should operate properly. 75 | 76 | .. code-block:: python 77 | 78 | from time import time 79 | from sparkly import SparklySession 80 | 81 | spark = SparklySession() 82 | 83 | if spark.catalog_ext.has_database('my_database'): 84 | spark.catalog_ext.rename_table( 85 | 'my_database.my_badly_named_table', 86 | 'new_shiny_name', 87 | ) 88 | spark.catalog_ext.set_table_property( 89 | 'my_database.new_shiny_name', 90 | 'last_update_at', 91 | time(), 92 | ) 93 | 94 | *Note* be careful with 'USE' statements like spark.sql('USE my_database'): 95 | they are stateful and may lead to confusing errors if later code assumes a different current database. 96 | 97 | 98 | API documentation 99 | ----------------- 100 | 101 | .. automodule:: sparkly.catalog 102 | :members: 103 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # Copyright 2017 Tubular Labs, Inc. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # -*- coding: utf-8 -*- 20 | # 21 | # sparkly documentation build configuration file, created by 22 | # sphinx-quickstart on Tue Sep 20 08:46:42 2016. 23 | # 24 | # This file is execfile()d with the current directory set to its 25 | # containing dir. 26 | # 27 | # Note that not all possible configuration values are present in this 28 | # autogenerated file. 29 | # 30 | # All configuration values have a default; values that are commented out 31 | # serve to show the default. 32 | 33 | # If extensions (or modules to document with autodoc) are in another directory, 34 | # add these directories to sys.path here. If the directory is relative to the 35 | # documentation root, use os.path.abspath to make it absolute, like shown here. 36 | # 37 | import os 38 | import sys 39 | import sphinx_rtd_theme 40 | 41 | 42 | sys.path.insert(0, os.path.abspath('../..')) 43 | 44 | # -- General configuration ------------------------------------------------ 45 | 46 | # If your documentation needs a minimal Sphinx version, state it here. 
47 | # 48 | # needs_sphinx = '1.0' 49 | 50 | # Add any Sphinx extension module names here, as strings. They can be 51 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 52 | # ones. 53 | extensions = [ 54 | 'sphinx.ext.autodoc', 55 | 'sphinx.ext.doctest', 56 | 'sphinx.ext.coverage', 57 | 'sphinx.ext.viewcode', 58 | # 'sphinx.ext.githubpages', 59 | 'sphinx.ext.napoleon', 60 | ] 61 | 62 | # Add any paths that contain templates here, relative to this directory. 63 | templates_path = ['_templates'] 64 | 65 | # The suffix(es) of source filenames. 66 | # You can specify multiple suffix as a list of string: 67 | # 68 | # source_suffix = ['.rst', '.md'] 69 | source_suffix = '.rst' 70 | 71 | # The encoding of source files. 72 | # 73 | # source_encoding = 'utf-8-sig' 74 | 75 | # The master toctree document. 76 | master_doc = 'index' 77 | 78 | # General information about the project. 79 | project = 'sparkly' 80 | copyright = '2016, Tubular' 81 | author = 'Tubular' 82 | 83 | # The version info for the project you're documenting, acts as replacement for 84 | # |version| and |release|, also used in various other places throughout the 85 | # built documents. 86 | # 87 | import re 88 | 89 | with open(os.path.join(os.path.dirname(__file__), '../../sparkly/__init__.py')) as init_py: 90 | init_py_content = init_py.read() 91 | 92 | # The short X.Y version. 93 | version = re.search('__version__ = \'([\d.]+)[\w.]*\'', init_py_content).group(1).strip('.') 94 | 95 | # The full version, including alpha/beta/rc tags. 96 | release = re.search('__version__ = \'([\w.]+)\'', init_py_content).group(1) 97 | 98 | # The language for content autogenerated by Sphinx. Refer to documentation 99 | # for a list of supported languages. 100 | # 101 | # This is also used if you do content translation via gettext catalogs. 102 | # Usually you set "language" from the command line for these cases. 103 | language = 'en' 104 | 105 | # There are two options for replacing |today|: either, you set today to some 106 | # non-false value, then it is used: 107 | # 108 | # today = '' 109 | # 110 | # Else, today_fmt is used as the format for a strftime call. 111 | # 112 | # today_fmt = '%B %d, %Y' 113 | 114 | # List of patterns, relative to source directory, that match files and 115 | # directories to ignore when looking for source files. 116 | # This patterns also effect to html_static_path and html_extra_path 117 | exclude_patterns = [] 118 | 119 | # The reST default role (used for this markup: `text`) to use for all 120 | # documents. 121 | # 122 | # default_role = None 123 | 124 | # If true, '()' will be appended to :func: etc. cross-reference text. 125 | # 126 | # add_function_parentheses = True 127 | 128 | # If true, the current module name will be prepended to all description 129 | # unit titles (such as .. function::). 130 | # 131 | # add_module_names = True 132 | 133 | # If true, sectionauthor and moduleauthor directives will be shown in the 134 | # output. They are ignored by default. 135 | # 136 | # show_authors = False 137 | 138 | # The name of the Pygments (syntax highlighting) style to use. 139 | pygments_style = 'sphinx' 140 | 141 | # A list of ignored prefixes for module index sorting. 142 | # modindex_common_prefix = [] 143 | 144 | # If true, keep warnings as "system message" paragraphs in the built documents. 145 | # keep_warnings = False 146 | 147 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
148 | todo_include_todos = False 149 | 150 | 151 | # -- Options for HTML output ---------------------------------------------- 152 | 153 | # The theme to use for HTML and HTML Help pages. See the documentation for 154 | # a list of builtin themes. 155 | # 156 | html_theme = 'sphinx_rtd_theme' 157 | 158 | # Theme options are theme-specific and customize the look and feel of a theme 159 | # further. For a list of options available for each theme, see the 160 | # documentation. 161 | # 162 | # html_theme_options = {} 163 | 164 | # Add any paths that contain custom themes here, relative to this directory. 165 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 166 | 167 | # The name for this set of Sphinx documents. 168 | # " v documentation" by default. 169 | # 170 | # html_title = 'sparkly v0.3.1' 171 | 172 | # A shorter title for the navigation bar. Default is the same as html_title. 173 | # 174 | # html_short_title = None 175 | 176 | # The name of an image file (relative to this directory) to place at the top 177 | # of the sidebar. 178 | # 179 | # html_logo = None 180 | 181 | # The name of an image file (relative to this directory) to use as a favicon of 182 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 183 | # pixels large. 184 | # 185 | # html_favicon = None 186 | 187 | # Add any paths that contain custom static files (such as style sheets) here, 188 | # relative to this directory. They are copied after the builtin static files, 189 | # so a file named "default.css" will overwrite the builtin "default.css". 190 | #html_static_path = ['_static'] 191 | 192 | # Add any extra paths that contain custom files (such as robots.txt or 193 | # .htaccess) here, relative to this directory. These files are copied 194 | # directly to the root of the documentation. 195 | # 196 | # html_extra_path = [] 197 | 198 | # If not None, a 'Last updated on:' timestamp is inserted at every page 199 | # bottom, using the given strftime format. 200 | # The empty string is equivalent to '%b %d, %Y'. 201 | # 202 | # html_last_updated_fmt = None 203 | 204 | # If true, SmartyPants will be used to convert quotes and dashes to 205 | # typographically correct entities. 206 | # 207 | # html_use_smartypants = True 208 | 209 | # Custom sidebar templates, maps document names to template names. 210 | # 211 | # html_sidebars = {} 212 | 213 | # Additional templates that should be rendered to pages, maps page names to 214 | # template names. 215 | # 216 | # html_additional_pages = {} 217 | 218 | # If false, no module index is generated. 219 | # 220 | # html_domain_indices = True 221 | 222 | # If false, no index is generated. 223 | # 224 | # html_use_index = True 225 | 226 | # If true, the index is split into individual pages for each letter. 227 | # 228 | # html_split_index = False 229 | 230 | # If true, links to the reST sources are added to the pages. 231 | # 232 | # html_show_sourcelink = True 233 | 234 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 235 | # 236 | # html_show_sphinx = True 237 | 238 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 239 | # 240 | # html_show_copyright = True 241 | 242 | # If true, an OpenSearch description file will be output, and all pages will 243 | # contain a tag referring to it. The value of this option must be the 244 | # base URL from which the finished HTML is served. 245 | # 246 | # html_use_opensearch = '' 247 | 248 | # This is the file name suffix for HTML files (e.g. 
".xhtml"). 249 | # html_file_suffix = None 250 | 251 | # Language to be used for generating the HTML full-text search index. 252 | # Sphinx supports the following languages: 253 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 254 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 255 | # 256 | # html_search_language = 'en' 257 | 258 | # A dictionary with options for the search language support, empty by default. 259 | # 'ja' uses this config value. 260 | # 'zh' user can custom change `jieba` dictionary path. 261 | # 262 | # html_search_options = {'type': 'default'} 263 | 264 | # The name of a javascript file (relative to the configuration directory) that 265 | # implements a search results scorer. If empty, the default will be used. 266 | # 267 | # html_search_scorer = 'scorer.js' 268 | 269 | # Output file base name for HTML help builder. 270 | htmlhelp_basename = 'sparklydoc' 271 | 272 | # -- Options for LaTeX output --------------------------------------------- 273 | 274 | latex_elements = { 275 | # The paper size ('letterpaper' or 'a4paper'). 276 | # 277 | # 'papersize': 'letterpaper', 278 | 279 | # The font size ('10pt', '11pt' or '12pt'). 280 | # 281 | # 'pointsize': '10pt', 282 | 283 | # Additional stuff for the LaTeX preamble. 284 | # 285 | # 'preamble': '', 286 | 287 | # Latex figure (float) alignment 288 | # 289 | # 'figure_align': 'htbp', 290 | } 291 | 292 | # Grouping the document tree into LaTeX files. List of tuples 293 | # (source start file, target name, title, 294 | # author, documentclass [howto, manual, or own class]). 295 | latex_documents = [ 296 | (master_doc, 'sparkly.tex', 'sparkly Documentation', 297 | 'Tubular', 'manual'), 298 | ] 299 | 300 | # The name of an image file (relative to this directory) to place at the top of 301 | # the title page. 302 | # 303 | # latex_logo = None 304 | 305 | # For "manual" documents, if this is true, then toplevel headings are parts, 306 | # not chapters. 307 | # 308 | # latex_use_parts = False 309 | 310 | # If true, show page references after internal links. 311 | # 312 | # latex_show_pagerefs = False 313 | 314 | # If true, show URL addresses after external links. 315 | # 316 | # latex_show_urls = False 317 | 318 | # Documents to append as an appendix to all manuals. 319 | # 320 | # latex_appendices = [] 321 | 322 | # It false, will not define \strong, \code, itleref, \crossref ... but only 323 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 324 | # packages. 325 | # 326 | # latex_keep_old_macro_names = True 327 | 328 | # If false, no module index is generated. 329 | # 330 | # latex_domain_indices = True 331 | 332 | 333 | # -- Options for manual page output --------------------------------------- 334 | 335 | # One entry per manual page. List of tuples 336 | # (source start file, name, description, authors, manual section). 337 | man_pages = [ 338 | (master_doc, 'sparkly', 'sparkly Documentation', 339 | [author], 1) 340 | ] 341 | 342 | # If true, show URL addresses after external links. 343 | # 344 | # man_show_urls = False 345 | 346 | 347 | # -- Options for Texinfo output ------------------------------------------- 348 | 349 | # Grouping the document tree into Texinfo files. 
List of tuples 350 | # (source start file, target name, title, author, 351 | # dir menu entry, description, category) 352 | texinfo_documents = [ 353 | (master_doc, 'sparkly', 'sparkly Documentation', 354 | author, 'sparkly', 'One line description of project.', 355 | 'Miscellaneous'), 356 | ] 357 | 358 | # Documents to append as an appendix to all manuals. 359 | # 360 | # texinfo_appendices = [] 361 | 362 | # If false, no module index is generated. 363 | # 364 | # texinfo_domain_indices = True 365 | 366 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 367 | # 368 | # texinfo_show_urls = 'footnote' 369 | 370 | # If true, do not generate a @detailmenu in the "Top" node's menu. 371 | # 372 | # texinfo_no_detailmenu = False 373 | -------------------------------------------------------------------------------- /docs/source/functions.rst: -------------------------------------------------------------------------------- 1 | Column and DataFrame Functions 2 | ============================== 3 | 4 | A counterpart of pyspark.sql.functions providing useful shortcuts: 5 | 6 | - a cleaner alternative to chaining together multiple when/otherwise statements. 7 | - an easy way to join multiple dataframes at once and disambiguate fields with the same name. 8 | - agg function to select a value from the row that maximizes other column(s) 9 | 10 | 11 | API documentation 12 | ----------------- 13 | 14 | .. automodule:: sparkly.functions 15 | :members: 16 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to sparkly's documentation! 2 | =================================== 3 | 4 | Sparkly is a library that makes usage of pyspark more convenient and consistent. 5 | 6 | A brief tour on Sparkly features: 7 | 8 | .. code-block:: python 9 | 10 | # The main entry point is SparklySession, 11 | # you can think of it as of a combination of SparkSession and SparkSession.builder. 12 | from sparkly import SparklySession 13 | 14 | 15 | # Define dependencies in the code instead of messing with `spark-submit`. 16 | class MySession(SparklySession): 17 | # Spark packages and dependencies from Maven. 18 | packages = [ 19 | 'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11', 20 | 'mysql:mysql-connector-java:5.1.39', 21 | ] 22 | 23 | # Jars and Hive UDFs 24 | jars = ['/path/to/brickhouse-0.7.1.jar'], 25 | udfs = { 26 | 'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF', 27 | } 28 | 29 | 30 | spark = MySession() 31 | 32 | # Operate with interchangeable URL-like data source definitions: 33 | df = spark.read_ext.by_url('mysql:///my_database/my_database') 34 | df.write_ext('parquet:s3:////data?partition_by=') 35 | 36 | # Interact with Hive Metastore via convenient python api, 37 | # instead of verbose SQL queries: 38 | spark.catalog_ext.has_table('my_custom_table') 39 | spark.catalog_ext.get_table_properties('my_custom_table') 40 | 41 | # Easy integration testing with Fixtures and base test classes. 
42 | from pyspark.sql import types as T 43 | from sparkly.testing import SparklyTest 44 | 45 | 46 | class TestMyShinySparkScript(SparklyTest): 47 | session = MySession 48 | 49 | fixtures = [ 50 | MysqlFixture('', '', '', '/path/to/data.sql', '/path/to/clear.sql') 51 | ] 52 | 53 | def test_job_works_with_mysql(self): 54 | df = self.spark.read_ext.by_url('mysql:////?user=&password=') 55 | res_df = my_shiny_script(df) 56 | self.assertRowsEqual( 57 | res_df.collect(), 58 | [T.Row(fieldA='DataA', fieldB='DataB', fieldC='DataC')], 59 | ) 60 | 61 | .. toctree:: 62 | :maxdepth: 2 63 | 64 | session 65 | reader_and_writer 66 | catalog 67 | testing 68 | functions 69 | utils 70 | license 71 | 72 | .. automodule:: sparkly 73 | :members: 74 | 75 | Indices and tables 76 | ------------------ 77 | 78 | * :ref:`genindex` 79 | * :ref:`modindex` 80 | * :ref:`search` 81 | -------------------------------------------------------------------------------- /docs/source/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | .. include:: ../../LICENSE 5 | :literal: 6 | -------------------------------------------------------------------------------- /docs/source/reader_and_writer.rst: -------------------------------------------------------------------------------- 1 | .. _reader_and_writer: 2 | 3 | Read/write utilities for DataFrames 4 | =================================== 5 | 6 | Sparkly isn't trying to replace any of existing storage connectors. 7 | The goal is to provide a simplified and consistent api across a wide array of storage connectors. 8 | We also added the way to work with :ref:`abstract data sources `, 9 | so you can keep your code agnostic to the storages you use. 10 | 11 | .. _cassandra: 12 | 13 | Cassandra 14 | --------- 15 | 16 | Sparkly relies on the official spark cassandra connector and was successfully tested in production using version `2.4.0`. 17 | 18 | +---------------+---------------------------------------------------------------------------------------+ 19 | | Package | https://spark-packages.org/package/datastax/spark-cassandra-connector | 20 | +---------------+---------------------------------------------------------------------------------------+ 21 | | Configuration | https://github.com/datastax/spark-cassandra-connector/blob/v2.4.0/doc/reference.md | 22 | +---------------+---------------------------------------------------------------------------------------+ 23 | 24 | For using overwrite mode, it is needed to specify confirm.truncate as true. Otherwise, use append mode to update existing data. 25 | 26 | .. code-block:: python 27 | 28 | from sparkly import SparklySession 29 | 30 | 31 | class MySession(SparklySession): 32 | # Feel free to play with other versions 33 | packages = ['datastax:spark-cassandra-connector:2.4.0-s_2.11'] 34 | 35 | spark = MySession() 36 | 37 | # To read data 38 | df = spark.read_ext.cassandra('localhost', 'my_keyspace', 'my_table') 39 | # To write data 40 | df.write_ext.cassandra('localhost', 'my_keyspace', 'my_table') 41 | 42 | 43 | .. _elastic: 44 | 45 | Elastic 46 | ------- 47 | 48 | Sparkly relies on the official elastic spark connector and was successfully tested in production using version `6.5.4`. 
49 | 50 | +---------------+-----------------------------------------------------------------------------+ 51 | | Package | https://spark-packages.org/package/elastic/elasticsearch-hadoop | 52 | +---------------+-----------------------------------------------------------------------------+ 53 | | Configuration | https://www.elastic.co/guide/en/elasticsearch/hadoop/7.3/configuration.html | 54 | +---------------+-----------------------------------------------------------------------------+ 55 | 56 | .. code-block:: python 57 | 58 | from sparkly import SparklySession 59 | 60 | 61 | class MySession(SparklySession): 62 | # Feel free to play with other versions 63 | packages = ['org.elasticsearch:elasticsearch-spark-20_2.11:7.3.0'] 64 | 65 | spark = MySession() 66 | 67 | # To read data 68 | df = spark.read_ext.elastic('localhost', 'my_index', 'my_type', query='?q=awesomeness') 69 | # To write data 70 | df.write_ext.elastic('localhost', 'my_index', 'my_type') 71 | 72 | .. _kafka: 73 | 74 | Kafka 75 | ----- 76 | 77 | Sparkly's reader and writer for Kafka are built on top of the official spark package for Kafka-SQL. 78 | 79 | +---------------+------------------------------------------------------------------------------------------+ 80 | | Package | https://mvnrepository.com/artifact/org.apache.spark/spark-sql-kafka-0-10_2.11/2.4.0 | 81 | +---------------+------------------------------------------------------------------------------------------+ 82 | | Configuration | https://spark.apache.org/docs/2.4.0/structured-streaming-kafka-integration.html | 83 | +---------------+------------------------------------------------------------------------------------------+ 84 | 85 | .. code-block:: python 86 | 87 | import json 88 | 89 | from sparkly import SparklySession 90 | 91 | 92 | class MySession(SparklySession): 93 | packages = [ 94 | 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0', 95 | ] 96 | 97 | spark = MySession() 98 | 99 | # To read JSON messaged from Kafka into a dataframe: 100 | 101 | # 1. Define a schema of the messages you read. 102 | df_schema = StructType([ 103 | StructField('key', StructType([ 104 | StructField('id', StringType(), True) 105 | ])), 106 | StructField('value', StructType([ 107 | StructField('name', StringType(), True), 108 | StructField('surname', StringType(), True), 109 | ])) 110 | ]) 111 | 112 | # 2. Specify the schema as a reader parameter. 113 | df = hc.read_ext.kafka( 114 | 'kafka.host', 115 | topic='my.topic', 116 | # key & value deserialization is optional; if not provided, 117 | # then the user will have to deal with decoding the binary directly. 118 | key_deserializer=lambda item: json.loads(item.decode('utf-8')), 119 | value_deserializer=lambda item: json.loads(item.decode('utf-8')), 120 | # if deserializers are used, the schema must be provided: 121 | schema=df_schema, 122 | ) 123 | 124 | # To write a dataframe to Kafka in JSON format: 125 | df.write_ext.kafka( 126 | 'kafka.host', 127 | topic='my.topic', 128 | # key & value serialization is optional; if not provided, 129 | # the `key` and `value` columns MUST already be StringType or BinaryType 130 | key_serializer=lambda item: json.dumps(item).encode('utf-8'), 131 | value_serializer=lambda item: json.dumps(item).encode('utf-8'), 132 | ) 133 | 134 | .. _mysql: 135 | 136 | MySQL 137 | ----- 138 | 139 | Basically, it's just a high level api on top of the native 140 | `jdbc reader `_ and 141 | `jdbc writer `_. 
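For context, the sketch below shows roughly what the same read looks like through Spark's built-in JDBC data source, which ``read_ext.mysql`` wraps. It is illustrative only: the host, port, database, credentials and the ``com.mysql.jdbc.Driver`` class name (which differs between Connector/J versions) are placeholder assumptions rather than part of sparkly's API.

.. code-block:: python

    # Roughly equivalent read via the plain JDBC data source (illustrative sketch;
    # host, database and credentials are placeholders).
    df = spark.read.format('jdbc').options(
        url='jdbc:mysql://localhost:3306/my_database',
        driver='com.mysql.jdbc.Driver',  # assumed Connector/J 5.x driver class
        dbtable='my_table',
        user='root',
        password='root',
    ).load()

Sparkly's helper simply assembles the JDBC URL and options for you, so the two forms should produce the same ``DataFrame``.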
142 | 143 | +---------------+--------------------------------------------------------------------------------------------------+ 144 | | Jars | https://mvnrepository.com/artifact/mysql/mysql-connector-java | 145 | +---------------+--------------------------------------------------------------------------------------------------+ 146 | | Configuration | https://dev.mysql.com/doc/connector-j/5.1/en/connector-j-reference-configuration-properties.html | 147 | +---------------+--------------------------------------------------------------------------------------------------+ 148 | 149 | .. code-block:: python 150 | 151 | from sparkly import SparklySession 152 | from sparkly.utils import absolute_path 153 | 154 | 155 | class MySession(SparklySession): 156 | # Feel free to play with other versions. 157 | packages = ['mysql:mysql-connector-java:6.0.6'] 158 | 159 | 160 | spark = MySession() 161 | 162 | # To read data 163 | df = spark.read_ext.mysql('localhost', 'my_database', 'my_table', 164 | options={'user': 'root', 'password': 'root'}) 165 | # To write data 166 | df.write_ext.mysql('localhost', 'my_database', 'my_table', options={ 167 | 'user': 'root', 168 | 'password': 'root', 169 | 'rewriteBatchedStatements': 'true', # improves write throughput dramatically 170 | }) 171 | 172 | .. _redis: 173 | 174 | Redis 175 | ----- 176 | 177 | Sparkly provides a writer for Redis that is built on top of the official redis python library 178 | `redis-py `_ . 179 | It is currently capable of exporting your DataFrame as a JSON blob per row or group of rows. 180 | 181 | .. note:: 182 | - To interact with Redis, ``sparkly`` needs the ``redis`` library. You can get it via: 183 | ``pip install sparkly[redis]`` 184 | 185 | .. code-block:: python 186 | 187 | import json 188 | 189 | from sparkly import SparklySession 190 | 191 | 192 | spark = SparklySession() 193 | 194 | # Write JSON.gz data indexed by col1.col2 that will expire in a day 195 | df.write_ext.redis( 196 | host='localhost', 197 | port=6379, 198 | key_by=['col1', 'col2'], 199 | exclude_key_columns=True, 200 | expire=24 * 60 * 60, 201 | compression='gzip', 202 | ) 203 | 204 | 205 | .. _universal-reader-and-writer: 206 | 207 | Universal reader/writer 208 | ----------------------- 209 | 210 | The `DataFrame` abstraction is really powerful when it comes to transformations. 211 | You can shape your data from various storages using exactly the same api. 212 | For instance, you can join data from Cassandra with data from Elasticsearch and write the result to MySQL. 213 | 214 | The only problem - you have to explicitly define sources (or destinations) in order to create (or export) a `DataFrame`. 215 | But the source/destination of data doesn't really change the logic of transformations (if the schema is preserved). 216 | To solve the problem, we decided to add the universal api to read/write `DataFrames`: 217 | 218 | .. 
code-block:: python 219 | 220 | from sparkly import SparklyContext 221 | 222 | class MyContext(SparklyContext): 223 | packages = [ 224 | 'datastax:spark-cassandra-connector:1.6.1-s_2.10', 225 | 'com.databricks:spark-csv_2.10:1.4.0', 226 | 'org.elasticsearch:elasticsearch-spark_2.10:6.5.4', 227 | ] 228 | 229 | hc = MyContext() 230 | 231 | # To read data 232 | df = hc.read_ext.by_url('cassandra://localhost/my_keyspace/my_table?consistency=ONE') 233 | df = hc.read_ext.by_url('csv:s3://my-bucket/my-data?header=true') 234 | df = hc.read_ext.by_url('elastic://localhost/my_index/my_type?q=awesomeness') 235 | df = hc.read_ext.by_url('parquet:hdfs://my.name.node/path/on/hdfs') 236 | 237 | # To write data 238 | df.write_ext.by_url('cassandra://localhost/my_keyspace/my_table?consistency=QUORUM&parallelism=8') 239 | df.write_ext.by_url('csv:hdfs://my.name.node/path/on/hdfs') 240 | df.write_ext.by_url('elastic://localhost/my_index/my_type?parallelism=4') 241 | df.write_ext.by_url('parquet:s3://my-bucket/my-data?header=false') 242 | 243 | 244 | .. _controlling-the-load: 245 | 246 | Controlling the load 247 | -------------------- 248 | 249 | From the official documentation: 250 | 251 | | Don’t create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. 252 | 253 | link: 254 | 255 | It's very good advice, but in practice it's hard to track the number of partitions. 256 | For instance, if you write the result of a join operation to a database, the number of splits 257 | might change implicitly via `spark.sql.shuffle.partitions`. 258 | 259 | To keep us from shooting ourselves in the foot, we added a `parallelism` option to all our readers and writers. 260 | The option is designed to control the load on the source we write to / read from. 261 | It's especially useful when you are working with data stores like Cassandra, MySQL or Elastic. 262 | However, the implementation of the throttling has some drawbacks, and you should be aware of them. 263 | 264 | The way we implemented it is pretty simple: we use `coalesce` on a dataframe 265 | to reduce the number of tasks that will be executed in parallel. 266 | Let's say you have a dataframe with 1000 splits and you want to write no more than 10 tasks 267 | in parallel. In that case `coalesce` will create a dataframe that has 10 splits 268 | with 100 original tasks in each. An outcome of this: if any of these 100 tasks fails, 269 | we have to retry the whole pack of 100 tasks. 270 | 271 | `Read more about coalesce `_ 272 | 273 | Reader API documentation 274 | ------------------------ 275 | 276 | .. automodule:: sparkly.reader 277 | :members: 278 | 279 | Writer API documentation 280 | ------------------------ 281 | 282 | .. automodule:: sparkly.writer 283 | :members: 284 | -------------------------------------------------------------------------------- /docs/source/session.rst: -------------------------------------------------------------------------------- 1 | Sparkly Session 2 | =============== 3 | 4 | ``SparklySession`` is the main entry point to sparkly's functionality. 5 | It's derived from ``SparkSession`` to provide additional features on top of the default session. 6 | There are two main differences between ``SparkSession`` and ``SparklySession``: 7 | 8 | 1. ``SparklySession`` doesn't have a ``builder`` attribute, 9 | because we prefer declarative session definition over imperative. 10 | 2. Hive support is enabled by default (see the sketch just below).
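As a small illustration of the second point, a bare session can talk to the Hive metastore straight away. This is only a sketch; ``my_table`` is a hypothetical table name:

.. code-block:: python

    from sparkly import SparklySession

    spark = SparklySession()

    # Hive support is already enabled, so metastore queries work out of the box.
    spark.sql('SHOW DATABASES').show()
    spark.catalog_ext.has_table('my_table')  # hypothetical table name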
11 | 12 | The example below shows both imperative and declarative approaches: 13 | 14 | .. code-block:: python 15 | 16 | # PySpark-style (imperative) 17 | from pyspark import SparkSession 18 | 19 | spark = SparkSession.builder\ 20 | .appName('My App')\ 21 | .master('spark://')\ 22 | .config('spark.sql.shuffle.partitions', 10)\ 23 | .getOrCreate() 24 | 25 | # Sparkly-style (declarative) 26 | from sparkly import SparklySession 27 | 28 | class MySession(SparklySession): 29 | options = { 30 | 'spark.app.name': 'My App', 31 | 'spark.master': 'spark://', 32 | 'spark.sql.shuffle.partitions': 10, 33 | } 34 | 35 | spark = MySession() 36 | 37 | # In case you want to change default options 38 | spark = MySession({'spark.app.name': 'My Awesome App'}) 39 | 40 | # In case you want to access the session singleton 41 | spark = MySession.get_or_create() 42 | 43 | 44 | Installing dependencies 45 | ----------------------- 46 | 47 | **Why**: Spark forces you to specify dependencies (spark packages or maven artifacts) 48 | when a spark job is submitted (something like ``spark-submit --packages=...``). 49 | We prefer a code-first approach where dependencies are actually 50 | declared as part of the job. 51 | 52 | **For example**: You want to read data from Cassandra. 53 | 54 | .. code-block:: python 55 | 56 | from sparkly import SparklySession 57 | 58 | 59 | class MySession(SparklySession): 60 | # Define a list of spark packages or maven artifacts. 61 | packages = [ 62 | 'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11', 63 | ] 64 | 65 | # Dependencies will be fetched during the session initialisation. 66 | spark = MySession() 67 | 68 | # Here is how you now can access a dataset in Cassandra. 69 | df = spark.read_ext.by_url('cassandra:////?consistency=QUORUM') 70 | 71 | 72 | Custom Maven repositories 73 | ------------------------- 74 | 75 | **Why**: If you have a private maven repository, this is how to point spark to it when it performs a package lookup. 76 | Order in which dependencies will be resolved is next: 77 | - Local cache 78 | - Custom maven repositories (if specified) 79 | - Maven Central 80 | 81 | **For example**: Let's assume your maven repository is available on: http://my.repo.net/maven, 82 | and there is some spark package published there, with identifier: `my.corp:spark-handy-util:0.0.1` 83 | You can install it to a spark session like this: 84 | 85 | .. code-block:: python 86 | 87 | from sparkly import SparklySession 88 | 89 | class MySession(SparklySession): 90 | repositories = ['http://my.repo.net/maven'] 91 | packages = ['my.corp:spark-handy-util:0.0.1'] 92 | 93 | spark = MySession() 94 | 95 | 96 | Tuning options 97 | -------------- 98 | 99 | **Why**: You want to customise your spark session. 100 | 101 | **For example**: 102 | 103 | - ``spark.sql.shuffle.partitions`` to tune shuffling; 104 | - ``hive.metastore.uris`` to connect to your own HiveMetastore; 105 | - ``spark.hadoop.avro.mapred.ignore.inputs.without.extension`` package specific options. 106 | 107 | .. code-block:: python 108 | 109 | from sparkly import SparklySession 110 | 111 | 112 | class MySession(SparklySession): 113 | options = { 114 | # Increase the default amount of partitions for shuffling. 115 | 'spark.sql.shuffle.partitions': 1000, 116 | # Setup remote Hive Metastore. 117 | 'hive.metastore.uris': 'thrift://:9083,thrift://:9083', 118 | # Ignore files without `avro` extensions. 
119 | 'spark.hadoop.avro.mapred.ignore.inputs.without.extension': 'false', 120 | } 121 | 122 | # You can also overwrite or add some options at initialisation time. 123 | spark = MySession({'spark.sql.shuffle.partitions': 10}) 124 | 125 | 126 | Tuning options through shell environment 127 | ---------------------------------------- 128 | 129 | **Why**: You want to customize your spark session in a way that depends on the 130 | hardware specifications of your worker (or driver) machine(s), so you'd rather 131 | define them close to where the actual machine specs are requested / defined. 132 | Or you just want to test some new configuration without having to change your 133 | code. In both cases, you can do so by using the environmental variable 134 | ``PYSPARK_SUBMIT_ARGS``. Note that any options defined this way will override 135 | any conflicting options from your Python code. 136 | 137 | **For example**: 138 | 139 | - ``spark.executor.cores`` to tune the cores used by each executor; 140 | - ``spark.executor.memory`` to tune the memory available to each executor. 141 | 142 | .. code-block:: sh 143 | 144 | PYSPARK_SUBMIT_ARGS='--conf "spark.executor.cores=32" --conf "spark.executor.memory=160g"' \ 145 | ./my_spark_app.py 146 | 147 | 148 | Using UDFs 149 | ---------- 150 | 151 | **Why**: To start using Java UDF you have to import JAR file 152 | via SQL query like ``add jar ../path/to/file`` and then call ``registerJavaFunction``. 153 | We think it's too many actions for such simple functionality. 154 | 155 | **For example**: You want to import UDFs from `brickhouse library `_. 156 | 157 | .. code-block:: python 158 | 159 | from pyspark.sql.types import IntegerType 160 | from sparkly import SparklySession 161 | 162 | 163 | def my_own_udf(item): 164 | return len(item) 165 | 166 | 167 | class MySession(SparklySession): 168 | # Import local jar files. 169 | jars = [ 170 | '/path/to/brickhouse.jar' 171 | ] 172 | # Define UDFs. 173 | udfs = { 174 | 'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF', # Java UDF. 175 | 'my_udf': (my_own_udf, IntegerType()), # Python UDF. 176 | } 177 | 178 | spark = MySession() 179 | 180 | spark.sql('SELECT collect_max(amount) FROM my_data GROUP BY ...') 181 | spark.sql('SELECT my_udf(amount) FROM my_data') 182 | 183 | 184 | Lazy access / initialization 185 | ---------------------------- 186 | 187 | **Why**: A lot of times you might need access to the sparkly session at a low-level, 188 | deeply nested function in your code. A first approach is to declare a global sparkly 189 | session instance that you access explicitly, but this usually makes testing painful 190 | because of unexpected importing side effects. A second approach is to pass the session 191 | instance explicitly as a function argument, but this makes the code ugly since you then 192 | need to propagate that argument all the way up to every caller of that function. 193 | 194 | Other times you might want to be able to glue together and run one after the other 195 | different code segments, where each segment initializes its own sparkly session, 196 | despite the sessions being identical. This situation could occur when you are doing 197 | investigative work in a notebook. 198 | 199 | In both cases, ``SparklySession.get_or_create`` is the answer, as it solves the 200 | problems mentioned above while keeping your code clean and tidy. 201 | 202 | 203 | **For example**: You want to use a read function within a transformation. 204 | 205 | .. 
code-block:: python 206 | 207 | from sparkly import SparklySession 208 | 209 | 210 | class MySession(SparklySession): 211 | pass 212 | 213 | def my_awesome_transformation(): 214 | df = read_dataset('parquet:s3://path/to/my/data') 215 | df2 = read_dataset('parquet:s3://path/to/my/other/data') 216 | # do something with df and df2... 217 | 218 | def read_dataset(url): 219 | spark = MySession.get_or_create() 220 | return spark.read_ext.by_url(url) 221 | 222 | 223 | API documentation 224 | ----------------- 225 | 226 | .. automodule:: sparkly.session 227 | :members: 228 | -------------------------------------------------------------------------------- /docs/source/testing.rst: -------------------------------------------------------------------------------- 1 | Testing Utils 2 | ============= 3 | 4 | Base TestCases 5 | -------------- 6 | 7 | There are two main test cases available in Sparkly: 8 | - ``SparklyTest`` creates a new session for each test case. 9 | - ``SparklyGlobalSessionTest`` uses a single sparkly session for all test cases to boost performance. 10 | 11 | .. code-block:: python 12 | 13 | from pyspark.sql import types as T 14 | 15 | from sparkly import SparklySession 16 | from sparkly.testing import SparklyTest, SparklyGlobalSessionTest 17 | 18 | 19 | class MyTestCase(SparklyTest): 20 | session = SparklySession 21 | 22 | def test(self): 23 | df = self.spark.read_ext.by_url(...) 24 | 25 | # Compare all fields 26 | self.assertRowsEqual( 27 | df.collect(), 28 | [ 29 | T.Row(col1='row1', col2=1), 30 | T.Row(col1='row2', col2=2), 31 | ], 32 | ) 33 | 34 | ... 35 | 36 | class MyTestWithReusableSession(SparklyGlobalSessionTest): 37 | context = SparklySession 38 | 39 | def test(self): 40 | df = self.spark.read_ext.by_url(...) 41 | 42 | ... 43 | 44 | 45 | DataFrame Assertions 46 | -------------------- 47 | 48 | Asserting that the dataframe produced by your transformation is equal to some expected 49 | output can be unnecessarily complicated at times. Common issues include: 50 | 51 | - Ignoring the order in which elements appear in an array. 52 | This could be particularly useful when that array is generated as part of a 53 | ``groupBy`` aggregation, and you only care about all elements being part of the end 54 | result, rather than the order in which Spark encountered them. 55 | - Comparing floats that could be arbitrarily nested in complicated datatypes 56 | within a given tolerance; exact matching is either fragile or impossible. 57 | - Ignoring whether a field of a complex datatype is nullable. 58 | Spark infers this based on the applied transformations, but it is oftentimes 59 | inaccurate. As a result, assertions on complex data types might fail, even 60 | though in theory they shouldn't have. 61 | - Having rows with different field names compare equal if the values match in 62 | alphabetical order of the names (see unit tests for example). 63 | - Unhelpful diffs in case of mismatches. 64 | 65 | Sparkly addresses these issues by providing ``assertRowsEqual``: 66 | 67 | .. 
code-block:: python 68 | 69 | from pyspark.sql import types as T 70 | 71 | from sparkly import SparklySession 72 | from sparkly.test import SparklyTest 73 | 74 | 75 | def my_transformation(spark): 76 | return spark.createDataFrame( 77 | data=[ 78 | ('row1', {'field': 'value_1'}, [1.1, 2.2, 3.3]), 79 | ('row2', {'field': 'value_2'}, [4.1, 5.2, 6.3]), 80 | ], 81 | schema=T.StructType([ 82 | T.StructField('id', T.StringType()), 83 | T.StructField( 84 | 'st', 85 | T.StructType([ 86 | T.StructField('field', T.StringType()), 87 | ]), 88 | ), 89 | T.StructField('ar', T.ArrayType(T.FloatType())), 90 | ]), 91 | ) 92 | 93 | 94 | class MyTestCase(SparklyTest): 95 | session = SparklySession 96 | 97 | def test(self): 98 | df = my_transformation(self.spark) 99 | 100 | self.assertRowsEqual( 101 | df.collect(), 102 | [ 103 | T.Row(id='row2', st=T.Row(field='value_2'), ar=[6.0, 5.0, 4.0]), 104 | T.Row(id='row1', st=T.Row(field='value_1'), ar=[2.0, 3.0, 1.0]), 105 | ], 106 | atol=0.5, 107 | ) 108 | 109 | 110 | Instant Iterative Development 111 | ----------------------------- 112 | 113 | The slowest part in Spark integration testing is context initialisation. 114 | ``SparklyGlobalSessionTest`` allows you to keep the same instance of spark context between different test cases, 115 | but it still kills the context at the end. It's especially annoying if you work in `TDD fashion `_. 116 | On each run you have to wait 25-30 seconds till a new context is ready. 117 | We added a tool to preserve spark context between multiple test runs. 118 | 119 | .. code-block:: bash 120 | 121 | # Activate instant testing mode. 122 | sparkly-testing up 123 | 124 | # The first run is slow (context is created). 125 | py.test tests/my_integration_test_with_sparkly.py 126 | 127 | # The second run and all after it are fast (context is reused). 128 | py.test tests/my_integration_test_with_sparkly.py 129 | 130 | # Deactivate instant testing mode (when you are done with testing). 131 | sparkly-testing down 132 | 133 | .. note:: 134 | In case if you change ``SparklySession`` definition (new options, jars or packages) 135 | you have to refresh the context via ``sparkly-testing refresh``. 136 | However, you don't need to refresh context if ``udfs`` are changed. 137 | 138 | 139 | Fixtures 140 | -------- 141 | 142 | "Fixture" is a term borrowed from Django framework. 143 | Fixtures load data to a database before the test execution. 144 | 145 | There are several storages supported in Sparkly: 146 | - Elastic 147 | - Cassandra (requires ``cassandra-driver``) 148 | - Mysql (requires ``PyMySql``) 149 | - Kafka (requires ``kafka-python``) 150 | 151 | .. code-block:: python 152 | 153 | from sparkly.test import MysqlFixture, SparklyTest 154 | 155 | 156 | class MyTestCase(SparklyTest): 157 | ... 158 | fixtures = [ 159 | MysqlFixture('mysql.host', 160 | 'user', 161 | 'password', 162 | '/path/to/setup_data.sql', 163 | '/path/to/remove_data.sql') 164 | ] 165 | ... 166 | 167 | .. automodule:: sparkly.testing 168 | :members: 169 | -------------------------------------------------------------------------------- /docs/source/utils.rst: -------------------------------------------------------------------------------- 1 | Generic Utils 2 | ============= 3 | 4 | These are generic utils used in Sparkly. 5 | 6 | .. 
automodule:: sparkly.utils 7 | :members: 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | pylru==1.0.9 18 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | mock==1.3.0 18 | pytest==6.2.5 19 | pytest-cov==3.0.0 20 | Sphinx==4.2.0 21 | sphinx_rtd_theme==1.0.0 22 | tox==3.24.4 23 | -------------------------------------------------------------------------------- /requirements_extras.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | cassandra-driver==3.25.0 18 | PyMySQL==0.9.3 19 | kafka-python==2.0.2 20 | redis==2.10.5 21 | ujson==1.35 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | [bdist_wheel] 18 | # This flag says that the code is written to work on both Python 2 and Python 19 | # 3. If at all possible, it is good practice to do this. If you cannot, you 20 | # will need to generate wheels for each Python version that you support. 21 | universal=1 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from codecs import open 18 | import os 19 | import re 20 | 21 | from setuptools import setup, find_packages 22 | 23 | 24 | here = os.path.abspath(os.path.dirname(__file__)) 25 | 26 | # Get version 27 | with open(os.path.join(here, 'sparkly/__init__.py'), 'rb') as init_py: 28 | version = re.search('__version__ = \'([\w.]+)\'', init_py.read().decode('utf-8')).group(1) 29 | 30 | # Get the long description from the relevant file 31 | with open(os.path.join(here, 'README.rst'), 'rb') as readme_rst: 32 | long_description = readme_rst.read().decode('utf-8') 33 | 34 | # Get requirements 35 | with open(os.path.join(here, 'requirements.txt')) as requirements_txt: 36 | requirements = [req for req in requirements_txt.readlines() if re.match(u'^[^#\-\s]', req)] 37 | 38 | 39 | setup( 40 | name='sparkly', 41 | 42 | # Versions should comply with PEP440. For a discussion on single-sourcing 43 | # the version across setup.py and the project code, see 44 | # https://packaging.python.org/en/latest/single_source_version.html 45 | version=version, 46 | 47 | description='Helpers & syntax sugar for PySpark.', 48 | long_description=long_description, 49 | 50 | # The project's main homepage. 51 | url='https://github.com/Tubular/sparkly', 52 | 53 | # Author details 54 | author='Tubular Engineering', 55 | author_email='dev@tubularlabs.com', 56 | 57 | # License 58 | license='Apache License 2.0', 59 | 60 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 61 | classifiers=[ 62 | 'Development Status :: 5 - Production/Stable', 63 | 64 | # Indicate who your project is intended for 65 | 'Intended Audience :: Developers', 66 | 'Topic :: Software Development :: Build Tools', 67 | 68 | # Pick your license as you wish (should match "license" above) 69 | 'License :: OSI Approved :: Apache Software License', 70 | 71 | # Specify the Python versions you support here. In particular, ensure 72 | # that you indicate whether you support Python 2, Python 3 or both. 73 | 'Programming Language :: Python :: 3', 74 | 'Programming Language :: Python :: 3.7', 75 | 'Programming Language :: Python :: 3.8', 76 | 'Programming Language :: Python :: 3.10', 77 | ], 78 | 79 | # What does your project relate to? 
80 | keywords='sparkly spark pyspark', 81 | 82 | # You can just specify the packages manually here if your project is 83 | # simple. Or you can use find_packages(). 84 | packages=find_packages(exclude=['contrib', 'docs', 'tests*']), 85 | scripts=['bin/sparkly-testing'], 86 | include_package_data=True, 87 | 88 | # List run-time dependencies here. These will be installed by pip when 89 | # your project is installed. For an analysis of "install_requires" vs pip's 90 | # requirements files see: 91 | # https://packaging.python.org/en/latest/requirements.html 92 | install_requires=requirements, 93 | extras_require={ 94 | 'redis': ['redis>=2.10,<3', 'ujson>=1.33,<2'], 95 | 'test': [ 96 | 'cassandra-driver>=3.25,<3.26', 97 | 'PyMySQL>=0.7,<0.10', 98 | 'kafka-python>=2.0.2,<2.1', 99 | 'redis>=2.10,<3', 100 | 'ujson>=1.33,<2', 101 | ], 102 | }, 103 | ) 104 | -------------------------------------------------------------------------------- /sparkly/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from sparkly.session import SparklySession 18 | 19 | assert SparklySession 20 | 21 | 22 | __version__ = '3.0.0' 23 | -------------------------------------------------------------------------------- /sparkly/catalog.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import uuid 17 | 18 | from pyspark.sql import functions as F 19 | from pyspark.sql import utils as U 20 | 21 | 22 | class SparklyCatalog(object): 23 | """A set of tools to interact with HiveMetastore.""" 24 | 25 | def __init__(self, spark): 26 | """Constructor. 27 | 28 | Args: 29 | spark (sparkly.SparklySession) 30 | """ 31 | self._spark = spark 32 | 33 | def create_table(self, table_name, path=None, source=None, schema=None, **options): 34 | """Create table in the metastore. 35 | 36 | Extend ``SparkSession.Catalog.createExternalTable`` by accepting 37 | a ``mode='overwrite'`` option which creates the table even if a 38 | table with the same name already exists. All other args are 39 | exactly the same. 40 | 41 | Note: 42 | If the table exists, create two unique names, one for the 43 | new and one for the old instance, then try to swap names 44 | and drop the "old" instance. 
If any step fails, the metastore 45 | might be currently left at a broken state. 46 | 47 | Args: 48 | mode (str): if set to ``'overwrite'``, drop any table of the 49 | same name from the metastore. Given as a kwarg. Default 50 | is error out if table already exists. 51 | 52 | Returns: 53 | pyspark.sql.DataFrame: DataFrame associated with the created 54 | table. 55 | """ 56 | overwrite_existing_table = ( 57 | options.pop('mode', '').lower() == 'overwrite' and 58 | self.has_table(table_name) 59 | ) 60 | 61 | def _append_unique_suffix(*args): 62 | return '__'.join(args + (uuid.uuid4().hex, )) 63 | 64 | if overwrite_existing_table: 65 | new_table_name = _append_unique_suffix(table_name, 'new') 66 | else: 67 | new_table_name = table_name 68 | 69 | if hasattr(self._spark.catalog, 'createTable'): 70 | createTable = self._spark.catalog.createTable 71 | else: # before Spark 2.2 72 | createTable = self._spark.catalog.createExternalTable 73 | 74 | df = createTable( 75 | new_table_name, 76 | path=path, 77 | source=source, 78 | schema=schema, 79 | **options 80 | ) 81 | 82 | if overwrite_existing_table: 83 | old_table_name = _append_unique_suffix(table_name, 'old') 84 | self.rename_table(table_name, old_table_name) 85 | self.rename_table(new_table_name, table_name) 86 | self.drop_table(old_table_name) 87 | 88 | return df 89 | 90 | def drop_table(self, table_name, checkfirst=True): 91 | """Drop table from the metastore. 92 | 93 | Note: 94 | Follow the official documentation to understand `DROP TABLE` semantic. 95 | https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL\ 96 | #LanguageManualDDL-DropTable 97 | 98 | Args: 99 | table_name (str): A table name. 100 | checkfirst (bool): Only issue DROPs for tables that are presented in the database. 101 | """ 102 | db_name = get_db_name(table_name) 103 | if checkfirst and not self.has_database(db_name): 104 | return 105 | 106 | drop_statement = 'DROP TABLE IF EXISTS' if checkfirst else 'DROP TABLE' 107 | return self._spark.sql( 108 | '{} {}'.format(drop_statement, table_name) 109 | ) 110 | 111 | def has_table(self, table_name): 112 | """Check if table is available in the metastore. 113 | 114 | Args: 115 | table_name (str): A table name. 116 | 117 | Returns: 118 | bool 119 | """ 120 | 121 | if not table_name: 122 | return False 123 | 124 | try: 125 | self._spark.sql('SELECT 1 FROM {} WHERE 1=0'.format(table_name)) 126 | except U.AnalysisException: 127 | return False 128 | 129 | return True 130 | 131 | def has_database(self, db_name): 132 | """Check if database exists in the metastore. 133 | 134 | Args: 135 | db_name (str): Database name. 136 | 137 | Returns: 138 | bool 139 | """ 140 | if not db_name: 141 | return True 142 | 143 | for db in self._spark.catalog.listDatabases(): 144 | if db_name == db.name: 145 | return True 146 | 147 | return False 148 | 149 | def rename_table(self, old_table_name, new_table_name): 150 | """Rename table in the metastore. 151 | 152 | Note: 153 | Follow the official documentation to understand `ALTER TABLE` semantic. 154 | https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL\ 155 | #LanguageManualDDL-RenameTable 156 | 157 | Args: 158 | old_table_name (str): The current table name. 159 | new_table_name (str): An expected table name. 160 | """ 161 | self._spark.sql('ALTER TABLE {} RENAME TO {}'.format(old_table_name, new_table_name)) 162 | 163 | def get_table_property(self, table_name, property_name, to_type=None): 164 | """Get table property value from the metastore. 
165 | 166 | Args: 167 | table_name (str): A table name. Might contain a db name. 168 | E.g. "my_table" or "default.my_table". 169 | property_name (str): A property name to read value for. 170 | to_type (function): Cast value to the given type. E.g. `int` or `float`. 171 | 172 | Returns: 173 | Any 174 | """ 175 | if not to_type: 176 | to_type = str 177 | 178 | df = self._spark.sql("SHOW TBLPROPERTIES {}('{}')".format(table_name, property_name)) 179 | prop_val = df.collect()[0].value.strip() 180 | 181 | if 'does not have property' not in prop_val: 182 | return to_type(prop_val) 183 | 184 | def get_table_properties(self, table_name): 185 | """Get table properties from the metastore. 186 | 187 | Args: 188 | table_name (str): A table name. 189 | 190 | Returns: 191 | dict[str,str]: Key/value for properties. 192 | """ 193 | rows = self._spark.sql('SHOW TBLPROPERTIES {}'.format(table_name)).collect() 194 | return {row.key: row.value for row in rows} 195 | 196 | def set_table_property(self, table_name, property_name, value): 197 | """Set value for table property. 198 | 199 | Args: 200 | table_name (str): A table name. 201 | property_name (str): A property name to set value for. 202 | value (Any): Will be automatically casted to string. 203 | """ 204 | self._spark.sql("ALTER TABLE {} SET TBLPROPERTIES ('{}'='{}')".format( 205 | table_name, property_name, value 206 | )) 207 | 208 | def get_database_property(self, db_name, property_name, to_type=None): 209 | """Read value for database property. 210 | 211 | Args: 212 | db_name (str): A database name. 213 | property_name (str): A property name to read value for. 214 | to_type (function): Cast value to the given type. E.g. `int` or `float`. 215 | 216 | Returns: 217 | Any 218 | """ 219 | if not to_type: 220 | to_type = str 221 | 222 | value = self.get_database_properties(db_name).get(property_name) 223 | if value is not None: 224 | return to_type(value) 225 | 226 | def get_database_properties(self, db_name): 227 | """Get database properties from the metastore. 228 | 229 | Args: 230 | db_name (str): A database name. 231 | 232 | Returns: 233 | dict[str,str]: Key/value for properties. 234 | """ 235 | describe = self._spark.sql(f'DESCRIBE DATABASE EXTENDED {db_name}') 236 | 237 | if 'database_description_item' in describe.columns: 238 | key_col = 'database_description_item' 239 | val_col = 'database_description_value' 240 | else: 241 | key_col = 'info_name' 242 | val_col = 'info_value' 243 | 244 | properties = ( 245 | self._spark.sql('DESCRIBE DATABASE EXTENDED {}'.format(db_name)) 246 | .where(F.col(key_col) == 'Properties') 247 | .select(val_col) 248 | .first() 249 | ) 250 | 251 | parsed_properties = {} 252 | 253 | if properties: 254 | info_value = getattr(properties, val_col) 255 | for name, value in read_db_properties_format(info_value): 256 | parsed_properties[name] = value 257 | 258 | return parsed_properties 259 | 260 | def set_database_property(self, db_name, property_name, value): 261 | """Set value for database property. 262 | 263 | Args: 264 | db_name (str): A database name. 265 | property_name (str): A property name to set value for. 266 | value (Any): Will be automatically casted to string. 
267 | """ 268 | property_name_blacklist = {',', '(', ')'} 269 | property_value_blacklist = {'(', ')'} 270 | 271 | if set(property_name) & property_name_blacklist: 272 | raise ValueError( 273 | 'Property name must not contain symbols: {}'.format(property_name_blacklist)) 274 | 275 | if set(str(value)) & property_value_blacklist: 276 | raise ValueError( 277 | 'Property value must not contain symbols: {}'.format(property_value_blacklist)) 278 | 279 | self._spark.sql("ALTER DATABASE {} SET DBPROPERTIES ('{}'='{}')".format( 280 | db_name, property_name, value, 281 | )) 282 | 283 | 284 | def get_db_name(table_name): 285 | """Get database name from full table name.""" 286 | parts = table_name.split('.', 1) 287 | if len(parts) == 1: 288 | return None 289 | else: 290 | return parts[0] 291 | 292 | 293 | def get_table_name(table_name): 294 | """Get table name from full table name.""" 295 | parts = table_name.split('.', 1) 296 | return parts[-1] 297 | 298 | 299 | def read_db_properties_format(raw_db_properties): 300 | """Helper to read non-standard db properties format. 301 | 302 | Note: 303 | Spark/Hive doesn't provide a way to read separate key/values for database properties. 304 | They provide a custom format like: ((key_a,value_a), (key_b,value_b)) 305 | Neither keys nor values are escaped. 306 | Here we try our best to parse this format by tracking balanced parentheses. 307 | We assume property names don't contain comma. 308 | 309 | Return: 310 | list[list[str]] - the list of key-value pairs. 311 | """ 312 | def _unpack_parentheses(string): 313 | bits = [] 314 | last_bit = '' 315 | checksum = 0 316 | 317 | for c in string: 318 | if c == '(': 319 | if checksum == 0: 320 | last_bit = '' 321 | else: 322 | last_bit += c 323 | checksum += 1 324 | elif c == ')': 325 | checksum -= 1 326 | if checksum == 0: 327 | bits.append(last_bit) 328 | else: 329 | last_bit += c 330 | else: 331 | last_bit += c 332 | 333 | if checksum < 0: 334 | raise ValueError('Parentheses are not balanced') 335 | 336 | if checksum != 0: 337 | raise ValueError('Parentheses are not balanced') 338 | 339 | return bits 340 | 341 | properties = _unpack_parentheses(raw_db_properties) 342 | if properties: 343 | return [x.split(',', 1) for x in _unpack_parentheses(properties[0])] 344 | else: 345 | return [] 346 | -------------------------------------------------------------------------------- /sparkly/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | class SparklyException(Exception): 19 | """Base exception of sparkly lib.""" 20 | 21 | 22 | class UnsupportedDataType(SparklyException): 23 | """Happen when schema defines unsupported data type.""" 24 | pass 25 | 26 | 27 | class FixtureError(SparklyException): 28 | """Happen when testing data setup or teardown fails.""" 29 | pass 30 | 31 | 32 | class InvalidArgumentError(SparklyException): 33 | """Happen when invalid parameters are passed to a function.""" 34 | 35 | 36 | class WriteError(SparklyException): 37 | """Happen when errors occured while writting dataframe into storage.""" 38 | -------------------------------------------------------------------------------- /sparkly/functions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from collections import defaultdict 18 | from functools import reduce 19 | import operator 20 | 21 | from pyspark.sql import Column 22 | from pyspark.sql import functions as F 23 | 24 | 25 | def multijoin(dfs, on=None, how=None, coalesce=None): 26 | """Join multiple dataframes. 27 | 28 | Args: 29 | dfs (list[pyspark.sql.DataFrame]). 30 | on: same as ``pyspark.sql.DataFrame.join``. 31 | how: same as ``pyspark.sql.DataFrame.join``. 32 | coalesce (list[str]): column names to disambiguate by coalescing 33 | across the input dataframes. A column must be of the same type 34 | across all dataframes that define it; if different types appear 35 | coalesce will do a best-effort attempt in merging them. The 36 | selected value is the first non-null one in order of appearance 37 | of the dataframes in the input list. Default is None - don't 38 | coalesce any ambiguous columns. 39 | 40 | Returns: 41 | pyspark.sql.DataFrame or None if provided dataframe list is empty. 42 | 43 | Example: 44 | Assume we have two DataFrames, the first is 45 | ``first = [{'id': 1, 'value': None}, {'id': 2, 'value': 2}]`` 46 | and the second is 47 | ``second = [{'id': 1, 'value': 1}, {'id': 2, 'value': 22}]`` 48 | 49 | Then collecting the ``DataFrame`` produced by 50 | 51 | ``multijoin([first, second], on='id', how='inner', coalesce=['value'])`` 52 | 53 | yields ``[{'id': 1, 'value': 1}, {'id': 2, 'value': 2}]``. 
54 | """ 55 | if not dfs: 56 | return None 57 | 58 | # Go over the input dataframes and rename each to-be-resolved 59 | # column to ensure name uniqueness 60 | coalesce = set(coalesce or []) 61 | renamed_columns = defaultdict(list) 62 | for idx, df in enumerate(dfs): 63 | for col in df.columns: 64 | if col in coalesce: 65 | disambiguation = '__{}_{}'.format(idx, col) 66 | df = df.withColumnRenamed(col, disambiguation) 67 | renamed_columns[col].append(disambiguation) 68 | dfs[idx] = df 69 | 70 | # Join the dataframes 71 | joined_df = reduce(lambda x, y: x.join(y, on=on, how=how), dfs) 72 | 73 | # And coalesce the would-have-been-ambiguities 74 | for col, disambiguations in renamed_columns.items(): 75 | joined_df = joined_df.withColumn(col, F.coalesce(*disambiguations)) 76 | for disambiguation in disambiguations: 77 | joined_df = joined_df.drop(disambiguation) 78 | 79 | return joined_df 80 | 81 | 82 | def switch_case(switch, case=None, default=None, operand=operator.eq, **additional_cases): 83 | """Switch/case style column generation. 84 | 85 | Args: 86 | switch (str, pyspark.sql.Column): column to "switch" on; 87 | its values are going to be compared against defined cases. 88 | case (dict): case statements. When a key matches the value of 89 | the column in a specific row, the respective value will be 90 | assigned to the new column for that row. This is useful when 91 | your case condition constants are not strings. 92 | default: default value to be used when the value of the switch 93 | column doesn't match any keys. 94 | operand: function to compare the value of the switch column to the 95 | value of each case. Default is Column's eq. If user-provided, 96 | first argument will always be the switch Column; it's the 97 | user's responsibility to transform the case value to a column 98 | if they need to. 99 | additional_cases: additional "case" statements, kwargs style. 100 | Same semantics with cases above. If both are provided, 101 | cases takes precedence. 102 | 103 | Returns: 104 | pyspark.sql.Column 105 | 106 | Example: 107 | ``switch_case('state', CA='California', NY='New York', default='Other')`` 108 | 109 | is equivalent to 110 | 111 | >>> F.when( 112 | ... F.col('state') == 'CA', 'California' 113 | ).when( 114 | ... F.col('state') == 'NY', 'New York' 115 | ).otherwise('Other') 116 | 117 | If you need to "bucketize" a value 118 | 119 | ``switch_case('age', {(13, 17): 1, (18, 24): 2, ...}, operand=lambda c, v: c.between(*v))`` 120 | 121 | is equivalent to 122 | 123 | >>> F.when( 124 | ... F.col('age').between(13, 17), F.lit(1) 125 | ).when( 126 | ... 
F.col('age').between(18, 24), F.lit(2) 127 | ) 128 | """ 129 | if not isinstance(switch, Column): 130 | switch = F.col(switch) 131 | 132 | def _column_or_lit(x): 133 | return F.lit(x) if not isinstance(x, Column) else x 134 | 135 | def _execute_case(accumulator, case): 136 | # transform the case to a pyspark.sql.functions.when statement, 137 | # then chain it to existing when statements 138 | condition_constant, assigned_value = case 139 | when_args = (operand(switch, condition_constant), _column_or_lit(assigned_value)) 140 | return accumulator.when(*when_args) 141 | 142 | 143 | cases = case or {} 144 | for conflict in set(cases.keys()) & set(additional_cases.keys()): 145 | del additional_cases[conflict] 146 | cases = list(cases.items()) + list(additional_cases.items()) 147 | 148 | default = _column_or_lit(default) 149 | 150 | if not cases: 151 | return default 152 | 153 | result = reduce(_execute_case, cases, F).otherwise(default) 154 | 155 | return result 156 | 157 | 158 | def argmax(field, by, condition=None): 159 | """Select a value from the row that maximizes other column(s) 160 | 161 | Args: 162 | field (string, pyspark.sql.Column): the field to return that maximizes the "by" columns 163 | by (*string, *pyspark.sql.Column): field or list of fields to maximize. In reality, this 164 | will usually be only one field. But you may use multiple for tiebreakers 165 | condition (optional): Only consider the entities that pass this condition 166 | 167 | Returns: 168 | pyspark.sql.Column 169 | 170 | Example: 171 | df = ( 172 | df 173 | .groupBy('id') 174 | .agg(argmax('field1', 'by_field')) 175 | ) 176 | 177 | argmax('field1', ['by_field1', 'by_field2'], condition=F.col('col') == 1) 178 | argmax(F.col('field1'), [F.col('by_field1'), F.col('by_field2')], condition=F.lit(True)) 179 | """ 180 | if not isinstance(by, list): 181 | by = [by] 182 | 183 | if isinstance(field, str): 184 | field = F.col(field) 185 | 186 | by.append(field.alias('__tmp_argmax__')) 187 | result = F.struct(*by) 188 | if condition is not None: 189 | result = F.when(condition, result) 190 | result = F.max(result).getField('__tmp_argmax__') 191 | 192 | return result 193 | -------------------------------------------------------------------------------- /sparkly/instant_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | import logging 19 | import os 20 | import signal 21 | import tempfile 22 | 23 | from py4j.java_gateway import java_import 24 | from pyspark import SparkContext 25 | from pyspark.java_gateway import launch_gateway 26 | 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | class InstantTesting(object): 32 | """The set of tools to run tests using Spark Context running in the background. 33 | 34 | Implementation: 35 | We create a lock file that will contain Python gateway port (exposed by JVM). 
36 | 37 | On the first run: 38 | - initialise Spark Context as usual; 39 | - write Python gateway port to the lock file; 40 | - fork current process. 41 | 42 | On the second run: 43 | - connect to the background JVM process using Python gateway port from the lock file; 44 | - recover Spark Context from JVM. 45 | """ 46 | LOCK_FILE_PATH = os.path.join(tempfile.gettempdir(), 'sparkly_instant_testing_lock') 47 | 48 | @classmethod 49 | def activate(cls): 50 | """Activate instant testing mode.""" 51 | if os.path.exists(cls.LOCK_FILE_PATH): 52 | logger.error('Instant testing mode is already activate, deactivate it first.') 53 | else: 54 | with open(cls.LOCK_FILE_PATH, 'w'): 55 | logger.info('Instant testing mode has been activated.') 56 | 57 | @classmethod 58 | def deactivate(cls): 59 | """Deactivate instance testing mode.""" 60 | if not os.path.exists(cls.LOCK_FILE_PATH): 61 | logger.error('Instant testing mode is not activated, activate it first.') 62 | else: 63 | try: 64 | with open(cls.LOCK_FILE_PATH) as lock: 65 | state = lock.read() 66 | if state: 67 | session_pid = json.loads(state)['session_pid'] 68 | try: 69 | os.kill(session_pid, signal.SIGTERM) 70 | except OSError: 71 | logger.exception( 72 | 'Can not kill background SparkContext (pid %d)', session_pid, 73 | ) 74 | else: 75 | logger.info( 76 | 'Killed background SparkContext (pid %d)', session_pid, 77 | ) 78 | finally: 79 | try: 80 | os.remove(cls.LOCK_FILE_PATH) 81 | except OSError: 82 | logger.exception('Can not remove lock file: %s', cls.LOCK_FILE_PATH) 83 | 84 | logger.info('Instant testing mode has been deactivated.') 85 | 86 | @classmethod 87 | def is_activated(cls): 88 | """Check if instant testing has been activated before. 89 | 90 | Returns: 91 | bool 92 | """ 93 | return os.path.exists(cls.LOCK_FILE_PATH) 94 | 95 | @classmethod 96 | def set_context(cls, spark_context): 97 | """Set the given spark context for instant testing. 98 | 99 | Args: 100 | spark_context (pyspark.SparkContext) 101 | """ 102 | assert cls.is_activated() 103 | 104 | gateway_port = spark_context._gateway.java_gateway_server.getListeningPort() 105 | 106 | # pid of the python process that holds JVM with running Spark Context. 107 | session_pid = os.getpid() 108 | 109 | with open(cls.LOCK_FILE_PATH, 'w') as lock: 110 | json.dump( 111 | { 112 | 'gateway_port': gateway_port, 113 | 'session_pid': session_pid, 114 | 'gateway_secret': getattr( 115 | spark_context._gateway.gateway_parameters, 'auth_token', None, 116 | ), 117 | }, 118 | lock, 119 | ) 120 | logger.info( 121 | 'Successfully set spark context for the instant testing [pid=%s, gateway=%s]', 122 | session_pid, gateway_port 123 | ) 124 | 125 | @classmethod 126 | def get_context(cls): 127 | """Get the current global spark context. 128 | 129 | Returns: 130 | pyspark.SparkContext or None (if wasn't set before). 131 | """ 132 | assert cls.is_activated() 133 | 134 | state = None 135 | 136 | with open(cls.LOCK_FILE_PATH) as lock: 137 | serialised_state = lock.read() 138 | if serialised_state: 139 | try: 140 | state = json.loads(serialised_state) 141 | except ValueError: 142 | logger.error( 143 | 'Unable to deserialize lock file. Try to reactivate instant testing. 
' 144 | 'The broken content is: %s', 145 | serialised_state, 146 | ) 147 | 148 | if state: 149 | logger.info( 150 | 'Recovering context for the instant testing [pid=%s, gateway=%s]', 151 | state['session_pid'], state['gateway_port'], 152 | ) 153 | 154 | os.environ['PYSPARK_GATEWAY_PORT'] = str(state['gateway_port']) 155 | os.environ['PYSPARK_GATEWAY_SECRET'] = str(state['gateway_secret']) 156 | gateway = launch_gateway() 157 | java_import(gateway.jvm, 'org.apache.spark.SparkContext') 158 | jvm_spark_context = gateway.jvm.SparkContext.getOrCreate() 159 | jvm_java_spark_context = gateway.jvm.JavaSparkContext(jvm_spark_context) 160 | 161 | SparkContext._gateway = gateway 162 | SparkContext._jvm = gateway.jvm 163 | 164 | return SparkContext( 165 | appName=jvm_spark_context.appName(), 166 | master=jvm_spark_context.master(), 167 | gateway=gateway, 168 | jsc=jvm_java_spark_context, 169 | ) 170 | -------------------------------------------------------------------------------- /sparkly/session.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import atexit 18 | from copy import deepcopy 19 | import os 20 | import signal 21 | import sys 22 | import time 23 | import uuid 24 | 25 | from pyspark import SparkContext 26 | from pyspark.conf import SparkConf 27 | from pyspark.sql import SparkSession 28 | 29 | from sparkly.catalog import SparklyCatalog 30 | from sparkly.instant_testing import InstantTesting 31 | from sparkly.reader import SparklyReader 32 | from sparkly.writer import attach_writer_to_dataframe 33 | 34 | 35 | class SparklySession(SparkSession): 36 | """Wrapper around SparkSession to simplify definition of options, packages, JARs and UDFs. 37 | 38 | Example:: 39 | 40 | from pyspark.sql.types import IntegerType 41 | import sparkly 42 | 43 | 44 | class MySession(sparkly.SparklySession): 45 | options = {'spark.sql.shuffle.partitions': '2000'} 46 | repositories = ['http://packages.confluent.io/maven/'] 47 | packages = ['com.databricks:spark-csv_2.10:1.4.0'] 48 | jars = ['../path/to/brickhouse-0.7.1.jar'] 49 | udfs = { 50 | 'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF', 51 | 'my_python_udf': (lambda x: len(x), IntegerType()), 52 | } 53 | 54 | 55 | spark = MySession() 56 | spark.read_ext.cassandra(...) 57 | 58 | # Alternatively 59 | spark = MySession.get_or_create() 60 | spark.read_ext.cassandra(...) 61 | 62 | Attributes: 63 | options (dict[str,str]): Configuration options that are passed to spark-submit. 64 | See `the list of possible options 65 | `_. 66 | Note that any options set already through PYSPARK_SUBMIT_ARGS will override 67 | these. 68 | repositories (list[str]): List of additional maven repositories for package lookup. 69 | packages (list[str]): Spark packages that should be installed. 
70 | See https://spark-packages.org/ 71 | jars (list[str]): Full paths to jar files that we want to include to the session. 72 | E.g. a JDBC connector or a library with UDF functions. 73 | udfs (dict[str,str|typing.Callable]): Register UDF functions within the session. 74 | Key - a name of the function, 75 | Value - either a class name imported from a JAR file 76 | or a tuple with python function and its return type. 77 | name (str): a name that is used in default app_id_template (see below) 78 | app_id_template (str|None): if set and nonempty, generate the `spark.app.id` with 79 | this template. Interpolation is available with some pre-defined variables: 80 | * initial_time: the time that the first session started 81 | * initial_uid: a unique id associated with the first session 82 | * session_time: the time the session started 83 | * session_uid: a unique id associated with the session 84 | A default value is provided using the name, initial-uid and session-time. 85 | This helps a specific use case when running in Kubernetes: when a session 86 | is restarted, the same app-id is used, breaking storage of spark-history data 87 | (only the first session will have its history stored, unless overwrite mode 88 | is used, in which case only the last session will have its history stored). 89 | By defaulting to using the initial-uid and session-time information, we get 90 | sane "grouping" of all sessions originating from the same initial session, but also 91 | achieve separate individual app ids so that history for each can be maintained. 92 | To disable this functionality entirely, simply set to None or emptystring. 93 | Finally, if a user manually specifies `spark.app.id`, then that value will 94 | always trump any template provided here. 95 | """ 96 | name = 'sparkly' 97 | options = {} 98 | packages = [] 99 | jars = [] 100 | udfs = {} 101 | repositories = [] 102 | app_id_template = '{name}-{initial_uid}-{session_time}' 103 | 104 | _instantiated_session = None 105 | _original_environment = None 106 | 107 | _initial_time = None 108 | _initial_uid = None 109 | 110 | def __init__(self, additional_options=None): 111 | SparklySession._original_environment = deepcopy(os.environ) 112 | os.environ['PYSPARK_PYTHON'] = sys.executable 113 | 114 | self._initial_time = self._initial_time or int(time.time()) 115 | self._initial_uid = self._initial_uid or uuid.uuid4().hex 116 | self._session_time = int(time.time()) 117 | self._session_uid = uuid.uuid4().hex 118 | 119 | options = { 120 | 'spark.sql.catalogImplementation': 'hive', 121 | } 122 | app_id_template = self.app_id_template 123 | if app_id_template: 124 | options.update({ 125 | 'spark.app.id': app_id_template.format( 126 | name=self.name, 127 | initial_time=self._initial_time, 128 | initial_uid=self._initial_uid, 129 | session_time=self._session_time, 130 | session_uid=self._session_uid, 131 | ), 132 | }) 133 | options.update(self.options or {}) 134 | options.update(additional_options or {}) 135 | options = {str(key): str(value) for key, value in options.items()} 136 | 137 | submit_args = [ 138 | # options that were already defined through PYSPARK_SUBMIT_ARGS 139 | # take precedence over SparklySession's 140 | os.environ.get('PYSPARK_SUBMIT_ARGS', '').replace('pyspark-shell', ''), 141 | self._setup_repositories(), 142 | self._setup_packages(), 143 | self._setup_jars(), 144 | self._setup_options(options), 145 | 'pyspark-shell', 146 | ] 147 | os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(filter(None, submit_args)) 148 | 149 | def get_context(): 150 | 
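# This nested helper builds a brand-new SparkContext from the merged options
# via SparkConf. It is kept as a function rather than called immediately,
# because in instant testing mode (handled just below) a previously created
# context may be recovered instead of creating a new one.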
conf = SparkConf() 151 | conf.setAll(options.items()) 152 | return SparkContext(conf=conf) 153 | 154 | # If we are in instant testing mode 155 | if InstantTesting.is_activated(): 156 | context = InstantTesting.get_context() 157 | 158 | # It's the first run, so we have to create the context and daemonise the process. 159 | if context is None: 160 | context = get_context() 161 | if os.fork() == 0: # Detached process. 162 | signal.pause() 163 | else: 164 | InstantTesting.set_context(context) 165 | else: 166 | context = get_context() 167 | 168 | super(SparklySession, self).__init__(context) 169 | 170 | # Similar to the session builder: propagate the options to the running session state as well. 171 | for key, value in options.items(): 172 | self._jsparkSession.sessionState().conf().setConfString(key, value) 173 | 174 | self._setup_udfs() 175 | 176 | self.read_ext = SparklyReader(self) 177 | self.catalog_ext = SparklyCatalog(self) 178 | 179 | attach_writer_to_dataframe() 180 | SparklySession._instantiated_session = self 181 | 182 | @classmethod 183 | def get_or_create(cls): 184 | """Access the instantiated sparkly session. 185 | 186 | If a sparkly session has already been instantiated, return that 187 | instance; if not, instantiate one and return it. Useful 188 | for lazy access to the session. Not thread-safe. 189 | 190 | Returns: 191 | SparklySession (or subclass). 192 | """ 193 | if SparklySession._instantiated_session is None: 194 | cls() 195 | return SparklySession._instantiated_session 196 | 197 | @classmethod 198 | def stop(cls): 199 | """Stop the instantiated sparkly session.""" 200 | if SparklySession._instantiated_session is not None: 201 | SparkSession.stop(SparklySession._instantiated_session) 202 | SparklySession._instantiated_session = None 203 | os.environ = SparklySession._original_environment 204 | SparklySession._original_environment = None 205 | 206 | @property 207 | def builder(self): 208 | raise NotImplementedError( 209 | 'You do not need a builder for SparklySession. ' 210 | 'Just use a regular python constructor. ' 211 | 'Please follow the documentation for more details.' 
212 | ) 213 | 214 | def _setup_repositories(self): 215 | if self.repositories: 216 | return '--repositories {}'.format(','.join(self.repositories)) 217 | else: 218 | return '' 219 | 220 | def _setup_packages(self): 221 | if self.packages: 222 | return '--packages {}'.format(','.join(self.packages)) 223 | else: 224 | return '' 225 | 226 | def _setup_jars(self): 227 | if self.jars: 228 | return '--jars {}'.format(','.join(self.jars)) 229 | else: 230 | return '' 231 | 232 | def _setup_options(self, options): 233 | # Here we massage conf properties with the intent to pass them to 234 | # spark-submit; this is convenient as it is unified with the approach 235 | # we take for repos, packages and jars, and it also handles precedence 236 | # of conf properties already defined by the user in a very 237 | # straightforward way (since we always append to PYSPARK_SUBMIT_ARGS) 238 | return ' '.join('--conf "{}={}"'.format(*o) for o in sorted(options.items())) 239 | 240 | def _setup_udfs(self): 241 | for name, defn in self.udfs.items(): 242 | if isinstance(defn, str): 243 | self.sql('create temporary function {} as "{}"'.format(name, defn)) 244 | elif isinstance(defn, tuple): 245 | self.udf.register(name, *defn) 246 | else: 247 | raise NotImplementedError('Incorrect UDF definition: {}: {}'.format(name, defn)) 248 | 249 | 250 | # https://issues.apache.org/jira/browse/SPARK-27927 251 | # Spark on Kubernetes has an issue where the python process finishes, 252 | # but the controlling java process just hangs, so nothing terminates. 253 | # There is a simple workaround to stop the session prior to python termination. 254 | # We do that here with an atexit registration. 255 | atexit.register(SparklySession.stop) 256 | -------------------------------------------------------------------------------- /sparkly/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import functools 18 | import inspect 19 | from itertools import islice 20 | import os 21 | import re 22 | 23 | try: 24 | from kafka import KafkaAdminClient, KafkaConsumer, TopicPartition 25 | import kafka.admin 26 | except ImportError: 27 | pass 28 | 29 | import pylru 30 | from pyspark import StorageLevel 31 | from pyspark.sql import DataFrame 32 | from pyspark.sql import types as T 33 | 34 | from sparkly.exceptions import UnsupportedDataType 35 | 36 | 37 | def absolute_path(file_path, *rel_path): 38 | """Return absolute path to file. 
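The ``rel_path`` parts are joined onto the directory that contains ``file_path``,
    which makes the helper convenient for locating test resources next to a module
    (typically called as ``absolute_path(__file__, ...)``).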
39 | 40 | Usage: 41 | >>> absolute_path('/my/current/dir/x.txt', '..', 'x.txt') 42 | '/my/current/x.txt' 43 | 44 | >>> absolute_path('/my/current/dir/x.txt', 'relative', 'path') 45 | '/my/current/dir/relative/path' 46 | 47 | >>> import os 48 | >>> absolute_path('x.txt', 'relative/path') == os.getcwd() + '/relative/path' 49 | True 50 | 51 | Args: 52 | file_path (str): file 53 | rel_path (list[str]): path parts 54 | 55 | Returns: 56 | str 57 | """ 58 | return os.path.abspath( 59 | os.path.join( 60 | os.path.dirname( 61 | os.path.realpath(file_path) 62 | ), 63 | *rel_path 64 | ) 65 | ) 66 | 67 | 68 | def kafka_get_topics_offsets(host, topic, port=9092): 69 | """Return available partitions and their offsets for the given topic. 70 | 71 | Args: 72 | host (str): Kafka host. 73 | topic (str): Kafka topic. 74 | port (int): Kafka port. 75 | 76 | Returns: 77 | [(int, int, int)]: [(partition, start_offset, end_offset)]. 78 | """ 79 | brokers = ['{}:{}'.format(host, port)] 80 | consumer = KafkaConsumer(bootstrap_servers=brokers) 81 | partitions = consumer.partitions_for_topic(topic) 82 | offsets = [] 83 | if partitions: 84 | topic_partitions = [TopicPartition(topic, p) for p in partitions] 85 | start_offsets_raw = consumer.beginning_offsets(topic_partitions) 86 | end_offsets_raw = consumer.end_offsets(topic_partitions) 87 | start_offsets = {tp.partition: offset for tp, offset in start_offsets_raw.items()} 88 | end_offsets = {tp.partition: offset for tp, offset in end_offsets_raw.items()} 89 | offsets = [ 90 | (partition, start_offsets[partition], end_offsets[partition]) 91 | for partition in start_offsets 92 | ] 93 | 94 | return offsets 95 | 96 | def kafka_create_topic(host, topic, port=9092, num_partitions=2, replication_factor=1): 97 | """Creates Kafka topic. 98 | 99 | Args: 100 | host (str): Kafka host. 101 | topic (str): Kafka topic. 102 | port (int): Kafka port. 103 | num_partitions (int): Number of topic's partitions. 104 | replication_factor (int): Number of partition's replicas. 105 | """ 106 | kafka_admin = KafkaAdminClient(bootstrap_servers=f'{host}:{port}') 107 | kafka_admin.create_topics([ 108 | kafka.admin.NewTopic( 109 | name=topic, 110 | num_partitions=num_partitions, 111 | replication_factor=replication_factor, 112 | ), 113 | ]) 114 | 115 | 116 | class lru_cache(object): 117 | """LRU cache that supports DataFrames. 118 | 119 | Enables caching of both the dataframe object and the data that df 120 | contains by persisting it according to user specs. It's the user's 121 | responsibility to make sure that the dataframe contents are not 122 | evicted from memory and/or disk should this feature get overused. 123 | 124 | Args: 125 | maxsize (int|128): maximum number of items to cache. 126 | storage_level (pyspark.StorageLevel|MEMORY_ONLY): how to cache 127 | the contents of a dataframe (only used when the cached 128 | function results in a dataframe). 
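Example (an illustrative sketch; assumes an existing ``SparklySession``
        named ``spark`` and a parquet dataset at the path shown)::

            from pyspark import StorageLevel
            from sparkly.utils import lru_cache

            @lru_cache(maxsize=16, storage_level=StorageLevel.MEMORY_AND_DISK)
            def daily_events(day):
                # Heavy read: the resulting dataframe is persisted and memoised.
                return spark.read.parquet('/tmp/events/{}'.format(day))

            daily_events('2017-01-01')  # reads, persists and caches the dataframe
            daily_events('2017-01-01')  # returned straight from the LRU cache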
129 | """ 130 | def __init__(self, maxsize=128, storage_level=StorageLevel.MEMORY_ONLY): 131 | self.maxsize = maxsize 132 | self.storage_level = storage_level 133 | 134 | def __call__(self, func): 135 | # Whenever an object is evicted from the cache we want to 136 | # unpersist its contents too if it's a dataframe 137 | def eviction_callback(key, value): 138 | if isinstance(value, DataFrame): 139 | value.unpersist() 140 | 141 | lru_decorator = pylru.lrudecorator(self.maxsize) 142 | lru_decorator.cache.callback = eviction_callback 143 | 144 | @lru_decorator 145 | @functools.wraps(func) 146 | def func_and_persist(*args, **kwargs): 147 | result = func(*args, **kwargs) 148 | if isinstance(result, DataFrame): 149 | result.persist(self.storage_level) 150 | return result 151 | 152 | return func_and_persist 153 | 154 | 155 | def parse_schema(schema): 156 | """Generate schema by its string definition. 157 | 158 | It's basically the opposite of the `DataType.simpleString` method. 159 | Supports all atomic types (like string, int, float...) and complex types (array, map, struct) 160 | except DecimalType. 161 | 162 | Usages: 163 | >>> parse_schema('string') 164 | StringType 165 | >>> parse_schema('int') 166 | IntegerType 167 | >>> parse_schema('array<int>') 168 | ArrayType(IntegerType,true) 169 | >>> parse_schema('map<string,int>') 170 | MapType(StringType,IntegerType,true) 171 | >>> parse_schema('struct<a:int,b:string>') 172 | StructType(List(StructField(a,IntegerType,true),StructField(b,StringType,true))) 173 | >>> parse_schema('unsupported') 174 | Traceback (most recent call last): 175 | ... 176 | sparkly.exceptions.UnsupportedDataType: Cannot parse schema: unsupported: ... 177 | """ 178 | try: 179 | return T._parse_datatype_string(schema) 180 | except Exception as e: 181 | raise UnsupportedDataType(f'Cannot parse schema: {schema}: {e}') 182 | 183 | def schema_has(t, required_fields): 184 | """Check whether a complex dataType has specific fields. 185 | 186 | Args: 187 | t (pyspark.sql.types.ArrayType, MapType, StructType): type to 188 | check. 189 | required_fields (same type as t, or dict[str, pyspark.sql.DataType]): 190 | fields that need to be present in t. For convenience, a user 191 | can define a ``dict`` in place of a 192 | ``pyspark.sql.types.StructType``, but other than that this 193 | argument must have the same type as t. 194 | 195 | Raises: 196 | AssertionError: if t and required_fields cannot be compared 197 | because they aren't instances of the same complex dataType. 198 | KeyError: if a required field is not found in the struct. 199 | TypeError: if a required field exists but its actual type does 200 | not match the required one. 
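Example (a minimal sketch using the module's ``T`` alias for
        ``pyspark.sql.types``)::

            >>> schema_has(
            ...     T.StructType([
            ...         T.StructField('uid', T.StringType()),
            ...         T.StructField('countries', T.MapType(T.StringType(), T.IntegerType())),
            ...     ]),
            ...     {'uid': T.StringType()},
            ... )
            True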
201 | """ 202 | if isinstance(required_fields, dict): 203 | required_fields = T.StructType([ 204 | T.StructField(*field_def) for field_def in required_fields.items() 205 | ]) 206 | 207 | assert type(t) == type(required_fields), 'Cannot compare heterogeneous types' 208 | 209 | def _unpack(t): 210 | if isinstance(t, T.ArrayType): 211 | return {'element': t.elementType} 212 | elif isinstance(t, T.MapType): 213 | return {'key': t.keyType, 'value': t.valueType} 214 | elif isinstance(t, T.StructType): 215 | return {field.name: field.dataType for field in t.fields} 216 | return {} 217 | 218 | def _is_complex(t): 219 | return isinstance(t, (T.ArrayType, T.MapType, T.StructType)) 220 | 221 | existing_fields = _unpack(t) 222 | required_fields = _unpack(required_fields) 223 | 224 | for required_field, required_type in required_fields.items(): 225 | try: 226 | current_type = existing_fields[required_field] 227 | except KeyError: 228 | raise KeyError(required_field) 229 | 230 | if _is_complex(current_type): 231 | try: 232 | schema_has(current_type, required_type) 233 | except (KeyError, TypeError) as e: 234 | raise type(e)('{}.{}'.format(required_field, e.args[0])) 235 | except AssertionError: 236 | pass 237 | else: 238 | continue 239 | 240 | if required_type != current_type: 241 | raise TypeError( 242 | '{} is {}, expected {}'.format(required_field, current_type, required_type) 243 | ) 244 | 245 | return True 246 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /tests/integration/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | from pyspark.sql.types import StringType 20 | 21 | import pyspark 22 | from sparkly import SparklySession 23 | from sparkly.utils import absolute_path 24 | 25 | 26 | class SparklyTestSession(SparklySession): 27 | packages = [ 28 | 'com.datastax.spark:spark-cassandra-connector_2.12:3.2.0', 29 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8', 30 | 'org.apache.spark:spark-sql-kafka-0-10_2.12:{}'.format(pyspark.__version__), 31 | 'mysql:mysql-connector-java:8.0.31', 32 | 'io.confluent:kafka-avro-serializer:3.0.1', 33 | ] 34 | 35 | repositories = [ 36 | 'http://packages.confluent.io/maven/', 37 | ] 38 | 39 | jars = [ 40 | absolute_path(__file__, 'resources', 'brickhouse-0.7.1.jar'), 41 | ] 42 | 43 | udfs = { 44 | 'collect': 'brickhouse.udf.collect.CollectUDAF', 45 | 'length_of_text': (lambda text: len(text), StringType()) 46 | } 47 | 48 | options = { 49 | 'my.custom.option.1': '117', 50 | 'my.custom.option.2': 223, 51 | # will be overwritten by additional_options passed in setup_session 52 | 'my.custom.option.3': '319', 53 | } 54 | 55 | 56 | class SparklyTestSessionWithOldCatalog(SparklyTestSession): 57 | options = { 58 | 'spark.sql.legacy.keepCommandOutputSchema': 'true', 59 | } 60 | -------------------------------------------------------------------------------- /tests/integration/fake_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tubular/sparkly/393d1342ade404461a41d42e730e8571f92fcd28/tests/integration/fake_modules/__init__.py -------------------------------------------------------------------------------- /tests/integration/fake_modules/testing.py: -------------------------------------------------------------------------------- 1 | def is_fake(): 2 | return True 3 | -------------------------------------------------------------------------------- /tests/integration/resources/brickhouse-0.7.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tubular/sparkly/393d1342ade404461a41d42e730e8571f92fcd28/tests/integration/resources/brickhouse-0.7.1.jar -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/cassandra_setup.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS sparkly_test; 2 | 3 | CREATE KEYSPACE sparkly_test WITH replication = 4 | {'class': 'SimpleStrategy', 'replication_factor': '1'}; 5 | 6 | CREATE TABLE sparkly_test.test ( 7 | uid text, 8 | created text, 9 | countries map, 10 | PRIMARY KEY (uid, created) 11 | ); 12 | 13 | INSERT INTO sparkly_test.test (uid, created, countries) VALUES ('1', '1234567899', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 14 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/cassandra_teardown.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE 
sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/data.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test_fixture", "_type" : "test", "_id": "1" } } 2 | { "name" : "John", "age": 56} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/data_for_es7.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test_fixture", "_id": "1" } } 2 | { "name" : "John", "age": 56} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/kafka.json: -------------------------------------------------------------------------------- 1 | {"key": {"name": "johny"}, "value": {"name": "johny", "surname": "cage"}} 2 | {"key": {"name": "johny"}, "value": {"name": "johny", "surname": "smith"}} 3 | {"key": {"name": "aron"}, "value": {"name": "aron", "surname": "ramsey"}} 4 | {"key": {"name": "killy"}, "value": {"name": "killy", "surname": "gonsales"}} 5 | {"key": {"name": "shefkey"}, "value": {"name": "shefkey", "surname": "kuki"}} 6 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "name": { 4 | "type": "text" 5 | }, 6 | "age": { 7 | "type": "integer" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/mysql_setup.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE if not exists sparkly_test; 2 | 3 | CREATE TABLE sparkly_test.test ( 4 | id int, 5 | name varchar(30), 6 | surname varchar(40), 7 | age int, 8 | primary key (id) 9 | ); 10 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (1, 'john', 'sk', 111); 11 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/mysql_teardown.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_read/cassandra_setup.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS sparkly_test; 2 | 3 | CREATE KEYSPACE sparkly_test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}; 4 | 5 | CREATE TABLE sparkly_test.test ( 6 | uid text, 7 | created text, 8 | countries map, 9 | PRIMARY KEY (uid, created) 10 | ); 11 | 12 | INSERT INTO sparkly_test.test (uid, created, countries) 13 | VALUES ('1', '1234567894', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 14 | 15 | INSERT INTO sparkly_test.test (uid, created, countries) 16 | VALUES ('2', '1234567893', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 17 | 18 | INSERT INTO sparkly_test.test (uid, created, countries) 19 | VALUES ('3', '1234567891', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 20 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/cassandra_teardown.cql: 
-------------------------------------------------------------------------------- 1 | DROP KEYSPACE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_read/elastic7_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_id": "1" } } 2 | { "name" : "John2", "topics": [1, 2, 3, 4, 5], "age": 56, "demo": { "age_30": 20, "age_10": 50 } } 3 | { "index" : { "_index" : "sparkly_test", "_id": "2" } } 4 | { "name" : "Smith3", "topics": [1, 4, 5], "age": 31, "demo": { "age_30": 110, "age_10": 50 } } 5 | { "index" : { "_index" : "sparkly_test", "_id": "3" } } 6 | { "name" : "Smith4", "topics": [4, 5], "age": 12, "demo": { "age_30": 20, "age_10": 1 } } 7 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/elastic_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_type" : "test", "_id": "1" } } 2 | { "name" : "John2", "topics": [1, 2, 3, 4, 5], "age": 56, "demo": { "age_30": 20, "age_10": 50 } } 3 | { "index" : { "_index" : "sparkly_test", "_type" : "test", "_id": "2" } } 4 | { "name" : "Smith3", "topics": [1, 4, 5], "age": 31, "demo": { "age_30": 110, "age_10": 50 } } 5 | { "index" : { "_index" : "sparkly_test", "_type" : "test", "_id": "3" } } 6 | { "name" : "Smith4", "topics": [4, 5], "age": 12, "demo": { "age_30": 20, "age_10": 1 } } 7 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/kafka_setup.json: -------------------------------------------------------------------------------- 1 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 1}} 2 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 2}} 3 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 3}} 4 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 4}} 5 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 5}} 6 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 6}} 7 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 7}} 8 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 8}} 9 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 9}} 10 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 10}} 11 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 11}} 12 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 12}} 13 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/mysql_setup.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE if not exists sparkly_test; 2 | 3 | CREATE TABLE sparkly_test.test ( 4 | id int, 5 | name varchar(30), 6 | surname varchar(40), 7 | age int, 8 | primary key (id) 9 | ); 10 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (1, 'john', 'sk', 111); 11 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (2, 'john', 'po', 222); 12 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (3, 'john', 'ku', 333); 13 | 
-------------------------------------------------------------------------------- /tests/integration/resources/test_read/mysql_teardown.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_testing/kafka_watcher_1.json: -------------------------------------------------------------------------------- 1 | {"key": {"user_id": 1}, "value": {"meal": "dinner", "food": ["spaghetti", "meatballs"]}} 2 | {"key": {"user_id": 2}, "value": {"meal": "lunch", "food": ["soylent"]}} 3 | {"key": {"user_id": 3}, "value": {"meal": "breakfast", "food": []}} 4 | {"key": {"user_id": 2}, "value": {"meal": "second dinner", "food": ["galbi", "ice cream"]}} 5 | -------------------------------------------------------------------------------- /tests/integration/resources/test_testing/kafka_watcher_2.json: -------------------------------------------------------------------------------- 1 | {"key": {"user_id": 1}, "value": {"meal": "lunch", "food": ["pizza", "stinky tofu"]}} 2 | {"key": {"user_id": 4}, "value": {"meal": "lunch", "food": ["cuban sandwich", "mashed potatoes"]}} 3 | {"key": {"user_id": 5}, "value": {"meal": "dessert", "food": ["pecan pie", "mango"]}} 4 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/cassandra_setup.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS sparkly_test; 2 | 3 | CREATE KEYSPACE sparkly_test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}; 4 | 5 | CREATE TABLE sparkly_test.test_writer ( 6 | uid text, 7 | title text, 8 | views bigint, 9 | PRIMARY KEY (uid) 10 | ); 11 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/cassandra_teardown.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_write/elastic7_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_id": "1111" } } 2 | { "uid": "1111", "title": "xxxx", "views": 1111} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/elastic_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_type" : "test_writer", "_id": "1111" } } 2 | { "uid": "1111", "title": "xxxx", "views": 1111} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/kafka_setup.json: -------------------------------------------------------------------------------- 1 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 1}} 2 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 2}} 3 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 3}} 4 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 4}} 5 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 5}} 6 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 
6}} 7 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 7}} 8 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 8}} 9 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 9}} 10 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 10}} 11 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 11}} 12 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 12}} 13 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/mysql_setup.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE if not exists sparkly_test; 2 | CREATE TABLE sparkly_test.test_writer ( 3 | uid varchar(30), 4 | title varchar(40), 5 | views int, 6 | primary key (uid) 7 | ); 8 | INSERT INTO sparkly_test.test_writer (uid, title, views) VALUES ('1111', '1111', 999); 9 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/mysql_teardown.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/test_catalog.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from sparkly.testing import SparklyGlobalSessionTest 18 | from tests.integration.base import SparklyTestSession, SparklyTestSessionWithOldCatalog 19 | from sparkly.catalog import read_db_properties_format 20 | 21 | 22 | class TestSparklyCatalog(SparklyGlobalSessionTest): 23 | session = SparklyTestSession 24 | 25 | def setUp(self): 26 | self.spark.catalog_ext.drop_table('test_table') 27 | 28 | if self.spark.catalog_ext.has_database('test_db'): 29 | for table in self.spark.catalog.listTables('test_db'): 30 | self.spark.catalog_ext.drop_table('test_db.{}'.format(table.name)) 31 | self.spark.sql('DROP DATABASE test_db') 32 | 33 | df = self.spark.createDataFrame([('row_1', 1), ('row_2', 2)], schema=('a', 'b')) 34 | df.write.saveAsTable('test_table', format='parquet', location='/tmp/test_table') 35 | 36 | self.spark.catalog_ext.set_table_property('test_table', 'property_a', 'str_value') 37 | self.spark.catalog_ext.set_table_property('test_table', 'property_b', 2) 38 | 39 | self.spark.sql('CREATE DATABASE test_db') 40 | df.write.saveAsTable('test_db.test_table', format='parquet', location='/tmp/test_table') 41 | self.spark.catalog_ext.set_table_property('test_db.test_table', 42 | 'property_a', 43 | 'str_value') 44 | self.spark.catalog_ext.set_table_property('test_db.test_table', 45 | 'property_b', 46 | 2) 47 | 48 | def test_has_database(self): 49 | self.assertTrue(self.spark.catalog_ext.has_database('test_db')) 50 | self.assertFalse(self.spark.catalog_ext.has_database('not_exists')) 51 | 52 | def test_create_table_when_exists(self): 53 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 54 | 55 | new_df = self.spark.createDataFrame([('row_5', 'hi')], schema=('c', 'd')) 56 | new_df.write.save('/tmp/test_table_2', format='parquet', mode='overwrite') 57 | 58 | self.spark.catalog_ext.create_table( 59 | 'test_table', 60 | path='/tmp/test_table_2', 61 | schema=new_df.schema, 62 | mode='overwrite', 63 | ) 64 | 65 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 66 | 67 | new_table = self.spark.table('test_table') 68 | self.assertEqual( 69 | [r.asDict() for r in new_table.collect()], 70 | [{'c': 'row_5', 'd': 'hi'}], 71 | ) 72 | 73 | def test_drop_table(self): 74 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 75 | 76 | self.spark.catalog_ext.drop_table('test_table') 77 | 78 | self.assertFalse(self.spark.catalog_ext.has_table('test_table')) 79 | 80 | def test_drop_table_non_default_db(self): 81 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.test_table')) 82 | 83 | self.spark.catalog_ext.drop_table('test_db.test_table') 84 | 85 | self.assertFalse(self.spark.catalog_ext.has_table('test_db.test_table')) 86 | 87 | def test_has_table(self): 88 | self.assertFalse(self.spark.catalog_ext.has_table(None)) 89 | self.assertFalse(self.spark.catalog_ext.has_table('')) 90 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 91 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.test_table')) 92 | self.assertFalse(self.spark.catalog_ext.has_table('test_unknown_table')) 93 | self.assertFalse(self.spark.catalog_ext.has_table('non_exists.test_unknown_table')) 94 | 95 | def test_rename_table(self): 96 | self.spark.catalog_ext.drop_table('new_test_table') 97 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 98 | self.assertFalse(self.spark.catalog_ext.has_table('new_test_table')) 99 | 100 | self.spark.catalog_ext.rename_table('test_table', 'new_test_table') 101 | 102 | 
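# After the rename the old name should no longer resolve, while the new name
# should expose the same two rows written in setUp.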
self.assertFalse(self.spark.catalog_ext.has_table('test_table')) 103 | self.assertTrue(self.spark.catalog_ext.has_table('new_test_table')) 104 | self.assertEqual(self.spark.table('new_test_table').count(), 2) 105 | 106 | def test_rename_table_non_default_db(self): 107 | self.spark.catalog_ext.drop_table('test_db.new_test_table') 108 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.test_table')) 109 | self.assertFalse(self.spark.catalog_ext.has_table('test_db.new_test_table')) 110 | 111 | self.spark.catalog_ext.rename_table('test_db.test_table', 'test_db.new_test_table') 112 | 113 | self.assertFalse(self.spark.catalog_ext.has_table('test_db.test_table')) 114 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.new_test_table')) 115 | self.assertEqual(self.spark.table('test_db.new_test_table').count(), 2) 116 | 117 | def test_get_table_properties(self): 118 | properties = self.spark.catalog_ext.get_table_properties('test_table') 119 | 120 | self.assertEqual(properties.get('property_a'), 'str_value') 121 | self.assertEqual(properties.get('property_b'), '2') 122 | 123 | def test_get_table_property(self): 124 | self.assertEqual( 125 | self.spark.catalog_ext.get_table_property('test_table', 'property_a'), 126 | 'str_value', 127 | ) 128 | self.assertEqual( 129 | self.spark.catalog_ext.get_table_property('test_db.test_table', 'property_a'), 130 | 'str_value', 131 | ) 132 | 133 | def test_get_table_property_to_type(self): 134 | self.assertEqual( 135 | self.spark.catalog_ext.get_table_property('test_table', 'property_b', to_type=int), 136 | 2, 137 | ) 138 | self.assertEqual( 139 | self.spark.catalog_ext.get_table_property('test_db.test_table', 140 | 'property_b', 141 | to_type=int), 142 | 2, 143 | ) 144 | 145 | def test_get_table_property_unknown(self): 146 | self.assertIsNone(self.spark.catalog_ext.get_table_property('test_table', 'unknown')) 147 | self.assertIsNone( 148 | self.spark.catalog_ext.get_table_property('test_db.test_table', 'unknown') 149 | ) 150 | 151 | def test_set_database_property_with_prohibited_symbols(self): 152 | with self.assertRaises(ValueError): 153 | self.spark.catalog_ext.set_database_property('test_db', 'broken,key', 'normal_value') 154 | 155 | with self.assertRaises(ValueError): 156 | self.spark.catalog_ext.set_database_property('test_db', 'normal_key', 'broken(value)') 157 | 158 | def test_get_database_property(self): 159 | self.spark.catalog_ext.set_database_property('test_db', 'property_a', 'just,a,string') 160 | self.spark.catalog_ext.set_database_property('test_db', 'property_b', '123') 161 | 162 | self.assertEqual( 163 | self.spark.catalog_ext.get_database_property('test_db', 'property_a'), 164 | 'just,a,string', 165 | ) 166 | self.assertEqual( 167 | self.spark.catalog_ext.get_database_property('test_db', 'property_b', to_type=int), 168 | 123, 169 | ) 170 | self.assertIsNone( 171 | self.spark.catalog_ext.get_database_property('test_db', 'unknown_prop', to_type=int), 172 | ) 173 | 174 | def test_get_database_properties(self): 175 | self.spark.catalog_ext.set_database_property('test_db', 'property_a', 'just,a,string') 176 | self.spark.catalog_ext.set_database_property('test_db', 'property_b', '123') 177 | 178 | self.assertEqual(self.spark.catalog_ext.get_database_properties('test_db'), { 179 | 'property_a': 'just,a,string', 180 | 'property_b': '123', 181 | }) 182 | 183 | def test_read_db_properties_format_for_typical_input(self): 184 | self.assertEqual(read_db_properties_format('((a,b), (c,d))'), [['a', 'b'], ['c', 'd']]) 185 | 
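# A single key/value pair and an empty property list should parse as well.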
self.assertEqual(read_db_properties_format('((a,b))'), [['a', 'b']]) 186 | self.assertEqual(read_db_properties_format('()'), []) 187 | 188 | def test_read_db_properties_format_for_broken_input(self): 189 | with self.assertRaises(ValueError): 190 | read_db_properties_format('((a, b), (c, d)') 191 | 192 | with self.assertRaises(ValueError): 193 | read_db_properties_format(')(a, b), (c, d)(') 194 | 195 | with self.assertRaises(ValueError): 196 | read_db_properties_format(')(') 197 | 198 | with self.assertRaises(ValueError): 199 | read_db_properties_format(')') 200 | 201 | 202 | class TestSparklyWithOldCatalog(TestSparklyCatalog): 203 | session = SparklyTestSessionWithOldCatalog 204 | -------------------------------------------------------------------------------- /tests/integration/test_instant_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | import os 19 | 20 | try: 21 | from unittest import mock 22 | except ImportError: 23 | import mock 24 | 25 | from pyspark import SparkContext 26 | 27 | from sparkly.instant_testing import InstantTesting 28 | from sparkly.testing import SparklyGlobalSessionTest 29 | from tests.integration.base import SparklyTestSession 30 | 31 | 32 | _MOCK_LOCK_FILE_PATH = InstantTesting.LOCK_FILE_PATH + '__test' 33 | 34 | 35 | @mock.patch.object(InstantTesting, 'LOCK_FILE_PATH', _MOCK_LOCK_FILE_PATH) 36 | class TestInstantTesting(SparklyGlobalSessionTest): 37 | session = SparklyTestSession 38 | 39 | def setUp(self): 40 | try: 41 | os.remove(_MOCK_LOCK_FILE_PATH) 42 | except: 43 | pass 44 | 45 | def test_set_context(self): 46 | InstantTesting.activate() 47 | InstantTesting.set_context(self.spark.sparkContext) 48 | 49 | with open(_MOCK_LOCK_FILE_PATH) as f: 50 | state = json.load(f) 51 | self.assertEqual(state, { 52 | 'gateway_port': 53 | self.spark.sparkContext._gateway.java_gateway_server.getListeningPort(), 54 | 'session_pid': os.getpid(), 55 | 'gateway_secret': getattr( 56 | self.spark.sparkContext._gateway.gateway_parameters, 'auth_token', None, 57 | ), 58 | }) 59 | 60 | def test_get_context(self): 61 | initial_context = self.spark.sparkContext 62 | 63 | InstantTesting.activate() 64 | InstantTesting.set_context(initial_context) 65 | 66 | with mock.patch.object(SparkContext, '_active_spark_context', None): 67 | recovered_context = InstantTesting.get_context() 68 | 69 | self.assertIsInstance(recovered_context, SparkContext) 70 | self.assertEqual(initial_context.appName, recovered_context.appName) 71 | self.assertEqual(initial_context.master, recovered_context.master) 72 | -------------------------------------------------------------------------------- /tests/integration/test_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import json 17 | import uuid 18 | 19 | from sparkly.exceptions import InvalidArgumentError 20 | from sparkly.testing import ( 21 | SparklyGlobalSessionTest, 22 | SparklyTest, 23 | CassandraFixture, 24 | MysqlFixture, 25 | ElasticFixture, 26 | KafkaFixture, 27 | ) 28 | from sparkly.utils import absolute_path, kafka_get_topics_offsets 29 | from tests.integration.base import ( 30 | SparklyTestSession, 31 | ) 32 | 33 | 34 | class SparklyReaderCassandraTest(SparklyGlobalSessionTest): 35 | session = SparklyTestSession 36 | 37 | fixtures = [ 38 | CassandraFixture( 39 | 'cassandra.docker', 40 | absolute_path(__file__, 'resources', 'test_read', 'cassandra_setup.cql'), 41 | absolute_path(__file__, 'resources', 'test_read', 'cassandra_teardown.cql'), 42 | ) 43 | ] 44 | 45 | def test_read(self): 46 | df = self.spark.read_ext.cassandra( 47 | host='cassandra.docker', 48 | port=9042, 49 | keyspace='sparkly_test', 50 | table='test', 51 | consistency='ONE', 52 | ) 53 | 54 | self.assertRowsEqual(df.collect(), [ 55 | { 56 | 'countries': {'DZ': 1, 'EG': 206, 'BE': 1, 'CA': 1, 'AE': 13, 'BH': 3}, 57 | 'uid': '1', 58 | 'created': '1234567894', 59 | }, 60 | { 61 | 'countries': {'DZ': 1, 'EG': 206, 'BE': 1, 'CA': 1, 'AE': 13, 'BH': 3}, 62 | 'uid': '2', 63 | 'created': '1234567893', 64 | }, 65 | { 66 | 'countries': {'DZ': 1, 'EG': 206, 'BE': 1, 'CA': 1, 'AE': 13, 'BH': 3}, 67 | 'uid': '3', 68 | 'created': '1234567891', 69 | } 70 | ]) 71 | 72 | 73 | ELASTIC_TEST_DATA = [ 74 | { 75 | 'name': 'Smith3', 76 | 'topics': [1, 4, 5], 77 | 'age': 31, 78 | 'demo': { 79 | 'age_30': 110, 80 | 'age_10': 50, 81 | } 82 | }, 83 | { 84 | 'name': 'Smith4', 85 | 'topics': [4, 5], 86 | 'age': 12, 87 | 'demo': { 88 | 'age_30': 20, 89 | 'age_10': 1, 90 | } 91 | } 92 | ] 93 | 94 | 95 | class SparklyReaderElasticTest(SparklyGlobalSessionTest): 96 | session = SparklyTestSession 97 | 98 | fixtures = [ 99 | ElasticFixture( 100 | 'elastic.docker', 101 | 'sparkly_test', 102 | None, 103 | None, 104 | absolute_path(__file__, 'resources', 'test_read', 'elastic7_setup.json'), 105 | ) 106 | ] 107 | 108 | def test_elastic(self): 109 | df = self.spark.read_ext.elastic( 110 | host='elastic.docker', 111 | port=9200, 112 | es_index='sparkly_test', 113 | es_type=None, 114 | query='?q=name:*Smith*', 115 | options={ 116 | 'es.read.field.as.array.include': 'topics', 117 | 'es.read.metadata': 'false', 118 | }, 119 | ) 120 | 121 | self.assertRowsEqual(df.collect(), ELASTIC_TEST_DATA) 122 | 123 | 124 | class SparklyReaderMySQLTest(SparklyGlobalSessionTest): 125 | session = SparklyTestSession 126 | 127 | fixtures = [ 128 | MysqlFixture( 129 | 'mysql.docker', 130 | 'root', 131 | None, 132 | absolute_path(__file__, 'resources', 'test_read', 'mysql_setup.sql'), 133 | absolute_path(__file__, 'resources', 'test_read', 'mysql_teardown.sql'), 134 | ) 135 | ] 136 | 137 | def test_read_mysql(self): 138 | df = self.spark.read_ext.mysql( 139 | host='mysql.docker', 140 | 
database='sparkly_test', 141 | table='test', 142 | options={ 143 | 'user': 'root', 144 | 'password': '', 145 | } 146 | ) 147 | 148 | self.assertRowsEqual(df.collect(), [ 149 | {'id': 1, 'name': 'john', 'surname': 'sk', 'age': 111}, 150 | {'id': 2, 'name': 'john', 'surname': 'po', 'age': 222}, 151 | {'id': 3, 'name': 'john', 'surname': 'ku', 'age': 333}, 152 | ]) 153 | 154 | 155 | class TestReaderKafka(SparklyGlobalSessionTest): 156 | session = SparklyTestSession 157 | 158 | def setUp(self): 159 | self.json_decoder = lambda item: json.loads(item.decode('utf-8')) 160 | self.json_encoder = lambda item: json.dumps(item).encode('utf-8') 161 | self.topic = 'test.topic.write.kafka.{}'.format(uuid.uuid4().hex[:10]) 162 | self.fixture_path = absolute_path(__file__, 'resources', 'test_read', 'kafka_setup.json') 163 | self.fixture = KafkaFixture( 164 | 'kafka.docker', 165 | topic=self.topic, 166 | key_serializer=self.json_encoder, 167 | value_serializer=self.json_encoder, 168 | data=self.fixture_path, 169 | ) 170 | self.fixture.setup_data() 171 | self.expected_data_df = self.spark.read.json(self.fixture_path) 172 | self.expected_data = [item.asDict(recursive=True) 173 | for item in self.expected_data_df.collect()] 174 | 175 | def test_read_by_topic(self): 176 | df = self.spark.read_ext.kafka( 177 | 'kafka.docker', 178 | topic=self.topic, 179 | key_deserializer=self.json_decoder, 180 | value_deserializer=self.json_decoder, 181 | schema=self.expected_data_df.schema, 182 | ) 183 | self.assertRowsEqual( 184 | df.collect(), 185 | self.expected_data, 186 | ) 187 | 188 | def test_read_by_offsets(self): 189 | offsets = kafka_get_topics_offsets('kafka.docker', self.topic) 190 | df = self.spark.read_ext.kafka( 191 | 'kafka.docker', 192 | topic=self.topic, 193 | offset_ranges=offsets, 194 | key_deserializer=self.json_decoder, 195 | value_deserializer=self.json_decoder, 196 | schema=self.expected_data_df.schema, 197 | ) 198 | 199 | self.assertRowsEqual(df.collect(), self.expected_data) 200 | 201 | self.fixture.setup_data() 202 | 203 | offsets = kafka_get_topics_offsets('kafka.docker', self.topic) 204 | df = self.spark.read_ext.kafka( 205 | 'kafka.docker', 206 | topic=self.topic, 207 | offset_ranges=offsets, 208 | key_deserializer=self.json_decoder, 209 | value_deserializer=self.json_decoder, 210 | schema=self.expected_data_df.schema, 211 | ) 212 | 213 | self.assertRowsEqual(df.collect(), self.expected_data * 2) 214 | 215 | df = self.spark.read_ext.kafka( 216 | 'kafka.docker', 217 | topic=self.topic, 218 | offset_ranges=offsets, 219 | key_deserializer=self.json_decoder, 220 | value_deserializer=self.json_decoder, 221 | schema=self.expected_data_df.schema, 222 | include_meta_cols=True, 223 | ) 224 | expected = [ 225 | # normal fields: 226 | 'key', 227 | 'value', 228 | # meta fields: 229 | 'topic', 230 | 'partition', 231 | 'offset', 232 | 'timestamp', 233 | 'timestampType', 234 | ] 235 | self.assertListEqual(sorted(expected), sorted(df.schema.fieldNames())) 236 | 237 | def test_argument_errors(self): 238 | with self.assertRaises(InvalidArgumentError): 239 | self.spark.read_ext.kafka( 240 | 'kafka.docker', 241 | topic=self.topic, 242 | key_deserializer=self.json_decoder, 243 | value_deserializer=self.json_decoder, 244 | # no schema! 245 | ) 246 | self.spark.read_ext.kafka( 247 | 'kafka.docker', 248 | topic=self.topic, 249 | key_deserializer=self.json_decoder, 250 | # no schema! 
251 | ) 252 | self.spark.read_ext.kafka( 253 | 'kafka.docker', 254 | topic=self.topic, 255 | value_deserializer=self.json_decoder, 256 | # no schema! 257 | ) 258 | -------------------------------------------------------------------------------- /tests/integration/test_session.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from sparkly.testing import SparklyGlobalSessionTest 18 | from tests.integration.base import SparklyTestSession 19 | 20 | 21 | class TestSparklySession(SparklyGlobalSessionTest): 22 | session = SparklyTestSession 23 | 24 | def test_options(self): 25 | self.assertEqual('hive', self.spark.conf.get('spark.sql.catalogImplementation')) 26 | self.assertEqual('117', self.spark.conf.get('my.custom.option.1')) 27 | self.assertEqual('223', self.spark.conf.get('my.custom.option.2')) 28 | self.assertEqual('333', self.spark.conf.get('my.custom.option.3')) 29 | 30 | def test_python_udf(self): 31 | rows = self.spark.sql('select length_of_text("hello world")') 32 | self.assertEqual(rows.collect()[0][0], '11') 33 | 34 | def test_jar_udf(self): 35 | self.spark.createDataFrame( 36 | [ 37 | {'key_field': 'A', 'value_field': 1}, 38 | {'key_field': 'B', 'value_field': 2}, 39 | {'key_field': 'C', 'value_field': 3}, 40 | {'key_field': 'D', 'value_field': 4}, 41 | ], 42 | ).createOrReplaceTempView('test_jar_udf') 43 | 44 | rows = self.spark.sql('select collect(key_field, value_field) from test_jar_udf') 45 | self.assertEqual(rows.collect()[0][0], {'A': 1, 'B': 2, 'C': 3, 'D': 4}) 46 | 47 | def test_builder(self): 48 | with self.assertRaises(NotImplementedError): 49 | assert self.spark.builder 50 | -------------------------------------------------------------------------------- /tests/integration/test_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import json 17 | import uuid 18 | import pickle 19 | import time 20 | import unittest 21 | 22 | from sparkly.session import SparklySession 23 | from sparkly.testing import ( 24 | CassandraFixture, 25 | ElasticFixture, 26 | MysqlFixture, 27 | SparklyGlobalSessionTest, 28 | SparklyTest, 29 | KafkaFixture, 30 | KafkaWatcher, 31 | ) 32 | from sparkly.utils import absolute_path 33 | from tests.integration.base import ( 34 | SparklyTestSession, 35 | ) 36 | 37 | 38 | try: 39 | from kafka import KafkaConsumer, KafkaProducer 40 | except ImportError: 41 | pass 42 | 43 | 44 | class TestAssertions(SparklyGlobalSessionTest): 45 | session = SparklyTestSession 46 | 47 | def test_assert_dataframe_equal(self): 48 | df = self.spark.createDataFrame([('Alice', 1), 49 | ('Kelly', 1), 50 | ('BigBoss', 999)], 51 | ['name', 'age']) 52 | self.assertDataFrameEqual( 53 | df, 54 | [{'name': 'Alice', 'age': 1}, 55 | {'name': 'BigBoss', 'age': 999}, 56 | {'name': 'Kelly', 'age': 1}, 57 | ], 58 | ordered=False, 59 | ) 60 | 61 | with self.assertRaises(AssertionError): 62 | self.assertDataFrameEqual( 63 | df, 64 | [{'name': 'Alice', 'age': 1}, 65 | {'name': 'BigBoss', 'age': 999}, 66 | {'name': 'Kelly', 'age': 1}, 67 | ], 68 | ordered=True, 69 | ) 70 | 71 | 72 | class TestSparklyGlobalSessionTest(unittest.TestCase): 73 | 74 | def test_imports_test_target(self): 75 | 76 | class MyGlobalTest(SparklyGlobalSessionTest): 77 | session = SparklyTestSession 78 | test_target = 'tests.integration.fake_modules.testing.is_fake' 79 | 80 | 81 | MyGlobalTest.setUpClass() 82 | 83 | self.assertTrue(is_fake) 84 | 85 | 86 | class TestCassandraFixtures(SparklyGlobalSessionTest): 87 | session = SparklyTestSession 88 | 89 | def test_cassandra_fixture(self): 90 | data_in_cassandra = CassandraFixture( 91 | 'cassandra.docker', 92 | absolute_path(__file__, 'resources', 'test_fixtures', 'cassandra_setup.cql'), 93 | absolute_path(__file__, 'resources', 'test_fixtures', 'cassandra_teardown.cql'), 94 | ) 95 | 96 | with data_in_cassandra: 97 | time.sleep(2) # wait till keyspace is up 98 | df = self.spark.read_ext.by_url('cassandra://cassandra.docker/sparkly_test/test') 99 | self.assertRowsEqual(df.select('uid', 'countries').collect(), [ 100 | { 101 | 'uid': '1', 102 | 'countries': {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}, 103 | }, 104 | ]) 105 | 106 | 107 | class TestMysqlFixtures(SparklyGlobalSessionTest): 108 | 109 | session = SparklyTestSession 110 | 111 | fixtures = [ 112 | MysqlFixture( 113 | 'mysql.docker', 114 | 'root', 115 | None, 116 | absolute_path(__file__, 'resources', 'test_fixtures', 'mysql_setup.sql'), 117 | absolute_path(__file__, 'resources', 'test_fixtures', 'mysql_teardown.sql'), 118 | ) 119 | ] 120 | 121 | def test_mysql_fixture(self): 122 | df = self.spark.read_ext.by_url('mysql://mysql.docker/sparkly_test/test?user=root&password=') 123 | self.assertRowsEqual(df.collect(), [ 124 | {'id': 1, 'name': 'john', 'surname': 'sk', 'age': 111}, 125 | ]) 126 | 127 | 128 | class TestElasticFixture(SparklyGlobalSessionTest): 129 | 130 | session = SparklyTestSession 131 | 132 | class_fixtures = [ 133 | ElasticFixture( 134 | 'elastic.docker', 135 | 'sparkly_test_fixture', 136 | None, 137 | absolute_path(__file__, 'resources', 'test_fixtures', 'mapping.json'), 138 | absolute_path(__file__, 'resources', 'test_fixtures', 'data_for_es7.json'), 139 | ) 140 | ] 141 | 142 | def test_elastic_fixture(self): 143 | df = self.spark.read_ext.by_url( 144 | 'elastic://elastic.docker/sparkly_test_fixture?es.read.metadata=false' 145 
| ) 146 | self.assertRowsEqual(df.collect(), [{'name': 'John', 'age': 56}]) 147 | 148 | 149 | class TestKafkaFixture(SparklyGlobalSessionTest): 150 | 151 | session = SparklyTestSession 152 | 153 | topic = 'sparkly.test.fixture.{}'.format(uuid.uuid4().hex[:10]) 154 | fixtures = [ 155 | KafkaFixture( 156 | 'kafka.docker', 157 | topic=topic, 158 | key_serializer=lambda item: json.dumps(item).encode('utf-8'), 159 | value_serializer=lambda item: json.dumps(item).encode('utf-8'), 160 | data=absolute_path(__file__, 'resources', 'test_fixtures', 'kafka.json'), 161 | ) 162 | ] 163 | 164 | def test_kafka_fixture(self): 165 | consumer = KafkaConsumer( 166 | self.topic, 167 | bootstrap_servers='kafka.docker:9092', 168 | key_deserializer=lambda item: json.loads(item.decode('utf-8')), 169 | value_deserializer=lambda item: json.loads(item.decode('utf-8')), 170 | auto_offset_reset='earliest', 171 | ) 172 | 173 | actual_data = [] 174 | for i in range(5): 175 | message = next(consumer) 176 | data = {'key': message.key, 'value': message.value} 177 | actual_data.append(data) 178 | 179 | expected_data = self.spark.read.json( 180 | absolute_path(__file__, 'resources', 'test_fixtures', 'kafka.json') 181 | ) 182 | self.assertRowsEqual(expected_data.collect(), actual_data) 183 | 184 | 185 | class TestKafkaWatcher(SparklyGlobalSessionTest): 186 | session = SparklyTestSession 187 | 188 | def test_write_kafka_dataframe(self): 189 | host = 'kafka.docker' 190 | topic = 'test.topic.kafkawatcher.{}'.format(uuid.uuid4().hex[:10]) 191 | port = 9092 192 | input_df, expected_data = self.get_test_data('kafka_watcher_1.json') 193 | 194 | kafka_watcher = KafkaWatcher( 195 | self.spark, 196 | input_df.schema, 197 | pickle.loads, 198 | pickle.loads, 199 | host, 200 | topic, 201 | port, 202 | ) 203 | with kafka_watcher: 204 | expected_count = self.write_data(input_df, host, topic, port) 205 | self.assertEqual(kafka_watcher.count, expected_count) 206 | self.assertRowsEqual(kafka_watcher.df.collect(), expected_data) 207 | 208 | with kafka_watcher: 209 | pass 210 | self.assertEqual(kafka_watcher.count, 0) 211 | self.assertIsNone(kafka_watcher.df, None) 212 | 213 | input_df, expected_data = self.get_test_data('kafka_watcher_2.json') 214 | with kafka_watcher: 215 | expected_count = self.write_data(input_df, host, topic, port) 216 | self.assertEqual(kafka_watcher.count, expected_count) 217 | self.assertRowsEqual(kafka_watcher.df.collect(), expected_data) 218 | 219 | def get_test_data(self, filename): 220 | file_path = absolute_path(__file__, 'resources', 'test_testing', filename) 221 | df = self.spark.read.json(file_path) 222 | data = [item.asDict(recursive=True) for item in df.collect()] 223 | return df, data 224 | 225 | def write_data(self, df, host, topic, port): 226 | producer = KafkaProducer( 227 | bootstrap_servers=['{}:{}'.format(host, port)], 228 | key_serializer=pickle.dumps, 229 | value_serializer=pickle.dumps, 230 | ) 231 | rows = df.collect() 232 | for row in rows: 233 | producer.send(topic, key=row.key, value=row.value) 234 | producer.flush() 235 | return len(rows) 236 | 237 | 238 | class TestSwitchingBetweenTestSessions(unittest.TestCase): 239 | # Test whether a user can switch between different sessions 240 | # during tests 241 | 242 | def test_switch_session_between_sparkly_tests(self): 243 | # Define a test session with ES 7.14 244 | class SessionA(SparklySession): 245 | packages = [ 246 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.14.0', 247 | ] 248 | 249 | repositories = [ 250 | 
'http://packages.confluent.io/maven/', 251 | ] 252 | 253 | class TestSessionA(SparklyTest): 254 | session = SessionA 255 | 256 | # Define a test session with ES 7.17 257 | class SessionB(SparklySession): 258 | packages = [ 259 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8', 260 | ] 261 | 262 | repositories = [ 263 | 'http://packages.confluent.io/maven/', 264 | ] 265 | 266 | class TestSessionB(SparklyTest): 267 | session = SessionB 268 | 269 | # Make sure that when the ES 7.14 session is set up, the underlying 270 | # spark session contains the appropriate jars 271 | TestSessionA.setUpClass() 272 | es_7_14_jar = ( 273 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.14.0.jar' 274 | ) 275 | installed_jars = list(TestSessionA.spark._jsc.jars()) 276 | self.assertIn(es_7_14_jar, installed_jars) 277 | TestSessionA.tearDownClass() 278 | 279 | # And now make sure that when the ES 7.17 session is set up, the underlying 280 | # spark session contains the appropriate jars as well 281 | TestSessionB.setUpClass() 282 | es_7_17_jar = ( 283 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.17.8.jar' 284 | ) 285 | installed_jars = list(TestSessionB.spark._jsc.jars()) 286 | self.assertIn(es_7_17_jar, installed_jars) 287 | self.assertNotIn(es_7_14_jar, installed_jars) 288 | 289 | TestSessionB.tearDownClass() 290 | 291 | def test_switch_global_session_between_sparkly_tests(self): 292 | # Define a test session with ES 7.14 293 | class SessionA(SparklySession): 294 | packages = [ 295 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.14.0', 296 | ] 297 | 298 | repositories = [ 299 | 'http://packages.confluent.io/maven/', 300 | ] 301 | 302 | class TestSessionA(SparklyGlobalSessionTest): 303 | session = SessionA 304 | 305 | # Define a test session with ES 7.17 306 | class SessionB(SparklySession): 307 | packages = [ 308 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8', 309 | ] 310 | 311 | repositories = [ 312 | 'http://packages.confluent.io/maven/', 313 | ] 314 | 315 | class TestSessionB(SparklyGlobalSessionTest): 316 | session = SessionB 317 | 318 | # Make sure that when the ES 7.14 session is set up, the underlying 319 | # spark session contains the appropriate jars 320 | TestSessionA.setUpClass() 321 | es_7_14_jar = ( 322 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.14.0.jar' 323 | ) 324 | installed_jars = list(TestSessionA.spark._jsc.jars()) 325 | self.assertIn(es_7_14_jar, installed_jars) 326 | TestSessionA.tearDownClass() 327 | 328 | # And now make sure that when the ES 7.17 session is set up, the underlying 329 | # spark session contains the appropriate jars as well 330 | TestSessionB.setUpClass() 331 | es_7_17_jar = ( 332 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.17.8.jar' 333 | ) 334 | installed_jars = list(TestSessionB.spark._jsc.jars()) 335 | self.assertIn(es_7_17_jar, installed_jars) 336 | self.assertNotIn(es_7_14_jar, installed_jars) 337 | TestSessionB.tearDownClass() 338 | -------------------------------------------------------------------------------- /tests/no_extras/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /tests/no_extras/test_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | 19 | from sparkly.testing import ( 20 | CassandraFixture, 21 | MysqlFixture, 22 | SparklyGlobalSessionTest, 23 | KafkaFixture) 24 | from tests.integration.base import SparklyTestSession 25 | 26 | 27 | class TestCassandraFixtures(SparklyGlobalSessionTest): 28 | session = SparklyTestSession 29 | 30 | def test_cassandra_fixture(self): 31 | with self.assertRaises(NotImplementedError): 32 | CassandraFixture( 33 | 'cassandra.docker', 34 | 'test', 35 | 'test', 36 | ) 37 | 38 | 39 | class TestMysqlFixtures(SparklyGlobalSessionTest): 40 | 41 | session = SparklyTestSession 42 | 43 | def test_mysql_fixture(self): 44 | with self.assertRaises(NotImplementedError): 45 | MysqlFixture( 46 | 'mysql.docker', 47 | 'root', 48 | None, 49 | 'test', 50 | 'test', 51 | ) 52 | 53 | 54 | class TestKafkaFixture(SparklyGlobalSessionTest): 55 | 56 | session = SparklyTestSession 57 | 58 | def test_kafka_fixture(self): 59 | with self.assertRaises(NotImplementedError): 60 | KafkaFixture( 61 | 'kafka.docker', 62 | topic='test', 63 | key_serializer=lambda item: json.dumps(item).encode('utf-8'), 64 | value_serializer=lambda item: json.dumps(item).encode('utf-8'), 65 | data='test.json', 66 | ) 67 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | -------------------------------------------------------------------------------- /tests/unit/test_instant_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | from unittest import TestCase 19 | 20 | try: 21 | from unittest import mock 22 | except ImportError: 23 | import mock 24 | 25 | from sparkly.instant_testing import InstantTesting 26 | 27 | 28 | _MOCK_LOCK_FILE_PATH = InstantTesting.LOCK_FILE_PATH + '__test' 29 | 30 | 31 | @mock.patch.object(InstantTesting, 'LOCK_FILE_PATH', _MOCK_LOCK_FILE_PATH) 32 | class TestInstantTesting(TestCase): 33 | def setUp(self): 34 | try: 35 | os.remove(_MOCK_LOCK_FILE_PATH) 36 | except: 37 | pass 38 | 39 | def test_activate(self): 40 | self.assertFalse(InstantTesting.is_activated()) 41 | InstantTesting.activate() 42 | self.assertTrue(InstantTesting.is_activated()) 43 | 44 | def test_deactivate(self): 45 | InstantTesting.activate() 46 | self.assertTrue(InstantTesting.is_activated()) 47 | InstantTesting.deactivate() 48 | self.assertFalse(InstantTesting.is_activated()) 49 | 50 | def test_double_activation(self): 51 | InstantTesting.activate() 52 | InstantTesting.activate() 53 | 54 | def test_double_deactivation(self): 55 | InstantTesting.deactivate() 56 | InstantTesting.deactivate() 57 | -------------------------------------------------------------------------------- /tests/unit/test_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import unittest 18 | try: 19 | from unittest import mock 20 | except ImportError: 21 | import mock 22 | 23 | import pyspark.sql 24 | 25 | import sparkly 26 | from sparkly.reader import SparklyReader 27 | from sparkly.utils import parse_schema 28 | 29 | 30 | class TestSparklyReaderByUrl(unittest.TestCase): 31 | def setUp(self): 32 | self.spark = mock.Mock(spec=sparkly.SparklySession) 33 | self.read_ext = SparklyReader(self.spark) 34 | self.fake_df = mock.Mock(spec=pyspark.sql.DataFrame) 35 | 36 | def test_table(self): 37 | self.spark.table.return_value = self.fake_df 38 | 39 | df = self.read_ext.by_url('table://some_hive_table') 40 | 41 | self.assertEqual(df, self.fake_df) 42 | self.spark.table.assert_called_with('some_hive_table') 43 | 44 | def test_parquet(self): 45 | self.spark.read.load.return_value = self.fake_df 46 | 47 | df = self.read_ext.by_url('parquet:s3://my-bucket/path/to/parquet') 48 | 49 | self.assertEqual(df, self.fake_df) 50 | self.spark.read.load.assert_called_with( 51 | path='s3://my-bucket/path/to/parquet', 52 | format='parquet', 53 | ) 54 | 55 | def test_csv(self): 56 | self.spark.read.csv.return_value = self.fake_df 57 | 58 | df = self.read_ext.by_url('csv:s3://my-bucket/path/to/csv?header=true') 59 | 60 | self.assertEqual(df, self.fake_df) 61 | self.spark.read.csv.assert_called_with( 62 | path='s3://my-bucket/path/to/csv', 63 | header='true', 64 | ) 65 | 66 | def test_csv_on_local_file_system(self): 67 | self.spark.read.csv.return_value = self.fake_df 68 | 69 | schema = 'struct>' 70 | df = self.read_ext.by_url('csv:///path/on/file/system?header=false&schema={}' 71 | .format(schema)) 72 | 73 | self.assertEqual(df, self.fake_df) 74 | self.spark.read.csv.assert_called_with( 75 | path='/path/on/file/system', 76 | schema=parse_schema(schema), 77 | header='false', 78 | ) 79 | 80 | def test_elastic_on_or_before_6(self): 81 | self.read_ext.elastic = mock.Mock(return_value=self.fake_df) 82 | 83 | df = self.read_ext.by_url('elastic://es_host/test_index/test_type?' 84 | 'q=name:*Johnny*&fields=name,surname&' 85 | 'es.input.json=true&parallelism=4') 86 | 87 | self.assertEqual(df, self.fake_df) 88 | self.read_ext.elastic.assert_called_with( 89 | host='es_host', 90 | es_index='test_index', 91 | es_type='test_type', 92 | query='?q=name:*Johnny*', 93 | fields=['name', 'surname'], 94 | port=None, 95 | parallelism=4, 96 | options={'es.input.json': 'true'}, 97 | ) 98 | 99 | def test_elastic_on_and_after_7(self): 100 | self.read_ext.elastic = mock.Mock(return_value=self.fake_df) 101 | 102 | df = self.read_ext.by_url('elastic://es_host/test_index?' 103 | 'q=name:*Johnny*&fields=name,surname&' 104 | 'es.input.json=true&parallelism=4') 105 | 106 | self.assertEqual(df, self.fake_df) 107 | self.read_ext.elastic.assert_called_with( 108 | host='es_host', 109 | es_index='test_index', 110 | es_type=None, 111 | query='?q=name:*Johnny*', 112 | fields=['name', 'surname'], 113 | port=None, 114 | parallelism=4, 115 | options={'es.input.json': 'true'}, 116 | ) 117 | 118 | def test_cassandra(self): 119 | self.read_ext.cassandra = mock.Mock(return_value=self.fake_df) 120 | 121 | df = self.read_ext.by_url('cassandra://localhost/test_cf/test_table?'
122 | 'consistency=ONE&parallelism=8&query.retry.count=2') 123 | 124 | self.assertEqual(df, self.fake_df) 125 | self.read_ext.cassandra.assert_called_with( 126 | host='localhost', 127 | port=None, 128 | keyspace='test_cf', 129 | table='test_table', 130 | consistency='ONE', 131 | parallelism=8, 132 | options={'query.retry.count': '2'}, 133 | ) 134 | 135 | def test_cassandra_custom_port(self): 136 | self.read_ext.cassandra = mock.Mock(return_value=self.fake_df) 137 | 138 | df = self.read_ext.by_url('cassandra://localhost:19042/test_cf/test_table?' 139 | 'consistency=ONE&parallelism=8&query.retry.count=2') 140 | 141 | self.assertEqual(df, self.fake_df) 142 | self.read_ext.cassandra.assert_called_with( 143 | host='localhost', 144 | port=19042, 145 | keyspace='test_cf', 146 | table='test_table', 147 | consistency='ONE', 148 | parallelism=8, 149 | options={'query.retry.count': '2'}, 150 | ) 151 | 152 | def test_mysql(self): 153 | self.read_ext.mysql = mock.Mock(return_value=self.fake_df) 154 | 155 | df = self.read_ext.by_url('mysql://localhost/test_database/test_table?' 156 | 'user=root&password=pass') 157 | 158 | self.assertEqual(df, self.fake_df) 159 | self.read_ext.mysql.assert_called_with( 160 | host='localhost', 161 | database='test_database', 162 | table='test_table', 163 | port=None, 164 | parallelism=None, 165 | options={'user': 'root', 'password': 'pass'}, 166 | ) 167 | 168 | def test_mysql_custom_port(self): 169 | self.read_ext.mysql = mock.Mock(return_value=self.fake_df) 170 | 171 | df = self.read_ext.by_url('mysql://localhost:33306/test_database/test_table?' 172 | 'user=root&password=pass') 173 | 174 | self.assertEqual(df, self.fake_df) 175 | self.read_ext.mysql.assert_called_with( 176 | host='localhost', 177 | database='test_database', 178 | table='test_table', 179 | port=33306, 180 | parallelism=None, 181 | options={'user': 'root', 'password': 'pass'}, 182 | ) 183 | 184 | def test_unknown_format(self): 185 | self.assertRaises(NotImplementedError, self.read_ext.by_url, 'fake://host') 186 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from unittest import TestCase 18 | try: 19 | from unittest import mock 20 | except ImportError: 21 | import mock 22 | 23 | from pyspark import StorageLevel 24 | from pyspark.sql import DataFrame 25 | from pyspark.sql import types as T 26 | 27 | from sparkly.utils import lru_cache, parse_schema, schema_has 28 | 29 | 30 | class TestParseSchema(TestCase): 31 | def test_atomic(self): 32 | self.assert_parsed_properly('date') 33 | self.assert_parsed_properly('float') 34 | self.assert_parsed_properly('string') 35 | self.assert_parsed_properly('timestamp') 36 | self.assert_parsed_properly('int') 37 | 38 | def test_array(self): 39 | self.assert_parsed_properly('array') 40 | 41 | def test_map(self): 42 | self.assert_parsed_properly('map') 43 | 44 | def test_struct(self): 45 | self.assert_parsed_properly('struct') 46 | 47 | def test_parse_complex_types(self): 48 | self.assert_parsed_properly('array>') 49 | self.assert_parsed_properly('map>>') 50 | self.assert_parsed_properly('struct>') 51 | self.assert_parsed_properly('struct>,c:map>') 52 | 53 | def assert_parsed_properly(self, schema): 54 | self.assertEqual(parse_schema(schema).simpleString(), schema) 55 | 56 | 57 | class TestLruCache(TestCase): 58 | def test_caching(self): 59 | df = mock.MagicMock(spec=DataFrame) 60 | 61 | called = [0] 62 | 63 | @lru_cache(storage_level=StorageLevel.DISK_ONLY) 64 | def func(*args, **kwargs): 65 | called[0] += 1 66 | return df 67 | 68 | func() 69 | df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 70 | self.assertEqual(df.unpersist.mock_calls, []) 71 | self.assertEqual(called[0], 1) 72 | 73 | cached_df = func() 74 | self.assertEqual(cached_df, df) 75 | self.assertEqual(called[0], 1) 76 | 77 | def test_eviction(self): 78 | first_df = mock.MagicMock(spec=DataFrame) 79 | second_df = mock.MagicMock(spec=DataFrame) 80 | 81 | @lru_cache(maxsize=1, storage_level=StorageLevel.DISK_ONLY) 82 | def func(uid): 83 | if uid == 'first': 84 | return first_df 85 | else: 86 | return second_df 87 | 88 | func('first') 89 | first_df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 90 | self.assertEqual(first_df.unpersist.mock_calls, []) 91 | 92 | func('second') 93 | first_df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 94 | first_df.unpersist.assert_called_once_with() 95 | second_df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 96 | self.assertEqual(second_df.unpersist.mock_calls, []) 97 | 98 | 99 | class TestSchemaHas(TestCase): 100 | def test_structs_equal(self): 101 | schema_has( 102 | T.StructType([ 103 | T.StructField('f1', T.IntegerType()), 104 | T.StructField('f2', T.FloatType()), 105 | T.StructField('f3', T.StringType()), 106 | ]), 107 | T.StructType([ 108 | T.StructField('f3', T.StringType()), 109 | T.StructField('f2', T.FloatType()), 110 | T.StructField('f1', T.IntegerType()), 111 | ]), 112 | ) 113 | 114 | def test_structs_equal_with_dict(self): 115 | schema_has( 116 | T.StructType([ 117 | T.StructField('f1', T.IntegerType()), 118 | T.StructField('f2', T.FloatType()), 119 | T.StructField('f3', T.StringType()), 120 | ]), 121 | { 122 | 'f1': T.IntegerType(), 123 | 'f2': T.FloatType(), 124 | 'f3': T.StringType(), 125 | }, 126 | ) 127 | 128 | def test_structs_subset(self): 129 | schema_has( 130 | T.StructType([ 131 | T.StructField('f1', T.IntegerType()), 132 | T.StructField('f2', T.FloatType()), 133 | T.StructField('f3', T.StringType()), 134 | ]), 135 | T.StructType([ 136 | T.StructField('f2', T.FloatType()), 137 | ]), 138 | ) 139 | 140 | def test_structs_nested_subset(self): 
141 | schema_has( 142 | T.StructType([ 143 | T.StructField( 144 | 'f1', 145 | T.ArrayType(T.StructType([ 146 | T.StructField('f11', T.IntegerType()), 147 | T.StructField('f12', T.StringType()), 148 | ])), 149 | ), 150 | ]), 151 | T.StructType([ 152 | T.StructField( 153 | 'f1', 154 | T.ArrayType(T.StructType([T.StructField('f11', T.IntegerType())])), 155 | ), 156 | ]), 157 | ) 158 | 159 | def test_arrays_equal(self): 160 | schema_has( 161 | T.ArrayType(T.ArrayType(T.ArrayType(T.LongType()))), 162 | T.ArrayType(T.ArrayType(T.ArrayType(T.LongType()))), 163 | ) 164 | 165 | def test_arrays_nested_subset(self): 166 | schema_has( 167 | T.ArrayType(T.ArrayType(T.StructType([ 168 | T.StructField('f1', T.ArrayType(T.LongType())), 169 | T.StructField('f2', T.ArrayType(T.StringType())), 170 | ]))), 171 | T.ArrayType(T.ArrayType(T.StructType([ 172 | T.StructField('f1', T.ArrayType(T.LongType())) 173 | ]))), 174 | ) 175 | 176 | def test_maps_equal(self): 177 | schema_has( 178 | T.MapType(T.StringType(), T.MapType(T.StringType(), T.LongType())), 179 | T.MapType(T.StringType(), T.MapType(T.StringType(), T.LongType())), 180 | ) 181 | 182 | def test_maps_nested_subset(self): 183 | schema_has( 184 | T.MapType( 185 | T.StringType(), 186 | T.MapType( 187 | T.StringType(), 188 | T.StructType([ 189 | T.StructField('f1', T.MapType(T.StringType(), T.LongType())), 190 | T.StructField('f2', T.MapType(T.StringType(), T.IntegerType())), 191 | ]), 192 | ), 193 | ), 194 | T.MapType( 195 | T.StringType(), 196 | T.MapType( 197 | T.StringType(), 198 | T.StructType([ 199 | T.StructField('f1', T.MapType(T.StringType(), T.LongType())), 200 | ]), 201 | ), 202 | ), 203 | ) 204 | 205 | def test_type_mismatch(self): 206 | with self.assertRaisesRegex(AssertionError, 'Cannot compare heterogeneous types'): 207 | schema_has( 208 | T.StructType([T.StructField('f1', T.IntegerType())]), 209 | T.ArrayType(T.IntegerType()), 210 | ) 211 | 212 | with self.assertRaisesRegex(AssertionError, 'Cannot compare heterogeneous types'): 213 | schema_has( 214 | T.ArrayType(T.IntegerType()), 215 | {'f1': T.IntegerType()}, 216 | ) 217 | 218 | with self.assertRaisesRegex(TypeError, r'f1 is IntegerType\(?\)?, expected LongType\(?\)?'): 219 | schema_has( 220 | T.StructType([T.StructField('f1', T.IntegerType())]), 221 | T.StructType([T.StructField('f1', T.LongType())]), 222 | ) 223 | 224 | with self.assertRaisesRegex( 225 | TypeError, 226 | r'f1\.element\.s1 is IntegerType\(?\)?, expected LongType\(?\)?', 227 | ): 228 | schema_has( 229 | T.StructType([ 230 | T.StructField( 231 | 'f1', 232 | T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])), 233 | ), 234 | ]), 235 | T.StructType([ 236 | T.StructField( 237 | 'f1', 238 | T.ArrayType(T.StructType([T.StructField('s1', T.LongType())])), 239 | ), 240 | ]), 241 | ) 242 | 243 | with self.assertRaisesRegex( 244 | TypeError, 245 | r'element is IntegerType\(?\)?, expected LongType\(?\)?', 246 | ): 247 | schema_has( 248 | T.ArrayType(T.IntegerType()), 249 | T.ArrayType(T.LongType()), 250 | ) 251 | 252 | with self.assertRaisesRegex( 253 | TypeError, 254 | r'key is StringType\(?\)?, expected LongType\(?\)?', 255 | ): 256 | schema_has( 257 | T.MapType(T.StringType(), T.IntegerType()), 258 | T.MapType(T.LongType(), T.IntegerType()), 259 | ) 260 | 261 | with self.assertRaisesRegex( 262 | TypeError, 263 | r'value is IntegerType\(?\)?, expected LongType\(?\)?' 
264 | ): 265 | schema_has( 266 | T.MapType(T.StringType(), T.IntegerType()), 267 | T.MapType(T.StringType(), T.LongType()), 268 | ) 269 | 270 | def test_undefined_field(self): 271 | with self.assertRaisesRegex(KeyError, 'f2'): 272 | schema_has( 273 | T.StructType([T.StructField('f1', T.IntegerType())]), 274 | T.StructType([T.StructField('f2', T.LongType())]), 275 | ) 276 | 277 | with self.assertRaisesRegex(KeyError, r'f1\.element\.s2'): 278 | schema_has( 279 | T.StructType([ 280 | T.StructField( 281 | 'f1', 282 | T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])), 283 | ), 284 | ]), 285 | T.StructType([ 286 | T.StructField( 287 | 'f1', 288 | T.ArrayType(T.StructType([T.StructField('s2', T.LongType())])), 289 | ), 290 | ]), 291 | ) 292 | 293 | with self.assertRaisesRegex( 294 | TypeError, 295 | r'element is IntegerType\(?\)?, expected LongType\(?\)?', 296 | ): 297 | schema_has( 298 | T.ArrayType(T.IntegerType()), 299 | T.ArrayType(T.LongType()), 300 | ) 301 | -------------------------------------------------------------------------------- /tests/unit/test_writer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import unittest 18 | try: 19 | from unittest import mock 20 | except ImportError: 21 | import mock 22 | 23 | from pyspark.sql import DataFrame, SQLContext 24 | 25 | from sparkly import SparklySession 26 | from sparkly.writer import SparklyWriter 27 | 28 | 29 | class TestWriteByUrl(unittest.TestCase): 30 | def setUp(self): 31 | self.df = mock.Mock(spec=DataFrame) 32 | self.df.sql_ctx = mock.Mock(spec=SQLContext) 33 | self.df.sql_ctx.sparkSession = mock.Mock(spec=SparklySession) 34 | self.write_ext = SparklyWriter(self.df) 35 | 36 | def test_parquet_s3(self): 37 | self.write_ext.by_url( 38 | 'parquet:s3://my-bucket/path/to/parquet?partitionBy=x,y,z&mode=append&' 39 | 'additional=1&parallelism=20', 40 | ) 41 | 42 | self.df.coalesce.assert_called_once_with(20) 43 | self.df.coalesce.return_value.write.save.assert_called_once_with( 44 | path='s3://my-bucket/path/to/parquet', 45 | format='parquet', 46 | mode='append', 47 | partitionBy=['x', 'y', 'z'], 48 | additional='1', 49 | ) 50 | 51 | def test_csv_local(self): 52 | self.df.write.csv = mock.Mock() 53 | 54 | self.write_ext.by_url('csv:///my-bucket/path/to/csv?parallelism=10') 55 | 56 | self.df.coalesce.assert_called_once_with(10) 57 | self.df.coalesce.return_value.write.save.assert_called_once_with( 58 | path='/my-bucket/path/to/csv', 59 | format='csv', 60 | ) 61 | 62 | def test_cassandra(self): 63 | self.write_ext.cassandra = mock.Mock() 64 | 65 | self.write_ext.by_url( 66 | 'cassandra://host/ks/cf?consistency=ONE&mode=overwrite&parallelism=10', 67 | ) 68 | 69 | self.write_ext.cassandra.assert_called_once_with( 70 | host='host', 71 | keyspace='ks', 72 | table='cf', 73 | port=None, 74 | mode='overwrite', 75 | consistency='ONE', 76 | parallelism=10, 77 | options={}, 78 | ) 79 | 80 | def test_cassandra_custom_port(self): 81 | self.write_ext.cassandra = mock.Mock() 82 | 83 | self.write_ext.by_url( 84 | 'cassandra://host:19042/ks/cf?consistency=ONE&mode=overwrite&parallelism=10', 85 | ) 86 | 87 | self.write_ext.cassandra.assert_called_once_with( 88 | host='host', 89 | keyspace='ks', 90 | table='cf', 91 | port=19042, 92 | mode='overwrite', 93 | consistency='ONE', 94 | parallelism=10, 95 | options={}, 96 | ) 97 | 98 | def test_elastic_on_or_before_6(self): 99 | self.write_ext.elastic = mock.Mock() 100 | 101 | self.write_ext.by_url('elastic://host/index/type?parallelism=15') 102 | 103 | self.write_ext.elastic.assert_called_once_with( 104 | host='host', 105 | es_index='index', 106 | es_type='type', 107 | port=None, 108 | mode=None, 109 | parallelism=15, 110 | options={}, 111 | ) 112 | 113 | def test_elastic_on_and_after_7(self): 114 | self.write_ext.elastic = mock.Mock() 115 | 116 | self.write_ext.by_url('elastic://host/index?parallelism=15') 117 | 118 | self.write_ext.elastic.assert_called_once_with( 119 | host='host', 120 | es_index='index', 121 | es_type=None, 122 | port=None, 123 | mode=None, 124 | parallelism=15, 125 | options={}, 126 | ) 127 | 128 | def test_mysql(self): 129 | self.write_ext.mysql = mock.Mock() 130 | 131 | self.write_ext.by_url('mysql://host/db/table?parallelism=20') 132 | 133 | self.write_ext.mysql.assert_called_with( 134 | host='host', 135 | database='db', 136 | table='table', 137 | port=None, 138 | mode=None, 139 | parallelism=20, 140 | options={}, 141 | ) 142 | 143 | def test_mysql_custom_port(self): 144 | self.write_ext.mysql = mock.Mock() 145 | 146 | self.write_ext.by_url('mysql://host:33306/db/table?parallelism=20') 147 | 148 | self.write_ext.mysql.assert_called_with( 149 | host='host', 150 | database='db', 151 | 
table='table', 152 | port=33306, 153 | mode=None, 154 | parallelism=20, 155 | options={}, 156 | ) 157 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | [tox] 18 | envlist = spark32,spark33,spark34,no_extras,docs 19 | 20 | [testenv:spark32] 21 | commands = py.test --cov=sparkly --cov-report term-missing tests/integration tests/unit 22 | deps = 23 | -rrequirements.txt 24 | -rrequirements_dev.txt 25 | -rrequirements_extras.txt 26 | pyspark==3.2.3 27 | 28 | [testenv:spark33] 29 | commands = py.test --cov=sparkly --cov-report term-missing tests/integration tests/unit 30 | deps = 31 | -rrequirements.txt 32 | -rrequirements_dev.txt 33 | -rrequirements_extras.txt 34 | pyspark==3.3.1 35 | 36 | [testenv:spark34] 37 | commands = py.test --cov=sparkly --cov-report term-missing tests/integration tests/unit 38 | deps = 39 | -rrequirements.txt 40 | -rrequirements_dev.txt 41 | -rrequirements_extras.txt 42 | pyspark==3.4.0 43 | 44 | [testenv:no_extras] 45 | commands = py.test tests/no_extras 46 | deps = 47 | -rrequirements.txt 48 | -rrequirements_dev.txt 49 | pyspark==3.3.1 50 | 51 | [testenv:docs] 52 | commands = sphinx-build -b html docs/source docs/build 53 | deps = 54 | -rrequirements_dev.txt 55 | -rrequirements_extras.txt 56 | -rrequirements.txt 57 | pyspark==3.3.1 58 | --------------------------------------------------------------------------------