├── .dockerignore ├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── .readthedocs.yml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── bin └── sparkly-testing ├── docker-compose.yml ├── docs └── source │ ├── catalog.rst │ ├── conf.py │ ├── functions.rst │ ├── index.rst │ ├── license.rst │ ├── reader_and_writer.rst │ ├── session.rst │ ├── testing.rst │ └── utils.rst ├── requirements.txt ├── requirements_dev.txt ├── requirements_extras.txt ├── setup.cfg ├── setup.py ├── sparkly ├── __init__.py ├── catalog.py ├── exceptions.py ├── functions.py ├── instant_testing.py ├── reader.py ├── session.py ├── testing.py ├── utils.py └── writer.py ├── tests ├── __init__.py ├── integration │ ├── __init__.py │ ├── base.py │ ├── fake_modules │ │ ├── __init__.py │ │ └── testing.py │ ├── resources │ │ ├── brickhouse-0.7.1.jar │ │ ├── test_fixtures │ │ │ ├── cassandra_setup.cql │ │ │ ├── cassandra_teardown.cql │ │ │ ├── data.json │ │ │ ├── data_for_es7.json │ │ │ ├── kafka.json │ │ │ ├── mapping.json │ │ │ ├── mysql_setup.sql │ │ │ └── mysql_teardown.sql │ │ ├── test_read │ │ │ ├── cassandra_setup.cql │ │ │ ├── cassandra_teardown.cql │ │ │ ├── elastic7_setup.json │ │ │ ├── elastic_setup.json │ │ │ ├── kafka_setup.json │ │ │ ├── mysql_setup.sql │ │ │ └── mysql_teardown.sql │ │ ├── test_testing │ │ │ ├── kafka_watcher_1.json │ │ │ └── kafka_watcher_2.json │ │ └── test_write │ │ │ ├── cassandra_setup.cql │ │ │ ├── cassandra_teardown.cql │ │ │ ├── elastic7_setup.json │ │ │ ├── elastic_setup.json │ │ │ ├── kafka_setup.json │ │ │ ├── mysql_setup.sql │ │ │ └── mysql_teardown.sql │ ├── test_catalog.py │ ├── test_functions.py │ ├── test_instant_testing.py │ ├── test_reader.py │ ├── test_session.py │ ├── test_testing.py │ └── test_writer.py ├── no_extras │ ├── __init__.py │ └── test_testing.py └── unit │ ├── __init__.py │ ├── test_instant_testing.py │ ├── test_reader.py │ ├── test_session.py │ ├── test_testing.py │ ├── test_utils.py │ └── test_writer.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | .git/ 18 | .idea/ -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - run: make test 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *,cover 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Virtualenvs 75 | venv* 76 | 77 | # idea 78 | .idea/ 79 | 80 | .DS_Store 81 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.7" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | formats: all 20 | 21 | # Optionally set the version of Python and requirements required to build your docs 22 | python: 23 | install: 24 | - requirements: requirements_dev.txt 25 | - requirements: requirements_extras.txt 26 | - requirements: requirements.txt 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 3.0.0 2 | * Improved performance of `catalog_ext.has_table` function by trying to execute a dummy SQL rather than listing the entire database, noticable mostly with databases with many tables. 3 | * Some minor changes to help in a spark-on-kubernetes environment: 4 | * In addition to setting `PYSPARK_SUBMIT_ARGS`, also explicitly set config params so they are picked up by an already-running JVM 5 | * Register a handler to stop spark session on python termination to deal with [SPARK-27927](https://issues.apache.org/jira/browse/SPARK-27927) 6 | * Removed `has_package` and `has_jar` functions, which are incomplete checks (resulting in false negatives) and are merely syntactic sugar. 
7 | * Added options (class variables) `name` and `app_id_template` to autogenerate a unique value for 8 | spark option `spark.app.id`, which can help to preserve spark history data for all sessions across restarts. 9 | This functionality can be disabled by setting `app_id_template` to `None` or `''`. 10 | * Drop support of Python 2.7 11 | * Run integration tests using Python 3.7 12 | * Drop tests for Elastic 6.x 13 | * Use Kafka 2.8.0 for integration tests 14 | 15 | ## 2.8.2 16 | * Support 0.9.x `pymysql` in `sparkly.testing.MysqlFixture` 17 | 18 | ## 2.8.1 19 | * Fix support for using multiple sparkly sessions during tests 20 | * SparklySession does not persist modifications to os.environ 21 | * Support ElasticSearch 7 by making type optional. 22 | 23 | ## 2.8.0 24 | * Extend `SparklyCatalog` to work with database properties: 25 | - `spark.catalog_ext.set_database_property` 26 | - `spark.catalog_ext.get_database_property` 27 | - `spark.catalog_ext.get_database_properties` 28 | 29 | ## 2.7.1 30 | * Allow newer versions of `six` package (avoid depednecy hell) 31 | 32 | ## 2.7.0 33 | * Migrate to spark 2.4.0 34 | * Fix testing.DataType to use new convention to get field type 35 | 36 | ## 2.6.0 37 | * Add argmax function to sparkly.functions 38 | 39 | ## 2.5.1 40 | * Fix port issue with reading and writing `by_url`. `urlparse` return `netloc` with port, which breaks read and write from MySQL and Cassandra. 41 | 42 | ## 2.5.0 43 | * Add `port` argument to `CassandraFixture` and `MysqlFixture` 44 | * Add `Content-Type` header to `ElasticFixture` to support ElasticSearch `6.x` 45 | * Update `elasticsearch-hadoop` connector to `6.5.4` 46 | * Update image tag for elasticsearch to `6.5.4` 47 | 48 | ## 2.4.1 49 | * Fix write_ext.kafka: run foreachPartition instead of mapPartitions because returned value can cause spark.driver.maxResultSize excess 50 | 51 | ## 2.4.0 52 | * Respect PYSPARK_SUBMIT_ARGS if it is already set by appending SparklySession related options at the end instead of overwriting. 53 | * Fix additional_options to always override SparklySession.options when a session is initialized 54 | * Fix ujson dependency on environments where redis-py is already installed 55 | * Access or initialize SparklySession through get_or_create classmethod 56 | * Ammend `sparkly.functions.switch_case` to accept a user defined function for 57 | deciding whether the switch column matches a specific case 58 | 59 | ## 2.3.0 60 | * Overwrite existing tables in the metastore 61 | * Add functions module and provide switch_case column generation and multijoin 62 | * Add implicit test target import and extended assertEqual variation 63 | * Support writing to redis:// and rediss:// URLs 64 | * Add LRU cache that persists DataFrames under the hood 65 | * Add ability to check whether a complex type defines specific fields 66 | 67 | # 2.2.1 68 | * `spark.sql.shuffle.partitions` in `SparklyTest` should be set to string, 69 | because `int` value breaks integration testing in Spark 2.0.2. 70 | 71 | # 2.2.0 72 | * Add instant iterative development mode. `sparkly-testing --help` for more details. 73 | * Use in-memory db for Hive Metastore in `SparklyTest` (faster tests). 74 | * `spark.sql.shuffle.partitions = 4` for `SparklyTest` (faster tests). 75 | * `spark.sql.warehouse.dir = ` for `SparklyTest` (no side effects) 76 | 77 | ## 2.1.1 78 | * Fix: remove backtick quoting from catalog utils to ease work with different databases. 79 | 80 | ## 2.1.0 81 | * Add ability to specify custom maven repositories. 
82 | 83 | ## 2.0.4 84 | * Make it possible to override default value of spark.sql.catalogImplementation 85 | 86 | ## 2.0.3 87 | * Add KafkaWatcher to facilitate testing of writing to Kafka 88 | * Fix a few minor pyflakes warnings and typos 89 | 90 | ## 2.0.2 91 | * Fix: #40 write_ext.kafka ignores errors. 92 | 93 | ## 2.0.1 94 | * Migrate to Spark 2, Spark 1.6.x isn't supported by sparkly 2.x. 95 | * Rename `SparklyContext` to `SparklySession` and derive it from `SparkSession`. 96 | * Use built-in csv reader. 97 | * Replace `hms` with `catalog_ext`. 98 | * `parse_schema` is now consistent with `DataType.simpleString` method. 99 | 100 | ## 1.1.1 101 | * Fix: kafka import error. 102 | 103 | ## 1.1.0 104 | * Kafka reader and writer. 105 | * Kafka fixtures. 106 | 107 | ## 1.0.0 108 | * Initial open-source release. 109 | * Features: 110 | - Declarative definition of application dependencies (spark packages, jars, UDFs) 111 | - Readers and writers for ElasticSearch, Cassandra, MySQL 112 | - DSL for interaction with Apache Hive Metastore 113 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | FROM python:3.10 18 | 19 | LABEL maintainer="dev@tubularlabs.com" 20 | 21 | # Install Java 8 22 | RUN apt-get update && apt-get install -y software-properties-common 23 | RUN apt-add-repository 'deb http://security.debian.org/debian-security stretch/updates main' 24 | RUN apt-add-repository 'deb http://deb.debian.org/debian/ sid main' 25 | RUN apt-get update && apt-get install -y openjdk-8-jdk 26 | 27 | # Python env 28 | ENV CASS_DRIVER_NO_EXTENSIONS=1 29 | COPY requirements.txt /tmp/requirements.txt 30 | COPY requirements_dev.txt /tmp/requirements_dev.txt 31 | COPY requirements_extras.txt /tmp/requirements_extras.txt 32 | RUN python -m pip install -r /tmp/requirements.txt 33 | RUN python -m pip install -r /tmp/requirements_dev.txt 34 | RUN python -m pip install -r /tmp/requirements_extras.txt 35 | 36 | # Provision Sparkly 37 | ADD . /opt/sparkly/ 38 | WORKDIR /opt/sparkly/ 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Tubular Labs, Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | 204 | ======================================================================= 205 | Sparkly Subcomponents: 206 | 207 | The Sparkly project contains subcomponents with separate copyright 208 | notices and license terms. Your use of the source code for the these 209 | subcomponents is subject to the terms and conditions of the following 210 | licenses. 211 | 212 | ======================================================================== 213 | Apache licenses 214 | ======================================================================== 215 | 216 | The following dependencies are provided under a Apache license. See project link for details. 217 | 218 | (Apache License 2.0) Spark (https://github.com/apache/spark) 219 | (Apache License 2.0) cassandra-driver (https://github.com/datastax/python-driver) 220 | 221 | ======================================================================== 222 | BSD-style licenses 223 | ======================================================================== 224 | 225 | The following dependencies are provided under a BSD-style license. See project link for details. 226 | 227 | (BSD License) mock (https://github.com/testing-cabal/mock) 228 | (PSF License) Sphinx (https://github.com/sphinx-doc/sphinx) 229 | 230 | ======================================================================== 231 | MIT licenses 232 | ======================================================================== 233 | 234 | The following dependencies are provided under the MIT License. See project link for details. 235 | 236 | (MIT License) sphinx_rtd_theme (https://github.com/snide/sphinx_rtd_theme) 237 | (MIT License) pytest (https://github.com/pytest-dev/pytest) 238 | (MIT License) pytest-cov (https://github.com/pytest-dev/pytest-cov) 239 | (MIT License) PyMySQL (https://github.com/PyMySQL/PyMySQL) 240 | 241 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include requirements.txt README.md 18 | recursive-include sparkly/resources * 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | dev: 18 | docker-compose build dev 19 | docker-compose run dev bash 20 | docker-compose down -v 21 | 22 | dist: 23 | docker-compose build dev 24 | docker-compose run --no-deps dev python setup.py bdist_wheel ; retcode="$$?" ; docker-compose down -v ; exit $$retcode 25 | 26 | docs: 27 | docker-compose build dev 28 | docker-compose run --no-deps dev python -m sphinx -b html docs/source docs/build 29 | 30 | test: 31 | docker-compose build test 32 | docker-compose run test tox ; retcode="$$?" ; docker-compose down -v ; exit $$retcode 33 | 34 | .PHONY: docs dist 35 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Sparkly 2 | ======= 3 | 4 | |Sparkly PyPi Version| |Documentation Status| 5 | 6 | Helpers & syntax sugar for PySpark. There are several features to make your life easier: 7 | 8 | - Definition of spark packages, external jars, UDFs and spark options within your code; 9 | - Simplified reader/writer api for Cassandra, Elastic, MySQL, Kafka; 10 | - Testing framework for spark applications. 11 | 12 | More details can be found in `the official 13 | documentation `__. 14 | 15 | Installation 16 | ------------ 17 | 18 | Sparkly itself is easy to install:: 19 | 20 | pip install pyspark # pick your version 21 | pip install sparkly # compatible with spark >= 2.4 22 | 23 | 24 | Getting Started 25 | --------------- 26 | 27 | Here is a small code snippet showing how to read a Cassandra table 28 | and write its content to an ElasticSearch index:: 29 | 30 | from sparkly import SparklySession 31 | 32 | 33 | class MySession(SparklySession): 34 | packages = [ 35 | 'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11', 36 | 'org.elasticsearch:elasticsearch-spark-20_2.11:6.5.4', 37 | ] 38 | 39 | 40 | if __name__ == '__main__': 41 | spark = MySession() 42 | df = spark.read_ext.cassandra('localhost', 'my_keyspace', 'my_table') 43 | df.write_ext.elastic('localhost', 'my_index', 'my_type') 44 | 45 | See `the online documentation `__ for 46 | more details. 47 | 48 | Testing 49 | ------- 50 | 51 | To run the tests, you need `docker `__ and 52 | `docker-compose `__ installed on your 53 | system. If you are working on macOS, we highly recommend using 54 | `docker-machine `__. 
As soon as the 55 | tools mentioned above have been installed, all you need is to run:: 56 | 57 | make test 58 | 59 | Supported Spark Versions 60 | ------------------------ 61 | 62 | At the moment we support: 63 | 64 | +---------------------------------------------------------------------------+ 65 | | sparkly >= 2.7 | Spark 2.4.x | 66 | +---------------------------------------------------------------------------+ 67 | | sparkly 2.x | Spark 2.0.x and Spark 2.1.x and Spark 2.2.x | 68 | +---------------------------------------------------------------------------+ 69 | | sparkly 1.x | Spark 1.6.x | 70 | +---------------------------------------------------------------------------+ 71 | 72 | .. |Sparkly PyPi Version| image:: http://img.shields.io/pypi/v/sparkly.svg 73 | :target: https://pypi.python.org/pypi/sparkly 74 | .. |Sparkly Build Status| image:: https://app.travis-ci.com/tubular/sparkly.svg?branch=master 75 | :target: https://app.travis-ci.com/github/tubular/sparkly 76 | .. |Documentation Status| image:: https://readthedocs.org/projects/sparkly/badge/?version=latest 77 | :target: http://sparkly.readthedocs.io/en/latest/?badge=latest 78 | -------------------------------------------------------------------------------- /bin/sparkly-testing: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright 2017 Tubular Labs, Inc. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import argparse 20 | import logging 21 | import sys 22 | import textwrap 23 | 24 | from sparkly.instant_testing import InstantTesting 25 | 26 | 27 | logging.basicConfig( 28 | stream=sys.stderr, 29 | level=logging.INFO, 30 | format='%(levelname)s %(message)s', 31 | ) 32 | 33 | 34 | if __name__ == '__main__': 35 | parser = argparse.ArgumentParser( 36 | formatter_class=argparse.RawDescriptionHelpFormatter, 37 | description=textwrap.dedent( 38 | """\ 39 | Sparkly Instant Testing. 40 | 41 | The tool speeds up iterative development on spark-based tests. 42 | It keeps JVM with initialised SparkContext running between multiple test sessions. 43 | 44 | Usage: 45 | sparkly-testing up 46 | py.test path/to/test_integration_with_pyspark.py # slow (first run) 47 | py.test path/to/test_integration_with_pyspark.py # fast (next runs) 48 | sparkly-testing down 49 | 50 | To change SparkContext options or to add new jars/packages call: 51 | sparkly-testing refresh 52 | """, 53 | ) 54 | ) 55 | 56 | sub_commands = parser.add_subparsers() 57 | 58 | # Instant testing mode. 
59 | sub_commands.add_parser( 60 | name='up', 61 | help='Activate instant testing mode.', 62 | ).set_defaults(func=lambda _: InstantTesting.activate()) 63 | 64 | sub_commands.add_parser( 65 | name='down', 66 | help='Deactivate instant testing mode.', 67 | ).set_defaults(func=lambda _: InstantTesting.deactivate()) 68 | 69 | sub_commands.add_parser( 70 | name='refresh', 71 | help='Refresh SparkContext options or add new jars/packages.', 72 | ).set_defaults(func=lambda _: InstantTesting.deactivate() or InstantTesting.activate()) 73 | 74 | args = parser.parse_args() 75 | 76 | if hasattr(args, 'func'): 77 | args.func(args) 78 | else: 79 | parser.print_help() 80 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | version: '2.1' 18 | services: 19 | dev: 20 | build: . 21 | depends_on: 22 | cassandra.docker: 23 | condition: service_healthy 24 | elastic.docker: 25 | condition: service_healthy 26 | kafka.docker: 27 | condition: service_healthy 28 | mysql.docker: 29 | condition: service_healthy 30 | redis.docker: 31 | condition: service_healthy 32 | volumes: 33 | - .:/opt/sparkly/ 34 | 35 | test: 36 | build: . 
37 | depends_on: 38 | cassandra.docker: 39 | condition: service_healthy 40 | elastic.docker: 41 | condition: service_healthy 42 | kafka.docker: 43 | condition: service_healthy 44 | mysql.docker: 45 | condition: service_healthy 46 | redis.docker: 47 | condition: service_healthy 48 | 49 | cassandra.docker: 50 | image: cassandra:4.1 51 | healthcheck: 52 | test: ["CMD-SHELL", "[ $$(nodetool statusgossip) = running ]"] 53 | 54 | elastic.docker: 55 | image: docker.elastic.co/elasticsearch/elasticsearch:7.17.8 56 | environment: 57 | - xpack.security.enabled=false 58 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 59 | - discovery.type=single-node 60 | healthcheck: 61 | test: "curl -f http://localhost:9200/_cat/health | grep green" 62 | interval: 5s 63 | timeout: 5s 64 | retries: 20 65 | 66 | mysql.docker: 67 | image: mysql:8.0 68 | environment: 69 | MYSQL_DATABASE: sparkly_test 70 | MYSQL_ALLOW_EMPTY_PASSWORD: "yes" 71 | healthcheck: 72 | test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"] 73 | 74 | kafka.docker: 75 | image: confluentinc/cp-kafka:7.3.0 76 | depends_on: 77 | zookeeper.docker: 78 | condition: service_healthy 79 | expose: 80 | - "9092" 81 | environment: 82 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://:9092 83 | KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true" 84 | KAFKA_ZOOKEEPER_CONNECT: zookeeper.docker:2181 85 | KAFKA_NUM_PARTITIONS: 3 86 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 87 | healthcheck: 88 | test: ps ax | grep kafka 89 | 90 | redis.docker: 91 | image: redis:7.0 92 | expose: 93 | - "6379" 94 | healthcheck: 95 | test: ["CMD", "redis-cli", "ping"] 96 | 97 | zookeeper.docker: 98 | image: confluent/zookeeper 99 | expose: 100 | - "2181" 101 | healthcheck: 102 | test: ps ax | grep zookeeper 103 | -------------------------------------------------------------------------------- /docs/source/catalog.rst: -------------------------------------------------------------------------------- 1 | Hive Metastore Utils 2 | ==================== 3 | 4 | About Hive Metastore 5 | -------------------- 6 | 7 | The Hive Metastore is a database with metadata for Hive tables. 8 | 9 | To configure ``SparklySession`` to work with external Hive Metastore, you need to set ``hive.metastore.uris`` option. 10 | You can do this via ``hive-site.xml`` file in spark config ($SPARK_HOME/conf/hive-site.xml): 11 | 12 | .. code-block:: xml 13 | 14 | 15 | hive.metastore.uris 16 | thrift://:9083 17 | IP address (or fully-qualified domain name) and port of the metastore host 18 | 19 | 20 | 21 | or set it dynamically via ``SparklySession`` options: 22 | 23 | .. code-block:: python 24 | 25 | class MySession(SparklySession): 26 | options = { 27 | 'hive.metastore.uris': 'thrift://:9083', 28 | } 29 | 30 | 31 | Tables management 32 | ----------------- 33 | 34 | **Why:** you need to check if tables exist, rename them, drop them, or even overwrite existing aliases in your catalog. 35 | 36 | .. code-block:: python 37 | 38 | from sparkly import SparklySession 39 | 40 | 41 | spark = SparklySession() 42 | 43 | assert spark.catalog_ext.has_table('my_table') in {True, False} 44 | spark.catalog_ext.rename_table('my_table', 'my_new_table') 45 | spark.catalog_ext.create_table('my_new_table', path='s3://my/parquet/data', source='parquet', mode='overwrite') 46 | spark.catalog_ext.drop_table('my_new_table') 47 | 48 | Table properties management 49 | --------------------------- 50 | 51 | **Why:** sometimes you want to assign custom attributes for your table, e.g. creation time, last update, purpose, data source. 
52 | The only way to interact with table properties in Spark is through raw SQL queries. 53 | We implemented a more convenient interface to make your code cleaner. 54 | 55 | .. code-block:: python 56 | 57 | from sparkly import SparklySession 58 | 59 | 60 | spark = SparklySession() 61 | spark.catalog_ext.set_table_property('my_table', 'foo', 'bar') 62 | assert spark.catalog_ext.get_table_property('my_table', 'foo') == 'bar' 63 | assert spark.catalog_ext.get_table_properties('my_table') == {'foo': 'bar'} 64 | 65 | *Note* properties are stored as strings. 66 | If you need other types, consider using a serialisation format, e.g. JSON. 67 | 68 | 69 | Using non-default database 70 | -------------------------- 71 | 72 | **Why:** to split your warehouse into logical groups (for example, by system components). 73 | In all catalog_ext.* methods you can specify fully qualified table names (e.g. ``my_database.my_table``) and 74 | they should operate properly. 75 | 76 | .. code-block:: python 77 | 78 | from time import time 79 | from sparkly import SparklySession 80 | 81 | spark = SparklySession() 82 | 83 | if spark.catalog_ext.has_database('my_database'): 84 | spark.catalog_ext.rename_table( 85 | 'my_database.my_badly_named_table', 86 | 'new_shiny_name', 87 | ) 88 | spark.catalog_ext.set_table_property( 89 | 'my_database.new_shiny_name', 90 | 'last_update_at', 91 | time(), 92 | ) 93 | 94 | *Note* be careful with 'USE' statements like spark.sql('USE my_database'): 95 | they are stateful and may lead to confusing errors if later code assumes a different current database. 96 | 97 | 98 | API documentation 99 | ----------------- 100 | 101 | .. automodule:: sparkly.catalog 102 | :members: 103 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # Copyright 2017 Tubular Labs, Inc. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # -*- coding: utf-8 -*- 20 | # 21 | # sparkly documentation build configuration file, created by 22 | # sphinx-quickstart on Tue Sep 20 08:46:42 2016. 23 | # 24 | # This file is execfile()d with the current directory set to its 25 | # containing dir. 26 | # 27 | # Note that not all possible configuration values are present in this 28 | # autogenerated file. 29 | # 30 | # All configuration values have a default; values that are commented out 31 | # serve to show the default. 32 | 33 | # If extensions (or modules to document with autodoc) are in another directory, 34 | # add these directories to sys.path here. If the directory is relative to the 35 | # documentation root, use os.path.abspath to make it absolute, like shown here. 36 | # 37 | import os 38 | import sys 39 | import sphinx_rtd_theme 40 | 41 | 42 | sys.path.insert(0, os.path.abspath('../..')) 43 | 44 | # -- General configuration ------------------------------------------------ 45 | 46 | # If your documentation needs a minimal Sphinx version, state it here. 
47 | # 48 | # needs_sphinx = '1.0' 49 | 50 | # Add any Sphinx extension module names here, as strings. They can be 51 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 52 | # ones. 53 | extensions = [ 54 | 'sphinx.ext.autodoc', 55 | 'sphinx.ext.doctest', 56 | 'sphinx.ext.coverage', 57 | 'sphinx.ext.viewcode', 58 | # 'sphinx.ext.githubpages', 59 | 'sphinx.ext.napoleon', 60 | ] 61 | 62 | # Add any paths that contain templates here, relative to this directory. 63 | templates_path = ['_templates'] 64 | 65 | # The suffix(es) of source filenames. 66 | # You can specify multiple suffix as a list of string: 67 | # 68 | # source_suffix = ['.rst', '.md'] 69 | source_suffix = '.rst' 70 | 71 | # The encoding of source files. 72 | # 73 | # source_encoding = 'utf-8-sig' 74 | 75 | # The master toctree document. 76 | master_doc = 'index' 77 | 78 | # General information about the project. 79 | project = 'sparkly' 80 | copyright = '2016, Tubular' 81 | author = 'Tubular' 82 | 83 | # The version info for the project you're documenting, acts as replacement for 84 | # |version| and |release|, also used in various other places throughout the 85 | # built documents. 86 | # 87 | import re 88 | 89 | with open(os.path.join(os.path.dirname(__file__), '../../sparkly/__init__.py')) as init_py: 90 | init_py_content = init_py.read() 91 | 92 | # The short X.Y version. 93 | version = re.search('__version__ = \'([\d.]+)[\w.]*\'', init_py_content).group(1).strip('.') 94 | 95 | # The full version, including alpha/beta/rc tags. 96 | release = re.search('__version__ = \'([\w.]+)\'', init_py_content).group(1) 97 | 98 | # The language for content autogenerated by Sphinx. Refer to documentation 99 | # for a list of supported languages. 100 | # 101 | # This is also used if you do content translation via gettext catalogs. 102 | # Usually you set "language" from the command line for these cases. 103 | language = 'en' 104 | 105 | # There are two options for replacing |today|: either, you set today to some 106 | # non-false value, then it is used: 107 | # 108 | # today = '' 109 | # 110 | # Else, today_fmt is used as the format for a strftime call. 111 | # 112 | # today_fmt = '%B %d, %Y' 113 | 114 | # List of patterns, relative to source directory, that match files and 115 | # directories to ignore when looking for source files. 116 | # This patterns also effect to html_static_path and html_extra_path 117 | exclude_patterns = [] 118 | 119 | # The reST default role (used for this markup: `text`) to use for all 120 | # documents. 121 | # 122 | # default_role = None 123 | 124 | # If true, '()' will be appended to :func: etc. cross-reference text. 125 | # 126 | # add_function_parentheses = True 127 | 128 | # If true, the current module name will be prepended to all description 129 | # unit titles (such as .. function::). 130 | # 131 | # add_module_names = True 132 | 133 | # If true, sectionauthor and moduleauthor directives will be shown in the 134 | # output. They are ignored by default. 135 | # 136 | # show_authors = False 137 | 138 | # The name of the Pygments (syntax highlighting) style to use. 139 | pygments_style = 'sphinx' 140 | 141 | # A list of ignored prefixes for module index sorting. 142 | # modindex_common_prefix = [] 143 | 144 | # If true, keep warnings as "system message" paragraphs in the built documents. 145 | # keep_warnings = False 146 | 147 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
148 | todo_include_todos = False 149 | 150 | 151 | # -- Options for HTML output ---------------------------------------------- 152 | 153 | # The theme to use for HTML and HTML Help pages. See the documentation for 154 | # a list of builtin themes. 155 | # 156 | html_theme = 'sphinx_rtd_theme' 157 | 158 | # Theme options are theme-specific and customize the look and feel of a theme 159 | # further. For a list of options available for each theme, see the 160 | # documentation. 161 | # 162 | # html_theme_options = {} 163 | 164 | # Add any paths that contain custom themes here, relative to this directory. 165 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 166 | 167 | # The name for this set of Sphinx documents. 168 | # " v documentation" by default. 169 | # 170 | # html_title = 'sparkly v0.3.1' 171 | 172 | # A shorter title for the navigation bar. Default is the same as html_title. 173 | # 174 | # html_short_title = None 175 | 176 | # The name of an image file (relative to this directory) to place at the top 177 | # of the sidebar. 178 | # 179 | # html_logo = None 180 | 181 | # The name of an image file (relative to this directory) to use as a favicon of 182 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 183 | # pixels large. 184 | # 185 | # html_favicon = None 186 | 187 | # Add any paths that contain custom static files (such as style sheets) here, 188 | # relative to this directory. They are copied after the builtin static files, 189 | # so a file named "default.css" will overwrite the builtin "default.css". 190 | #html_static_path = ['_static'] 191 | 192 | # Add any extra paths that contain custom files (such as robots.txt or 193 | # .htaccess) here, relative to this directory. These files are copied 194 | # directly to the root of the documentation. 195 | # 196 | # html_extra_path = [] 197 | 198 | # If not None, a 'Last updated on:' timestamp is inserted at every page 199 | # bottom, using the given strftime format. 200 | # The empty string is equivalent to '%b %d, %Y'. 201 | # 202 | # html_last_updated_fmt = None 203 | 204 | # If true, SmartyPants will be used to convert quotes and dashes to 205 | # typographically correct entities. 206 | # 207 | # html_use_smartypants = True 208 | 209 | # Custom sidebar templates, maps document names to template names. 210 | # 211 | # html_sidebars = {} 212 | 213 | # Additional templates that should be rendered to pages, maps page names to 214 | # template names. 215 | # 216 | # html_additional_pages = {} 217 | 218 | # If false, no module index is generated. 219 | # 220 | # html_domain_indices = True 221 | 222 | # If false, no index is generated. 223 | # 224 | # html_use_index = True 225 | 226 | # If true, the index is split into individual pages for each letter. 227 | # 228 | # html_split_index = False 229 | 230 | # If true, links to the reST sources are added to the pages. 231 | # 232 | # html_show_sourcelink = True 233 | 234 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 235 | # 236 | # html_show_sphinx = True 237 | 238 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 239 | # 240 | # html_show_copyright = True 241 | 242 | # If true, an OpenSearch description file will be output, and all pages will 243 | # contain a tag referring to it. The value of this option must be the 244 | # base URL from which the finished HTML is served. 245 | # 246 | # html_use_opensearch = '' 247 | 248 | # This is the file name suffix for HTML files (e.g. 
".xhtml"). 249 | # html_file_suffix = None 250 | 251 | # Language to be used for generating the HTML full-text search index. 252 | # Sphinx supports the following languages: 253 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 254 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 255 | # 256 | # html_search_language = 'en' 257 | 258 | # A dictionary with options for the search language support, empty by default. 259 | # 'ja' uses this config value. 260 | # 'zh' user can custom change `jieba` dictionary path. 261 | # 262 | # html_search_options = {'type': 'default'} 263 | 264 | # The name of a javascript file (relative to the configuration directory) that 265 | # implements a search results scorer. If empty, the default will be used. 266 | # 267 | # html_search_scorer = 'scorer.js' 268 | 269 | # Output file base name for HTML help builder. 270 | htmlhelp_basename = 'sparklydoc' 271 | 272 | # -- Options for LaTeX output --------------------------------------------- 273 | 274 | latex_elements = { 275 | # The paper size ('letterpaper' or 'a4paper'). 276 | # 277 | # 'papersize': 'letterpaper', 278 | 279 | # The font size ('10pt', '11pt' or '12pt'). 280 | # 281 | # 'pointsize': '10pt', 282 | 283 | # Additional stuff for the LaTeX preamble. 284 | # 285 | # 'preamble': '', 286 | 287 | # Latex figure (float) alignment 288 | # 289 | # 'figure_align': 'htbp', 290 | } 291 | 292 | # Grouping the document tree into LaTeX files. List of tuples 293 | # (source start file, target name, title, 294 | # author, documentclass [howto, manual, or own class]). 295 | latex_documents = [ 296 | (master_doc, 'sparkly.tex', 'sparkly Documentation', 297 | 'Tubular', 'manual'), 298 | ] 299 | 300 | # The name of an image file (relative to this directory) to place at the top of 301 | # the title page. 302 | # 303 | # latex_logo = None 304 | 305 | # For "manual" documents, if this is true, then toplevel headings are parts, 306 | # not chapters. 307 | # 308 | # latex_use_parts = False 309 | 310 | # If true, show page references after internal links. 311 | # 312 | # latex_show_pagerefs = False 313 | 314 | # If true, show URL addresses after external links. 315 | # 316 | # latex_show_urls = False 317 | 318 | # Documents to append as an appendix to all manuals. 319 | # 320 | # latex_appendices = [] 321 | 322 | # It false, will not define \strong, \code, itleref, \crossref ... but only 323 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 324 | # packages. 325 | # 326 | # latex_keep_old_macro_names = True 327 | 328 | # If false, no module index is generated. 329 | # 330 | # latex_domain_indices = True 331 | 332 | 333 | # -- Options for manual page output --------------------------------------- 334 | 335 | # One entry per manual page. List of tuples 336 | # (source start file, name, description, authors, manual section). 337 | man_pages = [ 338 | (master_doc, 'sparkly', 'sparkly Documentation', 339 | [author], 1) 340 | ] 341 | 342 | # If true, show URL addresses after external links. 343 | # 344 | # man_show_urls = False 345 | 346 | 347 | # -- Options for Texinfo output ------------------------------------------- 348 | 349 | # Grouping the document tree into Texinfo files. 
List of tuples 350 | # (source start file, target name, title, author, 351 | # dir menu entry, description, category) 352 | texinfo_documents = [ 353 | (master_doc, 'sparkly', 'sparkly Documentation', 354 | author, 'sparkly', 'One line description of project.', 355 | 'Miscellaneous'), 356 | ] 357 | 358 | # Documents to append as an appendix to all manuals. 359 | # 360 | # texinfo_appendices = [] 361 | 362 | # If false, no module index is generated. 363 | # 364 | # texinfo_domain_indices = True 365 | 366 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 367 | # 368 | # texinfo_show_urls = 'footnote' 369 | 370 | # If true, do not generate a @detailmenu in the "Top" node's menu. 371 | # 372 | # texinfo_no_detailmenu = False 373 | -------------------------------------------------------------------------------- /docs/source/functions.rst: -------------------------------------------------------------------------------- 1 | Column and DataFrame Functions 2 | ============================== 3 | 4 | A counterpart of pyspark.sql.functions providing useful shortcuts: 5 | 6 | - a cleaner alternative to chaining together multiple when/otherwise statements. 7 | - an easy way to join multiple dataframes at once and disambiguate fields with the same name. 8 | - agg function to select a value from the row that maximizes other column(s) 9 | 10 | 11 | API documentation 12 | ----------------- 13 | 14 | .. automodule:: sparkly.functions 15 | :members: 16 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to sparkly's documentation! 2 | =================================== 3 | 4 | Sparkly is a library that makes usage of pyspark more convenient and consistent. 5 | 6 | A brief tour on Sparkly features: 7 | 8 | .. code-block:: python 9 | 10 | # The main entry point is SparklySession, 11 | # you can think of it as of a combination of SparkSession and SparkSession.builder. 12 | from sparkly import SparklySession 13 | 14 | 15 | # Define dependencies in the code instead of messing with `spark-submit`. 16 | class MySession(SparklySession): 17 | # Spark packages and dependencies from Maven. 18 | packages = [ 19 | 'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11', 20 | 'mysql:mysql-connector-java:5.1.39', 21 | ] 22 | 23 | # Jars and Hive UDFs 24 | jars = ['/path/to/brickhouse-0.7.1.jar'], 25 | udfs = { 26 | 'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF', 27 | } 28 | 29 | 30 | spark = MySession() 31 | 32 | # Operate with interchangeable URL-like data source definitions: 33 | df = spark.read_ext.by_url('mysql:///my_database/my_database') 34 | df.write_ext('parquet:s3:////data?partition_by=') 35 | 36 | # Interact with Hive Metastore via convenient python api, 37 | # instead of verbose SQL queries: 38 | spark.catalog_ext.has_table('my_custom_table') 39 | spark.catalog_ext.get_table_properties('my_custom_table') 40 | 41 | # Easy integration testing with Fixtures and base test classes. 
42 | from pyspark.sql import types as T 43 | from sparkly.testing import SparklyTest 44 | 45 | 46 | class TestMyShinySparkScript(SparklyTest): 47 | session = MySession 48 | 49 | fixtures = [ 50 | MysqlFixture('', '', '', '/path/to/data.sql', '/path/to/clear.sql') 51 | ] 52 | 53 | def test_job_works_with_mysql(self): 54 | df = self.spark.read_ext.by_url('mysql:////?user=&password=') 55 | res_df = my_shiny_script(df) 56 | self.assertRowsEqual( 57 | res_df.collect(), 58 | [T.Row(fieldA='DataA', fieldB='DataB', fieldC='DataC')], 59 | ) 60 | 61 | .. toctree:: 62 | :maxdepth: 2 63 | 64 | session 65 | reader_and_writer 66 | catalog 67 | testing 68 | functions 69 | utils 70 | license 71 | 72 | .. automodule:: sparkly 73 | :members: 74 | 75 | Indices and tables 76 | ------------------ 77 | 78 | * :ref:`genindex` 79 | * :ref:`modindex` 80 | * :ref:`search` 81 | -------------------------------------------------------------------------------- /docs/source/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | .. include:: ../../LICENSE 5 | :literal: 6 | -------------------------------------------------------------------------------- /docs/source/reader_and_writer.rst: -------------------------------------------------------------------------------- 1 | .. _reader_and_writer: 2 | 3 | Read/write utilities for DataFrames 4 | =================================== 5 | 6 | Sparkly isn't trying to replace any of existing storage connectors. 7 | The goal is to provide a simplified and consistent api across a wide array of storage connectors. 8 | We also added the way to work with :ref:`abstract data sources `, 9 | so you can keep your code agnostic to the storages you use. 10 | 11 | .. _cassandra: 12 | 13 | Cassandra 14 | --------- 15 | 16 | Sparkly relies on the official spark cassandra connector and was successfully tested in production using version `2.4.0`. 17 | 18 | +---------------+---------------------------------------------------------------------------------------+ 19 | | Package | https://spark-packages.org/package/datastax/spark-cassandra-connector | 20 | +---------------+---------------------------------------------------------------------------------------+ 21 | | Configuration | https://github.com/datastax/spark-cassandra-connector/blob/v2.4.0/doc/reference.md | 22 | +---------------+---------------------------------------------------------------------------------------+ 23 | 24 | For using overwrite mode, it is needed to specify confirm.truncate as true. Otherwise, use append mode to update existing data. 25 | 26 | .. code-block:: python 27 | 28 | from sparkly import SparklySession 29 | 30 | 31 | class MySession(SparklySession): 32 | # Feel free to play with other versions 33 | packages = ['datastax:spark-cassandra-connector:2.4.0-s_2.11'] 34 | 35 | spark = MySession() 36 | 37 | # To read data 38 | df = spark.read_ext.cassandra('localhost', 'my_keyspace', 'my_table') 39 | # To write data 40 | df.write_ext.cassandra('localhost', 'my_keyspace', 'my_table') 41 | 42 | 43 | .. _elastic: 44 | 45 | Elastic 46 | ------- 47 | 48 | Sparkly relies on the official elastic spark connector and was successfully tested in production using version `6.5.4`. 
49 | 50 | +---------------+-----------------------------------------------------------------------------+ 51 | | Package | https://spark-packages.org/package/elastic/elasticsearch-hadoop | 52 | +---------------+-----------------------------------------------------------------------------+ 53 | | Configuration | https://www.elastic.co/guide/en/elasticsearch/hadoop/7.3/configuration.html | 54 | +---------------+-----------------------------------------------------------------------------+ 55 | 56 | .. code-block:: python 57 | 58 | from sparkly import SparklySession 59 | 60 | 61 | class MySession(SparklySession): 62 | # Feel free to play with other versions 63 | packages = ['org.elasticsearch:elasticsearch-spark-20_2.11:7.3.0'] 64 | 65 | spark = MySession() 66 | 67 | # To read data 68 | df = spark.read_ext.elastic('localhost', 'my_index', 'my_type', query='?q=awesomeness') 69 | # To write data 70 | df.write_ext.elastic('localhost', 'my_index', 'my_type') 71 | 72 | .. _kafka: 73 | 74 | Kafka 75 | ----- 76 | 77 | Sparkly's reader and writer for Kafka are built on top of the official spark package for Kafka-SQL. 78 | 79 | +---------------+------------------------------------------------------------------------------------------+ 80 | | Package | https://mvnrepository.com/artifact/org.apache.spark/spark-sql-kafka-0-10_2.11/2.4.0 | 81 | +---------------+------------------------------------------------------------------------------------------+ 82 | | Configuration | https://spark.apache.org/docs/2.4.0/structured-streaming-kafka-integration.html | 83 | +---------------+------------------------------------------------------------------------------------------+ 84 | 85 | .. code-block:: python 86 | 87 | import json 88 | 89 | from sparkly import SparklySession 90 | 91 | 92 | class MySession(SparklySession): 93 | packages = [ 94 | 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0', 95 | ] 96 | 97 | spark = MySession() 98 | 99 | # To read JSON messaged from Kafka into a dataframe: 100 | 101 | # 1. Define a schema of the messages you read. 102 | df_schema = StructType([ 103 | StructField('key', StructType([ 104 | StructField('id', StringType(), True) 105 | ])), 106 | StructField('value', StructType([ 107 | StructField('name', StringType(), True), 108 | StructField('surname', StringType(), True), 109 | ])) 110 | ]) 111 | 112 | # 2. Specify the schema as a reader parameter. 113 | df = hc.read_ext.kafka( 114 | 'kafka.host', 115 | topic='my.topic', 116 | # key & value deserialization is optional; if not provided, 117 | # then the user will have to deal with decoding the binary directly. 118 | key_deserializer=lambda item: json.loads(item.decode('utf-8')), 119 | value_deserializer=lambda item: json.loads(item.decode('utf-8')), 120 | # if deserializers are used, the schema must be provided: 121 | schema=df_schema, 122 | ) 123 | 124 | # To write a dataframe to Kafka in JSON format: 125 | df.write_ext.kafka( 126 | 'kafka.host', 127 | topic='my.topic', 128 | # key & value serialization is optional; if not provided, 129 | # the `key` and `value` columns MUST already be StringType or BinaryType 130 | key_serializer=lambda item: json.dumps(item).encode('utf-8'), 131 | value_serializer=lambda item: json.dumps(item).encode('utf-8'), 132 | ) 133 | 134 | .. _mysql: 135 | 136 | MySQL 137 | ----- 138 | 139 | Basically, it's just a high level api on top of the native 140 | `jdbc reader `_ and 141 | `jdbc writer `_. 
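For context, the sketch below shows roughly what the same read looks like through Spark's built-in JDBC data source, which ``read_ext.mysql`` wraps. It is illustrative only: the host, port, database, credentials and the ``com.mysql.jdbc.Driver`` class name (which differs between Connector/J versions) are placeholder assumptions rather than part of sparkly's API.

.. code-block:: python

    # Roughly equivalent read via the plain JDBC data source (illustrative sketch;
    # host, database and credentials are placeholders).
    df = spark.read.format('jdbc').options(
        url='jdbc:mysql://localhost:3306/my_database',
        driver='com.mysql.jdbc.Driver',  # assumed Connector/J 5.x driver class
        dbtable='my_table',
        user='root',
        password='root',
    ).load()

Sparkly's helper simply assembles the JDBC URL and options for you, so the two forms should produce the same ``DataFrame``.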
142 | 143 | +---------------+--------------------------------------------------------------------------------------------------+ 144 | | Jars | https://mvnrepository.com/artifact/mysql/mysql-connector-java | 145 | +---------------+--------------------------------------------------------------------------------------------------+ 146 | | Configuration | https://dev.mysql.com/doc/connector-j/5.1/en/connector-j-reference-configuration-properties.html | 147 | +---------------+--------------------------------------------------------------------------------------------------+ 148 | 149 | .. code-block:: python 150 | 151 | from sparkly import SparklySession 152 | from sparkly.utils import absolute_path 153 | 154 | 155 | class MySession(SparklySession): 156 | # Feel free to play with other versions. 157 | packages = ['mysql:mysql-connector-java:6.0.6'] 158 | 159 | 160 | spark = MySession() 161 | 162 | # To read data 163 | df = spark.read_ext.mysql('localhost', 'my_database', 'my_table', 164 | options={'user': 'root', 'password': 'root'}) 165 | # To write data 166 | df.write_ext.mysql('localhost', 'my_database', 'my_table', options={ 167 | 'user': 'root', 168 | 'password': 'root', 169 | 'rewriteBatchedStatements': 'true', # improves write throughput dramatically 170 | }) 171 | 172 | .. _redis: 173 | 174 | Redis 175 | ----- 176 | 177 | Sparkly provides a writer for Redis that is built on top of the official redis python library 178 | `redis-py `_ . 179 | It is currently capable of exporting your DataFrame as a JSON blob per row or group of rows. 180 | 181 | .. note:: 182 | - To interact with Redis, ``sparkly`` needs the ``redis`` library. You can get it via: 183 | ``pip install sparkly[redis]`` 184 | 185 | .. code-block:: python 186 | 187 | import json 188 | 189 | from sparkly import SparklySession 190 | 191 | 192 | spark = SparklySession() 193 | 194 | # Write JSON.gz data indexed by col1.col2 that will expire in a day 195 | df.write_ext.redis( 196 | host='localhost', 197 | port=6379, 198 | key_by=['col1', 'col2'], 199 | exclude_key_columns=True, 200 | expire=24 * 60 * 60, 201 | compression='gzip', 202 | ) 203 | 204 | 205 | .. _universal-reader-and-writer: 206 | 207 | Universal reader/writer 208 | ----------------------- 209 | 210 | The `DataFrame` abstraction is really powerful when it comes to transformations. 211 | You can shape your data from various storages using exactly the same api. 212 | For instance, you can join data from Cassandra with data from Elasticsearch and write the result to MySQL. 213 | 214 | The only problem - you have to explicitly define sources (or destinations) in order to create (or export) a `DataFrame`. 215 | But the source/destination of data doesn't really change the logic of transformations (if the schema is preserved). 216 | To solve the problem, we decided to add the universal api to read/write `DataFrames`: 217 | 218 | .. 
code-block:: python 219 | 220 | from sparkly import SparklyContext 221 | 222 | class MyContext(SparklyContext): 223 | packages = [ 224 | 'datastax:spark-cassandra-connector:1.6.1-s_2.10', 225 | 'com.databricks:spark-csv_2.10:1.4.0', 226 | 'org.elasticsearch:elasticsearch-spark_2.10:6.5.4', 227 | ] 228 | 229 | hc = MyContext() 230 | 231 | # To read data 232 | df = hc.read_ext.by_url('cassandra://localhost/my_keyspace/my_table?consistency=ONE') 233 | df = hc.read_ext.by_url('csv:s3://my-bucket/my-data?header=true') 234 | df = hc.read_ext.by_url('elastic://localhost/my_index/my_type?q=awesomeness') 235 | df = hc.read_ext.by_url('parquet:hdfs://my.name.node/path/on/hdfs') 236 | 237 | # To write data 238 | df.write_ext.by_url('cassandra://localhost/my_keyspace/my_table?consistency=QUORUM&parallelism=8') 239 | df.write_ext.by_url('csv:hdfs://my.name.node/path/on/hdfs') 240 | df.write_ext.by_url('elastic://localhost/my_index/my_type?parallelism=4') 241 | df.write_ext.by_url('parquet:s3://my-bucket/my-data?header=false') 242 | 243 | 244 | .. _controlling-the-load: 245 | 246 | Controlling the load 247 | -------------------- 248 | 249 | From the official documentation: 250 | 251 | | Don’t create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. 252 | 253 | link: 254 | 255 | It's very good advice, but in practice it's hard to track the number of partitions. 256 | For instance, if you write the result of a join operation to a database, the number of splits 257 | might change implicitly via `spark.sql.shuffle.partitions`. 258 | 259 | To keep us from shooting ourselves in the foot, we added a `parallelism` option to all our readers and writers. 260 | The option is designed to control the load on the source we write to / read from. 261 | It's especially useful when you are working with data stores like Cassandra, MySQL or Elastic. 262 | However, the implementation of the throttling has some drawbacks, and you should be aware of them. 263 | 264 | The way we implemented it is pretty simple: we use `coalesce` on a dataframe 265 | to reduce the number of tasks that will be executed in parallel. 266 | Let's say you have a dataframe with 1000 splits and you want to write no more than 10 tasks 267 | in parallel. In that case `coalesce` will create a dataframe that has 10 splits 268 | with 100 original tasks in each. An outcome of this: if any of these 100 tasks fails, 269 | we have to retry the whole pack of 100 tasks. 270 | 271 | `Read more about coalesce `_ 272 | 273 | Reader API documentation 274 | ------------------------ 275 | 276 | .. automodule:: sparkly.reader 277 | :members: 278 | 279 | Writer API documentation 280 | ------------------------ 281 | 282 | .. automodule:: sparkly.writer 283 | :members: 284 | -------------------------------------------------------------------------------- /docs/source/session.rst: -------------------------------------------------------------------------------- 1 | Sparkly Session 2 | =============== 3 | 4 | ``SparklySession`` is the main entry point to sparkly's functionality. 5 | It's derived from ``SparkSession`` to provide additional features on top of the default session. 6 | There are two main differences between ``SparkSession`` and ``SparklySession``: 7 | 8 | 1. ``SparklySession`` doesn't have a ``builder`` attribute, 9 | because we prefer declarative session definition over imperative. 10 | 2. Hive support is enabled by default (see the sketch just below).
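As a small illustration of the second point, a bare session can talk to the Hive metastore straight away. This is only a sketch; ``my_table`` is a hypothetical table name:

.. code-block:: python

    from sparkly import SparklySession

    spark = SparklySession()

    # Hive support is already enabled, so metastore queries work out of the box.
    spark.sql('SHOW DATABASES').show()
    spark.catalog_ext.has_table('my_table')  # hypothetical table name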
11 | 12 | The example below shows both imperative and declarative approaches: 13 | 14 | .. code-block:: python 15 | 16 | # PySpark-style (imperative) 17 | from pyspark import SparkSession 18 | 19 | spark = SparkSession.builder\ 20 | .appName('My App')\ 21 | .master('spark://')\ 22 | .config('spark.sql.shuffle.partitions', 10)\ 23 | .getOrCreate() 24 | 25 | # Sparkly-style (declarative) 26 | from sparkly import SparklySession 27 | 28 | class MySession(SparklySession): 29 | options = { 30 | 'spark.app.name': 'My App', 31 | 'spark.master': 'spark://', 32 | 'spark.sql.shuffle.partitions': 10, 33 | } 34 | 35 | spark = MySession() 36 | 37 | # In case you want to change default options 38 | spark = MySession({'spark.app.name': 'My Awesome App'}) 39 | 40 | # In case you want to access the session singleton 41 | spark = MySession.get_or_create() 42 | 43 | 44 | Installing dependencies 45 | ----------------------- 46 | 47 | **Why**: Spark forces you to specify dependencies (spark packages or maven artifacts) 48 | when a spark job is submitted (something like ``spark-submit --packages=...``). 49 | We prefer a code-first approach where dependencies are actually 50 | declared as part of the job. 51 | 52 | **For example**: You want to read data from Cassandra. 53 | 54 | .. code-block:: python 55 | 56 | from sparkly import SparklySession 57 | 58 | 59 | class MySession(SparklySession): 60 | # Define a list of spark packages or maven artifacts. 61 | packages = [ 62 | 'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11', 63 | ] 64 | 65 | # Dependencies will be fetched during the session initialisation. 66 | spark = MySession() 67 | 68 | # Here is how you now can access a dataset in Cassandra. 69 | df = spark.read_ext.by_url('cassandra:////?consistency=QUORUM') 70 | 71 | 72 | Custom Maven repositories 73 | ------------------------- 74 | 75 | **Why**: If you have a private maven repository, this is how to point spark to it when it performs a package lookup. 76 | Order in which dependencies will be resolved is next: 77 | - Local cache 78 | - Custom maven repositories (if specified) 79 | - Maven Central 80 | 81 | **For example**: Let's assume your maven repository is available on: http://my.repo.net/maven, 82 | and there is some spark package published there, with identifier: `my.corp:spark-handy-util:0.0.1` 83 | You can install it to a spark session like this: 84 | 85 | .. code-block:: python 86 | 87 | from sparkly import SparklySession 88 | 89 | class MySession(SparklySession): 90 | repositories = ['http://my.repo.net/maven'] 91 | packages = ['my.corp:spark-handy-util:0.0.1'] 92 | 93 | spark = MySession() 94 | 95 | 96 | Tuning options 97 | -------------- 98 | 99 | **Why**: You want to customise your spark session. 100 | 101 | **For example**: 102 | 103 | - ``spark.sql.shuffle.partitions`` to tune shuffling; 104 | - ``hive.metastore.uris`` to connect to your own HiveMetastore; 105 | - ``spark.hadoop.avro.mapred.ignore.inputs.without.extension`` package specific options. 106 | 107 | .. code-block:: python 108 | 109 | from sparkly import SparklySession 110 | 111 | 112 | class MySession(SparklySession): 113 | options = { 114 | # Increase the default amount of partitions for shuffling. 115 | 'spark.sql.shuffle.partitions': 1000, 116 | # Setup remote Hive Metastore. 117 | 'hive.metastore.uris': 'thrift://:9083,thrift://:9083', 118 | # Ignore files without `avro` extensions. 
119 | 'spark.hadoop.avro.mapred.ignore.inputs.without.extension': 'false', 120 | } 121 | 122 | # You can also overwrite or add some options at initialisation time. 123 | spark = MySession({'spark.sql.shuffle.partitions': 10}) 124 | 125 | 126 | Tuning options through shell environment 127 | ---------------------------------------- 128 | 129 | **Why**: You want to customize your spark session in a way that depends on the 130 | hardware specifications of your worker (or driver) machine(s), so you'd rather 131 | define them close to where the actual machine specs are requested / defined. 132 | Or you just want to test some new configuration without having to change your 133 | code. In both cases, you can do so by using the environmental variable 134 | ``PYSPARK_SUBMIT_ARGS``. Note that any options defined this way will override 135 | any conflicting options from your Python code. 136 | 137 | **For example**: 138 | 139 | - ``spark.executor.cores`` to tune the cores used by each executor; 140 | - ``spark.executor.memory`` to tune the memory available to each executor. 141 | 142 | .. code-block:: sh 143 | 144 | PYSPARK_SUBMIT_ARGS='--conf "spark.executor.cores=32" --conf "spark.executor.memory=160g"' \ 145 | ./my_spark_app.py 146 | 147 | 148 | Using UDFs 149 | ---------- 150 | 151 | **Why**: To start using Java UDF you have to import JAR file 152 | via SQL query like ``add jar ../path/to/file`` and then call ``registerJavaFunction``. 153 | We think it's too many actions for such simple functionality. 154 | 155 | **For example**: You want to import UDFs from `brickhouse library `_. 156 | 157 | .. code-block:: python 158 | 159 | from pyspark.sql.types import IntegerType 160 | from sparkly import SparklySession 161 | 162 | 163 | def my_own_udf(item): 164 | return len(item) 165 | 166 | 167 | class MySession(SparklySession): 168 | # Import local jar files. 169 | jars = [ 170 | '/path/to/brickhouse.jar' 171 | ] 172 | # Define UDFs. 173 | udfs = { 174 | 'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF', # Java UDF. 175 | 'my_udf': (my_own_udf, IntegerType()), # Python UDF. 176 | } 177 | 178 | spark = MySession() 179 | 180 | spark.sql('SELECT collect_max(amount) FROM my_data GROUP BY ...') 181 | spark.sql('SELECT my_udf(amount) FROM my_data') 182 | 183 | 184 | Lazy access / initialization 185 | ---------------------------- 186 | 187 | **Why**: A lot of times you might need access to the sparkly session at a low-level, 188 | deeply nested function in your code. A first approach is to declare a global sparkly 189 | session instance that you access explicitly, but this usually makes testing painful 190 | because of unexpected importing side effects. A second approach is to pass the session 191 | instance explicitly as a function argument, but this makes the code ugly since you then 192 | need to propagate that argument all the way up to every caller of that function. 193 | 194 | Other times you might want to be able to glue together and run one after the other 195 | different code segments, where each segment initializes its own sparkly session, 196 | despite the sessions being identical. This situation could occur when you are doing 197 | investigative work in a notebook. 198 | 199 | In both cases, ``SparklySession.get_or_create`` is the answer, as it solves the 200 | problems mentioned above while keeping your code clean and tidy. 201 | 202 | 203 | **For example**: You want to use a read function within a transformation. 204 | 205 | .. 
code-block:: python 206 | 207 | from sparkly import SparklySession 208 | 209 | 210 | class MySession(SparklySession): 211 | pass 212 | 213 | def my_awesome_transformation(): 214 | df = read_dataset('parquet:s3://path/to/my/data') 215 | df2 = read_dataset('parquet:s3://path/to/my/other/data') 216 | # do something with df and df2... 217 | 218 | def read_dataset(url): 219 | spark = MySession.get_or_create() 220 | return spark.read_ext.by_url(url) 221 | 222 | 223 | API documentation 224 | ----------------- 225 | 226 | .. automodule:: sparkly.session 227 | :members: 228 | -------------------------------------------------------------------------------- /docs/source/testing.rst: -------------------------------------------------------------------------------- 1 | Testing Utils 2 | ============= 3 | 4 | Base TestCases 5 | -------------- 6 | 7 | There are two main test cases available in Sparkly: 8 | - ``SparklyTest`` creates a new session for each test case. 9 | - ``SparklyGlobalSessionTest`` uses a single sparkly session for all test cases to boost performance. 10 | 11 | .. code-block:: python 12 | 13 | from pyspark.sql import types as T 14 | 15 | from sparkly import SparklySession 16 | from sparkly.testing import SparklyTest, SparklyGlobalSessionTest 17 | 18 | 19 | class MyTestCase(SparklyTest): 20 | session = SparklySession 21 | 22 | def test(self): 23 | df = self.spark.read_ext.by_url(...) 24 | 25 | # Compare all fields 26 | self.assertRowsEqual( 27 | df.collect(), 28 | [ 29 | T.Row(col1='row1', col2=1), 30 | T.Row(col1='row2', col2=2), 31 | ], 32 | ) 33 | 34 | ... 35 | 36 | class MyTestWithReusableSession(SparklyGlobalSessionTest): 37 | context = SparklySession 38 | 39 | def test(self): 40 | df = self.spark.read_ext.by_url(...) 41 | 42 | ... 43 | 44 | 45 | DataFrame Assertions 46 | -------------------- 47 | 48 | Asserting that the dataframe produced by your transformation is equal to some expected 49 | output can be unnecessarily complicated at times. Common issues include: 50 | 51 | - Ignoring the order in which elements appear in an array. 52 | This could be particularly useful when that array is generated as part of a 53 | ``groupBy`` aggregation, and you only care about all elements being part of the end 54 | result, rather than the order in which Spark encountered them. 55 | - Comparing floats that could be arbitrarily nested in complicated datatypes 56 | within a given tolerance; exact matching is either fragile or impossible. 57 | - Ignoring whether a field of a complex datatype is nullable. 58 | Spark infers this based on the applied transformations, but it is oftentimes 59 | inaccurate. As a result, assertions on complex data types might fail, even 60 | though in theory they shouldn't have. 61 | - Having rows with different field names compare equal if the values match in 62 | alphabetical order of the names (see unit tests for example). 63 | - Unhelpful diffs in case of mismatches. 64 | 65 | Sparkly addresses these issues by providing ``assertRowsEqual``: 66 | 67 | .. 
code-block:: python 68 | 69 | from pyspark.sql import types as T 70 | 71 | from sparkly import SparklySession 72 | from sparkly.test import SparklyTest 73 | 74 | 75 | def my_transformation(spark): 76 | return spark.createDataFrame( 77 | data=[ 78 | ('row1', {'field': 'value_1'}, [1.1, 2.2, 3.3]), 79 | ('row2', {'field': 'value_2'}, [4.1, 5.2, 6.3]), 80 | ], 81 | schema=T.StructType([ 82 | T.StructField('id', T.StringType()), 83 | T.StructField( 84 | 'st', 85 | T.StructType([ 86 | T.StructField('field', T.StringType()), 87 | ]), 88 | ), 89 | T.StructField('ar', T.ArrayType(T.FloatType())), 90 | ]), 91 | ) 92 | 93 | 94 | class MyTestCase(SparklyTest): 95 | session = SparklySession 96 | 97 | def test(self): 98 | df = my_transformation(self.spark) 99 | 100 | self.assertRowsEqual( 101 | df.collect(), 102 | [ 103 | T.Row(id='row2', st=T.Row(field='value_2'), ar=[6.0, 5.0, 4.0]), 104 | T.Row(id='row1', st=T.Row(field='value_1'), ar=[2.0, 3.0, 1.0]), 105 | ], 106 | atol=0.5, 107 | ) 108 | 109 | 110 | Instant Iterative Development 111 | ----------------------------- 112 | 113 | The slowest part in Spark integration testing is context initialisation. 114 | ``SparklyGlobalSessionTest`` allows you to keep the same instance of spark context between different test cases, 115 | but it still kills the context at the end. It's especially annoying if you work in `TDD fashion `_. 116 | On each run you have to wait 25-30 seconds till a new context is ready. 117 | We added a tool to preserve spark context between multiple test runs. 118 | 119 | .. code-block:: bash 120 | 121 | # Activate instant testing mode. 122 | sparkly-testing up 123 | 124 | # The first run is slow (context is created). 125 | py.test tests/my_integration_test_with_sparkly.py 126 | 127 | # The second run and all after it are fast (context is reused). 128 | py.test tests/my_integration_test_with_sparkly.py 129 | 130 | # Deactivate instant testing mode (when you are done with testing). 131 | sparkly-testing down 132 | 133 | .. note:: 134 | In case if you change ``SparklySession`` definition (new options, jars or packages) 135 | you have to refresh the context via ``sparkly-testing refresh``. 136 | However, you don't need to refresh context if ``udfs`` are changed. 137 | 138 | 139 | Fixtures 140 | -------- 141 | 142 | "Fixture" is a term borrowed from Django framework. 143 | Fixtures load data to a database before the test execution. 144 | 145 | There are several storages supported in Sparkly: 146 | - Elastic 147 | - Cassandra (requires ``cassandra-driver``) 148 | - Mysql (requires ``PyMySql``) 149 | - Kafka (requires ``kafka-python``) 150 | 151 | .. code-block:: python 152 | 153 | from sparkly.test import MysqlFixture, SparklyTest 154 | 155 | 156 | class MyTestCase(SparklyTest): 157 | ... 158 | fixtures = [ 159 | MysqlFixture('mysql.host', 160 | 'user', 161 | 'password', 162 | '/path/to/setup_data.sql', 163 | '/path/to/remove_data.sql') 164 | ] 165 | ... 166 | 167 | .. automodule:: sparkly.testing 168 | :members: 169 | -------------------------------------------------------------------------------- /docs/source/utils.rst: -------------------------------------------------------------------------------- 1 | Generic Utils 2 | ============= 3 | 4 | These are generic utils used in Sparkly. 5 | 6 | .. 
automodule:: sparkly.utils 7 | :members: 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | pylru==1.0.9 18 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | mock==1.3.0 18 | pytest==6.2.5 19 | pytest-cov==3.0.0 20 | Sphinx==4.2.0 21 | sphinx_rtd_theme==1.0.0 22 | tox==3.24.4 23 | -------------------------------------------------------------------------------- /requirements_extras.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | cassandra-driver==3.25.0 18 | PyMySQL==0.9.3 19 | kafka-python==2.0.2 20 | redis==2.10.5 21 | ujson==1.35 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | [bdist_wheel] 18 | # This flag says that the code is written to work on both Python 2 and Python 19 | # 3. If at all possible, it is good practice to do this. If you cannot, you 20 | # will need to generate wheels for each Python version that you support. 21 | universal=1 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from codecs import open 18 | import os 19 | import re 20 | 21 | from setuptools import setup, find_packages 22 | 23 | 24 | here = os.path.abspath(os.path.dirname(__file__)) 25 | 26 | # Get version 27 | with open(os.path.join(here, 'sparkly/__init__.py'), 'rb') as init_py: 28 | version = re.search('__version__ = \'([\w.]+)\'', init_py.read().decode('utf-8')).group(1) 29 | 30 | # Get the long description from the relevant file 31 | with open(os.path.join(here, 'README.rst'), 'rb') as readme_rst: 32 | long_description = readme_rst.read().decode('utf-8') 33 | 34 | # Get requirements 35 | with open(os.path.join(here, 'requirements.txt')) as requirements_txt: 36 | requirements = [req for req in requirements_txt.readlines() if re.match(u'^[^#\-\s]', req)] 37 | 38 | 39 | setup( 40 | name='sparkly', 41 | 42 | # Versions should comply with PEP440. For a discussion on single-sourcing 43 | # the version across setup.py and the project code, see 44 | # https://packaging.python.org/en/latest/single_source_version.html 45 | version=version, 46 | 47 | description='Helpers & syntax sugar for PySpark.', 48 | long_description=long_description, 49 | 50 | # The project's main homepage. 51 | url='https://github.com/Tubular/sparkly', 52 | 53 | # Author details 54 | author='Tubular Engineering', 55 | author_email='dev@tubularlabs.com', 56 | 57 | # License 58 | license='Apache License 2.0', 59 | 60 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 61 | classifiers=[ 62 | 'Development Status :: 5 - Production/Stable', 63 | 64 | # Indicate who your project is intended for 65 | 'Intended Audience :: Developers', 66 | 'Topic :: Software Development :: Build Tools', 67 | 68 | # Pick your license as you wish (should match "license" above) 69 | 'License :: OSI Approved :: Apache Software License', 70 | 71 | # Specify the Python versions you support here. In particular, ensure 72 | # that you indicate whether you support Python 2, Python 3 or both. 73 | 'Programming Language :: Python :: 3', 74 | 'Programming Language :: Python :: 3.7', 75 | 'Programming Language :: Python :: 3.8', 76 | 'Programming Language :: Python :: 3.10', 77 | ], 78 | 79 | # What does your project relate to? 
80 | keywords='sparkly spark pyspark', 81 | 82 | # You can just specify the packages manually here if your project is 83 | # simple. Or you can use find_packages(). 84 | packages=find_packages(exclude=['contrib', 'docs', 'tests*']), 85 | scripts=['bin/sparkly-testing'], 86 | include_package_data=True, 87 | 88 | # List run-time dependencies here. These will be installed by pip when 89 | # your project is installed. For an analysis of "install_requires" vs pip's 90 | # requirements files see: 91 | # https://packaging.python.org/en/latest/requirements.html 92 | install_requires=requirements, 93 | extras_require={ 94 | 'redis': ['redis>=2.10,<3', 'ujson>=1.33,<2'], 95 | 'test': [ 96 | 'cassandra-driver>=3.25,<3.26', 97 | 'PyMySQL>=0.7,<0.10', 98 | 'kafka-python>=2.0.2,<2.1', 99 | 'redis>=2.10,<3', 100 | 'ujson>=1.33,<2', 101 | ], 102 | }, 103 | ) 104 | -------------------------------------------------------------------------------- /sparkly/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from sparkly.session import SparklySession 18 | 19 | assert SparklySession 20 | 21 | 22 | __version__ = '3.0.0' 23 | -------------------------------------------------------------------------------- /sparkly/catalog.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import uuid 17 | 18 | from pyspark.sql import functions as F 19 | from pyspark.sql import utils as U 20 | 21 | 22 | class SparklyCatalog(object): 23 | """A set of tools to interact with HiveMetastore.""" 24 | 25 | def __init__(self, spark): 26 | """Constructor. 27 | 28 | Args: 29 | spark (sparkly.SparklySession) 30 | """ 31 | self._spark = spark 32 | 33 | def create_table(self, table_name, path=None, source=None, schema=None, **options): 34 | """Create table in the metastore. 35 | 36 | Extend ``SparkSession.Catalog.createExternalTable`` by accepting 37 | a ``mode='overwrite'`` option which creates the table even if a 38 | table with the same name already exists. All other args are 39 | exactly the same. 40 | 41 | Note: 42 | If the table exists, create two unique names, one for the 43 | new and one for the old instance, then try to swap names 44 | and drop the "old" instance. 
If any step fails, the metastore 45 | might be currently left at a broken state. 46 | 47 | Args: 48 | mode (str): if set to ``'overwrite'``, drop any table of the 49 | same name from the metastore. Given as a kwarg. Default 50 | is error out if table already exists. 51 | 52 | Returns: 53 | pyspark.sql.DataFrame: DataFrame associated with the created 54 | table. 55 | """ 56 | overwrite_existing_table = ( 57 | options.pop('mode', '').lower() == 'overwrite' and 58 | self.has_table(table_name) 59 | ) 60 | 61 | def _append_unique_suffix(*args): 62 | return '__'.join(args + (uuid.uuid4().hex, )) 63 | 64 | if overwrite_existing_table: 65 | new_table_name = _append_unique_suffix(table_name, 'new') 66 | else: 67 | new_table_name = table_name 68 | 69 | if hasattr(self._spark.catalog, 'createTable'): 70 | createTable = self._spark.catalog.createTable 71 | else: # before Spark 2.2 72 | createTable = self._spark.catalog.createExternalTable 73 | 74 | df = createTable( 75 | new_table_name, 76 | path=path, 77 | source=source, 78 | schema=schema, 79 | **options 80 | ) 81 | 82 | if overwrite_existing_table: 83 | old_table_name = _append_unique_suffix(table_name, 'old') 84 | self.rename_table(table_name, old_table_name) 85 | self.rename_table(new_table_name, table_name) 86 | self.drop_table(old_table_name) 87 | 88 | return df 89 | 90 | def drop_table(self, table_name, checkfirst=True): 91 | """Drop table from the metastore. 92 | 93 | Note: 94 | Follow the official documentation to understand `DROP TABLE` semantic. 95 | https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL\ 96 | #LanguageManualDDL-DropTable 97 | 98 | Args: 99 | table_name (str): A table name. 100 | checkfirst (bool): Only issue DROPs for tables that are presented in the database. 101 | """ 102 | db_name = get_db_name(table_name) 103 | if checkfirst and not self.has_database(db_name): 104 | return 105 | 106 | drop_statement = 'DROP TABLE IF EXISTS' if checkfirst else 'DROP TABLE' 107 | return self._spark.sql( 108 | '{} {}'.format(drop_statement, table_name) 109 | ) 110 | 111 | def has_table(self, table_name): 112 | """Check if table is available in the metastore. 113 | 114 | Args: 115 | table_name (str): A table name. 116 | 117 | Returns: 118 | bool 119 | """ 120 | 121 | if not table_name: 122 | return False 123 | 124 | try: 125 | self._spark.sql('SELECT 1 FROM {} WHERE 1=0'.format(table_name)) 126 | except U.AnalysisException: 127 | return False 128 | 129 | return True 130 | 131 | def has_database(self, db_name): 132 | """Check if database exists in the metastore. 133 | 134 | Args: 135 | db_name (str): Database name. 136 | 137 | Returns: 138 | bool 139 | """ 140 | if not db_name: 141 | return True 142 | 143 | for db in self._spark.catalog.listDatabases(): 144 | if db_name == db.name: 145 | return True 146 | 147 | return False 148 | 149 | def rename_table(self, old_table_name, new_table_name): 150 | """Rename table in the metastore. 151 | 152 | Note: 153 | Follow the official documentation to understand `ALTER TABLE` semantic. 154 | https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL\ 155 | #LanguageManualDDL-RenameTable 156 | 157 | Args: 158 | old_table_name (str): The current table name. 159 | new_table_name (str): An expected table name. 160 | """ 161 | self._spark.sql('ALTER TABLE {} RENAME TO {}'.format(old_table_name, new_table_name)) 162 | 163 | def get_table_property(self, table_name, property_name, to_type=None): 164 | """Get table property value from the metastore. 
165 | 166 | Args: 167 | table_name (str): A table name. Might contain a db name. 168 | E.g. "my_table" or "default.my_table". 169 | property_name (str): A property name to read value for. 170 | to_type (function): Cast value to the given type. E.g. `int` or `float`. 171 | 172 | Returns: 173 | Any 174 | """ 175 | if not to_type: 176 | to_type = str 177 | 178 | df = self._spark.sql("SHOW TBLPROPERTIES {}('{}')".format(table_name, property_name)) 179 | prop_val = df.collect()[0].value.strip() 180 | 181 | if 'does not have property' not in prop_val: 182 | return to_type(prop_val) 183 | 184 | def get_table_properties(self, table_name): 185 | """Get table properties from the metastore. 186 | 187 | Args: 188 | table_name (str): A table name. 189 | 190 | Returns: 191 | dict[str,str]: Key/value for properties. 192 | """ 193 | rows = self._spark.sql('SHOW TBLPROPERTIES {}'.format(table_name)).collect() 194 | return {row.key: row.value for row in rows} 195 | 196 | def set_table_property(self, table_name, property_name, value): 197 | """Set value for table property. 198 | 199 | Args: 200 | table_name (str): A table name. 201 | property_name (str): A property name to set value for. 202 | value (Any): Will be automatically casted to string. 203 | """ 204 | self._spark.sql("ALTER TABLE {} SET TBLPROPERTIES ('{}'='{}')".format( 205 | table_name, property_name, value 206 | )) 207 | 208 | def get_database_property(self, db_name, property_name, to_type=None): 209 | """Read value for database property. 210 | 211 | Args: 212 | db_name (str): A database name. 213 | property_name (str): A property name to read value for. 214 | to_type (function): Cast value to the given type. E.g. `int` or `float`. 215 | 216 | Returns: 217 | Any 218 | """ 219 | if not to_type: 220 | to_type = str 221 | 222 | value = self.get_database_properties(db_name).get(property_name) 223 | if value is not None: 224 | return to_type(value) 225 | 226 | def get_database_properties(self, db_name): 227 | """Get database properties from the metastore. 228 | 229 | Args: 230 | db_name (str): A database name. 231 | 232 | Returns: 233 | dict[str,str]: Key/value for properties. 234 | """ 235 | describe = self._spark.sql(f'DESCRIBE DATABASE EXTENDED {db_name}') 236 | 237 | if 'database_description_item' in describe.columns: 238 | key_col = 'database_description_item' 239 | val_col = 'database_description_value' 240 | else: 241 | key_col = 'info_name' 242 | val_col = 'info_value' 243 | 244 | properties = ( 245 | self._spark.sql('DESCRIBE DATABASE EXTENDED {}'.format(db_name)) 246 | .where(F.col(key_col) == 'Properties') 247 | .select(val_col) 248 | .first() 249 | ) 250 | 251 | parsed_properties = {} 252 | 253 | if properties: 254 | info_value = getattr(properties, val_col) 255 | for name, value in read_db_properties_format(info_value): 256 | parsed_properties[name] = value 257 | 258 | return parsed_properties 259 | 260 | def set_database_property(self, db_name, property_name, value): 261 | """Set value for database property. 262 | 263 | Args: 264 | db_name (str): A database name. 265 | property_name (str): A property name to set value for. 266 | value (Any): Will be automatically casted to string. 
267 | """ 268 | property_name_blacklist = {',', '(', ')'} 269 | property_value_blacklist = {'(', ')'} 270 | 271 | if set(property_name) & property_name_blacklist: 272 | raise ValueError( 273 | 'Property name must not contain symbols: {}'.format(property_name_blacklist)) 274 | 275 | if set(str(value)) & property_value_blacklist: 276 | raise ValueError( 277 | 'Property value must not contain symbols: {}'.format(property_value_blacklist)) 278 | 279 | self._spark.sql("ALTER DATABASE {} SET DBPROPERTIES ('{}'='{}')".format( 280 | db_name, property_name, value, 281 | )) 282 | 283 | 284 | def get_db_name(table_name): 285 | """Get database name from full table name.""" 286 | parts = table_name.split('.', 1) 287 | if len(parts) == 1: 288 | return None 289 | else: 290 | return parts[0] 291 | 292 | 293 | def get_table_name(table_name): 294 | """Get table name from full table name.""" 295 | parts = table_name.split('.', 1) 296 | return parts[-1] 297 | 298 | 299 | def read_db_properties_format(raw_db_properties): 300 | """Helper to read non-standard db properties format. 301 | 302 | Note: 303 | Spark/Hive doesn't provide a way to read separate key/values for database properties. 304 | They provide a custom format like: ((key_a,value_a), (key_b,value_b)) 305 | Neither keys nor values are escaped. 306 | Here we try our best to parse this format by tracking balanced parentheses. 307 | We assume property names don't contain comma. 308 | 309 | Return: 310 | list[list[str]] - the list of key-value pairs. 311 | """ 312 | def _unpack_parentheses(string): 313 | bits = [] 314 | last_bit = '' 315 | checksum = 0 316 | 317 | for c in string: 318 | if c == '(': 319 | if checksum == 0: 320 | last_bit = '' 321 | else: 322 | last_bit += c 323 | checksum += 1 324 | elif c == ')': 325 | checksum -= 1 326 | if checksum == 0: 327 | bits.append(last_bit) 328 | else: 329 | last_bit += c 330 | else: 331 | last_bit += c 332 | 333 | if checksum < 0: 334 | raise ValueError('Parentheses are not balanced') 335 | 336 | if checksum != 0: 337 | raise ValueError('Parentheses are not balanced') 338 | 339 | return bits 340 | 341 | properties = _unpack_parentheses(raw_db_properties) 342 | if properties: 343 | return [x.split(',', 1) for x in _unpack_parentheses(properties[0])] 344 | else: 345 | return [] 346 | -------------------------------------------------------------------------------- /sparkly/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | class SparklyException(Exception): 19 | """Base exception of sparkly lib.""" 20 | 21 | 22 | class UnsupportedDataType(SparklyException): 23 | """Happen when schema defines unsupported data type.""" 24 | pass 25 | 26 | 27 | class FixtureError(SparklyException): 28 | """Happen when testing data setup or teardown fails.""" 29 | pass 30 | 31 | 32 | class InvalidArgumentError(SparklyException): 33 | """Happen when invalid parameters are passed to a function.""" 34 | 35 | 36 | class WriteError(SparklyException): 37 | """Happen when errors occured while writting dataframe into storage.""" 38 | -------------------------------------------------------------------------------- /sparkly/functions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from collections import defaultdict 18 | from functools import reduce 19 | import operator 20 | 21 | from pyspark.sql import Column 22 | from pyspark.sql import functions as F 23 | 24 | 25 | def multijoin(dfs, on=None, how=None, coalesce=None): 26 | """Join multiple dataframes. 27 | 28 | Args: 29 | dfs (list[pyspark.sql.DataFrame]). 30 | on: same as ``pyspark.sql.DataFrame.join``. 31 | how: same as ``pyspark.sql.DataFrame.join``. 32 | coalesce (list[str]): column names to disambiguate by coalescing 33 | across the input dataframes. A column must be of the same type 34 | across all dataframes that define it; if different types appear 35 | coalesce will do a best-effort attempt in merging them. The 36 | selected value is the first non-null one in order of appearance 37 | of the dataframes in the input list. Default is None - don't 38 | coalesce any ambiguous columns. 39 | 40 | Returns: 41 | pyspark.sql.DataFrame or None if provided dataframe list is empty. 42 | 43 | Example: 44 | Assume we have two DataFrames, the first is 45 | ``first = [{'id': 1, 'value': None}, {'id': 2, 'value': 2}]`` 46 | and the second is 47 | ``second = [{'id': 1, 'value': 1}, {'id': 2, 'value': 22}]`` 48 | 49 | Then collecting the ``DataFrame`` produced by 50 | 51 | ``multijoin([first, second], on='id', how='inner', coalesce=['value'])`` 52 | 53 | yields ``[{'id': 1, 'value': 1}, {'id': 2, 'value': 2}]``. 
54 | """ 55 | if not dfs: 56 | return None 57 | 58 | # Go over the input dataframes and rename each to-be-resolved 59 | # column to ensure name uniqueness 60 | coalesce = set(coalesce or []) 61 | renamed_columns = defaultdict(list) 62 | for idx, df in enumerate(dfs): 63 | for col in df.columns: 64 | if col in coalesce: 65 | disambiguation = '__{}_{}'.format(idx, col) 66 | df = df.withColumnRenamed(col, disambiguation) 67 | renamed_columns[col].append(disambiguation) 68 | dfs[idx] = df 69 | 70 | # Join the dataframes 71 | joined_df = reduce(lambda x, y: x.join(y, on=on, how=how), dfs) 72 | 73 | # And coalesce the would-have-been-ambiguities 74 | for col, disambiguations in renamed_columns.items(): 75 | joined_df = joined_df.withColumn(col, F.coalesce(*disambiguations)) 76 | for disambiguation in disambiguations: 77 | joined_df = joined_df.drop(disambiguation) 78 | 79 | return joined_df 80 | 81 | 82 | def switch_case(switch, case=None, default=None, operand=operator.eq, **additional_cases): 83 | """Switch/case style column generation. 84 | 85 | Args: 86 | switch (str, pyspark.sql.Column): column to "switch" on; 87 | its values are going to be compared against defined cases. 88 | case (dict): case statements. When a key matches the value of 89 | the column in a specific row, the respective value will be 90 | assigned to the new column for that row. This is useful when 91 | your case condition constants are not strings. 92 | default: default value to be used when the value of the switch 93 | column doesn't match any keys. 94 | operand: function to compare the value of the switch column to the 95 | value of each case. Default is Column's eq. If user-provided, 96 | first argument will always be the switch Column; it's the 97 | user's responsibility to transform the case value to a column 98 | if they need to. 99 | additional_cases: additional "case" statements, kwargs style. 100 | Same semantics with cases above. If both are provided, 101 | cases takes precedence. 102 | 103 | Returns: 104 | pyspark.sql.Column 105 | 106 | Example: 107 | ``switch_case('state', CA='California', NY='New York', default='Other')`` 108 | 109 | is equivalent to 110 | 111 | >>> F.when( 112 | ... F.col('state') == 'CA', 'California' 113 | ).when( 114 | ... F.col('state') == 'NY', 'New York' 115 | ).otherwise('Other') 116 | 117 | If you need to "bucketize" a value 118 | 119 | ``switch_case('age', {(13, 17): 1, (18, 24): 2, ...}, operand=lambda c, v: c.between(*v))`` 120 | 121 | is equivalent to 122 | 123 | >>> F.when( 124 | ... F.col('age').between(13, 17), F.lit(1) 125 | ).when( 126 | ... 
F.col('age').between(18, 24), F.lit(2) 127 | ) 128 | """ 129 | if not isinstance(switch, Column): 130 | switch = F.col(switch) 131 | 132 | def _column_or_lit(x): 133 | return F.lit(x) if not isinstance(x, Column) else x 134 | 135 | def _execute_case(accumulator, case): 136 | # transform the case to a pyspark.sql.functions.when statement, 137 | # then chain it to existing when statements 138 | condition_constant, assigned_value = case 139 | when_args = (operand(switch, condition_constant), _column_or_lit(assigned_value)) 140 | return accumulator.when(*when_args) 141 | 142 | 143 | cases = case or {} 144 | for conflict in set(cases.keys()) & set(additional_cases.keys()): 145 | del additional_cases[conflict] 146 | cases = list(cases.items()) + list(additional_cases.items()) 147 | 148 | default = _column_or_lit(default) 149 | 150 | if not cases: 151 | return default 152 | 153 | result = reduce(_execute_case, cases, F).otherwise(default) 154 | 155 | return result 156 | 157 | 158 | def argmax(field, by, condition=None): 159 | """Select a value from the row that maximizes other column(s) 160 | 161 | Args: 162 | field (string, pyspark.sql.Column): the field to return that maximizes the "by" columns 163 | by (*string, *pyspark.sql.Column): field or list of fields to maximize. In reality, this 164 | will usually be only one field. But you may use multiple for tiebreakers 165 | condition (optional): Only consider the entities that pass this condition 166 | 167 | Returns: 168 | pyspark.sql.Column 169 | 170 | Example: 171 | df = ( 172 | df 173 | .groupBy('id') 174 | .agg(argmax('field1', 'by_field')) 175 | ) 176 | 177 | argmax('field1', ['by_field1', 'by_field2'], condition=F.col('col') == 1) 178 | argmax(F.col('field1'), [F.col('by_field1'), F.col('by_field2')], condition=F.lit(True)) 179 | """ 180 | if not isinstance(by, list): 181 | by = [by] 182 | 183 | if isinstance(field, str): 184 | field = F.col(field) 185 | 186 | by.append(field.alias('__tmp_argmax__')) 187 | result = F.struct(*by) 188 | if condition is not None: 189 | result = F.when(condition, result) 190 | result = F.max(result).getField('__tmp_argmax__') 191 | 192 | return result 193 | -------------------------------------------------------------------------------- /sparkly/instant_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | import logging 19 | import os 20 | import signal 21 | import tempfile 22 | 23 | from py4j.java_gateway import java_import 24 | from pyspark import SparkContext 25 | from pyspark.java_gateway import launch_gateway 26 | 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | class InstantTesting(object): 32 | """The set of tools to run tests using Spark Context running in the background. 33 | 34 | Implementation: 35 | We create a lock file that will contain Python gateway port (exposed by JVM). 
36 | 37 | On the first run: 38 | - initialise Spark Context as usual; 39 | - write Python gateway port to the lock file; 40 | - fork current process. 41 | 42 | On the second run: 43 | - connect to the background JVM process using Python gateway port from the lock file; 44 | - recover Spark Context from JVM. 45 | """ 46 | LOCK_FILE_PATH = os.path.join(tempfile.gettempdir(), 'sparkly_instant_testing_lock') 47 | 48 | @classmethod 49 | def activate(cls): 50 | """Activate instant testing mode.""" 51 | if os.path.exists(cls.LOCK_FILE_PATH): 52 | logger.error('Instant testing mode is already activate, deactivate it first.') 53 | else: 54 | with open(cls.LOCK_FILE_PATH, 'w'): 55 | logger.info('Instant testing mode has been activated.') 56 | 57 | @classmethod 58 | def deactivate(cls): 59 | """Deactivate instance testing mode.""" 60 | if not os.path.exists(cls.LOCK_FILE_PATH): 61 | logger.error('Instant testing mode is not activated, activate it first.') 62 | else: 63 | try: 64 | with open(cls.LOCK_FILE_PATH) as lock: 65 | state = lock.read() 66 | if state: 67 | session_pid = json.loads(state)['session_pid'] 68 | try: 69 | os.kill(session_pid, signal.SIGTERM) 70 | except OSError: 71 | logger.exception( 72 | 'Can not kill background SparkContext (pid %d)', session_pid, 73 | ) 74 | else: 75 | logger.info( 76 | 'Killed background SparkContext (pid %d)', session_pid, 77 | ) 78 | finally: 79 | try: 80 | os.remove(cls.LOCK_FILE_PATH) 81 | except OSError: 82 | logger.exception('Can not remove lock file: %s', cls.LOCK_FILE_PATH) 83 | 84 | logger.info('Instant testing mode has been deactivated.') 85 | 86 | @classmethod 87 | def is_activated(cls): 88 | """Check if instant testing has been activated before. 89 | 90 | Returns: 91 | bool 92 | """ 93 | return os.path.exists(cls.LOCK_FILE_PATH) 94 | 95 | @classmethod 96 | def set_context(cls, spark_context): 97 | """Set the given spark context for instant testing. 98 | 99 | Args: 100 | spark_context (pyspark.SparkContext) 101 | """ 102 | assert cls.is_activated() 103 | 104 | gateway_port = spark_context._gateway.java_gateway_server.getListeningPort() 105 | 106 | # pid of the python process that holds JVM with running Spark Context. 107 | session_pid = os.getpid() 108 | 109 | with open(cls.LOCK_FILE_PATH, 'w') as lock: 110 | json.dump( 111 | { 112 | 'gateway_port': gateway_port, 113 | 'session_pid': session_pid, 114 | 'gateway_secret': getattr( 115 | spark_context._gateway.gateway_parameters, 'auth_token', None, 116 | ), 117 | }, 118 | lock, 119 | ) 120 | logger.info( 121 | 'Successfully set spark context for the instant testing [pid=%s, gateway=%s]', 122 | session_pid, gateway_port 123 | ) 124 | 125 | @classmethod 126 | def get_context(cls): 127 | """Get the current global spark context. 128 | 129 | Returns: 130 | pyspark.SparkContext or None (if wasn't set before). 131 | """ 132 | assert cls.is_activated() 133 | 134 | state = None 135 | 136 | with open(cls.LOCK_FILE_PATH) as lock: 137 | serialised_state = lock.read() 138 | if serialised_state: 139 | try: 140 | state = json.loads(serialised_state) 141 | except ValueError: 142 | logger.error( 143 | 'Unable to deserialize lock file. Try to reactivate instant testing. 
' 144 | 'The broken content is: %s', 145 | serialised_state, 146 | ) 147 | 148 | if state: 149 | logger.info( 150 | 'Recovering context for the instant testing [pid=%s, gateway=%s]', 151 | state['session_pid'], state['gateway_port'], 152 | ) 153 | 154 | os.environ['PYSPARK_GATEWAY_PORT'] = str(state['gateway_port']) 155 | os.environ['PYSPARK_GATEWAY_SECRET'] = str(state['gateway_secret']) 156 | gateway = launch_gateway() 157 | java_import(gateway.jvm, 'org.apache.spark.SparkContext') 158 | jvm_spark_context = gateway.jvm.SparkContext.getOrCreate() 159 | jvm_java_spark_context = gateway.jvm.JavaSparkContext(jvm_spark_context) 160 | 161 | SparkContext._gateway = gateway 162 | SparkContext._jvm = gateway.jvm 163 | 164 | return SparkContext( 165 | appName=jvm_spark_context.appName(), 166 | master=jvm_spark_context.master(), 167 | gateway=gateway, 168 | jsc=jvm_java_spark_context, 169 | ) 170 | -------------------------------------------------------------------------------- /sparkly/session.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import atexit 18 | from copy import deepcopy 19 | import os 20 | import signal 21 | import sys 22 | import time 23 | import uuid 24 | 25 | from pyspark import SparkContext 26 | from pyspark.conf import SparkConf 27 | from pyspark.sql import SparkSession 28 | 29 | from sparkly.catalog import SparklyCatalog 30 | from sparkly.instant_testing import InstantTesting 31 | from sparkly.reader import SparklyReader 32 | from sparkly.writer import attach_writer_to_dataframe 33 | 34 | 35 | class SparklySession(SparkSession): 36 | """Wrapper around SparkSession to simplify definition of options, packages, JARs and UDFs. 37 | 38 | Example:: 39 | 40 | from pyspark.sql.types import IntegerType 41 | import sparkly 42 | 43 | 44 | class MySession(sparkly.SparklySession): 45 | options = {'spark.sql.shuffle.partitions': '2000'} 46 | repositories = ['http://packages.confluent.io/maven/'] 47 | packages = ['com.databricks:spark-csv_2.10:1.4.0'] 48 | jars = ['../path/to/brickhouse-0.7.1.jar'] 49 | udfs = { 50 | 'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF', 51 | 'my_python_udf': (lambda x: len(x), IntegerType()), 52 | } 53 | 54 | 55 | spark = MySession() 56 | spark.read_ext.cassandra(...) 57 | 58 | # Alternatively 59 | spark = MySession.get_or_create() 60 | spark.read_ext.cassandra(...) 61 | 62 | Attributes: 63 | options (dict[str,str]): Configuration options that are passed to spark-submit. 64 | See `the list of possible options 65 | `_. 66 | Note that any options set already through PYSPARK_SUBMIT_ARGS will override 67 | these. 68 | repositories (list[str]): List of additional maven repositories for package lookup. 69 | packages (list[str]): Spark packages that should be installed. 
70 | See https://spark-packages.org/ 71 | jars (list[str]): Full paths to jar files that we want to include to the session. 72 | E.g. a JDBC connector or a library with UDF functions. 73 | udfs (dict[str,str|typing.Callable]): Register UDF functions within the session. 74 | Key - a name of the function, 75 | Value - either a class name imported from a JAR file 76 | or a tuple with python function and its return type. 77 | name (str): a name that is used in default app_id_template (see below) 78 | app_id_template (str|None): if set and nonempty, generate the `spark.app.id` with 79 | this template. Interpolation is available with some pre-defined variables: 80 | * initial_time: the time that the first session started 81 | * initial_uid: a unique id associated with the first session 82 | * session_time: the time the session started 83 | * session_uid: a unique id associated with the session 84 | A default value is provided using the name, initial-uid and session-time. 85 | This helps a specific use case when running in Kubernetes: when a session 86 | is restarted, the same app-id is used, breaking storage of spark-history data 87 | (only the first session will have its history stored, unless overwrite mode 88 | is used, in which case only the last session will have its history stored). 89 | By defaulting to using the initial-uid and session-time information, we get 90 | sane "grouping" of all sessions originating from the same initial session, but also 91 | achieve separate individual app ids so that history for each can be maintained. 92 | To disable this functionality entirely, simply set to None or emptystring. 93 | Finally, if a user manually specifies `spark.app.id`, then that value will 94 | always trump any template provided here. 95 | """ 96 | name = 'sparkly' 97 | options = {} 98 | packages = [] 99 | jars = [] 100 | udfs = {} 101 | repositories = [] 102 | app_id_template = '{name}-{initial_uid}-{session_time}' 103 | 104 | _instantiated_session = None 105 | _original_environment = None 106 | 107 | _initial_time = None 108 | _initial_uid = None 109 | 110 | def __init__(self, additional_options=None): 111 | SparklySession._original_environment = deepcopy(os.environ) 112 | os.environ['PYSPARK_PYTHON'] = sys.executable 113 | 114 | self._initial_time = self._initial_time or int(time.time()) 115 | self._initial_uid = self._initial_uid or uuid.uuid4().hex 116 | self._session_time = int(time.time()) 117 | self._session_uid = uuid.uuid4().hex 118 | 119 | options = { 120 | 'spark.sql.catalogImplementation': 'hive', 121 | } 122 | app_id_template = self.app_id_template 123 | if app_id_template: 124 | options.update({ 125 | 'spark.app.id': app_id_template.format( 126 | name=self.name, 127 | initial_time=self._initial_time, 128 | initial_uid=self._initial_uid, 129 | session_time=self._session_time, 130 | session_uid=self._session_uid, 131 | ), 132 | }) 133 | options.update(self.options or {}) 134 | options.update(additional_options or {}) 135 | options = {str(key): str(value) for key, value in options.items()} 136 | 137 | submit_args = [ 138 | # options that were already defined through PYSPARK_SUBMIT_ARGS 139 | # take precedence over SparklySession's 140 | os.environ.get('PYSPARK_SUBMIT_ARGS', '').replace('pyspark-shell', ''), 141 | self._setup_repositories(), 142 | self._setup_packages(), 143 | self._setup_jars(), 144 | self._setup_options(options), 145 | 'pyspark-shell', 146 | ] 147 | os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(filter(None, submit_args)) 148 | 149 | def get_context(): 150 | 
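# This nested helper builds a brand-new SparkContext from the merged options
# via SparkConf. It is kept as a function rather than called immediately,
# because in instant testing mode (handled just below) a previously created
# context may be recovered instead of creating a new one.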
conf = SparkConf() 151 | conf.setAll(options.items()) 152 | return SparkContext(conf=conf) 153 | 154 | # If we are in instant testing mode 155 | if InstantTesting.is_activated(): 156 | context = InstantTesting.get_context() 157 | 158 | # It's the first run, so we have to create the context and daemonise the process. 159 | if context is None: 160 | context = get_context() 161 | if os.fork() == 0: # Detached process. 162 | signal.pause() 163 | else: 164 | InstantTesting.set_context(context) 165 | else: 166 | context = get_context() 167 | 168 | super(SparklySession, self).__init__(context) 169 | 170 | # Similar to the session builder: propagate the options to the running session state as well. 171 | for key, value in options.items(): 172 | self._jsparkSession.sessionState().conf().setConfString(key, value) 173 | 174 | self._setup_udfs() 175 | 176 | self.read_ext = SparklyReader(self) 177 | self.catalog_ext = SparklyCatalog(self) 178 | 179 | attach_writer_to_dataframe() 180 | SparklySession._instantiated_session = self 181 | 182 | @classmethod 183 | def get_or_create(cls): 184 | """Access the instantiated sparkly session. 185 | 186 | If a sparkly session has already been instantiated, return that 187 | instance; if not, instantiate one and return it. Useful 188 | for lazy access to the session. Not thread-safe. 189 | 190 | Returns: 191 | SparklySession (or subclass). 192 | """ 193 | if SparklySession._instantiated_session is None: 194 | cls() 195 | return SparklySession._instantiated_session 196 | 197 | @classmethod 198 | def stop(cls): 199 | """Stop the instantiated sparkly session.""" 200 | if SparklySession._instantiated_session is not None: 201 | SparkSession.stop(SparklySession._instantiated_session) 202 | SparklySession._instantiated_session = None 203 | os.environ = SparklySession._original_environment 204 | SparklySession._original_environment = None 205 | 206 | @property 207 | def builder(self): 208 | raise NotImplementedError( 209 | 'You do not need a builder for SparklySession. ' 210 | 'Just use a regular python constructor. ' 211 | 'Please follow the documentation for more details.' 
212 | ) 213 | 214 | def _setup_repositories(self): 215 | if self.repositories: 216 | return '--repositories {}'.format(','.join(self.repositories)) 217 | else: 218 | return '' 219 | 220 | def _setup_packages(self): 221 | if self.packages: 222 | return '--packages {}'.format(','.join(self.packages)) 223 | else: 224 | return '' 225 | 226 | def _setup_jars(self): 227 | if self.jars: 228 | return '--jars {}'.format(','.join(self.jars)) 229 | else: 230 | return '' 231 | 232 | def _setup_options(self, options): 233 | # Here we massage conf properties with the intent to pass them to 234 | # spark-submit; this is convenient as it is unified with the approach 235 | # we take for repos, packages and jars, and it also handles precedence 236 | # of conf properties already defined by the user in a very 237 | # straightforward way (since we always append to PYSPARK_SUBMIT_ARGS) 238 | return ' '.join('--conf "{}={}"'.format(*o) for o in sorted(options.items())) 239 | 240 | def _setup_udfs(self): 241 | for name, defn in self.udfs.items(): 242 | if isinstance(defn, str): 243 | self.sql('create temporary function {} as "{}"'.format(name, defn)) 244 | elif isinstance(defn, tuple): 245 | self.udf.register(name, *defn) 246 | else: 247 | raise NotImplementedError('Incorrect UDF definition: {}: {}'.format(name, defn)) 248 | 249 | 250 | # https://issues.apache.org/jira/browse/SPARK-27927 251 | # Spark on Kubernetes has an issue where the python process finishes, 252 | # but the controlling java process just hangs, so nothing terminates. 253 | # There is a simple workaround to stop the session prior to python termination. 254 | # We do that here with an atexit registration. 255 | atexit.register(SparklySession.stop) 256 | -------------------------------------------------------------------------------- /sparkly/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import functools 18 | import inspect 19 | from itertools import islice 20 | import os 21 | import re 22 | 23 | try: 24 | from kafka import KafkaAdminClient, KafkaConsumer, TopicPartition 25 | import kafka.admin 26 | except ImportError: 27 | pass 28 | 29 | import pylru 30 | from pyspark import StorageLevel 31 | from pyspark.sql import DataFrame 32 | from pyspark.sql import types as T 33 | 34 | from sparkly.exceptions import UnsupportedDataType 35 | 36 | 37 | def absolute_path(file_path, *rel_path): 38 | """Return absolute path to file. 
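The ``rel_path`` parts are joined onto the directory that contains ``file_path``,
    which makes the helper convenient for locating test resources next to a module
    (typically called as ``absolute_path(__file__, ...)``).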
39 | 40 | Usage: 41 | >>> absolute_path('/my/current/dir/x.txt', '..', 'x.txt') 42 | '/my/current/x.txt' 43 | 44 | >>> absolute_path('/my/current/dir/x.txt', 'relative', 'path') 45 | '/my/current/dir/relative/path' 46 | 47 | >>> import os 48 | >>> absolute_path('x.txt', 'relative/path') == os.getcwd() + '/relative/path' 49 | True 50 | 51 | Args: 52 | file_path (str): file 53 | rel_path (list[str]): path parts 54 | 55 | Returns: 56 | str 57 | """ 58 | return os.path.abspath( 59 | os.path.join( 60 | os.path.dirname( 61 | os.path.realpath(file_path) 62 | ), 63 | *rel_path 64 | ) 65 | ) 66 | 67 | 68 | def kafka_get_topics_offsets(host, topic, port=9092): 69 | """Return available partitions and their offsets for the given topic. 70 | 71 | Args: 72 | host (str): Kafka host. 73 | topic (str): Kafka topic. 74 | port (int): Kafka port. 75 | 76 | Returns: 77 | [(int, int, int)]: [(partition, start_offset, end_offset)]. 78 | """ 79 | brokers = ['{}:{}'.format(host, port)] 80 | consumer = KafkaConsumer(bootstrap_servers=brokers) 81 | partitions = consumer.partitions_for_topic(topic) 82 | offsets = [] 83 | if partitions: 84 | topic_partitions = [TopicPartition(topic, p) for p in partitions] 85 | start_offsets_raw = consumer.beginning_offsets(topic_partitions) 86 | end_offsets_raw = consumer.end_offsets(topic_partitions) 87 | start_offsets = {tp.partition: offset for tp, offset in start_offsets_raw.items()} 88 | end_offsets = {tp.partition: offset for tp, offset in end_offsets_raw.items()} 89 | offsets = [ 90 | (partition, start_offsets[partition], end_offsets[partition]) 91 | for partition in start_offsets 92 | ] 93 | 94 | return offsets 95 | 96 | def kafka_create_topic(host, topic, port=9092, num_partitions=2, replication_factor=1): 97 | """Creates Kafka topic. 98 | 99 | Args: 100 | host (str): Kafka host. 101 | topic (str): Kafka topic. 102 | port (int): Kafka port. 103 | num_partitions (int): Number of topic's partitions. 104 | replication_factor (int): Number of partition's replicas. 105 | """ 106 | kafka_admin = KafkaAdminClient(bootstrap_servers=f'{host}:{port}') 107 | kafka_admin.create_topics([ 108 | kafka.admin.NewTopic( 109 | name=topic, 110 | num_partitions=num_partitions, 111 | replication_factor=replication_factor, 112 | ), 113 | ]) 114 | 115 | 116 | class lru_cache(object): 117 | """LRU cache that supports DataFrames. 118 | 119 | Enables caching of both the dataframe object and the data that df 120 | contains by persisting it according to user specs. It's the user's 121 | responsibility to make sure that the dataframe contents are not 122 | evicted from memory and/or disk should this feature get overused. 123 | 124 | Args: 125 | maxsize (int|128): maximum number of items to cache. 126 | storage_level (pyspark.StorageLevel|MEMORY_ONLY): how to cache 127 | the contents of a dataframe (only used when the cached 128 | function results in a dataframe). 
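Example (an illustrative sketch; assumes an existing ``SparklySession``
        named ``spark`` and a parquet dataset at the path shown)::

            from pyspark import StorageLevel
            from sparkly.utils import lru_cache

            @lru_cache(maxsize=16, storage_level=StorageLevel.MEMORY_AND_DISK)
            def daily_events(day):
                # Heavy read: the resulting dataframe is persisted and memoised.
                return spark.read.parquet('/tmp/events/{}'.format(day))

            daily_events('2017-01-01')  # reads, persists and caches the dataframe
            daily_events('2017-01-01')  # returned straight from the LRU cache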
129 | """ 130 | def __init__(self, maxsize=128, storage_level=StorageLevel.MEMORY_ONLY): 131 | self.maxsize = maxsize 132 | self.storage_level = storage_level 133 | 134 | def __call__(self, func): 135 | # Whenever an object is evicted from the cache we want to 136 | # unpersist its contents too if it's a dataframe 137 | def eviction_callback(key, value): 138 | if isinstance(value, DataFrame): 139 | value.unpersist() 140 | 141 | lru_decorator = pylru.lrudecorator(self.maxsize) 142 | lru_decorator.cache.callback = eviction_callback 143 | 144 | @lru_decorator 145 | @functools.wraps(func) 146 | def func_and_persist(*args, **kwargs): 147 | result = func(*args, **kwargs) 148 | if isinstance(result, DataFrame): 149 | result.persist(self.storage_level) 150 | return result 151 | 152 | return func_and_persist 153 | 154 | 155 | def parse_schema(schema): 156 | """Generate schema by its string definition. 157 | 158 | It's basically the opposite of the `DataType.simpleString` method. 159 | Supports all atomic types (like string, int, float...) and complex types (array, map, struct) 160 | except DecimalType. 161 | 162 | Usages: 163 | >>> parse_schema('string') 164 | StringType 165 | >>> parse_schema('int') 166 | IntegerType 167 | >>> parse_schema('array<int>') 168 | ArrayType(IntegerType,true) 169 | >>> parse_schema('map<string,int>') 170 | MapType(StringType,IntegerType,true) 171 | >>> parse_schema('struct<a:int,b:string>') 172 | StructType(List(StructField(a,IntegerType,true),StructField(b,StringType,true))) 173 | >>> parse_schema('unsupported') 174 | Traceback (most recent call last): 175 | ... 176 | sparkly.exceptions.UnsupportedDataType: Cannot parse schema: unsupported: ... 177 | """ 178 | try: 179 | return T._parse_datatype_string(schema) 180 | except Exception as e: 181 | raise UnsupportedDataType(f'Cannot parse schema: {schema}: {e}') 182 | 183 | def schema_has(t, required_fields): 184 | """Check whether a complex dataType has specific fields. 185 | 186 | Args: 187 | t (pyspark.sql.types.ArrayType, MapType, StructType): type to 188 | check. 189 | required_fields (same type as t, or dict[str, pyspark.sql.DataType]): 190 | fields that need to be present in t. For convenience, a user 191 | can define a ``dict`` in place of a 192 | ``pyspark.sql.types.StructType``, but other than that this 193 | argument must have the same type as t. 194 | 195 | Raises: 196 | AssertionError: if t and required_fields cannot be compared 197 | because they aren't instances of the same complex dataType. 198 | KeyError: if a required field is not found in the struct. 199 | TypeError: if a required field exists but its actual type does 200 | not match the required one. 
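Example (a minimal sketch using the module's ``T`` alias for
        ``pyspark.sql.types``)::

            >>> schema_has(
            ...     T.StructType([
            ...         T.StructField('uid', T.StringType()),
            ...         T.StructField('countries', T.MapType(T.StringType(), T.IntegerType())),
            ...     ]),
            ...     {'uid': T.StringType()},
            ... )
            True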
201 | """ 202 | if isinstance(required_fields, dict): 203 | required_fields = T.StructType([ 204 | T.StructField(*field_def) for field_def in required_fields.items() 205 | ]) 206 | 207 | assert type(t) == type(required_fields), 'Cannot compare heterogeneous types' 208 | 209 | def _unpack(t): 210 | if isinstance(t, T.ArrayType): 211 | return {'element': t.elementType} 212 | elif isinstance(t, T.MapType): 213 | return {'key': t.keyType, 'value': t.valueType} 214 | elif isinstance(t, T.StructType): 215 | return {field.name: field.dataType for field in t.fields} 216 | return {} 217 | 218 | def _is_complex(t): 219 | return isinstance(t, (T.ArrayType, T.MapType, T.StructType)) 220 | 221 | existing_fields = _unpack(t) 222 | required_fields = _unpack(required_fields) 223 | 224 | for required_field, required_type in required_fields.items(): 225 | try: 226 | current_type = existing_fields[required_field] 227 | except KeyError: 228 | raise KeyError(required_field) 229 | 230 | if _is_complex(current_type): 231 | try: 232 | schema_has(current_type, required_type) 233 | except (KeyError, TypeError) as e: 234 | raise type(e)('{}.{}'.format(required_field, e.args[0])) 235 | except AssertionError: 236 | pass 237 | else: 238 | continue 239 | 240 | if required_type != current_type: 241 | raise TypeError( 242 | '{} is {}, expected {}'.format(required_field, current_type, required_type) 243 | ) 244 | 245 | return True 246 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /tests/integration/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | from pyspark.sql.types import StringType 20 | 21 | import pyspark 22 | from sparkly import SparklySession 23 | from sparkly.utils import absolute_path 24 | 25 | 26 | class SparklyTestSession(SparklySession): 27 | packages = [ 28 | 'com.datastax.spark:spark-cassandra-connector_2.12:3.2.0', 29 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8', 30 | 'org.apache.spark:spark-sql-kafka-0-10_2.12:{}'.format(pyspark.__version__), 31 | 'mysql:mysql-connector-java:8.0.31', 32 | 'io.confluent:kafka-avro-serializer:3.0.1', 33 | ] 34 | 35 | repositories = [ 36 | 'http://packages.confluent.io/maven/', 37 | ] 38 | 39 | jars = [ 40 | absolute_path(__file__, 'resources', 'brickhouse-0.7.1.jar'), 41 | ] 42 | 43 | udfs = { 44 | 'collect': 'brickhouse.udf.collect.CollectUDAF', 45 | 'length_of_text': (lambda text: len(text), StringType()) 46 | } 47 | 48 | options = { 49 | 'my.custom.option.1': '117', 50 | 'my.custom.option.2': 223, 51 | # will be overwritten by additional_options passed in setup_session 52 | 'my.custom.option.3': '319', 53 | } 54 | 55 | 56 | class SparklyTestSessionWithOldCatalog(SparklyTestSession): 57 | options = { 58 | 'spark.sql.legacy.keepCommandOutputSchema': 'true', 59 | } 60 | -------------------------------------------------------------------------------- /tests/integration/fake_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tubular/sparkly/393d1342ade404461a41d42e730e8571f92fcd28/tests/integration/fake_modules/__init__.py -------------------------------------------------------------------------------- /tests/integration/fake_modules/testing.py: -------------------------------------------------------------------------------- 1 | def is_fake(): 2 | return True 3 | -------------------------------------------------------------------------------- /tests/integration/resources/brickhouse-0.7.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tubular/sparkly/393d1342ade404461a41d42e730e8571f92fcd28/tests/integration/resources/brickhouse-0.7.1.jar -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/cassandra_setup.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS sparkly_test; 2 | 3 | CREATE KEYSPACE sparkly_test WITH replication = 4 | {'class': 'SimpleStrategy', 'replication_factor': '1'}; 5 | 6 | CREATE TABLE sparkly_test.test ( 7 | uid text, 8 | created text, 9 | countries map, 10 | PRIMARY KEY (uid, created) 11 | ); 12 | 13 | INSERT INTO sparkly_test.test (uid, created, countries) VALUES ('1', '1234567899', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 14 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/cassandra_teardown.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE 
sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/data.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test_fixture", "_type" : "test", "_id": "1" } } 2 | { "name" : "John", "age": 56} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/data_for_es7.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test_fixture", "_id": "1" } } 2 | { "name" : "John", "age": 56} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/kafka.json: -------------------------------------------------------------------------------- 1 | {"key": {"name": "johny"}, "value": {"name": "johny", "surname": "cage"}} 2 | {"key": {"name": "johny"}, "value": {"name": "johny", "surname": "smith"}} 3 | {"key": {"name": "aron"}, "value": {"name": "aron", "surname": "ramsey"}} 4 | {"key": {"name": "killy"}, "value": {"name": "killy", "surname": "gonsales"}} 5 | {"key": {"name": "shefkey"}, "value": {"name": "shefkey", "surname": "kuki"}} 6 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "name": { 4 | "type": "text" 5 | }, 6 | "age": { 7 | "type": "integer" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/mysql_setup.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE if not exists sparkly_test; 2 | 3 | CREATE TABLE sparkly_test.test ( 4 | id int, 5 | name varchar(30), 6 | surname varchar(40), 7 | age int, 8 | primary key (id) 9 | ); 10 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (1, 'john', 'sk', 111); 11 | -------------------------------------------------------------------------------- /tests/integration/resources/test_fixtures/mysql_teardown.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_read/cassandra_setup.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS sparkly_test; 2 | 3 | CREATE KEYSPACE sparkly_test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}; 4 | 5 | CREATE TABLE sparkly_test.test ( 6 | uid text, 7 | created text, 8 | countries map, 9 | PRIMARY KEY (uid, created) 10 | ); 11 | 12 | INSERT INTO sparkly_test.test (uid, created, countries) 13 | VALUES ('1', '1234567894', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 14 | 15 | INSERT INTO sparkly_test.test (uid, created, countries) 16 | VALUES ('2', '1234567893', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 17 | 18 | INSERT INTO sparkly_test.test (uid, created, countries) 19 | VALUES ('3', '1234567891', {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}); 20 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/cassandra_teardown.cql: 
-------------------------------------------------------------------------------- 1 | DROP KEYSPACE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_read/elastic7_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_id": "1" } } 2 | { "name" : "John2", "topics": [1, 2, 3, 4, 5], "age": 56, "demo": { "age_30": 20, "age_10": 50 } } 3 | { "index" : { "_index" : "sparkly_test", "_id": "2" } } 4 | { "name" : "Smith3", "topics": [1, 4, 5], "age": 31, "demo": { "age_30": 110, "age_10": 50 } } 5 | { "index" : { "_index" : "sparkly_test", "_id": "3" } } 6 | { "name" : "Smith4", "topics": [4, 5], "age": 12, "demo": { "age_30": 20, "age_10": 1 } } 7 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/elastic_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_type" : "test", "_id": "1" } } 2 | { "name" : "John2", "topics": [1, 2, 3, 4, 5], "age": 56, "demo": { "age_30": 20, "age_10": 50 } } 3 | { "index" : { "_index" : "sparkly_test", "_type" : "test", "_id": "2" } } 4 | { "name" : "Smith3", "topics": [1, 4, 5], "age": 31, "demo": { "age_30": 110, "age_10": 50 } } 5 | { "index" : { "_index" : "sparkly_test", "_type" : "test", "_id": "3" } } 6 | { "name" : "Smith4", "topics": [4, 5], "age": 12, "demo": { "age_30": 20, "age_10": 1 } } 7 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/kafka_setup.json: -------------------------------------------------------------------------------- 1 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 1}} 2 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 2}} 3 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 3}} 4 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 4}} 5 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 5}} 6 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 6}} 7 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 7}} 8 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 8}} 9 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 9}} 10 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 10}} 11 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 11}} 12 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 12}} 13 | -------------------------------------------------------------------------------- /tests/integration/resources/test_read/mysql_setup.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE if not exists sparkly_test; 2 | 3 | CREATE TABLE sparkly_test.test ( 4 | id int, 5 | name varchar(30), 6 | surname varchar(40), 7 | age int, 8 | primary key (id) 9 | ); 10 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (1, 'john', 'sk', 111); 11 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (2, 'john', 'po', 222); 12 | INSERT INTO sparkly_test.test (id, name, surname, age) VALUES (3, 'john', 'ku', 333); 13 | 
-------------------------------------------------------------------------------- /tests/integration/resources/test_read/mysql_teardown.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_testing/kafka_watcher_1.json: -------------------------------------------------------------------------------- 1 | {"key": {"user_id": 1}, "value": {"meal": "dinner", "food": ["spaghetti", "meatballs"]}} 2 | {"key": {"user_id": 2}, "value": {"meal": "lunch", "food": ["soylent"]}} 3 | {"key": {"user_id": 3}, "value": {"meal": "breakfast", "food": []}} 4 | {"key": {"user_id": 2}, "value": {"meal": "second dinner", "food": ["galbi", "ice cream"]}} 5 | -------------------------------------------------------------------------------- /tests/integration/resources/test_testing/kafka_watcher_2.json: -------------------------------------------------------------------------------- 1 | {"key": {"user_id": 1}, "value": {"meal": "lunch", "food": ["pizza", "stinky tofu"]}} 2 | {"key": {"user_id": 4}, "value": {"meal": "lunch", "food": ["cuban sandwich", "mashed potatoes"]}} 3 | {"key": {"user_id": 5}, "value": {"meal": "dessert", "food": ["pecan pie", "mango"]}} 4 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/cassandra_setup.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE IF EXISTS sparkly_test; 2 | 3 | CREATE KEYSPACE sparkly_test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}; 4 | 5 | CREATE TABLE sparkly_test.test_writer ( 6 | uid text, 7 | title text, 8 | views bigint, 9 | PRIMARY KEY (uid) 10 | ); 11 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/cassandra_teardown.cql: -------------------------------------------------------------------------------- 1 | DROP KEYSPACE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/resources/test_write/elastic7_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_id": "1111" } } 2 | { "uid": "1111", "title": "xxxx", "views": 1111} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/elastic_setup.json: -------------------------------------------------------------------------------- 1 | { "index" : { "_index" : "sparkly_test", "_type" : "test_writer", "_id": "1111" } } 2 | { "uid": "1111", "title": "xxxx", "views": 1111} 3 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/kafka_setup.json: -------------------------------------------------------------------------------- 1 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 1}} 2 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 2}} 3 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 3}} 4 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 4}} 5 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 5}} 6 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 
6}} 7 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 7}} 8 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 8}} 9 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 9}} 10 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "smith", "age": 10}} 11 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 11}} 12 | {"key": {"name": "john"}, "value": {"name": "john", "surname": "mnemonic", "age": 12}} 13 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/mysql_setup.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE if not exists sparkly_test; 2 | CREATE TABLE sparkly_test.test_writer ( 3 | uid varchar(30), 4 | title varchar(40), 5 | views int, 6 | primary key (uid) 7 | ); 8 | INSERT INTO sparkly_test.test_writer (uid, title, views) VALUES ('1111', '1111', 999); 9 | -------------------------------------------------------------------------------- /tests/integration/resources/test_write/mysql_teardown.sql: -------------------------------------------------------------------------------- 1 | DROP DATABASE sparkly_test; -------------------------------------------------------------------------------- /tests/integration/test_catalog.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from sparkly.testing import SparklyGlobalSessionTest 18 | from tests.integration.base import SparklyTestSession, SparklyTestSessionWithOldCatalog 19 | from sparkly.catalog import read_db_properties_format 20 | 21 | 22 | class TestSparklyCatalog(SparklyGlobalSessionTest): 23 | session = SparklyTestSession 24 | 25 | def setUp(self): 26 | self.spark.catalog_ext.drop_table('test_table') 27 | 28 | if self.spark.catalog_ext.has_database('test_db'): 29 | for table in self.spark.catalog.listTables('test_db'): 30 | self.spark.catalog_ext.drop_table('test_db.{}'.format(table.name)) 31 | self.spark.sql('DROP DATABASE test_db') 32 | 33 | df = self.spark.createDataFrame([('row_1', 1), ('row_2', 2)], schema=('a', 'b')) 34 | df.write.saveAsTable('test_table', format='parquet', location='/tmp/test_table') 35 | 36 | self.spark.catalog_ext.set_table_property('test_table', 'property_a', 'str_value') 37 | self.spark.catalog_ext.set_table_property('test_table', 'property_b', 2) 38 | 39 | self.spark.sql('CREATE DATABASE test_db') 40 | df.write.saveAsTable('test_db.test_table', format='parquet', location='/tmp/test_table') 41 | self.spark.catalog_ext.set_table_property('test_db.test_table', 42 | 'property_a', 43 | 'str_value') 44 | self.spark.catalog_ext.set_table_property('test_db.test_table', 45 | 'property_b', 46 | 2) 47 | 48 | def test_has_database(self): 49 | self.assertTrue(self.spark.catalog_ext.has_database('test_db')) 50 | self.assertFalse(self.spark.catalog_ext.has_database('not_exists')) 51 | 52 | def test_create_table_when_exists(self): 53 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 54 | 55 | new_df = self.spark.createDataFrame([('row_5', 'hi')], schema=('c', 'd')) 56 | new_df.write.save('/tmp/test_table_2', format='parquet', mode='overwrite') 57 | 58 | self.spark.catalog_ext.create_table( 59 | 'test_table', 60 | path='/tmp/test_table_2', 61 | schema=new_df.schema, 62 | mode='overwrite', 63 | ) 64 | 65 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 66 | 67 | new_table = self.spark.table('test_table') 68 | self.assertEqual( 69 | [r.asDict() for r in new_table.collect()], 70 | [{'c': 'row_5', 'd': 'hi'}], 71 | ) 72 | 73 | def test_drop_table(self): 74 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 75 | 76 | self.spark.catalog_ext.drop_table('test_table') 77 | 78 | self.assertFalse(self.spark.catalog_ext.has_table('test_table')) 79 | 80 | def test_drop_table_non_default_db(self): 81 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.test_table')) 82 | 83 | self.spark.catalog_ext.drop_table('test_db.test_table') 84 | 85 | self.assertFalse(self.spark.catalog_ext.has_table('test_db.test_table')) 86 | 87 | def test_has_table(self): 88 | self.assertFalse(self.spark.catalog_ext.has_table(None)) 89 | self.assertFalse(self.spark.catalog_ext.has_table('')) 90 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 91 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.test_table')) 92 | self.assertFalse(self.spark.catalog_ext.has_table('test_unknown_table')) 93 | self.assertFalse(self.spark.catalog_ext.has_table('non_exists.test_unknown_table')) 94 | 95 | def test_rename_table(self): 96 | self.spark.catalog_ext.drop_table('new_test_table') 97 | self.assertTrue(self.spark.catalog_ext.has_table('test_table')) 98 | self.assertFalse(self.spark.catalog_ext.has_table('new_test_table')) 99 | 100 | self.spark.catalog_ext.rename_table('test_table', 'new_test_table') 101 | 102 | 
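# After the rename the old name should no longer resolve, while the new name
# should expose the same two rows written in setUp.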
self.assertFalse(self.spark.catalog_ext.has_table('test_table')) 103 | self.assertTrue(self.spark.catalog_ext.has_table('new_test_table')) 104 | self.assertEqual(self.spark.table('new_test_table').count(), 2) 105 | 106 | def test_rename_table_non_default_db(self): 107 | self.spark.catalog_ext.drop_table('test_db.new_test_table') 108 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.test_table')) 109 | self.assertFalse(self.spark.catalog_ext.has_table('test_db.new_test_table')) 110 | 111 | self.spark.catalog_ext.rename_table('test_db.test_table', 'test_db.new_test_table') 112 | 113 | self.assertFalse(self.spark.catalog_ext.has_table('test_db.test_table')) 114 | self.assertTrue(self.spark.catalog_ext.has_table('test_db.new_test_table')) 115 | self.assertEqual(self.spark.table('test_db.new_test_table').count(), 2) 116 | 117 | def test_get_table_properties(self): 118 | properties = self.spark.catalog_ext.get_table_properties('test_table') 119 | 120 | self.assertEqual(properties.get('property_a'), 'str_value') 121 | self.assertEqual(properties.get('property_b'), '2') 122 | 123 | def test_get_table_property(self): 124 | self.assertEqual( 125 | self.spark.catalog_ext.get_table_property('test_table', 'property_a'), 126 | 'str_value', 127 | ) 128 | self.assertEqual( 129 | self.spark.catalog_ext.get_table_property('test_db.test_table', 'property_a'), 130 | 'str_value', 131 | ) 132 | 133 | def test_get_table_property_to_type(self): 134 | self.assertEqual( 135 | self.spark.catalog_ext.get_table_property('test_table', 'property_b', to_type=int), 136 | 2, 137 | ) 138 | self.assertEqual( 139 | self.spark.catalog_ext.get_table_property('test_db.test_table', 140 | 'property_b', 141 | to_type=int), 142 | 2, 143 | ) 144 | 145 | def test_get_table_property_unknown(self): 146 | self.assertIsNone(self.spark.catalog_ext.get_table_property('test_table', 'unknown')) 147 | self.assertIsNone( 148 | self.spark.catalog_ext.get_table_property('test_db.test_table', 'unknown') 149 | ) 150 | 151 | def test_set_database_property_with_prohibited_symbols(self): 152 | with self.assertRaises(ValueError): 153 | self.spark.catalog_ext.set_database_property('test_db', 'broken,key', 'normal_value') 154 | 155 | with self.assertRaises(ValueError): 156 | self.spark.catalog_ext.set_database_property('test_db', 'normal_key', 'broken(value)') 157 | 158 | def test_get_database_property(self): 159 | self.spark.catalog_ext.set_database_property('test_db', 'property_a', 'just,a,string') 160 | self.spark.catalog_ext.set_database_property('test_db', 'property_b', '123') 161 | 162 | self.assertEqual( 163 | self.spark.catalog_ext.get_database_property('test_db', 'property_a'), 164 | 'just,a,string', 165 | ) 166 | self.assertEqual( 167 | self.spark.catalog_ext.get_database_property('test_db', 'property_b', to_type=int), 168 | 123, 169 | ) 170 | self.assertIsNone( 171 | self.spark.catalog_ext.get_database_property('test_db', 'unknown_prop', to_type=int), 172 | ) 173 | 174 | def test_get_database_properties(self): 175 | self.spark.catalog_ext.set_database_property('test_db', 'property_a', 'just,a,string') 176 | self.spark.catalog_ext.set_database_property('test_db', 'property_b', '123') 177 | 178 | self.assertEqual(self.spark.catalog_ext.get_database_properties('test_db'), { 179 | 'property_a': 'just,a,string', 180 | 'property_b': '123', 181 | }) 182 | 183 | def test_read_db_properties_format_for_typical_input(self): 184 | self.assertEqual(read_db_properties_format('((a,b), (c,d))'), [['a', 'b'], ['c', 'd']]) 185 | 
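# A single key/value pair and an empty property list should parse as well.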
self.assertEqual(read_db_properties_format('((a,b))'), [['a', 'b']]) 186 | self.assertEqual(read_db_properties_format('()'), []) 187 | 188 | def test_read_db_properties_format_for_broken_input(self): 189 | with self.assertRaises(ValueError): 190 | read_db_properties_format('((a, b), (c, d)') 191 | 192 | with self.assertRaises(ValueError): 193 | read_db_properties_format(')(a, b), (c, d)(') 194 | 195 | with self.assertRaises(ValueError): 196 | read_db_properties_format(')(') 197 | 198 | with self.assertRaises(ValueError): 199 | read_db_properties_format(')') 200 | 201 | 202 | class TestSparklyWithOldCatalog(TestSparklyCatalog): 203 | session = SparklyTestSessionWithOldCatalog 204 | -------------------------------------------------------------------------------- /tests/integration/test_instant_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | import os 19 | 20 | try: 21 | from unittest import mock 22 | except ImportError: 23 | import mock 24 | 25 | from pyspark import SparkContext 26 | 27 | from sparkly.instant_testing import InstantTesting 28 | from sparkly.testing import SparklyGlobalSessionTest 29 | from tests.integration.base import SparklyTestSession 30 | 31 | 32 | _MOCK_LOCK_FILE_PATH = InstantTesting.LOCK_FILE_PATH + '__test' 33 | 34 | 35 | @mock.patch.object(InstantTesting, 'LOCK_FILE_PATH', _MOCK_LOCK_FILE_PATH) 36 | class TestInstantTesting(SparklyGlobalSessionTest): 37 | session = SparklyTestSession 38 | 39 | def setUp(self): 40 | try: 41 | os.remove(_MOCK_LOCK_FILE_PATH) 42 | except: 43 | pass 44 | 45 | def test_set_context(self): 46 | InstantTesting.activate() 47 | InstantTesting.set_context(self.spark.sparkContext) 48 | 49 | with open(_MOCK_LOCK_FILE_PATH) as f: 50 | state = json.load(f) 51 | self.assertEqual(state, { 52 | 'gateway_port': 53 | self.spark.sparkContext._gateway.java_gateway_server.getListeningPort(), 54 | 'session_pid': os.getpid(), 55 | 'gateway_secret': getattr( 56 | self.spark.sparkContext._gateway.gateway_parameters, 'auth_token', None, 57 | ), 58 | }) 59 | 60 | def test_get_context(self): 61 | initial_context = self.spark.sparkContext 62 | 63 | InstantTesting.activate() 64 | InstantTesting.set_context(initial_context) 65 | 66 | with mock.patch.object(SparkContext, '_active_spark_context', None): 67 | recovered_context = InstantTesting.get_context() 68 | 69 | self.assertIsInstance(recovered_context, SparkContext) 70 | self.assertEqual(initial_context.appName, recovered_context.appName) 71 | self.assertEqual(initial_context.master, recovered_context.master) 72 | -------------------------------------------------------------------------------- /tests/integration/test_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import json 17 | import uuid 18 | 19 | from sparkly.exceptions import InvalidArgumentError 20 | from sparkly.testing import ( 21 | SparklyGlobalSessionTest, 22 | SparklyTest, 23 | CassandraFixture, 24 | MysqlFixture, 25 | ElasticFixture, 26 | KafkaFixture, 27 | ) 28 | from sparkly.utils import absolute_path, kafka_get_topics_offsets 29 | from tests.integration.base import ( 30 | SparklyTestSession, 31 | ) 32 | 33 | 34 | class SparklyReaderCassandraTest(SparklyGlobalSessionTest): 35 | session = SparklyTestSession 36 | 37 | fixtures = [ 38 | CassandraFixture( 39 | 'cassandra.docker', 40 | absolute_path(__file__, 'resources', 'test_read', 'cassandra_setup.cql'), 41 | absolute_path(__file__, 'resources', 'test_read', 'cassandra_teardown.cql'), 42 | ) 43 | ] 44 | 45 | def test_read(self): 46 | df = self.spark.read_ext.cassandra( 47 | host='cassandra.docker', 48 | port=9042, 49 | keyspace='sparkly_test', 50 | table='test', 51 | consistency='ONE', 52 | ) 53 | 54 | self.assertRowsEqual(df.collect(), [ 55 | { 56 | 'countries': {'DZ': 1, 'EG': 206, 'BE': 1, 'CA': 1, 'AE': 13, 'BH': 3}, 57 | 'uid': '1', 58 | 'created': '1234567894', 59 | }, 60 | { 61 | 'countries': {'DZ': 1, 'EG': 206, 'BE': 1, 'CA': 1, 'AE': 13, 'BH': 3}, 62 | 'uid': '2', 63 | 'created': '1234567893', 64 | }, 65 | { 66 | 'countries': {'DZ': 1, 'EG': 206, 'BE': 1, 'CA': 1, 'AE': 13, 'BH': 3}, 67 | 'uid': '3', 68 | 'created': '1234567891', 69 | } 70 | ]) 71 | 72 | 73 | ELASTIC_TEST_DATA = [ 74 | { 75 | 'name': 'Smith3', 76 | 'topics': [1, 4, 5], 77 | 'age': 31, 78 | 'demo': { 79 | 'age_30': 110, 80 | 'age_10': 50, 81 | } 82 | }, 83 | { 84 | 'name': 'Smith4', 85 | 'topics': [4, 5], 86 | 'age': 12, 87 | 'demo': { 88 | 'age_30': 20, 89 | 'age_10': 1, 90 | } 91 | } 92 | ] 93 | 94 | 95 | class SparklyReaderElasticTest(SparklyGlobalSessionTest): 96 | session = SparklyTestSession 97 | 98 | fixtures = [ 99 | ElasticFixture( 100 | 'elastic.docker', 101 | 'sparkly_test', 102 | None, 103 | None, 104 | absolute_path(__file__, 'resources', 'test_read', 'elastic7_setup.json'), 105 | ) 106 | ] 107 | 108 | def test_elastic(self): 109 | df = self.spark.read_ext.elastic( 110 | host='elastic.docker', 111 | port=9200, 112 | es_index='sparkly_test', 113 | es_type=None, 114 | query='?q=name:*Smith*', 115 | options={ 116 | 'es.read.field.as.array.include': 'topics', 117 | 'es.read.metadata': 'false', 118 | }, 119 | ) 120 | 121 | self.assertRowsEqual(df.collect(), ELASTIC_TEST_DATA) 122 | 123 | 124 | class SparklyReaderMySQLTest(SparklyGlobalSessionTest): 125 | session = SparklyTestSession 126 | 127 | fixtures = [ 128 | MysqlFixture( 129 | 'mysql.docker', 130 | 'root', 131 | None, 132 | absolute_path(__file__, 'resources', 'test_read', 'mysql_setup.sql'), 133 | absolute_path(__file__, 'resources', 'test_read', 'mysql_teardown.sql'), 134 | ) 135 | ] 136 | 137 | def test_read_mysql(self): 138 | df = self.spark.read_ext.mysql( 139 | host='mysql.docker', 140 | 
database='sparkly_test', 141 | table='test', 142 | options={ 143 | 'user': 'root', 144 | 'password': '', 145 | } 146 | ) 147 | 148 | self.assertRowsEqual(df.collect(), [ 149 | {'id': 1, 'name': 'john', 'surname': 'sk', 'age': 111}, 150 | {'id': 2, 'name': 'john', 'surname': 'po', 'age': 222}, 151 | {'id': 3, 'name': 'john', 'surname': 'ku', 'age': 333}, 152 | ]) 153 | 154 | 155 | class TestReaderKafka(SparklyGlobalSessionTest): 156 | session = SparklyTestSession 157 | 158 | def setUp(self): 159 | self.json_decoder = lambda item: json.loads(item.decode('utf-8')) 160 | self.json_encoder = lambda item: json.dumps(item).encode('utf-8') 161 | self.topic = 'test.topic.write.kafka.{}'.format(uuid.uuid4().hex[:10]) 162 | self.fixture_path = absolute_path(__file__, 'resources', 'test_read', 'kafka_setup.json') 163 | self.fixture = KafkaFixture( 164 | 'kafka.docker', 165 | topic=self.topic, 166 | key_serializer=self.json_encoder, 167 | value_serializer=self.json_encoder, 168 | data=self.fixture_path, 169 | ) 170 | self.fixture.setup_data() 171 | self.expected_data_df = self.spark.read.json(self.fixture_path) 172 | self.expected_data = [item.asDict(recursive=True) 173 | for item in self.expected_data_df.collect()] 174 | 175 | def test_read_by_topic(self): 176 | df = self.spark.read_ext.kafka( 177 | 'kafka.docker', 178 | topic=self.topic, 179 | key_deserializer=self.json_decoder, 180 | value_deserializer=self.json_decoder, 181 | schema=self.expected_data_df.schema, 182 | ) 183 | self.assertRowsEqual( 184 | df.collect(), 185 | self.expected_data, 186 | ) 187 | 188 | def test_read_by_offsets(self): 189 | offsets = kafka_get_topics_offsets('kafka.docker', self.topic) 190 | df = self.spark.read_ext.kafka( 191 | 'kafka.docker', 192 | topic=self.topic, 193 | offset_ranges=offsets, 194 | key_deserializer=self.json_decoder, 195 | value_deserializer=self.json_decoder, 196 | schema=self.expected_data_df.schema, 197 | ) 198 | 199 | self.assertRowsEqual(df.collect(), self.expected_data) 200 | 201 | self.fixture.setup_data() 202 | 203 | offsets = kafka_get_topics_offsets('kafka.docker', self.topic) 204 | df = self.spark.read_ext.kafka( 205 | 'kafka.docker', 206 | topic=self.topic, 207 | offset_ranges=offsets, 208 | key_deserializer=self.json_decoder, 209 | value_deserializer=self.json_decoder, 210 | schema=self.expected_data_df.schema, 211 | ) 212 | 213 | self.assertRowsEqual(df.collect(), self.expected_data * 2) 214 | 215 | df = self.spark.read_ext.kafka( 216 | 'kafka.docker', 217 | topic=self.topic, 218 | offset_ranges=offsets, 219 | key_deserializer=self.json_decoder, 220 | value_deserializer=self.json_decoder, 221 | schema=self.expected_data_df.schema, 222 | include_meta_cols=True, 223 | ) 224 | expected = [ 225 | # normal fields: 226 | 'key', 227 | 'value', 228 | # meta fields: 229 | 'topic', 230 | 'partition', 231 | 'offset', 232 | 'timestamp', 233 | 'timestampType', 234 | ] 235 | self.assertListEqual(sorted(expected), sorted(df.schema.fieldNames())) 236 | 237 | def test_argument_errors(self): 238 | with self.assertRaises(InvalidArgumentError): 239 | self.spark.read_ext.kafka( 240 | 'kafka.docker', 241 | topic=self.topic, 242 | key_deserializer=self.json_decoder, 243 | value_deserializer=self.json_decoder, 244 | # no schema! 245 | ) 246 | self.spark.read_ext.kafka( 247 | 'kafka.docker', 248 | topic=self.topic, 249 | key_deserializer=self.json_decoder, 250 | # no schema! 
251 | ) 252 | self.spark.read_ext.kafka( 253 | 'kafka.docker', 254 | topic=self.topic, 255 | value_deserializer=self.json_decoder, 256 | # no schema! 257 | ) 258 | -------------------------------------------------------------------------------- /tests/integration/test_session.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from sparkly.testing import SparklyGlobalSessionTest 18 | from tests.integration.base import SparklyTestSession 19 | 20 | 21 | class TestSparklySession(SparklyGlobalSessionTest): 22 | session = SparklyTestSession 23 | 24 | def test_options(self): 25 | self.assertEqual('hive', self.spark.conf.get('spark.sql.catalogImplementation')) 26 | self.assertEqual('117', self.spark.conf.get('my.custom.option.1')) 27 | self.assertEqual('223', self.spark.conf.get('my.custom.option.2')) 28 | self.assertEqual('333', self.spark.conf.get('my.custom.option.3')) 29 | 30 | def test_python_udf(self): 31 | rows = self.spark.sql('select length_of_text("hello world")') 32 | self.assertEqual(rows.collect()[0][0], '11') 33 | 34 | def test_jar_udf(self): 35 | self.spark.createDataFrame( 36 | [ 37 | {'key_field': 'A', 'value_field': 1}, 38 | {'key_field': 'B', 'value_field': 2}, 39 | {'key_field': 'C', 'value_field': 3}, 40 | {'key_field': 'D', 'value_field': 4}, 41 | ], 42 | ).createOrReplaceTempView('test_jar_udf') 43 | 44 | rows = self.spark.sql('select collect(key_field, value_field) from test_jar_udf') 45 | self.assertEqual(rows.collect()[0][0], {'A': 1, 'B': 2, 'C': 3, 'D': 4}) 46 | 47 | def test_builder(self): 48 | with self.assertRaises(NotImplementedError): 49 | assert self.spark.builder 50 | -------------------------------------------------------------------------------- /tests/integration/test_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import json 17 | import uuid 18 | import pickle 19 | import time 20 | import unittest 21 | 22 | from sparkly.session import SparklySession 23 | from sparkly.testing import ( 24 | CassandraFixture, 25 | ElasticFixture, 26 | MysqlFixture, 27 | SparklyGlobalSessionTest, 28 | SparklyTest, 29 | KafkaFixture, 30 | KafkaWatcher, 31 | ) 32 | from sparkly.utils import absolute_path 33 | from tests.integration.base import ( 34 | SparklyTestSession, 35 | ) 36 | 37 | 38 | try: 39 | from kafka import KafkaConsumer, KafkaProducer 40 | except ImportError: 41 | pass 42 | 43 | 44 | class TestAssertions(SparklyGlobalSessionTest): 45 | session = SparklyTestSession 46 | 47 | def test_assert_dataframe_equal(self): 48 | df = self.spark.createDataFrame([('Alice', 1), 49 | ('Kelly', 1), 50 | ('BigBoss', 999)], 51 | ['name', 'age']) 52 | self.assertDataFrameEqual( 53 | df, 54 | [{'name': 'Alice', 'age': 1}, 55 | {'name': 'BigBoss', 'age': 999}, 56 | {'name': 'Kelly', 'age': 1}, 57 | ], 58 | ordered=False, 59 | ) 60 | 61 | with self.assertRaises(AssertionError): 62 | self.assertDataFrameEqual( 63 | df, 64 | [{'name': 'Alice', 'age': 1}, 65 | {'name': 'BigBoss', 'age': 999}, 66 | {'name': 'Kelly', 'age': 1}, 67 | ], 68 | ordered=True, 69 | ) 70 | 71 | 72 | class TestSparklyGlobalSessionTest(unittest.TestCase): 73 | 74 | def test_imports_test_target(self): 75 | 76 | class MyGlobalTest(SparklyGlobalSessionTest): 77 | session = SparklyTestSession 78 | test_target = 'tests.integration.fake_modules.testing.is_fake' 79 | 80 | 81 | MyGlobalTest.setUpClass() 82 | 83 | self.assertTrue(is_fake) 84 | 85 | 86 | class TestCassandraFixtures(SparklyGlobalSessionTest): 87 | session = SparklyTestSession 88 | 89 | def test_cassandra_fixture(self): 90 | data_in_cassandra = CassandraFixture( 91 | 'cassandra.docker', 92 | absolute_path(__file__, 'resources', 'test_fixtures', 'cassandra_setup.cql'), 93 | absolute_path(__file__, 'resources', 'test_fixtures', 'cassandra_teardown.cql'), 94 | ) 95 | 96 | with data_in_cassandra: 97 | time.sleep(2) # wait till keyspace is up 98 | df = self.spark.read_ext.by_url('cassandra://cassandra.docker/sparkly_test/test') 99 | self.assertRowsEqual(df.select('uid', 'countries').collect(), [ 100 | { 101 | 'uid': '1', 102 | 'countries': {'AE': 13, 'BE': 1, 'BH': 3, 'CA': 1, 'DZ': 1, 'EG': 206}, 103 | }, 104 | ]) 105 | 106 | 107 | class TestMysqlFixtures(SparklyGlobalSessionTest): 108 | 109 | session = SparklyTestSession 110 | 111 | fixtures = [ 112 | MysqlFixture( 113 | 'mysql.docker', 114 | 'root', 115 | None, 116 | absolute_path(__file__, 'resources', 'test_fixtures', 'mysql_setup.sql'), 117 | absolute_path(__file__, 'resources', 'test_fixtures', 'mysql_teardown.sql'), 118 | ) 119 | ] 120 | 121 | def test_mysql_fixture(self): 122 | df = self.spark.read_ext.by_url('mysql://mysql.docker/sparkly_test/test?user=root&password=') 123 | self.assertRowsEqual(df.collect(), [ 124 | {'id': 1, 'name': 'john', 'surname': 'sk', 'age': 111}, 125 | ]) 126 | 127 | 128 | class TestElasticFixture(SparklyGlobalSessionTest): 129 | 130 | session = SparklyTestSession 131 | 132 | class_fixtures = [ 133 | ElasticFixture( 134 | 'elastic.docker', 135 | 'sparkly_test_fixture', 136 | None, 137 | absolute_path(__file__, 'resources', 'test_fixtures', 'mapping.json'), 138 | absolute_path(__file__, 'resources', 'test_fixtures', 'data_for_es7.json'), 139 | ) 140 | ] 141 | 142 | def test_elastic_fixture(self): 143 | df = self.spark.read_ext.by_url( 144 | 'elastic://elastic.docker/sparkly_test_fixture?es.read.metadata=false' 145 
| ) 146 | self.assertRowsEqual(df.collect(), [{'name': 'John', 'age': 56}]) 147 | 148 | 149 | class TestKafkaFixture(SparklyGlobalSessionTest): 150 | 151 | session = SparklyTestSession 152 | 153 | topic = 'sparkly.test.fixture.{}'.format(uuid.uuid4().hex[:10]) 154 | fixtures = [ 155 | KafkaFixture( 156 | 'kafka.docker', 157 | topic=topic, 158 | key_serializer=lambda item: json.dumps(item).encode('utf-8'), 159 | value_serializer=lambda item: json.dumps(item).encode('utf-8'), 160 | data=absolute_path(__file__, 'resources', 'test_fixtures', 'kafka.json'), 161 | ) 162 | ] 163 | 164 | def test_kafka_fixture(self): 165 | consumer = KafkaConsumer( 166 | self.topic, 167 | bootstrap_servers='kafka.docker:9092', 168 | key_deserializer=lambda item: json.loads(item.decode('utf-8')), 169 | value_deserializer=lambda item: json.loads(item.decode('utf-8')), 170 | auto_offset_reset='earliest', 171 | ) 172 | 173 | actual_data = [] 174 | for i in range(5): 175 | message = next(consumer) 176 | data = {'key': message.key, 'value': message.value} 177 | actual_data.append(data) 178 | 179 | expected_data = self.spark.read.json( 180 | absolute_path(__file__, 'resources', 'test_fixtures', 'kafka.json') 181 | ) 182 | self.assertRowsEqual(expected_data.collect(), actual_data) 183 | 184 | 185 | class TestKafkaWatcher(SparklyGlobalSessionTest): 186 | session = SparklyTestSession 187 | 188 | def test_write_kafka_dataframe(self): 189 | host = 'kafka.docker' 190 | topic = 'test.topic.kafkawatcher.{}'.format(uuid.uuid4().hex[:10]) 191 | port = 9092 192 | input_df, expected_data = self.get_test_data('kafka_watcher_1.json') 193 | 194 | kafka_watcher = KafkaWatcher( 195 | self.spark, 196 | input_df.schema, 197 | pickle.loads, 198 | pickle.loads, 199 | host, 200 | topic, 201 | port, 202 | ) 203 | with kafka_watcher: 204 | expected_count = self.write_data(input_df, host, topic, port) 205 | self.assertEqual(kafka_watcher.count, expected_count) 206 | self.assertRowsEqual(kafka_watcher.df.collect(), expected_data) 207 | 208 | with kafka_watcher: 209 | pass 210 | self.assertEqual(kafka_watcher.count, 0) 211 | self.assertIsNone(kafka_watcher.df, None) 212 | 213 | input_df, expected_data = self.get_test_data('kafka_watcher_2.json') 214 | with kafka_watcher: 215 | expected_count = self.write_data(input_df, host, topic, port) 216 | self.assertEqual(kafka_watcher.count, expected_count) 217 | self.assertRowsEqual(kafka_watcher.df.collect(), expected_data) 218 | 219 | def get_test_data(self, filename): 220 | file_path = absolute_path(__file__, 'resources', 'test_testing', filename) 221 | df = self.spark.read.json(file_path) 222 | data = [item.asDict(recursive=True) for item in df.collect()] 223 | return df, data 224 | 225 | def write_data(self, df, host, topic, port): 226 | producer = KafkaProducer( 227 | bootstrap_servers=['{}:{}'.format(host, port)], 228 | key_serializer=pickle.dumps, 229 | value_serializer=pickle.dumps, 230 | ) 231 | rows = df.collect() 232 | for row in rows: 233 | producer.send(topic, key=row.key, value=row.value) 234 | producer.flush() 235 | return len(rows) 236 | 237 | 238 | class TestSwitchingBetweenTestSessions(unittest.TestCase): 239 | # Test whether a user can switch between different sessions 240 | # during tests 241 | 242 | def test_switch_session_between_sparkly_tests(self): 243 | # Define a test session with ES 7.14 244 | class SessionA(SparklySession): 245 | packages = [ 246 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.14.0', 247 | ] 248 | 249 | repositories = [ 250 | 
'http://packages.confluent.io/maven/', 251 | ] 252 | 253 | class TestSessionA(SparklyTest): 254 | session = SessionA 255 | 256 | # Define a test session with ES 7.17 257 | class SessionB(SparklySession): 258 | packages = [ 259 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8', 260 | ] 261 | 262 | repositories = [ 263 | 'http://packages.confluent.io/maven/', 264 | ] 265 | 266 | class TestSessionB(SparklyTest): 267 | session = SessionB 268 | 269 | # Make sure that when the ES 7.14 session is set up, the underlying 270 | # spark session contains the appropriate jars 271 | TestSessionA.setUpClass() 272 | es_7_14_jar = ( 273 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.14.0.jar' 274 | ) 275 | installed_jars = list(TestSessionA.spark._jsc.jars()) 276 | self.assertIn(es_7_14_jar, installed_jars) 277 | TestSessionA.tearDownClass() 278 | 279 | # And now make sure that when the ES 7.17 session is set up, the underlying 280 | # spark session contains the appropriate jars as well 281 | TestSessionB.setUpClass() 282 | es_7_17_jar = ( 283 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.17.8.jar' 284 | ) 285 | installed_jars = list(TestSessionB.spark._jsc.jars()) 286 | self.assertIn(es_7_17_jar, installed_jars) 287 | self.assertNotIn(es_7_14_jar, installed_jars) 288 | 289 | TestSessionB.tearDownClass() 290 | 291 | def test_switch_global_session_between_sparkly_tests(self): 292 | # Define a test session with ES 7.14 293 | class SessionA(SparklySession): 294 | packages = [ 295 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.14.0', 296 | ] 297 | 298 | repositories = [ 299 | 'http://packages.confluent.io/maven/', 300 | ] 301 | 302 | class TestSessionA(SparklyGlobalSessionTest): 303 | session = SessionA 304 | 305 | # Define a test session with ES 7.17 306 | class SessionB(SparklySession): 307 | packages = [ 308 | 'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8', 309 | ] 310 | 311 | repositories = [ 312 | 'http://packages.confluent.io/maven/', 313 | ] 314 | 315 | class TestSessionB(SparklyGlobalSessionTest): 316 | session = SessionB 317 | 318 | # Make sure that when the ES 7.14 session is set up, the underlying 319 | # spark session contains the appropriate jars 320 | TestSessionA.setUpClass() 321 | es_7_14_jar = ( 322 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.14.0.jar' 323 | ) 324 | installed_jars = list(TestSessionA.spark._jsc.jars()) 325 | self.assertIn(es_7_14_jar, installed_jars) 326 | TestSessionA.tearDownClass() 327 | 328 | # And now make sure that when the ES 7.17 session is set up, the underlying 329 | # spark session contains the appropriate jars as well 330 | TestSessionB.setUpClass() 331 | es_7_17_jar = ( 332 | 'file:///root/.ivy2/jars/org.elasticsearch_elasticsearch-spark-30_2.12-7.17.8.jar' 333 | ) 334 | installed_jars = list(TestSessionB.spark._jsc.jars()) 335 | self.assertIn(es_7_17_jar, installed_jars) 336 | self.assertNotIn(es_7_14_jar, installed_jars) 337 | TestSessionB.tearDownClass() 338 | -------------------------------------------------------------------------------- /tests/no_extras/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /tests/no_extras/test_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | 19 | from sparkly.testing import ( 20 | CassandraFixture, 21 | MysqlFixture, 22 | SparklyGlobalSessionTest, 23 | KafkaFixture) 24 | from tests.integration.base import SparklyTestSession 25 | 26 | 27 | class TestCassandraFixtures(SparklyGlobalSessionTest): 28 | session = SparklyTestSession 29 | 30 | def test_cassandra_fixture(self): 31 | with self.assertRaises(NotImplementedError): 32 | CassandraFixture( 33 | 'cassandra.docker', 34 | 'test', 35 | 'test', 36 | ) 37 | 38 | 39 | class TestMysqlFixtures(SparklyGlobalSessionTest): 40 | 41 | session = SparklyTestSession 42 | 43 | def test_mysql_fixture(self): 44 | with self.assertRaises(NotImplementedError): 45 | MysqlFixture( 46 | 'mysql.docker', 47 | 'root', 48 | None, 49 | 'test', 50 | 'test', 51 | ) 52 | 53 | 54 | class TestKafkaFixture(SparklyGlobalSessionTest): 55 | 56 | session = SparklyTestSession 57 | 58 | def test_kafka_fixture(self): 59 | with self.assertRaises(NotImplementedError): 60 | KafkaFixture( 61 | 'kafka.docker', 62 | topic='test', 63 | key_serializer=lambda item: json.dumps(item).encode('utf-8'), 64 | value_serializer=lambda item: json.dumps(item).encode('utf-8'), 65 | data='test.json', 66 | ) 67 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | -------------------------------------------------------------------------------- /tests/unit/test_instant_testing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | from unittest import TestCase 19 | 20 | try: 21 | from unittest import mock 22 | except ImportError: 23 | import mock 24 | 25 | from sparkly.instant_testing import InstantTesting 26 | 27 | 28 | _MOCK_LOCK_FILE_PATH = InstantTesting.LOCK_FILE_PATH + '__test' 29 | 30 | 31 | @mock.patch.object(InstantTesting, 'LOCK_FILE_PATH', _MOCK_LOCK_FILE_PATH) 32 | class TestInstantTesting(TestCase): 33 | def setUp(self): 34 | try: 35 | os.remove(_MOCK_LOCK_FILE_PATH) 36 | except: 37 | pass 38 | 39 | def test_activate(self): 40 | self.assertFalse(InstantTesting.is_activated()) 41 | InstantTesting.activate() 42 | self.assertTrue(InstantTesting.is_activated()) 43 | 44 | def test_deactivate(self): 45 | InstantTesting.activate() 46 | self.assertTrue(InstantTesting.is_activated()) 47 | InstantTesting.deactivate() 48 | self.assertFalse(InstantTesting.is_activated()) 49 | 50 | def test_double_activation(self): 51 | InstantTesting.activate() 52 | InstantTesting.activate() 53 | 54 | def test_double_deactivation(self): 55 | InstantTesting.deactivate() 56 | InstantTesting.deactivate() 57 | -------------------------------------------------------------------------------- /tests/unit/test_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import unittest 18 | try: 19 | from unittest import mock 20 | except ImportError: 21 | import mock 22 | 23 | import pyspark.sql 24 | 25 | import sparkly 26 | from sparkly.reader import SparklyReader 27 | from sparkly.utils import parse_schema 28 | 29 | 30 | class TestSparklyReaderByUrl(unittest.TestCase): 31 | def setUp(self): 32 | self.spark = mock.Mock(spec=sparkly.SparklySession) 33 | self.read_ext = SparklyReader(self.spark) 34 | self.fake_df = mock.Mock(spec=pyspark.sql.DataFrame) 35 | 36 | def test_table(self): 37 | self.spark.table.return_value = self.fake_df 38 | 39 | df = self.read_ext.by_url('table://some_hive_table') 40 | 41 | self.assertEqual(df, self.fake_df) 42 | self.spark.table.assert_called_with('some_hive_table') 43 | 44 | def test_parquet(self): 45 | self.spark.read.load.return_value = self.fake_df 46 | 47 | df = self.read_ext.by_url('parquet:s3://my-bucket/path/to/parquet') 48 | 49 | self.assertEqual(df, self.fake_df) 50 | self.spark.read.load.assert_called_with( 51 | path='s3://my-bucket/path/to/parquet', 52 | format='parquet', 53 | ) 54 | 55 | def test_csv(self): 56 | self.spark.read.csv.return_value = self.fake_df 57 | 58 | df = self.read_ext.by_url('csv:s3://my-bucket/path/to/csv?header=true') 59 | 60 | self.assertEqual(df, self.fake_df) 61 | self.spark.read.csv.assert_called_with( 62 | path='s3://my-bucket/path/to/csv', 63 | header='true', 64 | ) 65 | 66 | def test_csv_on_local_file_system(self): 67 | self.spark.read.csv.return_value = self.fake_df 68 | 69 | schema = 'struct>' 70 | df = self.read_ext.by_url('csv:///path/on/file/system?header=false&schema={}' 71 | .format(schema)) 72 | 73 | self.assertEqual(df, self.fake_df) 74 | self.spark.read.csv.assert_called_with( 75 | path='/path/on/file/system', 76 | schema=parse_schema(schema), 77 | header='false', 78 | ) 79 | 80 | def test_elastic_on_or_before_6(self): 81 | self.read_ext.elastic = mock.Mock(return_value=self.fake_df) 82 | 83 | df = self.read_ext.by_url('elastic://es_host/test_index/test_type?' 84 | 'q=name:*Johnny*&fields=name,surname&' 85 | 'es.input.json=true&parallelism=4') 86 | 87 | self.assertEqual(df, self.fake_df) 88 | self.read_ext.elastic.assert_called_with( 89 | host='es_host', 90 | es_index='test_index', 91 | es_type='test_type', 92 | query='?q=name:*Johnny*', 93 | fields=['name', 'surname'], 94 | port=None, 95 | parallelism=4, 96 | options={'es.input.json': 'true'}, 97 | ) 98 | 99 | def test_elastic_on_and_after_7(self): 100 | self.read_ext.elastic = mock.Mock(return_value=self.fake_df) 101 | 102 | df = self.read_ext.by_url('elastic://es_host/test_index?' 103 | 'q=name:*Johnny*&fields=name,surname&' 104 | 'es.input.json=true&parallelism=4') 105 | 106 | self.assertEqual(df, self.fake_df) 107 | self.read_ext.elastic.assert_called_with( 108 | host='es_host', 109 | es_index='test_index', 110 | es_type=None, 111 | query='?q=name:*Johnny*', 112 | fields=['name', 'surname'], 113 | port=None, 114 | parallelism=4, 115 | options={'es.input.json': 'true'}, 116 | ) 117 | 118 | def test_cassandra(self): 119 | self.read_ext.cassandra = mock.Mock(return_value=self.fake_df) 120 | 121 | df = self.read_ext.by_url('cassandra://localhost/test_cf/test_table?'
122 | 'consistency=ONE&parallelism=8&query.retry.count=2') 123 | 124 | self.assertEqual(df, self.fake_df) 125 | self.read_ext.cassandra.assert_called_with( 126 | host='localhost', 127 | port=None, 128 | keyspace='test_cf', 129 | table='test_table', 130 | consistency='ONE', 131 | parallelism=8, 132 | options={'query.retry.count': '2'}, 133 | ) 134 | 135 | def test_cassandra_custom_port(self): 136 | self.read_ext.cassandra = mock.Mock(return_value=self.fake_df) 137 | 138 | df = self.read_ext.by_url('cassandra://localhost:19042/test_cf/test_table?' 139 | 'consistency=ONE&parallelism=8&query.retry.count=2') 140 | 141 | self.assertEqual(df, self.fake_df) 142 | self.read_ext.cassandra.assert_called_with( 143 | host='localhost', 144 | port=19042, 145 | keyspace='test_cf', 146 | table='test_table', 147 | consistency='ONE', 148 | parallelism=8, 149 | options={'query.retry.count': '2'}, 150 | ) 151 | 152 | def test_mysql(self): 153 | self.read_ext.mysql = mock.Mock(return_value=self.fake_df) 154 | 155 | df = self.read_ext.by_url('mysql://localhost/test_database/test_table?' 156 | 'user=root&password=pass') 157 | 158 | self.assertEqual(df, self.fake_df) 159 | self.read_ext.mysql.assert_called_with( 160 | host='localhost', 161 | database='test_database', 162 | table='test_table', 163 | port=None, 164 | parallelism=None, 165 | options={'user': 'root', 'password': 'pass'}, 166 | ) 167 | 168 | def test_mysql_custom_port(self): 169 | self.read_ext.mysql = mock.Mock(return_value=self.fake_df) 170 | 171 | df = self.read_ext.by_url('mysql://localhost:33306/test_database/test_table?' 172 | 'user=root&password=pass') 173 | 174 | self.assertEqual(df, self.fake_df) 175 | self.read_ext.mysql.assert_called_with( 176 | host='localhost', 177 | database='test_database', 178 | table='test_table', 179 | port=33306, 180 | parallelism=None, 181 | options={'user': 'root', 'password': 'pass'}, 182 | ) 183 | 184 | def test_unknown_format(self): 185 | self.assertRaises(NotImplementedError, self.read_ext.by_url, 'fake://host') 186 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from unittest import TestCase 18 | try: 19 | from unittest import mock 20 | except ImportError: 21 | import mock 22 | 23 | from pyspark import StorageLevel 24 | from pyspark.sql import DataFrame 25 | from pyspark.sql import types as T 26 | 27 | from sparkly.utils import lru_cache, parse_schema, schema_has 28 | 29 | 30 | class TestParseSchema(TestCase): 31 | def test_atomic(self): 32 | self.assert_parsed_properly('date') 33 | self.assert_parsed_properly('float') 34 | self.assert_parsed_properly('string') 35 | self.assert_parsed_properly('timestamp') 36 | self.assert_parsed_properly('int') 37 | 38 | def test_array(self): 39 | self.assert_parsed_properly('array') 40 | 41 | def test_map(self): 42 | self.assert_parsed_properly('map') 43 | 44 | def test_struct(self): 45 | self.assert_parsed_properly('struct') 46 | 47 | def test_parse_complex_types(self): 48 | self.assert_parsed_properly('array>') 49 | self.assert_parsed_properly('map>>') 50 | self.assert_parsed_properly('struct>') 51 | self.assert_parsed_properly('struct>,c:map>') 52 | 53 | def assert_parsed_properly(self, schema): 54 | self.assertEqual(parse_schema(schema).simpleString(), schema) 55 | 56 | 57 | class TestLruCache(TestCase): 58 | def test_caching(self): 59 | df = mock.MagicMock(spec=DataFrame) 60 | 61 | called = [0] 62 | 63 | @lru_cache(storage_level=StorageLevel.DISK_ONLY) 64 | def func(*args, **kwargs): 65 | called[0] += 1 66 | return df 67 | 68 | func() 69 | df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 70 | self.assertEqual(df.unpersist.mock_calls, []) 71 | self.assertEqual(called[0], 1) 72 | 73 | cached_df = func() 74 | self.assertEqual(cached_df, df) 75 | self.assertEqual(called[0], 1) 76 | 77 | def test_eviction(self): 78 | first_df = mock.MagicMock(spec=DataFrame) 79 | second_df = mock.MagicMock(spec=DataFrame) 80 | 81 | @lru_cache(maxsize=1, storage_level=StorageLevel.DISK_ONLY) 82 | def func(uid): 83 | if uid == 'first': 84 | return first_df 85 | else: 86 | return second_df 87 | 88 | func('first') 89 | first_df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 90 | self.assertEqual(first_df.unpersist.mock_calls, []) 91 | 92 | func('second') 93 | first_df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 94 | first_df.unpersist.assert_called_once_with() 95 | second_df.persist.assert_called_once_with(StorageLevel.DISK_ONLY) 96 | self.assertEqual(second_df.unpersist.mock_calls, []) 97 | 98 | 99 | class TestSchemaHas(TestCase): 100 | def test_structs_equal(self): 101 | schema_has( 102 | T.StructType([ 103 | T.StructField('f1', T.IntegerType()), 104 | T.StructField('f2', T.FloatType()), 105 | T.StructField('f3', T.StringType()), 106 | ]), 107 | T.StructType([ 108 | T.StructField('f3', T.StringType()), 109 | T.StructField('f2', T.FloatType()), 110 | T.StructField('f1', T.IntegerType()), 111 | ]), 112 | ) 113 | 114 | def test_structs_equal_with_dict(self): 115 | schema_has( 116 | T.StructType([ 117 | T.StructField('f1', T.IntegerType()), 118 | T.StructField('f2', T.FloatType()), 119 | T.StructField('f3', T.StringType()), 120 | ]), 121 | { 122 | 'f1': T.IntegerType(), 123 | 'f2': T.FloatType(), 124 | 'f3': T.StringType(), 125 | }, 126 | ) 127 | 128 | def test_structs_subset(self): 129 | schema_has( 130 | T.StructType([ 131 | T.StructField('f1', T.IntegerType()), 132 | T.StructField('f2', T.FloatType()), 133 | T.StructField('f3', T.StringType()), 134 | ]), 135 | T.StructType([ 136 | T.StructField('f2', T.FloatType()), 137 | ]), 138 | ) 139 | 140 | def test_structs_nested_subset(self): 
141 | schema_has( 142 | T.StructType([ 143 | T.StructField( 144 | 'f1', 145 | T.ArrayType(T.StructType([ 146 | T.StructField('f11', T.IntegerType()), 147 | T.StructField('f12', T.StringType()), 148 | ])), 149 | ), 150 | ]), 151 | T.StructType([ 152 | T.StructField( 153 | 'f1', 154 | T.ArrayType(T.StructType([T.StructField('f11', T.IntegerType())])), 155 | ), 156 | ]), 157 | ) 158 | 159 | def test_arrays_equal(self): 160 | schema_has( 161 | T.ArrayType(T.ArrayType(T.ArrayType(T.LongType()))), 162 | T.ArrayType(T.ArrayType(T.ArrayType(T.LongType()))), 163 | ) 164 | 165 | def test_arrays_nested_subset(self): 166 | schema_has( 167 | T.ArrayType(T.ArrayType(T.StructType([ 168 | T.StructField('f1', T.ArrayType(T.LongType())), 169 | T.StructField('f2', T.ArrayType(T.StringType())), 170 | ]))), 171 | T.ArrayType(T.ArrayType(T.StructType([ 172 | T.StructField('f1', T.ArrayType(T.LongType())) 173 | ]))), 174 | ) 175 | 176 | def test_maps_equal(self): 177 | schema_has( 178 | T.MapType(T.StringType(), T.MapType(T.StringType(), T.LongType())), 179 | T.MapType(T.StringType(), T.MapType(T.StringType(), T.LongType())), 180 | ) 181 | 182 | def test_maps_nested_subset(self): 183 | schema_has( 184 | T.MapType( 185 | T.StringType(), 186 | T.MapType( 187 | T.StringType(), 188 | T.StructType([ 189 | T.StructField('f1', T.MapType(T.StringType(), T.LongType())), 190 | T.StructField('f2', T.MapType(T.StringType(), T.IntegerType())), 191 | ]), 192 | ), 193 | ), 194 | T.MapType( 195 | T.StringType(), 196 | T.MapType( 197 | T.StringType(), 198 | T.StructType([ 199 | T.StructField('f1', T.MapType(T.StringType(), T.LongType())), 200 | ]), 201 | ), 202 | ), 203 | ) 204 | 205 | def test_type_mismatch(self): 206 | with self.assertRaisesRegex(AssertionError, 'Cannot compare heterogeneous types'): 207 | schema_has( 208 | T.StructType([T.StructField('f1', T.IntegerType())]), 209 | T.ArrayType(T.IntegerType()), 210 | ) 211 | 212 | with self.assertRaisesRegex(AssertionError, 'Cannot compare heterogeneous types'): 213 | schema_has( 214 | T.ArrayType(T.IntegerType()), 215 | {'f1': T.IntegerType()}, 216 | ) 217 | 218 | with self.assertRaisesRegex(TypeError, r'f1 is IntegerType\(?\)?, expected LongType\(?\)?'): 219 | schema_has( 220 | T.StructType([T.StructField('f1', T.IntegerType())]), 221 | T.StructType([T.StructField('f1', T.LongType())]), 222 | ) 223 | 224 | with self.assertRaisesRegex( 225 | TypeError, 226 | r'f1\.element\.s1 is IntegerType\(?\)?, expected LongType\(?\)?', 227 | ): 228 | schema_has( 229 | T.StructType([ 230 | T.StructField( 231 | 'f1', 232 | T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])), 233 | ), 234 | ]), 235 | T.StructType([ 236 | T.StructField( 237 | 'f1', 238 | T.ArrayType(T.StructType([T.StructField('s1', T.LongType())])), 239 | ), 240 | ]), 241 | ) 242 | 243 | with self.assertRaisesRegex( 244 | TypeError, 245 | r'element is IntegerType\(?\)?, expected LongType\(?\)?', 246 | ): 247 | schema_has( 248 | T.ArrayType(T.IntegerType()), 249 | T.ArrayType(T.LongType()), 250 | ) 251 | 252 | with self.assertRaisesRegex( 253 | TypeError, 254 | r'key is StringType\(?\)?, expected LongType\(?\)?', 255 | ): 256 | schema_has( 257 | T.MapType(T.StringType(), T.IntegerType()), 258 | T.MapType(T.LongType(), T.IntegerType()), 259 | ) 260 | 261 | with self.assertRaisesRegex( 262 | TypeError, 263 | r'value is IntegerType\(?\)?, expected LongType\(?\)?' 
264 | ): 265 | schema_has( 266 | T.MapType(T.StringType(), T.IntegerType()), 267 | T.MapType(T.StringType(), T.LongType()), 268 | ) 269 | 270 | def test_undefined_field(self): 271 | with self.assertRaisesRegex(KeyError, 'f2'): 272 | schema_has( 273 | T.StructType([T.StructField('f1', T.IntegerType())]), 274 | T.StructType([T.StructField('f2', T.LongType())]), 275 | ) 276 | 277 | with self.assertRaisesRegex(KeyError, r'f1\.element\.s2'): 278 | schema_has( 279 | T.StructType([ 280 | T.StructField( 281 | 'f1', 282 | T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])), 283 | ), 284 | ]), 285 | T.StructType([ 286 | T.StructField( 287 | 'f1', 288 | T.ArrayType(T.StructType([T.StructField('s2', T.LongType())])), 289 | ), 290 | ]), 291 | ) 292 | 293 | with self.assertRaisesRegex( 294 | TypeError, 295 | r'element is IntegerType\(?\)?, expected LongType\(?\)?', 296 | ): 297 | schema_has( 298 | T.ArrayType(T.IntegerType()), 299 | T.ArrayType(T.LongType()), 300 | ) 301 | -------------------------------------------------------------------------------- /tests/unit/test_writer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import unittest 18 | try: 19 | from unittest import mock 20 | except ImportError: 21 | import mock 22 | 23 | from pyspark.sql import DataFrame, SQLContext 24 | 25 | from sparkly import SparklySession 26 | from sparkly.writer import SparklyWriter 27 | 28 | 29 | class TestWriteByUrl(unittest.TestCase): 30 | def setUp(self): 31 | self.df = mock.Mock(spec=DataFrame) 32 | self.df.sql_ctx = mock.Mock(spec=SQLContext) 33 | self.df.sql_ctx.sparkSession = mock.Mock(spec=SparklySession) 34 | self.write_ext = SparklyWriter(self.df) 35 | 36 | def test_parquet_s3(self): 37 | self.write_ext.by_url( 38 | 'parquet:s3://my-bucket/path/to/parquet?partitionBy=x,y,z&mode=append&' 39 | 'additional=1&parallelism=20', 40 | ) 41 | 42 | self.df.coalesce.assert_called_once_with(20) 43 | self.df.coalesce.return_value.write.save.assert_called_once_with( 44 | path='s3://my-bucket/path/to/parquet', 45 | format='parquet', 46 | mode='append', 47 | partitionBy=['x', 'y', 'z'], 48 | additional='1', 49 | ) 50 | 51 | def test_csv_local(self): 52 | self.df.write.csv = mock.Mock() 53 | 54 | self.write_ext.by_url('csv:///my-bucket/path/to/csv?parallelism=10') 55 | 56 | self.df.coalesce.assert_called_once_with(10) 57 | self.df.coalesce.return_value.write.save.assert_called_once_with( 58 | path='/my-bucket/path/to/csv', 59 | format='csv', 60 | ) 61 | 62 | def test_cassandra(self): 63 | self.write_ext.cassandra = mock.Mock() 64 | 65 | self.write_ext.by_url( 66 | 'cassandra://host/ks/cf?consistency=ONE&mode=overwrite&parallelism=10', 67 | ) 68 | 69 | self.write_ext.cassandra.assert_called_once_with( 70 | host='host', 71 | keyspace='ks', 72 | table='cf', 73 | port=None, 74 | mode='overwrite', 75 | consistency='ONE', 76 | parallelism=10, 77 | options={}, 78 | ) 79 | 80 | def test_cassandra_custom_port(self): 81 | self.write_ext.cassandra = mock.Mock() 82 | 83 | self.write_ext.by_url( 84 | 'cassandra://host:19042/ks/cf?consistency=ONE&mode=overwrite&parallelism=10', 85 | ) 86 | 87 | self.write_ext.cassandra.assert_called_once_with( 88 | host='host', 89 | keyspace='ks', 90 | table='cf', 91 | port=19042, 92 | mode='overwrite', 93 | consistency='ONE', 94 | parallelism=10, 95 | options={}, 96 | ) 97 | 98 | def test_elastic_on_or_before_6(self): 99 | self.write_ext.elastic = mock.Mock() 100 | 101 | self.write_ext.by_url('elastic://host/index/type?parallelism=15') 102 | 103 | self.write_ext.elastic.assert_called_once_with( 104 | host='host', 105 | es_index='index', 106 | es_type='type', 107 | port=None, 108 | mode=None, 109 | parallelism=15, 110 | options={}, 111 | ) 112 | 113 | def test_elastic_on_and_after_7(self): 114 | self.write_ext.elastic = mock.Mock() 115 | 116 | self.write_ext.by_url('elastic://host/index?parallelism=15') 117 | 118 | self.write_ext.elastic.assert_called_once_with( 119 | host='host', 120 | es_index='index', 121 | es_type=None, 122 | port=None, 123 | mode=None, 124 | parallelism=15, 125 | options={}, 126 | ) 127 | 128 | def test_mysql(self): 129 | self.write_ext.mysql = mock.Mock() 130 | 131 | self.write_ext.by_url('mysql://host/db/table?parallelism=20') 132 | 133 | self.write_ext.mysql.assert_called_with( 134 | host='host', 135 | database='db', 136 | table='table', 137 | port=None, 138 | mode=None, 139 | parallelism=20, 140 | options={}, 141 | ) 142 | 143 | def test_mysql_custom_port(self): 144 | self.write_ext.mysql = mock.Mock() 145 | 146 | self.write_ext.by_url('mysql://host:33306/db/table?parallelism=20') 147 | 148 | self.write_ext.mysql.assert_called_with( 149 | host='host', 150 | database='db', 151 | 
table='table', 152 | port=33306, 153 | mode=None, 154 | parallelism=20, 155 | options={}, 156 | ) 157 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Tubular Labs, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | [tox] 18 | envlist = spark32,spark33,spark34,no_extras,docs 19 | 20 | [testenv:spark32] 21 | commands = py.test --cov=sparkly --cov-report term-missing tests/integration tests/unit 22 | deps = 23 | -rrequirements.txt 24 | -rrequirements_dev.txt 25 | -rrequirements_extras.txt 26 | pyspark==3.2.3 27 | 28 | [testenv:spark33] 29 | commands = py.test --cov=sparkly --cov-report term-missing tests/integration tests/unit 30 | deps = 31 | -rrequirements.txt 32 | -rrequirements_dev.txt 33 | -rrequirements_extras.txt 34 | pyspark==3.3.1 35 | 36 | [testenv:spark34] 37 | commands = py.test --cov=sparkly --cov-report term-missing tests/integration tests/unit 38 | deps = 39 | -rrequirements.txt 40 | -rrequirements_dev.txt 41 | -rrequirements_extras.txt 42 | pyspark==3.4.0 43 | 44 | [testenv:no_extras] 45 | commands = py.test tests/no_extras 46 | deps = 47 | -rrequirements.txt 48 | -rrequirements_dev.txt 49 | pyspark==3.3.1 50 | 51 | [testenv:docs] 52 | commands = sphinx-build -b html docs/source docs/build 53 | deps = 54 | -rrequirements_dev.txt 55 | -rrequirements_extras.txt 56 | -rrequirements.txt 57 | pyspark==3.3.1 58 | --------------------------------------------------------------------------------