├── .gitattributes ├── .github ├── stale.yml └── workflows │ ├── deploy.yml │ └── tests.yml ├── .gitignore ├── .pylintrc ├── .readthedocs.yml ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── demo.ipynb ├── iris.ipynb ├── readthedocs.png ├── requirements.txt └── sphinx │ ├── Makefile │ ├── api.rst │ ├── api_context.rst │ ├── api_fileio.rst │ ├── api_rdd.rst │ ├── api_streaming.rst │ ├── conf.py │ ├── dev.rst │ ├── images │ ├── favicon.ico │ └── logo-w600.png │ ├── index.rst │ ├── parallel.rst │ ├── read_write.rst │ └── version_index │ ├── .nojekyll │ ├── CNAME │ ├── circle.yml │ ├── favicon.ico │ ├── index.html │ └── logo.svg ├── logo ├── banner-w1500.png ├── banner-w500.png ├── banner.svg ├── create.py ├── favicon-w128.png ├── favicon-w16.png ├── favicon-w256.png ├── favicon-w32.png ├── favicon-w48.png ├── favicon.ico ├── favicon.svg ├── logo-w100.png ├── logo-w600.png └── logo.svg ├── pysparkling ├── __init__.py ├── __version__.py ├── _version.py ├── accumulators.py ├── broadcast.py ├── cache_manager.py ├── context.py ├── exceptions.py ├── fileio │ ├── __init__.py │ ├── codec │ │ ├── __init__.py │ │ ├── bz2.py │ │ ├── codec.py │ │ ├── gz.py │ │ ├── lzma.py │ │ ├── sevenz.py │ │ ├── tar.py │ │ └── zip.py │ ├── file.py │ ├── fs │ │ ├── __init__.py │ │ ├── file_system.py │ │ ├── gs.py │ │ ├── hdfs.py │ │ ├── http.py │ │ ├── local.py │ │ └── s3.py │ └── textfile.py ├── partition.py ├── rdd.py ├── samplers.py ├── sql │ ├── __init__.py │ ├── casts.py │ ├── column.py │ ├── conf.py │ ├── context.py │ ├── dataframe.py │ ├── expressions │ │ ├── __init__.py │ │ ├── aggregate │ │ │ ├── __init__.py │ │ │ ├── aggregations.py │ │ │ ├── collectors.py │ │ │ ├── covariance_aggregations.py │ │ │ └── stat_aggregations.py │ │ ├── arrays.py │ │ ├── csvs.py │ │ ├── dates.py │ │ ├── explodes.py │ │ ├── expressions.py │ │ ├── fields.py │ │ ├── jsons.py │ │ ├── literals.py │ │ ├── mappers.py │ │ ├── operators.py │ │ ├── orders.py │ │ ├── strings.py │ │ └── userdefined.py │ ├── functions.py │ ├── group.py │ ├── internal_utils │ │ ├── __init__.py │ │ ├── column.py │ │ ├── joins.py │ │ ├── options.py │ │ ├── readers │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── csvreader.py │ │ │ ├── jsonreader.py │ │ │ ├── textreader.py │ │ │ └── utils.py │ │ ├── readwrite.py │ │ └── writers.py │ ├── internals.py │ ├── readwriter.py │ ├── schema_utils.py │ ├── session.py │ ├── tests │ │ ├── __init__.py │ │ ├── data │ │ │ └── fundings │ │ │ │ └── part-0.csv │ │ ├── expressions │ │ │ └── test_mappers.py │ │ ├── test_casts.py │ │ ├── test_read.py │ │ ├── test_session.py │ │ └── test_write.py │ ├── types.py │ └── utils.py ├── stat_counter.py ├── storagelevel.py ├── streaming │ ├── __init__.py │ ├── context.py │ ├── dstream.py │ ├── filestream.py │ ├── queuestream.py │ └── tcpstream.py ├── task_context.py ├── tests │ ├── __init__.py │ ├── data.7z │ ├── data.tar.gz │ ├── pyspark │ │ ├── key_value.txt.bz2 │ │ │ ├── _SUCCESS │ │ │ └── part-00000.bz2 │ │ ├── key_value.txt.gz │ │ │ ├── _SUCCESS │ │ │ └── part-00000.gz │ │ └── key_value.txt │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ ├── test_broadcast.py │ ├── test_cache.py │ ├── test_context.py │ ├── test_multiprocessing.py │ ├── test_rdd.py │ ├── test_resolve_filenames.py │ ├── test_sample.py │ ├── test_stat_counter.py │ ├── test_streaming_files.py │ ├── test_streaming_queue.py │ ├── test_streaming_tcp.py │ └── test_textFile.py └── utils.py ├── scripts ├── benchmark_csv.py ├── benchmark_generators.py ├── ipcluster_simple.py ├── log_streaming.py ├── 
multiprocessing_performance_plot.pdf ├── multiprocessing_performance_plot.png ├── multiprocessing_performance_plot.py ├── profile_textfile.py ├── pyspark_comparisons.py ├── pyspark_streaming.py ├── readme_example.py ├── readme_example_common_crawl.py ├── readme_example_human_microbiome.py ├── readme_example_word_count.py ├── starcluster_simple.py ├── tcpperf_client.py ├── tcpperf_connections.csv ├── tcpperf_connections.csv.pdf ├── tcpperf_connections.csv.png ├── tcpperf_messages.csv ├── tcpperf_messages.csv.pdf ├── tcpperf_messages.csv.png ├── tcpperf_plot.py └── tcpperf_server.py ├── setup.cfg ├── setup.py └── versioneer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | pysparkling/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload 2 | 3 | # Build on every branch push, tag push, and pull request change: 4 | # on: [push, pull_request] 5 | # Alternatively, to publish when a (published) GitHub Release is created, use the following: 6 | on: 7 | push: 8 | branches: 9 | - master 10 | pull_request: 11 | branches: 12 | - master 13 | release: 14 | types: 15 | - published 16 | 17 | jobs: 18 | build_sdist: 19 | name: Build Python source distribution 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | with: 24 | fetch-depth: 0 25 | 26 | - uses: actions/setup-python@v4 27 | name: Install Python 28 | with: 29 | python-version: '3.7' 30 | 31 | - name: Build sdist 32 | run: python setup.py sdist 33 | 34 | - uses: actions/upload-artifact@v3 35 | with: 36 | path: dist/*.tar.gz 37 | 38 | upload_pypi: 39 | needs: [build_sdist] 40 | runs-on: ubuntu-latest 41 | # upload to PyPI on every tag starting with 'v' 42 | # if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') 43 | # alternatively, to publish when a GitHub Release is created, use the following rule: 44 | if: github.event_name == 'release' && github.event.action == 'published' 45 | steps: 46 | - uses: actions/download-artifact@v3 47 | with: 48 | name: artifact 49 | path: dist 50 | 51 | - uses: pypa/gh-action-pypi-publish@master 52 | with: 53 | user: __token__ 54 | password: ${{ secrets.pypi_token }} 55 | # To test: repository_url: https://test.pypi.org/legacy/ 56 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: 
-------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [ ubuntu-latest, macos-latest, windows-latest ] 12 | python: [ 3.7, 3.8, 3.9, "3.10", "3.11" ] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | with: 17 | fetch-depth: 0 18 | 19 | - name: Set up Python ${{ matrix.python }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python }} 23 | 24 | - name: Install 25 | run: | 26 | python -m pip install --upgrade pip setuptools 27 | python -m pip install -e ".[tests,scripts]" 28 | 29 | - name: Print environment 30 | run: | 31 | python -m pip freeze 32 | python --version 33 | python -c "import pysparkling; print(pysparkling.__version__)" 34 | 35 | - name: Check if import order is fine 36 | run: | 37 | isort . --check --diff 38 | 39 | - name: Test pysparkling/rdd.py 40 | run: python -m pytest pysparkling/rdd.py -vv 41 | 42 | - name: Test pysparkling/tests 43 | if: matrix.os == 'ubuntu-latest' # because of timing sensitivity in stream tests 44 | run: python -m pytest pysparkling/tests -vv 45 | 46 | - name: Install SQL Dependencies 47 | run: | 48 | python -m pip install -e ".[sql]" 49 | 50 | - name: Lint 51 | if: matrix.python != '3.9' 52 | run: pylint pysparkling scripts --disable=fixme 53 | 54 | - name: pycodestyle 55 | run: python -m pycodestyle pysparkling scripts 56 | 57 | - name: Test All 58 | if: matrix.os == 'ubuntu-latest' # because of timing sensitivity in stream tests 59 | run: python -m pytest -vv 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | test.* 3 | profile.out 4 | .vscode 5 | scripts/textout 6 | tests/textout 7 | checkpoints/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | .pytest_cache/ 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Vim 18 | *.sw[po] 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | .env/ 24 | venv*/ 25 | pypy/ 26 | pypy3/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *,cover 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | 69 | # Sphinx documentation 70 | docs/sphinx/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Spark data files 76 | *.crc 77 | 78 | # IPython 79 | *.ipynb.syncdoc 80 | .ipynb_checkpoints 81 | .ipython-daemon.json 82 | 83 | /.idea/ 84 | /reports/ 85 | /pysparkling/tests/20news-19997.tar.gz 86 | 87 | /scripts_private/ 88 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [BASIC] 2 | 3 | variable-rgx=[a-z0-9_]{1,30}$ 4 | good-names=log 5 | 6 | disable=invalid-name,unused-argument,too-few-public-methods,missing-docstring,logging-format-interpolation,too-many-instance-attributes,duplicate-code,too-many-public-methods,too-many-arguments,protected-access,too-many-lines,missing-timeout,unnecessary-lambda-assignment 7 | 8 | [FORMAT] 9 | max-line-length=119 10 | 11 | [SIMILARITIES] 12 | 13 | ignore-imports=yes 14 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/sphinx/conf.py 11 | 12 | # Optionally set the version of Python and requirements required to build your docs 13 | python: 14 | version: 3.7 15 | install: 16 | - requirements: docs/requirements.txt 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2020 pysparkling contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | 24 | ----------------------------------------------------------------------------- 25 | 26 | 27 | Parts of the files pysparkling/accumulators.py, pysparkling/broadcast.py, 28 | pysparkling/rdd.py, pysparkling/storagelevel.py and pysparkling/sql were 29 | extracted from their PySpark counterparts under the following license: 30 | 31 | Licensed to the Apache Software Foundation (ASF) under one or more 32 | contributor license agreements. See the NOTICE file distributed with 33 | this work for additional information regarding copyright ownership. 34 | The ASF licenses this file to You under the Apache License, Version 2.0 35 | (the "License"); you may not use this file except in compliance with 36 | the License. You may obtain a copy of the License at 37 | 38 | http://www.apache.org/licenses/LICENSE-2.0 39 | 40 | Unless required by applicable law or agreed to in writing, software 41 | distributed under the License is distributed on an "AS IS" BASIS, 42 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 43 | See the License for the specific language governing permissions and 44 | limitations under the License. 45 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include pysparkling/_version.py 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://raw.githubusercontent.com/svenkreiss/pysparkling/master/logo/logo-w100.png 2 | :target: https://github.com/svenkreiss/pysparkling 3 | 4 | pysparkling 5 | =========== 6 | 7 | **Pysparkling** provides a faster, more responsive way to develop programs 8 | for PySpark. It enables code intended for Spark applications to execute 9 | entirely in Python, without incurring the overhead of initializing and 10 | passing data through the JVM and Hadoop. The focus is on having a lightweight 11 | and fast implementation for small datasets at the expense of some data 12 | resilience features and some parallel processing features. 13 | 14 | **How does it work?** To switch execution of a script from PySpark to pysparkling, 15 | have the code initialize a pysparkling Context instead of a SparkContext, and 16 | use the pysparkling Context to set up your RDDs. The beauty is you don't have 17 | to change a single line of code after the Context initialization, because 18 | pysparkling's API is (almost) exactly the same as PySpark's. Since it's so easy 19 | to switch between PySpark and pysparkling, you can choose the right tool for your 20 | use case. 21 | 22 | **When would I use it?** Say you are writing a Spark application because you 23 | need robust computation on huge datasets, but you also want the same application 24 | to provide fast answers on a small dataset. You're finding Spark is not responsive 25 | enough for your needs, but you don't want to rewrite an entire separate application 26 | for the *small-answers-fast* problem. You'd rather reuse your Spark code but somehow 27 | get it to run fast. Pysparkling bypasses the stuff that causes Spark's long startup 28 | times and less responsive feel. 
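The switch described above is usually a one-line change. Below is a minimal sketch (not part of the original README) of what that swap looks like, assuming a script that previously created a PySpark ``SparkContext``; the "before" lines are shown only as comments and the file name is illustrative:

.. code-block:: python

    # Before (PySpark), shown for comparison only:
    # from pyspark import SparkContext
    # sc = SparkContext('local[4]', 'my-app')

    # After: only the Context construction changes; the RDD calls stay the same.
    from pysparkling import Context

    sc = Context()
    lengths = (
        sc
        .textFile('README.rst')           # read lines, same call as in PySpark
        .map(lambda line: len(line))      # example transformation
        .collect()                        # bring results back as a Python list
    )
    print(sum(lengths))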
29 | 30 | Here are a few areas where pysparkling excels: 31 | 32 | * Small to medium-scale exploratory data analysis 33 | * Application prototyping 34 | * Low-latency web deployments 35 | * Unit tests 36 | 37 | 38 | Install 39 | ======= 40 | 41 | .. code-block:: bash 42 | 43 | python3 -m pip install "pysparkling[s3,hdfs,http,streaming]" 44 | 45 | 46 | `Documentation `_: 47 | 48 | .. image:: https://raw.githubusercontent.com/svenkreiss/pysparkling/master/docs/readthedocs.png 49 | :target: https://pysparkling.trivial.io 50 | 51 | 52 | Other links: 53 | `Github `_, 54 | |pypi-badge|, |test-badge|, |docs-badge| 55 | 56 | .. |pypi-badge| image:: https://badge.fury.io/py/pysparkling.svg 57 | :target: https://pypi.python.org/pypi/pysparkling/ 58 | .. |test-badge| image:: https://github.com/svenkreiss/pysparkling/workflows/Tests/badge.svg 59 | :target: https://github.com/svenkreiss/pysparkling/actions?query=workflow%3ATests 60 | .. |docs-badge| image:: https://readthedocs.org/projects/pysparkling/badge/?version=latest 61 | :target: https://pysparkling.readthedocs.io/en/latest/?badge=latest 62 | :alt: Documentation Status 63 | 64 | 65 | Features 66 | ======== 67 | 68 | * Supports URI schemes ``s3://``, ``hdfs://``, ``gs://``, ``http://`` and ``file://`` 69 | for Amazon S3, HDFS, Google Storage, web and local file access. 70 | Specify multiple files separated by comma. 71 | Resolves ``*`` and ``?`` wildcards. 72 | * Handles ``.gz``, ``.zip``, ``.lzma``, ``.xz``, ``.bz2``, ``.tar``, 73 | ``.tar.gz`` and ``.tar.bz2`` compressed files. 74 | Supports reading of ``.7z`` files. 75 | * Parallelization via ``multiprocessing.Pool``, 76 | ``concurrent.futures.ThreadPoolExecutor`` or any other Pool-like 77 | objects that have a ``map(func, iterable)`` method. 78 | * Plain pysparkling does not have any dependencies (use ``pip install pysparkling``). 79 | Some file access methods have optional dependencies: 80 | ``boto`` for AWS S3, ``requests`` for http, ``hdfs`` for hdfs 81 | 82 | 83 | Examples 84 | ======== 85 | 86 | Some demos are in the notebooks 87 | `docs/demo.ipynb `_ 88 | and 89 | `docs/iris.ipynb `_ 90 | . 91 | 92 | **Word Count** 93 | 94 | .. code-block:: python 95 | 96 | from pysparkling import Context 97 | 98 | counts = ( 99 | Context() 100 | .textFile('README.rst') 101 | .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line)) 102 | .flatMap(lambda line: line.split(' ')) 103 | .map(lambda word: (word, 1)) 104 | .reduceByKey(lambda a, b: a + b) 105 | ) 106 | print(counts.collect()) 107 | 108 | which prints a long list of pairs of words and their counts. 109 | -------------------------------------------------------------------------------- /docs/readthedocs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/docs/readthedocs.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dateutil 2 | Sphinx 3 | sphinx_rtd_theme 4 | -------------------------------------------------------------------------------- /docs/sphinx/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | API 4 | === 5 | 6 | .. 
currentmodule:: pysparkling 7 | 8 | A usual ``pysparkling`` session starts with either parallelizing a `list` 9 | with :func:`Context.parallelize` or by reading data from a file using 10 | :func:`Context.textFile`. These two methods return :class:`RDD` instances that 11 | can then be processed. 12 | 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | api_rdd 18 | api_context 19 | api_streaming 20 | api_fileio 21 | -------------------------------------------------------------------------------- /docs/sphinx/api_context.rst: -------------------------------------------------------------------------------- 1 | .. _api_context: 2 | 3 | .. currentmodule:: pysparkling 4 | 5 | Context 6 | ------- 7 | 8 | A :class:`~pysparkling.Context` describes the setup. Instantiating a Context with the default 9 | arguments using ``Context()`` is the most lightweight setup. All data is just 10 | in the local thread and is never serialized or deserialized. 11 | 12 | If you want to process the data in parallel, you can use the `multiprocessing` 13 | module. Given the limitations of the default `pickle` serializer, you can 14 | specify to serialize all methods with `cloudpickle` instead. For example, 15 | a common instantiation with `multiprocessing` looks like this: 16 | 17 | .. code-block:: python 18 | 19 | sc = pysparkling.Context( 20 | multiprocessing.Pool(4), 21 | serializer=cloudpickle.dumps, 22 | deserializer=pickle.loads, 23 | ) 24 | 25 | This assumes that your data is serializable with `pickle` which is generally 26 | faster. You can also specify a custom serializer/deserializer for data. 27 | 28 | .. autoclass:: pysparkling.Context 29 | :members: 30 | -------------------------------------------------------------------------------- /docs/sphinx/api_fileio.rst: -------------------------------------------------------------------------------- 1 | .. _api_fileio: 2 | 3 | 4 | fileio 5 | ------ 6 | 7 | .. currentmodule:: pysparkling 8 | 9 | The functionality provided by this module is used in :func:`Context.textFile` 10 | for reading and in :func:`RDD.saveAsTextFile` for writing. 11 | 12 | .. currentmodule:: pysparkling.fileio 13 | 14 | You can use this submodule with :func:`File.dump`, :func:`File.load` and 15 | :func:`File.exists` to read, write and check for existance of a file. 16 | All methods transparently handle various schemas (for example ``http://``, 17 | ``s3://`` and ``file://``) and compression/decompression of ``.gz`` and 18 | ``.bz2`` files (among others). 19 | 20 | 21 | .. autoclass:: pysparkling.fileio.File 22 | :members: 23 | 24 | .. autoclass:: pysparkling.fileio.TextFile 25 | :members: 26 | 27 | 28 | File System 29 | ^^^^^^^^^^^ 30 | 31 | .. autoclass:: pysparkling.fileio.fs.FileSystem 32 | :members: 33 | 34 | .. autoclass:: pysparkling.fileio.fs.Local 35 | :members: 36 | 37 | .. autoclass:: pysparkling.fileio.fs.GS 38 | :members: 39 | 40 | .. autoclass:: pysparkling.fileio.fs.Hdfs 41 | :members: 42 | 43 | .. autoclass:: pysparkling.fileio.fs.Http 44 | :members: 45 | 46 | .. autoclass:: pysparkling.fileio.fs.S3 47 | :members: 48 | 49 | 50 | Codec 51 | ^^^^^ 52 | 53 | .. autoclass:: pysparkling.fileio.codec.Codec 54 | :members: 55 | 56 | .. autoclass:: pysparkling.fileio.codec.Bz2 57 | :members: 58 | 59 | .. autoclass:: pysparkling.fileio.codec.Gz 60 | :members: 61 | 62 | .. autoclass:: pysparkling.fileio.codec.Lzma 63 | :members: 64 | 65 | .. autoclass:: pysparkling.fileio.codec.SevenZ 66 | :members: 67 | 68 | .. autoclass:: pysparkling.fileio.codec.Tar 69 | :members: 70 | 71 | .. 
autoclass:: pysparkling.fileio.codec.TarGz 72 | :members: 73 | 74 | .. autoclass:: pysparkling.fileio.codec.TarBz2 75 | :members: 76 | 77 | .. autoclass:: pysparkling.fileio.codec.Zip 78 | :members: 79 | -------------------------------------------------------------------------------- /docs/sphinx/api_rdd.rst: -------------------------------------------------------------------------------- 1 | .. _api_rdd: 2 | 3 | RDD 4 | --- 5 | 6 | .. autoclass:: pysparkling.RDD 7 | :members: 8 | 9 | .. autoclass:: pysparkling.StatCounter 10 | :members: 11 | -------------------------------------------------------------------------------- /docs/sphinx/api_streaming.rst: -------------------------------------------------------------------------------- 1 | .. _api_streaming: 2 | 3 | Streaming 4 | --------- 5 | 6 | .. warning:: 7 | This is a new addition to the API (March 2017) that should only be used 8 | with care. 9 | 10 | 11 | StreamingContext 12 | ^^^^^^^^^^^^^^^^ 13 | 14 | .. autoclass:: pysparkling.streaming.StreamingContext 15 | :members: 16 | 17 | 18 | DStream 19 | ^^^^^^^ 20 | 21 | .. autoclass:: pysparkling.streaming.DStream 22 | :members: 23 | -------------------------------------------------------------------------------- /docs/sphinx/dev.rst: -------------------------------------------------------------------------------- 1 | .. _dev: 2 | 3 | Development 4 | =========== 5 | 6 | Fork the Github repository and apply your changes in a feature branch. 7 | To run pysparkling's unit tests: 8 | 9 | .. code-block:: sh 10 | 11 | # install 12 | pip install -e .[hdfs,performance,streaming,test] 13 | flake8 --install-hook 14 | 15 | # run linting and test 16 | flake8 17 | pytest -vv 18 | 19 | Don't run ``python setup.py test`` as this will 20 | not execute the doctests. When all tests pass, create a Pull Request on GitHub. 21 | Please also update ``HISTORY.rst`` with short description of your change. 22 | 23 | To preview the docs locally, install the extra dependencies with 24 | ``pip install -r docs/requirements.txt``, and then cd into ``docs/sphinx``, 25 | run ``make html`` and open ``_build/html/index.html``. 26 | 27 | Please also try not to add derivative work from other projects. If you do, 28 | incorporate proper handling of external licenses in your Pull Request. 29 | -------------------------------------------------------------------------------- /docs/sphinx/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/docs/sphinx/images/favicon.ico -------------------------------------------------------------------------------- /docs/sphinx/images/logo-w600.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/docs/sphinx/images/logo-w600.png -------------------------------------------------------------------------------- /docs/sphinx/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. include:: ../../README.rst 3 | 4 | 5 | Contents 6 | ======== 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | 11 | self 12 | read_write 13 | api 14 | dev 15 | 16 | 17 | 18 | .. Indices and tables 19 | .. ================== 20 | 21 | .. * :ref:`genindex` 22 | .. * :ref:`modindex` 23 | .. 
* :ref:`search` 24 | -------------------------------------------------------------------------------- /docs/sphinx/parallel.rst: -------------------------------------------------------------------------------- 1 | .. _parallel: 2 | 3 | 4 | Parallelization 5 | =============== 6 | 7 | Pysparkling supports parallelizations on the local machine and across clusters 8 | of computers. 9 | 10 | 11 | Processes and Threads 12 | --------------------- 13 | 14 | Single machine parallelization with 15 | ``concurrent.futures.ThreadPoolExecutor``, 16 | ``concurrent.futures.ProcessPoolExecutor`` or 17 | ``multiprocessing.Pool`` is supported. Use ``cloudpickle`` instead of ``pickle`` for 18 | serialization to support lambda functions (and more) for data transformations. 19 | 20 | 21 | .. code-block:: python 22 | 23 | import cloudpickle 24 | import concurrent 25 | import pysparkling 26 | 27 | sc = pysparkling.Context( 28 | pool=concurrent.futures.ProcessPoolExecutor(4), 29 | serializer=cloudpickle.dumps, 30 | deserializer=pickle.loads, 31 | ) 32 | 33 | 34 | 35 | Experimental 36 | ------------ 37 | 38 | The following are experimental notes. Most of them don't even contain examples how to make 39 | use of these techniques with pysparkling. 40 | 41 | ipcluster and IPython.parallel 42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 43 | 44 | Local test setup: 45 | 46 | .. code-block:: bash 47 | 48 | ipcluster start --n=2 49 | 50 | .. code-block:: python 51 | 52 | from IPython.parallel import Client 53 | 54 | c = Client() 55 | print(c[:].map(lambda _: 'hello world', range(2)).get()) 56 | 57 | which should print ``['hello world', 'hello world']``. 58 | 59 | To run on a cluster, create a profile: 60 | 61 | .. code-block:: bash 62 | 63 | ipython profile create --parallel --profile=smallcluster 64 | 65 | # start controller: 66 | # Creates ~/.ipython/profile_smallcluster/security/ipcontroller-engine.json 67 | # which is used by the engines to identify the location of this controller. 68 | # This is the local-only IP address. Substitute with the machines IP 69 | # address so that the engines can find it. 70 | ipcontroller --ip=127.0.0.1 --port=7123 --profile=smallcluster 71 | 72 | # start engines (assuming they have access to the 73 | # ipcontroller-engine.json file) 74 | ipengine --profile=smallcluster 75 | 76 | Test it in Python: 77 | 78 | .. code-block:: python 79 | 80 | from IPython.parallel import Client 81 | 82 | c = Client(profile='smallcluster') 83 | print(c[:].map(lambda _: 'hello world', range(2)).get()) 84 | 85 | If you don't want to start the engines manually, ``ipcluster`` comes with 86 | "Launchers" that can start them for you: 87 | https://ipython.org/ipython-doc/dev/parallel/parallel_process.html#using-ipcluster-in-ssh-mode 88 | 89 | 90 | StarCluster 91 | ~~~~~~~~~~~ 92 | 93 | Setting up StarCluster was an experiment. However it does not integrate well 94 | with the rest of our EC2 infrastructure, so we switched to a Chef based setup 95 | where we use ``ipcluster`` directly. A blocker was that the number of engines 96 | per node is not configurable and we have many map jobs that wait on external 97 | responses. 98 | 99 | Setup 100 | 101 | .. 
code-block:: bash 102 | 103 | # install 104 | pip install starcluster 105 | 106 | # create configuration 107 | starcluster help # choose the option to create a sample config file 108 | 109 | # add your user id, aws_access_key_id and aws_secret_access_key to config 110 | 111 | # create an ssh key (this creates a new key just for starcluster) 112 | # and registers it with AWS 113 | starcluster createkey starclusterkey -o ~/.ssh/starclusterkey.rsa 114 | 115 | # add this key to config: 116 | [key starclusterkey] 117 | KEY_LOCATION=~/.ssh/starclusterkey.rsa 118 | # and use this key in the cluster setup: 119 | KEYNAME = starclusterkey 120 | 121 | # disable the queue, Sun Grid Engine 122 | # (unnecessary for pysparkling and takes time during setup) 123 | DISABLE_QUEUE=True 124 | 125 | # to enable IPython parallel support, uncomment these lines in config: 126 | [plugin ipcluster] 127 | SETUP_CLASS = starcluster.plugins.ipcluster.IPCluster 128 | 129 | # and make sure you have this line inside the cluster section 130 | [cluster smallcluster] 131 | PLUGINS = ipcluster 132 | 133 | # start the cluster 134 | starcluster start smallcluster 135 | 136 | # check it has started 137 | starcluster listclusters 138 | 139 | Currently use: ``ami-da180db2`` (Ubuntu 14.04 with 100GB EBS) on 140 | ``m3.medium`` instances. 141 | 142 | Workarounds: 143 | 144 | .. code-block:: bash 145 | 146 | # this seems to be a dependency that does not get installed 147 | pip install pexpect 148 | 149 | # to validate the ssh host, you need to log in once manually, to add it 150 | # to the list of known hosts 151 | starcluster sshmaster smallcluster 152 | 153 | In Python, you should now be able to run 154 | 155 | .. code-block:: python 156 | 157 | from IPython.parallel import Client 158 | 159 | # the exact command is printed after the cluster started 160 | rc = Client('/Users/sven/.starcluster/ipcluster/SecurityGroup:@sc-smallcluster-us-east-1.json', 161 | sshkey='/Users/sven/.ssh/starclusterkey.rsa', packer='pickle') 162 | 163 | view = rc[:] 164 | results = view.map(lambda x: x**30, range(8)) 165 | print results.get() 166 | 167 | which is also in ``tests/starcluster_simple.py``. 168 | 169 | 170 | Install your own software that is not on pypi: 171 | 172 | .. code-block:: python 173 | 174 | pip install wheel 175 | python setup.py bdist_wheel # add --universal for Python2 and 3 packages 176 | starcluster put smallcluster dist/your_package_name.whl /home/sgeadmin/your_package_name.whl 177 | 178 | # ssh into remote machine 179 | starcluster sshmaster smallcluster 180 | > pip install --upgrade pip 181 | > pip install wheel 182 | > pip2.7 install /home/sgeadmin/your_package_name.whl 183 | 184 | 185 | -------------------------------------------------------------------------------- /docs/sphinx/read_write.rst: -------------------------------------------------------------------------------- 1 | .. _read_write: 2 | 3 | .. currentmodule:: pysparkling 4 | 5 | 6 | Reading and Writing 7 | =================== 8 | 9 | This is a collection of best practices or templates for reading and writing 10 | various input and output formats. 11 | 12 | 13 | Batch 14 | ----- 15 | 16 | Python List 17 | ~~~~~~~~~~~ 18 | 19 | The most direct input and output is from and to a Python list. 20 | 21 | .. 
code-block:: python 22 | 23 | import pysparkling 24 | 25 | sc = pysparkling.Context() 26 | 27 | # reading 28 | rdd = sc.parallelize(['hello', 'world']) 29 | 30 | # back to Python list 31 | print(rdd.collect()) 32 | 33 | # back to an iterator 34 | rdd.toLocalIterator() 35 | 36 | 37 | ND-JSON 38 | ~~~~~~~ 39 | 40 | Newline delimited JSON is a text file where every line is its own JSON string. 41 | 42 | 43 | .. code-block:: python 44 | 45 | import json 46 | import pysparkling 47 | 48 | sc = pysparkling.Context() 49 | 50 | # reading 51 | rdd = ( 52 | sc 53 | .textFile('input.json') 54 | .map(json.loads) 55 | ) 56 | 57 | # writing 58 | ( 59 | rdd 60 | .map(json.dumps) 61 | .saveAsTextFile('output.json') 62 | ) 63 | 64 | 65 | CSV 66 | ~~~ 67 | 68 | .. code-block:: python 69 | 70 | import csv 71 | import io 72 | import pysparkling 73 | 74 | sc = pysparkling.Context() 75 | 76 | # reading 77 | rdd = ( 78 | sc 79 | .textFile('input.csv') 80 | .mapPartitions(csv.reader) 81 | ) 82 | 83 | # writing 84 | def csv_row(data): 85 | s = io.StringIO() 86 | csv.writer(s).writerow(data) 87 | return s.getvalue()[:-1] 88 | 89 | ( 90 | rdd 91 | .map(csv_row) 92 | .saveAsTextFile('output.csv') 93 | ) 94 | 95 | 96 | TensorFlow Records 97 | ~~~~~~~~~~~~~~~~~~ 98 | 99 | This example preprocesses example data into a TensorFlow Records file. The 100 | second part is a cross check and prints the contents of the `tfrecords` file. 101 | 102 | .. code-block:: python 103 | 104 | import pysparkling 105 | import tensorflow as tf 106 | 107 | def to_tfrecord(self, xy): 108 | X, y = xy 109 | example = tf.train.Example(features=tf.train.Features(feature={ 110 | 'X': tf.train.Feature(float_list=tf.train.FloatList(value=X)), 111 | 'y': tf.train.Feature(int64_list=tf.train.Int64List(value=y)), 112 | })) 113 | return example.SerializeToString() 114 | 115 | # example 116 | X = [1.2, 3.1, 8.7] 117 | y = [2, 5] 118 | 119 | # writing 120 | sc = pysparkling.Context() 121 | rdd = ( 122 | sc 123 | .parallelize([(X, y)]) 124 | .map(to_tfrecord) 125 | ) 126 | with tf.python_io.TFRecordWriter('out.tfrecords') as writer: 127 | for example in rdd.toLocalIterator(): 128 | writer.write(example) 129 | 130 | # debugging a tf records file 131 | for serialized_example in tf.python_io.tf_record_iterator('out.tfrecords'): 132 | example = tf.train.Example() 133 | example.ParseFromString(serialized_example) 134 | X = example.features.feature['X'].float_list.value 135 | y = example.features.feature['y'].int64_list.value 136 | print(X, y) 137 | 138 | 139 | Streaming 140 | --------- 141 | 142 | Python List 143 | ~~~~~~~~~~~ 144 | 145 | .. 
code-block:: python 146 | 147 | import pysparkling 148 | 149 | sc = pysparkling.Context() 150 | ssc = pysparkling.streaming.StreamingContext(sc, 1.0) 151 | 152 | ( 153 | ssc 154 | .queueStream([[4], [2], [7]]) 155 | .foreachRDD(lambda rdd: print(rdd.collect())) 156 | ) 157 | 158 | ssc.start() 159 | ssc.awaitTermination(3.5) 160 | 161 | # output: 162 | # [4] 163 | # [2] 164 | # [7] 165 | -------------------------------------------------------------------------------- /docs/sphinx/version_index/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/docs/sphinx/version_index/.nojekyll -------------------------------------------------------------------------------- /docs/sphinx/version_index/CNAME: -------------------------------------------------------------------------------- 1 | pysparkling.trivial.io -------------------------------------------------------------------------------- /docs/sphinx/version_index/circle.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | pre: 3 | - sudo pip install html5validator 4 | test: 5 | override: 6 | - html5validator 7 | -------------------------------------------------------------------------------- /docs/sphinx/version_index/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/docs/sphinx/version_index/favicon.ico -------------------------------------------------------------------------------- /docs/sphinx/version_index/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Databench Docs 10 | 11 | 12 | 13 | 14 | 15 | 30 | 31 | 32 | 35 | 36 | pysparkling logo 37 | 38 |

pysparkling Docs

39 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /logo/banner-w1500.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/banner-w1500.png -------------------------------------------------------------------------------- /logo/banner-w500.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/banner-w500.png -------------------------------------------------------------------------------- /logo/create.py: -------------------------------------------------------------------------------- 1 | """Creates an SVG of the Databench logo. Optionally also a png.""" 2 | 3 | import os 4 | import random 5 | 6 | import svgwrite 7 | 8 | DATA = [ 9 | [0, 1, 1, 1, 1, 1, 1, 1], 10 | [0, 1, 1, 1, 1, 1, 1, 1], 11 | [0, 0, 0, 0, 1, 1, 1, 1], 12 | [0, 0, 0, 1, 1, 1, 1, 1], 13 | [0, 0, 1, 1, 1, 0, 1, 1], 14 | [0, 1, 1, 1, 0, 0, 1, 1], 15 | [1, 1, 1, 0, 0, 0, 1, 1], 16 | [1, 1, 0, 0, 0, 0, 0, 0], 17 | ] 18 | 19 | 20 | def color(x, y): 21 | """triangles. 22 | 23 | Colors: 24 | - http://paletton.com/#uid=70l150klllletuehUpNoMgTsdcs shade 2 25 | """ 26 | 27 | return '#42359C' # "#CDB95B" 28 | 29 | if (x - 4) > (y - 4) and -(y - 4) <= (x - 4): 30 | # right 31 | return '#42359C' # "#CDB95B" 32 | elif (x - 4) > (y - 4) and -(y - 4) > (x - 4): 33 | # top 34 | return "#CD845B" 35 | elif (x - 4) <= (y - 4) and -(y - 4) <= (x - 4): 36 | # bottom 37 | return "#57488E" 38 | elif (x - 4) <= (y - 4) and -(y - 4) > (x - 4): 39 | # left 40 | return "#3B8772" 41 | 42 | # should not happen 43 | return "black" 44 | 45 | 46 | def simple(svg_document, x, y, v): 47 | if v == 1: 48 | svg_document.add(svg_document.rect(insert=(x * 16, y * 16), 49 | size=("16px", "16px"), 50 | # rx="2px", 51 | # stroke_width="1", 52 | # stroke=color(x, y), 53 | fill=color(x, y))) 54 | 55 | 56 | def smaller(svg_document, x, y, v, x_offset=0, y_offset=0): 57 | # from center 58 | distance2 = (x - 3.5) ** 2 + (y - 3.5) ** 2 59 | max_distance2 = 2 * 4 ** 2 60 | 61 | if v == 1: 62 | size = 16.0 * (1.0 - distance2 / max_distance2) 63 | number_of_cubes = int(16 ** 2 / (size ** 2)) 64 | for i in range(number_of_cubes): 65 | xi = x * 16 + 1 + random.random() * (14.0 - size) + x_offset 66 | yi = y * 16 + 1 + random.random() * (14.0 - size) + y_offset 67 | sizepx = str(size) + "px" 68 | svg_document.add(svg_document.rect(insert=(xi, yi), 69 | size=(sizepx, sizepx), 70 | rx="2px", 71 | stroke_width="1", 72 | # stroke='#4E9954', 73 | stroke='#FAE5A5', 74 | # stroke=color(x, y), 75 | fill=color(x, y))) 76 | 77 | 78 | def main(): 79 | svg_favicon = svgwrite.Drawing(filename="favicon.svg", 80 | size=("128px", "128px")) 81 | svg_document = svgwrite.Drawing(filename="logo.svg", 82 | size=("128px", "128px")) 83 | svg_banner = svgwrite.Drawing(filename="banner.svg", 84 | size=("600px", "200px")) 85 | for y, r in enumerate(DATA): 86 | for x, v in enumerate(r): 87 | simple(svg_favicon, x, y, v) 88 | smaller(svg_document, x, y, v) 89 | smaller(svg_banner, x, y, v, x_offset=20, y_offset=40) 90 | # add banner text 91 | g = svg_banner.g(style='font-size:40px; font-family:Arial; font-weight: bold; font-style: italic;') 92 | g.add(svg_banner.text( 93 | 'pysparkling', 94 | insert=(180, 120), fill='#000000'), 95 | ) 96 | svg_banner.add(g) 97 | # 
print(svg_document.tostring()) 98 | svg_favicon.save() 99 | svg_document.save() 100 | svg_banner.save() 101 | 102 | # create pngs 103 | os.system('svg2png --width=100 --height=100 logo.svg logo-w100.png') 104 | os.system('svg2png --width=600 --height=600 logo.svg logo-w600.png') 105 | os.system('svg2png --width=500 --height=100 banner.svg banner-w500.png') 106 | os.system('svg2png --width=1500 --height=400 banner.svg banner-w1500.png') 107 | favicon_sizes = [16, 32, 48, 128, 256] 108 | for s in favicon_sizes: 109 | os.system(f'svg2png --width={s} --height={s} favicon.svg favicon-w{s}.png') 110 | png_favicon_names = [f'favicon-w{s}.png' for s in favicon_sizes] 111 | os.system('convert ' + (' '.join(png_favicon_names)) + 112 | ' -colors 256 favicon.ico') 113 | 114 | 115 | if __name__ == "__main__": 116 | random.seed(42) 117 | main() 118 | -------------------------------------------------------------------------------- /logo/favicon-w128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/favicon-w128.png -------------------------------------------------------------------------------- /logo/favicon-w16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/favicon-w16.png -------------------------------------------------------------------------------- /logo/favicon-w256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/favicon-w256.png -------------------------------------------------------------------------------- /logo/favicon-w32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/favicon-w32.png -------------------------------------------------------------------------------- /logo/favicon-w48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/favicon-w48.png -------------------------------------------------------------------------------- /logo/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/favicon.ico -------------------------------------------------------------------------------- /logo/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /logo/logo-w100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/logo-w100.png -------------------------------------------------------------------------------- /logo/logo-w600.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/logo/logo-w600.png -------------------------------------------------------------------------------- 
/pysparkling/__init__.py: -------------------------------------------------------------------------------- 1 | """pysparkling module""" 2 | # flake8: noqa 3 | 4 | from . import exceptions, fileio, streaming 5 | from .__version__ import __version__ 6 | from .accumulators import Accumulator, AccumulatorParam 7 | from .broadcast import Broadcast 8 | from .cache_manager import CacheManager, TimedCacheManager 9 | from .context import Context 10 | from .rdd import RDD 11 | from .sql.types import Row 12 | from .stat_counter import StatCounter 13 | from .storagelevel import StorageLevel 14 | 15 | __all__ = ['RDD', 'Context', 'Broadcast', 'StatCounter', 'CacheManager', 'Row', 16 | 'TimedCacheManager', 'StorageLevel', 17 | 'exceptions', 'fileio', 'streaming'] 18 | -------------------------------------------------------------------------------- /pysparkling/__version__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_versions 2 | 3 | __version__ = get_versions()['version'] 4 | -------------------------------------------------------------------------------- /pysparkling/accumulators.py: -------------------------------------------------------------------------------- 1 | # A large part of this module is extracted from its PySpark counterpart at 2 | # https://spark.apache.org/docs/1.5.0/api/python/_modules/pyspark/accumulators.html 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | """ 21 | >>> from pysparkling import Context 22 | >>> sc = Context() 23 | >>> a = sc.accumulator(1) 24 | >>> a.value 25 | 1 26 | >>> a.value = 2 27 | >>> a.value 28 | 2 29 | >>> a += 5 30 | >>> a.value 31 | 7 32 | 33 | >>> sc.accumulator(1.0).value 34 | 1.0 35 | 36 | >>> sc.accumulator(1j).value 37 | 1j 38 | 39 | >>> rdd = sc.parallelize([1,2,3]) 40 | >>> def f(x): 41 | ... global a 42 | ... a += x 43 | >>> rdd.foreach(f) 44 | >>> a.value 45 | 13 46 | 47 | >>> b = sc.accumulator(0) 48 | >>> def g(x): 49 | ... b.add(x) 50 | >>> rdd.foreach(g) 51 | >>> b.value 52 | 6 53 | 54 | >>> from pysparkling import AccumulatorParam 55 | >>> class VectorAccumulatorParam(AccumulatorParam): 56 | ... def zero(self, value): 57 | ... return [0.0] * len(value) 58 | ... def addInPlace(self, val1, val2): 59 | ... for i in range(len(val1)): 60 | ... val1[i] += val2[i] 61 | ... return val1 62 | >>> va = sc.accumulator([1.0, 2.0, 3.0], VectorAccumulatorParam()) 63 | >>> va.value 64 | [1.0, 2.0, 3.0] 65 | >>> def g(x): 66 | ... global va 67 | ... va += [x] * 3 68 | >>> rdd.foreach(g) 69 | >>> va.value 70 | [7.0, 8.0, 9.0] 71 | 72 | >>> sc.accumulator([1.0, 2.0, 3.0]) # doctest: +IGNORE_EXCEPTION_DETAIL 73 | Traceback (most recent call last): 74 | ... 
75 | TypeError: No default accumulator param for type 76 | """ 77 | 78 | 79 | __all__ = ['Accumulator', 'AccumulatorParam'] 80 | 81 | 82 | class Accumulator: 83 | """ 84 | A shared variable that can be accumulated, i.e., has a commutative and associative "add" 85 | operation. Tasks can add values to an Accumulator with the ``+=`` operator 86 | 87 | The API supports accumulators for primitive data types like ``int`` and 88 | ``float``, users can also define accumulators for custom types by providing a custom 89 | ``AccumulatorParam`` object. Refer to the doctest of this module for an example. 90 | """ 91 | 92 | def __init__(self, value, accum_param): 93 | """Create a new Accumulator with a given initial value and AccumulatorParam object""" 94 | self.accum_param = accum_param 95 | self._value = value 96 | 97 | @property 98 | def value(self): 99 | return self._value 100 | 101 | @value.setter 102 | def value(self, value): 103 | self._value = value 104 | 105 | def add(self, term): 106 | """Adds a term to this accumulator's value""" 107 | self._value = self.accum_param.addInPlace(self._value, term) 108 | 109 | def __iadd__(self, term): 110 | """The += operator; adds a term to this accumulator's value""" 111 | self.add(term) 112 | return self 113 | 114 | def __str__(self): 115 | return str(self._value) 116 | 117 | def __repr__(self): 118 | return f"Accumulator" 119 | 120 | 121 | class AccumulatorParam: 122 | """ 123 | Helper object that defines how to accumulate values of a given type. 124 | """ 125 | def zero(self, value): 126 | """ 127 | Provide a "zero value" for the type, compatible in dimensions with the 128 | provided ``value`` (e.g., a zero vector) 129 | """ 130 | raise NotImplementedError 131 | 132 | def addInPlace(self, value1, value2): 133 | """ 134 | Add two values of the accumulator's data type, returning a new value; 135 | for efficiency, can also update ``value1`` in place and return it. 136 | """ 137 | raise NotImplementedError 138 | 139 | 140 | class AddingAccumulatorParam(AccumulatorParam): 141 | """ 142 | An AccumulatorParam that uses the + operators to add values. Designed for simple types 143 | such as integers, floats, and lists. Requires the zero value for the underlying type 144 | as a parameter. 145 | """ 146 | def __init__(self, zero_value): 147 | self.zero_value = zero_value 148 | 149 | def zero(self, value): 150 | return self.zero_value 151 | 152 | def addInPlace(self, value1, value2): 153 | value1 += value2 154 | return value1 155 | 156 | 157 | # Singleton accumulator params for some standard types 158 | INT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0) 159 | FLOAT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0) 160 | COMPLEX_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0j) 161 | 162 | 163 | if __name__ == "__main__": 164 | # 165 | # Execute doctests with 166 | # 167 | # $ python -m pysparkling.accumulators -v 168 | # 169 | import doctest 170 | import sys 171 | 172 | failure_count, _ = doctest.testmod() 173 | if failure_count: 174 | sys.exit(-1) 175 | -------------------------------------------------------------------------------- /pysparkling/broadcast.py: -------------------------------------------------------------------------------- 1 | # A large part of this module is extracted from its PySpark counterpart at 2 | # https://spark.apache.org/docs/1.5.0/api/python/_modules/pyspark/broadcast.html 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. 
See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | __all__ = ['Broadcast'] 21 | 22 | 23 | class Broadcast: 24 | """ 25 | A broadcast variable created with ``b = sc.broadcast(0)``. 26 | Access its value through ``b.value``. 27 | 28 | Examples: 29 | 30 | >>> from pysparkling import Context 31 | >>> sc = Context() 32 | >>> b = sc.broadcast([1, 2, 3, 4, 5]) 33 | >>> b.value 34 | [1, 2, 3, 4, 5] 35 | >>> sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect() 36 | [1, 2, 3, 4, 5, 1, 2, 3, 4, 5] 37 | """ 38 | def __init__(self, sc=None, value=None): 39 | self._value = value 40 | 41 | @property 42 | def value(self): 43 | """Returs the broadcasted value.""" 44 | return self._value 45 | 46 | 47 | if __name__ == "__main__": 48 | # 49 | # Execute doctests with 50 | # 51 | # $ python -m pysparkling.accumulators -v 52 | # 53 | import doctest 54 | import sys 55 | 56 | failure_count, _ = doctest.testmod() 57 | if failure_count: 58 | sys.exit(-1) 59 | -------------------------------------------------------------------------------- /pysparkling/exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | class ConnectionException(Exception): 3 | pass 4 | 5 | 6 | class ContextIsLockedException(Exception): 7 | pass 8 | 9 | 10 | class FileAlreadyExistsException(Exception): 11 | pass 12 | 13 | 14 | class FileSystemNotSupported(Exception): 15 | pass 16 | -------------------------------------------------------------------------------- /pysparkling/fileio/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .file import File 4 | from .textfile import TextFile 5 | 6 | # flake8: noqa 7 | 8 | __all__ = ['File', 'TextFile'] 9 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .bz2 import Bz2 4 | from .codec import Codec 5 | from .gz import Gz 6 | from .lzma import Lzma 7 | from .sevenz import SevenZ 8 | from .tar import Tar, TarBz2, TarGz 9 | from .zip import Zip 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | FILE_ENDINGS = [ 14 | (('.tar',), Tar), 15 | (('.tar.gz',), TarGz), 16 | (('.tar.bz2',), TarBz2), 17 | (('.gz',), Gz), 18 | (('.zip',), Zip), 19 | (('.bz2',), Bz2), 20 | (('.lzma', '.xz'), Lzma), 21 | (('.7z',), SevenZ), 22 | ] 23 | 24 | 25 | class NoCodec(Codec): 26 | pass 27 | 28 | 29 | def get_codec(path): 30 | """Find the codec implementation for this path.""" 31 | if '.' 
not in path or path.rfind('/') > path.rfind('.'): 32 | return Codec 33 | 34 | for endings, codec_class in FILE_ENDINGS: 35 | if any(path.endswith(e) for e in endings): 36 | log.debug('Using %s codec: %s', endings, path) 37 | return codec_class 38 | 39 | return NoCodec 40 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/bz2.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import io 3 | import logging 4 | 5 | from .codec import Codec 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | class Bz2(Codec): 11 | """Implementation of :class:`.Codec` for bz2 compression.""" 12 | 13 | def compress(self, stream): 14 | return io.BytesIO(bz2.compress(b''.join(stream))) 15 | 16 | def decompress(self, stream): 17 | return io.BytesIO(bz2.decompress(stream.read())) 18 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/codec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | 5 | 6 | class Codec: 7 | """Codec.""" 8 | def __init__(self): 9 | pass 10 | 11 | def compress(self, stream): 12 | """Compress. 13 | 14 | :param io.BytesIO stream: Uncompressed input stream. 15 | :rtype: io.BytesIO 16 | """ 17 | return stream 18 | 19 | def decompress(self, stream): 20 | """Decompress. 21 | 22 | :param io.BytesIO stream: Compressed input stream. 23 | :rtype: io.BytesIO 24 | """ 25 | return stream 26 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/gz.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | from io import BytesIO 3 | import logging 4 | 5 | from .codec import Codec 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | class Gz(Codec): 11 | """Implementation of :class:`.Codec` for gz compression.""" 12 | 13 | def compress(self, stream): 14 | compressed = BytesIO() 15 | 16 | with gzip.GzipFile(fileobj=compressed, mode='wb') as f: 17 | f.write(stream.read()) 18 | 19 | compressed.seek(0) 20 | return compressed 21 | 22 | def decompress(self, stream): 23 | uncompressed = BytesIO() 24 | 25 | with gzip.GzipFile(fileobj=stream, mode='rb') as f: 26 | uncompressed.write(f.read()) 27 | 28 | uncompressed.seek(0) 29 | return uncompressed 30 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/lzma.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import logging 3 | import lzma 4 | 5 | from .codec import Codec 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | class Lzma(Codec): 11 | """Implementation of :class:`.Codec` for lzma compression. 12 | 13 | Needs Python >= 3.3. 14 | """ 15 | 16 | def __init__(self): 17 | if lzma is None: 18 | log.warning('LZMA codec not supported. It is only supported ' 19 | 'in Python>=3.3. 
Not compressing streams.') 20 | super().__init__() 21 | 22 | def compress(self, stream): 23 | if lzma is None: 24 | return Codec.compress(self, stream) 25 | 26 | return BytesIO(lzma.compress(stream.read())) 27 | 28 | def decompress(self, stream): 29 | if lzma is None: 30 | return Codec.decompress(self, stream) 31 | 32 | return BytesIO(lzma.decompress(stream.read())) 33 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/sevenz.py: -------------------------------------------------------------------------------- 1 | try: 2 | import py7zlib 3 | except ImportError: 4 | py7zlib = None 5 | 6 | from io import BytesIO 7 | import logging 8 | 9 | from .codec import Codec 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class SevenZ(Codec): 15 | """Implementation of :class:`.Codec` for 7z compression. 16 | 17 | Needs the `pylzma` module. 18 | """ 19 | 20 | def __init__(self): 21 | if py7zlib is None: 22 | log.warning('py7zlib could not be imported. To read 7z files, ' 23 | 'install the library with "pip install pylzma".') 24 | super().__init__() 25 | 26 | def compress(self, stream): 27 | log.warning('Writing of 7z compressed archives is not supported.') 28 | return stream 29 | 30 | def decompress(self, stream): 31 | if py7zlib is None: 32 | return Codec.decompress(self, stream) 33 | 34 | uncompressed = BytesIO() 35 | 36 | f = py7zlib.Archive7z(file=stream) 37 | for f_name in f.getnames(): 38 | uncompressed.write(f.getmember(f_name).read()) 39 | 40 | uncompressed.seek(0) 41 | return uncompressed 42 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/tar.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import logging 3 | import tarfile 4 | 5 | from .codec import Codec 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | class Tar(Codec): 11 | """Implementation of :class:`.Codec` for tar compression.""" 12 | 13 | def compress(self, stream): 14 | compressed = BytesIO() 15 | 16 | with tarfile.open(fileobj=compressed, mode='w') as f: 17 | s = stream.read() 18 | 19 | t = tarfile.TarInfo('data') 20 | t.size = len(s) 21 | 22 | f.addfile(t, BytesIO(s)) 23 | 24 | compressed.seek(0) 25 | return compressed 26 | 27 | def decompress(self, stream): 28 | uncompressed = BytesIO() 29 | 30 | with tarfile.open(fileobj=stream, mode='r') as f: 31 | for tar_info in f.getmembers(): 32 | if not tar_info.isfile(): 33 | continue 34 | uncompressed.write(f.extractfile(tar_info).read()) 35 | 36 | uncompressed.seek(0) 37 | return uncompressed 38 | 39 | 40 | class TarGz(Codec): 41 | """Implementation of :class:`.Codec` for .tar.gz compression.""" 42 | 43 | def compress(self, stream): 44 | compressed = BytesIO() 45 | 46 | with tarfile.open(fileobj=compressed, mode='w:gz') as f: 47 | s = stream.read() 48 | 49 | t = tarfile.TarInfo('data') 50 | t.size = len(s) 51 | 52 | f.addfile(t, BytesIO(s)) 53 | 54 | compressed.seek(0) 55 | return compressed 56 | 57 | def decompress(self, stream): 58 | uncompressed = BytesIO() 59 | 60 | with tarfile.open(fileobj=stream, mode='r:gz') as f: 61 | for tar_info in f.getmembers(): 62 | if not tar_info.isfile(): 63 | continue 64 | uncompressed.write(f.extractfile(tar_info).read()) 65 | 66 | uncompressed.seek(0) 67 | return uncompressed 68 | 69 | 70 | class TarBz2(Codec): 71 | """Implementation of :class:`.Codec` for .tar.bz2 compression.""" 72 | 73 | def compress(self, stream): 74 | compressed = BytesIO() 75 | 76 | 
with tarfile.open(fileobj=compressed, mode='w:bz2') as f: 77 | s = stream.read() 78 | 79 | t = tarfile.TarInfo('data') 80 | t.size = len(s) 81 | 82 | f.addfile(t, BytesIO(s)) 83 | 84 | compressed.seek(0) 85 | return compressed 86 | 87 | def decompress(self, stream): 88 | uncompressed = BytesIO() 89 | 90 | with tarfile.open(fileobj=stream, mode='r:bz2') as f: 91 | for tar_info in f.getmembers(): 92 | if not tar_info.isfile(): 93 | continue 94 | uncompressed.write(f.extractfile(tar_info).read()) 95 | 96 | uncompressed.seek(0) 97 | return uncompressed 98 | -------------------------------------------------------------------------------- /pysparkling/fileio/codec/zip.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import logging 3 | import zipfile 4 | 5 | from .codec import Codec 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | class Zip(Codec): 11 | """Implementation of :class:`.Codec` for zip compression.""" 12 | 13 | def compress(self, stream): 14 | compressed = BytesIO() 15 | 16 | with zipfile.ZipFile(file=compressed, mode='w', allowZip64=True) as f: 17 | f.writestr('data', stream.read()) 18 | 19 | compressed.seek(0) 20 | return compressed 21 | 22 | def decompress(self, stream): 23 | uncompressed = BytesIO() 24 | 25 | with zipfile.ZipFile(file=stream, mode='r', allowZip64=True) as f: 26 | for f_name in f.namelist(): 27 | uncompressed.write(f.read(f_name)) 28 | 29 | uncompressed.seek(0) 30 | return uncompressed 31 | -------------------------------------------------------------------------------- /pysparkling/fileio/file.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import logging 3 | 4 | from . import codec, fs 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class File: 10 | """File object. 11 | 12 | :param file_name: Any file name. 13 | """ 14 | 15 | def __init__(self, file_name): 16 | self.file_name = file_name 17 | self.fs = fs.get_fs(file_name)(file_name) 18 | self.codec = codec.get_codec(file_name)() 19 | 20 | @staticmethod 21 | def resolve_filenames(all_expr): 22 | """resolve expression for a filename 23 | 24 | :param all_expr: 25 | A comma separated list of expressions. The expressions can contain 26 | the wildcard characters ``*`` and ``?``. It also resolves Spark 27 | datasets to the paths of the individual partitions 28 | (i.e. ``my_data`` gets resolved to 29 | ``[my_data/part-00000, my_data/part-00001]``). 30 | 31 | :returns: A list of file names. 32 | :rtype: list 33 | """ 34 | files = [] 35 | for expr in all_expr.split(','): 36 | expr = expr.strip() 37 | files += fs.get_fs(expr).resolve_filenames(expr) 38 | log.debug('Filenames: %s', files) 39 | return files 40 | 41 | @classmethod 42 | def get_content(cls, all_expr): 43 | """Return all files matching or in folder matching one of the given expression 44 | 45 | :param all_expr: 46 | A list of expressions. 47 | The expressions can contain the wildcard characters ``*`` and ``?``. 48 | 49 | :returns: A list of file names. 50 | :rtype: list 51 | """ 52 | files = [] 53 | for expr in all_expr: 54 | expr = expr.strip() 55 | files += fs.get_fs(expr).resolve_content(expr) 56 | log.debug('Filenames: %s', files) 57 | return files 58 | 59 | def exists(self): 60 | """Checks both for a file or directory at this location. 61 | 62 | :returns: True or false. 63 | """ 64 | return self.fs.exists() 65 | 66 | def load(self): 67 | """Load the data from a file. 
68 | 69 | :rtype: io.BytesIO 70 | """ 71 | stream = self.fs.load() 72 | stream = self.codec.decompress(stream) 73 | return stream 74 | 75 | def dump(self, stream=None): 76 | """Writes a stream to a file. 77 | 78 | :param stream: 79 | A BytesIO instance. ``bytes`` are also possible and are converted 80 | to BytesIO. 81 | 82 | :rtype: File 83 | """ 84 | if stream is None: 85 | stream = BytesIO() 86 | 87 | if isinstance(stream, bytes): 88 | stream = BytesIO(stream) 89 | 90 | stream = self.codec.compress(stream) 91 | self.fs.dump(stream) 92 | 93 | return self 94 | 95 | def make_public(self, recursive=False): 96 | """Makes the file public. Currently only supported on S3. 97 | 98 | :param recursive: Whether to apply this recursively. 99 | :rtype: File 100 | """ 101 | self.fs.make_public(recursive) 102 | return self 103 | -------------------------------------------------------------------------------- /pysparkling/fileio/fs/__init__.py: -------------------------------------------------------------------------------- 1 | from .file_system import FileSystem 2 | from .gs import GS 3 | from .hdfs import Hdfs 4 | from .http import Http 5 | from .local import Local 6 | from .s3 import S3 7 | 8 | __all__ = ['FileSystem', 'GS', 'Hdfs', 'Http', 'Local', 'S3'] 9 | 10 | 11 | FILE_EXTENSIONS = [ 12 | (('file', ''), Local), 13 | (('s3', 's3n'), S3), 14 | (('gs', 'gcs'), GS), 15 | (('http', 'https'), Http), 16 | (('hdfs',), Hdfs), 17 | ] 18 | 19 | 20 | def get_fs(path): 21 | """Find the file system implementation for this path.""" 22 | scheme = '' 23 | if '://' in path: 24 | scheme = path.partition('://')[0] 25 | 26 | for schemes, fs_class in FILE_EXTENSIONS: 27 | if scheme in schemes: 28 | return fs_class 29 | 30 | return FileSystem 31 | -------------------------------------------------------------------------------- /pysparkling/fileio/fs/file_system.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import typing as t 4 | 5 | log = logging.getLogger(__name__) 6 | 7 | 8 | class FileSystem: 9 | """Interface class for the file system. 10 | 11 | :param str file_name: File name. 12 | """ 13 | def __init__(self, file_name: str): 14 | self.file_name: str = file_name 15 | 16 | @staticmethod 17 | def resolve_filenames(expr: str) -> t.List[str]: 18 | """Resolve the given glob-like expression to filenames. 19 | 20 | :rtype: list 21 | """ 22 | log.error('Cannot resolve: %s', expr) 23 | raise NotImplementedError 24 | 25 | @staticmethod 26 | def resolve_content(expr: str) -> t.List[str]: 27 | """Return all the files matching expr or in a folder matching expr 28 | 29 | :rtype: list 30 | """ 31 | log.error('Cannot resolve: %s', expr) 32 | raise NotImplementedError 33 | 34 | def exists(self) -> bool: 35 | """Check whether the given file_name exists. 36 | 37 | :rtype: bool 38 | """ 39 | log.warning('Could not determine whether %s exists due to unhandled scheme.', self.file_name) 40 | raise NotImplementedError 41 | 42 | def load(self) -> io.BytesIO: 43 | """Load a file to a stream.""" 44 | log.error('Cannot load: %s', self.file_name) 45 | raise NotImplementedError 46 | 47 | def load_text(self, encoding: str = 'utf8', encoding_errors: str = 'ignore') -> io.StringIO: 48 | """Load a file to a stream. 49 | 50 | :param str encoding: Text encoding. 51 | :param str encoding_errors: How to handle encoding errors.
52 | """ 53 | log.error('Cannot load: %s', self.file_name) 54 | raise NotImplementedError 55 | 56 | def dump(self, stream: io.BytesIO): 57 | """Dump a stream to a file. 58 | 59 | :param io.BytesIO stream: Input tream. 60 | """ 61 | log.error('Cannot dump: %s', self.file_name) 62 | raise NotImplementedError 63 | 64 | def make_public(self, recursive=False): 65 | """Make the file public (only on some file systems). 66 | 67 | :param bool recursive: Recurse. 68 | :rtype: FileSystem 69 | """ 70 | log.warning('Cannot make %s public.', self.file_name) 71 | raise NotImplementedError 72 | -------------------------------------------------------------------------------- /pysparkling/fileio/fs/gs.py: -------------------------------------------------------------------------------- 1 | from fnmatch import fnmatch 2 | from io import BytesIO, StringIO 3 | import logging 4 | 5 | from ...exceptions import FileSystemNotSupported 6 | from ...utils import parse_file_uri, Tokenizer 7 | from .file_system import FileSystem 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | try: 12 | from gcloud import storage 13 | except ImportError: 14 | storage = None 15 | 16 | 17 | class GS(FileSystem): 18 | """:class:`.FileSystem` implementation for Google Storage. 19 | 20 | Paths are of the form `gs://bucket_name/file_path` or 21 | `gs://project_name:bucket_name/file_path`. 22 | """ 23 | 24 | #: Set a default project name. 25 | project_name = None 26 | 27 | #: Default mime type. 28 | mime_type = 'text/plain' 29 | 30 | _clients = {} 31 | 32 | def __init__(self, file_name): 33 | if storage is None: 34 | raise FileSystemNotSupported( 35 | 'Google Storage is not supported. Install "gcloud".' 36 | ) 37 | 38 | super().__init__(file_name) 39 | 40 | # obtain key 41 | t = Tokenizer(self.file_name) 42 | t.get_next('://') # skip scheme 43 | bucket_name = t.get_next('/') 44 | if ':' in bucket_name: 45 | project_name, _, bucket_name = bucket_name.partition(':') 46 | else: 47 | project_name = GS.project_name 48 | blob_name = t.get_next() 49 | 50 | client = GS._get_client(project_name) 51 | bucket = client.get_bucket(bucket_name) 52 | self.blob = bucket.get_blob(blob_name) 53 | if not self.blob: 54 | self.blob = bucket.blob(blob_name) 55 | 56 | @staticmethod 57 | def _get_client(project_name): 58 | if project_name not in GS._clients: 59 | if storage is None: 60 | raise FileSystemNotSupported( 61 | 'Google Storage is not supported. Install "gcloud".' 
62 | ) 63 | GS._clients[project_name] = storage.Client(project_name) 64 | return GS._clients[project_name] 65 | 66 | @staticmethod 67 | def resolve_filenames(expr): 68 | files = [] 69 | 70 | t = Tokenizer(expr) 71 | scheme = t.get_next('://') 72 | bucket_name = t.get_next('/') 73 | if ':' in bucket_name: 74 | project_name, _, bucket_name = bucket_name.partition(':') 75 | else: 76 | project_name = GS.project_name 77 | prefix = t.get_next(['*', '?']) 78 | 79 | bucket = GS._get_client(project_name).get_bucket(bucket_name) 80 | expr_s = len(scheme) + 3 + len(project_name) + 1 + len(bucket_name) + 1 81 | expr = expr[expr_s:] 82 | for k in bucket.list_blobs(prefix=prefix): 83 | if fnmatch(k.name, expr) or fnmatch(k.name, expr + '/part*'): 84 | files.append(f'{scheme}://{project_name}:{bucket_name}/{k.name}') 85 | return files 86 | 87 | @staticmethod 88 | def resolve_content(expr): 89 | scheme, raw_bucket_name, folder_path, pattern = parse_file_uri(expr) 90 | 91 | if ':' in raw_bucket_name: 92 | project_name, _, bucket_name = raw_bucket_name.partition(':') 93 | else: 94 | project_name = GS.project_name 95 | bucket_name = raw_bucket_name 96 | 97 | folder_path = folder_path[1:] # Remove leading slash 98 | 99 | expr = f"{folder_path}{pattern}" 100 | # Match all files inside folders that match expr 101 | pattern_expr = f"{expr}{'' if expr.endswith('/') else '/'}*" 102 | 103 | bucket = GS._get_client(project_name).get_bucket(bucket_name) 104 | 105 | files = [] 106 | for k in bucket.list_blobs(prefix=folder_path): 107 | if not k.name.endswith("/") and ( 108 | fnmatch(k.name, expr) or fnmatch(k.name, pattern_expr) 109 | ): 110 | files.append( 111 | f'{scheme}://{raw_bucket_name}/{k.name}' 112 | ) 113 | return files 114 | 115 | def exists(self): 116 | t = Tokenizer(self.file_name) 117 | t.get_next('//') # skip scheme 118 | bucket_name = t.get_next('/') 119 | if ':' in bucket_name: 120 | project_name, _, bucket_name = bucket_name.partition(':') 121 | else: 122 | project_name = GS.project_name 123 | blob_name = t.get_next() 124 | bucket = GS._get_client(project_name).get_bucket(bucket_name) 125 | return (bucket.get_blob(blob_name) 126 | or list(bucket.list_blobs(prefix=f'{blob_name}/'))) 127 | 128 | def load(self): 129 | log.debug('Loading %s with size %s.', self.blob.name, self.blob.size) 130 | return BytesIO(self.blob.download_as_string()) 131 | 132 | def load_text(self, encoding='utf8', encoding_errors='ignore'): 133 | log.debug('Loading %s with size %s.', self.blob.name, self.blob.size) 134 | return StringIO( 135 | self.blob.download_as_string().decode( 136 | encoding, encoding_errors 137 | ) 138 | ) 139 | 140 | def dump(self, stream): 141 | log.debug('Dumping to %s.', self.blob.name) 142 | self.blob.upload_from_string(stream.read(), 143 | content_type=self.mime_type) 144 | return self 145 | 146 | def make_public(self, recursive=False): 147 | self.blob.make_public(recursive) 148 | return self 149 | -------------------------------------------------------------------------------- /pysparkling/fileio/fs/hdfs.py: -------------------------------------------------------------------------------- 1 | from fnmatch import fnmatch 2 | from io import BytesIO, StringIO 3 | import logging 4 | 5 | from ...exceptions import FileSystemNotSupported 6 | from ...utils import format_file_uri, parse_file_uri 7 | from .file_system import FileSystem 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | try: 12 | import hdfs 13 | except ImportError: 14 | hdfs = None 15 | 16 | 17 | class Hdfs(FileSystem): 18 | 
""":class:`.FileSystem` implementation for HDFS.""" 19 | 20 | _conn = {} 21 | 22 | def __init__(self, file_name): 23 | if hdfs is None: 24 | raise FileSystemNotSupported( 25 | 'hdfs not supported. Install the python package "hdfs".' 26 | ) 27 | 28 | super().__init__(file_name) 29 | 30 | @staticmethod 31 | def client_and_path(path): 32 | _, domain, folder_path, file_pattern = parse_file_uri(path) 33 | 34 | if ':' not in domain: 35 | port = 50070 36 | else: 37 | domain, port = domain.split(':') 38 | port = int(port) 39 | cache_id = domain + '__' + str(port) 40 | 41 | if cache_id not in Hdfs._conn: 42 | if hdfs is None: 43 | raise FileSystemNotSupported( 44 | 'hdfs not supported. Install the python package "hdfs".' 45 | ) 46 | Hdfs._conn[cache_id] = hdfs.InsecureClient( # pylint: disable=no-member 47 | f'http://{domain}:{port}' 48 | ) 49 | return Hdfs._conn[cache_id], folder_path + file_pattern 50 | 51 | def exists(self): 52 | c, p = Hdfs.client_and_path(self.file_name) 53 | try: 54 | c.status(p) 55 | except hdfs.util.HdfsError: # pylint: disable=no-member 56 | return False 57 | return True 58 | 59 | @staticmethod 60 | def resolve_filenames(expr): 61 | c, _ = Hdfs.client_and_path(expr) 62 | 63 | scheme, domain, folder_path, _ = parse_file_uri(expr) 64 | 65 | files = [] 66 | for fn, file_status in c.list(folder_path, status=True): 67 | file_local_path = f'{folder_path}{fn}' 68 | file_path = format_file_uri(scheme, domain, file_local_path) 69 | part_file_expr = expr + ("" if expr.endswith("/") else "/") + 'part*' 70 | 71 | if fnmatch(file_path, expr): 72 | if file_status["type"] != "DIRECTORY": 73 | files.append(file_path) 74 | else: 75 | files += Hdfs._get_folder_part_files( 76 | c, 77 | scheme, 78 | domain, 79 | file_local_path, 80 | part_file_expr 81 | ) 82 | elif fnmatch(file_path, part_file_expr): 83 | files.append(file_path) 84 | return files 85 | 86 | @staticmethod 87 | def _get_folder_part_files(c, scheme, domain, folder_local_path, expr_with_part): 88 | files = [] 89 | for fn, file_status in c.list(folder_local_path, status=True): 90 | sub_file_path = format_file_uri(scheme, domain, folder_local_path, fn) 91 | if fnmatch(sub_file_path, expr_with_part) and file_status["type"] != "DIRECTORY": 92 | files.append(sub_file_path) 93 | return files 94 | 95 | @classmethod 96 | def _get_folder_files_by_expr(cls, c, scheme, domain, folder_path, expr=None): 97 | """ 98 | Using client c, retrieves all files located in the folder `folder_path` that matches `expr` 99 | 100 | :param c: An HDFS client 101 | :param scheme: a scheme such as hdfs 102 | :param domain: a DFS web server 103 | :param folder_path: a folder path without patterns 104 | :param expr: a pattern 105 | 106 | :return: list of matching files absolute paths prefixed with the scheme and domain 107 | """ 108 | file_paths = [] 109 | for fn, file_status in c.list(folder_path, status=True): 110 | file_local_path = f'{folder_path}{fn}' 111 | if expr is None or fnmatch(file_local_path, expr): 112 | if file_status["type"] == "DIRECTORY": 113 | file_paths += cls._get_folder_files_by_expr( 114 | c, 115 | scheme, 116 | domain, 117 | file_local_path + "/", 118 | expr=None 119 | ) 120 | else: 121 | file_path = format_file_uri(scheme, domain, file_local_path) 122 | file_paths.append(file_path) 123 | elif file_status["type"] == "DIRECTORY": 124 | file_paths += cls._get_folder_files_by_expr( 125 | c, scheme, domain, file_local_path + "/", expr 126 | ) 127 | return file_paths 128 | 129 | @classmethod 130 | def resolve_content(cls, expr): 131 | c, _ = 
cls.client_and_path(expr) 132 | 133 | scheme, domain, folder_path, pattern = parse_file_uri(expr) 134 | 135 | expr = folder_path + pattern 136 | 137 | return cls._get_folder_files_by_expr(c, scheme, domain, folder_path, expr) 138 | 139 | def load(self): 140 | log.debug('Hdfs read for %s.', self.file_name) 141 | c, path = Hdfs.client_and_path(self.file_name) 142 | 143 | with c.read(path) as reader: 144 | r = BytesIO(reader.read()) 145 | 146 | return r 147 | 148 | def load_text(self, encoding='utf8', encoding_errors='ignore'): 149 | log.debug('Hdfs text read for %s.', self.file_name) 150 | c, path = Hdfs.client_and_path(self.file_name) 151 | 152 | with c.read(path) as reader: 153 | r = StringIO(reader.read().decode(encoding, encoding_errors)) 154 | 155 | return r 156 | 157 | def dump(self, stream): 158 | log.debug('Dump to %s with hdfs write.', self.file_name) 159 | c, path = Hdfs.client_and_path(self.file_name) 160 | c.write(path, stream) 161 | return self 162 | -------------------------------------------------------------------------------- /pysparkling/fileio/fs/http.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO, StringIO 2 | import logging 3 | 4 | from ...exceptions import ConnectionException, FileSystemNotSupported 5 | from .file_system import FileSystem 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | try: 10 | import requests 11 | except ImportError: 12 | requests = None 13 | 14 | 15 | class Http(FileSystem): 16 | """:class:`.FileSystem` implementation for HTTP.""" 17 | 18 | def __init__(self, file_name): 19 | if requests is None: 20 | raise FileSystemNotSupported( 21 | 'http not supported. Install "requests".' 22 | ) 23 | 24 | super().__init__(file_name) 25 | self.headers = None 26 | 27 | @staticmethod 28 | def resolve_filenames(expr): 29 | if Http(expr).exists(): 30 | return [expr] 31 | return [] 32 | 33 | def exists(self): 34 | r = requests.head(self.file_name, allow_redirects=True) 35 | return r.status_code == 200 36 | 37 | def load(self): 38 | log.debug('Http GET request for %s.', self.file_name) 39 | r = requests.get(self.file_name, headers=self.headers) 40 | if r.status_code != 200: 41 | raise ConnectionException() 42 | return BytesIO(r.content) 43 | 44 | def load_text(self, encoding='utf8', encoding_errors='ignore'): 45 | # warning: encoding and encoding_errors are ignored 46 | log.debug('Http GET request for %s.', self.file_name) 47 | r = requests.get(self.file_name, headers=self.headers) 48 | if r.status_code != 200: 49 | raise ConnectionException() 50 | return StringIO(r.text) 51 | 52 | def dump(self, stream): 53 | log.debug('Dump to %s with http PUT.', self.file_name) 54 | requests.put(self.file_name, data=b''.join(stream)) 55 | return self 56 | -------------------------------------------------------------------------------- /pysparkling/fileio/fs/local.py: -------------------------------------------------------------------------------- 1 | from fnmatch import fnmatch 2 | import glob 3 | import io 4 | import logging 5 | import os 6 | 7 | from ...utils import Tokenizer 8 | from .file_system import FileSystem 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class Local(FileSystem): 14 | """:class:`.FileSystem` implementation for the local file system.""" 15 | 16 | @staticmethod 17 | def resolve_filenames(expr: str): 18 | if expr.startswith('file://'): 19 | expr = expr[7:] 20 | 21 | if os.path.isfile(expr): 22 | return [expr] 23 | 24 | os_sep = [os.path.sep] 25 | if os.path.altsep: 26 | 
os_sep.append(os.path.altsep) 27 | 28 | if not any(sep in expr for sep in os_sep): 29 | expr = '.' + os.path.sep + expr 30 | 31 | t = Tokenizer(expr) 32 | prefix = t.get_next(['*', '?']) 33 | 34 | if not any(prefix.endswith(sep) for sep in os_sep) and any(sep in prefix for sep in os_sep): 35 | prefix = os.path.dirname(prefix) 36 | 37 | files = [] 38 | for root, _, filenames in os.walk(prefix): 39 | for filename in filenames: 40 | path = os.path.join(root, filename) 41 | if fnmatch(path, expr) or fnmatch(path, expr + '/part*'): 42 | files.append(path) 43 | return files 44 | 45 | @staticmethod 46 | def resolve_content(expr): 47 | if expr.startswith('file://'): 48 | expr = expr[7:] 49 | matches = glob.glob(expr) 50 | file_paths = [] 51 | for match in matches: 52 | if os.path.isfile(match): 53 | file_paths.append(match) 54 | else: 55 | file_paths += [ 56 | os.path.join(root, f) 57 | for root, _, files in os.walk(match) 58 | for f in files 59 | if not f.startswith(("_", ".")) 60 | ] 61 | return file_paths 62 | 63 | @property 64 | def file_path(self): 65 | if self.file_name.startswith('file://'): 66 | return self.file_name[7:] 67 | return self.file_name 68 | 69 | def exists(self): 70 | return os.path.exists(self.file_path) 71 | 72 | def load(self): 73 | with io.open(self.file_path, 'rb') as f: 74 | return io.BytesIO(f.read()) 75 | 76 | def load_text(self, encoding='utf8', encoding_errors='ignore'): 77 | with io.open(self.file_path, 'r', 78 | encoding=encoding, errors=encoding_errors) as f: 79 | return io.StringIO(f.read()) 80 | 81 | def dump(self, stream): 82 | file_path = self.file_path # caching 83 | 84 | # making sure directory exists 85 | dirname = os.path.dirname(file_path) 86 | if dirname and not os.path.exists(dirname): 87 | log.debug('creating local directory %s', dirname) 88 | os.makedirs(dirname) 89 | 90 | log.debug('writing file %s', file_path) 91 | with io.open(file_path, 'wb') as f: 92 | for c in stream: 93 | f.write(c) 94 | return self 95 | -------------------------------------------------------------------------------- /pysparkling/fileio/fs/s3.py: -------------------------------------------------------------------------------- 1 | from fnmatch import fnmatch 2 | from io import BytesIO, StringIO 3 | import logging 4 | 5 | from ...exceptions import FileSystemNotSupported 6 | from ...utils import parse_file_uri, Tokenizer 7 | from .file_system import FileSystem 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | try: 12 | import boto 13 | except ImportError: 14 | boto = None 15 | 16 | 17 | class S3(FileSystem): 18 | """:class:`.FileSystem` implementation for S3. 19 | 20 | Use environment variables ``AWS_SECRET_ACCESS_KEY`` and 21 | ``AWS_ACCESS_KEY_ID`` for auth and use file paths of the form 22 | ``s3://bucket_name/filename.txt``. 23 | """ 24 | 25 | #: Keyword arguments for new connections. 26 | #: Example: set to `{'anon': True}` for anonymous connections. 27 | connection_kwargs = {} 28 | 29 | _conn = None 30 | 31 | def __init__(self, file_name): 32 | if boto is None: 33 | raise FileSystemNotSupported('S3 not supported. 
Install "boto".') 34 | 35 | super().__init__(file_name) 36 | 37 | # obtain key 38 | t = Tokenizer(self.file_name) 39 | t.get_next('://') # skip scheme 40 | bucket_name = t.get_next('/') 41 | key_name = t.get_next() 42 | conn = self._get_conn() 43 | bucket = conn.get_bucket(bucket_name, validate=False) 44 | self.key = bucket.get_key(key_name) 45 | if not self.key: 46 | self.key = bucket.new_key(key_name) 47 | 48 | @classmethod 49 | def _get_conn(cls): 50 | if not cls._conn: 51 | if boto is None: 52 | raise FileSystemNotSupported('S3 not supported. Install "boto".') 53 | cls._conn = boto.connect_s3(**cls.connection_kwargs) 54 | return cls._conn 55 | 56 | @classmethod 57 | def resolve_filenames(cls, expr): 58 | files = [] 59 | 60 | t = Tokenizer(expr) 61 | scheme = t.get_next('://') 62 | bucket_name = t.get_next('/') 63 | prefix = t.get_next(['*', '?']) 64 | 65 | bucket = cls._get_conn().get_bucket( 66 | bucket_name, 67 | validate=False 68 | ) 69 | expr = expr[len(scheme) + 3 + len(bucket_name) + 1:] 70 | for k in bucket.list(prefix=prefix): 71 | if fnmatch(k.name, expr) or fnmatch(k.name, expr + '/part*'): 72 | files.append(f'{scheme}://{bucket_name}/{k.name}') 73 | return files 74 | 75 | @classmethod 76 | def resolve_content(cls, expr): 77 | scheme, bucket_name, folder_path, pattern = parse_file_uri(expr) 78 | 79 | folder_path = folder_path[1:] # Remove leading slash 80 | 81 | expr = f"{folder_path}{pattern}" 82 | # Match all files inside folders that match expr 83 | pattern_expr = f"{expr}{'' if expr.endswith('/') else '/'}*" 84 | 85 | bucket = cls._get_conn().get_bucket( 86 | bucket_name, 87 | validate=False 88 | ) 89 | files = [] 90 | for k in bucket.list(prefix=folder_path): 91 | if fnmatch(k.name, expr) or fnmatch(k.name, pattern_expr): 92 | files.append(f'{scheme}://{bucket_name}/{k.name}') 93 | return files 94 | 95 | def exists(self): 96 | t = Tokenizer(self.file_name) 97 | t.get_next('//') # skip scheme 98 | bucket_name = t.get_next('/') 99 | key_name = t.get_next() 100 | conn = self._get_conn() 101 | bucket = conn.get_bucket(bucket_name, validate=False) 102 | return (bucket.get_key(key_name) 103 | or bucket.list(prefix=f'{key_name}/')) 104 | 105 | def load(self): 106 | log.debug('Loading %s with size %s.', self.key.name, self.key.size) 107 | return BytesIO(self.key.get_contents_as_string()) 108 | 109 | def load_text(self, encoding='utf8', encoding_errors='ignore'): 110 | log.debug('Loading %s with size %s.', self.key.name, self.key.size) 111 | return StringIO( 112 | self.key.get_contents_as_string().decode(encoding, encoding_errors) 113 | ) 114 | 115 | def dump(self, stream): 116 | log.debug('Dumping to %s.', self.key.name) 117 | self.key.set_contents_from_file(stream) 118 | return self 119 | 120 | def make_public(self, recursive=False): 121 | self.key.make_public(recursive) 122 | return self 123 | -------------------------------------------------------------------------------- /pysparkling/fileio/textfile.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO, StringIO, TextIOWrapper 2 | import logging 3 | 4 | from . import codec 5 | from .file import File 6 | from .fs.file_system import FileSystem 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class TextFile(File): 12 | """Derived from :class:`File`. 13 | 14 | :param file_name: Any text file name. 15 | """ 16 | 17 | def load(self, encoding='utf8', encoding_errors='ignore'): # pylint: disable=arguments-differ 18 | """Load the data from a file. 
19 | 20 | :param str encoding: The character encoding of the file. 21 | :param str encoding_errors: How to handle encoding errors. 22 | :rtype: io.StringIO 23 | """ 24 | # pylint: disable=comparison-with-callable 25 | if isinstance(self.codec, codec.NoCodec) and \ 26 | self.fs.load_text != FileSystem.load_text: 27 | stream = self.fs.load_text(encoding, encoding_errors) 28 | else: 29 | stream = self.fs.load() 30 | stream = self.codec.decompress(stream) 31 | stream = TextIOWrapper(stream, encoding, encoding_errors) 32 | return stream 33 | 34 | def dump(self, stream=None, encoding='utf8', encoding_errors='ignore'): # pylint: disable=arguments-differ 35 | """Writes a stream to a file. 36 | 37 | :param stream: 38 | An ``io.StringIO`` instance. A ``str`` is also possible and 39 | get converted to ``io.StringIO``. 40 | 41 | :param encoding: (optional) 42 | The character encoding of the file. 43 | 44 | :rtype: TextFile 45 | """ 46 | if stream is None: 47 | stream = StringIO() 48 | 49 | if isinstance(stream, str): 50 | stream = StringIO(stream) 51 | 52 | stream = self.codec.compress( 53 | BytesIO(stream.read().encode(encoding, encoding_errors)) 54 | ) 55 | self.fs.dump(stream) 56 | 57 | return self 58 | -------------------------------------------------------------------------------- /pysparkling/partition.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | 5 | 6 | class Partition: 7 | def __init__(self, x, idx=None): 8 | self.index = idx 9 | self._x = list(x) 10 | 11 | def x(self): 12 | return self._x 13 | 14 | def hashCode(self): 15 | return self.index 16 | 17 | def __getstate__(self): 18 | return { 19 | 'index': self.index, 20 | '_x': self.x(), 21 | } 22 | -------------------------------------------------------------------------------- /pysparkling/samplers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | try: 5 | import numpy 6 | except ImportError: 7 | numpy = None 8 | 9 | 10 | def pysparkling_poisson(lambda_): 11 | if lambda_ == 0.0: 12 | return 0 13 | 14 | n = 0 15 | exp_neg_lambda = math.exp(-lambda_) 16 | prod = 1.0 17 | while True: 18 | prod *= random.random() 19 | if prod > exp_neg_lambda: 20 | n += 1 21 | else: 22 | return n 23 | 24 | 25 | def poisson(lambda_): 26 | if numpy is not None: 27 | return numpy.random.poisson(lambda_) 28 | return pysparkling_poisson(lambda_) 29 | 30 | 31 | class BernoulliSampler: 32 | def __init__(self, expectation): 33 | self.expectation = expectation 34 | 35 | def __call__(self, sample): 36 | return 1 if random.random() < self.expectation else 0 37 | 38 | 39 | class PoissonSampler: 40 | def __init__(self, expectation): 41 | self.expectation = expectation 42 | 43 | def __call__(self, sample): 44 | return poisson(self.expectation) 45 | 46 | 47 | class BernoulliSamplerPerKey: 48 | def __init__(self, expectations): 49 | self.expectations = expectations 50 | 51 | def __call__(self, sample): 52 | key = sample[0] 53 | return 1 if random.random() < self.expectations.get(key, 0.0) else 0 54 | 55 | 56 | class PoissonSamplerPerKey: 57 | def __init__(self, expectations): 58 | self.expectations = expectations 59 | 60 | def __call__(self, sample): 61 | key = sample[0] 62 | return poisson(self.expectations.get(key, 0.0)) 63 | -------------------------------------------------------------------------------- /pysparkling/sql/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/sql/__init__.py -------------------------------------------------------------------------------- /pysparkling/sql/conf.py: -------------------------------------------------------------------------------- 1 | _sentinel = object() 2 | 3 | 4 | class RuntimeConfig: 5 | def __init__(self, jconf=None): 6 | self._conf = {} 7 | 8 | def set(self, key, value): 9 | self._conf[key] = value 10 | 11 | def get(self, key, default=_sentinel): 12 | self._checkType(key, "key") 13 | if default is _sentinel: 14 | return self._conf.get(key) 15 | if default is not None: 16 | self._checkType(default, "default") 17 | return self._conf.get(key, default) 18 | 19 | def unset(self, key): 20 | del self._conf[key] 21 | 22 | def _checkType(self, obj, identifier): 23 | if not isinstance(obj, str): 24 | raise TypeError(f"expected {identifier} '{obj}' to be a string (was '{type(obj).__name__}')") 25 | 26 | def isModifiable(self, key): 27 | raise NotImplementedError("pysparkling does not support yet this feature") 28 | -------------------------------------------------------------------------------- /pysparkling/sql/context.py: -------------------------------------------------------------------------------- 1 | from .session import SparkSession 2 | 3 | 4 | class SQLContext: 5 | _instantiatedContext = None 6 | 7 | def __init__(self, sparkContext, sparkSession=None, jsqlContext=None): 8 | self._sc = sparkContext 9 | if sparkSession is None: 10 | sparkSession = SparkSession.builder.getOrCreate() 11 | self.sparkSession = sparkSession 12 | if SQLContext._instantiatedContext is None: 13 | SQLContext._instantiatedContext = self 14 | 15 | @classmethod 16 | def getOrCreate(cls, sc): 17 | """ 18 | Get the existing SQLContext or create a new one with given SparkContext. 19 | 20 | :param sc: SparkContext 21 | """ 22 | if cls._instantiatedContext is None: 23 | cls(sc, SparkSession(sc), None) 24 | return cls._instantiatedContext 25 | 26 | def newSession(self): 27 | """ 28 | Returns a new SQLContext as new session, that has separate SQLConf, 29 | registered temporary views and UDFs, but shared SparkContext and 30 | table cache. 31 | """ 32 | return self.__class__(self._sc, self.sparkSession.newSession()) 33 | 34 | def setConf(self, key, value): 35 | """Sets the given Spark SQL configuration property. 
36 | """ 37 | self.sparkSession.conf.set(key, value) 38 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/sql/expressions/__init__.py -------------------------------------------------------------------------------- /pysparkling/sql/expressions/aggregate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/sql/expressions/aggregate/__init__.py -------------------------------------------------------------------------------- /pysparkling/sql/expressions/aggregate/aggregations.py: -------------------------------------------------------------------------------- 1 | from ..expressions import Expression 2 | 3 | 4 | class Aggregation(Expression): 5 | @property 6 | def is_an_aggregation(self): 7 | return True 8 | 9 | def merge(self, row, schema): 10 | raise NotImplementedError 11 | 12 | def mergeStats(self, other, schema): 13 | raise NotImplementedError 14 | 15 | def eval(self, row, schema): 16 | raise NotImplementedError 17 | 18 | def args(self): 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/aggregate/collectors.py: -------------------------------------------------------------------------------- 1 | from .aggregations import Aggregation 2 | 3 | 4 | class CollectList(Aggregation): 5 | pretty_name = "collect_list" 6 | 7 | def __init__(self, column): 8 | super().__init__(column) 9 | self.column = column 10 | self.items = [] 11 | 12 | def merge(self, row, schema): 13 | self.items.append(self.column.eval(row, schema)) 14 | 15 | def mergeStats(self, other, schema): 16 | self.items += other.items 17 | 18 | def eval(self, row, schema): 19 | return self.items 20 | 21 | def args(self): 22 | return (self.column,) 23 | 24 | 25 | class CollectSet(Aggregation): 26 | pretty_name = "collect_set" 27 | 28 | def __init__(self, column): 29 | super().__init__(column) 30 | self.column = column 31 | self.items = set() 32 | 33 | def merge(self, row, schema): 34 | self.items.add(self.column.eval(row, schema)) 35 | 36 | def mergeStats(self, other, schema): 37 | self.items |= other.items 38 | 39 | def eval(self, row, schema): 40 | return list(self.items) 41 | 42 | def args(self): 43 | return (self.column,) 44 | 45 | 46 | class SumDistinct(Aggregation): 47 | pretty_name = "sum_distinct" 48 | 49 | def __init__(self, column): 50 | super().__init__(column) 51 | self.column = column 52 | self.items = set() 53 | 54 | def merge(self, row, schema): 55 | self.items.add(self.column.eval(row, schema)) 56 | 57 | def mergeStats(self, other, schema): 58 | self.items |= other.items 59 | 60 | def eval(self, row, schema): 61 | return sum(self.items) 62 | 63 | def args(self): 64 | return (self.column,) 65 | 66 | 67 | class First(Aggregation): 68 | pretty_name = "first" 69 | _sentinel = object() 70 | 71 | def __init__(self, column, ignore_nulls): 72 | super().__init__(column) 73 | self.column = column 74 | self.value = self._sentinel 75 | self.ignore_nulls = ignore_nulls.get_literal_value() 76 | 77 | def merge(self, row, schema): 78 | if self.value is First._sentinel or (self.ignore_nulls and self.value is None): 79 | self.value = 
self.column.eval(row, schema) 80 | 81 | def mergeStats(self, other, schema): 82 | if self.value is First._sentinel or (self.ignore_nulls and self.value is None): 83 | self.value = other.value 84 | 85 | def eval(self, row, schema): 86 | return self.value if self.value is not First._sentinel else None 87 | 88 | def args(self): 89 | return ( 90 | self.column, 91 | str(self.ignore_nulls).lower() 92 | ) 93 | 94 | 95 | class Last(Aggregation): 96 | pretty_name = "last" 97 | _sentinel = object() 98 | 99 | def __init__(self, column, ignore_nulls): 100 | super().__init__(column) 101 | self.column = column 102 | self.value = None 103 | self.ignore_nulls = ignore_nulls.get_literal_value() 104 | 105 | def merge(self, row, schema): 106 | new_value = self.column.eval(row, schema) 107 | if not (self.ignore_nulls and new_value is None): 108 | self.value = new_value 109 | 110 | def mergeStats(self, other, schema): 111 | if not (self.ignore_nulls and other.value is None): 112 | self.value = other.value 113 | 114 | def eval(self, row, schema): 115 | return self.value 116 | 117 | def args(self): 118 | return ( 119 | self.column, 120 | str(self.ignore_nulls).lower() 121 | ) 122 | 123 | 124 | class CountDistinct(Aggregation): 125 | pretty_name = "count" 126 | 127 | def __init__(self, columns): 128 | super().__init__(columns) 129 | self.columns = columns 130 | self.items = set() 131 | 132 | def merge(self, row, schema): 133 | self.items.add(tuple( 134 | col.eval(row, schema) for col in self.columns 135 | )) 136 | 137 | def mergeStats(self, other, schema): 138 | self.items |= other.items 139 | 140 | def eval(self, row, schema): 141 | return len(self.items) 142 | 143 | def args(self): 144 | return f"DISTINCT {','.join(str(col) for col in self.columns)}" 145 | 146 | 147 | class ApproxCountDistinct(Aggregation): 148 | pretty_name = "approx_count_distinct" 149 | 150 | def __init__(self, column): 151 | super().__init__(column) 152 | self.column = column 153 | self.items = set() 154 | 155 | def merge(self, row, schema): 156 | self.items.add(self.column.eval(row, schema)) 157 | 158 | def mergeStats(self, other, schema): 159 | self.items |= other.items 160 | 161 | def eval(self, row, schema): 162 | return len(self.items) 163 | 164 | def args(self): 165 | return (self.column,) 166 | 167 | 168 | __all__ = [ 169 | "SumDistinct", "ApproxCountDistinct", "CollectList", "CollectSet", 170 | "First", "CountDistinct", "Last" 171 | ] 172 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/aggregate/covariance_aggregations.py: -------------------------------------------------------------------------------- 1 | from ....stat_counter import CovarianceCounter 2 | from .aggregations import Aggregation 3 | 4 | 5 | class CovarianceStatAggregation(Aggregation): 6 | def __init__(self, column1, column2): 7 | super().__init__(column1, column2) 8 | self.column1 = column1 9 | self.column2 = column2 10 | self.stat_helper = CovarianceCounter(method="pearson") 11 | 12 | def merge(self, row, schema): 13 | self.stat_helper.add(self.column1.eval(row, schema), self.column2.eval(row, schema)) 14 | 15 | def mergeStats(self, other, schema): 16 | self.stat_helper.merge(other) 17 | 18 | def eval(self, row, schema): 19 | raise NotImplementedError 20 | 21 | def args(self): 22 | return ( 23 | self.column1, 24 | self.column2 25 | ) 26 | 27 | 28 | class Corr(CovarianceStatAggregation): 29 | pretty_name = "corr" 30 | 31 | def eval(self, row, schema): 32 | return self.stat_helper.pearson_correlation 33 | 34 | 35 | class
CovarSamp(CovarianceStatAggregation): 36 | pretty_name = "covar_samp" 37 | 38 | def eval(self, row, schema): 39 | return self.stat_helper.covar_samp 40 | 41 | 42 | class CovarPop(CovarianceStatAggregation): 43 | pretty_name = "covar_pop" 44 | 45 | def eval(self, row, schema): 46 | return self.stat_helper.covar_pop 47 | 48 | 49 | __all__ = ["Corr", "CovarSamp", "CovarPop"] 50 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/aggregate/stat_aggregations.py: -------------------------------------------------------------------------------- 1 | from ....stat_counter import ColumnStatHelper 2 | from ...column import Column 3 | from ..literals import Literal 4 | from ..mappers import StarOperator 5 | from .aggregations import Aggregation 6 | 7 | 8 | class SimpleStatAggregation(Aggregation): 9 | def __init__(self, column): 10 | super().__init__(column) 11 | self.column = column 12 | self.stat_helper = ColumnStatHelper(column) 13 | 14 | def merge(self, row, schema): 15 | self.stat_helper.merge(row, schema) 16 | 17 | def mergeStats(self, other, schema): 18 | self.stat_helper.mergeStats(other.stat_helper) 19 | 20 | def eval(self, row, schema): 21 | raise NotImplementedError 22 | 23 | def args(self): 24 | return (self.column,) 25 | 26 | 27 | class Count(SimpleStatAggregation): 28 | pretty_name = "count" 29 | 30 | def __init__(self, column): 31 | if isinstance(column.expr, StarOperator): 32 | column = Column(Literal(1)) 33 | super().__init__(column) 34 | self.column = column 35 | self.stat_helper = ColumnStatHelper(column) 36 | 37 | def eval(self, row, schema): 38 | return self.stat_helper.count 39 | 40 | 41 | class Max(SimpleStatAggregation): 42 | pretty_name = "max" 43 | 44 | def eval(self, row, schema): 45 | return self.stat_helper.max 46 | 47 | 48 | class Min(SimpleStatAggregation): 49 | pretty_name = "min" 50 | 51 | def eval(self, row, schema): 52 | return self.stat_helper.min 53 | 54 | 55 | class Sum(SimpleStatAggregation): 56 | pretty_name = "sum" 57 | 58 | def eval(self, row, schema): 59 | return self.stat_helper.sum 60 | 61 | 62 | class Avg(SimpleStatAggregation): 63 | pretty_name = "avg" 64 | 65 | def eval(self, row, schema): 66 | return self.stat_helper.mean 67 | 68 | 69 | class VarSamp(SimpleStatAggregation): 70 | pretty_name = "var_samp" 71 | 72 | def eval(self, row, schema): 73 | return self.stat_helper.variance_samp 74 | 75 | 76 | class VarPop(SimpleStatAggregation): 77 | pretty_name = "var_pop" 78 | 79 | def eval(self, row, schema): 80 | return self.stat_helper.variance_pop 81 | 82 | 83 | class StddevSamp(SimpleStatAggregation): 84 | pretty_name = "stddev_samp" 85 | 86 | def eval(self, row, schema): 87 | return self.stat_helper.stddev_samp 88 | 89 | 90 | class StddevPop(SimpleStatAggregation): 91 | pretty_name = "stddev_pop" 92 | 93 | def eval(self, row, schema): 94 | return self.stat_helper.stddev_pop 95 | 96 | 97 | class Skewness(SimpleStatAggregation): 98 | pretty_name = "skewness" 99 | 100 | def eval(self, row, schema): 101 | return self.stat_helper.skewness 102 | 103 | 104 | class Kurtosis(SimpleStatAggregation): 105 | pretty_name = "kurtosis" 106 | 107 | def eval(self, row, schema): 108 | return self.stat_helper.kurtosis 109 | 110 | 111 | __all__ = [ 112 | "Avg", "VarPop", "VarSamp", "Sum", "StddevPop", "StddevSamp", 113 | "Skewness", "Min", "Max", "Kurtosis", "Count" 114 | ] 115 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/csvs.py: 
-------------------------------------------------------------------------------- 1 | from ..casts import NO_TIMESTAMP_CONVERSION 2 | from ..internal_utils.options import Options 3 | from ..internal_utils.readers.csvreader import csv_record_to_row, CSVReader 4 | from ..internal_utils.readers.utils import guess_schema_from_strings 5 | from ..utils import AnalysisException 6 | from .expressions import Expression 7 | 8 | sql_csv_function_options = dict( 9 | dateFormat=NO_TIMESTAMP_CONVERSION, 10 | timestampFormat=NO_TIMESTAMP_CONVERSION, 11 | ) 12 | 13 | 14 | class SchemaOfCsv(Expression): 15 | pretty_name = "schema_of_csv" 16 | 17 | def __init__(self, column, options): 18 | super().__init__(column) 19 | self.column = column 20 | self.input_options = options 21 | self.options = Options(CSVReader.default_options, sql_csv_function_options, options) 22 | 23 | def eval(self, row, schema): 24 | value = self.column.eval(row, schema) 25 | if not isinstance(value, str) or value == "": 26 | raise AnalysisException( 27 | "type mismatch: The input csv should be a string literal and not null; " 28 | f"however, got {value}." 29 | ) 30 | record_as_row = csv_record_to_row(value, self.options) 31 | schema = guess_schema_from_strings(record_as_row.__fields__, [record_as_row], self.options) 32 | return schema.simpleString() 33 | 34 | def args(self): 35 | return (self.column,) 36 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/explodes.py: -------------------------------------------------------------------------------- 1 | from ..types import DataType, IntegerType, StructField 2 | from .expressions import UnaryExpression 3 | 4 | 5 | class Explode(UnaryExpression): 6 | def __init__(self, column): 7 | super().__init__(column) 8 | self.column = column 9 | 10 | @property 11 | def may_output_multiple_rows(self): 12 | return True 13 | 14 | def eval(self, row, schema): 15 | values = self.column.eval(row, schema) 16 | if not values: 17 | return [] 18 | return [[value] for value in values] 19 | 20 | def __str__(self): 21 | return "col" 22 | 23 | 24 | class ExplodeOuter(Explode): 25 | def eval(self, row, schema): 26 | values = self.column.eval(row, schema) 27 | if not values: 28 | return [[None]] 29 | return [[value] for value in values] 30 | 31 | def __str__(self): 32 | return "col" 33 | 34 | 35 | class PosExplode(UnaryExpression): 36 | def eval(self, row, schema): 37 | values = self.column.eval(row, schema) 38 | if not values: 39 | return [] 40 | return list(enumerate(values)) 41 | 42 | def __str__(self): 43 | return "posexplode" 44 | 45 | @property 46 | def may_output_multiple_rows(self): 47 | return True 48 | 49 | @property 50 | def may_output_multiple_cols(self): 51 | return True 52 | 53 | def output_fields(self, schema): 54 | return [ 55 | StructField("pos", IntegerType(), False), 56 | StructField("col", DataType(), False) 57 | ] 58 | 59 | 60 | class PosExplodeOuter(PosExplode): 61 | def eval(self, row, schema): 62 | values = self.column.eval(row, schema) 63 | if not values: 64 | return [[None, None]] 65 | return list(enumerate(values)) 66 | 67 | def __str__(self): 68 | return "posexplode_outer" 69 | 70 | 71 | __all__ = ["PosExplodeOuter", "PosExplode", "ExplodeOuter", "Explode"] 72 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/fields.py: -------------------------------------------------------------------------------- 1 | from ..types import StructField 2 | from ..utils import 
AnalysisException 3 | from .expressions import Expression 4 | 5 | 6 | class FieldAsExpression(Expression): 7 | def __init__(self, field): 8 | super().__init__() 9 | self.field = field 10 | 11 | def eval(self, row, schema): 12 | return row[find_position_in_schema(schema, self.field)] 13 | 14 | def __str__(self): 15 | return self.field.name 16 | 17 | def output_fields(self, schema): 18 | return [self.field] 19 | 20 | def args(self): 21 | return (self.field,) 22 | 23 | 24 | def find_position_in_schema(schema, expr): 25 | if isinstance(expr, str): 26 | show_id = False 27 | field_name = expr 28 | matches = set(i for i, field in enumerate(schema.fields) if field_name == field.name) 29 | elif isinstance(expr, FieldAsExpression): 30 | return find_position_in_schema(schema, expr.field) 31 | elif isinstance(expr, StructField) and hasattr(expr, "id"): 32 | show_id = True 33 | field_name = format_field(expr, show_id=show_id) 34 | matches = set(i for i, field in enumerate(schema.fields) if expr.id == field.id) 35 | else: 36 | if isinstance(expr, StructField): 37 | expression = f"Unbound field {expr.name}" 38 | else: 39 | expression = f"Expression type '{type(expr)}'" 40 | 41 | raise NotImplementedError( 42 | f"{expression} is not supported. " 43 | "As a user you should not see this error, feel free to report a bug at " 44 | "https://github.com/svenkreiss/pysparkling/issues" 45 | ) 46 | 47 | return get_checked_matches(matches, field_name, schema, show_id) 48 | 49 | 50 | def get_checked_matches(matches, field_name, schema, show_id): 51 | if not matches: 52 | raise AnalysisException(f"Unable to find the column '{field_name}'" 53 | f" among {format_schema(schema, show_id)}") 54 | 55 | if len(matches) > 1: 56 | raise AnalysisException( 57 | f"Reference '{field_name}' is ambiguous, found {len(matches)} columns matching it." 
58 | ) 59 | 60 | return matches.pop() 61 | 62 | 63 | def format_schema(schema, show_id): 64 | return [format_field(field, show_id=show_id) for field in schema.fields] 65 | 66 | 67 | def format_field(field, show_id): 68 | if show_id: 69 | return f"{field.name}#{field.id}" 70 | return field.name 71 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/jsons.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from ...utils import get_json_encoder 4 | from ..internal_utils.options import Options 5 | from ..internal_utils.readers.jsonreader import JSONReader 6 | from .expressions import Expression 7 | 8 | 9 | class StructsToJson(Expression): 10 | pretty_name = "structstojson" 11 | 12 | default_options = dict( 13 | dateFormat="yyyy-MM-dd", 14 | timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX", 15 | ) 16 | 17 | def __init__(self, column, options): 18 | super().__init__(column) 19 | self.column = column 20 | self.input_options = options 21 | self.options = Options(JSONReader.default_options, options) 22 | self.encoder = get_json_encoder(self.options) 23 | 24 | def eval(self, row, schema): 25 | value = self.column.eval(row, schema) 26 | return json.dumps( 27 | value, 28 | cls=self.encoder, 29 | separators=(',', ':') 30 | ) 31 | 32 | def args(self): 33 | if self.input_options is None: 34 | return (self.column, ) 35 | return ( 36 | self.column, 37 | self.input_options 38 | ) 39 | 40 | 41 | __all__ = ["StructsToJson"] 42 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/literals.py: -------------------------------------------------------------------------------- 1 | from ..utils import AnalysisException 2 | from .expressions import Expression 3 | 4 | 5 | class Literal(Expression): 6 | def __init__(self, value): 7 | super().__init__() 8 | self.value = value 9 | 10 | def eval(self, row, schema): 11 | return self.value 12 | 13 | def __str__(self): 14 | if self.value is True: 15 | return "true" 16 | if self.value is False: 17 | return "false" 18 | if self.value is None: 19 | return "NULL" 20 | return str(self.value) 21 | 22 | def get_literal_value(self): 23 | if hasattr(self.value, "expr") or isinstance(self.value, Expression): 24 | raise AnalysisException("Value should not be a Column or an Expression," 25 | f" but got {type(self)}: {self}") 26 | return self.value 27 | 28 | def args(self): 29 | return (self.value, ) 30 | 31 | 32 | __all__ = ["Literal"] 33 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/orders.py: -------------------------------------------------------------------------------- 1 | from .expressions import Expression 2 | 3 | 4 | class SortOrder(Expression): 5 | sort_order = None 6 | 7 | def __init__(self, column): 8 | super().__init__(column) 9 | self.column = column 10 | 11 | def eval(self, row, schema): 12 | return self.column.eval(row, schema) 13 | 14 | def __str__(self): 15 | return f"{self.column} {self.sort_order}" 16 | 17 | def args(self): 18 | return (self.column,) 19 | 20 | 21 | class AscNullsFirst(SortOrder): 22 | sort_order = "ASC NULLS FIRST" 23 | 24 | 25 | class AscNullsLast(SortOrder): 26 | sort_order = "ASC NULLS LAST" 27 | 28 | 29 | class DescNullsFirst(SortOrder): 30 | sort_order = "DESC NULLS FIRST" 31 | 32 | 33 | class DescNullsLast(SortOrder): 34 | sort_order = "DESC NULLS LAST" 35 | 36 | 37 | Asc = AscNullsFirst 38 | Desc =
DescNullsLast 39 | -------------------------------------------------------------------------------- /pysparkling/sql/expressions/userdefined.py: -------------------------------------------------------------------------------- 1 | from .expressions import Expression 2 | 3 | 4 | class UserDefinedFunction(Expression): 5 | def __init__(self, f, return_type, *exprs): 6 | super().__init__() 7 | self.f = f 8 | self.return_type = return_type 9 | self.exprs = exprs 10 | 11 | def eval(self, row, schema): 12 | return self.f(*(expr.eval(row, schema) for expr in self.exprs)) 13 | 14 | def __str__(self): 15 | arguments = ', '.join(str(arg) for arg in self.args()) 16 | return f"{self.f.__name__}({arguments})" 17 | 18 | def args(self): 19 | return self.exprs 20 | 21 | 22 | __all__ = ["UserDefinedFunction"] 23 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/sql/internal_utils/__init__.py -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/column.py: -------------------------------------------------------------------------------- 1 | def resolve_column(col, row, schema, allow_generator=True): 2 | """ 3 | Return the list of column names corresponding to a column value and a schema and: 4 | If allow generator is False, a list of values corresponding to a row 5 | If allow generator is True, a list of list of values, each list correspond to a row 6 | """ 7 | output_cols = [field.name for field in col.output_fields(schema)] 8 | 9 | output_values = col.eval(row, schema) 10 | 11 | if not allow_generator and col.may_output_multiple_rows: 12 | raise Exception("Generators are not supported when it's nested in expressions," 13 | f" but got: {col}") 14 | 15 | if not col.may_output_multiple_rows: 16 | output_values = [output_values] 17 | if not col.may_output_multiple_cols: 18 | output_values = [output_values] 19 | 20 | return output_cols, output_values 21 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/joins.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following constants are used to identify join types 3 | """ 4 | INNER_JOIN = "inner" 5 | CROSS_JOIN = "cross" 6 | FULL_JOIN = "full" 7 | LEFT_JOIN = "left" 8 | RIGHT_JOIN = "right" 9 | LEFT_SEMI_JOIN = "leftsemi" 10 | LEFT_ANTI_JOIN = "leftanti" 11 | 12 | JOIN_TYPES = dict( 13 | inner=INNER_JOIN, 14 | cross=CROSS_JOIN, 15 | outer=FULL_JOIN, 16 | full=FULL_JOIN, 17 | fullouter=FULL_JOIN, 18 | left=LEFT_JOIN, 19 | leftouter=LEFT_JOIN, 20 | right=RIGHT_JOIN, 21 | rightouter=RIGHT_JOIN, 22 | leftsemi=LEFT_SEMI_JOIN, 23 | leftanti=LEFT_ANTI_JOIN, 24 | ) 25 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/options.py: -------------------------------------------------------------------------------- 1 | class Options(dict): 2 | """ 3 | A case insensitive dict, which can be initialized from multiple dicts 4 | and whose values can be access through attr syntax 5 | 6 | It also stores "false" and "true" strings as Boolean 7 | 8 | e.g.: 9 | 10 | >>> default_options = dict(sep=",", samplingRatio=None) 11 | >>> requested_options = dict(Sep="|") 12 | >>> o=Options({"format": 
"json", "lineSep": ","}, Format="csv") 13 | >>> o.format, o.linesep 14 | ('csv', ',') 15 | >>> o.UndefinedSetting 16 | Traceback (most recent call last): 17 | ... 18 | KeyError: 'undefinedsetting' 19 | """ 20 | 21 | def __init__(self, *args, **kwargs): 22 | d = { 23 | key.lower(): value 24 | for arg in args 25 | if arg is not None 26 | for key, value in arg.items() 27 | } 28 | d.update({ 29 | key.lower(): value 30 | for key, value in kwargs.items() 31 | }) 32 | super().__init__(d) 33 | 34 | def setdefault(self, k, default=None): 35 | return super().setdefault(k.lower(), default) 36 | 37 | @staticmethod 38 | def fromkeys(seq, value=None): 39 | return Options({k.lower(): value for k in seq}) 40 | 41 | def __getitem__(self, k): 42 | return super().__getitem__(k.lower()) 43 | 44 | def __setitem__(self, k, v): 45 | if isinstance(v, str) and v.lower() in ("true", "false"): 46 | v = (v.lower() == "true") 47 | super().__setitem__(k.lower(), v) 48 | 49 | def __delitem__(self, k): 50 | super().__delitem__(k.lower()) 51 | 52 | def get(self, k, *args, **kwargs): 53 | return super().get(k.lower(), *args, **kwargs) 54 | 55 | def __contains__(self, o): 56 | if not isinstance(o, str): 57 | return False 58 | return super().__contains__(o.lower()) 59 | 60 | def __getattr__(self, item): 61 | if not item.startswith("_"): 62 | return self[item.lower()] 63 | return getattr(super(), item) 64 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import InternalReader 2 | 3 | __all__ = [ 4 | 'InternalReader' 5 | ] 6 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/readers/common.py: -------------------------------------------------------------------------------- 1 | from ...internal_utils.readers import csvreader, jsonreader, textreader 2 | from ...internal_utils.readwrite import OptionUtils, to_option_stored_value 3 | from ...types import StructType 4 | 5 | 6 | class InternalReader(OptionUtils): 7 | def schema(self, schema): 8 | if not isinstance(schema, StructType): 9 | raise NotImplementedError("Pysparkling currently only supports StructType for schemas") 10 | self._schema = schema 11 | 12 | def option(self, key, value): 13 | self._options[key.lower()] = to_option_stored_value(value) 14 | 15 | def __init__(self, spark): 16 | """ 17 | 18 | :type spark: pysparkling.sql.session.SparkSession 19 | """ 20 | self._spark = spark 21 | self._options = {} 22 | self._schema = None 23 | 24 | def csv(self, paths): 25 | return csvreader.CSVReader(self._spark, paths, self._schema, self._options).read() 26 | 27 | def json(self, paths): 28 | return jsonreader.JSONReader(self._spark, paths, self._schema, self._options).read() 29 | 30 | def text(self, paths): 31 | return textreader.TextReader(self._spark, paths, self._schema, self._options).read() 32 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/readers/csvreader.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import itertools 3 | 4 | from ....fileio import TextFile 5 | from ...casts import get_caster 6 | from ...internal_utils.options import Options 7 | from ...internal_utils.readers.utils import guess_schema_from_strings, resolve_partitions 8 | from ...schema_utils import infer_schema_from_rdd 9 
| from ...types import create_row, StringType, StructField, StructType 10 | 11 | 12 | class CSVReader: 13 | default_options = dict( 14 | lineSep=None, 15 | encoding="utf-8", 16 | sep=",", 17 | inferSchema=False, 18 | header=False 19 | ) 20 | 21 | def __init__(self, spark, paths, schema, options): 22 | self.spark = spark 23 | self.paths = paths 24 | self.schema = schema 25 | self.options = Options(self.default_options, options) 26 | 27 | def read(self): 28 | sc = self.spark._sc 29 | paths = self.paths 30 | 31 | partitions, partition_schema = resolve_partitions(paths) 32 | 33 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions)) 34 | rdd = rdd_filenames.flatMap(partial( 35 | parse_csv_file, 36 | partitions, 37 | partition_schema, 38 | self.schema, 39 | self.options 40 | )) 41 | 42 | if self.schema is not None: 43 | schema = self.schema 44 | elif self.options.inferSchema: 45 | fields = rdd.take(1)[0].__fields__ 46 | schema = guess_schema_from_strings(fields, rdd.collect(), options=self.options) 47 | else: 48 | schema = infer_schema_from_rdd(rdd) 49 | 50 | schema_with_string = StructType(fields=[ 51 | StructField(field.name, StringType()) for field in schema.fields 52 | ]) 53 | 54 | if partition_schema: 55 | partitions_fields = partition_schema.fields 56 | full_schema = StructType(schema.fields[:-len(partitions_fields)] + partitions_fields) 57 | else: 58 | full_schema = schema 59 | 60 | cast_row = get_caster( 61 | from_type=schema_with_string, to_type=full_schema, options=self.options 62 | ) 63 | casted_rdd = rdd.map(cast_row) 64 | casted_rdd._name = paths 65 | 66 | # pylint: disable=import-outside-toplevel, cyclic-import 67 | from ...internals import DataFrameInternal 68 | 69 | return DataFrameInternal( 70 | sc, 71 | casted_rdd, 72 | schema=full_schema 73 | ) 74 | 75 | 76 | def parse_csv_file(partitions, partition_schema, schema, options, file_name): 77 | f_content = TextFile(file_name).load(encoding=options.encoding).read() 78 | records = (f_content.split(options.lineSep) 79 | if options.lineSep is not None 80 | else f_content.splitlines()) 81 | if options.header == "true": 82 | header = records[0].split(options.sep) 83 | records = records[1:] 84 | else: 85 | header = None 86 | 87 | null_value = "" 88 | rows = [] 89 | for record in records: 90 | row = csv_record_to_row( 91 | record, options, schema, header, null_value, partition_schema, partitions[file_name] 92 | ) 93 | row.set_input_file_name(file_name) 94 | rows.append(row) 95 | 96 | return rows 97 | 98 | 99 | def csv_record_to_row(record, options, schema=None, header=None, 100 | null_value=None, partition_schema=None, partition=None): 101 | record_values = [val if val != null_value else None for val in record.split(options.sep)] 102 | if schema is not None: 103 | field_names = [f.name for f in schema.fields] 104 | elif header is not None: 105 | field_names = header 106 | else: 107 | field_names = [f"_c{i}" for i, field in enumerate(record_values)] 108 | partition_field_names = [ 109 | f.name for f in partition_schema.fields 110 | ] if partition_schema else [] 111 | row = create_row( 112 | itertools.chain(field_names, partition_field_names), 113 | itertools.chain(record_values, partition or []) 114 | ) 115 | return row 116 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/readers/jsonreader.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import itertools 3 | import json 4 | 5 | 
from ...casts import get_struct_caster 6 | from ...internal_utils.options import Options 7 | from ...internal_utils.readers.utils import get_records, resolve_partitions 8 | from ...schema_utils import infer_schema_from_rdd 9 | from ...types import create_row, row_from_keyed_values, StructType 10 | 11 | 12 | class JSONReader: 13 | default_options = dict( 14 | primitivesAsString=False, 15 | prefersDecimal=False, 16 | allowComments=False, 17 | allowUnquotedFieldNames=False, 18 | allowSingleQuotes=True, 19 | allowNumericLeadingZero=False, 20 | allowBackslashEscapingAnyCharacter=False, 21 | mode="PERMISSIVE", 22 | columnNameOfCorruptRecord="", 23 | dateFormat="yyyy-MM-dd", 24 | timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX", 25 | multiLine=False, 26 | allowUnquotedControlChars=False, 27 | encoding=None, 28 | lineSep=None, 29 | samplingRatio=1.0, 30 | dropFieldIfAllNull=False, 31 | locale="en-US", 32 | ) 33 | 34 | def __init__(self, spark, paths, schema, options): 35 | self.spark = spark 36 | self.paths = paths 37 | self.schema = schema 38 | self.options = Options(self.default_options, options) 39 | 40 | def read(self): 41 | sc = self.spark._sc 42 | paths = self.paths 43 | 44 | partitions, partition_schema = resolve_partitions(paths) 45 | 46 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions)) 47 | rdd = rdd_filenames.flatMap(partial( 48 | parse_json_file, 49 | partitions, 50 | partition_schema, 51 | self.schema, 52 | self.options 53 | )) 54 | 55 | inferred_schema = infer_schema_from_rdd(rdd) 56 | 57 | schema = self.schema if self.schema is not None else inferred_schema 58 | schema_fields = { 59 | field.name: field 60 | for field in schema.fields 61 | } 62 | 63 | # Field order is defined by fields in the record, not by the given schema 64 | # Field type is defined by the given schema or inferred 65 | full_schema = StructType( 66 | fields=[ 67 | schema_fields.get(field.name, field) 68 | for field in inferred_schema.fields 69 | ] 70 | ) 71 | 72 | cast_row = get_struct_caster(inferred_schema, full_schema, options=self.options) 73 | casted_rdd = rdd.map(cast_row) 74 | casted_rdd._name = paths 75 | 76 | # pylint: disable=import-outside-toplevel, cyclic-import 77 | from ...internals import DataFrameInternal 78 | 79 | return DataFrameInternal( 80 | sc, 81 | casted_rdd, 82 | schema=full_schema 83 | ) 84 | 85 | 86 | def parse_json_file(partitions, partition_schema, schema, options, file_name): 87 | records = get_records(file_name, options.linesep, options.encoding) 88 | rows = [] 89 | for record in records: 90 | partition = partitions[file_name] 91 | row = parse_record(record, schema, partition, partition_schema, options) 92 | row.set_input_file_name(file_name) 93 | rows.append(row) 94 | return rows 95 | 96 | 97 | def parse_record(record, schema, partition, partition_schema, options): 98 | raw_record_value = json.loads(record, encoding=options.encoding) 99 | if not isinstance(raw_record_value, dict): 100 | raise NotImplementedError( 101 | "Top level items should be JSON objects (dicts)," 102 | f" got {type(raw_record_value)} with {raw_record_value}" 103 | ) 104 | record_value = decode_record(raw_record_value) 105 | if schema is not None: 106 | record_fields = record_value.__fields__ 107 | available_names = tuple(partition_schema.names) + record_fields 108 | field_names = [name for name in record_fields if name in schema.names] + [ 109 | f.name for f in schema.fields if f.name not in available_names 110 | ] 111 | else: 112 | field_names = list(record_value.__fields__) 113 | 
record_values = [ 114 | record_value[field_name] if field_name in record_value.__fields__ else None 115 | for field_name in field_names 116 | ] 117 | partition_field_names = [f.name for f in partition_schema.fields] if partition_schema else [] 118 | # pylint: disable=W0511 119 | # todo: handle nested rows 120 | row = create_row( 121 | itertools.chain(field_names, partition_field_names), 122 | itertools.chain(record_values, partition) 123 | ) 124 | return row 125 | 126 | 127 | def decode_record(item): 128 | if isinstance(item, list): 129 | return [decode_record(e) for e in item] 130 | if isinstance(item, dict): 131 | return row_from_keyed_values( 132 | (key, decode_record(value)) 133 | for key, value in item.items() 134 | ) 135 | return item 136 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/readers/textreader.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import itertools 3 | 4 | from ....fileio import TextFile 5 | from ...internal_utils.options import Options 6 | from ...internal_utils.readers.utils import resolve_partitions 7 | from ...types import create_row, StringType, StructField, StructType 8 | 9 | 10 | class TextReader: 11 | default_options = dict( 12 | lineSep=None, 13 | encoding="utf-8", 14 | sep=",", 15 | inferSchema=False, 16 | header=False 17 | ) 18 | 19 | def __init__(self, spark, paths, schema, options): 20 | self.spark = spark 21 | self.paths = paths 22 | self.schema = schema or StructType([StructField("value", StringType())]) 23 | self.options = Options(self.default_options, options) 24 | 25 | def read(self): 26 | sc = self.spark._sc 27 | paths = self.paths 28 | 29 | partitions, partition_schema = resolve_partitions(paths) 30 | 31 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions)) 32 | rdd = rdd_filenames.flatMap(partial( 33 | parse_text_file, 34 | partitions, 35 | partition_schema, 36 | self.schema, 37 | self.options 38 | )) 39 | 40 | if partition_schema: 41 | partitions_fields = partition_schema.fields 42 | full_schema = StructType(self.schema.fields + partitions_fields) 43 | else: 44 | full_schema = self.schema 45 | 46 | rdd._name = paths 47 | 48 | # pylint: disable=import-outside-toplevel, cyclic-import 49 | from ...internals import DataFrameInternal 50 | 51 | return DataFrameInternal( 52 | sc, 53 | rdd, 54 | schema=full_schema 55 | ) 56 | 57 | 58 | def parse_text_file(partitions, partition_schema, schema, options, file_name): 59 | f_content = TextFile(file_name).load(encoding=options.encoding).read() 60 | records = (f_content.split(options.lineSep) 61 | if options.lineSep is not None 62 | else f_content.splitlines()) 63 | 64 | rows = [] 65 | for record in records: 66 | row = text_record_to_row(record, options, schema, partition_schema, partitions[file_name]) 67 | row.set_input_file_name(file_name) 68 | rows.append(row) 69 | 70 | return rows 71 | 72 | 73 | def text_record_to_row(record, options, schema, partition_schema, partition): 74 | partition_field_names = [ 75 | f.name for f in partition_schema.fields 76 | ] if partition_schema else [] 77 | row = create_row( 78 | itertools.chain([schema.fields[0].name], partition_field_names), 79 | itertools.chain([record], partition or []) 80 | ) 81 | return row 82 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/readers/utils.py: 
-------------------------------------------------------------------------------- 1 | from ....fileio import File, TextFile 2 | from ...casts import get_caster 3 | from ...types import ( 4 | DecimalType, DoubleType, IntegerType, LongType, row_from_keyed_values, StringType, StructField, StructType, 5 | TimestampType 6 | ) 7 | from ...utils import AnalysisException 8 | 9 | 10 | def resolve_partitions(patterns): 11 | """ 12 | Given a list of patterns, returns all the files matching or in folders matching 13 | one of them. 14 | 15 | The file are returned in a list of tuple of 2 elements: 16 | - The first tuple is the file path 17 | - The second being the partition keys and values if any were encountered else None 18 | 19 | In addition to this list, return, if the data was partitioned, a schema for the 20 | partition keys, else None 21 | 22 | :type patterns: list of str 23 | :rtype: Tuple[List[str], List[Optional[Row]], Optional[StructType]] 24 | """ 25 | file_paths = File.get_content(patterns) 26 | if not file_paths: 27 | raise AnalysisException(f'Path does not exist: {patterns}') 28 | partitions = {} 29 | for file_path in file_paths: 30 | if "=" in file_path: 31 | row = row_from_keyed_values( 32 | folder.split("=") 33 | for folder in file_path.split("/")[:-1] 34 | if folder.count("=") == 1 35 | ) 36 | partitions[file_path] = row 37 | else: 38 | partitions[file_path] = None 39 | 40 | partitioning_field_sets = set(p.__fields__ for p in partitions.values() if p is not None) 41 | if len(partitioning_field_sets) > 1: 42 | raise Exception( 43 | f"Conflicting directory structures detected while reading {','.join(patterns)}. " 44 | f"All partitions must have the same partitioning fields," 45 | f" found fields {' and also '.join(str(fields) for fields in partitioning_field_sets)}" 46 | ) 47 | 48 | if partitioning_field_sets: 49 | if any(value is None for value in partitions.values()): 50 | paths = [path for path, value in partitions.items() if value is None] 51 | raise AnalysisException( 52 | f"Unable to parse those malformed folders: {paths} of {file_paths}" 53 | ) 54 | partitioning_fields = partitioning_field_sets.pop() 55 | partition_schema = guess_schema_from_strings( 56 | partitioning_fields, partitions.values(), options={} 57 | ) 58 | else: 59 | partition_schema = None 60 | 61 | return partitions, partition_schema 62 | 63 | 64 | def guess_schema_from_strings(schema_fields, data, options): 65 | field_values = [ 66 | (field, [row[field] for row in data]) 67 | for field in schema_fields 68 | ] 69 | 70 | field_types_and_values = [ 71 | (field, guess_type_from_values_as_string(values, options)) 72 | for field, values in field_values 73 | ] 74 | 75 | schema = StructType(fields=[ 76 | StructField(field, field_type) 77 | for field, field_type in field_types_and_values 78 | ]) 79 | 80 | return schema 81 | 82 | 83 | def guess_type_from_values_as_string(values, options): 84 | # Reproduces inferences available in Spark 85 | # PartitioningUtils.inferPartitionColumnValue() 86 | # located in org.apache.spark.sql.execution.datasources 87 | tested_types = ( 88 | IntegerType(), 89 | LongType(), 90 | DecimalType(), 91 | DoubleType(), 92 | TimestampType(), 93 | StringType() 94 | ) 95 | string_type = StringType() 96 | for tested_type in tested_types: 97 | type_caster = get_caster(from_type=string_type, to_type=tested_type, options=options) 98 | try: 99 | for value in values: 100 | casted_value = type_caster(value) 101 | if casted_value is None and value not in ("null", None): 102 | raise ValueError 103 | return 
tested_type 104 | except ValueError: 105 | pass 106 | # Should never happen 107 | raise AnalysisException( 108 | "Unable to find a matching type for some fields, even StringType did not work" 109 | ) 110 | 111 | 112 | def get_records(f_name, linesep, encoding): 113 | f_content = TextFile(f_name).load(encoding=encoding).read() 114 | records = f_content.split(linesep) if linesep is not None else f_content.splitlines() 115 | return records 116 | -------------------------------------------------------------------------------- /pysparkling/sql/internal_utils/readwrite.py: -------------------------------------------------------------------------------- 1 | from ..utils import IllegalArgumentException 2 | 3 | 4 | def to_option_stored_value(value): 5 | if value is None: 6 | return None 7 | if isinstance(value, bool): 8 | return str(value).lower() 9 | return str(value) 10 | 11 | 12 | class OptionUtils: 13 | def _set_opts(self, schema=None, **options): 14 | """ 15 | Set named options (filter out those the value is None) 16 | """ 17 | if schema is not None: 18 | self.schema(schema) 19 | for k, v in options.items(): 20 | if v is not None: 21 | self.option(k, v) 22 | 23 | def option(self, key, value): 24 | raise NotImplementedError 25 | 26 | def schema(self, schema): 27 | # By default OptionUtils subclass do not support schema 28 | raise IllegalArgumentException( 29 | f"schema is not a valid argument for {self.__class__}" 30 | ) 31 | -------------------------------------------------------------------------------- /pysparkling/sql/schema_utils.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | from .internal_utils.joins import ( 4 | CROSS_JOIN, FULL_JOIN, INNER_JOIN, LEFT_ANTI_JOIN, LEFT_JOIN, LEFT_SEMI_JOIN, RIGHT_JOIN 5 | ) 6 | from .types import _get_null_fields, _has_nulltype, _infer_schema, _merge_type, StructField, StructType 7 | from .utils import IllegalArgumentException 8 | 9 | 10 | def infer_schema_from_rdd(rdd): 11 | return infer_schema_from_list(rdd.takeSample(withReplacement=False, num=200)) 12 | 13 | 14 | def infer_schema_from_list(data, names=None): 15 | """ 16 | Infer schema from list of Row or tuple. 17 | 18 | :param data: list of Row or tuple 19 | :param names: list of column names 20 | :return: :class:`pysparkling.sql.types.StructType` 21 | """ 22 | if not data: 23 | raise ValueError("can not infer schema from empty dataset") 24 | first = data[0] 25 | if isinstance(first, dict): 26 | raise NotImplementedError( 27 | "Inferring schema from dict is deprecated in Spark " 28 | "and not implemented in pysparkling. 
" 29 | "Please use .sql.Row instead" 30 | ) 31 | schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) 32 | if _has_nulltype(schema): 33 | null_fields = "', '".join(_get_null_fields(schema)) 34 | raise ValueError( 35 | "Type(s) of the following field(s) cannot be determined after inferring:" 36 | f" '{null_fields}'" 37 | ) 38 | return schema 39 | 40 | 41 | def merge_schemas(left_schema, right_schema, how, on=None): 42 | if on is None: 43 | on = [] 44 | 45 | left_on_fields, right_on_fields = get_on_fields(left_schema, right_schema, on) 46 | other_left_fields = [field for field in left_schema.fields if field not in left_on_fields] 47 | other_right_fields = [field for field in right_schema.fields if field not in right_on_fields] 48 | 49 | if how in (INNER_JOIN, CROSS_JOIN, LEFT_JOIN, LEFT_ANTI_JOIN, LEFT_SEMI_JOIN): 50 | on_fields = left_on_fields 51 | elif how == RIGHT_JOIN: 52 | on_fields = right_on_fields 53 | elif how == FULL_JOIN: 54 | on_fields = [StructField(field.name, field.dataType, nullable=True) 55 | for field in left_on_fields] 56 | else: 57 | raise IllegalArgumentException(f"Invalid how argument in join: {how}") 58 | 59 | return StructType(fields=on_fields + other_left_fields + other_right_fields) 60 | 61 | 62 | def get_on_fields(left_schema, right_schema, on): 63 | left_on_fields = [next(field for field in left_schema if field.name == c) for c in on] 64 | right_on_fields = [next(field for field in right_schema if field.name == c) for c in on] 65 | return left_on_fields, right_on_fields 66 | 67 | 68 | def get_schema_from_cols(cols, current_schema): 69 | new_schema = StructType(fields=[ 70 | field for col in cols for field in col.find_fields_in_schema(current_schema) 71 | ]) 72 | return new_schema 73 | -------------------------------------------------------------------------------- /pysparkling/sql/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/sql/tests/__init__.py -------------------------------------------------------------------------------- /pysparkling/sql/tests/data/fundings/part-0.csv: -------------------------------------------------------------------------------- 1 | permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round 2 | mycityfaces,MyCityFaces,7,web,Scottsdale,AZ,2008-01-01,50000,USD,seed 3 | flypaper,Flypaper,,web,Phoenix,AZ,2008-02-01,3000000,USD,a 4 | chosenlist-com,ChosenList.com,5,web,Scottsdale,AZ,2008-01-25,233750,USD,angel 5 | digg,Digg,60,web,San Francisco,CA,2006-12-01,8500000,USD,b 6 | -------------------------------------------------------------------------------- /pysparkling/sql/tests/expressions/test_mappers.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from pysparkling.utils import MonotonicallyIncreasingIDGenerator 4 | 5 | 6 | class MonotonicallyIncreasingIDGeneratorTests(TestCase): 7 | def test_init_ok(self): 8 | sut = MonotonicallyIncreasingIDGenerator(0) 9 | self.assertEqual(sut.value, -1) # Shouldn't we throw an error here? 
10 | 11 | sut = MonotonicallyIncreasingIDGenerator(1) 12 | self.assertEqual(sut.value, 8589934592 - 1) # I do it this way so I can easily find/replace the value 13 | 14 | sut = MonotonicallyIncreasingIDGenerator(2) 15 | self.assertEqual(sut.value, 2 * 8589934592 - 1) 16 | 17 | def test_next_value_ok(self): 18 | sut = MonotonicallyIncreasingIDGenerator(1) 19 | self.assertEqual(next(sut), 8589934592) 20 | self.assertEqual(next(sut), 8589934593) 21 | self.assertEqual(next(sut), 8589934594) 22 | -------------------------------------------------------------------------------- /pysparkling/sql/tests/test_session.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import pytest 4 | 5 | from pysparkling import Context, StorageLevel 6 | from pysparkling.sql.session import SparkSession 7 | from pysparkling.sql.types import ( 8 | ArrayType, DoubleType, IntegerType, LongType, MapType, Row, row_from_keyed_values, StringType, StructField, 9 | StructType 10 | ) 11 | from pysparkling.sql.utils import require_minimum_pandas_version 12 | 13 | try: 14 | require_minimum_pandas_version() 15 | has_pandas = True 16 | except ImportError: 17 | has_pandas = False 18 | 19 | 20 | class SessionTests(TestCase): 21 | spark = SparkSession(sparkContext=Context()) 22 | 23 | def test_session_range(self): 24 | df = self.spark.range(3) 25 | self.assertEqual(df.count(), 3) 26 | self.assertListEqual(df.collect(), [Row(id=0), Row(id=1), Row(id=2)]) 27 | self.assertEqual(list(df.toLocalIterator()), [Row(id=0), Row(id=1), Row(id=2)]) 28 | 29 | def test_session_create_data_frame_from_rdd(self): 30 | df = self.spark.createDataFrame(self.spark.sparkContext.parallelize([ 31 | (1, "one"), 32 | (2, "two"), 33 | (3, "three"), 34 | ])) 35 | self.assertEqual(df.count(), 3) 36 | self.assertListEqual( 37 | df.collect(), 38 | [Row(_1=1, _2='one'), 39 | Row(_1=2, _2='two'), 40 | Row(_1=3, _2='three')]) 41 | self.assertEqual( 42 | df.schema, 43 | StructType([StructField("_1", LongType(), True), StructField("_2", StringType(), True)]) 44 | ) 45 | 46 | def test_session_create_data_frame_from_list(self): 47 | df = self.spark.createDataFrame([ 48 | (1, "one"), 49 | (2, "two"), 50 | (3, "three"), 51 | ]) 52 | self.assertEqual(df.count(), 3) 53 | self.assertListEqual( 54 | df.collect(), 55 | [Row(_1=1, _2='one'), 56 | Row(_1=2, _2='two'), 57 | Row(_1=3, _2='three')]) 58 | self.assertEqual( 59 | df.schema, 60 | StructType([StructField("_1", LongType(), True), StructField("_2", StringType(), True)]) 61 | ) 62 | 63 | @pytest.mark.skipif(not has_pandas, reason='pandas is not installed') 64 | def test_session_create_data_frame_from_pandas_data_frame(self): 65 | try: 66 | # Pandas is an optional dependency 67 | # pylint: disable=import-outside-toplevel 68 | import pandas as pd 69 | except ImportError as e: 70 | raise ImportError("pandas is not importable") from e 71 | 72 | pdf = pd.DataFrame([ 73 | (1, "one"), 74 | (2, "two"), 75 | (3, "three") 76 | ]) 77 | 78 | df = self.spark.createDataFrame(pdf) 79 | 80 | self.assertEqual(df.count(), 3) 81 | self.assertListEqual( 82 | df.collect(), 83 | [Row(**{"0": 1, "1": 'one'}), 84 | Row(**{"0": 2, "1": 'two'}), 85 | Row(**{"0": 3, "2": 'three'})]) 86 | self.assertEqual( 87 | df.schema, 88 | StructType([StructField("0", LongType(), True), StructField("1", StringType(), True)]) 89 | ) 90 | 91 | def test_session_create_data_frame_from_list_with_col_names(self): 92 | df = self.spark.createDataFrame([(0.0, [1.0, 0.8]), 93 | (1.0, [0.0, 0.0]), 94 
| (2.0, [0.5, 0.5])], 95 | ["label", "features"]) 96 | self.assertEqual(df.count(), 3) 97 | self.assertListEqual( 98 | df.collect(), 99 | [ 100 | row_from_keyed_values([("label", 0.0), ("features", [1.0, 0.8])]), 101 | row_from_keyed_values([("label", 1.0), ("features", [0.0, 0.0])]), 102 | row_from_keyed_values([("label", 2.0), ("features", [0.5, 0.5])]), 103 | ] 104 | ) 105 | 106 | self.assertEqual( 107 | df.schema, 108 | StructType([ 109 | StructField("label", DoubleType(), True), 110 | StructField("features", ArrayType(DoubleType(), True), True) 111 | ]) 112 | ) 113 | 114 | def test_session_create_data_frame_from_list_with_schema(self): 115 | schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) 116 | df = self.spark.createDataFrame([({'a': 1},)], schema=schema) 117 | self.assertEqual(df.count(), 1) 118 | self.assertListEqual( 119 | df.collect(), 120 | [Row(map={'a': 1})] 121 | ) 122 | self.assertEqual(df.schema, schema) 123 | 124 | def test_session_storage_level(self): 125 | spark = SparkSession(Context()) 126 | df = spark.range(4, numPartitions=2) 127 | self.assertEqual(repr(df.storageLevel), repr(StorageLevel(False, False, False, False, 1))) 128 | persisted_df = df.persist() 129 | self.assertEqual(persisted_df.is_cached, True) 130 | self.assertEqual(repr(persisted_df.storageLevel), repr(StorageLevel.MEMORY_ONLY)) 131 | -------------------------------------------------------------------------------- /pysparkling/sql/tests/test_write.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import shutil 4 | from unittest import TestCase 5 | 6 | from dateutil.tz import tzlocal 7 | 8 | from pysparkling import Context, Row 9 | from pysparkling.sql.session import SparkSession 10 | from pysparkling.sql.utils import AnalysisException 11 | 12 | spark = SparkSession(Context()) 13 | 14 | 15 | def get_folder_content(folder_path): 16 | folder_content = {} 17 | for root, _, files in os.walk(folder_path): 18 | relative_path = root[len(folder_path):] 19 | for file in files: 20 | file_path = os.path.join(root, file) 21 | with open(file_path, 'r', encoding='utf8') as file_content: 22 | folder_content[os.path.join(relative_path, file)] = file_content.readlines() 23 | return folder_content 24 | 25 | 26 | class DataFrameWriterTests(TestCase): 27 | maxDiff = None 28 | 29 | @staticmethod 30 | def clean(): 31 | if os.path.exists(".tmp"): 32 | shutil.rmtree(".tmp") 33 | 34 | def setUp(self): 35 | self.clean() 36 | 37 | tz = datetime.datetime.now().astimezone().strftime('%z') # +0100 38 | self.tz = f'{tz[:3]}:{tz[3:]}' # --> +01:00 39 | 40 | def tearDown(self): 41 | self.clean() 42 | 43 | def test_write_to_csv(self): 44 | df = spark.createDataFrame( 45 | [Row(age=2, name='Alice', time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()), ), 46 | Row(age=5, name='Bob', time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()))] 47 | ) 48 | df.write.csv(".tmp/wonderland/") 49 | self.assertDictEqual( 50 | get_folder_content(".tmp/wonderland"), 51 | { 52 | '_SUCCESS': [], 53 | 'part-00000-8447389540241120843.csv': [ 54 | f'2,Alice,2017-01-01T00:00:00.000{self.tz}\n', 55 | f'5,Bob,2014-03-02T00:00:00.000{self.tz}\n' 56 | ] 57 | } 58 | ) 59 | 60 | def test_write_to_csv_with_custom_options(self): 61 | df = spark.createDataFrame( 62 | [ 63 | Row(age=2, name='Alice', occupation=None), 64 | Row(age=5, name='Bob', occupation=""), 65 | ] 66 | ) 67 | df.write.csv(".tmp/wonderland/", sep="^", emptyValue="", nullValue="null", 
header=True) 68 | self.assertDictEqual( 69 | get_folder_content(".tmp/wonderland"), 70 | { 71 | '_SUCCESS': [], 72 | 'part-00000-4061950540148431296.csv': [ 73 | 'age^name^occupation\n', 74 | '2^Alice^null\n', 75 | '5^Bob^\n', 76 | ], 77 | } 78 | ) 79 | 80 | def test_write_to_csv_fail_when_overwrite(self): 81 | df = spark.createDataFrame( 82 | [Row(age=2, name='Alice'), 83 | Row(age=5, name='Bob')] 84 | ) 85 | df.write.csv(".tmp/wonderland/") 86 | with self.assertRaises(AnalysisException) as ctx: 87 | df.write.csv(".tmp/wonderland/") 88 | self.assertEqual(ctx.exception.args[0], 'path .tmp/wonderland already exists.;') 89 | self.assertDictEqual( 90 | get_folder_content(".tmp/wonderland"), 91 | { 92 | '_SUCCESS': [], 93 | 'part-00000-3434325560268771971.csv': [ 94 | '2,Alice\n', 95 | '5,Bob\n', 96 | ], 97 | } 98 | ) 99 | 100 | def test_write_to_json(self): 101 | df = spark.createDataFrame( 102 | [Row(age=2, name='Alice', time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()), ), 103 | Row(age=5, name='Bob', time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()))] 104 | ) 105 | df.write.json(".tmp/wonderland/") 106 | self.assertDictEqual( 107 | get_folder_content(".tmp/wonderland"), 108 | { 109 | '_SUCCESS': [], 110 | 'part-00000-8447389540241120843.json': [ 111 | f'{{"age":2,"name":"Alice","time":"2017-01-01T00:00:00.000{self.tz}"}}\n', 112 | f'{{"age":5,"name":"Bob","time":"2014-03-02T00:00:00.000{self.tz}"}}\n', 113 | ], 114 | } 115 | ) 116 | 117 | def test_write_nested_rows_to_json(self): 118 | df = spark.createDataFrame([ 119 | Row(age=2, name='Alice', animals=[ 120 | Row(name="Chessur", type="cat"), 121 | Row(name="The White Rabbit", type="Rabbit")]), 122 | Row(age=5, name='Bob', animals=[]) 123 | ]) 124 | df.write.json(".tmp/wonderland/") 125 | self.assertDictEqual( 126 | get_folder_content(".tmp/wonderland"), 127 | { 128 | '_SUCCESS': [], 129 | 'part-00000-2819354714706678872.json': [ 130 | '{"age":2,"animals":[' 131 | '{"name":"Chessur","type":"cat"},' 132 | '{"name":"The White Rabbit","type":"Rabbit"}' 133 | '],"name":"Alice"}\n', 134 | '{"age":5,"animals":[],"name":"Bob"}\n', 135 | ], 136 | } 137 | ) 138 | -------------------------------------------------------------------------------- /pysparkling/sql/utils.py: -------------------------------------------------------------------------------- 1 | class CapturedException(Exception): 2 | pass 3 | 4 | 5 | class AnalysisException(CapturedException): 6 | pass 7 | 8 | 9 | class ParseException(CapturedException): 10 | pass 11 | 12 | 13 | class IllegalArgumentException(CapturedException): 14 | pass 15 | 16 | 17 | def require_minimum_pandas_version(): 18 | """ Raise an ImportError if Pandas version is < 0.23.2 19 | """ 20 | minimum_pandas_version = (0, 23, 2) 21 | 22 | # pandas is an optional dependency 23 | # pylint: disable=import-outside-toplevel 24 | try: 25 | import pandas 26 | have_pandas = True 27 | except ImportError: 28 | have_pandas = False 29 | 30 | if not have_pandas: 31 | raise ImportError( 32 | f"Pandas >= {minimum_pandas_version} must be installed; however none were found." 33 | ) 34 | if parse_pandas_version(pandas.__version__) < minimum_pandas_version: 35 | raise ImportError( 36 | f"Pandas >= {minimum_pandas_version} must be installed;" 37 | f" however, your version was {pandas.__version__}." 
38 | ) 39 | 40 | 41 | def parse_pandas_version(version): 42 | return tuple(int(part) for part in version.split(".")) 43 | -------------------------------------------------------------------------------- /pysparkling/storagelevel.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | __all__ = ["StorageLevel"] 19 | 20 | 21 | class StorageLevel: 22 | 23 | """ 24 | Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory, 25 | whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory 26 | in a JAVA-specific serialized format, and whether to replicate the RDD partitions on multiple 27 | nodes. Also contains static constants for some commonly used storage levels, MEMORY_ONLY. 28 | Since the data is always serialized on the Python side, all the constants use the serialized 29 | formats. 30 | """ 31 | 32 | def __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication=1): 33 | self.useDisk = useDisk 34 | self.useMemory = useMemory 35 | self.useOffHeap = useOffHeap 36 | self.deserialized = deserialized 37 | self.replication = replication 38 | 39 | def __repr__(self): 40 | return ( 41 | f"StorageLevel({self.useDisk}, {self.useMemory}, {self.useOffHeap}, {self.deserialized}, " 42 | f"{self.replication})" 43 | ) 44 | 45 | def __str__(self): 46 | result = "" 47 | result += "Disk " if self.useDisk else "" 48 | result += "Memory " if self.useMemory else "" 49 | result += "OffHeap " if self.useOffHeap else "" 50 | result += "Deserialized " if self.deserialized else "Serialized " 51 | result += f"{self.replication}x Replicated" 52 | return result 53 | 54 | 55 | StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False) 56 | StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2) 57 | StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, False) 58 | StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2) 59 | StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, False) 60 | StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2) 61 | StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1) 62 | -------------------------------------------------------------------------------- /pysparkling/streaming/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .context import StreamingContext 3 | from .dstream import DStream 4 | 5 | __all__ = ['StreamingContext', 'DStream'] 6 | -------------------------------------------------------------------------------- /pysparkling/streaming/filestream.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ..fileio import File 4 | from ..rdd import EmptyRDD 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class FileTextStreamDeserializer: 10 | def __init__(self, context): 11 | self.context = context 12 | 13 | def __call__(self, path): 14 | if path is None: 15 | return EmptyRDD(self.context) 16 | 17 | return self.context.textFile(path) 18 | 19 | 20 | class FileBinaryStreamDeserializer: 21 | def __init__(self, context, recordLength=None): 22 | self.context = context 23 | self.record_length = recordLength 24 | 25 | def __call__(self, path): 26 | if path is None: 27 | return EmptyRDD(self.context) 28 | 29 | return self.context.binaryRecords( 30 | path, recordLength=self.record_length) 31 | 32 | 33 | class FileStream: 34 | def __init__(self, path, process_all=False): 35 | self.path = path 36 | self.files_done = set() 37 | if not process_all: 38 | self.files_done = set(File.resolve_filenames(self.path)) 39 | 40 | def get(self): 41 | files = [fn for fn in File.resolve_filenames(self.path) 42 | if fn not in self.files_done] 43 | if not files: 44 | return None 45 | 46 | self.files_done |= set(files) 47 | return ','.join(files) 48 | 49 | def stop(self): 50 | pass 51 | -------------------------------------------------------------------------------- /pysparkling/streaming/queuestream.py: -------------------------------------------------------------------------------- 1 | from ..rdd import EmptyRDD, RDD 2 | 3 | 4 | class QueueStreamDeserializer: 5 | def __init__(self, context): 6 | self.context = context 7 | 8 | def ensure_rdd(self, data): 9 | if data is None: 10 | return EmptyRDD(self.context) 11 | if isinstance(data, RDD): 12 | return data 13 | return self.context.parallelize(data) 14 | 15 | def __call__(self, data): 16 | return self.ensure_rdd(data) 17 | 18 | 19 | class QueueStream: 20 | def __init__(self, queue, oneAtATime=True, default=None): 21 | self.queue = queue 22 | self.oneAtATime = oneAtATime 23 | self.default = default 24 | 25 | def get(self): 26 | q_size = self.queue.qsize() 27 | 28 | if q_size == 0: 29 | return self.default 30 | 31 | if self.oneAtATime: 32 | return self.queue.get_nowait() 33 | 34 | return [e for _ in range(q_size) for e in self.queue.get_nowait()] 35 | -------------------------------------------------------------------------------- /pysparkling/streaming/tcpstream.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import struct 3 | 4 | from tornado.gen import coroutine, moment 5 | from tornado.iostream import StreamClosedError 6 | from tornado.tcpserver import TCPServer 7 | 8 | from ..rdd import EmptyRDD 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class TCPDeserializer: 14 | def __init__(self, context): 15 | self.context = context 16 | 17 | def __call__(self, data): 18 | if data is None: 19 | return EmptyRDD(self.context) 20 | 21 | return self.context.parallelize(data) 22 | 23 | 24 | class TCPTextStream(TCPServer): 25 | def __init__(self, delimiter=b'\n'): 26 | super().__init__() 27 | self.delimiter = delimiter 28 | self.buffer = [] 29 | 30 | def get(self): 31 | if not self.buffer: 32 | return [] 33 | 34 | buffer_ = self.buffer 35 | self.buffer = [] 36 | return buffer_ 37 | 38 | @coroutine 39 | def handle_stream(self, stream, address): 40 | try: 41 | while True: 42 | for _ in range(100): 43 | data = yield stream.read_until(self.delimiter) 44 | self.buffer.append(data[:-1].decode('utf8')) 
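                # Added note (not in the original): after up to 100 delimited
                # messages have been buffered, the `yield moment` below hands
                # control back to Tornado's IOLoop so other callbacks (for example
                # whatever drains self.buffer via get()) can run before this
                # stream is read again.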
45 | yield moment 46 | except StreamClosedError: 47 | pass 48 | 49 | 50 | class TCPBinaryStream(TCPServer): 51 | """Consumes binary messages from a TCP socket. 52 | 53 | :param length: An int or string. 54 | """ 55 | 56 | def __init__(self, length=None): 57 | super().__init__() 58 | self.length = length 59 | self.buffer = [] 60 | 61 | self.prefix_length = None 62 | if not isinstance(self.length, int): 63 | self.prefix_length = struct.calcsize(self.length) 64 | 65 | def get(self): 66 | if not self.buffer: 67 | return [] 68 | 69 | buffer_ = self.buffer 70 | self.buffer = [] 71 | return buffer_ 72 | 73 | @coroutine 74 | def handle_stream(self, stream, address): 75 | try: 76 | while True: 77 | for _ in range(100): 78 | if self.prefix_length: 79 | prefix = yield stream.read_bytes(self.prefix_length) 80 | message_length = struct.unpack(self.length, prefix)[0] 81 | else: 82 | message_length = self.length 83 | data = yield stream.read_bytes(message_length) 84 | self.buffer.append(data) 85 | yield moment 86 | except StreamClosedError: 87 | return 88 | -------------------------------------------------------------------------------- /pysparkling/task_context.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | 5 | 6 | class TaskContext: 7 | def __init__(self, cache_manager, catch_exceptions, 8 | stage_id=0, partition_id=0, max_retries=3, retry_wait=0): 9 | self.cache_manager = cache_manager 10 | self.catch_exceptions = catch_exceptions 11 | self.stage_id = stage_id 12 | self.partition_id = partition_id 13 | self.max_retries = max_retries 14 | self.retry_wait = retry_wait 15 | 16 | self.attempt_number = 0 17 | self.is_completed = False 18 | self.is_running_locally = True 19 | self.task_completion_listeners = [] 20 | 21 | def _create_child(self): 22 | return TaskContext(self.cache_manager, self.catch_exceptions, 23 | stage_id=self.stage_id + 1, 24 | partition_id=self.partition_id) 25 | 26 | def attemptNumber(self): 27 | return self.attempt_number 28 | 29 | def partitionId(self): 30 | return self.partition_id 31 | 32 | def stageId(self): 33 | return self.stage_id 34 | -------------------------------------------------------------------------------- /pysparkling/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/__init__.py -------------------------------------------------------------------------------- /pysparkling/tests/data.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/data.7z -------------------------------------------------------------------------------- /pysparkling/tests/data.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/data.tar.gz -------------------------------------------------------------------------------- /pysparkling/tests/pyspark/key_value.txt.bz2/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/pyspark/key_value.txt.bz2/_SUCCESS 
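The tests/pyspark/key_value.txt* entries here are fixture directories written by a real PySpark job (see the create_key_value_txt() helper further down, apparently from scripts/pyspark_comparisons.py). A minimal sketch of reading one of them back with pysparkling, assuming the repository root as the working directory:

    import pysparkling

    sc = pysparkling.Context()
    rdd = sc.textFile('pysparkling/tests/pyspark/key_value.txt/part-00000')
    print(rdd.collect())  # expected: ["('a', 1)", "('b', 2)"] -- lines come back as plain strings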
-------------------------------------------------------------------------------- /pysparkling/tests/pyspark/key_value.txt.bz2/part-00000.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/pyspark/key_value.txt.bz2/part-00000.bz2 -------------------------------------------------------------------------------- /pysparkling/tests/pyspark/key_value.txt.gz/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/pyspark/key_value.txt.gz/_SUCCESS -------------------------------------------------------------------------------- /pysparkling/tests/pyspark/key_value.txt.gz/part-00000.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/pyspark/key_value.txt.gz/part-00000.gz -------------------------------------------------------------------------------- /pysparkling/tests/pyspark/key_value.txt/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/pysparkling/tests/pyspark/key_value.txt/_SUCCESS -------------------------------------------------------------------------------- /pysparkling/tests/pyspark/key_value.txt/part-00000: -------------------------------------------------------------------------------- 1 | ('a', 1) 2 | ('b', 2) 3 | -------------------------------------------------------------------------------- /pysparkling/tests/test_broadcast.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pysparkling 4 | 5 | 6 | class BroadcastTest(unittest.TestCase): 7 | def setUp(self) -> None: 8 | self.context = pysparkling.Context() 9 | 10 | def testSimple(self): 11 | b = self.context.broadcast([1, 2, 3, 4, 5]) 12 | self.assertEqual(b.value, [1, 2, 3, 4, 5]) 13 | 14 | def testAppendFails(self): 15 | b = self.context.broadcast([1, 2, 3, 4, 5]) 16 | with self.assertRaises(AttributeError): 17 | b.value += [1] # type: ignore 18 | -------------------------------------------------------------------------------- /pysparkling/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import pysparkling 5 | 6 | 7 | class Manip: 8 | def __init__(self): 9 | self.count = 0 10 | 11 | def trivial_manip_with_debug(self, e): 12 | self.count += 1 13 | print(f'manipulating {e}') 14 | return e 15 | 16 | 17 | def test_cache_empty_partition(): 18 | m = Manip() 19 | 20 | c = pysparkling.Context() 21 | rdd = c.parallelize(range(10), 2) 22 | rdd = rdd.map(m.trivial_manip_with_debug) 23 | rdd = rdd.filter(lambda e: e > 6).cache() 24 | print(rdd.collect()) 25 | print(rdd.collect()) 26 | 27 | print(f'count of map executions: {m.count}') 28 | assert m.count == 10 29 | 30 | 31 | def test_timed_cache(): 32 | m = Manip() 33 | 34 | # create a timed cache manager 35 | cm = pysparkling.TimedCacheManager(timeout=1.0) 36 | 37 | # create a cache entry 38 | c = pysparkling.Context(cache_manager=cm) 39 | rdd = c.parallelize(range(10), 2) 40 | rdd = rdd.map(m.trivial_manip_with_debug).cache() 41 | print(rdd.collect()) 
42 | # make sure the cache is working 43 | count_before = m.count 44 | print(rdd.collect()) 45 | count_after = m.count 46 | assert count_before == count_after 47 | 48 | # wait to have the cache expire 49 | time.sleep(1.5) 50 | cm.gc() 51 | print(rdd.collect()) 52 | assert m.count > count_after 53 | 54 | 55 | if __name__ == '__main__': 56 | logging.basicConfig(level=logging.DEBUG) 57 | # test_cache_empty_partition() 58 | test_timed_cache() 59 | -------------------------------------------------------------------------------- /pysparkling/tests/test_context.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import unittest 3 | 4 | import pysparkling 5 | 6 | 7 | class Context(unittest.TestCase): 8 | def test_broadcast(self): 9 | b = pysparkling.Context().broadcast([1, 2, 3]) 10 | self.assertEqual(b.value[0], 1) 11 | 12 | def test_lock1(self): 13 | """Should not be able to create a new RDD inside a map operation.""" 14 | sc = pysparkling.Context() 15 | self.assertRaises( 16 | pysparkling.exceptions.ContextIsLockedException, 17 | lambda: (sc 18 | .parallelize(range(5)) 19 | .map(lambda _: sc.parallelize([1])) 20 | .collect()) 21 | ) 22 | 23 | def test_lock2(self): 24 | """Should not be able to create RDDs containing RDDs.""" 25 | sc = pysparkling.Context() 26 | 27 | def parallelize_in_parallelize(): 28 | o = sc.parallelize(sc.parallelize(range(x)) for x in range(5)) 29 | print(o.map(lambda x: x.collect()).collect()) 30 | 31 | self.assertRaises( 32 | pysparkling.exceptions.ContextIsLockedException, 33 | parallelize_in_parallelize 34 | ) 35 | 36 | def test_parallelize_single_element(self): 37 | my_rdd = pysparkling.Context().parallelize([7], 100) 38 | self.assertEqual(my_rdd.collect(), [7]) 39 | 40 | def test_parallelize_matched_elements(self): 41 | my_rdd = pysparkling.Context().parallelize([1, 2, 3, 4, 5], 5) 42 | self.assertEqual(my_rdd.collect(), [1, 2, 3, 4, 5]) 43 | 44 | def test_parallelize_empty_partitions_at_end(self): 45 | my_rdd = pysparkling.Context().parallelize(range(3529), 500) 46 | print(my_rdd.getNumPartitions()) 47 | my_rdd.foreachPartition(lambda p: print(sum(1 for _ in p))) 48 | self.assertEqual(my_rdd.getNumPartitions(), 500) 49 | self.assertEqual(my_rdd.count(), 3529) 50 | 51 | def test_retry(self): 52 | 53 | class EverySecondCallFails: 54 | def __init__(self): 55 | self.attempt = 0 56 | 57 | def __call__(self, value): 58 | self.attempt += 1 59 | if self.attempt % 2 == 1: 60 | raise Exception 61 | return value 62 | 63 | data = list(range(6)) 64 | rdd = pysparkling.Context().parallelize(data, 3) 65 | result = rdd.mapPartitions(EverySecondCallFails()).collect() 66 | self.assertEqual(result, data) 67 | 68 | def test_union(self): 69 | sc = pysparkling.Context() 70 | rdd1 = sc.parallelize(['Hello']) 71 | rdd2 = sc.parallelize(['World']) 72 | union = sc.union([rdd1, rdd2]).collect() 73 | print(union) 74 | self.assertEqual(union, ['Hello', 'World']) 75 | 76 | def test_version(self): 77 | self.assertIsInstance(pysparkling.Context().version, str) 78 | 79 | 80 | if __name__ == '__main__': 81 | logging.basicConfig(level=logging.DEBUG) 82 | Context().test_retry() 83 | -------------------------------------------------------------------------------- /pysparkling/tests/test_resolve_filenames.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from pysparkling.fileio import File 6 | 7 | CURRENT_FILE_LOCATION = __file__ 8 | 9 | 10 | class MockedHdfsClient: 11 | 
def list(self, path, status): 12 | if path == "/user/username/": 13 | return [ 14 | ("input", {"type": "DIRECTORY"}), 15 | ("output", {"type": "DIRECTORY"}) 16 | ] 17 | if path in ('/user/username/input', '/user/username/input/'): 18 | return [ 19 | ("part-00001.gz", {"type": "FILE"}), 20 | ("part-00002.gz", {"type": "FILE"}), 21 | ("_SUCCESS", {"type": "FILE"}) 22 | ] 23 | raise NotImplementedError(f"Return value not mocked for '{path}'") 24 | 25 | 26 | class MockedS3Bucket: 27 | def list(self, *args, **kwargs): 28 | return [ 29 | MockedS3Key("user/username/input/part-00001.gz"), 30 | MockedS3Key("user/username/input/part-00002.gz"), 31 | MockedS3Key("user/username/input/_SUCCESS"), 32 | ] 33 | 34 | 35 | class MockedS3Connection: 36 | def get_bucket(self, *args, **kwargs): 37 | return MockedS3Bucket() 38 | 39 | 40 | class MockedS3Key: 41 | def __init__(self, name): 42 | self.name = name 43 | 44 | 45 | def test_local_1(): 46 | filenames = File.resolve_filenames( 47 | f'{os.path.dirname(CURRENT_FILE_LOCATION)}{os.path.sep}*' 48 | ) 49 | assert CURRENT_FILE_LOCATION in filenames 50 | 51 | 52 | def test_local_2(): 53 | filenames = File.resolve_filenames(CURRENT_FILE_LOCATION) 54 | assert filenames == [CURRENT_FILE_LOCATION] 55 | 56 | 57 | @pytest.mark.skipif(not os.getenv('AWS_ACCESS_KEY_ID'), reason='no AWS env') 58 | def test_s3_1(): 59 | filenames = File.resolve_filenames( 60 | 's3n://aws-publicdatasets/common-crawl/' 61 | 'crawl-data/CC-MAIN-2015-11/warc.paths.*' 62 | ) 63 | print(filenames) 64 | assert ('s3n://aws-publicdatasets/common-crawl/' 65 | 'crawl-data/CC-MAIN-2015-11/warc.paths.gz' in filenames) 66 | 67 | 68 | def test_hdfs_resolve_filenames_with_wildcard(): 69 | # hdfs is an optional dependency 70 | # pylint: disable=import-outside-toplevel 71 | from pysparkling.fileio.fs import Hdfs 72 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path")) 73 | 74 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/part-*.gz") 75 | print(filenames) 76 | assert filenames == [ 77 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz', 78 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz' 79 | ] 80 | 81 | 82 | def test_hdfs_resolve_filenames_with_folder_path(): 83 | # hdfs is an optional dependency 84 | # pylint: disable=import-outside-toplevel 85 | from pysparkling.fileio.fs import Hdfs 86 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path")) 87 | 88 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input") 89 | print(filenames) 90 | assert filenames == [ 91 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz', 92 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz' 93 | ] 94 | 95 | 96 | def test_hdfs_resolve_filenames_with_folder_path_and_trailing_slash(): 97 | # hdfs is an optional dependency 98 | # pylint: disable=import-outside-toplevel 99 | from pysparkling.fileio.fs import Hdfs 100 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path")) 101 | 102 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/") 103 | print(filenames) 104 | assert filenames == [ 105 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz', 106 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz' 107 | ] 108 | 109 | 110 | def test_hdfs_resolve_filenames_with_file_path(): 111 | # hdfs is an optional dependency 112 | # pylint: disable=import-outside-toplevel 
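    # Added note (not in the original): as in the other Hdfs tests above,
    # Hdfs.client_and_path is replaced below with a lambda returning MockedHdfsClient,
    # so the assertion runs entirely against mocked listings and no real HDFS
    # cluster is contacted.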
113 | from pysparkling.fileio.fs import Hdfs 114 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path")) 115 | 116 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/part-00001.gz") 117 | print(filenames) 118 | assert filenames == [ 119 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz' 120 | ] 121 | 122 | 123 | def test_s3_resolve_filenames(): 124 | # boto is an optional dependency 125 | # pylint: disable=import-outside-toplevel 126 | from pysparkling.fileio.fs import S3 127 | S3._get_conn = classmethod(lambda *args, **kwargs: MockedS3Connection()) 128 | 129 | filenames = S3.resolve_filenames("s3://bucket-name/user/username/input/part-*.gz") 130 | print(filenames) 131 | assert filenames == [ 132 | 's3://bucket-name/user/username/input/part-00001.gz', 133 | 's3://bucket-name/user/username/input/part-00002.gz' 134 | ] 135 | 136 | 137 | if __name__ == '__main__': 138 | test_local_1() 139 | test_local_2() 140 | test_s3_1() 141 | test_hdfs_resolve_filenames_with_folder_path() 142 | test_hdfs_resolve_filenames_with_folder_path_and_trailing_slash() 143 | test_hdfs_resolve_filenames_with_file_path() 144 | test_hdfs_resolve_filenames_with_wildcard() 145 | test_s3_resolve_filenames() 146 | -------------------------------------------------------------------------------- /pysparkling/tests/test_sample.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pysparkling 4 | 5 | 6 | def test_trivial_sample(): 7 | rdd = pysparkling.Context().parallelize(range(1000), 1000) 8 | sampled = rdd.sample(False, 0.01, 42).collect() 9 | print(sampled) 10 | assert sampled == [97, 164, 294, 695, 807, 864, 911] 11 | 12 | 13 | if __name__ == '__main__': 14 | logging.basicConfig(level=logging.DEBUG) 15 | test_trivial_sample() 16 | -------------------------------------------------------------------------------- /pysparkling/tests/test_stat_counter.py: -------------------------------------------------------------------------------- 1 | import pysparkling 2 | from pysparkling.sql.functions import col 3 | from pysparkling.sql.types import IntegerType, Row, StructField, StructType 4 | from pysparkling.stat_counter import ColumnStatHelper 5 | 6 | 7 | def test_mean(): 8 | d = [1, 4, 9, 160] 9 | s = pysparkling.StatCounter(d) 10 | assert sum(d) / len(d) == s.mean() 11 | 12 | 13 | def test_column_stat_helper(): 14 | """ 15 | Expected quantile values come from use of org.apache.spark.sql.catalyst.util.QuantileSummaries 16 | """ 17 | schema = StructType([StructField("value", IntegerType())]) 18 | helper = ColumnStatHelper(col("value")) 19 | for i in range(1, 100001): 20 | helper.merge(Row(value=i), schema) 21 | helper.finalize() 22 | assert helper.count == 100000 23 | assert helper.min == 1 24 | assert helper.max == 100000 25 | assert helper.mean == 50000.5 26 | assert helper.stddev == 28867.65779668774 # sample standard deviation 27 | assert helper.get_quantile(0) == 1 28 | assert helper.get_quantile(0.25) == 24998 29 | assert helper.get_quantile(0.5) == 50000 30 | assert helper.get_quantile(0.75) == 74993 31 | assert helper.get_quantile(1) == 100000 32 | 33 | 34 | if __name__ == '__main__': 35 | test_mean() 36 | test_column_stat_helper() 37 | -------------------------------------------------------------------------------- /pysparkling/tests/test_streaming_files.py: -------------------------------------------------------------------------------- 1 | import tornado.testing 2 | 3 | 
import pysparkling 4 | 5 | 6 | class TextFile(tornado.testing.AsyncTestCase): 7 | 8 | def test_connect(self): 9 | sc = pysparkling.Context() 10 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 11 | 12 | result = [] 13 | ( 14 | ssc.textFileStream('LICENS*', process_all=True) 15 | .count() 16 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0])) 17 | ) 18 | 19 | ssc.start() 20 | ssc.awaitTermination(timeout=0.3) 21 | self.assertEqual(sum(result), 44) 22 | 23 | def test_save(self): 24 | sc = pysparkling.Context() 25 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 26 | 27 | ( 28 | ssc.textFileStream('LICENS*') 29 | .count() 30 | .saveAsTextFiles('tests/textout/') 31 | ) 32 | 33 | def test_save_gz(self): 34 | sc = pysparkling.Context() 35 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 36 | 37 | ( 38 | ssc.textFileStream('LICENS*') 39 | .count() 40 | .saveAsTextFiles('tests/textout/', suffix='.gz') 41 | ) 42 | 43 | 44 | class BinaryFile(tornado.testing.AsyncTestCase): 45 | 46 | def test_read_file(self): 47 | sc = pysparkling.Context() 48 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 49 | 50 | result = [] 51 | ( 52 | ssc.fileBinaryStream('LICENS*', process_all=True) 53 | .count() 54 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0])) 55 | ) 56 | 57 | ssc.start() 58 | ssc.awaitTermination(timeout=0.3) 59 | self.assertEqual(sum(result), 1) 60 | 61 | def test_read_chunks(self): 62 | sc = pysparkling.Context() 63 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 64 | 65 | result = [] 66 | ( 67 | ssc.fileBinaryStream('LICENS*', recordLength=40, process_all=True) 68 | .count() 69 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0])) 70 | ) 71 | 72 | ssc.start() 73 | ssc.awaitTermination(timeout=0.3) 74 | self.assertEqual(sum(result), 54) 75 | -------------------------------------------------------------------------------- /pysparkling/tests/test_streaming_queue.py: -------------------------------------------------------------------------------- 1 | import tornado.testing 2 | 3 | import pysparkling 4 | 5 | 6 | class TestCount(tornado.testing.AsyncTestCase): 7 | 8 | def test_count(self): 9 | sc = pysparkling.Context() 10 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 11 | 12 | result = [] 13 | ( 14 | ssc.queueStream([range(20), ['a', 'b'], ['c']]) 15 | .count() 16 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0])) 17 | ) 18 | 19 | ssc.start() 20 | ssc.awaitTermination(timeout=0.35) 21 | self.assertEqual(sum(result), 23) 22 | 23 | def test_groupByKey(self): 24 | sc = pysparkling.Context() 25 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 26 | 27 | result = [] 28 | ( 29 | ssc.queueStream([[('a', 5), ('b', 8), ('a', 2)], 30 | [('a', 2), ('b', 3)]]) 31 | .groupByKey().mapPartitions(sorted).mapValues(sorted) 32 | .foreachRDD(lambda rdd: result.append(rdd.collect())) 33 | ) 34 | 35 | ssc.start() 36 | ssc.awaitTermination(timeout=0.25) 37 | self.assertEqual( 38 | result, [[('a', [2, 5]), ('b', [8])], [('a', [2]), ('b', [3])]]) 39 | 40 | def test_mapValues(self): 41 | sc = pysparkling.Context() 42 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 43 | 44 | result = [] 45 | ( 46 | ssc.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]]) 47 | .mapValues(sorted) 48 | .foreachRDD(lambda rdd: result.append(rdd.collect())) 49 | ) 50 | 51 | ssc.start() 52 | ssc.awaitTermination(timeout=0.15) 53 | self.assertEqual(result, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]]) 54 | 
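
The queue-stream tests above rely on key/value transformations (groupByKey, mapValues) that pysparkling also provides directly on RDDs. The snippet below is an illustrative sketch, not a file from the repository, and assumes that RDD.groupByKey and RDD.mapValues mirror their PySpark counterparts; it reproduces the first batch of test_groupByKey without a StreamingContext.

    # Sketch only: the same grouping pattern on a plain pysparkling RDD.
    import pysparkling

    sc = pysparkling.Context()
    grouped = (
        sc.parallelize([('a', 5), ('b', 8), ('a', 2)])
        .groupByKey()       # -> (key, values) pairs
        .mapValues(sorted)  # sort each value list for a deterministic result
        .collect()
    )
    print(sorted(grouped))  # expected: [('a', [2, 5]), ('b', [8])]
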
-------------------------------------------------------------------------------- /pysparkling/tests/test_streaming_tcp.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from contextlib import closing 3 | import struct 4 | 5 | import tornado.gen 6 | import tornado.tcpclient 7 | import tornado.testing 8 | 9 | import pysparkling 10 | 11 | 12 | class TCPTextTest(tornado.testing.AsyncTestCase): 13 | @tornado.gen.coroutine 14 | def client(self): 15 | client = tornado.tcpclient.TCPClient() 16 | for v in range(20): 17 | stream = yield client.connect('127.0.0.1', 8123) 18 | with closing(stream): 19 | stream.write(f'a = {v}\n'.encode('utf8')) 20 | client.close() 21 | 22 | def test_connect(self): 23 | sc = pysparkling.Context() 24 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 25 | 26 | counter = Counter() 27 | ( 28 | ssc.socketTextStream('127.0.0.1', 8123) 29 | .foreachRDD(lambda rdd: 30 | counter.update(''.join(rdd.collect())) 31 | if rdd.collect() else None) 32 | ) 33 | self.client() 34 | 35 | ssc.start() 36 | ssc.awaitTermination(timeout=0.3) 37 | self.assertEqual(counter['a'], 20) 38 | 39 | 40 | class TCPBinaryFixedLengthTest(tornado.testing.AsyncTestCase): 41 | @tornado.gen.coroutine 42 | def client(self): 43 | client = tornado.tcpclient.TCPClient() 44 | stream = yield client.connect('127.0.0.1', 8124) 45 | with closing(stream): 46 | stream.write(b'hello') 47 | client.close() 48 | 49 | def test_main(self): 50 | sc = pysparkling.Context() 51 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1) 52 | 53 | counter = Counter() 54 | ( 55 | ssc.socketBinaryStream('127.0.0.1', 8124, length=5) 56 | .foreachRDD(lambda rdd: counter.update(rdd.collect())) 57 | ) 58 | self.client() 59 | 60 | ssc.start() 61 | ssc.awaitTermination(timeout=0.3) 62 | self.assertEqual(counter[b'hello'], 1) 63 | 64 | 65 | class TCPBinaryUIntLengthTest(tornado.testing.AsyncTestCase): 66 | @tornado.gen.coroutine 67 | def client(self): 68 | client = tornado.tcpclient.TCPClient() 69 | stream = yield client.connect('127.0.0.1', 8125) 70 | with closing(stream): 71 | stream.write(struct.pack('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') 30 | print(r) 31 | print(SC.textFile('tests/*.py').count()) 32 | 33 | 34 | def create_key_value_txt(): 35 | r = SC.parallelize([('a', 1), ('b', 2)], 1) 36 | r.saveAsTextFile('tests/pyspark/key_value.txt') 37 | r.saveAsHadoopFile( 38 | "tests/pyspark/key_value.txt.bz2", 39 | "org.apache.hadoop.mapred.TextOutputFormat", 40 | compressionCodecClass="org.apache.hadoop.io.compress.BZip2Codec", 41 | ) 42 | r.saveAsHadoopFile( 43 | "tests/pyspark/key_value.txt.gz", 44 | "org.apache.hadoop.mapred.TextOutputFormat", 45 | compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec", 46 | ) 47 | # r.saveAsHadoopFile( 48 | # "tests/pyspark/key_value.txt.lzo", 49 | # "org.apache.hadoop.mapred.TextOutputFormat", 50 | # compressionCodecClass="com.hadoop.compression.lzo.LzopCodec", 51 | # ) 52 | 53 | r_txt = SC.textFile('tests/pyspark/key_value.txt') 54 | print(r_txt.collect()) 55 | r_gz = SC.textFile('tests/pyspark/key_value.txt.gz') 56 | print(r_gz.collect()) 57 | r_bz2 = SC.textFile('tests/pyspark/key_value.txt.bz2') 58 | print(r_bz2.collect()) 59 | 60 | 61 | def create_pickled_files(): 62 | rdd = SC.parallelize(['hello', 'world', 1, 2], 2) 63 | rdd.saveAsPickleFile('tests/pyspark/mixed.pickle') 64 | rdd.saveAsPickleFile('tests/pyspark/mixed_batched.pickle', 1) 65 | 66 | 67 | def stat(): 68 | d = [1, 4, 9, 16, 25, 36] 69 | s1 = 
SC.parallelize(d).stats() 70 | s2 = SC.parallelize(d, 3).stats() 71 | print(str(s1)) 72 | print(str(s2)) 73 | 74 | 75 | def partition_by(): 76 | rdd = SC.parallelize(range(20), 2).map(lambda x: (x, x)) 77 | r = rdd.partitionBy(2).collect() 78 | print('>>>>>>', r) 79 | 80 | 81 | if __name__ == '__main__': 82 | # simple_textFile() 83 | # lazy_execution() 84 | # count_lines() 85 | # create_key_value_txt() 86 | # create_pickled_files() 87 | # stat() 88 | partition_by() 89 | -------------------------------------------------------------------------------- /scripts/pyspark_streaming.py: -------------------------------------------------------------------------------- 1 | """Explore PySpark API. 2 | 3 | Run with `spark-submit scripts/pyspark_streaming.py`. 4 | """ 5 | import time 6 | 7 | import pyspark.streaming 8 | 9 | 10 | def simple_queue(ssc): 11 | ssc.queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=False).pprint() 12 | 13 | 14 | def simple_queue_count(ssc): 15 | (ssc 16 | .queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=False) 17 | .count() 18 | .foreachRDD(lambda t, r: print('>>>>>>>>>>>>>>', t, r.collect()))) 19 | 20 | 21 | def simple_queue_one_at_a_time(ssc): 22 | ssc.queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=True).pprint() 23 | 24 | 25 | def save_text(ssc): 26 | (ssc 27 | .queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=True) 28 | .saveAsTextFiles('scripts/textout/')) 29 | 30 | 31 | def window(ssc): 32 | (ssc 33 | .queueStream([[1], [2], [3], [4], [5], [6]]) 34 | .window(3) 35 | .foreachRDD(lambda rdd: print('>>>>>>>>>', rdd.collect()))) 36 | 37 | 38 | def updateStateByKey(ssc): 39 | def processStateUpdateByKey(input_stream, state): 40 | print('i', input_stream) 41 | print('s', state) 42 | return state if not input_stream else input_stream[-1] 43 | 44 | ssc.checkpoint('checkpoints/') 45 | (ssc 46 | .queueStream([[('a', 1), ('b', 3)], [('a', 2), ('a', 5), ('c', 4)]]) 47 | .updateStateByKey(processStateUpdateByKey) 48 | .pprint() 49 | ) 50 | 51 | 52 | def stream_log(ssc): 53 | ssc.textFileStream('/var/log/system.log*').pprint() 54 | 55 | 56 | def stream_queue_default(ssc): 57 | (ssc 58 | .queueStream([[4], [2]], default=['placeholder']) 59 | .foreachRDD(lambda rdd: print(rdd.collect()))) 60 | 61 | 62 | def join_with_repeated_keys(ssc): 63 | s1 = ssc.queueStream([[('a', 4), ('a', 2)], [('c', 7)]]) 64 | s2 = ssc.queueStream([[('b', 1), ('b', 3)], [('c', 8)]]) 65 | ( 66 | s1.fullOuterJoin(s2) 67 | .foreachRDD(lambda rdd: print(sorted(rdd.collect()))) 68 | ) 69 | 70 | 71 | def union(ssc): 72 | odd = ssc.queueStream([[1], [3], [5]]) 73 | even = ssc.queueStream([[2], [4], [6]]) 74 | ( 75 | odd.union(even) 76 | .foreachRDD(lambda rdd: print(rdd.collect())) 77 | ) 78 | 79 | 80 | def quiet_logs(sc): 81 | logger = sc._jvm.org.apache.log4j 82 | logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR) 83 | logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR) 84 | 85 | 86 | if __name__ == '__main__': 87 | spark_context = pyspark.SparkContext() 88 | quiet_logs(spark_context) 89 | streaming_context = pyspark.streaming.StreamingContext(spark_context, 1) 90 | 91 | # simple_queue(ssc) 92 | # simple_queue_count(ssc) 93 | # simple_queue_one_at_a_time(ssc) 94 | # save_text(ssc) 95 | # window(ssc) 96 | # updateStateByKey(ssc) 97 | # stream_log(ssc) 98 | # stream_queue_default(ssc) 99 | # join_with_repeated_keys(ssc) 100 | union(streaming_context) 101 | 102 | streaming_context.start() 103 | time.sleep(3.0) 104 | streaming_context.stop(stopGraceFully=True) 105 | 
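
The script above exercises the PySpark streaming API via spark-submit; pysparkling ships a pure-Python StreamingContext with the same queueStream/count/foreachRDD surface, as used in pysparkling/tests/test_streaming_queue.py earlier in this listing. The following is a sketch rather than a repository file; it assumes the 'streaming' extra (tornado) is installed, and the printed counts are only an expected example.

    # Sketch only: the simple_queue_count() example on pysparkling, no Spark needed.
    import pysparkling

    sc = pysparkling.Context()
    ssc = pysparkling.streaming.StreamingContext(sc, 0.1)

    result = []
    (
        ssc.queueStream([range(5), ['a', 'b'], ['c']])
        .count()
        .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
    )

    ssc.start()
    ssc.awaitTermination(timeout=0.35)
    print(result)  # one count per batch, e.g. [5, 2, 1]
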
-------------------------------------------------------------------------------- /scripts/readme_example.py: -------------------------------------------------------------------------------- 1 | from pysparkling import Context 2 | 3 | my_rdd = Context().textFile('tests/*.py') 4 | 5 | unfiltered_count = my_rdd.count() 6 | filtered_count = my_rdd.filter(lambda l: l.startswith("import ")).count() 7 | print(f'In tests/*.py: all lines={unfiltered_count}, with import={filtered_count}') 8 | -------------------------------------------------------------------------------- /scripts/readme_example_common_crawl.py: -------------------------------------------------------------------------------- 1 | from pysparkling import Context 2 | 3 | # read all the paths of warc and wat files of the latest Common Crawl 4 | paths_rdd = Context().textFile( 5 | 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/' 6 | 'warc.paths.*,' 7 | 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/' 8 | 'wat.paths.gz', 9 | ) 10 | 11 | print(paths_rdd.collect()) 12 | -------------------------------------------------------------------------------- /scripts/readme_example_human_microbiome.py: -------------------------------------------------------------------------------- 1 | from pysparkling import Context 2 | 3 | by_subject_rdd = Context().textFile( 4 | 's3n://human-microbiome-project/DEMO/HM16STR/46333/by_subject/*' 5 | ) 6 | print(by_subject_rdd.takeSample(True, 1)) 7 | -------------------------------------------------------------------------------- /scripts/readme_example_word_count.py: -------------------------------------------------------------------------------- 1 | from pysparkling import Context 2 | 3 | counts = ( 4 | Context() 5 | .textFile('README.rst') 6 | .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line)) 7 | .flatMap(lambda line: line.split(' ')) 8 | .map(lambda word: (word, 1)) 9 | .reduceByKey(lambda a, b: a + b) 10 | ) 11 | print(counts.collect()) 12 | -------------------------------------------------------------------------------- /scripts/starcluster_simple.py: -------------------------------------------------------------------------------- 1 | from ipyparallel import Client 2 | 3 | rc = Client('/Users/sven/.starcluster/ipcluster/' 4 | 'SecurityGroup:@sc-smallcluster-us-east-1.json', 5 | sshkey='/Users/sven/.ssh/starclusterkey.rsa', packer='pickle') 6 | 7 | view = rc[:] 8 | results = view.map(lambda x: x ** 30, range(8)) 9 | print(results.get()) 10 | -------------------------------------------------------------------------------- /scripts/tcpperf_client.py: -------------------------------------------------------------------------------- 1 | """Sends tcp messages.""" 2 | import argparse 3 | from contextlib import closing 4 | import json 5 | import random 6 | import struct 7 | import sys 8 | import time 9 | 10 | from tornado import gen 11 | from tornado.ioloop import IOLoop, PeriodicCallback 12 | from tornado.iostream import StreamClosedError 13 | from tornado.tcpclient import TCPClient 14 | 15 | 16 | class Emitter: 17 | def __init__(self, port, n=1000, values=1, duration=3.0): 18 | self.port = port 19 | self.n = n 20 | self.values = values 21 | self.duration = duration 22 | self.message = self.hello 23 | self.i = 0 24 | 25 | self.pcb = None 26 | self.client = None 27 | 28 | def start(self): 29 | self.client = TCPClient() 30 | 31 | self.pcb = PeriodicCallback(self.send, 1000.0 / self.n) 32 | self.pcb.start() 33 | 34 | IOLoop.current().call_later(self.duration + 
0.5, self.stop) 35 | IOLoop.current().start() 36 | IOLoop.clear_current() 37 | 38 | def stop(self): 39 | if self.pcb is not None: 40 | self.pcb.stop() 41 | if self.client is not None: 42 | self.client.close() 43 | IOLoop.current().stop() 44 | 45 | @gen.coroutine 46 | def send(self): 47 | if self.i >= self.duration * self.n * self.values: 48 | self.pcb.stop() 49 | return 50 | 51 | try: 52 | stream = yield self.client.connect('127.0.0.1', self.port) 53 | with closing(stream): 54 | messages = b''.join(self.message() for _ in range(self.values)) 55 | stream.write(messages) 56 | self.i += self.values 57 | except StreamClosedError: 58 | return 59 | 60 | def hello(self): 61 | return b'hello\n' 62 | 63 | def r(self): 64 | s = random.randint(1, 10) 65 | v = s / 10.0 + (1.5 - s / 10.0) * random.random() 66 | return (s, v) 67 | 68 | def text(self): 69 | s, v = self.r() 70 | return f'sensor{s}|{v}\n'.encode('utf8') 71 | 72 | def json(self): 73 | s, v = self.r() 74 | return (json.dumps({f'sensor{s}': v}) + '\n').encode('utf8') 75 | 76 | def bello(self): 77 | # 5 bytes 78 | return b'bello' 79 | 80 | def struct(self): 81 | # 8 bytes 82 | return struct.pack('If', *self.r()) 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser(description=__doc__) 87 | parser.add_argument('-n', type=int, default=1000, 88 | help='number of connections') 89 | parser.add_argument('--values', type=int, default=1, 90 | help='number of values per connection') 91 | parser.add_argument('--port', type=int, default=8123, 92 | help='target port number') 93 | parser.add_argument('--format', default='hello', 94 | help='format of the messages: hello (default), ' 95 | 'text, json, bello (binary hello), ' 96 | 'struct (binary)') 97 | parser.add_argument('--delay', type=float, default=0.5, 98 | help='wait before start sending messages') 99 | args = parser.parse_args() 100 | 101 | time.sleep(args.delay) 102 | e = Emitter(args.port, args.n, args.values) 103 | e.message = getattr(e, args.format) 104 | e.start() 105 | print(f'{sys.argv[0]} sent {e.i} messages') 106 | 107 | 108 | if __name__ == '__main__': 109 | main() 110 | -------------------------------------------------------------------------------- /scripts/tcpperf_connections.csv: -------------------------------------------------------------------------------- 1 | # messages, hello, text, json, bello, struct 2 | 8000, 5505, 5077, 5315, 5128, 5309 3 | 7000, 4641, 4369, 4395, 4846, 4670 4 | 6000, 5238, 4854, 4825, 4639, 5184 5 | 5000, 4329, 4626, 4314, 4270, 4246 6 | 4500, 4064, 4406, 3900, 3980, 4278 7 | 4000, 3681, 3584, 3680, 3710, 3709 8 | 3500, 3378, 3307, 3299, 3404, 3220 9 | 3000, 2888, 2892, 2961, 2890, 2871 10 | 2000, 1978, 1970, 1989, 1972, 1970 11 | 1000, 998, 998, 996, 1001, 998 12 | 100, 100, 100, 100, 101, 100 13 | -------------------------------------------------------------------------------- /scripts/tcpperf_connections.csv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/scripts/tcpperf_connections.csv.pdf -------------------------------------------------------------------------------- /scripts/tcpperf_connections.csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/scripts/tcpperf_connections.csv.png -------------------------------------------------------------------------------- 
/scripts/tcpperf_messages.csv: -------------------------------------------------------------------------------- 1 | # messages, hello, text, json, bello, struct 2 | 100000, 72700, 77500, 77800, 69500, 60000 3 | 90000, 82000, 58600, 58500, 60400, 59000 4 | 80000, 65400, 65900, 56800, 57600, 58300 5 | 70000, 59300, 59900, 56800, 50500, 56500 6 | 60000, 56800, 55100, 55600, 52300, 55400 7 | 50000, 50100, 50300, 50000, 48900, 50000 8 | 45000, 45000, 45300, 45000, 45000, 45100 9 | 40000, 40000, 40100, 40300, 39800, 40000 10 | 30000, 30000, 30000, 30000, 30000, 30000 11 | 20000, 20500, 20000, 20500, 20100, 20300 12 | 10000, 10000, 10000, 10000, 10000, 10000 13 | -------------------------------------------------------------------------------- /scripts/tcpperf_messages.csv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/scripts/tcpperf_messages.csv.pdf -------------------------------------------------------------------------------- /scripts/tcpperf_messages.csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svenkreiss/pysparkling/431df12873bd9cf12af5f085cd7e283aabdcf097/scripts/tcpperf_messages.csv.png -------------------------------------------------------------------------------- /scripts/tcpperf_plot.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import csv 3 | 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | matplotlib.use('Agg') 8 | 9 | 10 | class Plot: 11 | def __init__(self, filename, x_label=None, y_label=None): 12 | self.filename = filename 13 | self.x_label = x_label or 'connections per second' 14 | self.y_label = y_label or 'processed messages per second' 15 | self.record = None 16 | self.data = list(self.read()) 17 | self.frame() 18 | 19 | def read(self): 20 | with open(self.filename, 'r', encoding='utf8') as f: 21 | reader = csv.reader(f) 22 | 23 | try: 24 | first_line = next(reader) 25 | except StopIteration: 26 | return 27 | 28 | self.record = namedtuple('record', [k.strip().replace('# ', '') 29 | for k in first_line]) 30 | for row_raw in reader: 31 | row = self.record._make([int(v) for v in row_raw]) 32 | yield row 33 | 34 | def frame(self): 35 | fig, ax = plt.subplots() 36 | 37 | x = [row.messages for row in self.data] 38 | y = [row.hello for row in self.data] 39 | 40 | # add some text for labels, title and axes ticks 41 | ax.set_xlabel(self.x_label) 42 | ax.set_ylabel(self.y_label) 43 | # ax.set_xticks(x) 44 | ax.set_xlim(-300, max(x) + 300) 45 | ax.set_ylim(-300, max(y) + 2000) 46 | 47 | fig.tight_layout() 48 | 49 | self.fig, self.ax = fig, ax 50 | return self 51 | 52 | def plot(self): 53 | x = [row.messages for row in self.data] 54 | 55 | ideal, = self.ax.plot([0.0, max(x)], [0.0, max(x)], label='ideal', 56 | color='black', linestyle='--', linewidth=1) 57 | graphs = [ 58 | self.ax.plot(x, [getattr(row, k) for row in self.data], label=k) 59 | for k in self.record._fields if k != 'messages' 60 | ] 61 | 62 | self.ax.legend( 63 | handles=[ideal] + [g for g, in graphs], 64 | loc='upper left', 65 | ) 66 | 67 | return self 68 | 69 | def show(self): 70 | plt.show() 71 | return self 72 | 73 | def save(self): 74 | self.fig.savefig(self.filename + '.pdf') 75 | self.fig.savefig(self.filename + '.png', dpi=300) 76 | return self 77 | 78 | 79 | if __name__ == '__main__': 80 | 
Plot('tests/tcpperf_connections.csv').plot().save() 81 | (Plot('tests/tcpperf_messages.csv', 82 | x_label='inbound messages per second') 83 | .plot() 84 | .save()) 85 | -------------------------------------------------------------------------------- /scripts/tcpperf_server.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import struct 7 | import time 8 | 9 | import pysparkling 10 | 11 | N_CONNECTIONS = (100, 1000, 2000, 3000, 3500, 4000, 4500, 5000, 12 | 6000, 7000, 8000) 13 | N_CONNECTIONS_1K = (10, 20, 30, 40, 45, 50, 60, 70, 80, 90, 100) 14 | 15 | 16 | class Server: 17 | def __init__(self, pause=60, values=1, start_port=8123, processes=2): 18 | self.pause = pause 19 | self.values = values 20 | self.port = start_port 21 | self.processes = processes 22 | 23 | def client(self, n=2000, format_='hello'): 24 | for _ in range(self.processes): 25 | os.system( 26 | f'python tests/tcpperf_client.py -n {int(n / self.processes)}' 27 | f' --port {self.port} --format {format_} --values {self.values}' 28 | f' &' 29 | ) 30 | 31 | def _run_process(self, n, to_kv, format_): 32 | c = pysparkling.Context() 33 | stream_c = pysparkling.streaming.StreamingContext(c, 1.0) 34 | 35 | counts = [] 36 | sensor_sums = defaultdict(float) 37 | sensor_squares = defaultdict(float) 38 | sensor_counts = defaultdict(int) 39 | if format_ not in ('bello', 'struct'): 40 | t = stream_c.socketTextStream('localhost', self.port) 41 | else: 42 | length = {'bello': 5, 'struct': 8}[format_] 43 | t = stream_c.socketBinaryStream('localhost', self.port, length) 44 | t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0])) 45 | if to_kv is not None: 46 | def update(rdd): 47 | for k, v in rdd.collect(): 48 | sensor_sums[k] += sum(v) 49 | sensor_squares[k] += sum(vv ** 2 for vv in v) 50 | sensor_counts[k] += len(v) 51 | 52 | t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd)) 53 | 54 | self.client(n, format_=format_) 55 | 56 | stream_c.start() 57 | stream_c.awaitTermination(timeout=5.0) 58 | 59 | return ( 60 | counts, 61 | sensor_sums, 62 | sensor_squares, 63 | sensor_counts 64 | ) 65 | 66 | def run(self, n=2000, to_kv=None, format_='hello'): 67 | counts, sensor_sums, sensor_squares, sensor_counts = self._run_process(n, to_kv, format_) 68 | 69 | result = max(counts) if counts else 0 70 | sensor_expections = { 71 | # expectation of X and X^2 72 | k: (sensor_sums[k] / v, sensor_squares[k] / v) 73 | for k, v in sensor_counts.items() 74 | } 75 | sensors = { 76 | k: (ex_ex2[0], math.sqrt(ex_ex2[1] - ex_ex2[0] ** 2)) 77 | for k, ex_ex2 in sensor_expections.items() 78 | } 79 | print(f'run: n = {n}, counts = {counts}, result = {result}') 80 | print(f'sensors = {sensors}') 81 | time.sleep(self.pause) 82 | self.port += 1 83 | return result 84 | 85 | 86 | def main(): 87 | logging.basicConfig(level=logging.WARNING) 88 | 89 | def kv_from_text(text): 90 | k, _, v = text.partition('|') 91 | return k, float(v) 92 | 93 | def kv_from_json(text): 94 | j = json.loads(text) 95 | return list(j.items())[0] 96 | 97 | def kv_from_struct(b): 98 | s, v = struct.unpack('If', b) 99 | return f'sensor{s}', v 100 | 101 | with open('tests/tcpperf_messages.csv', 'w', encoding='utf8') as f: 102 | f.write('# messages, hello, text, json, bello, struct\n') 103 | server_1k = Server(pause=2, values=1000, processes=5) 104 | for n in reversed(N_CONNECTIONS_1K): 105 | data = ( 106 | n * 1000, 107 | 
server_1k.run(n), 108 | server_1k.run(n, None, 'bello'), 109 | server_1k.run(n, kv_from_text, 'text'), 110 | server_1k.run(n, kv_from_json, 'json'), 111 | server_1k.run(n, kv_from_struct, 'struct'), 112 | ) 113 | f.write(', '.join(f'{d}' for d in data) + '\n') 114 | 115 | with open('tests/tcpperf_connections.csv', 'w', encoding='utf8') as f: 116 | f.write('# messages, hello, text, json, bello, struct\n') 117 | server = Server() 118 | for n in reversed(N_CONNECTIONS): 119 | data = ( 120 | n, 121 | server.run(n), 122 | server.run(n, None, 'bello'), 123 | server.run(n, kv_from_text, 'text'), 124 | server.run(n, kv_from_json, 'json'), 125 | server.run(n, kv_from_struct, 'struct'), 126 | ) 127 | f.write(', '.join(f'{d}' for d in data) + '\n') 128 | 129 | 130 | if __name__ == '__main__': 131 | main() 132 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503, E731 3 | exclude = venv*,logo,docs,build 4 | max-line-length = 119 5 | 6 | [tool:pytest] 7 | addopts = --doctest-modules --cov=pysparkling --cov-report=html --cov-branch 8 | testpaths = pysparkling 9 | doctest_optionflags = ALLOW_UNICODE NORMALIZE_WHITESPACE 10 | 11 | [pycodestyle] 12 | max-line-length=119 13 | ignore=E731,E741,W503 14 | exclude=pysparkling/__init__.py 15 | 16 | # See the docstring in versioneer.py for instructions. Note that you must 17 | # re-run 'versioneer.py setup' after changing this section, and commit the 18 | # resulting files. 19 | 20 | [versioneer] 21 | VCS = git 22 | style = pep440 23 | versionfile_source = pysparkling/_version.py 24 | versionfile_build = pysparkling/_version.py 25 | tag_prefix = v 26 | # parentdir_prefix = 27 | 28 | [coverage:run] 29 | branch = True 30 | cover_pylib = False 31 | data_file = reports/.coverage 32 | source = pysparkling 33 | omit = pysparkling/_version.py 34 | 35 | [coverage:report] 36 | show_missing = True 37 | skip_covered = False 38 | 39 | [coverage:html] 40 | directory = reports/coverage 41 | 42 | [isort] 43 | src_paths = pysparkling,scripts 44 | skip_gitignore = True 45 | line_length = 119 46 | order_by_type = False 47 | case_sensitive = False 48 | multi_line_output = 5 49 | force_sort_within_sections = True 50 | skip = versioneer.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | import versioneer 3 | 4 | setup( 5 | name='pysparkling', 6 | version=versioneer.get_version(), 7 | cmdclass=versioneer.get_cmdclass(), 8 | packages=find_packages(), 9 | license='MIT', 10 | description='Pure Python implementation of the Spark RDD interface.', 11 | long_description=open('README.rst', 'r', encoding='utf8').read(), 12 | author='pysparkling contributors', 13 | url='https://github.com/svenkreiss/pysparkling', 14 | 15 | install_requires=[ 16 | 'pytz>=2019.3', 17 | 'python-dateutil>=2.8.0' 18 | ], 19 | extras_require={ 20 | 'hdfs': ['hdfs>=2.0.0'], 21 | 'http': ['requests>=2.6.0'], 22 | 'performance': ['matplotlib>=1.5.3'], 23 | 's3': ['boto>=2.36.0'], 24 | 'streaming': ['tornado>=4.3'], 25 | 'sql': [ 26 | 'numpy', 27 | 'pandas>=0.23.2', 28 | ], 29 | 'tests': [ 30 | 'backports.tempfile==1.0rc1', 31 | 'cloudpickle>=0.1.0', 32 | 'isort', 33 | 'pylint', 34 | 'pylzma', 35 | 'memory-profiler>=0.47', 36 | 'pycodestyle', 37 | 'pytest', 38 | 'pytest-cov', 39 | 
'requests>=2.6.0', 40 | 'tornado>=4.3', 41 | ], 42 | 'scripts': [ 43 | 'ipyparallel', 44 | 'pyspark', 45 | 'matplotlib', 46 | ] 47 | }, 48 | 49 | classifiers=[ 50 | 'Development Status :: 4 - Beta', 51 | 'Intended Audience :: Developers', 52 | 'Natural Language :: English', 53 | 'License :: OSI Approved :: MIT License', 54 | 'Operating System :: OS Independent', 55 | 'Programming Language :: Python', 56 | 'Programming Language :: Python :: 3.7', 57 | 'Programming Language :: Python :: 3.8', 58 | 'Programming Language :: Python :: 3.9', 59 | 'Programming Language :: Python :: 3.10', 60 | 'Programming Language :: Python :: 3.11', 61 | 'Programming Language :: Python :: Implementation :: PyPy', 62 | ] 63 | ) 64 | --------------------------------------------------------------------------------
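
A closing note on the extras_require block in setup.py: the back ends that the tests treat as optional (hdfs, boto for S3, requests for HTTP, tornado for streaming, pandas and numpy for SQL) are exposed as pip extras under those same names, so an install along the lines of pip install "pysparkling[hdfs,s3,http,streaming,sql]" (shown purely as an example) brings them in alongside the core package.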