├── pysparkling
├── sql
│ ├── __init__.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── data
│ │ │ └── fundings
│ │ │ │ └── part-0.csv
│ │ ├── expressions
│ │ │ └── test_mappers.py
│ │ ├── test_session.py
│ │ └── test_write.py
│ ├── expressions
│ │ ├── __init__.py
│ │ ├── aggregate
│ │ │ ├── __init__.py
│ │ │ ├── aggregations.py
│ │ │ ├── covariance_aggregations.py
│ │ │ ├── stat_aggregations.py
│ │ │ └── collectors.py
│ │ ├── userdefined.py
│ │ ├── orders.py
│ │ ├── literals.py
│ │ ├── jsons.py
│ │ ├── csvs.py
│ │ ├── explodes.py
│ │ └── fields.py
│ ├── internal_utils
│ │ ├── __init__.py
│ │ ├── readers
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── textreader.py
│ │ │ ├── csvreader.py
│ │ │ ├── utils.py
│ │ │ └── jsonreader.py
│ │ ├── joins.py
│ │ ├── column.py
│ │ ├── readwrite.py
│ │ └── options.py
│ ├── conf.py
│ ├── utils.py
│ ├── context.py
│ └── schema_utils.py
├── tests
│ ├── __init__.py
│ ├── pyspark
│ │ ├── key_value.txt
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ ├── key_value.txt.bz2
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000.bz2
│ │ └── key_value.txt.gz
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000.gz
│ ├── data.7z
│ ├── data.tar.gz
│ ├── test_sample.py
│ ├── test_broadcast.py
│ ├── test_stat_counter.py
│ ├── test_cache.py
│ ├── test_streaming_queue.py
│ ├── test_streaming_files.py
│ ├── test_streaming_tcp.py
│ ├── test_context.py
│ └── test_resolve_filenames.py
├── __version__.py
├── streaming
│ ├── __init__.py
│ ├── queuestream.py
│ ├── filestream.py
│ └── tcpstream.py
├── fileio
│ ├── __init__.py
│ ├── codec
│ │ ├── bz2.py
│ │ ├── codec.py
│ │ ├── gz.py
│ │ ├── zip.py
│ │ ├── lzma.py
│ │ ├── __init__.py
│ │ ├── sevenz.py
│ │ └── tar.py
│ ├── fs
│ │ ├── __init__.py
│ │ ├── http.py
│ │ ├── file_system.py
│ │ ├── local.py
│ │ ├── s3.py
│ │ ├── gs.py
│ │ └── hdfs.py
│ ├── textfile.py
│ └── file.py
├── exceptions.py
├── partition.py
├── __init__.py
├── task_context.py
├── samplers.py
├── broadcast.py
├── storagelevel.py
└── accumulators.py
├── docs
├── sphinx
│ ├── version_index
│ │ ├── .nojekyll
│ │ ├── CNAME
│ │ ├── favicon.ico
│ │ ├── circle.yml
│ │ └── index.html
│ ├── images
│ │ ├── favicon.ico
│ │ └── logo-w600.png
│ ├── api_rdd.rst
│ ├── index.rst
│ ├── api_streaming.rst
│ ├── api.rst
│ ├── dev.rst
│ ├── api_context.rst
│ ├── api_fileio.rst
│ ├── read_write.rst
│ └── parallel.rst
├── requirements.txt
└── readthedocs.png
├── .gitattributes
├── MANIFEST.in
├── logo
├── favicon.ico
├── banner-w500.png
├── favicon-w16.png
├── favicon-w32.png
├── favicon-w48.png
├── logo-w100.png
├── logo-w600.png
├── banner-w1500.png
├── favicon-w128.png
├── favicon-w256.png
├── favicon.svg
└── create.py
├── scripts
├── tcpperf_messages.csv.pdf
├── tcpperf_messages.csv.png
├── tcpperf_connections.csv.pdf
├── tcpperf_connections.csv.png
├── multiprocessing_performance_plot.pdf
├── multiprocessing_performance_plot.png
├── ipcluster_simple.py
├── readme_example_human_microbiome.py
├── readme_example.py
├── log_streaming.py
├── starcluster_simple.py
├── readme_example_word_count.py
├── readme_example_common_crawl.py
├── tcpperf_connections.csv
├── profile_textfile.py
├── tcpperf_messages.csv
├── benchmark_generators.py
├── benchmark_csv.py
├── multiprocessing_performance_plot.py
├── tcpperf_plot.py
├── pyspark_comparisons.py
├── pyspark_streaming.py
├── tcpperf_client.py
└── tcpperf_server.py
├── .pylintrc
├── .readthedocs.yml
├── .github
├── stale.yml
└── workflows
│ ├── deploy.yml
│ └── tests.yml
├── setup.cfg
├── .gitignore
├── setup.py
├── LICENSE
└── README.rst
/pysparkling/sql/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/sphinx/version_index/.nojekyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/sphinx/version_index/CNAME:
--------------------------------------------------------------------------------
1 | pysparkling.trivial.io
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.bz2/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.gz/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | pysparkling/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include versioneer.py
2 | include pysparkling/_version.py
3 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dateutil
2 | Sphinx
3 | sphinx_rtd_theme
4 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt/part-00000:
--------------------------------------------------------------------------------
1 | ('a', 1)
2 | ('b', 2)
3 |
--------------------------------------------------------------------------------
/logo/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon.ico
--------------------------------------------------------------------------------
/docs/readthedocs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/readthedocs.png
--------------------------------------------------------------------------------
/logo/banner-w500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/banner-w500.png
--------------------------------------------------------------------------------
/logo/favicon-w16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w16.png
--------------------------------------------------------------------------------
/logo/favicon-w32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w32.png
--------------------------------------------------------------------------------
/logo/favicon-w48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w48.png
--------------------------------------------------------------------------------
/logo/logo-w100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/logo-w100.png
--------------------------------------------------------------------------------
/logo/logo-w600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/logo-w600.png
--------------------------------------------------------------------------------
/logo/banner-w1500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/banner-w1500.png
--------------------------------------------------------------------------------
/logo/favicon-w128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w128.png
--------------------------------------------------------------------------------
/logo/favicon-w256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w256.png
--------------------------------------------------------------------------------
/pysparkling/tests/data.7z:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/data.7z
--------------------------------------------------------------------------------
/docs/sphinx/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/sphinx/images/favicon.ico
--------------------------------------------------------------------------------
/pysparkling/__version__.py:
--------------------------------------------------------------------------------
1 | from ._version import get_versions
2 |
3 | __version__ = get_versions()['version']
4 |
--------------------------------------------------------------------------------
/pysparkling/tests/data.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/data.tar.gz
--------------------------------------------------------------------------------
/docs/sphinx/images/logo-w600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/sphinx/images/logo-w600.png
--------------------------------------------------------------------------------
/scripts/tcpperf_messages.csv.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_messages.csv.pdf
--------------------------------------------------------------------------------
/scripts/tcpperf_messages.csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_messages.csv.png
--------------------------------------------------------------------------------
/scripts/tcpperf_connections.csv.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_connections.csv.pdf
--------------------------------------------------------------------------------
/scripts/tcpperf_connections.csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_connections.csv.png
--------------------------------------------------------------------------------
/docs/sphinx/version_index/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/sphinx/version_index/favicon.ico
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import InternalReader
2 |
3 | __all__ = [
4 | 'InternalReader'
5 | ]
6 |
--------------------------------------------------------------------------------
/scripts/multiprocessing_performance_plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/multiprocessing_performance_plot.pdf
--------------------------------------------------------------------------------
/scripts/multiprocessing_performance_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/multiprocessing_performance_plot.png
--------------------------------------------------------------------------------
/docs/sphinx/version_index/circle.yml:
--------------------------------------------------------------------------------
1 | dependencies:
2 | pre:
3 | - sudo pip install html5validator
4 | test:
5 | override:
6 | - html5validator
7 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.gz/part-00000.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/pyspark/key_value.txt.gz/part-00000.gz
--------------------------------------------------------------------------------
/pysparkling/streaming/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .context import StreamingContext
3 | from .dstream import DStream
4 |
5 | __all__ = ['StreamingContext', 'DStream']
6 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.bz2/part-00000.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/pyspark/key_value.txt.bz2/part-00000.bz2
--------------------------------------------------------------------------------
/pysparkling/fileio/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .file import File
4 | from .textfile import TextFile
5 |
6 | # flake8: noqa
7 |
8 | __all__ = ['File', 'TextFile']
9 |
--------------------------------------------------------------------------------
/docs/sphinx/api_rdd.rst:
--------------------------------------------------------------------------------
1 | .. _api_rdd:
2 |
3 | RDD
4 | ---
5 |
6 | .. autoclass:: pysparkling.RDD
7 | :members:
8 |
9 | .. autoclass:: pysparkling.StatCounter
10 | :members:
11 |
--------------------------------------------------------------------------------
/scripts/ipcluster_simple.py:
--------------------------------------------------------------------------------
1 | from ipyparallel import Client
2 |
3 | rc = Client(packer='pickle')
4 |
5 | view = rc[:]
6 | results = view.map(lambda x: x ** 30, range(8))
7 | print(results.get())
8 |
--------------------------------------------------------------------------------
/scripts/readme_example_human_microbiome.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | by_subject_rdd = Context().textFile(
4 | 's3n://human-microbiome-project/DEMO/HM16STR/46333/by_subject/*'
5 | )
6 | print(by_subject_rdd.takeSample(True, 1))
7 |
--------------------------------------------------------------------------------
/pysparkling/exceptions.py:
--------------------------------------------------------------------------------
1 |
2 | class ConnectionException(Exception):
3 | pass
4 |
5 |
6 | class ContextIsLockedException(Exception):
7 | pass
8 |
9 |
10 | class FileAlreadyExistsException(Exception):
11 | pass
12 |
13 |
14 | class FileSystemNotSupported(Exception):
15 | pass
16 |
--------------------------------------------------------------------------------
/scripts/readme_example.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | my_rdd = Context().textFile('tests/*.py')
4 |
5 | unfiltered_count = my_rdd.count()
6 | filtered_count = my_rdd.filter(lambda l: l.startswith("import ")).count()
7 | print(f'In tests/*.py: all lines={unfiltered_count}, with import={filtered_count}')
8 |
--------------------------------------------------------------------------------
/scripts/log_streaming.py:
--------------------------------------------------------------------------------
1 | import pysparkling
2 |
3 |
4 | def main():
5 | sc = pysparkling.Context()
6 | ssc = pysparkling.streaming.StreamingContext(sc, 1)
7 | ssc.textFileStream('/var/log/system.log*').pprint()
8 | ssc.start()
9 | ssc.awaitTermination(timeout=3.0)
10 |
11 |
12 | if __name__ == '__main__':
13 | main()
14 |
--------------------------------------------------------------------------------
/docs/sphinx/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. include:: ../../README.rst
3 |
4 |
5 | Contents
6 | ========
7 |
8 | .. toctree::
9 | :maxdepth: 2
10 |
11 | self
12 | read_write
13 | api
14 | dev
15 |
16 |
17 |
18 | .. Indices and tables
19 | .. ==================
20 |
21 | .. * :ref:`genindex`
22 | .. * :ref:`modindex`
23 | .. * :ref:`search`
24 |
--------------------------------------------------------------------------------
/scripts/starcluster_simple.py:
--------------------------------------------------------------------------------
1 | from ipyparallel import Client
2 |
3 | rc = Client('/Users/sven/.starcluster/ipcluster/'
4 | 'SecurityGroup:@sc-smallcluster-us-east-1.json',
5 | sshkey='/Users/sven/.ssh/starclusterkey.rsa', packer='pickle')
6 |
7 | view = rc[:]
8 | results = view.map(lambda x: x ** 30, range(8))
9 | print(results.get())
10 |
--------------------------------------------------------------------------------
/scripts/readme_example_word_count.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | counts = (
4 | Context()
5 | .textFile('README.rst')
6 | .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line))
7 | .flatMap(lambda line: line.split(' '))
8 | .map(lambda word: (word, 1))
9 | .reduceByKey(lambda a, b: a + b)
10 | )
11 | print(counts.collect())
12 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/data/fundings/part-0.csv:
--------------------------------------------------------------------------------
1 | permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round
2 | mycityfaces,MyCityFaces,7,web,Scottsdale,AZ,2008-01-01,50000,USD,seed
3 | flypaper,Flypaper,,web,Phoenix,AZ,2008-02-01,3000000,USD,a
4 | chosenlist-com,ChosenList.com,5,web,Scottsdale,AZ,2008-01-25,233750,USD,angel
5 | digg,Digg,60,web,San Francisco,CA,2006-12-01,8500000,USD,b
6 |
--------------------------------------------------------------------------------
/scripts/readme_example_common_crawl.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | # read all the paths of warc and wat files of the latest Common Crawl
4 | paths_rdd = Context().textFile(
5 | 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
6 | 'warc.paths.*,'
7 | 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
8 | 'wat.paths.gz',
9 | )
10 |
11 | print(paths_rdd.collect())
12 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_sample.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pysparkling
4 |
5 |
6 | def test_trivial_sample():
7 | rdd = pysparkling.Context().parallelize(range(1000), 1000)
8 | sampled = rdd.sample(False, 0.01, 42).collect()
9 | print(sampled)
10 | assert sampled == [97, 164, 294, 695, 807, 864, 911]
11 |
12 |
13 | if __name__ == '__main__':
14 | logging.basicConfig(level=logging.DEBUG)
15 | test_trivial_sample()
16 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [BASIC]
2 |
3 | variable-rgx=[a-z0-9_]{1,30}$
4 | good-names=log
5 |
6 | disable=invalid-name,unused-argument,too-few-public-methods,missing-docstring,logging-format-interpolation,too-many-instance-attributes,duplicate-code,too-many-public-methods,too-many-arguments,protected-access,too-many-lines,missing-timeout,unnecessary-lambda-assignment
7 |
8 | [FORMAT]
9 | max-line-length=119
10 |
11 | [SIMILARITIES]
12 |
13 | ignore-imports=yes
14 |
--------------------------------------------------------------------------------
/docs/sphinx/api_streaming.rst:
--------------------------------------------------------------------------------
1 | .. _api_streaming:
2 |
3 | Streaming
4 | ---------
5 |
6 | .. warning::
 7 | This is a new addition to the API (March 2017) that should be used
 8 | with care.
9 |
10 |
11 | StreamingContext
12 | ^^^^^^^^^^^^^^^^
13 |
14 | .. autoclass:: pysparkling.streaming.StreamingContext
15 | :members:
16 |
17 |
18 | DStream
19 | ^^^^^^^
20 |
21 | .. autoclass:: pysparkling.streaming.DStream
22 | :members:
23 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/bz2.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import io
3 | import logging
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Bz2(Codec):
11 | """Implementation of :class:`.Codec` for bz2 compression."""
12 |
13 | def compress(self, stream):
14 | return io.BytesIO(bz2.compress(b''.join(stream)))
15 |
16 | def decompress(self, stream):
17 | return io.BytesIO(bz2.decompress(stream.read()))
18 |
--------------------------------------------------------------------------------
/pysparkling/partition.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | log = logging.getLogger(__name__)
4 |
5 |
6 | class Partition:
7 | def __init__(self, x, idx=None):
8 | self.index = idx
9 | self._x = list(x)
10 |
11 | def x(self):
12 | return self._x
13 |
14 | def hashCode(self):
15 | return self.index
16 |
17 | def __getstate__(self):
18 | return {
19 | 'index': self.index,
20 | '_x': self.x(),
21 | }
22 |
--------------------------------------------------------------------------------
/docs/sphinx/api.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 |
3 | API
4 | ===
5 |
6 | .. currentmodule:: pysparkling
7 |
 8 | A typical ``pysparkling`` session starts either by parallelizing a `list`
 9 | with :func:`Context.parallelize` or by reading data from a file using
10 | :func:`Context.textFile`. These two methods return :class:`RDD` instances that
11 | can then be processed.
12 |
13 |
14 | .. toctree::
15 | :maxdepth: 2
16 |
17 | api_rdd
18 | api_context
19 | api_streaming
20 | api_fileio
21 |
--------------------------------------------------------------------------------
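To make the entry points described in api.rst above concrete, here is a minimal illustrative session (not part of the repository) that uses only calls appearing elsewhere in this dump:

    from pysparkling import Context

    sc = Context()

    # parallelize an in-memory list into an RDD and process it
    print(sc.parallelize(range(5)).map(lambda x: x * x).collect())  # [0, 1, 4, 9, 16]

    # textFile returns an RDD of lines that can be counted or filtered
    print(sc.textFile('README.rst').count())

--------------------------------------------------------------------------------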
/scripts/tcpperf_connections.csv:
--------------------------------------------------------------------------------
1 | # messages, hello, text, json, bello, struct
2 | 8000, 5505, 5077, 5315, 5128, 5309
3 | 7000, 4641, 4369, 4395, 4846, 4670
4 | 6000, 5238, 4854, 4825, 4639, 5184
5 | 5000, 4329, 4626, 4314, 4270, 4246
6 | 4500, 4064, 4406, 3900, 3980, 4278
7 | 4000, 3681, 3584, 3680, 3710, 3709
8 | 3500, 3378, 3307, 3299, 3404, 3220
9 | 3000, 2888, 2892, 2961, 2890, 2871
10 | 2000, 1978, 1970, 1989, 1972, 1970
11 | 1000, 998, 998, 996, 1001, 998
12 | 100, 100, 100, 100, 101, 100
13 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/sphinx/conf.py
11 |
12 | # Optionally set the version of Python and requirements required to build your docs
13 | python:
14 | version: 3.7
15 | install:
16 | - requirements: docs/requirements.txt
17 |
--------------------------------------------------------------------------------
/scripts/profile_textfile.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 |
3 | from memory_profiler import profile
4 |
5 | import pysparkling
6 |
7 |
8 | @profile
9 | def main():
10 | tempFile = tempfile.NamedTemporaryFile(delete=True) # pylint: disable=consider-using-with
11 | tempFile.close()
12 |
13 | sc = pysparkling.Context()
14 | sc.parallelize(range(1000000)).saveAsTextFile(tempFile.name + '.gz')
15 | rdd = sc.textFile(tempFile.name + '.gz')
16 | rdd.collect()
17 |
18 |
19 | if __name__ == '__main__':
20 | main()
21 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/aggregations.py:
--------------------------------------------------------------------------------
1 | from ..expressions import Expression
2 |
3 |
4 | class Aggregation(Expression):
5 | @property
6 | def is_an_aggregation(self):
7 | return True
8 |
9 | def merge(self, row, schema):
10 | raise NotImplementedError
11 |
12 | def mergeStats(self, other, schema):
13 | raise NotImplementedError
14 |
15 | def eval(self, row, schema):
16 | raise NotImplementedError
17 |
18 | def args(self):
19 | raise NotImplementedError
20 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_broadcast.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pysparkling
4 |
5 |
6 | class BroadcastTest(unittest.TestCase):
7 | def setUp(self) -> None:
8 | self.context = pysparkling.Context()
9 |
10 | def testSimple(self):
11 | b = self.context.broadcast([1, 2, 3, 4, 5])
12 | self.assertEqual(b.value, [1, 2, 3, 4, 5])
13 |
14 | def testAppendFails(self):
15 | b = self.context.broadcast([1, 2, 3, 4, 5])
16 | with self.assertRaises(AttributeError):
17 | b.value += [1] # type: ignore
18 |
--------------------------------------------------------------------------------
/scripts/tcpperf_messages.csv:
--------------------------------------------------------------------------------
1 | # messages, hello, text, json, bello, struct
2 | 100000, 72700, 77500, 77800, 69500, 60000
3 | 90000, 82000, 58600, 58500, 60400, 59000
4 | 80000, 65400, 65900, 56800, 57600, 58300
5 | 70000, 59300, 59900, 56800, 50500, 56500
6 | 60000, 56800, 55100, 55600, 52300, 55400
7 | 50000, 50100, 50300, 50000, 48900, 50000
8 | 45000, 45000, 45300, 45000, 45000, 45100
9 | 40000, 40000, 40100, 40300, 39800, 40000
10 | 30000, 30000, 30000, 30000, 30000, 30000
11 | 20000, 20500, 20000, 20500, 20100, 20300
12 | 10000, 10000, 10000, 10000, 10000, 10000
13 |
--------------------------------------------------------------------------------
/scripts/benchmark_generators.py:
--------------------------------------------------------------------------------
1 | import timeit
2 |
3 |
4 | def with_generator():
5 | return (x for x in range(1000))
6 |
7 |
8 | def with_yield():
9 | for x in range(1000):
10 | yield x
11 |
12 |
13 | if __name__ == '__main__':
14 | print(timeit.timeit(stmt='list(with_generator())',
15 | setup='from __main__ import with_generator',
16 | number=10000))
17 | print(timeit.timeit(stmt='list(with_yield())',
18 | setup='from __main__ import with_yield',
19 | number=10000))
20 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/codec.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | log = logging.getLogger(__name__)
4 |
5 |
6 | class Codec:
7 | """Codec."""
8 | def __init__(self):
9 | pass
10 |
11 | def compress(self, stream):
12 | """Compress.
13 |
14 | :param io.BytesIO stream: Uncompressed input stream.
15 | :rtype: io.BytesIO
16 | """
17 | return stream
18 |
19 | def decompress(self, stream):
20 | """Decompress.
21 |
22 | :param io.BytesIO stream: Compressed input stream.
23 | :rtype: io.BytesIO
24 | """
25 | return stream
26 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/joins.py:
--------------------------------------------------------------------------------
1 | """
2 | The following constants are used to identify join types
3 | """
4 | INNER_JOIN = "inner"
5 | CROSS_JOIN = "cross"
6 | FULL_JOIN = "full"
7 | LEFT_JOIN = "left"
8 | RIGHT_JOIN = "right"
9 | LEFT_SEMI_JOIN = "leftsemi"
10 | LEFT_ANTI_JOIN = "leftanti"
11 |
12 | JOIN_TYPES = dict(
13 | inner=INNER_JOIN,
14 | cross=CROSS_JOIN,
15 | outer=FULL_JOIN,
16 | full=FULL_JOIN,
17 | fullouter=FULL_JOIN,
18 | left=LEFT_JOIN,
19 | leftouter=LEFT_JOIN,
20 | right=RIGHT_JOIN,
21 | rightouter=RIGHT_JOIN,
22 | leftsemi=LEFT_SEMI_JOIN,
23 | leftanti=LEFT_ANTI_JOIN,
24 | )
25 |
--------------------------------------------------------------------------------
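A tiny illustrative check (not part of the repository) of how the alias table above normalizes user-facing join names:

    from pysparkling.sql.internal_utils.joins import FULL_JOIN, JOIN_TYPES

    # 'outer', 'full' and 'fullouter' are all aliases for the same join type
    assert JOIN_TYPES['outer'] == JOIN_TYPES['fullouter'] == FULL_JOIN
    assert JOIN_TYPES['leftsemi'] == 'leftsemi'

--------------------------------------------------------------------------------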
/pysparkling/__init__.py:
--------------------------------------------------------------------------------
1 | """pysparkling module"""
2 | # flake8: noqa
3 |
4 | from . import exceptions, fileio, streaming
5 | from .__version__ import __version__
6 | from .accumulators import Accumulator, AccumulatorParam
7 | from .broadcast import Broadcast
8 | from .cache_manager import CacheManager, TimedCacheManager
9 | from .context import Context
10 | from .rdd import RDD
11 | from .sql.types import Row
12 | from .stat_counter import StatCounter
13 | from .storagelevel import StorageLevel
14 |
15 | __all__ = ['RDD', 'Context', 'Broadcast', 'StatCounter', 'CacheManager', 'Row',
16 | 'TimedCacheManager', 'StorageLevel',
17 | 'exceptions', 'fileio', 'streaming']
18 |
--------------------------------------------------------------------------------
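The names exported above can be exercised with a short illustrative snippet (not part of the repository); the expected values follow from the broadcast and StatCounter tests later in this dump:

    import pysparkling

    sc = pysparkling.Context()

    # Broadcast wraps a read-only value shared with workers
    print(sc.broadcast([1, 2, 3]).value)                    # [1, 2, 3]

    # StatCounter computes summary statistics over a sequence
    print(pysparkling.StatCounter([1, 4, 9, 160]).mean())   # 43.5

--------------------------------------------------------------------------------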
/pysparkling/sql/expressions/userdefined.py:
--------------------------------------------------------------------------------
1 | from .expressions import Expression
2 |
3 |
4 | class UserDefinedFunction(Expression):
5 | def __init__(self, f, return_type, *exprs):
6 | super().__init__()
7 | self.f = f
8 | self.return_type = return_type
9 | self.exprs = exprs
10 |
11 | def eval(self, row, schema):
12 | return self.f(*(expr.eval(row, schema) for expr in self.exprs))
13 |
14 | def __str__(self):
15 | arguments = ', '.join(str(arg) for arg in self.args())
16 | return f"{self.f.__name__}({arguments})"
17 |
18 | def args(self):
19 | return self.exprs
20 |
21 |
22 | __all__ = ["UserDefinedFunction"]
23 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 60
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 | - pinned
8 | - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: stale
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 | This issue has been automatically marked as stale because it has not had
14 | recent activity. It will be closed if no further activity occurs. Thank you
15 | for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/gz.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | from io import BytesIO
3 | import logging
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Gz(Codec):
11 | """Implementation of :class:`.Codec` for gz compression."""
12 |
13 | def compress(self, stream):
14 | compressed = BytesIO()
15 |
16 | with gzip.GzipFile(fileobj=compressed, mode='wb') as f:
17 | f.write(stream.read())
18 |
19 | compressed.seek(0)
20 | return compressed
21 |
22 | def decompress(self, stream):
23 | uncompressed = BytesIO()
24 |
25 | with gzip.GzipFile(fileobj=stream, mode='rb') as f:
26 | uncompressed.write(f.read())
27 |
28 | uncompressed.seek(0)
29 | return uncompressed
30 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/__init__.py:
--------------------------------------------------------------------------------
1 | from .file_system import FileSystem
2 | from .gs import GS
3 | from .hdfs import Hdfs
4 | from .http import Http
5 | from .local import Local
6 | from .s3 import S3
7 |
8 | __all__ = ['FileSystem', 'GS', 'Hdfs', 'Http', 'Local', 'S3']
9 |
10 |
11 | FILE_EXTENSIONS = [
12 | (('file', ''), Local),
13 | (('s3', 's3n'), S3),
14 | (('gs', 'gcs'), GS),
15 | (('http', 'https'), Http),
16 |     (('hdfs',), Hdfs),
17 | ]
18 |
19 |
20 | def get_fs(path):
21 | """Find the file system implementation for this path."""
22 | scheme = ''
23 | if '://' in path:
24 | scheme = path.partition('://')[0]
25 |
26 | for schemes, fs_class in FILE_EXTENSIONS:
27 | if scheme in schemes:
28 | return fs_class
29 |
30 | return FileSystem
31 |
--------------------------------------------------------------------------------
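For illustration (not part of the repository), the scheme dispatch implemented by get_fs above behaves as follows:

    from pysparkling.fileio.fs import S3, Http, Local, get_fs

    # the scheme before '://' selects the FileSystem implementation
    assert get_fs('s3n://bucket/key') is S3
    assert get_fs('https://example.com/data.csv') is Http
    # a plain path has no scheme and falls back to Local
    assert get_fs('/tmp/data.txt') is Local

--------------------------------------------------------------------------------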
/pysparkling/fileio/codec/zip.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 | import zipfile
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Zip(Codec):
11 | """Implementation of :class:`.Codec` for zip compression."""
12 |
13 | def compress(self, stream):
14 | compressed = BytesIO()
15 |
16 | with zipfile.ZipFile(file=compressed, mode='w', allowZip64=True) as f:
17 | f.writestr('data', stream.read())
18 |
19 | compressed.seek(0)
20 | return compressed
21 |
22 | def decompress(self, stream):
23 | uncompressed = BytesIO()
24 |
25 | with zipfile.ZipFile(file=stream, mode='r', allowZip64=True) as f:
26 | for f_name in f.namelist():
27 | uncompressed.write(f.read(f_name))
28 |
29 | uncompressed.seek(0)
30 | return uncompressed
31 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/orders.py:
--------------------------------------------------------------------------------
1 | from .expressions import Expression
2 |
3 |
4 | class SortOrder(Expression):
5 | sort_order = None
6 |
7 | def __init__(self, column):
8 | super().__init__(column)
9 | self.column = column
10 |
11 | def eval(self, row, schema):
12 | return self.column.eval(row, schema)
13 |
14 | def __str__(self):
15 | return f"{self.column} {self.sort_order}"
16 |
17 | def args(self):
18 | return (self.column,)
19 |
20 |
21 | class AscNullsFirst(SortOrder):
22 | sort_order = "ASC NULLS FIRST"
23 |
24 |
25 | class AscNullsLast(SortOrder):
26 | sort_order = "ASC NULLS LAST"
27 |
28 |
29 | class DescNullsFirst(SortOrder):
30 |     sort_order = "DESC NULLS FIRST"
31 |
32 |
33 | class DescNullsLast(SortOrder):
34 | sort_order = "DESC NULLS LAST"
35 |
36 |
37 | Asc = AscNullsFirst
38 | Desc = DescNullsLast
39 |
--------------------------------------------------------------------------------
/pysparkling/sql/conf.py:
--------------------------------------------------------------------------------
1 | _sentinel = object()
2 |
3 |
4 | class RuntimeConfig:
5 | def __init__(self, jconf=None):
6 | self._conf = {}
7 |
8 | def set(self, key, value):
9 | self._conf[key] = value
10 |
11 | def get(self, key, default=_sentinel):
12 | self._checkType(key, "key")
13 | if default is _sentinel:
14 | return self._conf.get(key)
15 | if default is not None:
16 | self._checkType(default, "default")
17 | return self._conf.get(key, default)
18 |
19 | def unset(self, key):
20 | del self._conf[key]
21 |
22 | def _checkType(self, obj, identifier):
23 | if not isinstance(obj, str):
24 | raise TypeError(f"expected {identifier} '{obj}' to be a string (was '{type(obj).__name__}')")
25 |
26 | def isModifiable(self, key):
27 |         raise NotImplementedError("pysparkling does not yet support this feature")
28 |
--------------------------------------------------------------------------------
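An illustrative sketch (not part of the repository) of the RuntimeConfig behaviour defined above, including the sentinel-based default handling:

    from pysparkling.sql.conf import RuntimeConfig

    conf = RuntimeConfig()
    conf.set('spark.sql.shuffle.partitions', '8')
    print(conf.get('spark.sql.shuffle.partitions'))   # '8'
    print(conf.get('missing.key', 'fallback'))        # 'fallback'
    conf.unset('spark.sql.shuffle.partitions')
    print(conf.get('spark.sql.shuffle.partitions'))   # None

--------------------------------------------------------------------------------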
/pysparkling/fileio/codec/lzma.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 | import lzma
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Lzma(Codec):
11 | """Implementation of :class:`.Codec` for lzma compression.
12 |
13 | Needs Python >= 3.3.
14 | """
15 |
16 | def __init__(self):
17 | if lzma is None:
18 | log.warning('LZMA codec not supported. It is only supported '
19 | 'in Python>=3.3. Not compressing streams.')
20 | super().__init__()
21 |
22 | def compress(self, stream):
23 | if lzma is None:
24 | return Codec.compress(self, stream)
25 |
26 | return BytesIO(lzma.compress(stream.read()))
27 |
28 | def decompress(self, stream):
29 | if lzma is None:
30 | return Codec.decompress(self, stream)
31 |
32 | return BytesIO(lzma.decompress(stream.read()))
33 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/expressions/test_mappers.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from pysparkling.utils import MonotonicallyIncreasingIDGenerator
4 |
5 |
6 | class MonotonicallyIncreasingIDGeneratorTests(TestCase):
7 | def test_init_ok(self):
8 | sut = MonotonicallyIncreasingIDGenerator(0)
9 | self.assertEqual(sut.value, -1) # Shouldn't we throw an error here?
10 |
11 | sut = MonotonicallyIncreasingIDGenerator(1)
12 | self.assertEqual(sut.value, 8589934592 - 1) # I do it this way so I can easily find/replace the value
13 |
14 | sut = MonotonicallyIncreasingIDGenerator(2)
15 | self.assertEqual(sut.value, 2 * 8589934592 - 1)
16 |
17 | def test_next_value_ok(self):
18 | sut = MonotonicallyIncreasingIDGenerator(1)
19 | self.assertEqual(next(sut), 8589934592)
20 | self.assertEqual(next(sut), 8589934593)
21 | self.assertEqual(next(sut), 8589934594)
22 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/column.py:
--------------------------------------------------------------------------------
1 | def resolve_column(col, row, schema, allow_generator=True):
2 | """
 3 |     Return the list of column names corresponding to a column value and a schema, and:
 4 |     if allow_generator is False, a list of values corresponding to a row;
 5 |     if allow_generator is True, a list of lists of values, where each inner list corresponds to a row.
6 | """
7 | output_cols = [field.name for field in col.output_fields(schema)]
8 |
9 | output_values = col.eval(row, schema)
10 |
11 | if not allow_generator and col.may_output_multiple_rows:
12 | raise Exception("Generators are not supported when it's nested in expressions,"
13 | f" but got: {col}")
14 |
15 | if not col.may_output_multiple_rows:
16 | output_values = [output_values]
17 | if not col.may_output_multiple_cols:
18 | output_values = [output_values]
19 |
20 | return output_cols, output_values
21 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readwrite.py:
--------------------------------------------------------------------------------
1 | from ..utils import IllegalArgumentException
2 |
3 |
4 | def to_option_stored_value(value):
5 | if value is None:
6 | return None
7 | if isinstance(value, bool):
8 | return str(value).lower()
9 | return str(value)
10 |
11 |
12 | class OptionUtils:
13 | def _set_opts(self, schema=None, **options):
14 | """
15 |         Set named options (filtering out those whose value is None).
16 | """
17 | if schema is not None:
18 | self.schema(schema)
19 | for k, v in options.items():
20 | if v is not None:
21 | self.option(k, v)
22 |
23 | def option(self, key, value):
24 | raise NotImplementedError
25 |
26 | def schema(self, schema):
27 |         # By default, OptionUtils subclasses do not support schemas
28 | raise IllegalArgumentException(
29 | f"schema is not a valid argument for {self.__class__}"
30 | )
31 |
--------------------------------------------------------------------------------
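A small illustrative check (not part of the repository) of how option values are normalized for storage by to_option_stored_value above:

    from pysparkling.sql.internal_utils.readwrite import to_option_stored_value

    assert to_option_stored_value(None) is None       # unset options are dropped
    assert to_option_stored_value(True) == 'true'     # booleans are lower-cased
    assert to_option_stored_value(10) == '10'         # everything else becomes a string

--------------------------------------------------------------------------------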
/docs/sphinx/dev.rst:
--------------------------------------------------------------------------------
1 | .. _dev:
2 |
3 | Development
4 | ===========
5 |
6 | Fork the Github repository and apply your changes in a feature branch.
7 | To run pysparkling's unit tests:
8 |
9 | .. code-block:: sh
10 |
11 | # install
12 | pip install -e .[hdfs,performance,streaming,test]
13 | flake8 --install-hook
14 |
15 | # run linting and test
16 | flake8
17 | pytest -vv
18 |
19 | Don't run ``python setup.py test`` as this will
20 | not execute the doctests. When all tests pass, create a Pull Request on GitHub.
21 | Please also update ``HISTORY.rst`` with a short description of your change.
22 |
23 | To preview the docs locally, install the extra dependencies with
24 | ``pip install -r docs/requirements.txt``, and then cd into ``docs/sphinx``,
25 | run ``make html`` and open ``_build/html/index.html``.
26 |
27 | Please also try not to add derivative work from other projects. If you do,
28 | incorporate proper handling of external licenses in your Pull Request.
29 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/literals.py:
--------------------------------------------------------------------------------
1 | from ..utils import AnalysisException
2 | from .expressions import Expression
3 |
4 |
5 | class Literal(Expression):
6 | def __init__(self, value):
7 | super().__init__()
8 | self.value = value
9 |
10 | def eval(self, row, schema):
11 | return self.value
12 |
13 | def __str__(self):
14 | if self.value is True:
15 | return "true"
16 | if self.value is False:
17 | return "false"
18 | if self.value is None:
19 | return "NULL"
20 | return str(self.value)
21 |
22 | def get_literal_value(self):
23 | if hasattr(self.value, "expr") or isinstance(self.value, Expression):
24 | raise AnalysisException("Value should not be a Column or an Expression,"
25 | f" but got {type(self)}: {self}")
26 | return self.value
27 |
28 | def args(self):
29 | return (self.value, )
30 |
31 |
32 | __all__ = ["Literal"]
33 |
--------------------------------------------------------------------------------
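Literal pairs naturally with the UserDefinedFunction expression shown earlier in this dump; the sketch below (not part of the repository) evaluates a UDF over literal arguments, so no real row or schema is needed:

    from pysparkling.sql.expressions.literals import Literal
    from pysparkling.sql.expressions.userdefined import UserDefinedFunction

    add = UserDefinedFunction(lambda a, b: a + b, None, Literal(2), Literal(3))

    # Literal.eval ignores the row and returns the stored value
    print(add.eval(row=None, schema=None))  # 5
    print(str(add))                         # <lambda>(2, 3)

--------------------------------------------------------------------------------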
/pysparkling/streaming/queuestream.py:
--------------------------------------------------------------------------------
1 | from ..rdd import EmptyRDD, RDD
2 |
3 |
4 | class QueueStreamDeserializer:
5 | def __init__(self, context):
6 | self.context = context
7 |
8 | def ensure_rdd(self, data):
9 | if data is None:
10 | return EmptyRDD(self.context)
11 | if isinstance(data, RDD):
12 | return data
13 | return self.context.parallelize(data)
14 |
15 | def __call__(self, data):
16 | return self.ensure_rdd(data)
17 |
18 |
19 | class QueueStream:
20 | def __init__(self, queue, oneAtATime=True, default=None):
21 | self.queue = queue
22 | self.oneAtATime = oneAtATime
23 | self.default = default
24 |
25 | def get(self):
26 | q_size = self.queue.qsize()
27 |
28 | if q_size == 0:
29 | return self.default
30 |
31 | if self.oneAtATime:
32 | return self.queue.get_nowait()
33 |
34 | return [e for _ in range(q_size) for e in self.queue.get_nowait()]
35 |
--------------------------------------------------------------------------------
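An illustrative sketch (not part of the repository) of the helpers above: the deserializer coerces raw data into RDDs of a context, and QueueStream drains a standard queue:

    import queue

    import pysparkling
    from pysparkling.streaming.queuestream import QueueStream, QueueStreamDeserializer

    sc = pysparkling.Context()
    ensure_rdd = QueueStreamDeserializer(sc)

    # raw Python data is parallelized into an RDD; None becomes an empty RDD
    print(ensure_rdd([1, 2, 3]).collect())  # [1, 2, 3]
    print(ensure_rdd(None).collect())       # []

    q = queue.Queue()
    q.put([4, 5])
    stream = QueueStream(q, oneAtATime=True)
    print(stream.get())  # [4, 5]

--------------------------------------------------------------------------------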
/pysparkling/fileio/codec/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from .bz2 import Bz2
4 | from .codec import Codec
5 | from .gz import Gz
6 | from .lzma import Lzma
7 | from .sevenz import SevenZ
8 | from .tar import Tar, TarBz2, TarGz
9 | from .zip import Zip
10 |
11 | log = logging.getLogger(__name__)
12 |
13 | FILE_ENDINGS = [
14 | (('.tar',), Tar),
15 | (('.tar.gz',), TarGz),
16 | (('.tar.bz2',), TarBz2),
17 | (('.gz',), Gz),
18 | (('.zip',), Zip),
19 | (('.bz2',), Bz2),
20 | (('.lzma', '.xz'), Lzma),
21 | (('.7z',), SevenZ),
22 | ]
23 |
24 |
25 | class NoCodec(Codec):
26 | pass
27 |
28 |
29 | def get_codec(path):
30 | """Find the codec implementation for this path."""
31 | if '.' not in path or path.rfind('/') > path.rfind('.'):
32 | return Codec
33 |
34 | for endings, codec_class in FILE_ENDINGS:
35 | if any(path.endswith(e) for e in endings):
36 | log.debug('Using %s codec: %s', endings, path)
37 | return codec_class
38 |
39 | return NoCodec
40 |
--------------------------------------------------------------------------------
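An illustrative round trip (not part of the repository) through the codec selection above and the Gz codec shown earlier in this dump:

    from io import BytesIO

    from pysparkling.fileio.codec import Gz, get_codec

    # codec selection is driven purely by the file ending
    assert get_codec('data.csv.gz') is Gz

    # compress and decompress a payload with the gz codec
    codec = Gz()
    compressed = codec.compress(BytesIO(b'hello pysparkling'))
    assert codec.decompress(compressed).read() == b'hello pysparkling'

--------------------------------------------------------------------------------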
/docs/sphinx/api_context.rst:
--------------------------------------------------------------------------------
1 | .. _api_context:
2 |
3 | .. currentmodule:: pysparkling
4 |
5 | Context
6 | -------
7 |
8 | A :class:`~pysparkling.Context` describes the setup. Instantiating a Context with the default
9 | arguments using ``Context()`` is the most lightweight setup. All data is just
10 | in the local thread and is never serialized or deserialized.
11 |
12 | If you want to process the data in parallel, you can use the `multiprocessing`
13 | module. Given the limitations of the default `pickle` serializer, you can
14 | serialize all methods with `cloudpickle` instead. For example,
15 | a common instantiation with `multiprocessing` looks like this:
16 |
17 | .. code-block:: python
18 |
19 | sc = pysparkling.Context(
20 | multiprocessing.Pool(4),
21 | serializer=cloudpickle.dumps,
22 | deserializer=pickle.loads,
23 | )
24 |
25 | This assumes that your data is serializable with `pickle`, which is generally
26 | faster. You can also specify a custom serializer/deserializer for data.
27 |
28 | .. autoclass:: pysparkling.Context
29 | :members:
30 |
--------------------------------------------------------------------------------
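Assembled into a complete script, the multiprocessing setup sketched in api_context.rst above looks roughly like this (illustrative only; assumes the optional `cloudpickle` package is installed):

    import multiprocessing
    import pickle

    import cloudpickle  # optional dependency, assumed installed for this sketch

    import pysparkling

    if __name__ == '__main__':
        sc = pysparkling.Context(
            multiprocessing.Pool(4),
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        # the lambda is shipped via cloudpickle, the data via pickle
        print(sc.parallelize(range(10)).map(lambda x: x + 1).collect())

--------------------------------------------------------------------------------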
/pysparkling/task_context.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | log = logging.getLogger(__name__)
4 |
5 |
6 | class TaskContext:
7 | def __init__(self, cache_manager, catch_exceptions,
8 | stage_id=0, partition_id=0, max_retries=3, retry_wait=0):
9 | self.cache_manager = cache_manager
10 | self.catch_exceptions = catch_exceptions
11 | self.stage_id = stage_id
12 | self.partition_id = partition_id
13 | self.max_retries = max_retries
14 | self.retry_wait = retry_wait
15 |
16 | self.attempt_number = 0
17 | self.is_completed = False
18 | self.is_running_locally = True
19 | self.task_completion_listeners = []
20 |
21 | def _create_child(self):
22 | return TaskContext(self.cache_manager, self.catch_exceptions,
23 | stage_id=self.stage_id + 1,
24 | partition_id=self.partition_id)
25 |
26 | def attemptNumber(self):
27 | return self.attempt_number
28 |
29 | def partitionId(self):
30 | return self.partition_id
31 |
32 | def stageId(self):
33 | return self.stage_id
34 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/sevenz.py:
--------------------------------------------------------------------------------
1 | try:
2 | import py7zlib
3 | except ImportError:
4 | py7zlib = None
5 |
6 | from io import BytesIO
7 | import logging
8 |
9 | from .codec import Codec
10 |
11 | log = logging.getLogger(__name__)
12 |
13 |
14 | class SevenZ(Codec):
15 | """Implementation of :class:`.Codec` for 7z compression.
16 |
17 | Needs the `pylzma` module.
18 | """
19 |
20 | def __init__(self):
21 | if py7zlib is None:
22 | log.warning('py7zlib could not be imported. To read 7z files, '
23 | 'install the library with "pip install pylzma".')
24 | super().__init__()
25 |
26 | def compress(self, stream):
27 | log.warning('Writing of 7z compressed archives is not supported.')
28 | return stream
29 |
30 | def decompress(self, stream):
31 | if py7zlib is None:
32 | return Codec.decompress(self, stream)
33 |
34 | uncompressed = BytesIO()
35 |
36 | f = py7zlib.Archive7z(file=stream)
37 | for f_name in f.getnames():
38 | uncompressed.write(f.getmember(f_name).read())
39 |
40 | uncompressed.seek(0)
41 | return uncompressed
42 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/common.py:
--------------------------------------------------------------------------------
1 | from ...internal_utils.readers import csvreader, jsonreader, textreader
2 | from ...internal_utils.readwrite import OptionUtils, to_option_stored_value
3 | from ...types import StructType
4 |
5 |
6 | class InternalReader(OptionUtils):
7 | def schema(self, schema):
8 | if not isinstance(schema, StructType):
9 | raise NotImplementedError("Pysparkling currently only supports StructType for schemas")
10 | self._schema = schema
11 |
12 | def option(self, key, value):
13 | self._options[key.lower()] = to_option_stored_value(value)
14 |
15 | def __init__(self, spark):
16 | """
17 |
18 | :type spark: pysparkling.sql.session.SparkSession
19 | """
20 | self._spark = spark
21 | self._options = {}
22 | self._schema = None
23 |
24 | def csv(self, paths):
25 | return csvreader.CSVReader(self._spark, paths, self._schema, self._options).read()
26 |
27 | def json(self, paths):
28 | return jsonreader.JSONReader(self._spark, paths, self._schema, self._options).read()
29 |
30 | def text(self, paths):
31 | return textreader.TextReader(self._spark, paths, self._schema, self._options).read()
32 |
--------------------------------------------------------------------------------
/pysparkling/sql/utils.py:
--------------------------------------------------------------------------------
1 | class CapturedException(Exception):
2 | pass
3 |
4 |
5 | class AnalysisException(CapturedException):
6 | pass
7 |
8 |
9 | class ParseException(CapturedException):
10 | pass
11 |
12 |
13 | class IllegalArgumentException(CapturedException):
14 | pass
15 |
16 |
17 | def require_minimum_pandas_version():
18 | """ Raise an ImportError if Pandas version is < 0.23.2
19 | """
20 | minimum_pandas_version = (0, 23, 2)
21 |
22 | # pandas is an optional dependency
23 | # pylint: disable=import-outside-toplevel
24 | try:
25 | import pandas
26 | have_pandas = True
27 | except ImportError:
28 | have_pandas = False
29 |
30 | if not have_pandas:
31 | raise ImportError(
32 |             f"Pandas >= {minimum_pandas_version} must be installed; however, it was not found."
33 | )
34 | if parse_pandas_version(pandas.__version__) < minimum_pandas_version:
35 | raise ImportError(
36 | f"Pandas >= {minimum_pandas_version} must be installed;"
37 | f" however, your version was {pandas.__version__}."
38 | )
39 |
40 |
41 | def parse_pandas_version(version):
42 | return tuple(int(part) for part in version.split("."))
43 |
--------------------------------------------------------------------------------
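For reference (not part of the repository), parse_pandas_version above compares versions as integer tuples; note that it would raise ValueError on pre-release strings such as '1.4.0rc1':

    from pysparkling.sql.utils import parse_pandas_version

    assert parse_pandas_version('0.23.2') == (0, 23, 2)
    assert parse_pandas_version('1.1.5') > (0, 23, 2)

--------------------------------------------------------------------------------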
/pysparkling/sql/expressions/jsons.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from ...utils import get_json_encoder
4 | from ..internal_utils.options import Options
5 | from ..internal_utils.readers.jsonreader import JSONReader
6 | from .expressions import Expression
7 |
8 |
9 | class StructsToJson(Expression):
10 | pretty_name = "structstojson"
11 |
12 | default_options = dict(
13 | dateFormat="yyyy-MM-dd",
14 | timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX",
15 | )
16 |
17 | def __init__(self, column, options):
18 | super().__init__(column)
19 | self.column = column
20 | self.input_options = options
21 | self.options = Options(JSONReader.default_options, options)
22 | self.encoder = get_json_encoder(self.options)
23 |
24 | def eval(self, row, schema):
25 | value = self.column.eval(row, schema)
26 | return json.dumps(
27 | value,
28 | cls=self.encoder,
29 | separators=(',', ':')
30 | )
31 |
32 | def args(self):
33 | if self.input_options is None:
34 | return (self.column, )
35 | return (
36 | self.column,
37 | self.input_options
38 | )
39 |
40 |
41 | __all__ = ["StructsToJson"]
42 |
--------------------------------------------------------------------------------
/scripts/benchmark_csv.py:
--------------------------------------------------------------------------------
1 | """Benchmark csv reading performance."""
2 |
3 | import argparse
4 | import random
5 | from string import ascii_uppercase
6 |
7 | import pysparkling
8 |
9 |
10 | def create_csv(filename, lines=10000000, columns=12):
11 | with open(filename, 'w', encoding='utf8') as f:
12 | column_names = ','.join(ascii_uppercase[i] for i in range(columns))
13 | f.write(f'{column_names}\n')
14 |
15 | for _ in range(lines):
16 | values = ','.join(
17 | f'{100 * (c + 1) * random.random():.3f}'
18 | for c in range(columns)
19 | )
20 | f.write(f'{values}\n')
21 |
22 |
23 | def read_csv(filename):
24 | c = pysparkling.Context()
25 | r = c.textFile(filename)
26 | r = r.map(lambda l: l + 'something else')
27 | print(r.count())
28 |
29 |
30 | if __name__ == '__main__':
31 | p = argparse.ArgumentParser(description=__doc__)
32 | p.add_argument('--create', default=False, action='store_true',
33 | help='create csv test file')
34 | p.add_argument('--testfile', default='test.csv',
35 | help='the test file')
36 | args = p.parse_args()
37 |
38 | if args.create:
39 | create_csv(filename=args.testfile)
40 | else:
41 | read_csv(filename=args.testfile)
42 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = W503, E731
3 | exclude = venv*,logo,docs,build
4 | max-line-length = 119
5 |
6 | [tool:pytest]
7 | addopts = --doctest-modules --cov=pysparkling --cov-report=html --cov-branch
8 | testpaths = pysparkling
9 | doctest_optionflags = ALLOW_UNICODE NORMALIZE_WHITESPACE
10 |
11 | [pycodestyle]
12 | max-line-length=119
13 | ignore=E731,E741,W503
14 | exclude=pysparkling/__init__.py
15 |
16 | # See the docstring in versioneer.py for instructions. Note that you must
17 | # re-run 'versioneer.py setup' after changing this section, and commit the
18 | # resulting files.
19 |
20 | [versioneer]
21 | VCS = git
22 | style = pep440
23 | versionfile_source = pysparkling/_version.py
24 | versionfile_build = pysparkling/_version.py
25 | tag_prefix = v
26 | # parentdir_prefix =
27 |
28 | [coverage:run]
29 | branch = True
30 | cover_pylib = False
31 | data_file = reports/.coverage
32 | source = pysparkling
33 | omit = pysparkling/_version.py
34 |
35 | [coverage:report]
36 | show_missing = True
37 | skip_covered = False
38 |
39 | [coverage:html]
40 | directory = reports/coverage
41 |
42 | [isort]
43 | src_paths = pysparkling,scripts
44 | skip_gitignore = True
45 | line_length = 119
46 | order_by_type = False
47 | case_sensitive = False
48 | multi_line_output = 5
49 | force_sort_within_sections = True
50 | skip = versioneer.py
--------------------------------------------------------------------------------
/pysparkling/tests/test_stat_counter.py:
--------------------------------------------------------------------------------
1 | import pysparkling
2 | from pysparkling.sql.functions import col
3 | from pysparkling.sql.types import IntegerType, Row, StructField, StructType
4 | from pysparkling.stat_counter import ColumnStatHelper
5 |
6 |
7 | def test_mean():
8 | d = [1, 4, 9, 160]
9 | s = pysparkling.StatCounter(d)
10 | assert sum(d) / len(d) == s.mean()
11 |
12 |
13 | def test_column_stat_helper():
14 | """
15 | Expected quantile values come from use of org.apache.spark.sql.catalyst.util.QuantileSummaries
16 | """
17 | schema = StructType([StructField("value", IntegerType())])
18 | helper = ColumnStatHelper(col("value"))
19 | for i in range(1, 100001):
20 | helper.merge(Row(value=i), schema)
21 | helper.finalize()
22 | assert helper.count == 100000
23 | assert helper.min == 1
24 | assert helper.max == 100000
25 | assert helper.mean == 50000.5
26 | assert helper.stddev == 28867.65779668774 # sample standard deviation
27 | assert helper.get_quantile(0) == 1
28 | assert helper.get_quantile(0.25) == 24998
29 | assert helper.get_quantile(0.5) == 50000
30 | assert helper.get_quantile(0.75) == 74993
31 | assert helper.get_quantile(1) == 100000
32 |
33 |
34 | if __name__ == '__main__':
35 | test_mean()
36 | test_column_stat_helper()
37 |
--------------------------------------------------------------------------------
/pysparkling/sql/context.py:
--------------------------------------------------------------------------------
1 | from .session import SparkSession
2 |
3 |
4 | class SQLContext:
5 | _instantiatedContext = None
6 |
7 | def __init__(self, sparkContext, sparkSession=None, jsqlContext=None):
8 | self._sc = sparkContext
9 | if sparkSession is None:
10 | sparkSession = SparkSession.builder.getOrCreate()
11 | self.sparkSession = sparkSession
12 | if SQLContext._instantiatedContext is None:
13 | SQLContext._instantiatedContext = self
14 |
15 | @classmethod
16 | def getOrCreate(cls, sc):
17 | """
18 | Get the existing SQLContext or create a new one with given SparkContext.
19 |
20 | :param sc: SparkContext
21 | """
22 | if cls._instantiatedContext is None:
23 | cls(sc, SparkSession(sc), None)
24 | return cls._instantiatedContext
25 |
26 | def newSession(self):
27 | """
28 |         Returns a new SQLContext as a new session, with separate SQLConf,
29 | registered temporary views and UDFs, but shared SparkContext and
30 | table cache.
31 | """
32 | return self.__class__(self._sc, self.sparkSession.newSession())
33 |
34 | def setConf(self, key, value):
35 | """Sets the given Spark SQL configuration property.
36 | """
37 | self.sparkSession.conf.set(key, value)
38 |
--------------------------------------------------------------------------------
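A minimal usage sketch for the SQLContext above, assuming the optional sql dependencies from setup.py are installed; the configuration key is only an illustration:

    import pysparkling
    from pysparkling.sql.context import SQLContext

    sc = pysparkling.Context()
    sqlc = SQLContext.getOrCreate(sc)        # builds a SparkSession-backed context on first use
    sqlc.setConf('some.conf.key', 'value')   # delegated to sparkSession.conf.set
    other = sqlc.newSession()                # separate SQLConf, same SparkContext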
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | test.*
3 | profile.out
4 | .vscode
5 | scripts/textout
6 | tests/textout
7 | checkpoints/
8 |
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | .pytest_cache/
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Vim
18 | *.sw[po]
19 |
20 | # Distribution / packaging
21 | .Python
22 | env/
23 | .env/
24 | venv*/
25 | pypy/
26 | pypy3/
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | *.egg-info/
39 | .installed.cfg
40 | *.egg
41 |
42 | # PyInstaller
43 | # Usually these files are written by a python script from a template
44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest
46 | *.spec
47 |
48 | # Installer logs
49 | pip-log.txt
50 | pip-delete-this-directory.txt
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *,cover
61 |
62 | # Translations
63 | *.mo
64 | *.pot
65 |
66 | # Django stuff:
67 | *.log
68 |
69 | # Sphinx documentation
70 | docs/sphinx/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Spark data files
76 | *.crc
77 |
78 | # IPython
79 | *.ipynb.syncdoc
80 | .ipynb_checkpoints
81 | .ipython-daemon.json
82 |
83 | /.idea/
84 | /reports/
85 | /pysparkling/tests/20news-19997.tar.gz
86 |
87 | /scripts_private/
88 |
--------------------------------------------------------------------------------
/pysparkling/streaming/filestream.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from ..fileio import File
4 | from ..rdd import EmptyRDD
5 |
6 | log = logging.getLogger(__name__)
7 |
8 |
9 | class FileTextStreamDeserializer:
10 | def __init__(self, context):
11 | self.context = context
12 |
13 | def __call__(self, path):
14 | if path is None:
15 | return EmptyRDD(self.context)
16 |
17 | return self.context.textFile(path)
18 |
19 |
20 | class FileBinaryStreamDeserializer:
21 | def __init__(self, context, recordLength=None):
22 | self.context = context
23 | self.record_length = recordLength
24 |
25 | def __call__(self, path):
26 | if path is None:
27 | return EmptyRDD(self.context)
28 |
29 | return self.context.binaryRecords(
30 | path, recordLength=self.record_length)
31 |
32 |
33 | class FileStream:
34 | def __init__(self, path, process_all=False):
35 | self.path = path
36 | self.files_done = set()
37 | if not process_all:
38 | self.files_done = set(File.resolve_filenames(self.path))
39 |
40 | def get(self):
41 | files = [fn for fn in File.resolve_filenames(self.path)
42 | if fn not in self.files_done]
43 | if not files:
44 | return None
45 |
46 | self.files_done |= set(files)
47 | return ','.join(files)
48 |
49 | def stop(self):
50 | pass
51 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/csvs.py:
--------------------------------------------------------------------------------
1 | from ..casts import NO_TIMESTAMP_CONVERSION
2 | from ..internal_utils.options import Options
3 | from ..internal_utils.readers.csvreader import csv_record_to_row, CSVReader
4 | from ..internal_utils.readers.utils import guess_schema_from_strings
5 | from ..utils import AnalysisException
6 | from .expressions import Expression
7 |
8 | sql_csv_function_options = dict(
9 | dateFormat=NO_TIMESTAMP_CONVERSION,
10 | timestampFormat=NO_TIMESTAMP_CONVERSION,
11 | )
12 |
13 |
14 | class SchemaOfCsv(Expression):
15 | pretty_name = "schema_of_csv"
16 |
17 | def __init__(self, column, options):
18 | super().__init__(column)
19 | self.column = column
20 | self.input_options = options
21 | self.options = Options(CSVReader.default_options, sql_csv_function_options, options)
22 |
23 | def eval(self, row, schema):
24 | value = self.column.eval(row, schema)
25 | if not isinstance(value, str) or value == "":
26 | raise AnalysisException(
27 | "type mismatch: The input csv should be a string literal and not null; "
28 | f"however, got {value}."
29 | )
30 | record_as_row = csv_record_to_row(value, self.options)
31 | schema = guess_schema_from_strings(record_as_row.__fields__, [record_as_row], self.options)
32 | return schema.simpleString()
33 |
34 | def args(self):
35 | return (self.column,)
36 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/covariance_aggregations.py:
--------------------------------------------------------------------------------
1 | from ....stat_counter import CovarianceCounter
2 | from .aggregations import Aggregation
3 |
4 |
5 | class CovarianceStatAggregation(Aggregation):
6 | def __init__(self, column1, column2):
7 | super().__init__(column1, column2)
8 | self.column1 = column1
9 | self.column2 = column2
10 | self.stat_helper = CovarianceCounter(method="pearson")
11 |
12 | def merge(self, row, schema):
13 | self.stat_helper.add(row.eval(self.column1, schema), row.eval(self.column2, schema))
14 |
15 | def mergeStats(self, other, schema):
16 | self.stat_helper.merge(other)
17 |
18 | def eval(self, row, schema):
19 | raise NotImplementedError
20 |
21 | def args(self):
22 | return (
23 | self.column1,
24 | self.column2
25 | )
26 |
27 |
28 | class Corr(CovarianceStatAggregation):
29 | pretty_name = "corr"
30 |
31 | def eval(self, row, schema):
32 | return self.stat_helper.pearson_correlation
33 |
34 |
35 | class CovarSamp(CovarianceStatAggregation):
36 | pretty_name = "covar_samp"
37 |
38 | def eval(self, row, schema):
39 | return self.stat_helper.covar_samp
40 |
41 |
42 | class CovarPop(CovarianceStatAggregation):
43 | pretty_name = "covar_pop"
44 |
45 | def eval(self, row, schema):
46 | return self.stat_helper.covar_pop
47 |
48 |
49 | __all__ = ["Corr", "CovarSamp", "CovarPop"]
50 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_cache.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | import pysparkling
5 |
6 |
7 | class Manip:
8 | def __init__(self):
9 | self.count = 0
10 |
11 | def trivial_manip_with_debug(self, e):
12 | self.count += 1
13 | print(f'manipulating {e}')
14 | return e
15 |
16 |
17 | def test_cache_empty_partition():
18 | m = Manip()
19 |
20 | c = pysparkling.Context()
21 | rdd = c.parallelize(range(10), 2)
22 | rdd = rdd.map(m.trivial_manip_with_debug)
23 | rdd = rdd.filter(lambda e: e > 6).cache()
24 | print(rdd.collect())
25 | print(rdd.collect())
26 |
27 | print(f'count of map executions: {m.count}')
28 | assert m.count == 10
29 |
30 |
31 | def test_timed_cache():
32 | m = Manip()
33 |
34 | # create a timed cache manager
35 | cm = pysparkling.TimedCacheManager(timeout=1.0)
36 |
37 | # create a cache entry
38 | c = pysparkling.Context(cache_manager=cm)
39 | rdd = c.parallelize(range(10), 2)
40 | rdd = rdd.map(m.trivial_manip_with_debug).cache()
41 | print(rdd.collect())
42 | # make sure the cache is working
43 | count_before = m.count
44 | print(rdd.collect())
45 | count_after = m.count
46 | assert count_before == count_after
47 |
48 | # wait to have the cache expire
49 | time.sleep(1.5)
50 | cm.gc()
51 | print(rdd.collect())
52 | assert m.count > count_after
53 |
54 |
55 | if __name__ == '__main__':
56 | logging.basicConfig(level=logging.DEBUG)
57 | # test_cache_empty_partition()
58 | test_timed_cache()
59 |
--------------------------------------------------------------------------------
/pysparkling/samplers.py:
--------------------------------------------------------------------------------
1 | import math
2 | import random
3 |
4 | try:
5 | import numpy
6 | except ImportError:
7 | numpy = None
8 |
9 |
10 | def pysparkling_poisson(lambda_):
11 | if lambda_ == 0.0:
12 | return 0
13 |
14 | n = 0
15 | exp_neg_lambda = math.exp(-lambda_)
16 | prod = 1.0
17 | while True:
18 | prod *= random.random()
19 | if prod > exp_neg_lambda:
20 | n += 1
21 | else:
22 | return n
23 |
24 |
25 | def poisson(lambda_):
26 | if numpy is not None:
27 | return numpy.random.poisson(lambda_)
28 | return pysparkling_poisson(lambda_)
29 |
30 |
31 | class BernoulliSampler:
32 | def __init__(self, expectation):
33 | self.expectation = expectation
34 |
35 | def __call__(self, sample):
36 | return 1 if random.random() < self.expectation else 0
37 |
38 |
39 | class PoissonSampler:
40 | def __init__(self, expectation):
41 | self.expectation = expectation
42 |
43 | def __call__(self, sample):
44 | return poisson(self.expectation)
45 |
46 |
47 | class BernoulliSamplerPerKey:
48 | def __init__(self, expectations):
49 | self.expectations = expectations
50 |
51 | def __call__(self, sample):
52 | key = sample[0]
53 | return 1 if random.random() < self.expectations.get(key, 0.0) else 0
54 |
55 |
56 | class PoissonSamplerPerKey:
57 | def __init__(self, expectations):
58 | self.expectations = expectations
59 |
60 | def __call__(self, sample):
61 | key = sample[0]
62 | return poisson(self.expectations.get(key, 0.0))
63 |
--------------------------------------------------------------------------------
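A small illustration of the samplers above: each call returns how many copies of the given element to emit (0 or 1 for the Bernoulli variants, a Poisson draw otherwise). The seed and the expectations are arbitrary:

    import random

    from pysparkling.samplers import (BernoulliSampler, BernoulliSamplerPerKey,
                                      PoissonSampler)

    random.seed(0)
    keep = BernoulliSampler(expectation=0.3)           # roughly 30% of elements kept
    copies = PoissonSampler(expectation=2.0)           # Poisson-distributed copy count
    per_key = BernoulliSamplerPerKey({'a': 1.0, 'b': 0.0})

    print([keep(x) for x in range(10)])                # e.g. [1, 0, 0, 1, 0, ...]
    print([copies(x) for x in range(5)])               # e.g. [2, 1, 3, 2, 0]
    print(per_key(('a', 42)), per_key(('b', 42)))      # 1 0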
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Build and upload
2 |
3 | # Build on every branch push, tag push, and pull request change:
4 | # on: [push, pull_request]
5 | # Alternatively, to publish when a (published) GitHub Release is created, use the following:
6 | on:
7 | push:
8 | branches:
9 | - master
10 | pull_request:
11 | branches:
12 | - master
13 | release:
14 | types:
15 | - published
16 |
17 | jobs:
18 | build_sdist:
19 | name: Build Python source distribution
20 | runs-on: ubuntu-latest
21 | steps:
22 | - uses: actions/checkout@v3
23 | with:
24 | fetch-depth: 0
25 |
26 | - uses: actions/setup-python@v4
27 | name: Install Python
28 | with:
29 | python-version: '3.7'
30 |
31 | - name: Build sdist
32 | run: python setup.py sdist
33 |
34 | - uses: actions/upload-artifact@v3
35 | with:
36 | path: dist/*.tar.gz
37 |
38 | upload_pypi:
39 | needs: [build_sdist]
40 | runs-on: ubuntu-latest
41 | # upload to PyPI on every tag starting with 'v'
42 | # if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
43 | # alternatively, to publish when a GitHub Release is created, use the following rule:
44 | if: github.event_name == 'release' && github.event.action == 'published'
45 | steps:
46 | - uses: actions/download-artifact@v3
47 | with:
48 | name: artifact
49 | path: dist
50 |
51 | - uses: pypa/gh-action-pypi-publish@master
52 | with:
53 | user: __token__
54 | password: ${{ secrets.pypi_token }}
55 | # To test: repository_url: https://test.pypi.org/legacy/
56 |
--------------------------------------------------------------------------------
/docs/sphinx/version_index/index.html:
--------------------------------------------------------------------------------
(HTML markup and inline scripts were stripped during extraction; the only recoverable visible text is "Databench Docs" and "pysparkling Docs".)
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ${{ matrix.os }}
9 | strategy:
10 | matrix:
11 | os: [ ubuntu-latest, macos-latest, windows-latest ]
12 | python: [ 3.7, 3.8, 3.9, "3.10", "3.11" ]
13 |
14 | steps:
15 | - uses: actions/checkout@v3
16 | with:
17 | fetch-depth: 0
18 |
19 | - name: Set up Python ${{ matrix.python }}
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: ${{ matrix.python }}
23 |
24 | - name: Install
25 | run: |
26 | python -m pip install --upgrade pip setuptools
27 | python -m pip install -e ".[tests,scripts]"
28 |
29 | - name: Print environment
30 | run: |
31 | python -m pip freeze
32 | python --version
33 | python -c "import pysparkling; print(pysparkling.__version__)"
34 |
35 | - name: Check if import order is fine
36 | run: |
37 | isort . --check --diff
38 |
39 | - name: Test pysparkling/rdd.py
40 | run: python -m pytest pysparkling/rdd.py -vv
41 |
42 | - name: Test pysparkling/tests
43 | if: matrix.os == 'ubuntu-latest' # because of timing sensitivity in stream tests
44 | run: python -m pytest pysparkling/tests -vv
45 |
46 | - name: Install SQL Dependencies
47 | run: |
48 | python -m pip install -e ".[sql]"
49 |
50 | - name: Lint
51 | if: matrix.python != '3.9'
52 | run: pylint pysparkling scripts --disable=fixme
53 |
54 | - name: pycodestyle
55 | run: python -m pycodestyle pysparkling scripts
56 |
57 | - name: Test All
58 | if: matrix.os == 'ubuntu-latest' # because of timing sensitivity in stream tests
59 | run: python -m pytest -vv
60 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_streaming_queue.py:
--------------------------------------------------------------------------------
1 | import tornado.testing
2 |
3 | import pysparkling
4 |
5 |
6 | class TestCount(tornado.testing.AsyncTestCase):
7 |
8 | def test_count(self):
9 | sc = pysparkling.Context()
10 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
11 |
12 | result = []
13 | (
14 | ssc.queueStream([range(20), ['a', 'b'], ['c']])
15 | .count()
16 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
17 | )
18 |
19 | ssc.start()
20 | ssc.awaitTermination(timeout=0.35)
21 | self.assertEqual(sum(result), 23)
22 |
23 | def test_groupByKey(self):
24 | sc = pysparkling.Context()
25 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
26 |
27 | result = []
28 | (
29 | ssc.queueStream([[('a', 5), ('b', 8), ('a', 2)],
30 | [('a', 2), ('b', 3)]])
31 | .groupByKey().mapPartitions(sorted).mapValues(sorted)
32 | .foreachRDD(lambda rdd: result.append(rdd.collect()))
33 | )
34 |
35 | ssc.start()
36 | ssc.awaitTermination(timeout=0.25)
37 | self.assertEqual(
38 | result, [[('a', [2, 5]), ('b', [8])], [('a', [2]), ('b', [3])]])
39 |
40 | def test_mapValues(self):
41 | sc = pysparkling.Context()
42 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
43 |
44 | result = []
45 | (
46 | ssc.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]])
47 | .mapValues(sorted)
48 | .foreachRDD(lambda rdd: result.append(rdd.collect()))
49 | )
50 |
51 | ssc.start()
52 | ssc.awaitTermination(timeout=0.15)
53 | self.assertEqual(result, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]])
54 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/http.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO, StringIO
2 | import logging
3 |
4 | from ...exceptions import ConnectionException, FileSystemNotSupported
5 | from .file_system import FileSystem
6 |
7 | log = logging.getLogger(__name__)
8 |
9 | try:
10 | import requests
11 | except ImportError:
12 | requests = None
13 |
14 |
15 | class Http(FileSystem):
16 | """:class:`.FileSystem` implementation for HTTP."""
17 |
18 | def __init__(self, file_name):
19 | if requests is None:
20 | raise FileSystemNotSupported(
21 | 'http not supported. Install "requests".'
22 | )
23 |
24 | super().__init__(file_name)
25 | self.headers = None
26 |
27 | @staticmethod
28 | def resolve_filenames(expr):
29 | if Http(expr).exists():
30 | return [expr]
31 | return []
32 |
33 | def exists(self):
34 | r = requests.head(self.file_name, allow_redirects=True)
35 | return r.status_code == 200
36 |
37 | def load(self):
38 | log.debug('Http GET request for %s.', self.file_name)
39 | r = requests.get(self.file_name, headers=self.headers)
40 | if r.status_code != 200:
41 | raise ConnectionException()
42 | return BytesIO(r.content)
43 |
44 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
45 | # warning: encoding and encoding_errors are ignored
46 | log.debug('Http GET request for %s.', self.file_name)
47 | r = requests.get(self.file_name, headers=self.headers)
48 | if r.status_code != 200:
49 | raise ConnectionException()
50 | return StringIO(r.text)
51 |
52 | def dump(self, stream):
53 | log.debug('Dump to %s with http PUT.', self.file_name)
54 | requests.put(self.file_name, data=b''.join(stream))
55 | return self
56 |
--------------------------------------------------------------------------------
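A hedged sketch of using the Http file system above directly; it needs the optional requests dependency, and the URL is only a placeholder:

    from pysparkling.fileio.fs import Http

    f = Http('https://example.com/data.txt')   # placeholder URL
    if f.exists():                             # HEAD request, true on a 200 response
        stream = f.load()                      # GET request, returns an io.BytesIO
        print(stream.read()[:80])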
/docs/sphinx/api_fileio.rst:
--------------------------------------------------------------------------------
1 | .. _api_fileio:
2 |
3 |
4 | fileio
5 | ------
6 |
7 | .. currentmodule:: pysparkling
8 |
9 | The functionality provided by this module is used in :func:`Context.textFile`
10 | for reading and in :func:`RDD.saveAsTextFile` for writing.
11 |
12 | .. currentmodule:: pysparkling.fileio
13 |
14 | You can use this submodule with :func:`File.dump`, :func:`File.load` and
15 | :func:`File.exists` to read, write and check for the existence of a file.
16 | All methods transparently handle various schemes (for example ``http://``,
17 | ``s3://`` and ``file://``) and compression/decompression of ``.gz`` and
18 | ``.bz2`` files (among others).
19 |
20 |
21 | .. autoclass:: pysparkling.fileio.File
22 | :members:
23 |
24 | .. autoclass:: pysparkling.fileio.TextFile
25 | :members:
26 |
27 |
28 | File System
29 | ^^^^^^^^^^^
30 |
31 | .. autoclass:: pysparkling.fileio.fs.FileSystem
32 | :members:
33 |
34 | .. autoclass:: pysparkling.fileio.fs.Local
35 | :members:
36 |
37 | .. autoclass:: pysparkling.fileio.fs.GS
38 | :members:
39 |
40 | .. autoclass:: pysparkling.fileio.fs.Hdfs
41 | :members:
42 |
43 | .. autoclass:: pysparkling.fileio.fs.Http
44 | :members:
45 |
46 | .. autoclass:: pysparkling.fileio.fs.S3
47 | :members:
48 |
49 |
50 | Codec
51 | ^^^^^
52 |
53 | .. autoclass:: pysparkling.fileio.codec.Codec
54 | :members:
55 |
56 | .. autoclass:: pysparkling.fileio.codec.Bz2
57 | :members:
58 |
59 | .. autoclass:: pysparkling.fileio.codec.Gz
60 | :members:
61 |
62 | .. autoclass:: pysparkling.fileio.codec.Lzma
63 | :members:
64 |
65 | .. autoclass:: pysparkling.fileio.codec.SevenZ
66 | :members:
67 |
68 | .. autoclass:: pysparkling.fileio.codec.Tar
69 | :members:
70 |
71 | .. autoclass:: pysparkling.fileio.codec.TarGz
72 | :members:
73 |
74 | .. autoclass:: pysparkling.fileio.codec.TarBz2
75 | :members:
76 |
77 | .. autoclass:: pysparkling.fileio.codec.Zip
78 | :members:
79 |
--------------------------------------------------------------------------------
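To make the page above concrete, a minimal sketch of the File interface it documents (the path is a placeholder; the s3:// and http:// forms need the corresponding optional dependencies):

    from pysparkling.fileio import File

    f = File('data/input.csv.gz')      # placeholder path; codec chosen from the extension
    if f.exists():
        stream = f.load()              # io.BytesIO with the decompressed content
        print(len(stream.read()))

    # the same calls work across schemes, e.g. File('s3://bucket/key') or File('http://host/file')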
/pysparkling/fileio/textfile.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO, StringIO, TextIOWrapper
2 | import logging
3 |
4 | from . import codec
5 | from .file import File
6 | from .fs.file_system import FileSystem
7 |
8 | log = logging.getLogger(__name__)
9 |
10 |
11 | class TextFile(File):
12 | """Derived from :class:`File`.
13 |
14 | :param file_name: Any text file name.
15 | """
16 |
17 | def load(self, encoding='utf8', encoding_errors='ignore'): # pylint: disable=arguments-differ
18 | """Load the data from a file.
19 |
20 | :param str encoding: The character encoding of the file.
21 | :param str encoding_errors: How to handle encoding errors.
22 | :rtype: io.StringIO
23 | """
24 | # pylint: disable=comparison-with-callable
25 | if isinstance(self.codec, codec.NoCodec) and \
26 | self.fs.load_text != FileSystem.load_text:
27 | stream = self.fs.load_text(encoding, encoding_errors)
28 | else:
29 | stream = self.fs.load()
30 | stream = self.codec.decompress(stream)
31 | stream = TextIOWrapper(stream, encoding, encoding_errors)
32 | return stream
33 |
34 | def dump(self, stream=None, encoding='utf8', encoding_errors='ignore'): # pylint: disable=arguments-differ
35 | """Writes a stream to a file.
36 |
37 | :param stream:
38 | An ``io.StringIO`` instance. A ``str`` is also possible and
39 |             gets converted to ``io.StringIO``.
40 |
41 | :param encoding: (optional)
42 | The character encoding of the file.
43 |
44 | :rtype: TextFile
45 | """
46 | if stream is None:
47 | stream = StringIO()
48 |
49 | if isinstance(stream, str):
50 | stream = StringIO(stream)
51 |
52 | stream = self.codec.compress(
53 | BytesIO(stream.read().encode(encoding, encoding_errors))
54 | )
55 | self.fs.dump(stream)
56 |
57 | return self
58 |
--------------------------------------------------------------------------------
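A small round trip through TextFile above; the .gz suffix exercises the codec branch in load() and dump(), and the path is only a placeholder:

    from pysparkling.fileio import TextFile

    t = TextFile('/tmp/example.txt.gz')   # placeholder path; gzip codec chosen by suffix
    t.dump('hello\nworld\n')              # a str is wrapped into a StringIO and compressed
    print(t.load().read())                # 'hello\nworld\n'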
/pysparkling/sql/expressions/explodes.py:
--------------------------------------------------------------------------------
1 | from ..types import DataType, IntegerType, StructField
2 | from .expressions import UnaryExpression
3 |
4 |
5 | class Explode(UnaryExpression):
6 | def __init__(self, column):
7 | super().__init__(column)
8 | self.column = column
9 |
10 | @property
11 | def may_output_multiple_rows(self):
12 | return True
13 |
14 | def eval(self, row, schema):
15 | values = self.column.eval(row, schema)
16 | if not values:
17 | return []
18 | return [[value] for value in values]
19 |
20 | def __str__(self):
21 | return "col"
22 |
23 |
24 | class ExplodeOuter(Explode):
25 | def eval(self, row, schema):
26 | values = self.column.eval(row, schema)
27 | if not values:
28 | return [[None]]
29 | return [[value] for value in values]
30 |
31 | def __str__(self):
32 | return "col"
33 |
34 |
35 | class PosExplode(UnaryExpression):
36 | def eval(self, row, schema):
37 | values = self.column.eval(row, schema)
38 | if not values:
39 | return []
40 | return list(enumerate(values))
41 |
42 | def __str__(self):
43 | return "posexplode"
44 |
45 | @property
46 | def may_output_multiple_rows(self):
47 | return True
48 |
49 | @property
50 | def may_output_multiple_cols(self):
51 | return True
52 |
53 | def output_fields(self, schema):
54 | return [
55 | StructField("pos", IntegerType(), False),
56 | StructField("col", DataType(), False)
57 | ]
58 |
59 |
60 | class PosExplodeOuter(PosExplode):
61 | def eval(self, row, schema):
62 | values = self.column.eval(row, schema)
63 | if not values:
64 | return [[None, None]]
65 | return list(enumerate(values))
66 |
67 | def __str__(self):
68 | return "posexplode_outer"
69 |
70 |
71 | __all__ = ["PosExplodeOuter", "PosExplode", "ExplodeOuter", "Explode"]
72 |
--------------------------------------------------------------------------------
/pysparkling/broadcast.py:
--------------------------------------------------------------------------------
1 | # A large part of this module is extracted from its PySpark counterpart at
2 | # https://spark.apache.org/docs/1.5.0/api/python/_modules/pyspark/broadcast.html
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | __all__ = ['Broadcast']
21 |
22 |
23 | class Broadcast:
24 | """
25 | A broadcast variable created with ``b = sc.broadcast(0)``.
26 | Access its value through ``b.value``.
27 |
28 | Examples:
29 |
30 | >>> from pysparkling import Context
31 | >>> sc = Context()
32 | >>> b = sc.broadcast([1, 2, 3, 4, 5])
33 | >>> b.value
34 | [1, 2, 3, 4, 5]
35 | >>> sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
36 | [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
37 | """
38 | def __init__(self, sc=None, value=None):
39 | self._value = value
40 |
41 | @property
42 | def value(self):
43 |         """Returns the broadcast value."""
44 | return self._value
45 |
46 |
47 | if __name__ == "__main__":
48 | #
49 | # Execute doctests with
50 | #
51 |     #     $ python -m pysparkling.broadcast -v
52 | #
53 | import doctest
54 | import sys
55 |
56 | failure_count, _ = doctest.testmod()
57 | if failure_count:
58 | sys.exit(-1)
59 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/options.py:
--------------------------------------------------------------------------------
1 | class Options(dict):
2 | """
3 |     A case-insensitive dict, which can be initialized from multiple dicts
4 |     and whose values can be accessed through attribute syntax.
5 |
6 |     It also stores "false" and "true" strings as booleans.
7 |
8 | e.g.:
9 |
10 | >>> default_options = dict(sep=",", samplingRatio=None)
11 | >>> requested_options = dict(Sep="|")
12 | >>> o=Options({"format": "json", "lineSep": ","}, Format="csv")
13 | >>> o.format, o.linesep
14 | ('csv', ',')
15 | >>> o.UndefinedSetting
16 | Traceback (most recent call last):
17 | ...
18 | KeyError: 'undefinedsetting'
19 | """
20 |
21 | def __init__(self, *args, **kwargs):
22 | d = {
23 | key.lower(): value
24 | for arg in args
25 | if arg is not None
26 | for key, value in arg.items()
27 | }
28 | d.update({
29 | key.lower(): value
30 | for key, value in kwargs.items()
31 | })
32 | super().__init__(d)
33 |
34 | def setdefault(self, k, default=None):
35 | return super().setdefault(k.lower(), default)
36 |
37 | @staticmethod
38 | def fromkeys(seq, value=None):
39 | return Options({k.lower(): value for k in seq})
40 |
41 | def __getitem__(self, k):
42 | return super().__getitem__(k.lower())
43 |
44 | def __setitem__(self, k, v):
45 | if isinstance(v, str) and v.lower() in ("true", "false"):
46 | v = (v.lower() == "true")
47 | super().__setitem__(k.lower(), v)
48 |
49 | def __delitem__(self, k):
50 | super().__delitem__(k.lower())
51 |
52 | def get(self, k, *args, **kwargs):
53 | return super().get(k.lower(), *args, **kwargs)
54 |
55 | def __contains__(self, o):
56 | if not isinstance(o, str):
57 | return False
58 | return super().__contains__(o.lower())
59 |
60 | def __getattr__(self, item):
61 | if not item.startswith("_"):
62 | return self[item.lower()]
63 | return getattr(super(), item)
64 |
--------------------------------------------------------------------------------
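Beyond the doctest above, two behaviours of Options worth spelling out: every dict operation lower-cases its key, and assigning the strings 'true'/'false' stores a real bool. A short sketch with illustrative option names:

    from pysparkling.sql.internal_utils.options import Options

    defaults = dict(sep=',', header=False)
    o = Options(defaults, Sep='|')        # later dicts/kwargs win; keys are lower-cased
    print(o['SEP'], o.sep, 'SeP' in o)    # | | True
    o['InferSchema'] = 'true'             # stored as the boolean True
    print(o.inferschema)                  # True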
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | import versioneer
3 |
4 | setup(
5 | name='pysparkling',
6 | version=versioneer.get_version(),
7 | cmdclass=versioneer.get_cmdclass(),
8 | packages=find_packages(),
9 | license='MIT',
10 | description='Pure Python implementation of the Spark RDD interface.',
11 | long_description=open('README.rst', 'r', encoding='utf8').read(),
12 | author='pysparkling contributors',
13 | url='https://github.com/svenkreiss/pysparkling',
14 |
15 | install_requires=[
16 | 'pytz>=2019.3',
17 | 'python-dateutil>=2.8.0'
18 | ],
19 | extras_require={
20 | 'hdfs': ['hdfs>=2.0.0'],
21 | 'http': ['requests>=2.6.0'],
22 | 'performance': ['matplotlib>=1.5.3'],
23 | 's3': ['boto>=2.36.0'],
24 | 'streaming': ['tornado>=4.3'],
25 | 'sql': [
26 | 'numpy',
27 | 'pandas>=0.23.2',
28 | ],
29 | 'tests': [
30 | 'backports.tempfile==1.0rc1',
31 | 'cloudpickle>=0.1.0',
32 | 'isort',
33 | 'pylint',
34 | 'pylzma',
35 | 'memory-profiler>=0.47',
36 | 'pycodestyle',
37 | 'pytest',
38 | 'pytest-cov',
39 | 'requests>=2.6.0',
40 | 'tornado>=4.3',
41 | ],
42 | 'scripts': [
43 | 'ipyparallel',
44 | 'pyspark',
45 | 'matplotlib',
46 | ]
47 | },
48 |
49 | classifiers=[
50 | 'Development Status :: 4 - Beta',
51 | 'Intended Audience :: Developers',
52 | 'Natural Language :: English',
53 | 'License :: OSI Approved :: MIT License',
54 | 'Operating System :: OS Independent',
55 | 'Programming Language :: Python',
56 | 'Programming Language :: Python :: 3.7',
57 | 'Programming Language :: Python :: 3.8',
58 | 'Programming Language :: Python :: 3.9',
59 | 'Programming Language :: Python :: 3.10',
60 | 'Programming Language :: Python :: 3.11',
61 | 'Programming Language :: Python :: Implementation :: PyPy',
62 | ]
63 | )
64 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_streaming_files.py:
--------------------------------------------------------------------------------
1 | import tornado.testing
2 |
3 | import pysparkling
4 |
5 |
6 | class TextFile(tornado.testing.AsyncTestCase):
7 |
8 | def test_connect(self):
9 | sc = pysparkling.Context()
10 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
11 |
12 | result = []
13 | (
14 | ssc.textFileStream('LICENS*', process_all=True)
15 | .count()
16 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
17 | )
18 |
19 | ssc.start()
20 | ssc.awaitTermination(timeout=0.3)
21 | self.assertEqual(sum(result), 44)
22 |
23 | def test_save(self):
24 | sc = pysparkling.Context()
25 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
26 |
27 | (
28 | ssc.textFileStream('LICENS*')
29 | .count()
30 | .saveAsTextFiles('tests/textout/')
31 | )
32 |
33 | def test_save_gz(self):
34 | sc = pysparkling.Context()
35 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
36 |
37 | (
38 | ssc.textFileStream('LICENS*')
39 | .count()
40 | .saveAsTextFiles('tests/textout/', suffix='.gz')
41 | )
42 |
43 |
44 | class BinaryFile(tornado.testing.AsyncTestCase):
45 |
46 | def test_read_file(self):
47 | sc = pysparkling.Context()
48 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
49 |
50 | result = []
51 | (
52 | ssc.fileBinaryStream('LICENS*', process_all=True)
53 | .count()
54 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
55 | )
56 |
57 | ssc.start()
58 | ssc.awaitTermination(timeout=0.3)
59 | self.assertEqual(sum(result), 1)
60 |
61 | def test_read_chunks(self):
62 | sc = pysparkling.Context()
63 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
64 |
65 | result = []
66 | (
67 | ssc.fileBinaryStream('LICENS*', recordLength=40, process_all=True)
68 | .count()
69 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
70 | )
71 |
72 | ssc.start()
73 | ssc.awaitTermination(timeout=0.3)
74 | self.assertEqual(sum(result), 54)
75 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015-2020 pysparkling contributors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
24 | -----------------------------------------------------------------------------
25 |
26 |
27 | Parts of the files pysparkling/accumulators.py, pysparkling/broadcast.py,
28 | pysparkling/rdd.py, pysparkling/storagelevel.py and pysparkling/sql were
29 | extracted from their PySpark counterparts under the following license:
30 |
31 | Licensed to the Apache Software Foundation (ASF) under one or more
32 | contributor license agreements. See the NOTICE file distributed with
33 | this work for additional information regarding copyright ownership.
34 | The ASF licenses this file to You under the Apache License, Version 2.0
35 | (the "License"); you may not use this file except in compliance with
36 | the License. You may obtain a copy of the License at
37 |
38 | http://www.apache.org/licenses/LICENSE-2.0
39 |
40 | Unless required by applicable law or agreed to in writing, software
41 | distributed under the License is distributed on an "AS IS" BASIS,
42 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
43 | See the License for the specific language governing permissions and
44 | limitations under the License.
45 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/file_system.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import typing as t
4 |
5 | log = logging.getLogger(__name__)
6 |
7 |
8 | class FileSystem:
9 | """Interface class for the file system.
10 |
11 | :param str file_name: File name.
12 | """
13 | def __init__(self, file_name: str):
14 | self.file_name: str = file_name
15 |
16 | @staticmethod
17 | def resolve_filenames(expr: str) -> t.List[str]:
18 | """Resolve the given glob-like expression to filenames.
19 |
20 | :rtype: list
21 | """
22 | log.error('Cannot resolve: %s', expr)
23 | raise NotImplementedError
24 |
25 | @staticmethod
26 | def resolve_content(expr: str) -> t.List[str]:
27 | """Return all the files matching expr or in a folder matching expr
28 |
29 | :rtype: list
30 | """
31 | log.error('Cannot resolve: %s', expr)
32 | raise NotImplementedError
33 |
34 | def exists(self) -> bool:
35 | """Check whether the given file_name exists.
36 |
37 | :rtype: bool
38 | """
39 | log.warning('Could not determine whether %s exists due to unhandled scheme.', self.file_name)
40 | raise NotImplementedError
41 |
42 | def load(self) -> io.BytesIO:
43 | """Load a file to a stream."""
44 | log.error('Cannot load: %s', self.file_name)
45 | raise NotImplementedError
46 |
47 | def load_text(self, encoding: str = 'utf8', encoding_errors: str = 'ignore') -> io.StringIO:
48 | """Load a file to a stream.
49 |
50 | :param str encoding: Text encoding.
51 | :param str encoding_errors: How to handle encoding errors.
52 | """
53 | log.error('Cannot load: %s', self.file_name)
54 | raise NotImplementedError
55 |
56 | def dump(self, stream: io.BytesIO):
57 | """Dump a stream to a file.
58 |
59 |         :param io.BytesIO stream: Input stream.
60 | """
61 | log.error('Cannot dump: %s', self.file_name)
62 | raise NotImplementedError
63 |
64 | def make_public(self, recursive=False):
65 | """Make the file public (only on some file systems).
66 |
67 | :param bool recursive: Recurse.
68 | :rtype: FileSystem
69 | """
70 | log.warning('Cannot make %s public.', self.file_name)
71 | raise NotImplementedError
72 |
--------------------------------------------------------------------------------
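FileSystem above is only an interface; every method logs and raises NotImplementedError. A purely illustrative in-memory backend, sketching the minimum a concrete subclass has to provide:

    import fnmatch
    import io

    from pysparkling.fileio.fs import FileSystem


    class InMemory(FileSystem):
        """Toy backend that keeps file contents in a class-level dict."""

        _store = {}

        @staticmethod
        def resolve_filenames(expr):
            # glob-like matching against the stored names
            return sorted(n for n in InMemory._store if fnmatch.fnmatch(n, expr))

        def exists(self):
            return self.file_name in InMemory._store

        def load(self):
            return io.BytesIO(InMemory._store[self.file_name])

        def dump(self, stream):
            InMemory._store[self.file_name] = stream.read()
            return self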
/pysparkling/sql/expressions/fields.py:
--------------------------------------------------------------------------------
1 | from ..types import StructField
2 | from ..utils import AnalysisException
3 | from .expressions import Expression
4 |
5 |
6 | class FieldAsExpression(Expression):
7 | def __init__(self, field):
8 | super().__init__()
9 | self.field = field
10 |
11 | def eval(self, row, schema):
12 | return row[find_position_in_schema(schema, self.field)]
13 |
14 | def __str__(self):
15 | return self.field.name
16 |
17 | def output_fields(self, schema):
18 | return [self.field]
19 |
20 | def args(self):
21 | return (self.field,)
22 |
23 |
24 | def find_position_in_schema(schema, expr):
25 | if isinstance(expr, str):
26 | show_id = False
27 | field_name = expr
28 | matches = set(i for i, field in enumerate(schema.fields) if field_name == field.name)
29 | elif isinstance(expr, FieldAsExpression):
30 | return find_position_in_schema(schema, expr.field)
31 | elif isinstance(expr, StructField) and hasattr(expr, "id"):
32 | show_id = True
33 | field_name = format_field(expr, show_id=show_id)
34 | matches = set(i for i, field in enumerate(schema.fields) if expr.id == field.id)
35 | else:
36 | if isinstance(expr, StructField):
37 | expression = f"Unbound field {expr.name}"
38 | else:
39 | expression = f"Expression type '{type(expr)}'"
40 |
41 | raise NotImplementedError(
42 | f"{expression} is not supported. "
43 | "As a user you should not see this error, feel free to report a bug at "
44 | "https://github.com/svenkreiss/pysparkling/issues"
45 | )
46 |
47 | return get_checked_matches(matches, field_name, schema, show_id)
48 |
49 |
50 | def get_checked_matches(matches, field_name, schema, show_id):
51 | if not matches:
52 | raise AnalysisException(f"Unable to find the column '{field_name}'"
53 | f" among {format_schema(schema, show_id)}")
54 |
55 | if len(matches) > 1:
56 | raise AnalysisException(
57 | f"Reference '{field_name}' is ambiguous, found {len(matches)} columns matching it."
58 | )
59 |
60 | return matches.pop()
61 |
62 |
63 | def format_schema(schema, show_id):
64 | return [format_field(field, show_id=show_id) for field in schema.fields]
65 |
66 |
67 | def format_field(field, show_id):
68 | if show_id:
69 | return f"{field.name}#{field.id}"
70 | return field.name
71 |
--------------------------------------------------------------------------------
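A quick sketch of how find_position_in_schema above resolves a name against a schema, reusing type helpers from pysparkling.sql.types as imported in the tests earlier in this listing:

    from pysparkling.sql.expressions.fields import find_position_in_schema
    from pysparkling.sql.types import IntegerType, StructField, StructType

    schema = StructType([
        StructField('id', IntegerType()),
        StructField('value', IntegerType()),
    ])
    print(find_position_in_schema(schema, 'value'))   # 1
    # an unknown or ambiguous name raises AnalysisException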
/scripts/multiprocessing_performance_plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | import pysparkling.tests.test_multiprocessing as test_mp
5 |
6 |
7 | def plot(has_hyperthreading=True):
8 | n_cpu, r = test_mp.test_performance()
9 | r = {n: 1.0 / (v[0] / r[1][0]) for n, v in r.items()}
10 |
11 | if has_hyperthreading:
12 | n_cpu /= 2
13 |
14 | x, y = zip(*sorted(r.items()))
15 | x_left = np.array(x) - 0.5
16 |
17 | fig, ax = plt.subplots()
18 |
19 | # ideal line
20 | # line = ax.plot((1, n_cpu), (1.0, n_cpu),
21 | # linewidth=2, linestyle='dashed', color='grey')
22 | # ax.plot((n_cpu, max(x)+0.5), (n_cpu, n_cpu),
23 | # linewidth=2, linestyle='dashed', color='grey')
24 | n_threads = n_cpu * 2 if has_hyperthreading else n_cpu
25 | bars_ideal = ax.bar(
26 | x_left,
27 |         list(range(n_threads)) + [n_threads for _ in range(len(x) - n_threads)],
28 | 1.0, color='lightgrey', linewidth=0,
29 | )
30 |
31 | # measured
32 | bars = ax.bar(x_left, y, 1.0, color='y')
33 |
34 | # divide with cpu cores
35 | ax.plot((n_cpu + 0.5, n_cpu + 0.5), (0, n_threads + 1),
36 | linewidth=2, linestyle='solid', color='black')
37 | ax.text(n_cpu + 0.4, n_threads + 1,
38 | f'{n_cpu} CPU cores',
39 | ha='right', va='top')
40 |
41 | # divide with cpu threads
42 | if has_hyperthreading:
43 | ax.plot((n_cpu * 2 + 0.5, n_cpu * 2 + 0.5), (0, n_threads + 1),
44 | linewidth=2, linestyle='solid', color='black')
45 | ax.text(n_cpu * 2 + 0.4, n_threads + 1,
46 | f'{n_cpu * 2} CPU threads',
47 | ha='right', va='top')
48 |
49 | # add some text for labels, title and axes ticks
50 | ax.set_xlabel('n processes')
51 | ax.set_ylabel('speedup')
52 | ax.set_xticks(x)
53 | ax.set_xticklabels(['no\nserialization\n(single process)']
54 | + [str(s) for s in x[1:]])
55 | ax.set_xlim(-0.5, max(x) + 0.5)
56 | ax.set_ylim(0, max(x))
57 | ax.legend((bars[0], bars_ideal[0]), ('measured', 'ideal'),
58 | loc='upper left')
59 |
60 | for rect in bars:
61 | height = rect.get_height()
62 | ax.text(rect.get_x() + rect.get_width() / 2., height - 0.05,
63 | f'{height:.2f}',
64 | ha='center', va='top')
65 |
66 | fig.tight_layout()
67 | # plt.show()
68 | fig.savefig('tests/multiprocessing_performance_plot.pdf')
69 | fig.savefig('tests/multiprocessing_performance_plot.png', dpi=300)
70 |
71 |
72 | if __name__ == '__main__':
73 | plot()
74 |
--------------------------------------------------------------------------------
/pysparkling/streaming/tcpstream.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import struct
3 |
4 | from tornado.gen import coroutine, moment
5 | from tornado.iostream import StreamClosedError
6 | from tornado.tcpserver import TCPServer
7 |
8 | from ..rdd import EmptyRDD
9 |
10 | log = logging.getLogger(__name__)
11 |
12 |
13 | class TCPDeserializer:
14 | def __init__(self, context):
15 | self.context = context
16 |
17 | def __call__(self, data):
18 | if data is None:
19 | return EmptyRDD(self.context)
20 |
21 | return self.context.parallelize(data)
22 |
23 |
24 | class TCPTextStream(TCPServer):
25 | def __init__(self, delimiter=b'\n'):
26 | super().__init__()
27 | self.delimiter = delimiter
28 | self.buffer = []
29 |
30 | def get(self):
31 | if not self.buffer:
32 | return []
33 |
34 | buffer_ = self.buffer
35 | self.buffer = []
36 | return buffer_
37 |
38 | @coroutine
39 | def handle_stream(self, stream, address):
40 | try:
41 | while True:
42 | for _ in range(100):
43 | data = yield stream.read_until(self.delimiter)
44 | self.buffer.append(data[:-1].decode('utf8'))
45 | yield moment
46 | except StreamClosedError:
47 | pass
48 |
49 |
50 | class TCPBinaryStream(TCPServer):
51 | """Consumes binary messages from a TCP socket.
52 |
53 |     :param length: An int (fixed record length) or a struct format string used as a length prefix.
54 | """
55 |
56 | def __init__(self, length=None):
57 | super().__init__()
58 | self.length = length
59 | self.buffer = []
60 |
61 | self.prefix_length = None
62 | if not isinstance(self.length, int):
63 | self.prefix_length = struct.calcsize(self.length)
64 |
65 | def get(self):
66 | if not self.buffer:
67 | return []
68 |
69 | buffer_ = self.buffer
70 | self.buffer = []
71 | return buffer_
72 |
73 | @coroutine
74 | def handle_stream(self, stream, address):
75 | try:
76 | while True:
77 | for _ in range(100):
78 | if self.prefix_length:
79 | prefix = yield stream.read_bytes(self.prefix_length)
80 | message_length = struct.unpack(self.length, prefix)[0]
81 | else:
82 | message_length = self.length
83 | data = yield stream.read_bytes(message_length)
84 | self.buffer.append(data)
85 | yield moment
86 | except StreamClosedError:
87 | return
88 |
--------------------------------------------------------------------------------
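TCPBinaryStream above accepts either an int (fixed record length) or a struct format string, in which case each message is preceded by its packed length. A sketch of the framing from the sending side, matching length='<I'; the payload is arbitrary:

    import struct

    payload = b'hello'
    frame = struct.pack('<I', len(payload)) + payload     # 4-byte little-endian length prefix

    # what the server does per message:
    prefix_length = struct.calcsize('<I')                 # 4
    message_length = struct.unpack('<I', frame[:prefix_length])[0]
    assert frame[prefix_length:prefix_length + message_length] == payload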
/scripts/tcpperf_plot.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | import csv
3 |
4 | import matplotlib
5 | import matplotlib.pyplot as plt
6 |
7 | matplotlib.use('Agg')
8 |
9 |
10 | class Plot:
11 | def __init__(self, filename, x_label=None, y_label=None):
12 | self.filename = filename
13 | self.x_label = x_label or 'connections per second'
14 | self.y_label = y_label or 'processed messages per second'
15 | self.record = None
16 | self.data = list(self.read())
17 | self.frame()
18 |
19 | def read(self):
20 | with open(self.filename, 'r', encoding='utf8') as f:
21 | reader = csv.reader(f)
22 |
23 | try:
24 | first_line = next(reader)
25 | except StopIteration:
26 | return
27 |
28 | self.record = namedtuple('record', [k.strip().replace('# ', '')
29 | for k in first_line])
30 | for row_raw in reader:
31 | row = self.record._make([int(v) for v in row_raw])
32 | yield row
33 |
34 | def frame(self):
35 | fig, ax = plt.subplots()
36 |
37 | x = [row.messages for row in self.data]
38 | y = [row.hello for row in self.data]
39 |
40 | # add some text for labels, title and axes ticks
41 | ax.set_xlabel(self.x_label)
42 | ax.set_ylabel(self.y_label)
43 | # ax.set_xticks(x)
44 | ax.set_xlim(-300, max(x) + 300)
45 | ax.set_ylim(-300, max(y) + 2000)
46 |
47 | fig.tight_layout()
48 |
49 | self.fig, self.ax = fig, ax
50 | return self
51 |
52 | def plot(self):
53 | x = [row.messages for row in self.data]
54 |
55 | ideal, = self.ax.plot([0.0, max(x)], [0.0, max(x)], label='ideal',
56 | color='black', linestyle='--', linewidth=1)
57 | graphs = [
58 | self.ax.plot(x, [getattr(row, k) for row in self.data], label=k)
59 | for k in self.record._fields if k != 'messages'
60 | ]
61 |
62 | self.ax.legend(
63 | handles=[ideal] + [g for g, in graphs],
64 | loc='upper left',
65 | )
66 |
67 | return self
68 |
69 | def show(self):
70 | plt.show()
71 | return self
72 |
73 | def save(self):
74 | self.fig.savefig(self.filename + '.pdf')
75 | self.fig.savefig(self.filename + '.png', dpi=300)
76 | return self
77 |
78 |
79 | if __name__ == '__main__':
80 | Plot('tests/tcpperf_connections.csv').plot().save()
81 | (Plot('tests/tcpperf_messages.csv',
82 | x_label='inbound messages per second')
83 | .plot()
84 | .save())
85 |
--------------------------------------------------------------------------------
/logo/favicon.svg:
--------------------------------------------------------------------------------
(SVG markup stripped during extraction.)
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/textreader.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import itertools
3 |
4 | from ....fileio import TextFile
5 | from ...internal_utils.options import Options
6 | from ...internal_utils.readers.utils import resolve_partitions
7 | from ...types import create_row, StringType, StructField, StructType
8 |
9 |
10 | class TextReader:
11 | default_options = dict(
12 | lineSep=None,
13 | encoding="utf-8",
14 | sep=",",
15 | inferSchema=False,
16 | header=False
17 | )
18 |
19 | def __init__(self, spark, paths, schema, options):
20 | self.spark = spark
21 | self.paths = paths
22 | self.schema = schema or StructType([StructField("value", StringType())])
23 | self.options = Options(self.default_options, options)
24 |
25 | def read(self):
26 | sc = self.spark._sc
27 | paths = self.paths
28 |
29 | partitions, partition_schema = resolve_partitions(paths)
30 |
31 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
32 | rdd = rdd_filenames.flatMap(partial(
33 | parse_text_file,
34 | partitions,
35 | partition_schema,
36 | self.schema,
37 | self.options
38 | ))
39 |
40 | if partition_schema:
41 | partitions_fields = partition_schema.fields
42 | full_schema = StructType(self.schema.fields + partitions_fields)
43 | else:
44 | full_schema = self.schema
45 |
46 | rdd._name = paths
47 |
48 | # pylint: disable=import-outside-toplevel, cyclic-import
49 | from ...internals import DataFrameInternal
50 |
51 | return DataFrameInternal(
52 | sc,
53 | rdd,
54 | schema=full_schema
55 | )
56 |
57 |
58 | def parse_text_file(partitions, partition_schema, schema, options, file_name):
59 | f_content = TextFile(file_name).load(encoding=options.encoding).read()
60 | records = (f_content.split(options.lineSep)
61 | if options.lineSep is not None
62 | else f_content.splitlines())
63 |
64 | rows = []
65 | for record in records:
66 | row = text_record_to_row(record, options, schema, partition_schema, partitions[file_name])
67 | row.set_input_file_name(file_name)
68 | rows.append(row)
69 |
70 | return rows
71 |
72 |
73 | def text_record_to_row(record, options, schema, partition_schema, partition):
74 | partition_field_names = [
75 | f.name for f in partition_schema.fields
76 | ] if partition_schema else []
77 | row = create_row(
78 | itertools.chain([schema.fields[0].name], partition_field_names),
79 | itertools.chain([record], partition or [])
80 | )
81 | return row
82 |
--------------------------------------------------------------------------------
/scripts/pyspark_comparisons.py:
--------------------------------------------------------------------------------
1 | import pyspark
2 |
3 | SC = pyspark.SparkContext()
4 |
5 |
6 | def simple_textFile():
7 | print(SC.textFile('tests/test_simple.py').collect())
8 | print(SC.textFile('tests/test_simple.py').name())
9 | print(SC.parallelize([1, 2, 3]).name())
10 |
11 |
12 | def indent_line(l):
13 | print('============== INDENTING LINE ================')
14 | return '--- ' + l
15 |
16 |
17 | def lazy_execution():
18 | r = SC.textFile('tests/test_simple.py').map(indent_line)
19 | r.foreach(indent_line)
20 | print()
21 | print()
22 | print()
23 | # at this point, no map() or foreach() should have been executed
24 | print(r.collect())
25 |
26 |
27 | def count_lines():
28 | r = SC.wholeTextFiles('tests/*.py').keys().collect()
29 | print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
30 | print(r)
31 | print(SC.textFile('tests/*.py').count())
32 |
33 |
34 | def create_key_value_txt():
35 | r = SC.parallelize([('a', 1), ('b', 2)], 1)
36 | r.saveAsTextFile('tests/pyspark/key_value.txt')
37 | r.saveAsHadoopFile(
38 | "tests/pyspark/key_value.txt.bz2",
39 | "org.apache.hadoop.mapred.TextOutputFormat",
40 | compressionCodecClass="org.apache.hadoop.io.compress.BZip2Codec",
41 | )
42 | r.saveAsHadoopFile(
43 | "tests/pyspark/key_value.txt.gz",
44 | "org.apache.hadoop.mapred.TextOutputFormat",
45 | compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
46 | )
47 | # r.saveAsHadoopFile(
48 | # "tests/pyspark/key_value.txt.lzo",
49 | # "org.apache.hadoop.mapred.TextOutputFormat",
50 | # compressionCodecClass="com.hadoop.compression.lzo.LzopCodec",
51 | # )
52 |
53 | r_txt = SC.textFile('tests/pyspark/key_value.txt')
54 | print(r_txt.collect())
55 | r_gz = SC.textFile('tests/pyspark/key_value.txt.gz')
56 | print(r_gz.collect())
57 | r_bz2 = SC.textFile('tests/pyspark/key_value.txt.bz2')
58 | print(r_bz2.collect())
59 |
60 |
61 | def create_pickled_files():
62 | rdd = SC.parallelize(['hello', 'world', 1, 2], 2)
63 | rdd.saveAsPickleFile('tests/pyspark/mixed.pickle')
64 | rdd.saveAsPickleFile('tests/pyspark/mixed_batched.pickle', 1)
65 |
66 |
67 | def stat():
68 | d = [1, 4, 9, 16, 25, 36]
69 | s1 = SC.parallelize(d).stats()
70 | s2 = SC.parallelize(d, 3).stats()
71 | print(str(s1))
72 | print(str(s2))
73 |
74 |
75 | def partition_by():
76 | rdd = SC.parallelize(range(20), 2).map(lambda x: (x, x))
77 | r = rdd.partitionBy(2).collect()
78 | print('>>>>>>', r)
79 |
80 |
81 | if __name__ == '__main__':
82 | # simple_textFile()
83 | # lazy_execution()
84 | # count_lines()
85 | # create_key_value_txt()
86 | # create_pickled_files()
87 | # stat()
88 | partition_by()
89 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/tar.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 | import tarfile
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Tar(Codec):
11 | """Implementation of :class:`.Codec` for tar compression."""
12 |
13 | def compress(self, stream):
14 | compressed = BytesIO()
15 |
16 | with tarfile.open(fileobj=compressed, mode='w') as f:
17 | s = stream.read()
18 |
19 | t = tarfile.TarInfo('data')
20 | t.size = len(s)
21 |
22 | f.addfile(t, BytesIO(s))
23 |
24 | compressed.seek(0)
25 | return compressed
26 |
27 | def decompress(self, stream):
28 | uncompressed = BytesIO()
29 |
30 | with tarfile.open(fileobj=stream, mode='r') as f:
31 | for tar_info in f.getmembers():
32 | if not tar_info.isfile():
33 | continue
34 | uncompressed.write(f.extractfile(tar_info).read())
35 |
36 | uncompressed.seek(0)
37 | return uncompressed
38 |
39 |
40 | class TarGz(Codec):
41 | """Implementation of :class:`.Codec` for .tar.gz compression."""
42 |
43 | def compress(self, stream):
44 | compressed = BytesIO()
45 |
46 | with tarfile.open(fileobj=compressed, mode='w:gz') as f:
47 | s = stream.read()
48 |
49 | t = tarfile.TarInfo('data')
50 | t.size = len(s)
51 |
52 | f.addfile(t, BytesIO(s))
53 |
54 | compressed.seek(0)
55 | return compressed
56 |
57 | def decompress(self, stream):
58 | uncompressed = BytesIO()
59 |
60 | with tarfile.open(fileobj=stream, mode='r:gz') as f:
61 | for tar_info in f.getmembers():
62 | if not tar_info.isfile():
63 | continue
64 | uncompressed.write(f.extractfile(tar_info).read())
65 |
66 | uncompressed.seek(0)
67 | return uncompressed
68 |
69 |
70 | class TarBz2(Codec):
71 | """Implementation of :class:`.Codec` for .tar.bz2 compression."""
72 |
73 | def compress(self, stream):
74 | compressed = BytesIO()
75 |
76 | with tarfile.open(fileobj=compressed, mode='w:bz2') as f:
77 | s = stream.read()
78 |
79 | t = tarfile.TarInfo('data')
80 | t.size = len(s)
81 |
82 | f.addfile(t, BytesIO(s))
83 |
84 | compressed.seek(0)
85 | return compressed
86 |
87 | def decompress(self, stream):
88 | uncompressed = BytesIO()
89 |
90 | with tarfile.open(fileobj=stream, mode='r:bz2') as f:
91 | for tar_info in f.getmembers():
92 | if not tar_info.isfile():
93 | continue
94 | uncompressed.write(f.extractfile(tar_info).read())
95 |
96 | uncompressed.seek(0)
97 | return uncompressed
98 |
--------------------------------------------------------------------------------
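A round trip through the Tar codec above; both directions work on in-memory byte streams, so no files are touched:

    from io import BytesIO

    from pysparkling.fileio.codec import Tar

    codec = Tar()
    archived = codec.compress(BytesIO(b'some payload'))   # tar archive with a single 'data' member
    print(codec.decompress(archived).read())              # b'some payload'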
/pysparkling/storagelevel.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | __all__ = ["StorageLevel"]
19 |
20 |
21 | class StorageLevel:
22 |
23 | """
24 | Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory,
25 | whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory
26 | in a JAVA-specific serialized format, and whether to replicate the RDD partitions on multiple
27 | nodes. Also contains static constants for some commonly used storage levels, MEMORY_ONLY.
28 | Since the data is always serialized on the Python side, all the constants use the serialized
29 | formats.
30 | """
31 |
32 | def __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication=1):
33 | self.useDisk = useDisk
34 | self.useMemory = useMemory
35 | self.useOffHeap = useOffHeap
36 | self.deserialized = deserialized
37 | self.replication = replication
38 |
39 | def __repr__(self):
40 | return (
41 | f"StorageLevel({self.useDisk}, {self.useMemory}, {self.useOffHeap}, {self.deserialized}, "
42 | f"{self.replication})"
43 | )
44 |
45 | def __str__(self):
46 | result = ""
47 | result += "Disk " if self.useDisk else ""
48 | result += "Memory " if self.useMemory else ""
49 | result += "OffHeap " if self.useOffHeap else ""
50 | result += "Deserialized " if self.deserialized else "Serialized "
51 | result += f"{self.replication}x Replicated"
52 | return result
53 |
54 |
55 | StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False)
56 | StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)
57 | StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, False)
58 | StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2)
59 | StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, False)
60 | StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
61 | StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)
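
# Illustrative values implied by the definitions above:
#   repr(StorageLevel.MEMORY_ONLY)       -> 'StorageLevel(False, True, False, False, 1)'
#   str(StorageLevel.MEMORY_AND_DISK_2)  -> 'Disk Memory Serialized 2x Replicated'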
62 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_streaming_tcp.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from contextlib import closing
3 | import struct
4 |
5 | import tornado.gen
6 | import tornado.tcpclient
7 | import tornado.testing
8 |
9 | import pysparkling
10 |
11 |
12 | class TCPTextTest(tornado.testing.AsyncTestCase):
13 | @tornado.gen.coroutine
14 | def client(self):
15 | client = tornado.tcpclient.TCPClient()
16 | for v in range(20):
17 | stream = yield client.connect('127.0.0.1', 8123)
18 | with closing(stream):
19 | stream.write(f'a = {v}\n'.encode('utf8'))
20 | client.close()
21 |
22 | def test_connect(self):
23 | sc = pysparkling.Context()
24 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
25 |
26 | counter = Counter()
27 | (
28 | ssc.socketTextStream('127.0.0.1', 8123)
29 | .foreachRDD(lambda rdd:
30 | counter.update(''.join(rdd.collect()))
31 | if rdd.collect() else None)
32 | )
33 | self.client()
34 |
35 | ssc.start()
36 | ssc.awaitTermination(timeout=0.3)
37 | self.assertEqual(counter['a'], 20)
38 |
39 |
40 | class TCPBinaryFixedLengthTest(tornado.testing.AsyncTestCase):
41 | @tornado.gen.coroutine
42 | def client(self):
43 | client = tornado.tcpclient.TCPClient()
44 | stream = yield client.connect('127.0.0.1', 8124)
45 | with closing(stream):
46 | stream.write(b'hello')
47 | client.close()
48 |
49 | def test_main(self):
50 | sc = pysparkling.Context()
51 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
52 |
53 | counter = Counter()
54 | (
55 | ssc.socketBinaryStream('127.0.0.1', 8124, length=5)
56 | .foreachRDD(lambda rdd: counter.update(rdd.collect()))
57 | )
58 | self.client()
59 |
60 | ssc.start()
61 | ssc.awaitTermination(timeout=0.3)
62 | self.assertEqual(counter[b'hello'], 1)
63 |
64 |
65 | class TCPBinaryUIntLengthTest(tornado.testing.AsyncTestCase):
66 | @tornado.gen.coroutine
67 | def client(self):
68 | client = tornado.tcpclient.TCPClient()
69 | stream = yield client.connect('127.0.0.1', 8125)
70 | with closing(stream):
71 | stream.write(struct.pack('
--------------------------------------------------------------------------------
/scripts/pyspark_streaming.py:
--------------------------------------------------------------------------------
18 |      .foreachRDD(lambda t, r: print('>>>>>>>>>>>>>', t, r.collect())))
19 |
20 |
21 | def simple_queue_one_at_a_time(ssc):
22 | ssc.queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=True).pprint()
23 |
24 |
25 | def save_text(ssc):
26 | (ssc
27 | .queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=True)
28 | .saveAsTextFiles('scripts/textout/'))
29 |
30 |
31 | def window(ssc):
32 | (ssc
33 | .queueStream([[1], [2], [3], [4], [5], [6]])
34 | .window(3)
35 | .foreachRDD(lambda rdd: print('>>>>>>>>>', rdd.collect())))
36 |
37 |
38 | def updateStateByKey(ssc):
39 | def processStateUpdateByKey(input_stream, state):
40 | print('i', input_stream)
41 | print('s', state)
42 | return state if not input_stream else input_stream[-1]
43 |
44 | ssc.checkpoint('checkpoints/')
45 | (ssc
46 | .queueStream([[('a', 1), ('b', 3)], [('a', 2), ('a', 5), ('c', 4)]])
47 | .updateStateByKey(processStateUpdateByKey)
48 | .pprint()
49 | )
50 |
51 |
52 | def stream_log(ssc):
53 | ssc.textFileStream('/var/log/system.log*').pprint()
54 |
55 |
56 | def stream_queue_default(ssc):
57 | (ssc
58 | .queueStream([[4], [2]], default=['placeholder'])
59 | .foreachRDD(lambda rdd: print(rdd.collect())))
60 |
61 |
62 | def join_with_repeated_keys(ssc):
63 | s1 = ssc.queueStream([[('a', 4), ('a', 2)], [('c', 7)]])
64 | s2 = ssc.queueStream([[('b', 1), ('b', 3)], [('c', 8)]])
65 | (
66 | s1.fullOuterJoin(s2)
67 | .foreachRDD(lambda rdd: print(sorted(rdd.collect())))
68 | )
69 |
70 |
71 | def union(ssc):
72 | odd = ssc.queueStream([[1], [3], [5]])
73 | even = ssc.queueStream([[2], [4], [6]])
74 | (
75 | odd.union(even)
76 | .foreachRDD(lambda rdd: print(rdd.collect()))
77 | )
78 |
79 |
80 | def quiet_logs(sc):
81 | logger = sc._jvm.org.apache.log4j
82 | logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
83 | logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
84 |
85 |
86 | if __name__ == '__main__':
87 | spark_context = pyspark.SparkContext()
88 | quiet_logs(spark_context)
89 | streaming_context = pyspark.streaming.StreamingContext(spark_context, 1)
90 |
91 | # simple_queue(ssc)
92 | # simple_queue_count(ssc)
93 | # simple_queue_one_at_a_time(ssc)
94 | # save_text(ssc)
95 | # window(ssc)
96 | # updateStateByKey(ssc)
97 | # stream_log(ssc)
98 | # stream_queue_default(ssc)
99 | # join_with_repeated_keys(ssc)
100 | union(streaming_context)
101 |
102 | streaming_context.start()
103 | time.sleep(3.0)
104 | streaming_context.stop(stopGraceFully=True)
105 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/stat_aggregations.py:
--------------------------------------------------------------------------------
1 | from ....stat_counter import ColumnStatHelper
2 | from ...column import Column
3 | from ..literals import Literal
4 | from ..mappers import StarOperator
5 | from .aggregations import Aggregation
6 |
7 |
8 | class SimpleStatAggregation(Aggregation):
9 | def __init__(self, column):
10 | super().__init__(column)
11 | self.column = column
12 | self.stat_helper = ColumnStatHelper(column)
13 |
14 | def merge(self, row, schema):
15 | self.stat_helper.merge(row, schema)
16 |
17 | def mergeStats(self, other, schema):
18 | self.stat_helper.mergeStats(other.stat_helper)
19 |
20 | def eval(self, row, schema):
21 | raise NotImplementedError
22 |
23 | def args(self):
24 | return (self.column,)
25 |
26 |
27 | class Count(SimpleStatAggregation):
28 | pretty_name = "count"
29 |
30 | def __init__(self, column):
31 | if isinstance(column.expr, StarOperator):
32 | column = Column(Literal(1))
33 | super().__init__(column)
34 | self.column = column
35 | self.stat_helper = ColumnStatHelper(column)
36 |
37 | def eval(self, row, schema):
38 | return self.stat_helper.count
39 |
40 |
41 | class Max(SimpleStatAggregation):
42 | pretty_name = "max"
43 |
44 | def eval(self, row, schema):
45 | return self.stat_helper.max
46 |
47 |
48 | class Min(SimpleStatAggregation):
49 | pretty_name = "min"
50 |
51 | def eval(self, row, schema):
52 | return self.stat_helper.min
53 |
54 |
55 | class Sum(SimpleStatAggregation):
56 | pretty_name = "sum"
57 |
58 | def eval(self, row, schema):
59 | return self.stat_helper.sum
60 |
61 |
62 | class Avg(SimpleStatAggregation):
63 | pretty_name = "avg"
64 |
65 | def eval(self, row, schema):
66 | return self.stat_helper.mean
67 |
68 |
69 | class VarSamp(SimpleStatAggregation):
70 | pretty_name = "var_samp"
71 |
72 | def eval(self, row, schema):
73 | return self.stat_helper.variance_samp
74 |
75 |
76 | class VarPop(SimpleStatAggregation):
77 | pretty_name = "var_pop"
78 |
79 | def eval(self, row, schema):
80 | return self.stat_helper.variance_pop
81 |
82 |
83 | class StddevSamp(SimpleStatAggregation):
84 | pretty_name = "stddev_samp"
85 |
86 | def eval(self, row, schema):
87 | return self.stat_helper.stddev_samp
88 |
89 |
90 | class StddevPop(SimpleStatAggregation):
91 | pretty_name = "stddev_pop"
92 |
93 | def eval(self, row, schema):
94 | return self.stat_helper.stddev_pop
95 |
96 |
97 | class Skewness(SimpleStatAggregation):
98 | pretty_name = "skewness"
99 |
100 | def eval(self, row, schema):
101 | return self.stat_helper.skewness
102 |
103 |
104 | class Kurtosis(SimpleStatAggregation):
105 | pretty_name = "kurtosis"
106 |
107 | def eval(self, row, schema):
108 | return self.stat_helper.kurtosis
109 |
110 |
111 | __all__ = [
112 | "Avg", "VarPop", "VarSamp", "Sum", "StddevPop", "StddevSamp",
113 | "Skewness", "Min", "Max", "Kurtosis", "Count"
114 | ]
115 |
--------------------------------------------------------------------------------
/pysparkling/fileio/file.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 |
4 | from . import codec, fs
5 |
6 | log = logging.getLogger(__name__)
7 |
8 |
9 | class File:
10 | """File object.
11 |
12 | :param file_name: Any file name.
13 | """
14 |
15 | def __init__(self, file_name):
16 | self.file_name = file_name
17 | self.fs = fs.get_fs(file_name)(file_name)
18 | self.codec = codec.get_codec(file_name)()
19 |
20 | @staticmethod
21 | def resolve_filenames(all_expr):
22 | """resolve expression for a filename
23 |
24 | :param all_expr:
25 | A comma separated list of expressions. The expressions can contain
26 | the wildcard characters ``*`` and ``?``. It also resolves Spark
27 | datasets to the paths of the individual partitions
28 | (i.e. ``my_data`` gets resolved to
29 | ``[my_data/part-00000, my_data/part-00001]``).
30 |
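        Example (with hypothetical paths)::

            File.resolve_filenames('logs/*.gz,my_data')
            # -> ['logs/2020.gz', 'logs/2021.gz',
            #     'my_data/part-00000', 'my_data/part-00001']
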
31 | :returns: A list of file names.
32 | :rtype: list
33 | """
34 | files = []
35 | for expr in all_expr.split(','):
36 | expr = expr.strip()
37 | files += fs.get_fs(expr).resolve_filenames(expr)
38 | log.debug('Filenames: %s', files)
39 | return files
40 |
41 | @classmethod
42 | def get_content(cls, all_expr):
43 | """Return all files matching or in folder matching one of the given expression
44 |
45 | :param all_expr:
46 | A list of expressions.
47 | The expressions can contain the wildcard characters ``*`` and ``?``.
48 |
49 | :returns: A list of file names.
50 | :rtype: list
51 | """
52 | files = []
53 | for expr in all_expr:
54 | expr = expr.strip()
55 | files += fs.get_fs(expr).resolve_content(expr)
56 | log.debug('Filenames: %s', files)
57 | return files
58 |
59 | def exists(self):
60 | """Checks both for a file or directory at this location.
61 |
62 | :returns: True or false.
63 | """
64 | return self.fs.exists()
65 |
66 | def load(self):
67 | """Load the data from a file.
68 |
69 | :rtype: io.BytesIO
70 | """
71 | stream = self.fs.load()
72 | stream = self.codec.decompress(stream)
73 | return stream
74 |
75 | def dump(self, stream=None):
76 | """Writes a stream to a file.
77 |
78 | :param stream:
79 | A BytesIO instance. ``bytes`` are also possible and are converted
80 | to BytesIO.
81 |
82 | :rtype: File
83 | """
84 | if stream is None:
85 | stream = BytesIO()
86 |
87 | if isinstance(stream, bytes):
88 | stream = BytesIO(stream)
89 |
90 | stream = self.codec.compress(stream)
91 | self.fs.dump(stream)
92 |
93 | return self
94 |
95 | def make_public(self, recursive=False):
96 | """Makes the file public. Currently only supported on S3.
97 |
98 | :param recursive: Whether to apply this recursively.
99 | :rtype: File
100 | """
101 | self.fs.make_public(recursive)
102 | return self
103 |
--------------------------------------------------------------------------------
/scripts/tcpperf_client.py:
--------------------------------------------------------------------------------
1 | """Sends tcp messages."""
2 | import argparse
3 | from contextlib import closing
4 | import json
5 | import random
6 | import struct
7 | import sys
8 | import time
9 |
10 | from tornado import gen
11 | from tornado.ioloop import IOLoop, PeriodicCallback
12 | from tornado.iostream import StreamClosedError
13 | from tornado.tcpclient import TCPClient
14 |
15 |
16 | class Emitter:
17 | def __init__(self, port, n=1000, values=1, duration=3.0):
18 | self.port = port
19 | self.n = n
20 | self.values = values
21 | self.duration = duration
22 | self.message = self.hello
23 | self.i = 0
24 |
25 | self.pcb = None
26 | self.client = None
27 |
28 | def start(self):
29 | self.client = TCPClient()
30 |
31 | self.pcb = PeriodicCallback(self.send, 1000.0 / self.n)
32 | self.pcb.start()
33 |
34 | IOLoop.current().call_later(self.duration + 0.5, self.stop)
35 | IOLoop.current().start()
36 | IOLoop.clear_current()
37 |
38 | def stop(self):
39 | if self.pcb is not None:
40 | self.pcb.stop()
41 | if self.client is not None:
42 | self.client.close()
43 | IOLoop.current().stop()
44 |
45 | @gen.coroutine
46 | def send(self):
47 | if self.i >= self.duration * self.n * self.values:
48 | self.pcb.stop()
49 | return
50 |
51 | try:
52 | stream = yield self.client.connect('127.0.0.1', self.port)
53 | with closing(stream):
54 | messages = b''.join(self.message() for _ in range(self.values))
55 | stream.write(messages)
56 | self.i += self.values
57 | except StreamClosedError:
58 | return
59 |
60 | def hello(self):
61 | return b'hello\n'
62 |
63 | def r(self):
64 | s = random.randint(1, 10)
65 | v = s / 10.0 + (1.5 - s / 10.0) * random.random()
66 | return (s, v)
67 |
68 | def text(self):
69 | s, v = self.r()
70 | return f'sensor{s}|{v}\n'.encode('utf8')
71 |
72 | def json(self):
73 | s, v = self.r()
74 | return (json.dumps({f'sensor{s}': v}) + '\n').encode('utf8')
75 |
76 | def bello(self):
77 | # 5 bytes
78 | return b'bello'
79 |
80 | def struct(self):
81 | # 8 bytes
82 | return struct.pack('If', *self.r())
83 |
84 |
85 | def main():
86 | parser = argparse.ArgumentParser(description=__doc__)
87 | parser.add_argument('-n', type=int, default=1000,
88 | help='number of connections')
89 | parser.add_argument('--values', type=int, default=1,
90 | help='number of values per connection')
91 | parser.add_argument('--port', type=int, default=8123,
92 | help='target port number')
93 | parser.add_argument('--format', default='hello',
94 | help='format of the messages: hello (default), '
95 | 'text, json, bello (binary hello), '
96 | 'struct (binary)')
97 | parser.add_argument('--delay', type=float, default=0.5,
98 | help='wait before start sending messages')
99 | args = parser.parse_args()
100 |
101 | time.sleep(args.delay)
102 | e = Emitter(args.port, args.n, args.values)
103 | e.message = getattr(e, args.format)
104 | e.start()
105 | print(f'{sys.argv[0]} sent {e.i} messages')
106 |
107 |
108 | if __name__ == '__main__':
109 | main()
110 |
--------------------------------------------------------------------------------
/docs/sphinx/read_write.rst:
--------------------------------------------------------------------------------
1 | .. _read_write:
2 |
3 | .. currentmodule:: pysparkling
4 |
5 |
6 | Reading and Writing
7 | ===================
8 |
9 | This is a collection of best practices or templates for reading and writing
10 | various input and output formats.
11 |
12 |
13 | Batch
14 | -----
15 |
16 | Python List
17 | ~~~~~~~~~~~
18 |
19 | The most direct input and output is from and to a Python list.
20 |
21 | .. code-block:: python
22 |
23 | import pysparkling
24 |
25 | sc = pysparkling.Context()
26 |
27 | # reading
28 | rdd = sc.parallelize(['hello', 'world'])
29 |
30 | # back to Python list
31 | print(rdd.collect())
32 |
33 | # back to an iterator
34 | rdd.toLocalIterator()
35 |
36 |
37 | ND-JSON
38 | ~~~~~~~
39 |
40 | Newline delimited JSON is a text file where every line is its own JSON string.
41 |
42 |
43 | .. code-block:: python
44 |
45 | import json
46 | import pysparkling
47 |
48 | sc = pysparkling.Context()
49 |
50 | # reading
51 | rdd = (
52 | sc
53 | .textFile('input.json')
54 | .map(json.loads)
55 | )
56 |
57 | # writing
58 | (
59 | rdd
60 | .map(json.dumps)
61 | .saveAsTextFile('output.json')
62 | )
63 |
64 |
65 | CSV
66 | ~~~
67 |
68 | .. code-block:: python
69 |
70 | import csv
71 | import io
72 | import pysparkling
73 |
74 | sc = pysparkling.Context()
75 |
76 | # reading
77 | rdd = (
78 | sc
79 | .textFile('input.csv')
80 | .mapPartitions(csv.reader)
81 | )
82 |
83 | # writing
84 | def csv_row(data):
85 | s = io.StringIO()
86 | csv.writer(s).writerow(data)
87 | return s.getvalue()[:-1]
88 |
89 | (
90 | rdd
91 | .map(csv_row)
92 | .saveAsTextFile('output.csv')
93 | )
94 |
95 |
96 | TensorFlow Records
97 | ~~~~~~~~~~~~~~~~~~
98 |
99 | This example preprocesses example data into a TensorFlow Records file. The
100 | second part is a cross check and prints the contents of the `tfrecords` file.
101 |
102 | .. code-block:: python
103 |
104 | import pysparkling
105 | import tensorflow as tf
106 |
107 | def to_tfrecord(xy):
108 | X, y = xy
109 | example = tf.train.Example(features=tf.train.Features(feature={
110 | 'X': tf.train.Feature(float_list=tf.train.FloatList(value=X)),
111 | 'y': tf.train.Feature(int64_list=tf.train.Int64List(value=y)),
112 | }))
113 | return example.SerializeToString()
114 |
115 | # example
116 | X = [1.2, 3.1, 8.7]
117 | y = [2, 5]
118 |
119 | # writing
120 | sc = pysparkling.Context()
121 | rdd = (
122 | sc
123 | .parallelize([(X, y)])
124 | .map(to_tfrecord)
125 | )
126 | with tf.python_io.TFRecordWriter('out.tfrecords') as writer:
127 | for example in rdd.toLocalIterator():
128 | writer.write(example)
129 |
130 | # debugging a tf records file
131 | for serialized_example in tf.python_io.tf_record_iterator('out.tfrecords'):
132 | example = tf.train.Example()
133 | example.ParseFromString(serialized_example)
134 | X = example.features.feature['X'].float_list.value
135 | y = example.features.feature['y'].int64_list.value
136 | print(X, y)
137 |
138 |
139 | Streaming
140 | ---------
141 |
142 | Python List
143 | ~~~~~~~~~~~
144 |
145 | .. code-block:: python
146 |
147 | import pysparkling
148 |
149 | sc = pysparkling.Context()
150 | ssc = pysparkling.streaming.StreamingContext(sc, 1.0)
151 |
152 | (
153 | ssc
154 | .queueStream([[4], [2], [7]])
155 | .foreachRDD(lambda rdd: print(rdd.collect()))
156 | )
157 |
158 | ssc.start()
159 | ssc.awaitTermination(3.5)
160 |
161 | # output:
162 | # [4]
163 | # [2]
164 | # [7]
165 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/csvreader.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import itertools
3 |
4 | from ....fileio import TextFile
5 | from ...casts import get_caster
6 | from ...internal_utils.options import Options
7 | from ...internal_utils.readers.utils import guess_schema_from_strings, resolve_partitions
8 | from ...schema_utils import infer_schema_from_rdd
9 | from ...types import create_row, StringType, StructField, StructType
10 |
11 |
12 | class CSVReader:
13 | default_options = dict(
14 | lineSep=None,
15 | encoding="utf-8",
16 | sep=",",
17 | inferSchema=False,
18 | header=False
19 | )
20 |
21 | def __init__(self, spark, paths, schema, options):
22 | self.spark = spark
23 | self.paths = paths
24 | self.schema = schema
25 | self.options = Options(self.default_options, options)
26 |
27 | def read(self):
28 | sc = self.spark._sc
29 | paths = self.paths
30 |
31 | partitions, partition_schema = resolve_partitions(paths)
32 |
33 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
34 | rdd = rdd_filenames.flatMap(partial(
35 | parse_csv_file,
36 | partitions,
37 | partition_schema,
38 | self.schema,
39 | self.options
40 | ))
41 |
42 | if self.schema is not None:
43 | schema = self.schema
44 | elif self.options.inferSchema:
45 | fields = rdd.take(1)[0].__fields__
46 | schema = guess_schema_from_strings(fields, rdd.collect(), options=self.options)
47 | else:
48 | schema = infer_schema_from_rdd(rdd)
49 |
50 | schema_with_string = StructType(fields=[
51 | StructField(field.name, StringType()) for field in schema.fields
52 | ])
53 |
54 | if partition_schema:
55 | partitions_fields = partition_schema.fields
56 | full_schema = StructType(schema.fields[:-len(partitions_fields)] + partitions_fields)
57 | else:
58 | full_schema = schema
59 |
60 | cast_row = get_caster(
61 | from_type=schema_with_string, to_type=full_schema, options=self.options
62 | )
63 | casted_rdd = rdd.map(cast_row)
64 | casted_rdd._name = paths
65 |
66 | # pylint: disable=import-outside-toplevel, cyclic-import
67 | from ...internals import DataFrameInternal
68 |
69 | return DataFrameInternal(
70 | sc,
71 | casted_rdd,
72 | schema=full_schema
73 | )
74 |
75 |
76 | def parse_csv_file(partitions, partition_schema, schema, options, file_name):
77 | f_content = TextFile(file_name).load(encoding=options.encoding).read()
78 | records = (f_content.split(options.lineSep)
79 | if options.lineSep is not None
80 | else f_content.splitlines())
81 | if options.header == "true":
82 | header = records[0].split(options.sep)
83 | records = records[1:]
84 | else:
85 | header = None
86 |
87 | null_value = ""
88 | rows = []
89 | for record in records:
90 | row = csv_record_to_row(
91 | record, options, schema, header, null_value, partition_schema, partitions[file_name]
92 | )
93 | row.set_input_file_name(file_name)
94 | rows.append(row)
95 |
96 | return rows
97 |
98 |
99 | def csv_record_to_row(record, options, schema=None, header=None,
100 | null_value=None, partition_schema=None, partition=None):
101 | record_values = [val if val != null_value else None for val in record.split(options.sep)]
102 | if schema is not None:
103 | field_names = [f.name for f in schema.fields]
104 | elif header is not None:
105 | field_names = header
106 | else:
107 | field_names = [f"_c{i}" for i, field in enumerate(record_values)]
108 | partition_field_names = [
109 | f.name for f in partition_schema.fields
110 | ] if partition_schema else []
111 | row = create_row(
112 | itertools.chain(field_names, partition_field_names),
113 | itertools.chain(record_values, partition or [])
114 | )
115 | return row
116 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/s3.py:
--------------------------------------------------------------------------------
1 | from fnmatch import fnmatch
2 | from io import BytesIO, StringIO
3 | import logging
4 |
5 | from ...exceptions import FileSystemNotSupported
6 | from ...utils import parse_file_uri, Tokenizer
7 | from .file_system import FileSystem
8 |
9 | log = logging.getLogger(__name__)
10 |
11 | try:
12 | import boto
13 | except ImportError:
14 | boto = None
15 |
16 |
17 | class S3(FileSystem):
18 | """:class:`.FileSystem` implementation for S3.
19 |
20 | Use environment variables ``AWS_SECRET_ACCESS_KEY`` and
21 | ``AWS_ACCESS_KEY_ID`` for auth and use file paths of the form
22 | ``s3://bucket_name/filename.txt``.
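
    Example (with a hypothetical bucket)::

        from pysparkling import fileio

        fileio.File('s3://my-bucket/my_data/part-00000').load()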
23 | """
24 |
25 | #: Keyword arguments for new connections.
26 | #: Example: set to `{'anon': True}` for anonymous connections.
27 | connection_kwargs = {}
28 |
29 | _conn = None
30 |
31 | def __init__(self, file_name):
32 | if boto is None:
33 | raise FileSystemNotSupported('S3 not supported. Install "boto".')
34 |
35 | super().__init__(file_name)
36 |
37 | # obtain key
38 | t = Tokenizer(self.file_name)
39 | t.get_next('://') # skip scheme
40 | bucket_name = t.get_next('/')
41 | key_name = t.get_next()
42 | conn = self._get_conn()
43 | bucket = conn.get_bucket(bucket_name, validate=False)
44 | self.key = bucket.get_key(key_name)
45 | if not self.key:
46 | self.key = bucket.new_key(key_name)
47 |
48 | @classmethod
49 | def _get_conn(cls):
50 | if not cls._conn:
51 | if boto is None:
52 | raise FileSystemNotSupported('S3 not supported. Install "boto".')
53 | cls._conn = boto.connect_s3(**cls.connection_kwargs)
54 | return cls._conn
55 |
56 | @classmethod
57 | def resolve_filenames(cls, expr):
58 | files = []
59 |
60 | t = Tokenizer(expr)
61 | scheme = t.get_next('://')
62 | bucket_name = t.get_next('/')
63 | prefix = t.get_next(['*', '?'])
64 |
65 | bucket = cls._get_conn().get_bucket(
66 | bucket_name,
67 | validate=False
68 | )
69 | expr = expr[len(scheme) + 3 + len(bucket_name) + 1:]
70 | for k in bucket.list(prefix=prefix):
71 | if fnmatch(k.name, expr) or fnmatch(k.name, expr + '/part*'):
72 | files.append(f'{scheme}://{bucket_name}/{k.name}')
73 | return files
74 |
75 | @classmethod
76 | def resolve_content(cls, expr):
77 | scheme, bucket_name, folder_path, pattern = parse_file_uri(expr)
78 |
79 | folder_path = folder_path[1:] # Remove leading slash
80 |
81 | expr = f"{folder_path}{pattern}"
82 | # Match all files inside folders that match expr
83 | pattern_expr = f"{expr}{'' if expr.endswith('/') else '/'}*"
84 |
85 | bucket = cls._get_conn().get_bucket(
86 | bucket_name,
87 | validate=False
88 | )
89 | files = []
90 | for k in bucket.list(prefix=folder_path):
91 | if fnmatch(k.name, expr) or fnmatch(k.name, pattern_expr):
92 | files.append(f'{scheme}://{bucket_name}/{k.name}')
93 | return files
94 |
95 | def exists(self):
96 | t = Tokenizer(self.file_name)
97 | t.get_next('//') # skip scheme
98 | bucket_name = t.get_next('/')
99 | key_name = t.get_next()
100 | conn = self._get_conn()
101 | bucket = conn.get_bucket(bucket_name, validate=False)
102 | return (bucket.get_key(key_name)
103 | or bucket.list(prefix=f'{key_name}/'))
104 |
105 | def load(self):
106 | log.debug('Loading %s with size %s.', self.key.name, self.key.size)
107 | return BytesIO(self.key.get_contents_as_string())
108 |
109 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
110 | log.debug('Loading %s with size %s.', self.key.name, self.key.size)
111 | return StringIO(
112 | self.key.get_contents_as_string().decode(encoding, encoding_errors)
113 | )
114 |
115 | def dump(self, stream):
116 | log.debug('Dumping to %s.', self.key.name)
117 | self.key.set_contents_from_file(stream)
118 | return self
119 |
120 | def make_public(self, recursive=False):
121 | self.key.make_public(recursive)
122 | return self
123 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/utils.py:
--------------------------------------------------------------------------------
1 | from ....fileio import File, TextFile
2 | from ...casts import get_caster
3 | from ...types import (
4 | DecimalType, DoubleType, IntegerType, LongType, row_from_keyed_values, StringType, StructField, StructType,
5 | TimestampType
6 | )
7 | from ...utils import AnalysisException
8 |
9 |
10 | def resolve_partitions(patterns):
11 | """
12 | Given a list of patterns, returns all the files matching or in folders matching
13 | one of them.
14 |
15 | The files are returned in a dict with one entry per file:
16 | - the key is the file path
17 | - the value is the partition keys and values (as a Row) if any were encountered, else None
18 |
19 | In addition to this dict, return a schema for the partition keys if the data was
20 | partitioned, else None
21 |
22 | :type patterns: list of str
23 | :rtype: Tuple[Dict[str, Optional[Row]], Optional[StructType]]
24 | """
25 | file_paths = File.get_content(patterns)
26 | if not file_paths:
27 | raise AnalysisException(f'Path does not exist: {patterns}')
28 | partitions = {}
29 | for file_path in file_paths:
30 | if "=" in file_path:
31 | row = row_from_keyed_values(
32 | folder.split("=")
33 | for folder in file_path.split("/")[:-1]
34 | if folder.count("=") == 1
35 | )
36 | partitions[file_path] = row
37 | else:
38 | partitions[file_path] = None
39 |
40 | partitioning_field_sets = set(p.__fields__ for p in partitions.values() if p is not None)
41 | if len(partitioning_field_sets) > 1:
42 | raise Exception(
43 | f"Conflicting directory structures detected while reading {','.join(patterns)}. "
44 | f"All partitions must have the same partitioning fields,"
45 | f" found fields {' and also '.join(str(fields) for fields in partitioning_field_sets)}"
46 | )
47 |
48 | if partitioning_field_sets:
49 | if any(value is None for value in partitions.values()):
50 | paths = [path for path, value in partitions.items() if value is None]
51 | raise AnalysisException(
52 | f"Unable to parse those malformed folders: {paths} of {file_paths}"
53 | )
54 | partitioning_fields = partitioning_field_sets.pop()
55 | partition_schema = guess_schema_from_strings(
56 | partitioning_fields, partitions.values(), options={}
57 | )
58 | else:
59 | partition_schema = None
60 |
61 | return partitions, partition_schema
62 |
63 |
64 | def guess_schema_from_strings(schema_fields, data, options):
65 | field_values = [
66 | (field, [row[field] for row in data])
67 | for field in schema_fields
68 | ]
69 |
70 | field_types_and_values = [
71 | (field, guess_type_from_values_as_string(values, options))
72 | for field, values in field_values
73 | ]
74 |
75 | schema = StructType(fields=[
76 | StructField(field, field_type)
77 | for field, field_type in field_types_and_values
78 | ])
79 |
80 | return schema
81 |
82 |
83 | def guess_type_from_values_as_string(values, options):
84 | # Reproduces inferences available in Spark
85 | # PartitioningUtils.inferPartitionColumnValue()
86 | # located in org.apache.spark.sql.execution.datasources
87 | tested_types = (
88 | IntegerType(),
89 | LongType(),
90 | DecimalType(),
91 | DoubleType(),
92 | TimestampType(),
93 | StringType()
94 | )
95 | string_type = StringType()
96 | for tested_type in tested_types:
97 | type_caster = get_caster(from_type=string_type, to_type=tested_type, options=options)
98 | try:
99 | for value in values:
100 | casted_value = type_caster(value)
101 | if casted_value is None and value not in ("null", None):
102 | raise ValueError
103 | return tested_type
104 | except ValueError:
105 | pass
106 | # Should never happen
107 | raise AnalysisException(
108 | "Unable to find a matching type for some fields, even StringType did not work"
109 | )
110 |
111 |
112 | def get_records(f_name, linesep, encoding):
113 | f_content = TextFile(f_name).load(encoding=encoding).read()
114 | records = f_content.split(linesep) if linesep is not None else f_content.splitlines()
115 | return records
116 |
--------------------------------------------------------------------------------
/logo/create.py:
--------------------------------------------------------------------------------
1 | """Creates an SVG of the Databench logo. Optionally also a png."""
2 |
3 | import os
4 | import random
5 |
6 | import svgwrite
7 |
8 | DATA = [
9 | [0, 1, 1, 1, 1, 1, 1, 1],
10 | [0, 1, 1, 1, 1, 1, 1, 1],
11 | [0, 0, 0, 0, 1, 1, 1, 1],
12 | [0, 0, 0, 1, 1, 1, 1, 1],
13 | [0, 0, 1, 1, 1, 0, 1, 1],
14 | [0, 1, 1, 1, 0, 0, 1, 1],
15 | [1, 1, 1, 0, 0, 0, 1, 1],
16 | [1, 1, 0, 0, 0, 0, 0, 0],
17 | ]
18 |
19 |
20 | def color(x, y):
21 | """triangles.
22 |
23 | Colors:
24 | - http://paletton.com/#uid=70l150klllletuehUpNoMgTsdcs shade 2
25 | """
26 |
27 | return '#42359C' # "#CDB95B"
28 |
29 | if (x - 4) > (y - 4) and -(y - 4) <= (x - 4):
30 | # right
31 | return '#42359C' # "#CDB95B"
32 | elif (x - 4) > (y - 4) and -(y - 4) > (x - 4):
33 | # top
34 | return "#CD845B"
35 | elif (x - 4) <= (y - 4) and -(y - 4) <= (x - 4):
36 | # bottom
37 | return "#57488E"
38 | elif (x - 4) <= (y - 4) and -(y - 4) > (x - 4):
39 | # left
40 | return "#3B8772"
41 |
42 | # should not happen
43 | return "black"
44 |
45 |
46 | def simple(svg_document, x, y, v):
47 | if v == 1:
48 | svg_document.add(svg_document.rect(insert=(x * 16, y * 16),
49 | size=("16px", "16px"),
50 | # rx="2px",
51 | # stroke_width="1",
52 | # stroke=color(x, y),
53 | fill=color(x, y)))
54 |
55 |
56 | def smaller(svg_document, x, y, v, x_offset=0, y_offset=0):
57 | # from center
58 | distance2 = (x - 3.5) ** 2 + (y - 3.5) ** 2
59 | max_distance2 = 2 * 4 ** 2
60 |
61 | if v == 1:
62 | size = 16.0 * (1.0 - distance2 / max_distance2)
63 | number_of_cubes = int(16 ** 2 / (size ** 2))
64 | for i in range(number_of_cubes):
65 | xi = x * 16 + 1 + random.random() * (14.0 - size) + x_offset
66 | yi = y * 16 + 1 + random.random() * (14.0 - size) + y_offset
67 | sizepx = str(size) + "px"
68 | svg_document.add(svg_document.rect(insert=(xi, yi),
69 | size=(sizepx, sizepx),
70 | rx="2px",
71 | stroke_width="1",
72 | # stroke='#4E9954',
73 | stroke='#FAE5A5',
74 | # stroke=color(x, y),
75 | fill=color(x, y)))
76 |
77 |
78 | def main():
79 | svg_favicon = svgwrite.Drawing(filename="favicon.svg",
80 | size=("128px", "128px"))
81 | svg_document = svgwrite.Drawing(filename="logo.svg",
82 | size=("128px", "128px"))
83 | svg_banner = svgwrite.Drawing(filename="banner.svg",
84 | size=("600px", "200px"))
85 | for y, r in enumerate(DATA):
86 | for x, v in enumerate(r):
87 | simple(svg_favicon, x, y, v)
88 | smaller(svg_document, x, y, v)
89 | smaller(svg_banner, x, y, v, x_offset=20, y_offset=40)
90 | # add banner text
91 | g = svg_banner.g(style='font-size:40px; font-family:Arial; font-weight: bold; font-style: italic;')
92 | g.add(svg_banner.text(
93 | 'pysparkling',
94 | insert=(180, 120), fill='#000000'),
95 | )
96 | svg_banner.add(g)
97 | # print(svg_document.tostring())
98 | svg_favicon.save()
99 | svg_document.save()
100 | svg_banner.save()
101 |
102 | # create pngs
103 | os.system('svg2png --width=100 --height=100 logo.svg logo-w100.png')
104 | os.system('svg2png --width=600 --height=600 logo.svg logo-w600.png')
105 | os.system('svg2png --width=500 --height=100 banner.svg banner-w500.png')
106 | os.system('svg2png --width=1500 --height=400 banner.svg banner-w1500.png')
107 | favicon_sizes = [16, 32, 48, 128, 256]
108 | for s in favicon_sizes:
109 | os.system(f'svg2png --width={s} --height={s} favicon.svg favicon-w{s}.png')
110 | png_favicon_names = [f'favicon-w{s}.png' for s in favicon_sizes]
111 | os.system('convert ' + (' '.join(png_favicon_names)) +
112 | ' -colors 256 favicon.ico')
113 |
114 |
115 | if __name__ == "__main__":
116 | random.seed(42)
117 | main()
118 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://raw.githubusercontent.com/svenkreiss/pysparkling/master/logo/logo-w100.png
2 | :target: https://github.com/svenkreiss/pysparkling
3 |
4 | pysparkling
5 | ===========
6 |
7 | **Pysparkling** provides a faster, more responsive way to develop programs
8 | for PySpark. It enables code intended for Spark applications to execute
9 | entirely in Python, without incurring the overhead of initializing and
10 | passing data through the JVM and Hadoop. The focus is on having a lightweight
11 | and fast implementation for small datasets at the expense of some data
12 | resilience features and some parallel processing features.
13 |
14 | **How does it work?** To switch execution of a script from PySpark to pysparkling,
15 | have the code initialize a pysparkling Context instead of a SparkContext, and
16 | use the pysparkling Context to set up your RDDs. The beauty is you don't have
17 | to change a single line of code after the Context initialization, because
18 | pysparkling's API is (almost) exactly the same as PySpark's. Since it's so easy
19 | to switch between PySpark and pysparkling, you can choose the right tool for your
20 | use case.
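
As a small illustrative sketch (assuming a local ``input.txt`` exists), only
the context creation differs from the PySpark version:

.. code-block:: python

    import pysparkling

    # pysparkling.Context() takes the place of pyspark.SparkContext()
    sc = pysparkling.Context()

    rdd = sc.textFile('input.txt')
    print(rdd.map(lambda line: line.upper()).collect())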
21 |
22 | **When would I use it?** Say you are writing a Spark application because you
23 | need robust computation on huge datasets, but you also want the same application
24 | to provide fast answers on a small dataset. You're finding Spark is not responsive
25 | enough for your needs, but you don't want to rewrite an entire separate application
26 | for the *small-answers-fast* problem. You'd rather reuse your Spark code but somehow
27 | get it to run fast. Pysparkling bypasses the stuff that causes Spark's long startup
28 | times and less responsive feel.
29 |
30 | Here are a few areas where pysparkling excels:
31 |
32 | * Small to medium-scale exploratory data analysis
33 | * Application prototyping
34 | * Low-latency web deployments
35 | * Unit tests
36 |
37 |
38 | Install
39 | =======
40 |
41 | .. code-block:: bash
42 |
43 | python3 -m pip install "pysparkling[s3,hdfs,http,streaming]"
44 |
45 |
46 | `Documentation <https://pysparkling.trivial.io>`_:
47 |
48 | .. image:: https://raw.githubusercontent.com/svenkreiss/pysparkling/master/docs/readthedocs.png
49 | :target: https://pysparkling.trivial.io
50 |
51 |
52 | Other links:
53 | `Github <https://github.com/svenkreiss/pysparkling>`_,
54 | |pypi-badge|, |test-badge|, |docs-badge|
55 |
56 | .. |pypi-badge| image:: https://badge.fury.io/py/pysparkling.svg
57 | :target: https://pypi.python.org/pypi/pysparkling/
58 | .. |test-badge| image:: https://github.com/svenkreiss/pysparkling/workflows/Tests/badge.svg
59 | :target: https://github.com/svenkreiss/pysparkling/actions?query=workflow%3ATests
60 | .. |docs-badge| image:: https://readthedocs.org/projects/pysparkling/badge/?version=latest
61 | :target: https://pysparkling.readthedocs.io/en/latest/?badge=latest
62 | :alt: Documentation Status
63 |
64 |
65 | Features
66 | ========
67 |
68 | * Supports URI schemes ``s3://``, ``hdfs://``, ``gs://``, ``http://`` and ``file://``
69 | for Amazon S3, HDFS, Google Storage, web and local file access.
70 | Specify multiple files separated by comma.
71 | Resolves ``*`` and ``?`` wildcards.
72 | * Handles ``.gz``, ``.zip``, ``.lzma``, ``.xz``, ``.bz2``, ``.tar``,
73 | ``.tar.gz`` and ``.tar.bz2`` compressed files.
74 | Supports reading of ``.7z`` files.
75 | * Parallelization via ``multiprocessing.Pool``,
76 | ``concurrent.futures.ThreadPoolExecutor`` or any other Pool-like
77 | object that has a ``map(func, iterable)`` method (see the sketch below).
78 | * Plain pysparkling does not have any dependencies (use ``pip install pysparkling``).
79 | Some file access methods have optional dependencies:
80 | ``boto`` for AWS S3, ``requests`` for http, ``hdfs`` for hdfs
81 |
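A minimal sketch of plugging in a pool (this assumes ``Context`` accepts a
Pool-like object via a ``pool`` keyword argument; the exact signature may
differ, see ``docs/sphinx/parallel.rst``):

.. code-block:: python

    import concurrent.futures

    import pysparkling

    # any object with a map(func, iterable) method can be passed in here
    sc = pysparkling.Context(pool=concurrent.futures.ThreadPoolExecutor(4))
    print(sc.parallelize(range(10)).map(lambda x: x ** 2).collect())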
82 |
83 | Examples
84 | ========
85 |
86 | Some demos are in the notebooks
87 | `docs/demo.ipynb <https://github.com/svenkreiss/pysparkling/blob/master/docs/demo.ipynb>`_
88 | and
89 | `docs/iris.ipynb <https://github.com/svenkreiss/pysparkling/blob/master/docs/iris.ipynb>`_
90 | .
91 |
92 | **Word Count**
93 |
94 | .. code-block:: python
95 |
96 | from pysparkling import Context
97 |
98 | counts = (
99 | Context()
100 | .textFile('README.rst')
101 | .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line))
102 | .flatMap(lambda line: line.split(' '))
103 | .map(lambda word: (word, 1))
104 | .reduceByKey(lambda a, b: a + b)
105 | )
106 | print(counts.collect())
107 |
108 | which prints a long list of pairs of words and their counts.
109 |
--------------------------------------------------------------------------------
/scripts/tcpperf_server.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import json
3 | import logging
4 | import math
5 | import os
6 | import struct
7 | import time
8 |
9 | import pysparkling
10 |
11 | N_CONNECTIONS = (100, 1000, 2000, 3000, 3500, 4000, 4500, 5000,
12 | 6000, 7000, 8000)
13 | N_CONNECTIONS_1K = (10, 20, 30, 40, 45, 50, 60, 70, 80, 90, 100)
14 |
15 |
16 | class Server:
17 | def __init__(self, pause=60, values=1, start_port=8123, processes=2):
18 | self.pause = pause
19 | self.values = values
20 | self.port = start_port
21 | self.processes = processes
22 |
23 | def client(self, n=2000, format_='hello'):
24 | for _ in range(self.processes):
25 | os.system(
26 | f'python tests/tcpperf_client.py -n {int(n / self.processes)}'
27 | f' --port {self.port} --format {format_} --values {self.values}'
28 | f' &'
29 | )
30 |
31 | def _run_process(self, n, to_kv, format_):
32 | c = pysparkling.Context()
33 | stream_c = pysparkling.streaming.StreamingContext(c, 1.0)
34 |
35 | counts = []
36 | sensor_sums = defaultdict(float)
37 | sensor_squares = defaultdict(float)
38 | sensor_counts = defaultdict(int)
39 | if format_ not in ('bello', 'struct'):
40 | t = stream_c.socketTextStream('localhost', self.port)
41 | else:
42 | length = {'bello': 5, 'struct': 8}[format_]
43 | t = stream_c.socketBinaryStream('localhost', self.port, length)
44 | t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0]))
45 | if to_kv is not None:
46 | def update(rdd):
47 | for k, v in rdd.collect():
48 | sensor_sums[k] += sum(v)
49 | sensor_squares[k] += sum(vv ** 2 for vv in v)
50 | sensor_counts[k] += len(v)
51 |
52 | t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd))
53 |
54 | self.client(n, format_=format_)
55 |
56 | stream_c.start()
57 | stream_c.awaitTermination(timeout=5.0)
58 |
59 | return (
60 | counts,
61 | sensor_sums,
62 | sensor_squares,
63 | sensor_counts
64 | )
65 |
66 | def run(self, n=2000, to_kv=None, format_='hello'):
67 | counts, sensor_sums, sensor_squares, sensor_counts = self._run_process(n, to_kv, format_)
68 |
69 | result = max(counts) if counts else 0
70 | sensor_expectations = {
71 | # expectation of X and X^2
72 | k: (sensor_sums[k] / v, sensor_squares[k] / v)
73 | for k, v in sensor_counts.items()
74 | }
75 | sensors = {
76 | k: (ex_ex2[0], math.sqrt(ex_ex2[1] - ex_ex2[0] ** 2))
77 | for k, ex_ex2 in sensor_expectations.items()
78 | }
79 | print(f'run: n = {n}, counts = {counts}, result = {result}')
80 | print(f'sensors = {sensors}')
81 | time.sleep(self.pause)
82 | self.port += 1
83 | return result
84 |
85 |
86 | def main():
87 | logging.basicConfig(level=logging.WARNING)
88 |
89 | def kv_from_text(text):
90 | k, _, v = text.partition('|')
91 | return k, float(v)
92 |
93 | def kv_from_json(text):
94 | j = json.loads(text)
95 | return list(j.items())[0]
96 |
97 | def kv_from_struct(b):
98 | s, v = struct.unpack('If', b)
99 | return f'sensor{s}', v
100 |
101 | with open('tests/tcpperf_messages.csv', 'w', encoding='utf8') as f:
102 | f.write('# messages, hello, text, json, bello, struct\n')
103 | server_1k = Server(pause=2, values=1000, processes=5)
104 | for n in reversed(N_CONNECTIONS_1K):
105 | data = (
106 | n * 1000,
107 | server_1k.run(n),
108 | server_1k.run(n, None, 'bello'),
109 | server_1k.run(n, kv_from_text, 'text'),
110 | server_1k.run(n, kv_from_json, 'json'),
111 | server_1k.run(n, kv_from_struct, 'struct'),
112 | )
113 | f.write(', '.join(f'{d}' for d in data) + '\n')
114 |
115 | with open('tests/tcpperf_connections.csv', 'w', encoding='utf8') as f:
116 | f.write('# messages, hello, text, json, bello, struct\n')
117 | server = Server()
118 | for n in reversed(N_CONNECTIONS):
119 | data = (
120 | n,
121 | server.run(n),
122 | server.run(n, None, 'bello'),
123 | server.run(n, kv_from_text, 'text'),
124 | server.run(n, kv_from_json, 'json'),
125 | server.run(n, kv_from_struct, 'struct'),
126 | )
127 | f.write(', '.join(f'{d}' for d in data) + '\n')
128 |
129 |
130 | if __name__ == '__main__':
131 | main()
132 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/collectors.py:
--------------------------------------------------------------------------------
1 | from .aggregations import Aggregation
2 |
3 |
4 | class CollectList(Aggregation):
5 | pretty_name = "collect_list"
6 |
7 | def __init__(self, column):
8 | super().__init__(column)
9 | self.column = column
10 | self.items = []
11 |
12 | def merge(self, row, schema):
13 | self.items.append(self.column.eval(row, schema))
14 |
15 | def mergeStats(self, other, schema):
16 | self.items += other.items
17 |
18 | def eval(self, row, schema):
19 | return self.items
20 |
21 | def args(self):
22 | return (self.column,)
23 |
24 |
25 | class CollectSet(Aggregation):
26 | pretty_name = "collect_set"
27 |
28 | def __init__(self, column):
29 | super().__init__(column)
30 | self.column = column
31 | self.items = set()
32 |
33 | def merge(self, row, schema):
34 | self.items.add(self.column.eval(row, schema))
35 |
36 | def mergeStats(self, other, schema):
37 | self.items |= other.items
38 |
39 | def eval(self, row, schema):
40 | return list(self.items)
41 |
42 | def args(self):
43 | return (self.column,)
44 |
45 |
46 | class SumDistinct(Aggregation):
47 | pretty_name = "sum_distinct"
48 |
49 | def __init__(self, column):
50 | super().__init__(column)
51 | self.column = column
52 | self.items = set()
53 |
54 | def merge(self, row, schema):
55 | self.items.add(self.column.eval(row, schema))
56 |
57 | def mergeStats(self, other, schema):
58 | self.items |= other.items
59 |
60 | def eval(self, row, schema):
61 | return sum(self.items)
62 |
63 | def args(self):
64 | return (self.column,)
65 |
66 |
67 | class First(Aggregation):
68 | pretty_name = "first"
69 | _sentinel = object()
70 |
71 | def __init__(self, column, ignore_nulls):
72 | super().__init__(column)
73 | self.column = column
74 | self.value = self._sentinel
75 | self.ignore_nulls = ignore_nulls.get_literal_value()
76 |
77 | def merge(self, row, schema):
78 | if self.value is First._sentinel or (self.ignore_nulls and self.value is None):
79 | self.value = self.column.eval(row, schema)
80 |
81 | def mergeStats(self, other, schema):
82 | if self.value is First._sentinel or (self.ignore_nulls and self.value is None):
83 | self.value = other.value
84 |
85 | def eval(self, row, schema):
86 | return self.value if self.value is not First._sentinel else None
87 |
88 | def args(self):
89 | return (
90 | self.column,
91 | str(self.ignore_nulls).lower()
92 | )
93 |
94 |
95 | class Last(Aggregation):
96 | pretty_name = "last"
97 | _sentinel = object()
98 |
99 | def __init__(self, column, ignore_nulls):
100 | super().__init__(column)
101 | self.column = column
102 | self.value = None
103 | self.ignore_nulls = ignore_nulls.get_literal_value()
104 |
105 | def merge(self, row, schema):
106 | new_value = self.column.eval(row, schema)
107 | if not (self.ignore_nulls and new_value is None):
108 | self.value = new_value
109 |
110 | def mergeStats(self, other, schema):
111 | if not (self.ignore_nulls and other.value is None):
112 | self.value = other.value
113 |
114 | def eval(self, row, schema):
115 | return self.value
116 |
117 | def args(self):
118 | return (
119 | self.column,
120 | str(self.ignore_nulls).lower()
121 | )
122 |
123 |
124 | class CountDistinct(Aggregation):
125 | pretty_name = "count"
126 |
127 | def __init__(self, columns):
128 | super().__init__(columns)
129 | self.columns = columns
130 | self.items = set()
131 |
132 | def merge(self, row, schema):
133 | self.items.add(tuple(
134 | col.eval(row, schema) for col in self.columns
135 | ))
136 |
137 | def mergeStats(self, other, schema):
138 | self.items |= other.items
139 |
140 | def eval(self, row, schema):
141 | return len(self.items)
142 |
143 | def args(self):
144 | return f"DISTINCT {','.join(self.columns)}"
145 |
146 |
147 | class ApproxCountDistinct(Aggregation):
148 | pretty_name = "approx_count_distinct"
149 |
150 | def __init__(self, column):
151 | super().__init__(column)
152 | self.column = column
153 | self.items = set()
154 |
155 | def merge(self, row, schema):
156 | self.items.add(self.column.eval(row, schema))
157 |
158 | def mergeStats(self, other, schema):
159 | self.items |= other.items
160 |
161 | def eval(self, row, schema):
162 | return len(self.items)
163 |
164 | def args(self):
165 | return (self.column,)
166 |
167 |
168 | __all__ = [
169 | "SumDistinct", "ApproxCountDistinct", "CollectList", "CollectSet",
170 | "First", "CountDistinct", "Last"
171 | ]
172 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/jsonreader.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import itertools
3 | import json
4 |
5 | from ...casts import get_struct_caster
6 | from ...internal_utils.options import Options
7 | from ...internal_utils.readers.utils import get_records, resolve_partitions
8 | from ...schema_utils import infer_schema_from_rdd
9 | from ...types import create_row, row_from_keyed_values, StructType
10 |
11 |
12 | class JSONReader:
13 | default_options = dict(
14 | primitivesAsString=False,
15 | prefersDecimal=False,
16 | allowComments=False,
17 | allowUnquotedFieldNames=False,
18 | allowSingleQuotes=True,
19 | allowNumericLeadingZero=False,
20 | allowBackslashEscapingAnyCharacter=False,
21 | mode="PERMISSIVE",
22 | columnNameOfCorruptRecord="",
23 | dateFormat="yyyy-MM-dd",
24 | timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX",
25 | multiLine=False,
26 | allowUnquotedControlChars=False,
27 | encoding=None,
28 | lineSep=None,
29 | samplingRatio=1.0,
30 | dropFieldIfAllNull=False,
31 | locale="en-US",
32 | )
33 |
34 | def __init__(self, spark, paths, schema, options):
35 | self.spark = spark
36 | self.paths = paths
37 | self.schema = schema
38 | self.options = Options(self.default_options, options)
39 |
40 | def read(self):
41 | sc = self.spark._sc
42 | paths = self.paths
43 |
44 | partitions, partition_schema = resolve_partitions(paths)
45 |
46 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
47 | rdd = rdd_filenames.flatMap(partial(
48 | parse_json_file,
49 | partitions,
50 | partition_schema,
51 | self.schema,
52 | self.options
53 | ))
54 |
55 | inferred_schema = infer_schema_from_rdd(rdd)
56 |
57 | schema = self.schema if self.schema is not None else inferred_schema
58 | schema_fields = {
59 | field.name: field
60 | for field in schema.fields
61 | }
62 |
63 | # Field order is defined by fields in the record, not by the given schema
64 | # Field type is defined by the given schema or inferred
65 | full_schema = StructType(
66 | fields=[
67 | schema_fields.get(field.name, field)
68 | for field in inferred_schema.fields
69 | ]
70 | )
71 |
72 | cast_row = get_struct_caster(inferred_schema, full_schema, options=self.options)
73 | casted_rdd = rdd.map(cast_row)
74 | casted_rdd._name = paths
75 |
76 | # pylint: disable=import-outside-toplevel, cyclic-import
77 | from ...internals import DataFrameInternal
78 |
79 | return DataFrameInternal(
80 | sc,
81 | casted_rdd,
82 | schema=full_schema
83 | )
84 |
85 |
86 | def parse_json_file(partitions, partition_schema, schema, options, file_name):
87 | records = get_records(file_name, options.linesep, options.encoding)
88 | rows = []
89 | for record in records:
90 | partition = partitions[file_name]
91 | row = parse_record(record, schema, partition, partition_schema, options)
92 | row.set_input_file_name(file_name)
93 | rows.append(row)
94 | return rows
95 |
96 |
97 | def parse_record(record, schema, partition, partition_schema, options):
98 | raw_record_value = json.loads(record, encoding=options.encoding)
99 | if not isinstance(raw_record_value, dict):
100 | raise NotImplementedError(
101 | "Top level items should be JSON objects (dicts),"
102 | f" got {type(raw_record_value)} with {raw_record_value}"
103 | )
104 | record_value = decode_record(raw_record_value)
105 | if schema is not None:
106 | record_fields = record_value.__fields__
107 | available_names = tuple(partition_schema.names) + record_fields
108 | field_names = [name for name in record_fields if name in schema.names] + [
109 | f.name for f in schema.fields if f.name not in available_names
110 | ]
111 | else:
112 | field_names = list(record_value.__fields__)
113 | record_values = [
114 | record_value[field_name] if field_name in record_value.__fields__ else None
115 | for field_name in field_names
116 | ]
117 | partition_field_names = [f.name for f in partition_schema.fields] if partition_schema else []
118 | # pylint: disable=W0511
119 | # todo: handle nested rows
120 | row = create_row(
121 | itertools.chain(field_names, partition_field_names),
122 | itertools.chain(record_values, partition)
123 | )
124 | return row
125 |
126 |
127 | def decode_record(item):
128 | if isinstance(item, list):
129 | return [decode_record(e) for e in item]
130 | if isinstance(item, dict):
131 | return row_from_keyed_values(
132 | (key, decode_record(value))
133 | for key, value in item.items()
134 | )
135 | return item
136 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/test_session.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | import pytest
4 |
5 | from pysparkling import Context, StorageLevel
6 | from pysparkling.sql.session import SparkSession
7 | from pysparkling.sql.types import (
8 | ArrayType, DoubleType, IntegerType, LongType, MapType, Row, row_from_keyed_values, StringType, StructField,
9 | StructType
10 | )
11 | from pysparkling.sql.utils import require_minimum_pandas_version
12 |
13 | try:
14 | require_minimum_pandas_version()
15 | has_pandas = True
16 | except ImportError:
17 | has_pandas = False
18 |
19 |
20 | class SessionTests(TestCase):
21 | spark = SparkSession(sparkContext=Context())
22 |
23 | def test_session_range(self):
24 | df = self.spark.range(3)
25 | self.assertEqual(df.count(), 3)
26 | self.assertListEqual(df.collect(), [Row(id=0), Row(id=1), Row(id=2)])
27 | self.assertEqual(list(df.toLocalIterator()), [Row(id=0), Row(id=1), Row(id=2)])
28 |
29 | def test_session_create_data_frame_from_rdd(self):
30 | df = self.spark.createDataFrame(self.spark.sparkContext.parallelize([
31 | (1, "one"),
32 | (2, "two"),
33 | (3, "three"),
34 | ]))
35 | self.assertEqual(df.count(), 3)
36 | self.assertListEqual(
37 | df.collect(),
38 | [Row(_1=1, _2='one'),
39 | Row(_1=2, _2='two'),
40 | Row(_1=3, _2='three')])
41 | self.assertEqual(
42 | df.schema,
43 | StructType([StructField("_1", LongType(), True), StructField("_2", StringType(), True)])
44 | )
45 |
46 | def test_session_create_data_frame_from_list(self):
47 | df = self.spark.createDataFrame([
48 | (1, "one"),
49 | (2, "two"),
50 | (3, "three"),
51 | ])
52 | self.assertEqual(df.count(), 3)
53 | self.assertListEqual(
54 | df.collect(),
55 | [Row(_1=1, _2='one'),
56 | Row(_1=2, _2='two'),
57 | Row(_1=3, _2='three')])
58 | self.assertEqual(
59 | df.schema,
60 | StructType([StructField("_1", LongType(), True), StructField("_2", StringType(), True)])
61 | )
62 |
63 | @pytest.mark.skipif(not has_pandas, reason='pandas is not installed')
64 | def test_session_create_data_frame_from_pandas_data_frame(self):
65 | try:
66 | # Pandas is an optional dependency
67 | # pylint: disable=import-outside-toplevel
68 | import pandas as pd
69 | except ImportError as e:
70 | raise ImportError("pandas is not importable") from e
71 |
72 | pdf = pd.DataFrame([
73 | (1, "one"),
74 | (2, "two"),
75 | (3, "three")
76 | ])
77 |
78 | df = self.spark.createDataFrame(pdf)
79 |
80 | self.assertEqual(df.count(), 3)
81 | self.assertListEqual(
82 | df.collect(),
83 | [Row(**{"0": 1, "1": 'one'}),
84 | Row(**{"0": 2, "1": 'two'}),
85 | Row(**{"0": 3, "2": 'three'})])
86 | self.assertEqual(
87 | df.schema,
88 | StructType([StructField("0", LongType(), True), StructField("1", StringType(), True)])
89 | )
90 |
91 | def test_session_create_data_frame_from_list_with_col_names(self):
92 | df = self.spark.createDataFrame([(0.0, [1.0, 0.8]),
93 | (1.0, [0.0, 0.0]),
94 | (2.0, [0.5, 0.5])],
95 | ["label", "features"])
96 | self.assertEqual(df.count(), 3)
97 | self.assertListEqual(
98 | df.collect(),
99 | [
100 | row_from_keyed_values([("label", 0.0), ("features", [1.0, 0.8])]),
101 | row_from_keyed_values([("label", 1.0), ("features", [0.0, 0.0])]),
102 | row_from_keyed_values([("label", 2.0), ("features", [0.5, 0.5])]),
103 | ]
104 | )
105 |
106 | self.assertEqual(
107 | df.schema,
108 | StructType([
109 | StructField("label", DoubleType(), True),
110 | StructField("features", ArrayType(DoubleType(), True), True)
111 | ])
112 | )
113 |
114 | def test_session_create_data_frame_from_list_with_schema(self):
115 | schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
116 | df = self.spark.createDataFrame([({'a': 1},)], schema=schema)
117 | self.assertEqual(df.count(), 1)
118 | self.assertListEqual(
119 | df.collect(),
120 | [Row(map={'a': 1})]
121 | )
122 | self.assertEqual(df.schema, schema)
123 |
124 | def test_session_storage_level(self):
125 | spark = SparkSession(Context())
126 | df = spark.range(4, numPartitions=2)
127 | self.assertEqual(repr(df.storageLevel), repr(StorageLevel(False, False, False, False, 1)))
128 | persisted_df = df.persist()
129 | self.assertEqual(persisted_df.is_cached, True)
130 | self.assertEqual(repr(persisted_df.storageLevel), repr(StorageLevel.MEMORY_ONLY))
131 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/test_write.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import shutil
4 | from unittest import TestCase
5 |
6 | from dateutil.tz import tzlocal
7 |
8 | from pysparkling import Context, Row
9 | from pysparkling.sql.session import SparkSession
10 | from pysparkling.sql.utils import AnalysisException
11 |
12 | spark = SparkSession(Context())
13 |
14 |
15 | def get_folder_content(folder_path):
16 | folder_content = {}
17 | for root, _, files in os.walk(folder_path):
18 | relative_path = root[len(folder_path):]
19 | for file in files:
20 | file_path = os.path.join(root, file)
21 | with open(file_path, 'r', encoding='utf8') as file_content:
22 | folder_content[os.path.join(relative_path, file)] = file_content.readlines()
23 | return folder_content
24 |
25 |
26 | class DataFrameWriterTests(TestCase):
27 | maxDiff = None
28 |
29 | @staticmethod
30 | def clean():
31 | if os.path.exists(".tmp"):
32 | shutil.rmtree(".tmp")
33 |
34 | def setUp(self):
35 | self.clean()
36 |
37 | tz = datetime.datetime.now().astimezone().strftime('%z') # +0100
38 | self.tz = f'{tz[:3]}:{tz[3:]}' # --> +01:00
39 |
40 | def tearDown(self):
41 | self.clean()
42 |
43 | def test_write_to_csv(self):
44 | df = spark.createDataFrame(
45 | [Row(age=2, name='Alice', time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()), ),
46 | Row(age=5, name='Bob', time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()))]
47 | )
48 | df.write.csv(".tmp/wonderland/")
49 | self.assertDictEqual(
50 | get_folder_content(".tmp/wonderland"),
51 | {
52 | '_SUCCESS': [],
53 | 'part-00000-8447389540241120843.csv': [
54 | f'2,Alice,2017-01-01T00:00:00.000{self.tz}\n',
55 | f'5,Bob,2014-03-02T00:00:00.000{self.tz}\n'
56 | ]
57 | }
58 | )
59 |
60 | def test_write_to_csv_with_custom_options(self):
61 | df = spark.createDataFrame(
62 | [
63 | Row(age=2, name='Alice', occupation=None),
64 | Row(age=5, name='Bob', occupation=""),
65 | ]
66 | )
67 | df.write.csv(".tmp/wonderland/", sep="^", emptyValue="", nullValue="null", header=True)
68 | self.assertDictEqual(
69 | get_folder_content(".tmp/wonderland"),
70 | {
71 | '_SUCCESS': [],
72 | 'part-00000-4061950540148431296.csv': [
73 | 'age^name^occupation\n',
74 | '2^Alice^null\n',
75 | '5^Bob^\n',
76 | ],
77 | }
78 | )
79 |
80 | def test_write_to_csv_fail_when_overwrite(self):
81 | df = spark.createDataFrame(
82 | [Row(age=2, name='Alice'),
83 | Row(age=5, name='Bob')]
84 | )
85 | df.write.csv(".tmp/wonderland/")
86 | with self.assertRaises(AnalysisException) as ctx:
87 | df.write.csv(".tmp/wonderland/")
88 | self.assertEqual(ctx.exception.args[0], 'path .tmp/wonderland already exists.;')
89 | self.assertDictEqual(
90 | get_folder_content(".tmp/wonderland"),
91 | {
92 | '_SUCCESS': [],
93 | 'part-00000-3434325560268771971.csv': [
94 | '2,Alice\n',
95 | '5,Bob\n',
96 | ],
97 | }
98 | )
99 |
100 | def test_write_to_json(self):
101 | df = spark.createDataFrame(
102 | [Row(age=2, name='Alice', time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()), ),
103 | Row(age=5, name='Bob', time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()))]
104 | )
105 | df.write.json(".tmp/wonderland/")
106 | self.assertDictEqual(
107 | get_folder_content(".tmp/wonderland"),
108 | {
109 | '_SUCCESS': [],
110 | 'part-00000-8447389540241120843.json': [
111 | f'{{"age":2,"name":"Alice","time":"2017-01-01T00:00:00.000{self.tz}"}}\n',
112 | f'{{"age":5,"name":"Bob","time":"2014-03-02T00:00:00.000{self.tz}"}}\n',
113 | ],
114 | }
115 | )
116 |
117 | def test_write_nested_rows_to_json(self):
118 | df = spark.createDataFrame([
119 | Row(age=2, name='Alice', animals=[
120 | Row(name="Chessur", type="cat"),
121 | Row(name="The White Rabbit", type="Rabbit")]),
122 | Row(age=5, name='Bob', animals=[])
123 | ])
124 | df.write.json(".tmp/wonderland/")
125 | self.assertDictEqual(
126 | get_folder_content(".tmp/wonderland"),
127 | {
128 | '_SUCCESS': [],
129 | 'part-00000-2819354714706678872.json': [
130 | '{"age":2,"animals":['
131 | '{"name":"Chessur","type":"cat"},'
132 | '{"name":"The White Rabbit","type":"Rabbit"}'
133 | '],"name":"Alice"}\n',
134 | '{"age":5,"animals":[],"name":"Bob"}\n',
135 | ],
136 | }
137 | )
138 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_resolve_filenames.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 |
5 | from pysparkling.fileio import File
6 |
7 | CURRENT_FILE_LOCATION = __file__
8 |
9 |
10 | class MockedHdfsClient:
11 | def list(self, path, status):
12 | if path == "/user/username/":
13 | return [
14 | ("input", {"type": "DIRECTORY"}),
15 | ("output", {"type": "DIRECTORY"})
16 | ]
17 | if path in ('/user/username/input', '/user/username/input/'):
18 | return [
19 | ("part-00001.gz", {"type": "FILE"}),
20 | ("part-00002.gz", {"type": "FILE"}),
21 | ("_SUCCESS", {"type": "FILE"})
22 | ]
23 | raise NotImplementedError(f"Return value not mocked for '{path}'")
24 |
25 |
26 | class MockedS3Bucket:
27 | def list(self, *args, **kwargs):
28 | return [
29 | MockedS3Key("user/username/input/part-00001.gz"),
30 | MockedS3Key("user/username/input/part-00002.gz"),
31 | MockedS3Key("user/username/input/_SUCCESS"),
32 | ]
33 |
34 |
35 | class MockedS3Connection:
36 | def get_bucket(self, *args, **kwargs):
37 | return MockedS3Bucket()
38 |
39 |
40 | class MockedS3Key:
41 | def __init__(self, name):
42 | self.name = name
43 |
44 |
45 | def test_local_1():
46 | filenames = File.resolve_filenames(
47 | f'{os.path.dirname(CURRENT_FILE_LOCATION)}{os.path.sep}*'
48 | )
49 | assert CURRENT_FILE_LOCATION in filenames
50 |
51 |
52 | def test_local_2():
53 | filenames = File.resolve_filenames(CURRENT_FILE_LOCATION)
54 | assert filenames == [CURRENT_FILE_LOCATION]
55 |
56 |
57 | @pytest.mark.skipif(not os.getenv('AWS_ACCESS_KEY_ID'), reason='no AWS env')
58 | def test_s3_1():
59 | filenames = File.resolve_filenames(
60 | 's3n://aws-publicdatasets/common-crawl/'
61 | 'crawl-data/CC-MAIN-2015-11/warc.paths.*'
62 | )
63 | print(filenames)
64 | assert ('s3n://aws-publicdatasets/common-crawl/'
65 | 'crawl-data/CC-MAIN-2015-11/warc.paths.gz' in filenames)
66 |
67 |
68 | def test_hdfs_resolve_filenames_with_wildcard():
69 | # hdfs is an optional dependency
70 | # pylint: disable=import-outside-toplevel
71 | from pysparkling.fileio.fs import Hdfs
72 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
73 |
74 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/part-*.gz")
75 | print(filenames)
76 | assert filenames == [
77 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz',
78 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz'
79 | ]
80 |
81 |
82 | def test_hdfs_resolve_filenames_with_folder_path():
83 | # hdfs is an optional dependency
84 | # pylint: disable=import-outside-toplevel
85 | from pysparkling.fileio.fs import Hdfs
86 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
87 |
88 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input")
89 | print(filenames)
90 | assert filenames == [
91 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz',
92 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz'
93 | ]
94 |
95 |
96 | def test_hdfs_resolve_filenames_with_folder_path_and_trailing_slash():
97 | # hdfs is an optional dependency
98 | # pylint: disable=import-outside-toplevel
99 | from pysparkling.fileio.fs import Hdfs
100 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
101 |
102 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/")
103 | print(filenames)
104 | assert filenames == [
105 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz',
106 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz'
107 | ]
108 |
109 |
110 | def test_hdfs_resolve_filenames_with_file_path():
111 | # hdfs is an optional dependency
112 | # pylint: disable=import-outside-toplevel
113 | from pysparkling.fileio.fs import Hdfs
114 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
115 |
116 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/part-00001.gz")
117 | print(filenames)
118 | assert filenames == [
119 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz'
120 | ]
121 |
122 |
123 | def test_s3_resolve_filenames():
124 | # boto is an optional dependency
125 | # pylint: disable=import-outside-toplevel
126 | from pysparkling.fileio.fs import S3
127 | S3._get_conn = classmethod(lambda *args, **kwargs: MockedS3Connection())
128 |
129 | filenames = S3.resolve_filenames("s3://bucket-name/user/username/input/part-*.gz")
130 | print(filenames)
131 | assert filenames == [
132 | 's3://bucket-name/user/username/input/part-00001.gz',
133 | 's3://bucket-name/user/username/input/part-00002.gz'
134 | ]
135 |
136 |
137 | if __name__ == '__main__':
138 | test_local_1()
139 | test_local_2()
140 | test_s3_1()
141 | test_hdfs_resolve_filenames_with_folder_path()
142 | test_hdfs_resolve_filenames_with_folder_path_and_trailing_slash()
143 | test_hdfs_resolve_filenames_with_file_path()
144 | test_hdfs_resolve_filenames_with_wildcard()
145 | test_s3_resolve_filenames()
146 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/gs.py:
--------------------------------------------------------------------------------
1 | from fnmatch import fnmatch
2 | from io import BytesIO, StringIO
3 | import logging
4 |
5 | from ...exceptions import FileSystemNotSupported
6 | from ...utils import parse_file_uri, Tokenizer
7 | from .file_system import FileSystem
8 |
9 | log = logging.getLogger(__name__)
10 |
11 | try:
12 | from gcloud import storage
13 | except ImportError:
14 | storage = None
15 |
16 |
17 | class GS(FileSystem):
18 | """:class:`.FileSystem` implementation for Google Storage.
19 |
20 | Paths are of the form `gs://bucket_name/file_path` or
21 | `gs://project_name:bucket_name/file_path`.
22 | """
23 |
24 | #: Set a default project name.
25 | project_name = None
26 |
27 | #: Default mime type.
28 | mime_type = 'text/plain'
29 |
30 | _clients = {}
31 |
32 | def __init__(self, file_name):
33 | if storage is None:
34 | raise FileSystemNotSupported(
35 | 'Google Storage is not supported. Install "gcloud".'
36 | )
37 |
38 | super().__init__(file_name)
39 |
40 | # obtain key
41 | t = Tokenizer(self.file_name)
42 | t.get_next('://') # skip scheme
43 | bucket_name = t.get_next('/')
44 | if ':' in bucket_name:
45 | project_name, _, bucket_name = bucket_name.partition(':')
46 | else:
47 | project_name = GS.project_name
48 | blob_name = t.get_next()
49 |
50 | client = GS._get_client(project_name)
51 | bucket = client.get_bucket(bucket_name)
52 | self.blob = bucket.get_blob(blob_name)
53 | if not self.blob:
54 | self.blob = bucket.blob(blob_name)
55 |
56 | @staticmethod
57 | def _get_client(project_name):
58 | if project_name not in GS._clients:
59 | if storage is None:
60 | raise FileSystemNotSupported(
61 | 'Google Storage is not supported. Install "gcloud".'
62 | )
63 | GS._clients[project_name] = storage.Client(project_name)
64 | return GS._clients[project_name]
65 |
66 | @staticmethod
67 | def resolve_filenames(expr):
68 | files = []
69 |
70 | t = Tokenizer(expr)
71 | scheme = t.get_next('://')
72 | bucket_name = t.get_next('/')
73 | if ':' in bucket_name:
74 | project_name, _, bucket_name = bucket_name.partition(':')
75 | else:
76 | project_name = GS.project_name
77 | prefix = t.get_next(['*', '?'])
78 |
79 | bucket = GS._get_client(project_name).get_bucket(bucket_name)
80 | expr_s = len(scheme) + 3 + len(project_name) + 1 + len(bucket_name) + 1
81 | expr = expr[expr_s:]
82 | for k in bucket.list_blobs(prefix=prefix):
83 | if fnmatch(k.name, expr) or fnmatch(k.name, expr + '/part*'):
84 | files.append(f'{scheme}://{project_name}:{bucket_name}/{k.name}')
85 | return files
86 |
87 | @staticmethod
88 | def resolve_content(expr):
89 | scheme, raw_bucket_name, folder_path, pattern = parse_file_uri(expr)
90 |
91 | if ':' in raw_bucket_name:
92 | project_name, _, bucket_name = raw_bucket_name.partition(':')
93 | else:
94 | project_name = GS.project_name
95 | bucket_name = raw_bucket_name
96 |
97 | folder_path = folder_path[1:] # Remove leading slash
98 |
99 | expr = f"{folder_path}{pattern}"
100 | # Match all files inside folders that match expr
101 | pattern_expr = f"{expr}{'' if expr.endswith('/') else '/'}*"
102 |
103 | bucket = GS._get_client(project_name).get_bucket(bucket_name)
104 |
105 | files = []
106 | for k in bucket.list_blobs(prefix=folder_path):
107 | if not k.name.endswith("/") and (
108 | fnmatch(k.name, expr) or fnmatch(k.name, pattern_expr)
109 | ):
110 | files.append(
111 | f'{scheme}://{raw_bucket_name}/{k.name}'
112 | )
113 | return files
114 |
115 | def exists(self):
116 | t = Tokenizer(self.file_name)
117 | t.get_next('//') # skip scheme
118 | bucket_name = t.get_next('/')
119 | if ':' in bucket_name:
120 | project_name, _, bucket_name = bucket_name.partition(':')
121 | else:
122 | project_name = GS.project_name
123 | blob_name = t.get_next()
124 | bucket = GS._get_client(project_name).get_bucket(bucket_name)
125 | return (bucket.get_blob(blob_name)
126 | or list(bucket.list_blobs(prefix=f'{blob_name}/')))
127 |
128 | def load(self):
129 | log.debug('Loading %s with size %s.', self.blob.name, self.blob.size)
130 | return BytesIO(self.blob.download_as_string())
131 |
132 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
133 | log.debug('Loading %s with size %s.', self.blob.name, self.blob.size)
134 | return StringIO(
135 | self.blob.download_as_string().decode(
136 | encoding, encoding_errors
137 | )
138 | )
139 |
140 | def dump(self, stream):
141 | log.debug('Dumping to %s.', self.blob.name)
142 | self.blob.upload_from_string(stream.read(),
143 | content_type=self.mime_type)
144 | return self
145 |
146 | def make_public(self, recursive=False):
147 | self.blob.make_public(recursive)
148 | return self
149 |
--------------------------------------------------------------------------------
/pysparkling/accumulators.py:
--------------------------------------------------------------------------------
1 | # A large part of this module is extracted from its PySpark counterpart at
2 | # https://spark.apache.org/docs/1.5.0/api/python/_modules/pyspark/accumulators.html
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | """
21 | >>> from pysparkling import Context
22 | >>> sc = Context()
23 | >>> a = sc.accumulator(1)
24 | >>> a.value
25 | 1
26 | >>> a.value = 2
27 | >>> a.value
28 | 2
29 | >>> a += 5
30 | >>> a.value
31 | 7
32 |
33 | >>> sc.accumulator(1.0).value
34 | 1.0
35 |
36 | >>> sc.accumulator(1j).value
37 | 1j
38 |
39 | >>> rdd = sc.parallelize([1,2,3])
40 | >>> def f(x):
41 | ... global a
42 | ... a += x
43 | >>> rdd.foreach(f)
44 | >>> a.value
45 | 13
46 |
47 | >>> b = sc.accumulator(0)
48 | >>> def g(x):
49 | ... b.add(x)
50 | >>> rdd.foreach(g)
51 | >>> b.value
52 | 6
53 |
54 | >>> from pysparkling import AccumulatorParam
55 | >>> class VectorAccumulatorParam(AccumulatorParam):
56 | ... def zero(self, value):
57 | ... return [0.0] * len(value)
58 | ... def addInPlace(self, val1, val2):
59 | ... for i in range(len(val1)):
60 | ... val1[i] += val2[i]
61 | ... return val1
62 | >>> va = sc.accumulator([1.0, 2.0, 3.0], VectorAccumulatorParam())
63 | >>> va.value
64 | [1.0, 2.0, 3.0]
65 | >>> def g(x):
66 | ... global va
67 | ... va += [x] * 3
68 | >>> rdd.foreach(g)
69 | >>> va.value
70 | [7.0, 8.0, 9.0]
71 |
72 | >>> sc.accumulator([1.0, 2.0, 3.0]) # doctest: +IGNORE_EXCEPTION_DETAIL
73 | Traceback (most recent call last):
74 | ...
75 | TypeError: No default accumulator param for type
76 | """
77 |
78 |
79 | __all__ = ['Accumulator', 'AccumulatorParam']
80 |
81 |
82 | class Accumulator:
83 | """
84 | A shared variable that can be accumulated, i.e., has a commutative and associative "add"
85 | operation. Tasks can add values to an Accumulator with the ``+=`` operator.
86 |
87 | The API supports accumulators for primitive data types like ``int`` and
88 | ``float``; users can also define accumulators for custom types by providing a custom
89 | ``AccumulatorParam`` object. Refer to the doctest of this module for an example.
90 | """
91 |
92 | def __init__(self, value, accum_param):
93 | """Create a new Accumulator with a given initial value and AccumulatorParam object"""
94 | self.accum_param = accum_param
95 | self._value = value
96 |
97 | @property
98 | def value(self):
99 | return self._value
100 |
101 | @value.setter
102 | def value(self, value):
103 | self._value = value
104 |
105 | def add(self, term):
106 | """Adds a term to this accumulator's value"""
107 | self._value = self.accum_param.addInPlace(self._value, term)
108 |
109 | def __iadd__(self, term):
110 | """The += operator; adds a term to this accumulator's value"""
111 | self.add(term)
112 | return self
113 |
114 | def __str__(self):
115 | return str(self._value)
116 |
117 | def __repr__(self):
118 | return f"Accumulator<value={self._value}>"
119 |
120 |
121 | class AccumulatorParam:
122 | """
123 | Helper object that defines how to accumulate values of a given type.
124 | """
125 | def zero(self, value):
126 | """
127 | Provide a "zero value" for the type, compatible in dimensions with the
128 | provided ``value`` (e.g., a zero vector)
129 | """
130 | raise NotImplementedError
131 |
132 | def addInPlace(self, value1, value2):
133 | """
134 | Add two values of the accumulator's data type, returning a new value;
135 | for efficiency, can also update ``value1`` in place and return it.
136 | """
137 | raise NotImplementedError
138 |
139 |
140 | class AddingAccumulatorParam(AccumulatorParam):
141 | """
142 | An AccumulatorParam that uses the + operator to add values. Designed for simple types
143 | such as integers, floats, and lists. Requires the zero value for the underlying type
144 | as a parameter.
145 | """
146 | def __init__(self, zero_value):
147 | self.zero_value = zero_value
148 |
149 | def zero(self, value):
150 | return self.zero_value
151 |
152 | def addInPlace(self, value1, value2):
153 | value1 += value2
154 | return value1
155 |
156 |
157 | # Singleton accumulator params for some standard types
158 | INT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0)
159 | FLOAT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0)
160 | COMPLEX_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0j)
161 |
162 |
163 | if __name__ == "__main__":
164 | #
165 | # Execute doctests with
166 | #
167 | # $ python -m pysparkling.accumulators -v
168 | #
169 | import doctest
170 | import sys
171 |
172 | failure_count, _ = doctest.testmod()
173 | if failure_count:
174 | sys.exit(-1)
175 |
--------------------------------------------------------------------------------
/docs/sphinx/parallel.rst:
--------------------------------------------------------------------------------
1 | .. _parallel:
2 |
3 |
4 | Parallelization
5 | ===============
6 |
7 | Pysparkling supports parallelizations on the local machine and across clusters
8 | of computers.
9 |
10 |
11 | Processes and Threads
12 | ---------------------
13 |
14 | Single machine parallelization with
15 | ``concurrent.futures.ThreadPoolExecutor``,
16 | ``concurrent.futures.ProcessPoolExecutor`` or
17 | ``multiprocessing.Pool`` is supported. Use ``cloudpickle`` instead of ``pickle`` for
18 | serialization to support lambda functions (and more) for data transformations.
19 |
20 |
21 | .. code-block:: python
22 |
23 | import cloudpickle
24 | import concurrent.futures
25 | import pysparkling
26 |
27 | sc = pysparkling.Context(
28 | pool=concurrent.futures.ProcessPoolExecutor(4),
29 | serializer=cloudpickle.dumps,
30 | deserializer=cloudpickle.loads,
31 | )
32 |
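Once the context is configured, RDD operations dispatch their per-partition tasks to
the pool. Below is a minimal usage sketch, assuming the ``ProcessPoolExecutor`` setup
above; with process pools, guard the entry point with ``if __name__ == '__main__':``
on platforms that spawn rather than fork worker processes.

.. code-block:: python

    # lambdas are fine here because cloudpickle serializes the tasks
    rdd = sc.parallelize(range(10), 4)          # 4 partitions, one task each
    print(rdd.map(lambda x: x ** 2).collect())  # [0, 1, 4, 9, ..., 81]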
33 |
34 |
35 | Experimental
36 | ------------
37 |
38 | The following are experimental notes. Most of them do not yet include examples of how to
39 | use these techniques with pysparkling.
40 |
41 | ipcluster and IPython.parallel
42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
43 |
44 | Local test setup:
45 |
46 | .. code-block:: bash
47 |
48 | ipcluster start --n=2
49 |
50 | .. code-block:: python
51 |
52 | from IPython.parallel import Client
53 |
54 | c = Client()
55 | print(c[:].map(lambda _: 'hello world', range(2)).get())
56 |
57 | which should print ``['hello world', 'hello world']``.
58 |
59 | To run on a cluster, create a profile:
60 |
61 | .. code-block:: bash
62 |
63 | ipython profile create --parallel --profile=smallcluster
64 |
65 | # start controller:
66 | # Creates ~/.ipython/profile_smallcluster/security/ipcontroller-engine.json
67 | # which is used by the engines to identify the location of this controller.
68 | # This is the local-only IP address. Substitute it with the machine's IP
69 | # address so that the engines can find it.
70 | ipcontroller --ip=127.0.0.1 --port=7123 --profile=smallcluster
71 |
72 | # start engines (assuming they have access to the
73 | # ipcontroller-engine.json file)
74 | ipengine --profile=smallcluster
75 |
76 | Test it in Python:
77 |
78 | .. code-block:: python
79 |
80 | from IPython.parallel import Client
81 |
82 | c = Client(profile='smallcluster')
83 | print(c[:].map(lambda _: 'hello world', range(2)).get())
84 |
85 | If you don't want to start the engines manually, ``ipcluster`` comes with
86 | "Launchers" that can start them for you:
87 | https://ipython.org/ipython-doc/dev/parallel/parallel_process.html#using-ipcluster-in-ssh-mode
88 |
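For example (a sketch based on that documentation; the hostnames and engine counts
below are placeholders, not part of this project), the SSH launcher can be enabled in
the profile's ``ipcluster_config.py``:

.. code-block:: python

    # ~/.ipython/profile_smallcluster/ipcluster_config.py
    c = get_config()

    # launch engines over SSH instead of on the local machine
    c.IPClusterEngines.engine_launcher_class = 'SSHEngineSetLauncher'

    # map each remote host to the number of engines to start there
    c.SSHEngineSetLauncher.engines = {
        'worker1.example.com': 4,
        'worker2.example.com': 4,
    }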
89 |
90 | StarCluster
91 | ~~~~~~~~~~~
92 |
93 | Setting up StarCluster was an experiment. However, it does not integrate well
94 | with the rest of our EC2 infrastructure, so we switched to a Chef-based setup
95 | where we use ``ipcluster`` directly. A blocker was that the number of engines
96 | per node is not configurable, while many of our map jobs simply wait on external
97 | responses.
98 |
99 | Setup:
100 |
101 | .. code-block:: bash
102 |
103 | # install
104 | pip install starcluster
105 |
106 | # create configuration
107 | starcluster help # choose the option to create a sample config file
108 |
109 | # add your user id, aws_access_key_id and aws_secret_access_key to config
110 |
111 | # create an ssh key (this creates a new key just for starcluster)
112 | # and registers it with AWS
113 | starcluster createkey starclusterkey -o ~/.ssh/starclusterkey.rsa
114 |
115 | # add this key to config:
116 | [key starclusterkey]
117 | KEY_LOCATION=~/.ssh/starclusterkey.rsa
118 | # and use this key in the cluster setup:
119 | KEYNAME = starclusterkey
120 |
121 | # disable the queue, Sun Grid Engine
122 | # (unnecessary for pysparkling and takes time during setup)
123 | DISABLE_QUEUE=True
124 |
125 | # to enable IPython parallel support, uncomment these lines in config:
126 | [plugin ipcluster]
127 | SETUP_CLASS = starcluster.plugins.ipcluster.IPCluster
128 |
129 | # and make sure you have this line inside the cluster section
130 | [cluster smallcluster]
131 | PLUGINS = ipcluster
132 |
133 | # start the cluster
134 | starcluster start smallcluster
135 |
136 | # check it has started
137 | starcluster listclusters
138 |
139 | We currently use ``ami-da180db2`` (Ubuntu 14.04 with 100 GB EBS) on
140 | ``m3.medium`` instances.
141 |
142 | Workarounds:
143 |
144 | .. code-block:: bash
145 |
146 | # this seems to be a dependency that does not get installed
147 | pip install pexpect
148 |
149 | # to validate the ssh host, you need to log in once manually, to add it
150 | # to the list of known hosts
151 | starcluster sshmaster smallcluster
152 |
153 | In Python, you should now be able to run
154 |
155 | .. code-block:: python
156 |
157 | from IPython.parallel import Client
158 |
159 | # the exact command is printed after the cluster started
160 | rc = Client('/Users/sven/.starcluster/ipcluster/SecurityGroup:@sc-smallcluster-us-east-1.json',
161 | sshkey='/Users/sven/.ssh/starclusterkey.rsa', packer='pickle')
162 |
163 | view = rc[:]
164 | results = view.map(lambda x: x**30, range(8))
165 | print(results.get())
166 |
167 | which is also in ``scripts/starcluster_simple.py``.
168 |
169 |
170 | Install your own software that is not on PyPI:
171 |
172 | .. code-block:: bash
173 |
174 | pip install wheel
175 | python setup.py bdist_wheel # add --universal for Python 2 and 3 packages
176 | starcluster put smallcluster dist/your_package_name.whl /home/sgeadmin/your_package_name.whl
177 |
178 | # ssh into remote machine
179 | starcluster sshmaster smallcluster
180 | > pip install --upgrade pip
181 | > pip install wheel
182 | > pip2.7 install /home/sgeadmin/your_package_name.whl
183 |
184 |
185 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/hdfs.py:
--------------------------------------------------------------------------------
1 | from fnmatch import fnmatch
2 | from io import BytesIO, StringIO
3 | import logging
4 |
5 | from ...exceptions import FileSystemNotSupported
6 | from ...utils import format_file_uri, parse_file_uri
7 | from .file_system import FileSystem
8 |
9 | log = logging.getLogger(__name__)
10 |
11 | try:
12 | import hdfs
13 | except ImportError:
14 | hdfs = None
15 |
16 |
17 | class Hdfs(FileSystem):
18 | """:class:`.FileSystem` implementation for HDFS."""
19 |
20 | _conn = {}
21 |
22 | def __init__(self, file_name):
23 | if hdfs is None:
24 | raise FileSystemNotSupported(
25 | 'hdfs not supported. Install the python package "hdfs".'
26 | )
27 |
28 | super().__init__(file_name)
29 |
30 | @staticmethod
31 | def client_and_path(path):
32 | _, domain, folder_path, file_pattern = parse_file_uri(path)
33 |
34 | if ':' not in domain:
35 | port = 50070
36 | else:
37 | domain, port = domain.split(':')
38 | port = int(port)
39 | cache_id = domain + '__' + str(port)
40 |
41 | if cache_id not in Hdfs._conn:
42 | if hdfs is None:
43 | raise FileSystemNotSupported(
44 | 'hdfs not supported. Install the python package "hdfs".'
45 | )
46 | Hdfs._conn[cache_id] = hdfs.InsecureClient( # pylint: disable=no-member
47 | f'http://{domain}:{port}'
48 | )
49 | return Hdfs._conn[cache_id], folder_path + file_pattern
50 |
51 | def exists(self):
52 | c, p = Hdfs.client_and_path(self.file_name)
53 | try:
54 | c.status(p)
55 | except hdfs.util.HdfsError: # pylint: disable=no-member
56 | return False
57 | return True
58 |
59 | @staticmethod
60 | def resolve_filenames(expr):
61 | c, _ = Hdfs.client_and_path(expr)
62 |
63 | scheme, domain, folder_path, _ = parse_file_uri(expr)
64 |
65 | files = []
66 | for fn, file_status in c.list(folder_path, status=True):
67 | file_local_path = f'{folder_path}{fn}'
68 | file_path = format_file_uri(scheme, domain, file_local_path)
69 | part_file_expr = expr + ("" if expr.endswith("/") else "/") + 'part*'
70 |
71 | if fnmatch(file_path, expr):
72 | if file_status["type"] != "DIRECTORY":
73 | files.append(file_path)
74 | else:
75 | files += Hdfs._get_folder_part_files(
76 | c,
77 | scheme,
78 | domain,
79 | file_local_path,
80 | part_file_expr
81 | )
82 | elif fnmatch(file_path, part_file_expr):
83 | files.append(file_path)
84 | return files
85 |
86 | @staticmethod
87 | def _get_folder_part_files(c, scheme, domain, folder_local_path, expr_with_part):
88 | files = []
89 | for fn, file_status in c.list(folder_local_path, status=True):
90 | sub_file_path = format_file_uri(scheme, domain, folder_local_path, fn)
91 | if fnmatch(sub_file_path, expr_with_part) and file_status["type"] != "DIRECTORY":
92 | files.append(sub_file_path)
93 | return files
94 |
95 | @classmethod
96 | def _get_folder_files_by_expr(cls, c, scheme, domain, folder_path, expr=None):
97 | """
98 | Using client c, retrieves all files located in the folder `folder_path` that match `expr`
99 |
100 | :param c: An HDFS client
101 | :param scheme: a scheme such as hdfs
102 | :param domain: a DFS web server
103 | :param folder_path: a folder path without patterns
104 | :param expr: a pattern
105 |
106 | :return: list of matching files' absolute paths, prefixed with the scheme and domain
107 | """
108 | file_paths = []
109 | for fn, file_status in c.list(folder_path, status=True):
110 | file_local_path = f'{folder_path}{fn}'
111 | if expr is None or fnmatch(file_local_path, expr):
112 | if file_status["type"] == "DIRECTORY":
113 | file_paths += cls._get_folder_files_by_expr(
114 | c,
115 | scheme,
116 | domain,
117 | file_local_path + "/",
118 | expr=None
119 | )
120 | else:
121 | file_path = format_file_uri(scheme, domain, file_local_path)
122 | file_paths.append(file_path)
123 | elif file_status["type"] == "DIRECTORY":
124 | file_paths += cls._get_folder_files_by_expr(
125 | c, scheme, domain, file_local_path + "/", expr
126 | )
127 | return file_paths
128 |
129 | @classmethod
130 | def resolve_content(cls, expr):
131 | c, _ = cls.client_and_path(expr)
132 |
133 | scheme, domain, folder_path, pattern = parse_file_uri(expr)
134 |
135 | expr = folder_path + pattern
136 |
137 | return cls._get_folder_files_by_expr(c, scheme, domain, folder_path, expr)
138 |
139 | def load(self):
140 | log.debug('Hdfs read for %s.', self.file_name)
141 | c, path = Hdfs.client_and_path(self.file_name)
142 |
143 | with c.read(path) as reader:
144 | r = BytesIO(reader.read())
145 |
146 | return r
147 |
148 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
149 | log.debug('Hdfs text read for %s.', self.file_name)
150 | c, path = Hdfs.client_and_path(self.file_name)
151 |
152 | with c.read(path) as reader:
153 | r = StringIO(reader.read().decode(encoding, encoding_errors))
154 |
155 | return r
156 |
157 | def dump(self, stream):
158 | log.debug('Dump to %s with hdfs write.', self.file_name)
159 | c, path = Hdfs.client_and_path(self.file_name)
160 | c.write(path, stream)
161 | return self
162 |
--------------------------------------------------------------------------------