├── pysparkling
├── sql
│ ├── __init__.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── data
│ │ │ └── fundings
│ │ │ │ └── part-0.csv
│ │ ├── expressions
│ │ │ └── test_mappers.py
│ │ ├── test_session.py
│ │ └── test_write.py
│ ├── expressions
│ │ ├── __init__.py
│ │ ├── aggregate
│ │ │ ├── __init__.py
│ │ │ ├── aggregations.py
│ │ │ ├── covariance_aggregations.py
│ │ │ ├── stat_aggregations.py
│ │ │ └── collectors.py
│ │ ├── userdefined.py
│ │ ├── orders.py
│ │ ├── literals.py
│ │ ├── jsons.py
│ │ ├── csvs.py
│ │ ├── explodes.py
│ │ └── fields.py
│ ├── internal_utils
│ │ ├── __init__.py
│ │ ├── readers
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── textreader.py
│ │ │ ├── csvreader.py
│ │ │ ├── utils.py
│ │ │ └── jsonreader.py
│ │ ├── joins.py
│ │ ├── column.py
│ │ ├── readwrite.py
│ │ └── options.py
│ ├── conf.py
│ ├── utils.py
│ ├── context.py
│ └── schema_utils.py
├── tests
│ ├── __init__.py
│ ├── pyspark
│ │ ├── key_value.txt
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ ├── key_value.txt.bz2
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000.bz2
│ │ └── key_value.txt.gz
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000.gz
│ ├── data.7z
│ ├── data.tar.gz
│ ├── test_sample.py
│ ├── test_broadcast.py
│ ├── test_stat_counter.py
│ ├── test_cache.py
│ ├── test_streaming_queue.py
│ ├── test_streaming_files.py
│ ├── test_streaming_tcp.py
│ ├── test_context.py
│ └── test_resolve_filenames.py
├── __version__.py
├── streaming
│ ├── __init__.py
│ ├── queuestream.py
│ ├── filestream.py
│ └── tcpstream.py
├── fileio
│ ├── __init__.py
│ ├── codec
│ │ ├── bz2.py
│ │ ├── codec.py
│ │ ├── gz.py
│ │ ├── zip.py
│ │ ├── lzma.py
│ │ ├── __init__.py
│ │ ├── sevenz.py
│ │ └── tar.py
│ ├── fs
│ │ ├── __init__.py
│ │ ├── http.py
│ │ ├── file_system.py
│ │ ├── local.py
│ │ ├── s3.py
│ │ ├── gs.py
│ │ └── hdfs.py
│ ├── textfile.py
│ └── file.py
├── exceptions.py
├── partition.py
├── __init__.py
├── task_context.py
├── samplers.py
├── broadcast.py
├── storagelevel.py
└── accumulators.py
├── docs
├── sphinx
│ ├── version_index
│ │ ├── .nojekyll
│ │ ├── CNAME
│ │ ├── favicon.ico
│ │ ├── circle.yml
│ │ └── index.html
│ ├── images
│ │ ├── favicon.ico
│ │ └── logo-w600.png
│ ├── api_rdd.rst
│ ├── index.rst
│ ├── api_streaming.rst
│ ├── api.rst
│ ├── dev.rst
│ ├── api_context.rst
│ ├── api_fileio.rst
│ ├── read_write.rst
│ └── parallel.rst
├── requirements.txt
└── readthedocs.png
├── .gitattributes
├── MANIFEST.in
├── logo
├── favicon.ico
├── banner-w500.png
├── favicon-w16.png
├── favicon-w32.png
├── favicon-w48.png
├── logo-w100.png
├── logo-w600.png
├── banner-w1500.png
├── favicon-w128.png
├── favicon-w256.png
├── favicon.svg
└── create.py
├── scripts
├── tcpperf_messages.csv.pdf
├── tcpperf_messages.csv.png
├── tcpperf_connections.csv.pdf
├── tcpperf_connections.csv.png
├── multiprocessing_performance_plot.pdf
├── multiprocessing_performance_plot.png
├── ipcluster_simple.py
├── readme_example_human_microbiome.py
├── readme_example.py
├── log_streaming.py
├── starcluster_simple.py
├── readme_example_word_count.py
├── readme_example_common_crawl.py
├── tcpperf_connections.csv
├── profile_textfile.py
├── tcpperf_messages.csv
├── benchmark_generators.py
├── benchmark_csv.py
├── multiprocessing_performance_plot.py
├── tcpperf_plot.py
├── pyspark_comparisons.py
├── pyspark_streaming.py
├── tcpperf_client.py
└── tcpperf_server.py
├── .pylintrc
├── .readthedocs.yml
├── .github
├── stale.yml
└── workflows
│ ├── deploy.yml
│ └── tests.yml
├── setup.cfg
├── .gitignore
├── setup.py
├── LICENSE
└── README.rst
/pysparkling/sql/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/sphinx/version_index/.nojekyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/sphinx/version_index/CNAME:
--------------------------------------------------------------------------------
1 | pysparkling.trivial.io
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.bz2/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.gz/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | pysparkling/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include versioneer.py
2 | include pysparkling/_version.py
3 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dateutil
2 | Sphinx
3 | sphinx_rtd_theme
4 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt/part-00000:
--------------------------------------------------------------------------------
1 | ('a', 1)
2 | ('b', 2)
3 |
--------------------------------------------------------------------------------
/logo/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon.ico
--------------------------------------------------------------------------------
/docs/readthedocs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/readthedocs.png
--------------------------------------------------------------------------------
/logo/banner-w500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/banner-w500.png
--------------------------------------------------------------------------------
/logo/favicon-w16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w16.png
--------------------------------------------------------------------------------
/logo/favicon-w32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w32.png
--------------------------------------------------------------------------------
/logo/favicon-w48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w48.png
--------------------------------------------------------------------------------
/logo/logo-w100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/logo-w100.png
--------------------------------------------------------------------------------
/logo/logo-w600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/logo-w600.png
--------------------------------------------------------------------------------
/logo/banner-w1500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/banner-w1500.png
--------------------------------------------------------------------------------
/logo/favicon-w128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w128.png
--------------------------------------------------------------------------------
/logo/favicon-w256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/logo/favicon-w256.png
--------------------------------------------------------------------------------
/pysparkling/tests/data.7z:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/data.7z
--------------------------------------------------------------------------------
/docs/sphinx/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/sphinx/images/favicon.ico
--------------------------------------------------------------------------------
/pysparkling/__version__.py:
--------------------------------------------------------------------------------
1 | from ._version import get_versions
2 |
3 | __version__ = get_versions()['version']
4 |
--------------------------------------------------------------------------------
/pysparkling/tests/data.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/data.tar.gz
--------------------------------------------------------------------------------
/docs/sphinx/images/logo-w600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/sphinx/images/logo-w600.png
--------------------------------------------------------------------------------
/scripts/tcpperf_messages.csv.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_messages.csv.pdf
--------------------------------------------------------------------------------
/scripts/tcpperf_messages.csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_messages.csv.png
--------------------------------------------------------------------------------
/scripts/tcpperf_connections.csv.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_connections.csv.pdf
--------------------------------------------------------------------------------
/scripts/tcpperf_connections.csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/tcpperf_connections.csv.png
--------------------------------------------------------------------------------
/docs/sphinx/version_index/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/docs/sphinx/version_index/favicon.ico
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import InternalReader
2 |
3 | __all__ = [
4 | 'InternalReader'
5 | ]
6 |
--------------------------------------------------------------------------------
/scripts/multiprocessing_performance_plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/multiprocessing_performance_plot.pdf
--------------------------------------------------------------------------------
/scripts/multiprocessing_performance_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/scripts/multiprocessing_performance_plot.png
--------------------------------------------------------------------------------
/docs/sphinx/version_index/circle.yml:
--------------------------------------------------------------------------------
1 | dependencies:
2 | pre:
3 | - sudo pip install html5validator
4 | test:
5 | override:
6 | - html5validator
7 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.gz/part-00000.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/pyspark/key_value.txt.gz/part-00000.gz
--------------------------------------------------------------------------------
/pysparkling/streaming/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .context import StreamingContext
3 | from .dstream import DStream
4 |
5 | __all__ = ['StreamingContext', 'DStream']
6 |
--------------------------------------------------------------------------------
/pysparkling/tests/pyspark/key_value.txt.bz2/part-00000.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenkreiss/pysparkling/HEAD/pysparkling/tests/pyspark/key_value.txt.bz2/part-00000.bz2
--------------------------------------------------------------------------------
/pysparkling/fileio/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .file import File
4 | from .textfile import TextFile
5 |
6 | # flake8: noqa
7 |
8 | __all__ = ['File', 'TextFile']
9 |
--------------------------------------------------------------------------------
/docs/sphinx/api_rdd.rst:
--------------------------------------------------------------------------------
1 | .. _api_rdd:
2 |
3 | RDD
4 | ---
5 |
6 | .. autoclass:: pysparkling.RDD
7 | :members:
8 |
9 | .. autoclass:: pysparkling.StatCounter
10 | :members:
11 |
--------------------------------------------------------------------------------
/scripts/ipcluster_simple.py:
--------------------------------------------------------------------------------
1 | from ipyparallel import Client
2 |
3 | rc = Client(packer='pickle')
4 |
5 | view = rc[:]
6 | results = view.map(lambda x: x ** 30, range(8))
7 | print(results.get())
8 |
--------------------------------------------------------------------------------
/scripts/readme_example_human_microbiome.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | by_subject_rdd = Context().textFile(
4 | 's3n://human-microbiome-project/DEMO/HM16STR/46333/by_subject/*'
5 | )
6 | print(by_subject_rdd.takeSample(True, 1))
7 |
--------------------------------------------------------------------------------
/pysparkling/exceptions.py:
--------------------------------------------------------------------------------
1 |
2 | class ConnectionException(Exception):
3 | pass
4 |
5 |
6 | class ContextIsLockedException(Exception):
7 | pass
8 |
9 |
10 | class FileAlreadyExistsException(Exception):
11 | pass
12 |
13 |
14 | class FileSystemNotSupported(Exception):
15 | pass
16 |
--------------------------------------------------------------------------------
/scripts/readme_example.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | my_rdd = Context().textFile('tests/*.py')
4 |
5 | unfiltered_count = my_rdd.count()
6 | filtered_count = my_rdd.filter(lambda l: l.startswith("import ")).count()
7 | print(f'In tests/*.py: all lines={unfiltered_count}, with import={filtered_count}')
8 |
--------------------------------------------------------------------------------
/scripts/log_streaming.py:
--------------------------------------------------------------------------------
1 | import pysparkling
2 |
3 |
4 | def main():
5 | sc = pysparkling.Context()
6 | ssc = pysparkling.streaming.StreamingContext(sc, 1)
7 | ssc.textFileStream('/var/log/system.log*').pprint()
8 | ssc.start()
9 | ssc.awaitTermination(timeout=3.0)
10 |
11 |
12 | if __name__ == '__main__':
13 | main()
14 |
--------------------------------------------------------------------------------
/docs/sphinx/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. include:: ../../README.rst
3 |
4 |
5 | Contents
6 | ========
7 |
8 | .. toctree::
9 | :maxdepth: 2
10 |
11 | self
12 | read_write
13 | api
14 | dev
15 |
16 |
17 |
18 | .. Indices and tables
19 | .. ==================
20 |
21 | .. * :ref:`genindex`
22 | .. * :ref:`modindex`
23 | .. * :ref:`search`
24 |
--------------------------------------------------------------------------------
/scripts/starcluster_simple.py:
--------------------------------------------------------------------------------
1 | from ipyparallel import Client
2 |
3 | rc = Client('/Users/sven/.starcluster/ipcluster/'
4 | 'SecurityGroup:@sc-smallcluster-us-east-1.json',
5 | sshkey='/Users/sven/.ssh/starclusterkey.rsa', packer='pickle')
6 |
7 | view = rc[:]
8 | results = view.map(lambda x: x ** 30, range(8))
9 | print(results.get())
10 |
--------------------------------------------------------------------------------
/scripts/readme_example_word_count.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | counts = (
4 | Context()
5 | .textFile('README.rst')
6 | .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line))
7 | .flatMap(lambda line: line.split(' '))
8 | .map(lambda word: (word, 1))
9 | .reduceByKey(lambda a, b: a + b)
10 | )
11 | print(counts.collect())
12 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/data/fundings/part-0.csv:
--------------------------------------------------------------------------------
1 | permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round
2 | mycityfaces,MyCityFaces,7,web,Scottsdale,AZ,2008-01-01,50000,USD,seed
3 | flypaper,Flypaper,,web,Phoenix,AZ,2008-02-01,3000000,USD,a
4 | chosenlist-com,ChosenList.com,5,web,Scottsdale,AZ,2008-01-25,233750,USD,angel
5 | digg,Digg,60,web,San Francisco,CA,2006-12-01,8500000,USD,b
6 |
--------------------------------------------------------------------------------
/scripts/readme_example_common_crawl.py:
--------------------------------------------------------------------------------
1 | from pysparkling import Context
2 |
3 | # read all the paths of warc and wat files of the latest Common Crawl
4 | paths_rdd = Context().textFile(
5 | 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
6 | 'warc.paths.*,'
7 | 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
8 | 'wat.paths.gz',
9 | )
10 |
11 | print(paths_rdd.collect())
12 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_sample.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pysparkling
4 |
5 |
6 | def test_trivial_sample():
7 | rdd = pysparkling.Context().parallelize(range(1000), 1000)
8 | sampled = rdd.sample(False, 0.01, 42).collect()
9 | print(sampled)
10 | assert sampled == [97, 164, 294, 695, 807, 864, 911]
11 |
12 |
13 | if __name__ == '__main__':
14 | logging.basicConfig(level=logging.DEBUG)
15 | test_trivial_sample()
16 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [BASIC]
2 |
3 | variable-rgx=[a-z0-9_]{1,30}$
4 | good-names=log
5 |
6 | disable=invalid-name,unused-argument,too-few-public-methods,missing-docstring,logging-format-interpolation,too-many-instance-attributes,duplicate-code,too-many-public-methods,too-many-arguments,protected-access,too-many-lines,missing-timeout,unnecessary-lambda-assignment
7 |
8 | [FORMAT]
9 | max-line-length=119
10 |
11 | [SIMILARITIES]
12 |
13 | ignore-imports=yes
14 |
--------------------------------------------------------------------------------
/docs/sphinx/api_streaming.rst:
--------------------------------------------------------------------------------
1 | .. _api_streaming:
2 |
3 | Streaming
4 | ---------
5 |
6 | .. warning::
 7 | This is a new addition to the API (March 2017) that should be used
 8 | with care.
9 |
10 |
11 | StreamingContext
12 | ^^^^^^^^^^^^^^^^
13 |
14 | .. autoclass:: pysparkling.streaming.StreamingContext
15 | :members:
16 |
17 |
18 | DStream
19 | ^^^^^^^
20 |
21 | .. autoclass:: pysparkling.streaming.DStream
22 | :members:
23 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/bz2.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import io
3 | import logging
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Bz2(Codec):
11 | """Implementation of :class:`.Codec` for bz2 compression."""
12 |
13 | def compress(self, stream):
14 | return io.BytesIO(bz2.compress(b''.join(stream)))
15 |
16 | def decompress(self, stream):
17 | return io.BytesIO(bz2.decompress(stream.read()))
18 |
--------------------------------------------------------------------------------
/pysparkling/partition.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | log = logging.getLogger(__name__)
4 |
5 |
6 | class Partition:
7 | def __init__(self, x, idx=None):
8 | self.index = idx
9 | self._x = list(x)
10 |
11 | def x(self):
12 | return self._x
13 |
14 | def hashCode(self):
15 | return self.index
16 |
17 | def __getstate__(self):
18 | return {
19 | 'index': self.index,
20 | '_x': self.x(),
21 | }
22 |
--------------------------------------------------------------------------------
/docs/sphinx/api.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 |
3 | API
4 | ===
5 |
6 | .. currentmodule:: pysparkling
7 |
 8 | A typical ``pysparkling`` session starts either by parallelizing a `list`
 9 | with :func:`Context.parallelize` or by reading data from a file using
10 | :func:`Context.textFile`. These two methods return :class:`RDD` instances that
11 | can then be processed.
12 |
13 |
14 | .. toctree::
15 | :maxdepth: 2
16 |
17 | api_rdd
18 | api_context
19 | api_streaming
20 | api_fileio
21 |
--------------------------------------------------------------------------------
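To make the entry points described in api.rst above concrete, here is a minimal illustrative session (not part of the repository) that uses only calls appearing elsewhere in this dump:

    from pysparkling import Context

    sc = Context()

    # parallelize an in-memory list into an RDD and process it
    print(sc.parallelize(range(5)).map(lambda x: x * x).collect())  # [0, 1, 4, 9, 16]

    # textFile returns an RDD of lines that can be counted or filtered
    print(sc.textFile('README.rst').count())

--------------------------------------------------------------------------------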
/scripts/tcpperf_connections.csv:
--------------------------------------------------------------------------------
1 | # messages, hello, text, json, bello, struct
2 | 8000, 5505, 5077, 5315, 5128, 5309
3 | 7000, 4641, 4369, 4395, 4846, 4670
4 | 6000, 5238, 4854, 4825, 4639, 5184
5 | 5000, 4329, 4626, 4314, 4270, 4246
6 | 4500, 4064, 4406, 3900, 3980, 4278
7 | 4000, 3681, 3584, 3680, 3710, 3709
8 | 3500, 3378, 3307, 3299, 3404, 3220
9 | 3000, 2888, 2892, 2961, 2890, 2871
10 | 2000, 1978, 1970, 1989, 1972, 1970
11 | 1000, 998, 998, 996, 1001, 998
12 | 100, 100, 100, 100, 101, 100
13 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/sphinx/conf.py
11 |
12 | # Optionally set the version of Python and requirements required to build your docs
13 | python:
14 | version: 3.7
15 | install:
16 | - requirements: docs/requirements.txt
17 |
--------------------------------------------------------------------------------
/scripts/profile_textfile.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 |
3 | from memory_profiler import profile
4 |
5 | import pysparkling
6 |
7 |
8 | @profile
9 | def main():
10 | tempFile = tempfile.NamedTemporaryFile(delete=True) # pylint: disable=consider-using-with
11 | tempFile.close()
12 |
13 | sc = pysparkling.Context()
14 | sc.parallelize(range(1000000)).saveAsTextFile(tempFile.name + '.gz')
15 | rdd = sc.textFile(tempFile.name + '.gz')
16 | rdd.collect()
17 |
18 |
19 | if __name__ == '__main__':
20 | main()
21 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/aggregations.py:
--------------------------------------------------------------------------------
1 | from ..expressions import Expression
2 |
3 |
4 | class Aggregation(Expression):
5 | @property
6 | def is_an_aggregation(self):
7 | return True
8 |
9 | def merge(self, row, schema):
10 | raise NotImplementedError
11 |
12 | def mergeStats(self, other, schema):
13 | raise NotImplementedError
14 |
15 | def eval(self, row, schema):
16 | raise NotImplementedError
17 |
18 | def args(self):
19 | raise NotImplementedError
20 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_broadcast.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pysparkling
4 |
5 |
6 | class BroadcastTest(unittest.TestCase):
7 | def setUp(self) -> None:
8 | self.context = pysparkling.Context()
9 |
10 | def testSimple(self):
11 | b = self.context.broadcast([1, 2, 3, 4, 5])
12 | self.assertEqual(b.value, [1, 2, 3, 4, 5])
13 |
14 | def testAppendFails(self):
15 | b = self.context.broadcast([1, 2, 3, 4, 5])
16 | with self.assertRaises(AttributeError):
17 | b.value += [1] # type: ignore
18 |
--------------------------------------------------------------------------------
/scripts/tcpperf_messages.csv:
--------------------------------------------------------------------------------
1 | # messages, hello, text, json, bello, struct
2 | 100000, 72700, 77500, 77800, 69500, 60000
3 | 90000, 82000, 58600, 58500, 60400, 59000
4 | 80000, 65400, 65900, 56800, 57600, 58300
5 | 70000, 59300, 59900, 56800, 50500, 56500
6 | 60000, 56800, 55100, 55600, 52300, 55400
7 | 50000, 50100, 50300, 50000, 48900, 50000
8 | 45000, 45000, 45300, 45000, 45000, 45100
9 | 40000, 40000, 40100, 40300, 39800, 40000
10 | 30000, 30000, 30000, 30000, 30000, 30000
11 | 20000, 20500, 20000, 20500, 20100, 20300
12 | 10000, 10000, 10000, 10000, 10000, 10000
13 |
--------------------------------------------------------------------------------
/scripts/benchmark_generators.py:
--------------------------------------------------------------------------------
1 | import timeit
2 |
3 |
4 | def with_generator():
5 | return (x for x in range(1000))
6 |
7 |
8 | def with_yield():
9 | for x in range(1000):
10 | yield x
11 |
12 |
13 | if __name__ == '__main__':
14 | print(timeit.timeit(stmt='list(with_generator())',
15 | setup='from __main__ import with_generator',
16 | number=10000))
17 | print(timeit.timeit(stmt='list(with_yield())',
18 | setup='from __main__ import with_yield',
19 | number=10000))
20 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/codec.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | log = logging.getLogger(__name__)
4 |
5 |
6 | class Codec:
7 | """Codec."""
8 | def __init__(self):
9 | pass
10 |
11 | def compress(self, stream):
12 | """Compress.
13 |
14 | :param io.BytesIO stream: Uncompressed input stream.
15 | :rtype: io.BytesIO
16 | """
17 | return stream
18 |
19 | def decompress(self, stream):
20 | """Decompress.
21 |
22 | :param io.BytesIO stream: Compressed input stream.
23 | :rtype: io.BytesIO
24 | """
25 | return stream
26 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/joins.py:
--------------------------------------------------------------------------------
1 | """
2 | The following constants are used to identify join types
3 | """
4 | INNER_JOIN = "inner"
5 | CROSS_JOIN = "cross"
6 | FULL_JOIN = "full"
7 | LEFT_JOIN = "left"
8 | RIGHT_JOIN = "right"
9 | LEFT_SEMI_JOIN = "leftsemi"
10 | LEFT_ANTI_JOIN = "leftanti"
11 |
12 | JOIN_TYPES = dict(
13 | inner=INNER_JOIN,
14 | cross=CROSS_JOIN,
15 | outer=FULL_JOIN,
16 | full=FULL_JOIN,
17 | fullouter=FULL_JOIN,
18 | left=LEFT_JOIN,
19 | leftouter=LEFT_JOIN,
20 | right=RIGHT_JOIN,
21 | rightouter=RIGHT_JOIN,
22 | leftsemi=LEFT_SEMI_JOIN,
23 | leftanti=LEFT_ANTI_JOIN,
24 | )
25 |
--------------------------------------------------------------------------------
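A tiny illustrative check (not part of the repository) of how the alias table above normalizes user-facing join names:

    from pysparkling.sql.internal_utils.joins import FULL_JOIN, JOIN_TYPES

    # 'outer', 'full' and 'fullouter' are all aliases for the same join type
    assert JOIN_TYPES['outer'] == JOIN_TYPES['fullouter'] == FULL_JOIN
    assert JOIN_TYPES['leftsemi'] == 'leftsemi'

--------------------------------------------------------------------------------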
/pysparkling/__init__.py:
--------------------------------------------------------------------------------
1 | """pysparkling module"""
2 | # flake8: noqa
3 |
4 | from . import exceptions, fileio, streaming
5 | from .__version__ import __version__
6 | from .accumulators import Accumulator, AccumulatorParam
7 | from .broadcast import Broadcast
8 | from .cache_manager import CacheManager, TimedCacheManager
9 | from .context import Context
10 | from .rdd import RDD
11 | from .sql.types import Row
12 | from .stat_counter import StatCounter
13 | from .storagelevel import StorageLevel
14 |
15 | __all__ = ['RDD', 'Context', 'Broadcast', 'StatCounter', 'CacheManager', 'Row',
16 | 'TimedCacheManager', 'StorageLevel',
17 | 'exceptions', 'fileio', 'streaming']
18 |
--------------------------------------------------------------------------------
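The names exported above can be exercised with a short illustrative snippet (not part of the repository); the expected values follow from the broadcast and StatCounter tests later in this dump:

    import pysparkling

    sc = pysparkling.Context()

    # Broadcast wraps a read-only value shared with workers
    print(sc.broadcast([1, 2, 3]).value)                    # [1, 2, 3]

    # StatCounter computes summary statistics over a sequence
    print(pysparkling.StatCounter([1, 4, 9, 160]).mean())   # 43.5

--------------------------------------------------------------------------------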
/pysparkling/sql/expressions/userdefined.py:
--------------------------------------------------------------------------------
1 | from .expressions import Expression
2 |
3 |
4 | class UserDefinedFunction(Expression):
5 | def __init__(self, f, return_type, *exprs):
6 | super().__init__()
7 | self.f = f
8 | self.return_type = return_type
9 | self.exprs = exprs
10 |
11 | def eval(self, row, schema):
12 | return self.f(*(expr.eval(row, schema) for expr in self.exprs))
13 |
14 | def __str__(self):
15 | arguments = ', '.join(str(arg) for arg in self.args())
16 | return f"{self.f.__name__}({arguments})"
17 |
18 | def args(self):
19 | return self.exprs
20 |
21 |
22 | __all__ = ["UserDefinedFunction"]
23 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 60
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 | - pinned
8 | - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: stale
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 | This issue has been automatically marked as stale because it has not had
14 | recent activity. It will be closed if no further activity occurs. Thank you
15 | for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/gz.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | from io import BytesIO
3 | import logging
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Gz(Codec):
11 | """Implementation of :class:`.Codec` for gz compression."""
12 |
13 | def compress(self, stream):
14 | compressed = BytesIO()
15 |
16 | with gzip.GzipFile(fileobj=compressed, mode='wb') as f:
17 | f.write(stream.read())
18 |
19 | compressed.seek(0)
20 | return compressed
21 |
22 | def decompress(self, stream):
23 | uncompressed = BytesIO()
24 |
25 | with gzip.GzipFile(fileobj=stream, mode='rb') as f:
26 | uncompressed.write(f.read())
27 |
28 | uncompressed.seek(0)
29 | return uncompressed
30 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/__init__.py:
--------------------------------------------------------------------------------
1 | from .file_system import FileSystem
2 | from .gs import GS
3 | from .hdfs import Hdfs
4 | from .http import Http
5 | from .local import Local
6 | from .s3 import S3
7 |
8 | __all__ = ['FileSystem', 'GS', 'Hdfs', 'Http', 'Local', 'S3']
9 |
10 |
11 | FILE_EXTENSIONS = [
12 | (('file', ''), Local),
13 | (('s3', 's3n'), S3),
14 | (('gs', 'gcs'), GS),
15 | (('http', 'https'), Http),
16 |     (('hdfs',), Hdfs),
17 | ]
18 |
19 |
20 | def get_fs(path):
21 | """Find the file system implementation for this path."""
22 | scheme = ''
23 | if '://' in path:
24 | scheme = path.partition('://')[0]
25 |
26 | for schemes, fs_class in FILE_EXTENSIONS:
27 | if scheme in schemes:
28 | return fs_class
29 |
30 | return FileSystem
31 |
--------------------------------------------------------------------------------
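For illustration (not part of the repository), the scheme dispatch implemented by get_fs above behaves as follows:

    from pysparkling.fileio.fs import S3, Http, Local, get_fs

    # the scheme before '://' selects the FileSystem implementation
    assert get_fs('s3n://bucket/key') is S3
    assert get_fs('https://example.com/data.csv') is Http
    # a plain path has no scheme and falls back to Local
    assert get_fs('/tmp/data.txt') is Local

--------------------------------------------------------------------------------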
/pysparkling/fileio/codec/zip.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 | import zipfile
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Zip(Codec):
11 | """Implementation of :class:`.Codec` for zip compression."""
12 |
13 | def compress(self, stream):
14 | compressed = BytesIO()
15 |
16 | with zipfile.ZipFile(file=compressed, mode='w', allowZip64=True) as f:
17 | f.writestr('data', stream.read())
18 |
19 | compressed.seek(0)
20 | return compressed
21 |
22 | def decompress(self, stream):
23 | uncompressed = BytesIO()
24 |
25 | with zipfile.ZipFile(file=stream, mode='r', allowZip64=True) as f:
26 | for f_name in f.namelist():
27 | uncompressed.write(f.read(f_name))
28 |
29 | uncompressed.seek(0)
30 | return uncompressed
31 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/orders.py:
--------------------------------------------------------------------------------
1 | from .expressions import Expression
2 |
3 |
4 | class SortOrder(Expression):
5 | sort_order = None
6 |
7 | def __init__(self, column):
8 | super().__init__(column)
9 | self.column = column
10 |
11 | def eval(self, row, schema):
12 | return self.column.eval(row, schema)
13 |
14 | def __str__(self):
15 | return f"{self.column} {self.sort_order}"
16 |
17 | def args(self):
18 | return (self.column,)
19 |
20 |
21 | class AscNullsFirst(SortOrder):
22 | sort_order = "ASC NULLS FIRST"
23 |
24 |
25 | class AscNullsLast(SortOrder):
26 | sort_order = "ASC NULLS LAST"
27 |
28 |
29 | class DescNullsFirst(SortOrder):
30 |     sort_order = "DESC NULLS FIRST"
31 |
32 |
33 | class DescNullsLast(SortOrder):
34 | sort_order = "DESC NULLS LAST"
35 |
36 |
37 | Asc = AscNullsFirst
38 | Desc = DescNullsLast
39 |
--------------------------------------------------------------------------------
/pysparkling/sql/conf.py:
--------------------------------------------------------------------------------
1 | _sentinel = object()
2 |
3 |
4 | class RuntimeConfig:
5 | def __init__(self, jconf=None):
6 | self._conf = {}
7 |
8 | def set(self, key, value):
9 | self._conf[key] = value
10 |
11 | def get(self, key, default=_sentinel):
12 | self._checkType(key, "key")
13 | if default is _sentinel:
14 | return self._conf.get(key)
15 | if default is not None:
16 | self._checkType(default, "default")
17 | return self._conf.get(key, default)
18 |
19 | def unset(self, key):
20 | del self._conf[key]
21 |
22 | def _checkType(self, obj, identifier):
23 | if not isinstance(obj, str):
24 | raise TypeError(f"expected {identifier} '{obj}' to be a string (was '{type(obj).__name__}')")
25 |
26 | def isModifiable(self, key):
27 |         raise NotImplementedError("pysparkling does not yet support this feature")
28 |
--------------------------------------------------------------------------------
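An illustrative sketch (not part of the repository) of the RuntimeConfig behaviour defined above, including the sentinel-based default handling:

    from pysparkling.sql.conf import RuntimeConfig

    conf = RuntimeConfig()
    conf.set('spark.sql.shuffle.partitions', '8')
    print(conf.get('spark.sql.shuffle.partitions'))   # '8'
    print(conf.get('missing.key', 'fallback'))        # 'fallback'
    conf.unset('spark.sql.shuffle.partitions')
    print(conf.get('spark.sql.shuffle.partitions'))   # None

--------------------------------------------------------------------------------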
/pysparkling/fileio/codec/lzma.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 | import lzma
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Lzma(Codec):
11 | """Implementation of :class:`.Codec` for lzma compression.
12 |
13 | Needs Python >= 3.3.
14 | """
15 |
16 | def __init__(self):
17 | if lzma is None:
18 | log.warning('LZMA codec not supported. It is only supported '
19 | 'in Python>=3.3. Not compressing streams.')
20 | super().__init__()
21 |
22 | def compress(self, stream):
23 | if lzma is None:
24 | return Codec.compress(self, stream)
25 |
26 | return BytesIO(lzma.compress(stream.read()))
27 |
28 | def decompress(self, stream):
29 | if lzma is None:
30 | return Codec.decompress(self, stream)
31 |
32 | return BytesIO(lzma.decompress(stream.read()))
33 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/expressions/test_mappers.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from pysparkling.utils import MonotonicallyIncreasingIDGenerator
4 |
5 |
6 | class MonotonicallyIncreasingIDGeneratorTests(TestCase):
7 | def test_init_ok(self):
8 | sut = MonotonicallyIncreasingIDGenerator(0)
9 | self.assertEqual(sut.value, -1) # Shouldn't we throw an error here?
10 |
11 | sut = MonotonicallyIncreasingIDGenerator(1)
12 | self.assertEqual(sut.value, 8589934592 - 1) # I do it this way so I can easily find/replace the value
13 |
14 | sut = MonotonicallyIncreasingIDGenerator(2)
15 | self.assertEqual(sut.value, 2 * 8589934592 - 1)
16 |
17 | def test_next_value_ok(self):
18 | sut = MonotonicallyIncreasingIDGenerator(1)
19 | self.assertEqual(next(sut), 8589934592)
20 | self.assertEqual(next(sut), 8589934593)
21 | self.assertEqual(next(sut), 8589934594)
22 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/column.py:
--------------------------------------------------------------------------------
1 | def resolve_column(col, row, schema, allow_generator=True):
2 | """
 3 |     Return the list of column names corresponding to a column value and a schema, and:
 4 |     if allow_generator is False, a list of values corresponding to a row;
 5 |     if allow_generator is True, a list of lists of values, where each inner list corresponds to a row.
6 | """
7 | output_cols = [field.name for field in col.output_fields(schema)]
8 |
9 | output_values = col.eval(row, schema)
10 |
11 | if not allow_generator and col.may_output_multiple_rows:
12 | raise Exception("Generators are not supported when it's nested in expressions,"
13 | f" but got: {col}")
14 |
15 | if not col.may_output_multiple_rows:
16 | output_values = [output_values]
17 | if not col.may_output_multiple_cols:
18 | output_values = [output_values]
19 |
20 | return output_cols, output_values
21 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readwrite.py:
--------------------------------------------------------------------------------
1 | from ..utils import IllegalArgumentException
2 |
3 |
4 | def to_option_stored_value(value):
5 | if value is None:
6 | return None
7 | if isinstance(value, bool):
8 | return str(value).lower()
9 | return str(value)
10 |
11 |
12 | class OptionUtils:
13 | def _set_opts(self, schema=None, **options):
14 | """
15 |         Set named options (filtering out those whose value is None).
16 | """
17 | if schema is not None:
18 | self.schema(schema)
19 | for k, v in options.items():
20 | if v is not None:
21 | self.option(k, v)
22 |
23 | def option(self, key, value):
24 | raise NotImplementedError
25 |
26 | def schema(self, schema):
27 |         # By default, OptionUtils subclasses do not support schemas
28 | raise IllegalArgumentException(
29 | f"schema is not a valid argument for {self.__class__}"
30 | )
31 |
--------------------------------------------------------------------------------
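A small illustrative check (not part of the repository) of how option values are normalized for storage by to_option_stored_value above:

    from pysparkling.sql.internal_utils.readwrite import to_option_stored_value

    assert to_option_stored_value(None) is None       # unset options are dropped
    assert to_option_stored_value(True) == 'true'     # booleans are lower-cased
    assert to_option_stored_value(10) == '10'         # everything else becomes a string

--------------------------------------------------------------------------------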
/docs/sphinx/dev.rst:
--------------------------------------------------------------------------------
1 | .. _dev:
2 |
3 | Development
4 | ===========
5 |
6 | Fork the Github repository and apply your changes in a feature branch.
7 | To run pysparkling's unit tests:
8 |
9 | .. code-block:: sh
10 |
11 | # install
12 | pip install -e .[hdfs,performance,streaming,test]
13 | flake8 --install-hook
14 |
15 | # run linting and test
16 | flake8
17 | pytest -vv
18 |
19 | Don't run ``python setup.py test`` as this will
20 | not execute the doctests. When all tests pass, create a Pull Request on GitHub.
21 | Please also update ``HISTORY.rst`` with a short description of your change.
22 |
23 | To preview the docs locally, install the extra dependencies with
24 | ``pip install -r docs/requirements.txt``, and then cd into ``docs/sphinx``,
25 | run ``make html`` and open ``_build/html/index.html``.
26 |
27 | Please also try not to add derivative work from other projects. If you do,
28 | incorporate proper handling of external licenses in your Pull Request.
29 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/literals.py:
--------------------------------------------------------------------------------
1 | from ..utils import AnalysisException
2 | from .expressions import Expression
3 |
4 |
5 | class Literal(Expression):
6 | def __init__(self, value):
7 | super().__init__()
8 | self.value = value
9 |
10 | def eval(self, row, schema):
11 | return self.value
12 |
13 | def __str__(self):
14 | if self.value is True:
15 | return "true"
16 | if self.value is False:
17 | return "false"
18 | if self.value is None:
19 | return "NULL"
20 | return str(self.value)
21 |
22 | def get_literal_value(self):
23 | if hasattr(self.value, "expr") or isinstance(self.value, Expression):
24 | raise AnalysisException("Value should not be a Column or an Expression,"
25 | f" but got {type(self)}: {self}")
26 | return self.value
27 |
28 | def args(self):
29 | return (self.value, )
30 |
31 |
32 | __all__ = ["Literal"]
33 |
--------------------------------------------------------------------------------
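Literal pairs naturally with the UserDefinedFunction expression shown earlier in this dump; the sketch below (not part of the repository) evaluates a UDF over literal arguments, so no real row or schema is needed:

    from pysparkling.sql.expressions.literals import Literal
    from pysparkling.sql.expressions.userdefined import UserDefinedFunction

    add = UserDefinedFunction(lambda a, b: a + b, None, Literal(2), Literal(3))

    # Literal.eval ignores the row and returns the stored value
    print(add.eval(row=None, schema=None))  # 5
    print(str(add))                         # <lambda>(2, 3)

--------------------------------------------------------------------------------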
/pysparkling/streaming/queuestream.py:
--------------------------------------------------------------------------------
1 | from ..rdd import EmptyRDD, RDD
2 |
3 |
4 | class QueueStreamDeserializer:
5 | def __init__(self, context):
6 | self.context = context
7 |
8 | def ensure_rdd(self, data):
9 | if data is None:
10 | return EmptyRDD(self.context)
11 | if isinstance(data, RDD):
12 | return data
13 | return self.context.parallelize(data)
14 |
15 | def __call__(self, data):
16 | return self.ensure_rdd(data)
17 |
18 |
19 | class QueueStream:
20 | def __init__(self, queue, oneAtATime=True, default=None):
21 | self.queue = queue
22 | self.oneAtATime = oneAtATime
23 | self.default = default
24 |
25 | def get(self):
26 | q_size = self.queue.qsize()
27 |
28 | if q_size == 0:
29 | return self.default
30 |
31 | if self.oneAtATime:
32 | return self.queue.get_nowait()
33 |
34 | return [e for _ in range(q_size) for e in self.queue.get_nowait()]
35 |
--------------------------------------------------------------------------------
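An illustrative sketch (not part of the repository) of the helpers above: the deserializer coerces raw data into RDDs of a context, and QueueStream drains a standard queue:

    import queue

    import pysparkling
    from pysparkling.streaming.queuestream import QueueStream, QueueStreamDeserializer

    sc = pysparkling.Context()
    ensure_rdd = QueueStreamDeserializer(sc)

    # raw Python data is parallelized into an RDD; None becomes an empty RDD
    print(ensure_rdd([1, 2, 3]).collect())  # [1, 2, 3]
    print(ensure_rdd(None).collect())       # []

    q = queue.Queue()
    q.put([4, 5])
    stream = QueueStream(q, oneAtATime=True)
    print(stream.get())  # [4, 5]

--------------------------------------------------------------------------------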
/pysparkling/fileio/codec/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from .bz2 import Bz2
4 | from .codec import Codec
5 | from .gz import Gz
6 | from .lzma import Lzma
7 | from .sevenz import SevenZ
8 | from .tar import Tar, TarBz2, TarGz
9 | from .zip import Zip
10 |
11 | log = logging.getLogger(__name__)
12 |
13 | FILE_ENDINGS = [
14 | (('.tar',), Tar),
15 | (('.tar.gz',), TarGz),
16 | (('.tar.bz2',), TarBz2),
17 | (('.gz',), Gz),
18 | (('.zip',), Zip),
19 | (('.bz2',), Bz2),
20 | (('.lzma', '.xz'), Lzma),
21 | (('.7z',), SevenZ),
22 | ]
23 |
24 |
25 | class NoCodec(Codec):
26 | pass
27 |
28 |
29 | def get_codec(path):
30 | """Find the codec implementation for this path."""
31 | if '.' not in path or path.rfind('/') > path.rfind('.'):
32 | return Codec
33 |
34 | for endings, codec_class in FILE_ENDINGS:
35 | if any(path.endswith(e) for e in endings):
36 | log.debug('Using %s codec: %s', endings, path)
37 | return codec_class
38 |
39 | return NoCodec
40 |
--------------------------------------------------------------------------------
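An illustrative round trip (not part of the repository) through the codec selection above and the Gz codec shown earlier in this dump:

    from io import BytesIO

    from pysparkling.fileio.codec import Gz, get_codec

    # codec selection is driven purely by the file ending
    assert get_codec('data.csv.gz') is Gz

    # compress and decompress a payload with the gz codec
    codec = Gz()
    compressed = codec.compress(BytesIO(b'hello pysparkling'))
    assert codec.decompress(compressed).read() == b'hello pysparkling'

--------------------------------------------------------------------------------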
/docs/sphinx/api_context.rst:
--------------------------------------------------------------------------------
1 | .. _api_context:
2 |
3 | .. currentmodule:: pysparkling
4 |
5 | Context
6 | -------
7 |
8 | A :class:`~pysparkling.Context` describes the setup. Instantiating a Context with the default
9 | arguments using ``Context()`` is the most lightweight setup. All data is just
10 | in the local thread and is never serialized or deserialized.
11 |
12 | If you want to process the data in parallel, you can use the `multiprocessing`
13 | module. Given the limitations of the default `pickle` serializer, you can
14 | serialize all methods with `cloudpickle` instead. For example,
15 | a common instantiation with `multiprocessing` looks like this:
16 |
17 | .. code-block:: python
18 |
19 | sc = pysparkling.Context(
20 | multiprocessing.Pool(4),
21 | serializer=cloudpickle.dumps,
22 | deserializer=pickle.loads,
23 | )
24 |
25 | This assumes that your data is serializable with `pickle`, which is generally
26 | faster. You can also specify a custom serializer/deserializer for data.
27 |
28 | .. autoclass:: pysparkling.Context
29 | :members:
30 |
--------------------------------------------------------------------------------
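Assembled into a complete script, the multiprocessing setup sketched in api_context.rst above looks roughly like this (illustrative only; assumes the optional `cloudpickle` package is installed):

    import multiprocessing
    import pickle

    import cloudpickle  # optional dependency, assumed installed for this sketch

    import pysparkling

    if __name__ == '__main__':
        sc = pysparkling.Context(
            multiprocessing.Pool(4),
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        # the lambda is shipped via cloudpickle, the data via pickle
        print(sc.parallelize(range(10)).map(lambda x: x + 1).collect())

--------------------------------------------------------------------------------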
/pysparkling/task_context.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | log = logging.getLogger(__name__)
4 |
5 |
6 | class TaskContext:
7 | def __init__(self, cache_manager, catch_exceptions,
8 | stage_id=0, partition_id=0, max_retries=3, retry_wait=0):
9 | self.cache_manager = cache_manager
10 | self.catch_exceptions = catch_exceptions
11 | self.stage_id = stage_id
12 | self.partition_id = partition_id
13 | self.max_retries = max_retries
14 | self.retry_wait = retry_wait
15 |
16 | self.attempt_number = 0
17 | self.is_completed = False
18 | self.is_running_locally = True
19 | self.task_completion_listeners = []
20 |
21 | def _create_child(self):
22 | return TaskContext(self.cache_manager, self.catch_exceptions,
23 | stage_id=self.stage_id + 1,
24 | partition_id=self.partition_id)
25 |
26 | def attemptNumber(self):
27 | return self.attempt_number
28 |
29 | def partitionId(self):
30 | return self.partition_id
31 |
32 | def stageId(self):
33 | return self.stage_id
34 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/sevenz.py:
--------------------------------------------------------------------------------
1 | try:
2 | import py7zlib
3 | except ImportError:
4 | py7zlib = None
5 |
6 | from io import BytesIO
7 | import logging
8 |
9 | from .codec import Codec
10 |
11 | log = logging.getLogger(__name__)
12 |
13 |
14 | class SevenZ(Codec):
15 | """Implementation of :class:`.Codec` for 7z compression.
16 |
17 | Needs the `pylzma` module.
18 | """
19 |
20 | def __init__(self):
21 | if py7zlib is None:
22 | log.warning('py7zlib could not be imported. To read 7z files, '
23 | 'install the library with "pip install pylzma".')
24 | super().__init__()
25 |
26 | def compress(self, stream):
27 | log.warning('Writing of 7z compressed archives is not supported.')
28 | return stream
29 |
30 | def decompress(self, stream):
31 | if py7zlib is None:
32 | return Codec.decompress(self, stream)
33 |
34 | uncompressed = BytesIO()
35 |
36 | f = py7zlib.Archive7z(file=stream)
37 | for f_name in f.getnames():
38 | uncompressed.write(f.getmember(f_name).read())
39 |
40 | uncompressed.seek(0)
41 | return uncompressed
42 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/common.py:
--------------------------------------------------------------------------------
1 | from ...internal_utils.readers import csvreader, jsonreader, textreader
2 | from ...internal_utils.readwrite import OptionUtils, to_option_stored_value
3 | from ...types import StructType
4 |
5 |
6 | class InternalReader(OptionUtils):
7 | def schema(self, schema):
8 | if not isinstance(schema, StructType):
9 | raise NotImplementedError("Pysparkling currently only supports StructType for schemas")
10 | self._schema = schema
11 |
12 | def option(self, key, value):
13 | self._options[key.lower()] = to_option_stored_value(value)
14 |
15 | def __init__(self, spark):
16 | """
17 |
18 | :type spark: pysparkling.sql.session.SparkSession
19 | """
20 | self._spark = spark
21 | self._options = {}
22 | self._schema = None
23 |
24 | def csv(self, paths):
25 | return csvreader.CSVReader(self._spark, paths, self._schema, self._options).read()
26 |
27 | def json(self, paths):
28 | return jsonreader.JSONReader(self._spark, paths, self._schema, self._options).read()
29 |
30 | def text(self, paths):
31 | return textreader.TextReader(self._spark, paths, self._schema, self._options).read()
32 |
--------------------------------------------------------------------------------
/pysparkling/sql/utils.py:
--------------------------------------------------------------------------------
1 | class CapturedException(Exception):
2 | pass
3 |
4 |
5 | class AnalysisException(CapturedException):
6 | pass
7 |
8 |
9 | class ParseException(CapturedException):
10 | pass
11 |
12 |
13 | class IllegalArgumentException(CapturedException):
14 | pass
15 |
16 |
17 | def require_minimum_pandas_version():
18 | """ Raise an ImportError if Pandas version is < 0.23.2
19 | """
20 | minimum_pandas_version = (0, 23, 2)
21 |
22 | # pandas is an optional dependency
23 | # pylint: disable=import-outside-toplevel
24 | try:
25 | import pandas
26 | have_pandas = True
27 | except ImportError:
28 | have_pandas = False
29 |
30 | if not have_pandas:
31 | raise ImportError(
32 |             f"Pandas >= {minimum_pandas_version} must be installed; however, it was not found."
33 | )
34 | if parse_pandas_version(pandas.__version__) < minimum_pandas_version:
35 | raise ImportError(
36 | f"Pandas >= {minimum_pandas_version} must be installed;"
37 | f" however, your version was {pandas.__version__}."
38 | )
39 |
40 |
41 | def parse_pandas_version(version):
42 | return tuple(int(part) for part in version.split("."))
43 |
--------------------------------------------------------------------------------
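For reference (not part of the repository), parse_pandas_version above compares versions as integer tuples; note that it would raise ValueError on pre-release strings such as '1.4.0rc1':

    from pysparkling.sql.utils import parse_pandas_version

    assert parse_pandas_version('0.23.2') == (0, 23, 2)
    assert parse_pandas_version('1.1.5') > (0, 23, 2)

--------------------------------------------------------------------------------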
/pysparkling/sql/expressions/jsons.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from ...utils import get_json_encoder
4 | from ..internal_utils.options import Options
5 | from ..internal_utils.readers.jsonreader import JSONReader
6 | from .expressions import Expression
7 |
8 |
9 | class StructsToJson(Expression):
10 | pretty_name = "structstojson"
11 |
12 | default_options = dict(
13 | dateFormat="yyyy-MM-dd",
14 | timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX",
15 | )
16 |
17 | def __init__(self, column, options):
18 | super().__init__(column)
19 | self.column = column
20 | self.input_options = options
21 | self.options = Options(JSONReader.default_options, options)
22 | self.encoder = get_json_encoder(self.options)
23 |
24 | def eval(self, row, schema):
25 | value = self.column.eval(row, schema)
26 | return json.dumps(
27 | value,
28 | cls=self.encoder,
29 | separators=(',', ':')
30 | )
31 |
32 | def args(self):
33 | if self.input_options is None:
34 | return (self.column, )
35 | return (
36 | self.column,
37 | self.input_options
38 | )
39 |
40 |
41 | __all__ = ["StructsToJson"]
42 |
--------------------------------------------------------------------------------
/scripts/benchmark_csv.py:
--------------------------------------------------------------------------------
1 | """Benchmark csv reading performance."""
2 |
3 | import argparse
4 | import random
5 | from string import ascii_uppercase
6 |
7 | import pysparkling
8 |
9 |
10 | def create_csv(filename, lines=10000000, columns=12):
11 | with open(filename, 'w', encoding='utf8') as f:
12 | column_names = ','.join(ascii_uppercase[i] for i in range(columns))
13 | f.write(f'{column_names}\n')
14 |
15 | for _ in range(lines):
16 | values = ','.join(
17 | f'{100 * (c + 1) * random.random():.3f}'
18 | for c in range(columns)
19 | )
20 | f.write(f'{values}\n')
21 |
22 |
23 | def read_csv(filename):
24 | c = pysparkling.Context()
25 | r = c.textFile(filename)
26 | r = r.map(lambda l: l + 'something else')
27 | print(r.count())
28 |
29 |
30 | if __name__ == '__main__':
31 | p = argparse.ArgumentParser(description=__doc__)
32 | p.add_argument('--create', default=False, action='store_true',
33 | help='create csv test file')
34 | p.add_argument('--testfile', default='test.csv',
35 | help='the test file')
36 | args = p.parse_args()
37 |
38 | if args.create:
39 | create_csv(filename=args.testfile)
40 | else:
41 | read_csv(filename=args.testfile)
42 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = W503, E731
3 | exclude = venv*,logo,docs,build
4 | max-line-length = 119
5 |
6 | [tool:pytest]
7 | addopts = --doctest-modules --cov=pysparkling --cov-report=html --cov-branch
8 | testpaths = pysparkling
9 | doctest_optionflags = ALLOW_UNICODE NORMALIZE_WHITESPACE
10 |
11 | [pycodestyle]
12 | max-line-length=119
13 | ignore=E731,E741,W503
14 | exclude=pysparkling/__init__.py
15 |
16 | # See the docstring in versioneer.py for instructions. Note that you must
17 | # re-run 'versioneer.py setup' after changing this section, and commit the
18 | # resulting files.
19 |
20 | [versioneer]
21 | VCS = git
22 | style = pep440
23 | versionfile_source = pysparkling/_version.py
24 | versionfile_build = pysparkling/_version.py
25 | tag_prefix = v
26 | # parentdir_prefix =
27 |
28 | [coverage:run]
29 | branch = True
30 | cover_pylib = False
31 | data_file = reports/.coverage
32 | source = pysparkling
33 | omit = pysparkling/_version.py
34 |
35 | [coverage:report]
36 | show_missing = True
37 | skip_covered = False
38 |
39 | [coverage:html]
40 | directory = reports/coverage
41 |
42 | [isort]
43 | src_paths = pysparkling,scripts
44 | skip_gitignore = True
45 | line_length = 119
46 | order_by_type = False
47 | case_sensitive = False
48 | multi_line_output = 5
49 | force_sort_within_sections = True
50 | skip = versioneer.py
--------------------------------------------------------------------------------
/pysparkling/tests/test_stat_counter.py:
--------------------------------------------------------------------------------
1 | import pysparkling
2 | from pysparkling.sql.functions import col
3 | from pysparkling.sql.types import IntegerType, Row, StructField, StructType
4 | from pysparkling.stat_counter import ColumnStatHelper
5 |
6 |
7 | def test_mean():
8 | d = [1, 4, 9, 160]
9 | s = pysparkling.StatCounter(d)
10 | assert sum(d) / len(d) == s.mean()
11 |
12 |
13 | def test_column_stat_helper():
14 | """
15 | Expected quantile values come from use of org.apache.spark.sql.catalyst.util.QuantileSummaries
16 | """
17 | schema = StructType([StructField("value", IntegerType())])
18 | helper = ColumnStatHelper(col("value"))
19 | for i in range(1, 100001):
20 | helper.merge(Row(value=i), schema)
21 | helper.finalize()
22 | assert helper.count == 100000
23 | assert helper.min == 1
24 | assert helper.max == 100000
25 | assert helper.mean == 50000.5
26 | assert helper.stddev == 28867.65779668774 # sample standard deviation
27 | assert helper.get_quantile(0) == 1
28 | assert helper.get_quantile(0.25) == 24998
29 | assert helper.get_quantile(0.5) == 50000
30 | assert helper.get_quantile(0.75) == 74993
31 | assert helper.get_quantile(1) == 100000
32 |
33 |
34 | if __name__ == '__main__':
35 | test_mean()
36 | test_column_stat_helper()
37 |
--------------------------------------------------------------------------------
/pysparkling/sql/context.py:
--------------------------------------------------------------------------------
1 | from .session import SparkSession
2 |
3 |
4 | class SQLContext:
5 | _instantiatedContext = None
6 |
7 | def __init__(self, sparkContext, sparkSession=None, jsqlContext=None):
8 | self._sc = sparkContext
9 | if sparkSession is None:
10 | sparkSession = SparkSession.builder.getOrCreate()
11 | self.sparkSession = sparkSession
12 | if SQLContext._instantiatedContext is None:
13 | SQLContext._instantiatedContext = self
14 |
15 | @classmethod
16 | def getOrCreate(cls, sc):
17 | """
18 | Get the existing SQLContext or create a new one with given SparkContext.
19 |
20 | :param sc: SparkContext
21 | """
22 | if cls._instantiatedContext is None:
23 | cls(sc, SparkSession(sc), None)
24 | return cls._instantiatedContext
25 |
26 | def newSession(self):
27 | """
28 |         Returns a new SQLContext as a new session, with separate SQLConf,
29 | registered temporary views and UDFs, but shared SparkContext and
30 | table cache.
31 | """
32 | return self.__class__(self._sc, self.sparkSession.newSession())
33 |
34 | def setConf(self, key, value):
35 | """Sets the given Spark SQL configuration property.
36 | """
37 | self.sparkSession.conf.set(key, value)
38 |
--------------------------------------------------------------------------------
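A minimal usage sketch for the SQLContext above, assuming the optional sql dependencies from setup.py are installed; the configuration key is only an illustration:

    import pysparkling
    from pysparkling.sql.context import SQLContext

    sc = pysparkling.Context()
    sqlc = SQLContext.getOrCreate(sc)        # builds a SparkSession-backed context on first use
    sqlc.setConf('some.conf.key', 'value')   # delegated to sparkSession.conf.set
    other = sqlc.newSession()                # separate SQLConf, same SparkContext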
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | test.*
3 | profile.out
4 | .vscode
5 | scripts/textout
6 | tests/textout
7 | checkpoints/
8 |
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | .pytest_cache/
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Vim
18 | *.sw[po]
19 |
20 | # Distribution / packaging
21 | .Python
22 | env/
23 | .env/
24 | venv*/
25 | pypy/
26 | pypy3/
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | *.egg-info/
39 | .installed.cfg
40 | *.egg
41 |
42 | # PyInstaller
43 | # Usually these files are written by a python script from a template
44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest
46 | *.spec
47 |
48 | # Installer logs
49 | pip-log.txt
50 | pip-delete-this-directory.txt
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *,cover
61 |
62 | # Translations
63 | *.mo
64 | *.pot
65 |
66 | # Django stuff:
67 | *.log
68 |
69 | # Sphinx documentation
70 | docs/sphinx/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Spark data files
76 | *.crc
77 |
78 | # IPython
79 | *.ipynb.syncdoc
80 | .ipynb_checkpoints
81 | .ipython-daemon.json
82 |
83 | /.idea/
84 | /reports/
85 | /pysparkling/tests/20news-19997.tar.gz
86 |
87 | /scripts_private/
88 |
--------------------------------------------------------------------------------
/pysparkling/streaming/filestream.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from ..fileio import File
4 | from ..rdd import EmptyRDD
5 |
6 | log = logging.getLogger(__name__)
7 |
8 |
9 | class FileTextStreamDeserializer:
10 | def __init__(self, context):
11 | self.context = context
12 |
13 | def __call__(self, path):
14 | if path is None:
15 | return EmptyRDD(self.context)
16 |
17 | return self.context.textFile(path)
18 |
19 |
20 | class FileBinaryStreamDeserializer:
21 | def __init__(self, context, recordLength=None):
22 | self.context = context
23 | self.record_length = recordLength
24 |
25 | def __call__(self, path):
26 | if path is None:
27 | return EmptyRDD(self.context)
28 |
29 | return self.context.binaryRecords(
30 | path, recordLength=self.record_length)
31 |
32 |
33 | class FileStream:
34 | def __init__(self, path, process_all=False):
35 | self.path = path
36 | self.files_done = set()
37 | if not process_all:
38 | self.files_done = set(File.resolve_filenames(self.path))
39 |
40 | def get(self):
41 | files = [fn for fn in File.resolve_filenames(self.path)
42 | if fn not in self.files_done]
43 | if not files:
44 | return None
45 |
46 | self.files_done |= set(files)
47 | return ','.join(files)
48 |
49 | def stop(self):
50 | pass
51 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/csvs.py:
--------------------------------------------------------------------------------
1 | from ..casts import NO_TIMESTAMP_CONVERSION
2 | from ..internal_utils.options import Options
3 | from ..internal_utils.readers.csvreader import csv_record_to_row, CSVReader
4 | from ..internal_utils.readers.utils import guess_schema_from_strings
5 | from ..utils import AnalysisException
6 | from .expressions import Expression
7 |
8 | sql_csv_function_options = dict(
9 | dateFormat=NO_TIMESTAMP_CONVERSION,
10 | timestampFormat=NO_TIMESTAMP_CONVERSION,
11 | )
12 |
13 |
14 | class SchemaOfCsv(Expression):
15 | pretty_name = "schema_of_csv"
16 |
17 | def __init__(self, column, options):
18 | super().__init__(column)
19 | self.column = column
20 | self.input_options = options
21 | self.options = Options(CSVReader.default_options, sql_csv_function_options, options)
22 |
23 | def eval(self, row, schema):
24 | value = self.column.eval(row, schema)
25 | if not isinstance(value, str) or value == "":
26 | raise AnalysisException(
27 | "type mismatch: The input csv should be a string literal and not null; "
28 | f"however, got {value}."
29 | )
30 | record_as_row = csv_record_to_row(value, self.options)
31 | schema = guess_schema_from_strings(record_as_row.__fields__, [record_as_row], self.options)
32 | return schema.simpleString()
33 |
34 | def args(self):
35 | return (self.column,)
36 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/covariance_aggregations.py:
--------------------------------------------------------------------------------
1 | from ....stat_counter import CovarianceCounter
2 | from .aggregations import Aggregation
3 |
4 |
5 | class CovarianceStatAggregation(Aggregation):
6 | def __init__(self, column1, column2):
7 | super().__init__(column1, column2)
8 | self.column1 = column1
9 | self.column2 = column2
10 | self.stat_helper = CovarianceCounter(method="pearson")
11 |
12 | def merge(self, row, schema):
13 | self.stat_helper.add(row.eval(self.column1, schema), row.eval(self.column2, schema))
14 |
15 | def mergeStats(self, other, schema):
16 | self.stat_helper.merge(other)
17 |
18 | def eval(self, row, schema):
19 | raise NotImplementedError
20 |
21 | def args(self):
22 | return (
23 | self.column1,
24 | self.column2
25 | )
26 |
27 |
28 | class Corr(CovarianceStatAggregation):
29 | pretty_name = "corr"
30 |
31 | def eval(self, row, schema):
32 | return self.stat_helper.pearson_correlation
33 |
34 |
35 | class CovarSamp(CovarianceStatAggregation):
36 | pretty_name = "covar_samp"
37 |
38 | def eval(self, row, schema):
39 | return self.stat_helper.covar_samp
40 |
41 |
42 | class CovarPop(CovarianceStatAggregation):
43 | pretty_name = "covar_pop"
44 |
45 | def eval(self, row, schema):
46 | return self.stat_helper.covar_pop
47 |
48 |
49 | __all__ = ["Corr", "CovarSamp", "CovarPop"]
50 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_cache.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | import pysparkling
5 |
6 |
7 | class Manip:
8 | def __init__(self):
9 | self.count = 0
10 |
11 | def trivial_manip_with_debug(self, e):
12 | self.count += 1
13 | print(f'manipulating {e}')
14 | return e
15 |
16 |
17 | def test_cache_empty_partition():
18 | m = Manip()
19 |
20 | c = pysparkling.Context()
21 | rdd = c.parallelize(range(10), 2)
22 | rdd = rdd.map(m.trivial_manip_with_debug)
23 | rdd = rdd.filter(lambda e: e > 6).cache()
24 | print(rdd.collect())
25 | print(rdd.collect())
26 |
27 | print(f'count of map executions: {m.count}')
28 | assert m.count == 10
29 |
30 |
31 | def test_timed_cache():
32 | m = Manip()
33 |
34 | # create a timed cache manager
35 | cm = pysparkling.TimedCacheManager(timeout=1.0)
36 |
37 | # create a cache entry
38 | c = pysparkling.Context(cache_manager=cm)
39 | rdd = c.parallelize(range(10), 2)
40 | rdd = rdd.map(m.trivial_manip_with_debug).cache()
41 | print(rdd.collect())
42 | # make sure the cache is working
43 | count_before = m.count
44 | print(rdd.collect())
45 | count_after = m.count
46 | assert count_before == count_after
47 |
48 | # wait to have the cache expire
49 | time.sleep(1.5)
50 | cm.gc()
51 | print(rdd.collect())
52 | assert m.count > count_after
53 |
54 |
55 | if __name__ == '__main__':
56 | logging.basicConfig(level=logging.DEBUG)
57 | # test_cache_empty_partition()
58 | test_timed_cache()
59 |
--------------------------------------------------------------------------------
/pysparkling/samplers.py:
--------------------------------------------------------------------------------
1 | import math
2 | import random
3 |
4 | try:
5 | import numpy
6 | except ImportError:
7 | numpy = None
8 |
9 |
10 | def pysparkling_poisson(lambda_):
11 | if lambda_ == 0.0:
12 | return 0
13 |
14 | n = 0
15 | exp_neg_lambda = math.exp(-lambda_)
16 | prod = 1.0
17 | while True:
18 | prod *= random.random()
19 | if prod > exp_neg_lambda:
20 | n += 1
21 | else:
22 | return n
23 |
24 |
25 | def poisson(lambda_):
26 | if numpy is not None:
27 | return numpy.random.poisson(lambda_)
28 | return pysparkling_poisson(lambda_)
29 |
30 |
31 | class BernoulliSampler:
32 | def __init__(self, expectation):
33 | self.expectation = expectation
34 |
35 | def __call__(self, sample):
36 | return 1 if random.random() < self.expectation else 0
37 |
38 |
39 | class PoissonSampler:
40 | def __init__(self, expectation):
41 | self.expectation = expectation
42 |
43 | def __call__(self, sample):
44 | return poisson(self.expectation)
45 |
46 |
47 | class BernoulliSamplerPerKey:
48 | def __init__(self, expectations):
49 | self.expectations = expectations
50 |
51 | def __call__(self, sample):
52 | key = sample[0]
53 | return 1 if random.random() < self.expectations.get(key, 0.0) else 0
54 |
55 |
56 | class PoissonSamplerPerKey:
57 | def __init__(self, expectations):
58 | self.expectations = expectations
59 |
60 | def __call__(self, sample):
61 | key = sample[0]
62 | return poisson(self.expectations.get(key, 0.0))
63 |
--------------------------------------------------------------------------------
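A small illustration of the samplers above: each call returns how many copies of the given element to emit (0 or 1 for the Bernoulli variants, a Poisson draw otherwise). The seed and the expectations are arbitrary:

    import random

    from pysparkling.samplers import (BernoulliSampler, BernoulliSamplerPerKey,
                                      PoissonSampler)

    random.seed(0)
    keep = BernoulliSampler(expectation=0.3)           # roughly 30% of elements kept
    copies = PoissonSampler(expectation=2.0)           # Poisson-distributed copy count
    per_key = BernoulliSamplerPerKey({'a': 1.0, 'b': 0.0})

    print([keep(x) for x in range(10)])                # e.g. [1, 0, 0, 1, 0, ...]
    print([copies(x) for x in range(5)])               # e.g. [2, 1, 3, 2, 0]
    print(per_key(('a', 42)), per_key(('b', 42)))      # 1 0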
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Build and upload
2 |
3 | # Build on every branch push, tag push, and pull request change:
4 | # on: [push, pull_request]
5 | # Alternatively, to publish when a (published) GitHub Release is created, use the following:
6 | on:
7 | push:
8 | branches:
9 | - master
10 | pull_request:
11 | branches:
12 | - master
13 | release:
14 | types:
15 | - published
16 |
17 | jobs:
18 | build_sdist:
19 | name: Build Python source distribution
20 | runs-on: ubuntu-latest
21 | steps:
22 | - uses: actions/checkout@v3
23 | with:
24 | fetch-depth: 0
25 |
26 | - uses: actions/setup-python@v4
27 | name: Install Python
28 | with:
29 | python-version: '3.7'
30 |
31 | - name: Build sdist
32 | run: python setup.py sdist
33 |
34 | - uses: actions/upload-artifact@v3
35 | with:
36 | path: dist/*.tar.gz
37 |
38 | upload_pypi:
39 | needs: [build_sdist]
40 | runs-on: ubuntu-latest
41 | # upload to PyPI on every tag starting with 'v'
42 | # if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
43 | # alternatively, to publish when a GitHub Release is created, use the following rule:
44 | if: github.event_name == 'release' && github.event.action == 'published'
45 | steps:
46 | - uses: actions/download-artifact@v3
47 | with:
48 | name: artifact
49 | path: dist
50 |
51 | - uses: pypa/gh-action-pypi-publish@master
52 | with:
53 | user: __token__
54 | password: ${{ secrets.pypi_token }}
55 | # To test: repository_url: https://test.pypi.org/legacy/
56 |
--------------------------------------------------------------------------------
/docs/sphinx/version_index/index.html:
--------------------------------------------------------------------------------
(HTML markup and inline scripts were stripped during extraction; the only recoverable visible text is "Databench Docs" and "pysparkling Docs".)
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ${{ matrix.os }}
9 | strategy:
10 | matrix:
11 | os: [ ubuntu-latest, macos-latest, windows-latest ]
12 | python: [ 3.7, 3.8, 3.9, "3.10", "3.11" ]
13 |
14 | steps:
15 | - uses: actions/checkout@v3
16 | with:
17 | fetch-depth: 0
18 |
19 | - name: Set up Python ${{ matrix.python }}
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: ${{ matrix.python }}
23 |
24 | - name: Install
25 | run: |
26 | python -m pip install --upgrade pip setuptools
27 | python -m pip install -e ".[tests,scripts]"
28 |
29 | - name: Print environment
30 | run: |
31 | python -m pip freeze
32 | python --version
33 | python -c "import pysparkling; print(pysparkling.__version__)"
34 |
35 | - name: Check if import order is fine
36 | run: |
37 | isort . --check --diff
38 |
39 | - name: Test pysparkling/rdd.py
40 | run: python -m pytest pysparkling/rdd.py -vv
41 |
42 | - name: Test pysparkling/tests
43 | if: matrix.os == 'ubuntu-latest' # because of timing sensitivity in stream tests
44 | run: python -m pytest pysparkling/tests -vv
45 |
46 | - name: Install SQL Dependencies
47 | run: |
48 | python -m pip install -e ".[sql]"
49 |
50 | - name: Lint
51 | if: matrix.python != '3.9'
52 | run: pylint pysparkling scripts --disable=fixme
53 |
54 | - name: pycodestyle
55 | run: python -m pycodestyle pysparkling scripts
56 |
57 | - name: Test All
58 | if: matrix.os == 'ubuntu-latest' # because of timing sensitivity in stream tests
59 | run: python -m pytest -vv
60 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_streaming_queue.py:
--------------------------------------------------------------------------------
1 | import tornado.testing
2 |
3 | import pysparkling
4 |
5 |
6 | class TestCount(tornado.testing.AsyncTestCase):
7 |
8 | def test_count(self):
9 | sc = pysparkling.Context()
10 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
11 |
12 | result = []
13 | (
14 | ssc.queueStream([range(20), ['a', 'b'], ['c']])
15 | .count()
16 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
17 | )
18 |
19 | ssc.start()
20 | ssc.awaitTermination(timeout=0.35)
21 | self.assertEqual(sum(result), 23)
22 |
23 | def test_groupByKey(self):
24 | sc = pysparkling.Context()
25 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
26 |
27 | result = []
28 | (
29 | ssc.queueStream([[('a', 5), ('b', 8), ('a', 2)],
30 | [('a', 2), ('b', 3)]])
31 | .groupByKey().mapPartitions(sorted).mapValues(sorted)
32 | .foreachRDD(lambda rdd: result.append(rdd.collect()))
33 | )
34 |
35 | ssc.start()
36 | ssc.awaitTermination(timeout=0.25)
37 | self.assertEqual(
38 | result, [[('a', [2, 5]), ('b', [8])], [('a', [2]), ('b', [3])]])
39 |
40 | def test_mapValues(self):
41 | sc = pysparkling.Context()
42 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
43 |
44 | result = []
45 | (
46 | ssc.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]])
47 | .mapValues(sorted)
48 | .foreachRDD(lambda rdd: result.append(rdd.collect()))
49 | )
50 |
51 | ssc.start()
52 | ssc.awaitTermination(timeout=0.15)
53 | self.assertEqual(result, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]])
54 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/http.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO, StringIO
2 | import logging
3 |
4 | from ...exceptions import ConnectionException, FileSystemNotSupported
5 | from .file_system import FileSystem
6 |
7 | log = logging.getLogger(__name__)
8 |
9 | try:
10 | import requests
11 | except ImportError:
12 | requests = None
13 |
14 |
15 | class Http(FileSystem):
16 | """:class:`.FileSystem` implementation for HTTP."""
17 |
18 | def __init__(self, file_name):
19 | if requests is None:
20 | raise FileSystemNotSupported(
21 | 'http not supported. Install "requests".'
22 | )
23 |
24 | super().__init__(file_name)
25 | self.headers = None
26 |
27 | @staticmethod
28 | def resolve_filenames(expr):
29 | if Http(expr).exists():
30 | return [expr]
31 | return []
32 |
33 | def exists(self):
34 | r = requests.head(self.file_name, allow_redirects=True)
35 | return r.status_code == 200
36 |
37 | def load(self):
38 | log.debug('Http GET request for %s.', self.file_name)
39 | r = requests.get(self.file_name, headers=self.headers)
40 | if r.status_code != 200:
41 | raise ConnectionException()
42 | return BytesIO(r.content)
43 |
44 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
45 | # warning: encoding and encoding_errors are ignored
46 | log.debug('Http GET request for %s.', self.file_name)
47 | r = requests.get(self.file_name, headers=self.headers)
48 | if r.status_code != 200:
49 | raise ConnectionException()
50 | return StringIO(r.text)
51 |
52 | def dump(self, stream):
53 | log.debug('Dump to %s with http PUT.', self.file_name)
54 | requests.put(self.file_name, data=b''.join(stream))
55 | return self
56 |
--------------------------------------------------------------------------------
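A hedged sketch of using the Http file system above directly; it needs the optional requests dependency, and the URL is only a placeholder:

    from pysparkling.fileio.fs import Http

    f = Http('https://example.com/data.txt')   # placeholder URL
    if f.exists():                             # HEAD request, true on a 200 response
        stream = f.load()                      # GET request, returns an io.BytesIO
        print(stream.read()[:80])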
/docs/sphinx/api_fileio.rst:
--------------------------------------------------------------------------------
1 | .. _api_fileio:
2 |
3 |
4 | fileio
5 | ------
6 |
7 | .. currentmodule:: pysparkling
8 |
9 | The functionality provided by this module is used in :func:`Context.textFile`
10 | for reading and in :func:`RDD.saveAsTextFile` for writing.
11 |
12 | .. currentmodule:: pysparkling.fileio
13 |
14 | You can use this submodule with :func:`File.dump`, :func:`File.load` and
15 | :func:`File.exists` to read, write and check for the existence of a file.
16 | All methods transparently handle various schemes (for example ``http://``,
17 | ``s3://`` and ``file://``) and compression/decompression of ``.gz`` and
18 | ``.bz2`` files (among others).
19 |
20 |
21 | .. autoclass:: pysparkling.fileio.File
22 | :members:
23 |
24 | .. autoclass:: pysparkling.fileio.TextFile
25 | :members:
26 |
27 |
28 | File System
29 | ^^^^^^^^^^^
30 |
31 | .. autoclass:: pysparkling.fileio.fs.FileSystem
32 | :members:
33 |
34 | .. autoclass:: pysparkling.fileio.fs.Local
35 | :members:
36 |
37 | .. autoclass:: pysparkling.fileio.fs.GS
38 | :members:
39 |
40 | .. autoclass:: pysparkling.fileio.fs.Hdfs
41 | :members:
42 |
43 | .. autoclass:: pysparkling.fileio.fs.Http
44 | :members:
45 |
46 | .. autoclass:: pysparkling.fileio.fs.S3
47 | :members:
48 |
49 |
50 | Codec
51 | ^^^^^
52 |
53 | .. autoclass:: pysparkling.fileio.codec.Codec
54 | :members:
55 |
56 | .. autoclass:: pysparkling.fileio.codec.Bz2
57 | :members:
58 |
59 | .. autoclass:: pysparkling.fileio.codec.Gz
60 | :members:
61 |
62 | .. autoclass:: pysparkling.fileio.codec.Lzma
63 | :members:
64 |
65 | .. autoclass:: pysparkling.fileio.codec.SevenZ
66 | :members:
67 |
68 | .. autoclass:: pysparkling.fileio.codec.Tar
69 | :members:
70 |
71 | .. autoclass:: pysparkling.fileio.codec.TarGz
72 | :members:
73 |
74 | .. autoclass:: pysparkling.fileio.codec.TarBz2
75 | :members:
76 |
77 | .. autoclass:: pysparkling.fileio.codec.Zip
78 | :members:
79 |
--------------------------------------------------------------------------------
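To make the page above concrete, a minimal sketch of the File interface it documents (the path is a placeholder; the s3:// and http:// forms need the corresponding optional dependencies):

    from pysparkling.fileio import File

    f = File('data/input.csv.gz')      # placeholder path; codec chosen from the extension
    if f.exists():
        stream = f.load()              # io.BytesIO with the decompressed content
        print(len(stream.read()))

    # the same calls work across schemes, e.g. File('s3://bucket/key') or File('http://host/file')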
/pysparkling/fileio/textfile.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO, StringIO, TextIOWrapper
2 | import logging
3 |
4 | from . import codec
5 | from .file import File
6 | from .fs.file_system import FileSystem
7 |
8 | log = logging.getLogger(__name__)
9 |
10 |
11 | class TextFile(File):
12 | """Derived from :class:`File`.
13 |
14 | :param file_name: Any text file name.
15 | """
16 |
17 | def load(self, encoding='utf8', encoding_errors='ignore'): # pylint: disable=arguments-differ
18 | """Load the data from a file.
19 |
20 | :param str encoding: The character encoding of the file.
21 | :param str encoding_errors: How to handle encoding errors.
22 | :rtype: io.StringIO
23 | """
24 | # pylint: disable=comparison-with-callable
25 | if isinstance(self.codec, codec.NoCodec) and \
26 | self.fs.load_text != FileSystem.load_text:
27 | stream = self.fs.load_text(encoding, encoding_errors)
28 | else:
29 | stream = self.fs.load()
30 | stream = self.codec.decompress(stream)
31 | stream = TextIOWrapper(stream, encoding, encoding_errors)
32 | return stream
33 |
34 | def dump(self, stream=None, encoding='utf8', encoding_errors='ignore'): # pylint: disable=arguments-differ
35 | """Writes a stream to a file.
36 |
37 | :param stream:
38 | An ``io.StringIO`` instance. A ``str`` is also possible and
39 |             gets converted to ``io.StringIO``.
40 |
41 | :param encoding: (optional)
42 | The character encoding of the file.
43 |
44 | :rtype: TextFile
45 | """
46 | if stream is None:
47 | stream = StringIO()
48 |
49 | if isinstance(stream, str):
50 | stream = StringIO(stream)
51 |
52 | stream = self.codec.compress(
53 | BytesIO(stream.read().encode(encoding, encoding_errors))
54 | )
55 | self.fs.dump(stream)
56 |
57 | return self
58 |
--------------------------------------------------------------------------------
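A small round trip through TextFile above; the .gz suffix exercises the codec branch in load() and dump(), and the path is only a placeholder:

    from pysparkling.fileio import TextFile

    t = TextFile('/tmp/example.txt.gz')   # placeholder path; gzip codec chosen by suffix
    t.dump('hello\nworld\n')              # a str is wrapped into a StringIO and compressed
    print(t.load().read())                # 'hello\nworld\n'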
/pysparkling/sql/expressions/explodes.py:
--------------------------------------------------------------------------------
1 | from ..types import DataType, IntegerType, StructField
2 | from .expressions import UnaryExpression
3 |
4 |
5 | class Explode(UnaryExpression):
6 | def __init__(self, column):
7 | super().__init__(column)
8 | self.column = column
9 |
10 | @property
11 | def may_output_multiple_rows(self):
12 | return True
13 |
14 | def eval(self, row, schema):
15 | values = self.column.eval(row, schema)
16 | if not values:
17 | return []
18 | return [[value] for value in values]
19 |
20 | def __str__(self):
21 | return "col"
22 |
23 |
24 | class ExplodeOuter(Explode):
25 | def eval(self, row, schema):
26 | values = self.column.eval(row, schema)
27 | if not values:
28 | return [[None]]
29 | return [[value] for value in values]
30 |
31 | def __str__(self):
32 | return "col"
33 |
34 |
35 | class PosExplode(UnaryExpression):
36 | def eval(self, row, schema):
37 | values = self.column.eval(row, schema)
38 | if not values:
39 | return []
40 | return list(enumerate(values))
41 |
42 | def __str__(self):
43 | return "posexplode"
44 |
45 | @property
46 | def may_output_multiple_rows(self):
47 | return True
48 |
49 | @property
50 | def may_output_multiple_cols(self):
51 | return True
52 |
53 | def output_fields(self, schema):
54 | return [
55 | StructField("pos", IntegerType(), False),
56 | StructField("col", DataType(), False)
57 | ]
58 |
59 |
60 | class PosExplodeOuter(PosExplode):
61 | def eval(self, row, schema):
62 | values = self.column.eval(row, schema)
63 | if not values:
64 | return [[None, None]]
65 | return list(enumerate(values))
66 |
67 | def __str__(self):
68 | return "posexplode_outer"
69 |
70 |
71 | __all__ = ["PosExplodeOuter", "PosExplode", "ExplodeOuter", "Explode"]
72 |
--------------------------------------------------------------------------------
/pysparkling/broadcast.py:
--------------------------------------------------------------------------------
1 | # A large part of this module is extracted from its PySpark counterpart at
2 | # https://spark.apache.org/docs/1.5.0/api/python/_modules/pyspark/broadcast.html
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | __all__ = ['Broadcast']
21 |
22 |
23 | class Broadcast:
24 | """
25 | A broadcast variable created with ``b = sc.broadcast(0)``.
26 | Access its value through ``b.value``.
27 |
28 | Examples:
29 |
30 | >>> from pysparkling import Context
31 | >>> sc = Context()
32 | >>> b = sc.broadcast([1, 2, 3, 4, 5])
33 | >>> b.value
34 | [1, 2, 3, 4, 5]
35 | >>> sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
36 | [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
37 | """
38 | def __init__(self, sc=None, value=None):
39 | self._value = value
40 |
41 | @property
42 | def value(self):
43 |         """Returns the broadcast value."""
44 | return self._value
45 |
46 |
47 | if __name__ == "__main__":
48 | #
49 | # Execute doctests with
50 | #
51 |     #     $ python -m pysparkling.broadcast -v
52 | #
53 | import doctest
54 | import sys
55 |
56 | failure_count, _ = doctest.testmod()
57 | if failure_count:
58 | sys.exit(-1)
59 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/options.py:
--------------------------------------------------------------------------------
1 | class Options(dict):
2 | """
3 |     A case-insensitive dict, which can be initialized from multiple dicts
4 |     and whose values can be accessed through attribute syntax.
5 |
6 |     It also stores "false" and "true" strings as booleans.
7 |
8 | e.g.:
9 |
10 | >>> default_options = dict(sep=",", samplingRatio=None)
11 | >>> requested_options = dict(Sep="|")
12 | >>> o=Options({"format": "json", "lineSep": ","}, Format="csv")
13 | >>> o.format, o.linesep
14 | ('csv', ',')
15 | >>> o.UndefinedSetting
16 | Traceback (most recent call last):
17 | ...
18 | KeyError: 'undefinedsetting'
19 | """
20 |
21 | def __init__(self, *args, **kwargs):
22 | d = {
23 | key.lower(): value
24 | for arg in args
25 | if arg is not None
26 | for key, value in arg.items()
27 | }
28 | d.update({
29 | key.lower(): value
30 | for key, value in kwargs.items()
31 | })
32 | super().__init__(d)
33 |
34 | def setdefault(self, k, default=None):
35 | return super().setdefault(k.lower(), default)
36 |
37 | @staticmethod
38 | def fromkeys(seq, value=None):
39 | return Options({k.lower(): value for k in seq})
40 |
41 | def __getitem__(self, k):
42 | return super().__getitem__(k.lower())
43 |
44 | def __setitem__(self, k, v):
45 | if isinstance(v, str) and v.lower() in ("true", "false"):
46 | v = (v.lower() == "true")
47 | super().__setitem__(k.lower(), v)
48 |
49 | def __delitem__(self, k):
50 | super().__delitem__(k.lower())
51 |
52 | def get(self, k, *args, **kwargs):
53 | return super().get(k.lower(), *args, **kwargs)
54 |
55 | def __contains__(self, o):
56 | if not isinstance(o, str):
57 | return False
58 | return super().__contains__(o.lower())
59 |
60 | def __getattr__(self, item):
61 | if not item.startswith("_"):
62 | return self[item.lower()]
63 | return getattr(super(), item)
64 |
--------------------------------------------------------------------------------
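Beyond the doctest above, two behaviours of Options worth spelling out: every dict operation lower-cases its key, and assigning the strings 'true'/'false' stores a real bool. A short sketch with illustrative option names:

    from pysparkling.sql.internal_utils.options import Options

    defaults = dict(sep=',', header=False)
    o = Options(defaults, Sep='|')        # later dicts/kwargs win; keys are lower-cased
    print(o['SEP'], o.sep, 'SeP' in o)    # | | True
    o['InferSchema'] = 'true'             # stored as the boolean True
    print(o.inferschema)                  # True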
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | import versioneer
3 |
4 | setup(
5 | name='pysparkling',
6 | version=versioneer.get_version(),
7 | cmdclass=versioneer.get_cmdclass(),
8 | packages=find_packages(),
9 | license='MIT',
10 | description='Pure Python implementation of the Spark RDD interface.',
11 | long_description=open('README.rst', 'r', encoding='utf8').read(),
12 | author='pysparkling contributors',
13 | url='https://github.com/svenkreiss/pysparkling',
14 |
15 | install_requires=[
16 | 'pytz>=2019.3',
17 | 'python-dateutil>=2.8.0'
18 | ],
19 | extras_require={
20 | 'hdfs': ['hdfs>=2.0.0'],
21 | 'http': ['requests>=2.6.0'],
22 | 'performance': ['matplotlib>=1.5.3'],
23 | 's3': ['boto>=2.36.0'],
24 | 'streaming': ['tornado>=4.3'],
25 | 'sql': [
26 | 'numpy',
27 | 'pandas>=0.23.2',
28 | ],
29 | 'tests': [
30 | 'backports.tempfile==1.0rc1',
31 | 'cloudpickle>=0.1.0',
32 | 'isort',
33 | 'pylint',
34 | 'pylzma',
35 | 'memory-profiler>=0.47',
36 | 'pycodestyle',
37 | 'pytest',
38 | 'pytest-cov',
39 | 'requests>=2.6.0',
40 | 'tornado>=4.3',
41 | ],
42 | 'scripts': [
43 | 'ipyparallel',
44 | 'pyspark',
45 | 'matplotlib',
46 | ]
47 | },
48 |
49 | classifiers=[
50 | 'Development Status :: 4 - Beta',
51 | 'Intended Audience :: Developers',
52 | 'Natural Language :: English',
53 | 'License :: OSI Approved :: MIT License',
54 | 'Operating System :: OS Independent',
55 | 'Programming Language :: Python',
56 | 'Programming Language :: Python :: 3.7',
57 | 'Programming Language :: Python :: 3.8',
58 | 'Programming Language :: Python :: 3.9',
59 | 'Programming Language :: Python :: 3.10',
60 | 'Programming Language :: Python :: 3.11',
61 | 'Programming Language :: Python :: Implementation :: PyPy',
62 | ]
63 | )
64 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_streaming_files.py:
--------------------------------------------------------------------------------
1 | import tornado.testing
2 |
3 | import pysparkling
4 |
5 |
6 | class TextFile(tornado.testing.AsyncTestCase):
7 |
8 | def test_connect(self):
9 | sc = pysparkling.Context()
10 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
11 |
12 | result = []
13 | (
14 | ssc.textFileStream('LICENS*', process_all=True)
15 | .count()
16 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
17 | )
18 |
19 | ssc.start()
20 | ssc.awaitTermination(timeout=0.3)
21 | self.assertEqual(sum(result), 44)
22 |
23 | def test_save(self):
24 | sc = pysparkling.Context()
25 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
26 |
27 | (
28 | ssc.textFileStream('LICENS*')
29 | .count()
30 | .saveAsTextFiles('tests/textout/')
31 | )
32 |
33 | def test_save_gz(self):
34 | sc = pysparkling.Context()
35 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
36 |
37 | (
38 | ssc.textFileStream('LICENS*')
39 | .count()
40 | .saveAsTextFiles('tests/textout/', suffix='.gz')
41 | )
42 |
43 |
44 | class BinaryFile(tornado.testing.AsyncTestCase):
45 |
46 | def test_read_file(self):
47 | sc = pysparkling.Context()
48 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
49 |
50 | result = []
51 | (
52 | ssc.fileBinaryStream('LICENS*', process_all=True)
53 | .count()
54 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
55 | )
56 |
57 | ssc.start()
58 | ssc.awaitTermination(timeout=0.3)
59 | self.assertEqual(sum(result), 1)
60 |
61 | def test_read_chunks(self):
62 | sc = pysparkling.Context()
63 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
64 |
65 | result = []
66 | (
67 | ssc.fileBinaryStream('LICENS*', recordLength=40, process_all=True)
68 | .count()
69 | .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
70 | )
71 |
72 | ssc.start()
73 | ssc.awaitTermination(timeout=0.3)
74 | self.assertEqual(sum(result), 54)
75 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015-2020 pysparkling contributors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
24 | -----------------------------------------------------------------------------
25 |
26 |
27 | Parts of the files pysparkling/accumulators.py, pysparkling/broadcast.py,
28 | pysparkling/rdd.py, pysparkling/storagelevel.py and pysparkling/sql were
29 | extracted from their PySpark counterparts under the following license:
30 |
31 | Licensed to the Apache Software Foundation (ASF) under one or more
32 | contributor license agreements. See the NOTICE file distributed with
33 | this work for additional information regarding copyright ownership.
34 | The ASF licenses this file to You under the Apache License, Version 2.0
35 | (the "License"); you may not use this file except in compliance with
36 | the License. You may obtain a copy of the License at
37 |
38 | http://www.apache.org/licenses/LICENSE-2.0
39 |
40 | Unless required by applicable law or agreed to in writing, software
41 | distributed under the License is distributed on an "AS IS" BASIS,
42 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
43 | See the License for the specific language governing permissions and
44 | limitations under the License.
45 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/file_system.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import typing as t
4 |
5 | log = logging.getLogger(__name__)
6 |
7 |
8 | class FileSystem:
9 | """Interface class for the file system.
10 |
11 | :param str file_name: File name.
12 | """
13 | def __init__(self, file_name: str):
14 | self.file_name: str = file_name
15 |
16 | @staticmethod
17 | def resolve_filenames(expr: str) -> t.List[str]:
18 | """Resolve the given glob-like expression to filenames.
19 |
20 | :rtype: list
21 | """
22 | log.error('Cannot resolve: %s', expr)
23 | raise NotImplementedError
24 |
25 | @staticmethod
26 | def resolve_content(expr: str) -> t.List[str]:
27 | """Return all the files matching expr or in a folder matching expr
28 |
29 | :rtype: list
30 | """
31 | log.error('Cannot resolve: %s', expr)
32 | raise NotImplementedError
33 |
34 | def exists(self) -> bool:
35 | """Check whether the given file_name exists.
36 |
37 | :rtype: bool
38 | """
39 | log.warning('Could not determine whether %s exists due to unhandled scheme.', self.file_name)
40 | raise NotImplementedError
41 |
42 | def load(self) -> io.BytesIO:
43 | """Load a file to a stream."""
44 | log.error('Cannot load: %s', self.file_name)
45 | raise NotImplementedError
46 |
47 | def load_text(self, encoding: str = 'utf8', encoding_errors: str = 'ignore') -> io.StringIO:
48 | """Load a file to a stream.
49 |
50 | :param str encoding: Text encoding.
51 | :param str encoding_errors: How to handle encoding errors.
52 | """
53 | log.error('Cannot load: %s', self.file_name)
54 | raise NotImplementedError
55 |
56 | def dump(self, stream: io.BytesIO):
57 | """Dump a stream to a file.
58 |
59 |         :param io.BytesIO stream: Input stream.
60 | """
61 | log.error('Cannot dump: %s', self.file_name)
62 | raise NotImplementedError
63 |
64 | def make_public(self, recursive=False):
65 | """Make the file public (only on some file systems).
66 |
67 | :param bool recursive: Recurse.
68 | :rtype: FileSystem
69 | """
70 | log.warning('Cannot make %s public.', self.file_name)
71 | raise NotImplementedError
72 |
--------------------------------------------------------------------------------
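FileSystem above is only an interface; every method logs and raises NotImplementedError. A purely illustrative in-memory backend, sketching the minimum a concrete subclass has to provide:

    import fnmatch
    import io

    from pysparkling.fileio.fs import FileSystem


    class InMemory(FileSystem):
        """Toy backend that keeps file contents in a class-level dict."""

        _store = {}

        @staticmethod
        def resolve_filenames(expr):
            # glob-like matching against the stored names
            return sorted(n for n in InMemory._store if fnmatch.fnmatch(n, expr))

        def exists(self):
            return self.file_name in InMemory._store

        def load(self):
            return io.BytesIO(InMemory._store[self.file_name])

        def dump(self, stream):
            InMemory._store[self.file_name] = stream.read()
            return self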
/pysparkling/sql/expressions/fields.py:
--------------------------------------------------------------------------------
1 | from ..types import StructField
2 | from ..utils import AnalysisException
3 | from .expressions import Expression
4 |
5 |
6 | class FieldAsExpression(Expression):
7 | def __init__(self, field):
8 | super().__init__()
9 | self.field = field
10 |
11 | def eval(self, row, schema):
12 | return row[find_position_in_schema(schema, self.field)]
13 |
14 | def __str__(self):
15 | return self.field.name
16 |
17 | def output_fields(self, schema):
18 | return [self.field]
19 |
20 | def args(self):
21 | return (self.field,)
22 |
23 |
24 | def find_position_in_schema(schema, expr):
25 | if isinstance(expr, str):
26 | show_id = False
27 | field_name = expr
28 | matches = set(i for i, field in enumerate(schema.fields) if field_name == field.name)
29 | elif isinstance(expr, FieldAsExpression):
30 | return find_position_in_schema(schema, expr.field)
31 | elif isinstance(expr, StructField) and hasattr(expr, "id"):
32 | show_id = True
33 | field_name = format_field(expr, show_id=show_id)
34 | matches = set(i for i, field in enumerate(schema.fields) if expr.id == field.id)
35 | else:
36 | if isinstance(expr, StructField):
37 | expression = f"Unbound field {expr.name}"
38 | else:
39 | expression = f"Expression type '{type(expr)}'"
40 |
41 | raise NotImplementedError(
42 | f"{expression} is not supported. "
43 | "As a user you should not see this error, feel free to report a bug at "
44 | "https://github.com/svenkreiss/pysparkling/issues"
45 | )
46 |
47 | return get_checked_matches(matches, field_name, schema, show_id)
48 |
49 |
50 | def get_checked_matches(matches, field_name, schema, show_id):
51 | if not matches:
52 | raise AnalysisException(f"Unable to find the column '{field_name}'"
53 | f" among {format_schema(schema, show_id)}")
54 |
55 | if len(matches) > 1:
56 | raise AnalysisException(
57 | f"Reference '{field_name}' is ambiguous, found {len(matches)} columns matching it."
58 | )
59 |
60 | return matches.pop()
61 |
62 |
63 | def format_schema(schema, show_id):
64 | return [format_field(field, show_id=show_id) for field in schema.fields]
65 |
66 |
67 | def format_field(field, show_id):
68 | if show_id:
69 | return f"{field.name}#{field.id}"
70 | return field.name
71 |
--------------------------------------------------------------------------------
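A quick sketch of how find_position_in_schema above resolves a name against a schema, reusing type helpers from pysparkling.sql.types as imported in the tests earlier in this listing:

    from pysparkling.sql.expressions.fields import find_position_in_schema
    from pysparkling.sql.types import IntegerType, StructField, StructType

    schema = StructType([
        StructField('id', IntegerType()),
        StructField('value', IntegerType()),
    ])
    print(find_position_in_schema(schema, 'value'))   # 1
    # an unknown or ambiguous name raises AnalysisException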
/scripts/multiprocessing_performance_plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | import pysparkling.tests.test_multiprocessing as test_mp
5 |
6 |
7 | def plot(has_hyperthreading=True):
8 | n_cpu, r = test_mp.test_performance()
9 | r = {n: 1.0 / (v[0] / r[1][0]) for n, v in r.items()}
10 |
11 | if has_hyperthreading:
12 | n_cpu /= 2
13 |
14 | x, y = zip(*sorted(r.items()))
15 | x_left = np.array(x) - 0.5
16 |
17 | fig, ax = plt.subplots()
18 |
19 | # ideal line
20 | # line = ax.plot((1, n_cpu), (1.0, n_cpu),
21 | # linewidth=2, linestyle='dashed', color='grey')
22 | # ax.plot((n_cpu, max(x)+0.5), (n_cpu, n_cpu),
23 | # linewidth=2, linestyle='dashed', color='grey')
24 | n_threads = n_cpu * 2 if has_hyperthreading else n_cpu
25 | bars_ideal = ax.bar(
26 | x_left,
27 |         list(range(n_threads)) + [n_threads for _ in range(len(x) - n_threads)],
28 | 1.0, color='lightgrey', linewidth=0,
29 | )
30 |
31 | # measured
32 | bars = ax.bar(x_left, y, 1.0, color='y')
33 |
34 | # divide with cpu cores
35 | ax.plot((n_cpu + 0.5, n_cpu + 0.5), (0, n_threads + 1),
36 | linewidth=2, linestyle='solid', color='black')
37 | ax.text(n_cpu + 0.4, n_threads + 1,
38 | f'{n_cpu} CPU cores',
39 | ha='right', va='top')
40 |
41 | # divide with cpu threads
42 | if has_hyperthreading:
43 | ax.plot((n_cpu * 2 + 0.5, n_cpu * 2 + 0.5), (0, n_threads + 1),
44 | linewidth=2, linestyle='solid', color='black')
45 | ax.text(n_cpu * 2 + 0.4, n_threads + 1,
46 | f'{n_cpu * 2} CPU threads',
47 | ha='right', va='top')
48 |
49 | # add some text for labels, title and axes ticks
50 | ax.set_xlabel('n processes')
51 | ax.set_ylabel('speedup')
52 | ax.set_xticks(x)
53 | ax.set_xticklabels(['no\nserialization\n(single process)']
54 | + [str(s) for s in x[1:]])
55 | ax.set_xlim(-0.5, max(x) + 0.5)
56 | ax.set_ylim(0, max(x))
57 | ax.legend((bars[0], bars_ideal[0]), ('measured', 'ideal'),
58 | loc='upper left')
59 |
60 | for rect in bars:
61 | height = rect.get_height()
62 | ax.text(rect.get_x() + rect.get_width() / 2., height - 0.05,
63 | f'{height:.2f}',
64 | ha='center', va='top')
65 |
66 | fig.tight_layout()
67 | # plt.show()
68 | fig.savefig('tests/multiprocessing_performance_plot.pdf')
69 | fig.savefig('tests/multiprocessing_performance_plot.png', dpi=300)
70 |
71 |
72 | if __name__ == '__main__':
73 | plot()
74 |
--------------------------------------------------------------------------------
/pysparkling/streaming/tcpstream.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import struct
3 |
4 | from tornado.gen import coroutine, moment
5 | from tornado.iostream import StreamClosedError
6 | from tornado.tcpserver import TCPServer
7 |
8 | from ..rdd import EmptyRDD
9 |
10 | log = logging.getLogger(__name__)
11 |
12 |
13 | class TCPDeserializer:
14 | def __init__(self, context):
15 | self.context = context
16 |
17 | def __call__(self, data):
18 | if data is None:
19 | return EmptyRDD(self.context)
20 |
21 | return self.context.parallelize(data)
22 |
23 |
24 | class TCPTextStream(TCPServer):
25 | def __init__(self, delimiter=b'\n'):
26 | super().__init__()
27 | self.delimiter = delimiter
28 | self.buffer = []
29 |
30 | def get(self):
31 | if not self.buffer:
32 | return []
33 |
34 | buffer_ = self.buffer
35 | self.buffer = []
36 | return buffer_
37 |
38 | @coroutine
39 | def handle_stream(self, stream, address):
40 | try:
41 | while True:
42 | for _ in range(100):
43 | data = yield stream.read_until(self.delimiter)
44 | self.buffer.append(data[:-1].decode('utf8'))
45 | yield moment
46 | except StreamClosedError:
47 | pass
48 |
49 |
50 | class TCPBinaryStream(TCPServer):
51 | """Consumes binary messages from a TCP socket.
52 |
53 |     :param length: An int (fixed record length) or a struct format string used as a length prefix.
54 | """
55 |
56 | def __init__(self, length=None):
57 | super().__init__()
58 | self.length = length
59 | self.buffer = []
60 |
61 | self.prefix_length = None
62 | if not isinstance(self.length, int):
63 | self.prefix_length = struct.calcsize(self.length)
64 |
65 | def get(self):
66 | if not self.buffer:
67 | return []
68 |
69 | buffer_ = self.buffer
70 | self.buffer = []
71 | return buffer_
72 |
73 | @coroutine
74 | def handle_stream(self, stream, address):
75 | try:
76 | while True:
77 | for _ in range(100):
78 | if self.prefix_length:
79 | prefix = yield stream.read_bytes(self.prefix_length)
80 | message_length = struct.unpack(self.length, prefix)[0]
81 | else:
82 | message_length = self.length
83 | data = yield stream.read_bytes(message_length)
84 | self.buffer.append(data)
85 | yield moment
86 | except StreamClosedError:
87 | return
88 |
--------------------------------------------------------------------------------
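TCPBinaryStream above accepts either an int (fixed record length) or a struct format string, in which case each message is preceded by its packed length. A sketch of the framing from the sending side, matching length='<I'; the payload is arbitrary:

    import struct

    payload = b'hello'
    frame = struct.pack('<I', len(payload)) + payload     # 4-byte little-endian length prefix

    # what the server does per message:
    prefix_length = struct.calcsize('<I')                 # 4
    message_length = struct.unpack('<I', frame[:prefix_length])[0]
    assert frame[prefix_length:prefix_length + message_length] == payload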
/scripts/tcpperf_plot.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | import csv
3 |
4 | import matplotlib
5 | import matplotlib.pyplot as plt
6 |
7 | matplotlib.use('Agg')
8 |
9 |
10 | class Plot:
11 | def __init__(self, filename, x_label=None, y_label=None):
12 | self.filename = filename
13 | self.x_label = x_label or 'connections per second'
14 | self.y_label = y_label or 'processed messages per second'
15 | self.record = None
16 | self.data = list(self.read())
17 | self.frame()
18 |
19 | def read(self):
20 | with open(self.filename, 'r', encoding='utf8') as f:
21 | reader = csv.reader(f)
22 |
23 | try:
24 | first_line = next(reader)
25 | except StopIteration:
26 | return
27 |
28 | self.record = namedtuple('record', [k.strip().replace('# ', '')
29 | for k in first_line])
30 | for row_raw in reader:
31 | row = self.record._make([int(v) for v in row_raw])
32 | yield row
33 |
34 | def frame(self):
35 | fig, ax = plt.subplots()
36 |
37 | x = [row.messages for row in self.data]
38 | y = [row.hello for row in self.data]
39 |
40 | # add some text for labels, title and axes ticks
41 | ax.set_xlabel(self.x_label)
42 | ax.set_ylabel(self.y_label)
43 | # ax.set_xticks(x)
44 | ax.set_xlim(-300, max(x) + 300)
45 | ax.set_ylim(-300, max(y) + 2000)
46 |
47 | fig.tight_layout()
48 |
49 | self.fig, self.ax = fig, ax
50 | return self
51 |
52 | def plot(self):
53 | x = [row.messages for row in self.data]
54 |
55 | ideal, = self.ax.plot([0.0, max(x)], [0.0, max(x)], label='ideal',
56 | color='black', linestyle='--', linewidth=1)
57 | graphs = [
58 | self.ax.plot(x, [getattr(row, k) for row in self.data], label=k)
59 | for k in self.record._fields if k != 'messages'
60 | ]
61 |
62 | self.ax.legend(
63 | handles=[ideal] + [g for g, in graphs],
64 | loc='upper left',
65 | )
66 |
67 | return self
68 |
69 | def show(self):
70 | plt.show()
71 | return self
72 |
73 | def save(self):
74 | self.fig.savefig(self.filename + '.pdf')
75 | self.fig.savefig(self.filename + '.png', dpi=300)
76 | return self
77 |
78 |
79 | if __name__ == '__main__':
80 | Plot('tests/tcpperf_connections.csv').plot().save()
81 | (Plot('tests/tcpperf_messages.csv',
82 | x_label='inbound messages per second')
83 | .plot()
84 | .save())
85 |
--------------------------------------------------------------------------------
/logo/favicon.svg:
--------------------------------------------------------------------------------
(SVG markup stripped during extraction.)
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/textreader.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import itertools
3 |
4 | from ....fileio import TextFile
5 | from ...internal_utils.options import Options
6 | from ...internal_utils.readers.utils import resolve_partitions
7 | from ...types import create_row, StringType, StructField, StructType
8 |
9 |
10 | class TextReader:
11 | default_options = dict(
12 | lineSep=None,
13 | encoding="utf-8",
14 | sep=",",
15 | inferSchema=False,
16 | header=False
17 | )
18 |
19 | def __init__(self, spark, paths, schema, options):
20 | self.spark = spark
21 | self.paths = paths
22 | self.schema = schema or StructType([StructField("value", StringType())])
23 | self.options = Options(self.default_options, options)
24 |
25 | def read(self):
26 | sc = self.spark._sc
27 | paths = self.paths
28 |
29 | partitions, partition_schema = resolve_partitions(paths)
30 |
31 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
32 | rdd = rdd_filenames.flatMap(partial(
33 | parse_text_file,
34 | partitions,
35 | partition_schema,
36 | self.schema,
37 | self.options
38 | ))
39 |
40 | if partition_schema:
41 | partitions_fields = partition_schema.fields
42 | full_schema = StructType(self.schema.fields + partitions_fields)
43 | else:
44 | full_schema = self.schema
45 |
46 | rdd._name = paths
47 |
48 | # pylint: disable=import-outside-toplevel, cyclic-import
49 | from ...internals import DataFrameInternal
50 |
51 | return DataFrameInternal(
52 | sc,
53 | rdd,
54 | schema=full_schema
55 | )
56 |
57 |
58 | def parse_text_file(partitions, partition_schema, schema, options, file_name):
59 | f_content = TextFile(file_name).load(encoding=options.encoding).read()
60 | records = (f_content.split(options.lineSep)
61 | if options.lineSep is not None
62 | else f_content.splitlines())
63 |
64 | rows = []
65 | for record in records:
66 | row = text_record_to_row(record, options, schema, partition_schema, partitions[file_name])
67 | row.set_input_file_name(file_name)
68 | rows.append(row)
69 |
70 | return rows
71 |
72 |
73 | def text_record_to_row(record, options, schema, partition_schema, partition):
74 | partition_field_names = [
75 | f.name for f in partition_schema.fields
76 | ] if partition_schema else []
77 | row = create_row(
78 | itertools.chain([schema.fields[0].name], partition_field_names),
79 | itertools.chain([record], partition or [])
80 | )
81 | return row
82 |
--------------------------------------------------------------------------------
/scripts/pyspark_comparisons.py:
--------------------------------------------------------------------------------
1 | import pyspark
2 |
3 | SC = pyspark.SparkContext()
4 |
5 |
6 | def simple_textFile():
7 | print(SC.textFile('tests/test_simple.py').collect())
8 | print(SC.textFile('tests/test_simple.py').name())
9 | print(SC.parallelize([1, 2, 3]).name())
10 |
11 |
12 | def indent_line(l):
13 | print('============== INDENTING LINE ================')
14 | return '--- ' + l
15 |
16 |
17 | def lazy_execution():
18 | r = SC.textFile('tests/test_simple.py').map(indent_line)
19 | r.foreach(indent_line)
20 | print()
21 | print()
22 | print()
23 | # at this point, no map() or foreach() should have been executed
24 | print(r.collect())
25 |
26 |
27 | def count_lines():
28 | r = SC.wholeTextFiles('tests/*.py').keys().collect()
29 | print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
30 | print(r)
31 | print(SC.textFile('tests/*.py').count())
32 |
33 |
34 | def create_key_value_txt():
35 | r = SC.parallelize([('a', 1), ('b', 2)], 1)
36 | r.saveAsTextFile('tests/pyspark/key_value.txt')
37 | r.saveAsHadoopFile(
38 | "tests/pyspark/key_value.txt.bz2",
39 | "org.apache.hadoop.mapred.TextOutputFormat",
40 | compressionCodecClass="org.apache.hadoop.io.compress.BZip2Codec",
41 | )
42 | r.saveAsHadoopFile(
43 | "tests/pyspark/key_value.txt.gz",
44 | "org.apache.hadoop.mapred.TextOutputFormat",
45 | compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
46 | )
47 | # r.saveAsHadoopFile(
48 | # "tests/pyspark/key_value.txt.lzo",
49 | # "org.apache.hadoop.mapred.TextOutputFormat",
50 | # compressionCodecClass="com.hadoop.compression.lzo.LzopCodec",
51 | # )
52 |
53 | r_txt = SC.textFile('tests/pyspark/key_value.txt')
54 | print(r_txt.collect())
55 | r_gz = SC.textFile('tests/pyspark/key_value.txt.gz')
56 | print(r_gz.collect())
57 | r_bz2 = SC.textFile('tests/pyspark/key_value.txt.bz2')
58 | print(r_bz2.collect())
59 |
60 |
61 | def create_pickled_files():
62 | rdd = SC.parallelize(['hello', 'world', 1, 2], 2)
63 | rdd.saveAsPickleFile('tests/pyspark/mixed.pickle')
64 | rdd.saveAsPickleFile('tests/pyspark/mixed_batched.pickle', 1)
65 |
66 |
67 | def stat():
68 | d = [1, 4, 9, 16, 25, 36]
69 | s1 = SC.parallelize(d).stats()
70 | s2 = SC.parallelize(d, 3).stats()
71 | print(str(s1))
72 | print(str(s2))
73 |
74 |
75 | def partition_by():
76 | rdd = SC.parallelize(range(20), 2).map(lambda x: (x, x))
77 | r = rdd.partitionBy(2).collect()
78 | print('>>>>>>', r)
79 |
80 |
81 | if __name__ == '__main__':
82 | # simple_textFile()
83 | # lazy_execution()
84 | # count_lines()
85 | # create_key_value_txt()
86 | # create_pickled_files()
87 | # stat()
88 | partition_by()
89 |
--------------------------------------------------------------------------------
/pysparkling/fileio/codec/tar.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 | import tarfile
4 |
5 | from .codec import Codec
6 |
7 | log = logging.getLogger(__name__)
8 |
9 |
10 | class Tar(Codec):
11 | """Implementation of :class:`.Codec` for tar compression."""
12 |
13 | def compress(self, stream):
14 | compressed = BytesIO()
15 |
16 | with tarfile.open(fileobj=compressed, mode='w') as f:
17 | s = stream.read()
18 |
19 | t = tarfile.TarInfo('data')
20 | t.size = len(s)
21 |
22 | f.addfile(t, BytesIO(s))
23 |
24 | compressed.seek(0)
25 | return compressed
26 |
27 | def decompress(self, stream):
28 | uncompressed = BytesIO()
29 |
30 | with tarfile.open(fileobj=stream, mode='r') as f:
31 | for tar_info in f.getmembers():
32 | if not tar_info.isfile():
33 | continue
34 | uncompressed.write(f.extractfile(tar_info).read())
35 |
36 | uncompressed.seek(0)
37 | return uncompressed
38 |
39 |
40 | class TarGz(Codec):
41 | """Implementation of :class:`.Codec` for .tar.gz compression."""
42 |
43 | def compress(self, stream):
44 | compressed = BytesIO()
45 |
46 | with tarfile.open(fileobj=compressed, mode='w:gz') as f:
47 | s = stream.read()
48 |
49 | t = tarfile.TarInfo('data')
50 | t.size = len(s)
51 |
52 | f.addfile(t, BytesIO(s))
53 |
54 | compressed.seek(0)
55 | return compressed
56 |
57 | def decompress(self, stream):
58 | uncompressed = BytesIO()
59 |
60 | with tarfile.open(fileobj=stream, mode='r:gz') as f:
61 | for tar_info in f.getmembers():
62 | if not tar_info.isfile():
63 | continue
64 | uncompressed.write(f.extractfile(tar_info).read())
65 |
66 | uncompressed.seek(0)
67 | return uncompressed
68 |
69 |
70 | class TarBz2(Codec):
71 | """Implementation of :class:`.Codec` for .tar.bz2 compression."""
72 |
73 | def compress(self, stream):
74 | compressed = BytesIO()
75 |
76 | with tarfile.open(fileobj=compressed, mode='w:bz2') as f:
77 | s = stream.read()
78 |
79 | t = tarfile.TarInfo('data')
80 | t.size = len(s)
81 |
82 | f.addfile(t, BytesIO(s))
83 |
84 | compressed.seek(0)
85 | return compressed
86 |
87 | def decompress(self, stream):
88 | uncompressed = BytesIO()
89 |
90 | with tarfile.open(fileobj=stream, mode='r:bz2') as f:
91 | for tar_info in f.getmembers():
92 | if not tar_info.isfile():
93 | continue
94 | uncompressed.write(f.extractfile(tar_info).read())
95 |
96 | uncompressed.seek(0)
97 | return uncompressed
98 |
--------------------------------------------------------------------------------
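A round trip through the Tar codec above; both directions work on in-memory byte streams, so no files are touched:

    from io import BytesIO

    from pysparkling.fileio.codec import Tar

    codec = Tar()
    archived = codec.compress(BytesIO(b'some payload'))   # tar archive with a single 'data' member
    print(codec.decompress(archived).read())              # b'some payload'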
/pysparkling/storagelevel.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | __all__ = ["StorageLevel"]
19 |
20 |
21 | class StorageLevel:
22 |
23 | """
24 | Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory,
25 | whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory
26 | in a JAVA-specific serialized format, and whether to replicate the RDD partitions on multiple
27 | nodes. Also contains static constants for some commonly used storage levels, MEMORY_ONLY.
28 | Since the data is always serialized on the Python side, all the constants use the serialized
29 | formats.
30 | """
31 |
32 | def __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication=1):
33 | self.useDisk = useDisk
34 | self.useMemory = useMemory
35 | self.useOffHeap = useOffHeap
36 | self.deserialized = deserialized
37 | self.replication = replication
38 |
39 | def __repr__(self):
40 | return (
41 | f"StorageLevel({self.useDisk}, {self.useMemory}, {self.useOffHeap}, {self.deserialized}, "
42 | f"{self.replication})"
43 | )
44 |
45 | def __str__(self):
46 | result = ""
47 | result += "Disk " if self.useDisk else ""
48 | result += "Memory " if self.useMemory else ""
49 | result += "OffHeap " if self.useOffHeap else ""
50 | result += "Deserialized " if self.deserialized else "Serialized "
51 | result += f"{self.replication}x Replicated"
52 | return result
53 |
54 |
55 | StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False)
56 | StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)
57 | StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, False)
58 | StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2)
59 | StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, False)
60 | StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
61 | StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)
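
# Illustrative values implied by the definitions above:
#   repr(StorageLevel.MEMORY_ONLY)       -> 'StorageLevel(False, True, False, False, 1)'
#   str(StorageLevel.MEMORY_AND_DISK_2)  -> 'Disk Memory Serialized 2x Replicated'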
62 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_streaming_tcp.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from contextlib import closing
3 | import struct
4 |
5 | import tornado.gen
6 | import tornado.tcpclient
7 | import tornado.testing
8 |
9 | import pysparkling
10 |
11 |
12 | class TCPTextTest(tornado.testing.AsyncTestCase):
13 | @tornado.gen.coroutine
14 | def client(self):
15 | client = tornado.tcpclient.TCPClient()
16 | for v in range(20):
17 | stream = yield client.connect('127.0.0.1', 8123)
18 | with closing(stream):
19 | stream.write(f'a = {v}\n'.encode('utf8'))
20 | client.close()
21 |
22 | def test_connect(self):
23 | sc = pysparkling.Context()
24 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
25 |
26 | counter = Counter()
27 | (
28 | ssc.socketTextStream('127.0.0.1', 8123)
29 | .foreachRDD(lambda rdd:
30 | counter.update(''.join(rdd.collect()))
31 | if rdd.collect() else None)
32 | )
33 | self.client()
34 |
35 | ssc.start()
36 | ssc.awaitTermination(timeout=0.3)
37 | self.assertEqual(counter['a'], 20)
38 |
39 |
40 | class TCPBinaryFixedLengthTest(tornado.testing.AsyncTestCase):
41 | @tornado.gen.coroutine
42 | def client(self):
43 | client = tornado.tcpclient.TCPClient()
44 | stream = yield client.connect('127.0.0.1', 8124)
45 | with closing(stream):
46 | stream.write(b'hello')
47 | client.close()
48 |
49 | def test_main(self):
50 | sc = pysparkling.Context()
51 | ssc = pysparkling.streaming.StreamingContext(sc, 0.1)
52 |
53 | counter = Counter()
54 | (
55 | ssc.socketBinaryStream('127.0.0.1', 8124, length=5)
56 | .foreachRDD(lambda rdd: counter.update(rdd.collect()))
57 | )
58 | self.client()
59 |
60 | ssc.start()
61 | ssc.awaitTermination(timeout=0.3)
62 | self.assertEqual(counter[b'hello'], 1)
63 |
64 |
65 | class TCPBinaryUIntLengthTest(tornado.testing.AsyncTestCase):
66 | @tornado.gen.coroutine
67 | def client(self):
68 | client = tornado.tcpclient.TCPClient()
69 | stream = yield client.connect('127.0.0.1', 8125)
70 | with closing(stream):
71 | stream.write(struct.pack('
--------------------------------------------------------------------------------
/scripts/pyspark_streaming.py:
--------------------------------------------------------------------------------
18 |      .foreachRDD(lambda t, r: print('>>>>>>>>>>>>>', t, r.collect())))
19 |
20 |
21 | def simple_queue_one_at_a_time(ssc):
22 | ssc.queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=True).pprint()
23 |
24 |
25 | def save_text(ssc):
26 | (ssc
27 | .queueStream([range(5), ['a', 'b'], ['c']], oneAtATime=True)
28 | .saveAsTextFiles('scripts/textout/'))
29 |
30 |
31 | def window(ssc):
32 | (ssc
33 | .queueStream([[1], [2], [3], [4], [5], [6]])
34 | .window(3)
35 | .foreachRDD(lambda rdd: print('>>>>>>>>>', rdd.collect())))
36 |
37 |
38 | def updateStateByKey(ssc):
39 | def processStateUpdateByKey(input_stream, state):
40 | print('i', input_stream)
41 | print('s', state)
42 | return state if not input_stream else input_stream[-1]
43 |
44 | ssc.checkpoint('checkpoints/')
45 | (ssc
46 | .queueStream([[('a', 1), ('b', 3)], [('a', 2), ('a', 5), ('c', 4)]])
47 | .updateStateByKey(processStateUpdateByKey)
48 | .pprint()
49 | )
50 |
51 |
52 | def stream_log(ssc):
53 | ssc.textFileStream('/var/log/system.log*').pprint()
54 |
55 |
56 | def stream_queue_default(ssc):
57 | (ssc
58 | .queueStream([[4], [2]], default=['placeholder'])
59 | .foreachRDD(lambda rdd: print(rdd.collect())))
60 |
61 |
62 | def join_with_repeated_keys(ssc):
63 | s1 = ssc.queueStream([[('a', 4), ('a', 2)], [('c', 7)]])
64 | s2 = ssc.queueStream([[('b', 1), ('b', 3)], [('c', 8)]])
65 | (
66 | s1.fullOuterJoin(s2)
67 | .foreachRDD(lambda rdd: print(sorted(rdd.collect())))
68 | )
69 |
70 |
71 | def union(ssc):
72 | odd = ssc.queueStream([[1], [3], [5]])
73 | even = ssc.queueStream([[2], [4], [6]])
74 | (
75 | odd.union(even)
76 | .foreachRDD(lambda rdd: print(rdd.collect()))
77 | )
78 |
79 |
80 | def quiet_logs(sc):
81 | logger = sc._jvm.org.apache.log4j
82 | logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
83 | logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
84 |
85 |
86 | if __name__ == '__main__':
87 | spark_context = pyspark.SparkContext()
88 | quiet_logs(spark_context)
89 | streaming_context = pyspark.streaming.StreamingContext(spark_context, 1)
90 |
91 | # simple_queue(ssc)
92 | # simple_queue_count(ssc)
93 | # simple_queue_one_at_a_time(ssc)
94 | # save_text(ssc)
95 | # window(ssc)
96 | # updateStateByKey(ssc)
97 | # stream_log(ssc)
98 | # stream_queue_default(ssc)
99 | # join_with_repeated_keys(ssc)
100 | union(streaming_context)
101 |
102 | streaming_context.start()
103 | time.sleep(3.0)
104 | streaming_context.stop(stopGraceFully=True)
105 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/stat_aggregations.py:
--------------------------------------------------------------------------------
1 | from ....stat_counter import ColumnStatHelper
2 | from ...column import Column
3 | from ..literals import Literal
4 | from ..mappers import StarOperator
5 | from .aggregations import Aggregation
6 |
7 |
8 | class SimpleStatAggregation(Aggregation):
9 | def __init__(self, column):
10 | super().__init__(column)
11 | self.column = column
12 | self.stat_helper = ColumnStatHelper(column)
13 |
14 | def merge(self, row, schema):
15 | self.stat_helper.merge(row, schema)
16 |
17 | def mergeStats(self, other, schema):
18 | self.stat_helper.mergeStats(other.stat_helper)
19 |
20 | def eval(self, row, schema):
21 | raise NotImplementedError
22 |
23 | def args(self):
24 | return (self.column,)
25 |
26 |
27 | class Count(SimpleStatAggregation):
28 | pretty_name = "count"
29 |
30 | def __init__(self, column):
31 | if isinstance(column.expr, StarOperator):
32 | column = Column(Literal(1))
33 | super().__init__(column)
34 | self.column = column
35 | self.stat_helper = ColumnStatHelper(column)
36 |
37 | def eval(self, row, schema):
38 | return self.stat_helper.count
39 |
40 |
41 | class Max(SimpleStatAggregation):
42 | pretty_name = "max"
43 |
44 | def eval(self, row, schema):
45 | return self.stat_helper.max
46 |
47 |
48 | class Min(SimpleStatAggregation):
49 | pretty_name = "min"
50 |
51 | def eval(self, row, schema):
52 | return self.stat_helper.min
53 |
54 |
55 | class Sum(SimpleStatAggregation):
56 | pretty_name = "sum"
57 |
58 | def eval(self, row, schema):
59 | return self.stat_helper.sum
60 |
61 |
62 | class Avg(SimpleStatAggregation):
63 | pretty_name = "avg"
64 |
65 | def eval(self, row, schema):
66 | return self.stat_helper.mean
67 |
68 |
69 | class VarSamp(SimpleStatAggregation):
70 | pretty_name = "var_samp"
71 |
72 | def eval(self, row, schema):
73 | return self.stat_helper.variance_samp
74 |
75 |
76 | class VarPop(SimpleStatAggregation):
77 | pretty_name = "var_pop"
78 |
79 | def eval(self, row, schema):
80 | return self.stat_helper.variance_pop
81 |
82 |
83 | class StddevSamp(SimpleStatAggregation):
84 | pretty_name = "stddev_samp"
85 |
86 | def eval(self, row, schema):
87 | return self.stat_helper.stddev_samp
88 |
89 |
90 | class StddevPop(SimpleStatAggregation):
91 | pretty_name = "stddev_pop"
92 |
93 | def eval(self, row, schema):
94 | return self.stat_helper.stddev_pop
95 |
96 |
97 | class Skewness(SimpleStatAggregation):
98 | pretty_name = "skewness"
99 |
100 | def eval(self, row, schema):
101 | return self.stat_helper.skewness
102 |
103 |
104 | class Kurtosis(SimpleStatAggregation):
105 | pretty_name = "kurtosis"
106 |
107 | def eval(self, row, schema):
108 | return self.stat_helper.kurtosis
109 |
110 |
111 | __all__ = [
112 | "Avg", "VarPop", "VarSamp", "Sum", "StddevPop", "StddevSamp",
113 | "Skewness", "Min", "Max", "Kurtosis", "Count"
114 | ]
115 |
--------------------------------------------------------------------------------
/pysparkling/fileio/file.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | import logging
3 |
4 | from . import codec, fs
5 |
6 | log = logging.getLogger(__name__)
7 |
8 |
9 | class File:
10 | """File object.
11 |
12 | :param file_name: Any file name.
13 | """
14 |
15 | def __init__(self, file_name):
16 | self.file_name = file_name
17 | self.fs = fs.get_fs(file_name)(file_name)
18 | self.codec = codec.get_codec(file_name)()
19 |
20 | @staticmethod
21 | def resolve_filenames(all_expr):
22 | """resolve expression for a filename
23 |
24 | :param all_expr:
25 | A comma separated list of expressions. The expressions can contain
26 | the wildcard characters ``*`` and ``?``. It also resolves Spark
27 | datasets to the paths of the individual partitions
28 | (i.e. ``my_data`` gets resolved to
29 | ``[my_data/part-00000, my_data/part-00001]``).
30 |
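        Example (with hypothetical paths)::

            File.resolve_filenames('logs/*.gz,my_data')
            # -> ['logs/2020.gz', 'logs/2021.gz',
            #     'my_data/part-00000', 'my_data/part-00001']
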
31 | :returns: A list of file names.
32 | :rtype: list
33 | """
34 | files = []
35 | for expr in all_expr.split(','):
36 | expr = expr.strip()
37 | files += fs.get_fs(expr).resolve_filenames(expr)
38 | log.debug('Filenames: %s', files)
39 | return files
40 |
41 | @classmethod
42 | def get_content(cls, all_expr):
43 | """Return all files matching or in folder matching one of the given expression
44 |
45 | :param all_expr:
46 | A list of expressions.
47 | The expressions can contain the wildcard characters ``*`` and ``?``.
48 |
49 | :returns: A list of file names.
50 | :rtype: list
51 | """
52 | files = []
53 | for expr in all_expr:
54 | expr = expr.strip()
55 | files += fs.get_fs(expr).resolve_content(expr)
56 | log.debug('Filenames: %s', files)
57 | return files
58 |
59 | def exists(self):
60 | """Checks both for a file or directory at this location.
61 |
62 | :returns: True or false.
63 | """
64 | return self.fs.exists()
65 |
66 | def load(self):
67 | """Load the data from a file.
68 |
69 | :rtype: io.BytesIO
70 | """
71 | stream = self.fs.load()
72 | stream = self.codec.decompress(stream)
73 | return stream
74 |
75 | def dump(self, stream=None):
76 | """Writes a stream to a file.
77 |
78 | :param stream:
79 | A BytesIO instance. ``bytes`` are also possible and are converted
80 | to BytesIO.
81 |
82 | :rtype: File
83 | """
84 | if stream is None:
85 | stream = BytesIO()
86 |
87 | if isinstance(stream, bytes):
88 | stream = BytesIO(stream)
89 |
90 | stream = self.codec.compress(stream)
91 | self.fs.dump(stream)
92 |
93 | return self
94 |
95 | def make_public(self, recursive=False):
96 | """Makes the file public. Currently only supported on S3.
97 |
98 | :param recursive: Whether to apply this recursively.
99 | :rtype: File
100 | """
101 | self.fs.make_public(recursive)
102 | return self
103 |
--------------------------------------------------------------------------------
/scripts/tcpperf_client.py:
--------------------------------------------------------------------------------
1 | """Sends tcp messages."""
2 | import argparse
3 | from contextlib import closing
4 | import json
5 | import random
6 | import struct
7 | import sys
8 | import time
9 |
10 | from tornado import gen
11 | from tornado.ioloop import IOLoop, PeriodicCallback
12 | from tornado.iostream import StreamClosedError
13 | from tornado.tcpclient import TCPClient
14 |
15 |
16 | class Emitter:
17 | def __init__(self, port, n=1000, values=1, duration=3.0):
18 | self.port = port
19 | self.n = n
20 | self.values = values
21 | self.duration = duration
22 | self.message = self.hello
23 | self.i = 0
24 |
25 | self.pcb = None
26 | self.client = None
27 |
28 | def start(self):
29 | self.client = TCPClient()
30 |
31 | self.pcb = PeriodicCallback(self.send, 1000.0 / self.n)
32 | self.pcb.start()
33 |
34 | IOLoop.current().call_later(self.duration + 0.5, self.stop)
35 | IOLoop.current().start()
36 | IOLoop.clear_current()
37 |
38 | def stop(self):
39 | if self.pcb is not None:
40 | self.pcb.stop()
41 | if self.client is not None:
42 | self.client.close()
43 | IOLoop.current().stop()
44 |
45 | @gen.coroutine
46 | def send(self):
47 | if self.i >= self.duration * self.n * self.values:
48 | self.pcb.stop()
49 | return
50 |
51 | try:
52 | stream = yield self.client.connect('127.0.0.1', self.port)
53 | with closing(stream):
54 | messages = b''.join(self.message() for _ in range(self.values))
55 | stream.write(messages)
56 | self.i += self.values
57 | except StreamClosedError:
58 | return
59 |
60 | def hello(self):
61 | return b'hello\n'
62 |
63 | def r(self):
64 | s = random.randint(1, 10)
65 | v = s / 10.0 + (1.5 - s / 10.0) * random.random()
66 | return (s, v)
67 |
68 | def text(self):
69 | s, v = self.r()
70 | return f'sensor{s}|{v}\n'.encode('utf8')
71 |
72 | def json(self):
73 | s, v = self.r()
74 | return (json.dumps({f'sensor{s}': v}) + '\n').encode('utf8')
75 |
76 | def bello(self):
77 | # 5 bytes
78 | return b'bello'
79 |
80 | def struct(self):
81 | # 8 bytes
82 | return struct.pack('If', *self.r())
83 |
84 |
85 | def main():
86 | parser = argparse.ArgumentParser(description=__doc__)
87 | parser.add_argument('-n', type=int, default=1000,
88 | help='number of connections')
89 | parser.add_argument('--values', type=int, default=1,
90 | help='number of values per connection')
91 | parser.add_argument('--port', type=int, default=8123,
92 | help='target port number')
93 | parser.add_argument('--format', default='hello',
94 | help='format of the messages: hello (default), '
95 | 'text, json, bello (binary hello), '
96 | 'struct (binary)')
97 | parser.add_argument('--delay', type=float, default=0.5,
98 | help='wait before start sending messages')
99 | args = parser.parse_args()
100 |
101 | time.sleep(args.delay)
102 | e = Emitter(args.port, args.n, args.values)
103 | e.message = getattr(e, args.format)
104 | e.start()
105 | print(f'{sys.argv[0]} sent {e.i} messages')
106 |
107 |
108 | if __name__ == '__main__':
109 | main()
110 |
--------------------------------------------------------------------------------
/docs/sphinx/read_write.rst:
--------------------------------------------------------------------------------
1 | .. _read_write:
2 |
3 | .. currentmodule:: pysparkling
4 |
5 |
6 | Reading and Writing
7 | ===================
8 |
9 | This is a collection of best practices or templates for reading and writing
10 | various input and output formats.
11 |
12 |
13 | Batch
14 | -----
15 |
16 | Python List
17 | ~~~~~~~~~~~
18 |
19 | The most direct input and output is from and to a Python list.
20 |
21 | .. code-block:: python
22 |
23 | import pysparkling
24 |
25 | sc = pysparkling.Context()
26 |
27 | # reading
28 | rdd = sc.parallelize(['hello', 'world'])
29 |
30 | # back to Python list
31 | print(rdd.collect())
32 |
33 | # back to an iterator
34 | rdd.toLocalIterator()
35 |
36 |
37 | ND-JSON
38 | ~~~~~~~
39 |
40 | Newline delimited JSON is a text file where every line is its own JSON string.
41 |
42 |
43 | .. code-block:: python
44 |
45 | import json
46 | import pysparkling
47 |
48 | sc = pysparkling.Context()
49 |
50 | # reading
51 | rdd = (
52 | sc
53 | .textFile('input.json')
54 | .map(json.loads)
55 | )
56 |
57 | # writing
58 | (
59 | rdd
60 | .map(json.dumps)
61 | .saveAsTextFile('output.json')
62 | )
63 |
64 |
65 | CSV
66 | ~~~
67 |
68 | .. code-block:: python
69 |
70 | import csv
71 | import io
72 | import pysparkling
73 |
74 | sc = pysparkling.Context()
75 |
76 | # reading
77 | rdd = (
78 | sc
79 | .textFile('input.csv')
80 | .mapPartitions(csv.reader)
81 | )
82 |
83 | # writing
84 | def csv_row(data):
85 | s = io.StringIO()
86 | csv.writer(s).writerow(data)
87 | return s.getvalue()[:-1]
88 |
89 | (
90 | rdd
91 | .map(csv_row)
92 | .saveAsTextFile('output.csv')
93 | )
94 |
95 |
96 | TensorFlow Records
97 | ~~~~~~~~~~~~~~~~~~
98 |
99 | This example preprocesses example data into a TensorFlow Records file. The
100 | second part is a cross check and prints the contents of the `tfrecords` file.
101 |
102 | .. code-block:: python
103 |
104 | import pysparkling
105 | import tensorflow as tf
106 |
107 | def to_tfrecord(xy):
108 | X, y = xy
109 | example = tf.train.Example(features=tf.train.Features(feature={
110 | 'X': tf.train.Feature(float_list=tf.train.FloatList(value=X)),
111 | 'y': tf.train.Feature(int64_list=tf.train.Int64List(value=y)),
112 | }))
113 | return example.SerializeToString()
114 |
115 | # example
116 | X = [1.2, 3.1, 8.7]
117 | y = [2, 5]
118 |
119 | # writing
120 | sc = pysparkling.Context()
121 | rdd = (
122 | sc
123 | .parallelize([(X, y)])
124 | .map(to_tfrecord)
125 | )
126 | with tf.python_io.TFRecordWriter('out.tfrecords') as writer:
127 | for example in rdd.toLocalIterator():
128 | writer.write(example)
129 |
130 | # debugging a tf records file
131 | for serialized_example in tf.python_io.tf_record_iterator('out.tfrecords'):
132 | example = tf.train.Example()
133 | example.ParseFromString(serialized_example)
134 | X = example.features.feature['X'].float_list.value
135 | y = example.features.feature['y'].int64_list.value
136 | print(X, y)
137 |
138 |
139 | Streaming
140 | ---------
141 |
142 | Python List
143 | ~~~~~~~~~~~
144 |
145 | .. code-block:: python
146 |
147 | import pysparkling
148 |
149 | sc = pysparkling.Context()
150 | ssc = pysparkling.streaming.StreamingContext(sc, 1.0)
151 |
152 | (
153 | ssc
154 | .queueStream([[4], [2], [7]])
155 | .foreachRDD(lambda rdd: print(rdd.collect()))
156 | )
157 |
158 | ssc.start()
159 | ssc.awaitTermination(3.5)
160 |
161 | # output:
162 | # [4]
163 | # [2]
164 | # [7]
165 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/csvreader.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import itertools
3 |
4 | from ....fileio import TextFile
5 | from ...casts import get_caster
6 | from ...internal_utils.options import Options
7 | from ...internal_utils.readers.utils import guess_schema_from_strings, resolve_partitions
8 | from ...schema_utils import infer_schema_from_rdd
9 | from ...types import create_row, StringType, StructField, StructType
10 |
11 |
12 | class CSVReader:
13 | default_options = dict(
14 | lineSep=None,
15 | encoding="utf-8",
16 | sep=",",
17 | inferSchema=False,
18 | header=False
19 | )
20 |
21 | def __init__(self, spark, paths, schema, options):
22 | self.spark = spark
23 | self.paths = paths
24 | self.schema = schema
25 | self.options = Options(self.default_options, options)
26 |
27 | def read(self):
28 | sc = self.spark._sc
29 | paths = self.paths
30 |
31 | partitions, partition_schema = resolve_partitions(paths)
32 |
33 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
34 | rdd = rdd_filenames.flatMap(partial(
35 | parse_csv_file,
36 | partitions,
37 | partition_schema,
38 | self.schema,
39 | self.options
40 | ))
41 |
42 | if self.schema is not None:
43 | schema = self.schema
44 | elif self.options.inferSchema:
45 | fields = rdd.take(1)[0].__fields__
46 | schema = guess_schema_from_strings(fields, rdd.collect(), options=self.options)
47 | else:
48 | schema = infer_schema_from_rdd(rdd)
49 |
50 | schema_with_string = StructType(fields=[
51 | StructField(field.name, StringType()) for field in schema.fields
52 | ])
53 |
54 | if partition_schema:
55 | partitions_fields = partition_schema.fields
56 | full_schema = StructType(schema.fields[:-len(partitions_fields)] + partitions_fields)
57 | else:
58 | full_schema = schema
59 |
60 | cast_row = get_caster(
61 | from_type=schema_with_string, to_type=full_schema, options=self.options
62 | )
63 | casted_rdd = rdd.map(cast_row)
64 | casted_rdd._name = paths
65 |
66 | # pylint: disable=import-outside-toplevel, cyclic-import
67 | from ...internals import DataFrameInternal
68 |
69 | return DataFrameInternal(
70 | sc,
71 | casted_rdd,
72 | schema=full_schema
73 | )
74 |
75 |
76 | def parse_csv_file(partitions, partition_schema, schema, options, file_name):
77 | f_content = TextFile(file_name).load(encoding=options.encoding).read()
78 | records = (f_content.split(options.lineSep)
79 | if options.lineSep is not None
80 | else f_content.splitlines())
81 | if options.header == "true":
82 | header = records[0].split(options.sep)
83 | records = records[1:]
84 | else:
85 | header = None
86 |
87 | null_value = ""
88 | rows = []
89 | for record in records:
90 | row = csv_record_to_row(
91 | record, options, schema, header, null_value, partition_schema, partitions[file_name]
92 | )
93 | row.set_input_file_name(file_name)
94 | rows.append(row)
95 |
96 | return rows
97 |
98 |
99 | def csv_record_to_row(record, options, schema=None, header=None,
100 | null_value=None, partition_schema=None, partition=None):
101 | record_values = [val if val != null_value else None for val in record.split(options.sep)]
102 | if schema is not None:
103 | field_names = [f.name for f in schema.fields]
104 | elif header is not None:
105 | field_names = header
106 | else:
107 | field_names = [f"_c{i}" for i, field in enumerate(record_values)]
108 | partition_field_names = [
109 | f.name for f in partition_schema.fields
110 | ] if partition_schema else []
111 | row = create_row(
112 | itertools.chain(field_names, partition_field_names),
113 | itertools.chain(record_values, partition or [])
114 | )
115 | return row
116 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/s3.py:
--------------------------------------------------------------------------------
1 | from fnmatch import fnmatch
2 | from io import BytesIO, StringIO
3 | import logging
4 |
5 | from ...exceptions import FileSystemNotSupported
6 | from ...utils import parse_file_uri, Tokenizer
7 | from .file_system import FileSystem
8 |
9 | log = logging.getLogger(__name__)
10 |
11 | try:
12 | import boto
13 | except ImportError:
14 | boto = None
15 |
16 |
17 | class S3(FileSystem):
18 | """:class:`.FileSystem` implementation for S3.
19 |
20 | Use environment variables ``AWS_SECRET_ACCESS_KEY`` and
21 | ``AWS_ACCESS_KEY_ID`` for auth and use file paths of the form
22 | ``s3://bucket_name/filename.txt``.
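
    Example (with a hypothetical bucket)::

        from pysparkling import fileio

        fileio.File('s3://my-bucket/my_data/part-00000').load()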
23 | """
24 |
25 | #: Keyword arguments for new connections.
26 | #: Example: set to `{'anon': True}` for anonymous connections.
27 | connection_kwargs = {}
28 |
29 | _conn = None
30 |
31 | def __init__(self, file_name):
32 | if boto is None:
33 | raise FileSystemNotSupported('S3 not supported. Install "boto".')
34 |
35 | super().__init__(file_name)
36 |
37 | # obtain key
38 | t = Tokenizer(self.file_name)
39 | t.get_next('://') # skip scheme
40 | bucket_name = t.get_next('/')
41 | key_name = t.get_next()
42 | conn = self._get_conn()
43 | bucket = conn.get_bucket(bucket_name, validate=False)
44 | self.key = bucket.get_key(key_name)
45 | if not self.key:
46 | self.key = bucket.new_key(key_name)
47 |
48 | @classmethod
49 | def _get_conn(cls):
50 | if not cls._conn:
51 | if boto is None:
52 | raise FileSystemNotSupported('S3 not supported. Install "boto".')
53 | cls._conn = boto.connect_s3(**cls.connection_kwargs)
54 | return cls._conn
55 |
56 | @classmethod
57 | def resolve_filenames(cls, expr):
58 | files = []
59 |
60 | t = Tokenizer(expr)
61 | scheme = t.get_next('://')
62 | bucket_name = t.get_next('/')
63 | prefix = t.get_next(['*', '?'])
64 |
65 | bucket = cls._get_conn().get_bucket(
66 | bucket_name,
67 | validate=False
68 | )
69 | expr = expr[len(scheme) + 3 + len(bucket_name) + 1:]
70 | for k in bucket.list(prefix=prefix):
71 | if fnmatch(k.name, expr) or fnmatch(k.name, expr + '/part*'):
72 | files.append(f'{scheme}://{bucket_name}/{k.name}')
73 | return files
74 |
75 | @classmethod
76 | def resolve_content(cls, expr):
77 | scheme, bucket_name, folder_path, pattern = parse_file_uri(expr)
78 |
79 | folder_path = folder_path[1:] # Remove leading slash
80 |
81 | expr = f"{folder_path}{pattern}"
82 | # Match all files inside folders that match expr
83 | pattern_expr = f"{expr}{'' if expr.endswith('/') else '/'}*"
84 |
85 | bucket = cls._get_conn().get_bucket(
86 | bucket_name,
87 | validate=False
88 | )
89 | files = []
90 | for k in bucket.list(prefix=folder_path):
91 | if fnmatch(k.name, expr) or fnmatch(k.name, pattern_expr):
92 | files.append(f'{scheme}://{bucket_name}/{k.name}')
93 | return files
94 |
95 | def exists(self):
96 | t = Tokenizer(self.file_name)
97 | t.get_next('//') # skip scheme
98 | bucket_name = t.get_next('/')
99 | key_name = t.get_next()
100 | conn = self._get_conn()
101 | bucket = conn.get_bucket(bucket_name, validate=False)
102 | return (bucket.get_key(key_name)
103 | or bucket.list(prefix=f'{key_name}/'))
104 |
105 | def load(self):
106 | log.debug('Loading %s with size %s.', self.key.name, self.key.size)
107 | return BytesIO(self.key.get_contents_as_string())
108 |
109 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
110 | log.debug('Loading %s with size %s.', self.key.name, self.key.size)
111 | return StringIO(
112 | self.key.get_contents_as_string().decode(encoding, encoding_errors)
113 | )
114 |
115 | def dump(self, stream):
116 | log.debug('Dumping to %s.', self.key.name)
117 | self.key.set_contents_from_file(stream)
118 | return self
119 |
120 | def make_public(self, recursive=False):
121 | self.key.make_public(recursive)
122 | return self
123 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/utils.py:
--------------------------------------------------------------------------------
1 | from ....fileio import File, TextFile
2 | from ...casts import get_caster
3 | from ...types import (
4 | DecimalType, DoubleType, IntegerType, LongType, row_from_keyed_values, StringType, StructField, StructType,
5 | TimestampType
6 | )
7 | from ...utils import AnalysisException
8 |
9 |
10 | def resolve_partitions(patterns):
11 | """
12 | Given a list of patterns, returns all the files matching or in folders matching
13 | one of them.
14 |
15 | The files are returned in a dict with one entry per file:
16 | - the key is the file path
17 | - the value is the partition keys and values (as a Row) if any were encountered, else None
18 |
19 | In addition to this dict, return a schema for the partition keys if the data was
20 | partitioned, else None
21 |
22 | :type patterns: list of str
23 | :rtype: Tuple[Dict[str, Optional[Row]], Optional[StructType]]
24 | """
25 | file_paths = File.get_content(patterns)
26 | if not file_paths:
27 | raise AnalysisException(f'Path does not exist: {patterns}')
28 | partitions = {}
29 | for file_path in file_paths:
30 | if "=" in file_path:
31 | row = row_from_keyed_values(
32 | folder.split("=")
33 | for folder in file_path.split("/")[:-1]
34 | if folder.count("=") == 1
35 | )
36 | partitions[file_path] = row
37 | else:
38 | partitions[file_path] = None
39 |
40 | partitioning_field_sets = set(p.__fields__ for p in partitions.values() if p is not None)
41 | if len(partitioning_field_sets) > 1:
42 | raise Exception(
43 | f"Conflicting directory structures detected while reading {','.join(patterns)}. "
44 | f"All partitions must have the same partitioning fields,"
45 | f" found fields {' and also '.join(str(fields) for fields in partitioning_field_sets)}"
46 | )
47 |
48 | if partitioning_field_sets:
49 | if any(value is None for value in partitions.values()):
50 | paths = [path for path, value in partitions.items() if value is None]
51 | raise AnalysisException(
52 | f"Unable to parse those malformed folders: {paths} of {file_paths}"
53 | )
54 | partitioning_fields = partitioning_field_sets.pop()
55 | partition_schema = guess_schema_from_strings(
56 | partitioning_fields, partitions.values(), options={}
57 | )
58 | else:
59 | partition_schema = None
60 |
61 | return partitions, partition_schema
62 |
63 |
64 | def guess_schema_from_strings(schema_fields, data, options):
65 | field_values = [
66 | (field, [row[field] for row in data])
67 | for field in schema_fields
68 | ]
69 |
70 | field_types_and_values = [
71 | (field, guess_type_from_values_as_string(values, options))
72 | for field, values in field_values
73 | ]
74 |
75 | schema = StructType(fields=[
76 | StructField(field, field_type)
77 | for field, field_type in field_types_and_values
78 | ])
79 |
80 | return schema
81 |
82 |
83 | def guess_type_from_values_as_string(values, options):
84 | # Reproduces inferences available in Spark
85 | # PartitioningUtils.inferPartitionColumnValue()
86 | # located in org.apache.spark.sql.execution.datasources
87 | tested_types = (
88 | IntegerType(),
89 | LongType(),
90 | DecimalType(),
91 | DoubleType(),
92 | TimestampType(),
93 | StringType()
94 | )
95 | string_type = StringType()
96 | for tested_type in tested_types:
97 | type_caster = get_caster(from_type=string_type, to_type=tested_type, options=options)
98 | try:
99 | for value in values:
100 | casted_value = type_caster(value)
101 | if casted_value is None and value not in ("null", None):
102 | raise ValueError
103 | return tested_type
104 | except ValueError:
105 | pass
106 | # Should never happen
107 | raise AnalysisException(
108 | "Unable to find a matching type for some fields, even StringType did not work"
109 | )
110 |
111 |
112 | def get_records(f_name, linesep, encoding):
113 | f_content = TextFile(f_name).load(encoding=encoding).read()
114 | records = f_content.split(linesep) if linesep is not None else f_content.splitlines()
115 | return records
116 |
--------------------------------------------------------------------------------
/logo/create.py:
--------------------------------------------------------------------------------
1 | """Creates an SVG of the Databench logo. Optionally also a png."""
2 |
3 | import os
4 | import random
5 |
6 | import svgwrite
7 |
8 | DATA = [
9 | [0, 1, 1, 1, 1, 1, 1, 1],
10 | [0, 1, 1, 1, 1, 1, 1, 1],
11 | [0, 0, 0, 0, 1, 1, 1, 1],
12 | [0, 0, 0, 1, 1, 1, 1, 1],
13 | [0, 0, 1, 1, 1, 0, 1, 1],
14 | [0, 1, 1, 1, 0, 0, 1, 1],
15 | [1, 1, 1, 0, 0, 0, 1, 1],
16 | [1, 1, 0, 0, 0, 0, 0, 0],
17 | ]
18 |
19 |
20 | def color(x, y):
21 | """triangles.
22 |
23 | Colors:
24 | - http://paletton.com/#uid=70l150klllletuehUpNoMgTsdcs shade 2
25 | """
26 |
27 | return '#42359C' # "#CDB95B"
28 |
29 | if (x - 4) > (y - 4) and -(y - 4) <= (x - 4):
30 | # right
31 | return '#42359C' # "#CDB95B"
32 | elif (x - 4) > (y - 4) and -(y - 4) > (x - 4):
33 | # top
34 | return "#CD845B"
35 | elif (x - 4) <= (y - 4) and -(y - 4) <= (x - 4):
36 | # bottom
37 | return "#57488E"
38 | elif (x - 4) <= (y - 4) and -(y - 4) > (x - 4):
39 | # left
40 | return "#3B8772"
41 |
42 | # should not happen
43 | return "black"
44 |
45 |
46 | def simple(svg_document, x, y, v):
47 | if v == 1:
48 | svg_document.add(svg_document.rect(insert=(x * 16, y * 16),
49 | size=("16px", "16px"),
50 | # rx="2px",
51 | # stroke_width="1",
52 | # stroke=color(x, y),
53 | fill=color(x, y)))
54 |
55 |
56 | def smaller(svg_document, x, y, v, x_offset=0, y_offset=0):
57 | # from center
58 | distance2 = (x - 3.5) ** 2 + (y - 3.5) ** 2
59 | max_distance2 = 2 * 4 ** 2
60 |
61 | if v == 1:
62 | size = 16.0 * (1.0 - distance2 / max_distance2)
63 | number_of_cubes = int(16 ** 2 / (size ** 2))
64 | for i in range(number_of_cubes):
65 | xi = x * 16 + 1 + random.random() * (14.0 - size) + x_offset
66 | yi = y * 16 + 1 + random.random() * (14.0 - size) + y_offset
67 | sizepx = str(size) + "px"
68 | svg_document.add(svg_document.rect(insert=(xi, yi),
69 | size=(sizepx, sizepx),
70 | rx="2px",
71 | stroke_width="1",
72 | # stroke='#4E9954',
73 | stroke='#FAE5A5',
74 | # stroke=color(x, y),
75 | fill=color(x, y)))
76 |
77 |
78 | def main():
79 | svg_favicon = svgwrite.Drawing(filename="favicon.svg",
80 | size=("128px", "128px"))
81 | svg_document = svgwrite.Drawing(filename="logo.svg",
82 | size=("128px", "128px"))
83 | svg_banner = svgwrite.Drawing(filename="banner.svg",
84 | size=("600px", "200px"))
85 | for y, r in enumerate(DATA):
86 | for x, v in enumerate(r):
87 | simple(svg_favicon, x, y, v)
88 | smaller(svg_document, x, y, v)
89 | smaller(svg_banner, x, y, v, x_offset=20, y_offset=40)
90 | # add banner text
91 | g = svg_banner.g(style='font-size:40px; font-family:Arial; font-weight: bold; font-style: italic;')
92 | g.add(svg_banner.text(
93 | 'pysparkling',
94 | insert=(180, 120), fill='#000000'),
95 | )
96 | svg_banner.add(g)
97 | # print(svg_document.tostring())
98 | svg_favicon.save()
99 | svg_document.save()
100 | svg_banner.save()
101 |
102 | # create pngs
103 | os.system('svg2png --width=100 --height=100 logo.svg logo-w100.png')
104 | os.system('svg2png --width=600 --height=600 logo.svg logo-w600.png')
105 | os.system('svg2png --width=500 --height=100 banner.svg banner-w500.png')
106 | os.system('svg2png --width=1500 --height=400 banner.svg banner-w1500.png')
107 | favicon_sizes = [16, 32, 48, 128, 256]
108 | for s in favicon_sizes:
109 | os.system(f'svg2png --width={s} --height={s} favicon.svg favicon-w{s}.png')
110 | png_favicon_names = [f'favicon-w{s}.png' for s in favicon_sizes]
111 | os.system('convert ' + (' '.join(png_favicon_names)) +
112 | ' -colors 256 favicon.ico')
113 |
114 |
115 | if __name__ == "__main__":
116 | random.seed(42)
117 | main()
118 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://raw.githubusercontent.com/svenkreiss/pysparkling/master/logo/logo-w100.png
2 | :target: https://github.com/svenkreiss/pysparkling
3 |
4 | pysparkling
5 | ===========
6 |
7 | **Pysparkling** provides a faster, more responsive way to develop programs
8 | for PySpark. It enables code intended for Spark applications to execute
9 | entirely in Python, without incurring the overhead of initializing and
10 | passing data through the JVM and Hadoop. The focus is on having a lightweight
11 | and fast implementation for small datasets at the expense of some data
12 | resilience features and some parallel processing features.
13 |
14 | **How does it work?** To switch execution of a script from PySpark to pysparkling,
15 | have the code initialize a pysparkling Context instead of a SparkContext, and
16 | use the pysparkling Context to set up your RDDs. The beauty is you don't have
17 | to change a single line of code after the Context initialization, because
18 | pysparkling's API is (almost) exactly the same as PySpark's. Since it's so easy
19 | to switch between PySpark and pysparkling, you can choose the right tool for your
20 | use case.
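
As a small illustrative sketch (assuming a local ``input.txt`` exists), only
the context creation differs from the PySpark version:

.. code-block:: python

    import pysparkling

    # pysparkling.Context() takes the place of pyspark.SparkContext()
    sc = pysparkling.Context()

    rdd = sc.textFile('input.txt')
    print(rdd.map(lambda line: line.upper()).collect())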
21 |
22 | **When would I use it?** Say you are writing a Spark application because you
23 | need robust computation on huge datasets, but you also want the same application
24 | to provide fast answers on a small dataset. You're finding Spark is not responsive
25 | enough for your needs, but you don't want to rewrite an entire separate application
26 | for the *small-answers-fast* problem. You'd rather reuse your Spark code but somehow
27 | get it to run fast. Pysparkling bypasses the stuff that causes Spark's long startup
28 | times and less responsive feel.
29 |
30 | Here are a few areas where pysparkling excels:
31 |
32 | * Small to medium-scale exploratory data analysis
33 | * Application prototyping
34 | * Low-latency web deployments
35 | * Unit tests
36 |
37 |
38 | Install
39 | =======
40 |
41 | .. code-block:: bash
42 |
43 | python3 -m pip install "pysparkling[s3,hdfs,http,streaming]"
44 |
45 |
46 | `Documentation <https://pysparkling.trivial.io>`_:
47 |
48 | .. image:: https://raw.githubusercontent.com/svenkreiss/pysparkling/master/docs/readthedocs.png
49 | :target: https://pysparkling.trivial.io
50 |
51 |
52 | Other links:
53 | `Github <https://github.com/svenkreiss/pysparkling>`_,
54 | |pypi-badge|, |test-badge|, |docs-badge|
55 |
56 | .. |pypi-badge| image:: https://badge.fury.io/py/pysparkling.svg
57 | :target: https://pypi.python.org/pypi/pysparkling/
58 | .. |test-badge| image:: https://github.com/svenkreiss/pysparkling/workflows/Tests/badge.svg
59 | :target: https://github.com/svenkreiss/pysparkling/actions?query=workflow%3ATests
60 | .. |docs-badge| image:: https://readthedocs.org/projects/pysparkling/badge/?version=latest
61 | :target: https://pysparkling.readthedocs.io/en/latest/?badge=latest
62 | :alt: Documentation Status
63 |
64 |
65 | Features
66 | ========
67 |
68 | * Supports URI schemes ``s3://``, ``hdfs://``, ``gs://``, ``http://`` and ``file://``
69 | for Amazon S3, HDFS, Google Storage, web and local file access.
70 | Specify multiple files separated by comma.
71 | Resolves ``*`` and ``?`` wildcards.
72 | * Handles ``.gz``, ``.zip``, ``.lzma``, ``.xz``, ``.bz2``, ``.tar``,
73 | ``.tar.gz`` and ``.tar.bz2`` compressed files.
74 | Supports reading of ``.7z`` files.
75 | * Parallelization via ``multiprocessing.Pool``,
76 | ``concurrent.futures.ThreadPoolExecutor`` or any other Pool-like
77 | object that has a ``map(func, iterable)`` method (see the sketch below).
78 | * Plain pysparkling does not have any dependencies (use ``pip install pysparkling``).
79 | Some file access methods have optional dependencies:
80 | ``boto`` for AWS S3, ``requests`` for http, ``hdfs`` for hdfs
81 |
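A minimal sketch of plugging in a pool (this assumes ``Context`` accepts a
Pool-like object via a ``pool`` keyword argument; the exact signature may
differ, see ``docs/sphinx/parallel.rst``):

.. code-block:: python

    import concurrent.futures

    import pysparkling

    # any object with a map(func, iterable) method can be passed in here
    sc = pysparkling.Context(pool=concurrent.futures.ThreadPoolExecutor(4))
    print(sc.parallelize(range(10)).map(lambda x: x ** 2).collect())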
82 |
83 | Examples
84 | ========
85 |
86 | Some demos are in the notebooks
87 | `docs/demo.ipynb <https://github.com/svenkreiss/pysparkling/blob/master/docs/demo.ipynb>`_
88 | and
89 | `docs/iris.ipynb <https://github.com/svenkreiss/pysparkling/blob/master/docs/iris.ipynb>`_
90 | .
91 |
92 | **Word Count**
93 |
94 | .. code-block:: python
95 |
96 | from pysparkling import Context
97 |
98 | counts = (
99 | Context()
100 | .textFile('README.rst')
101 | .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line))
102 | .flatMap(lambda line: line.split(' '))
103 | .map(lambda word: (word, 1))
104 | .reduceByKey(lambda a, b: a + b)
105 | )
106 | print(counts.collect())
107 |
108 | which prints a long list of pairs of words and their counts.
109 |
--------------------------------------------------------------------------------
/scripts/tcpperf_server.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import json
3 | import logging
4 | import math
5 | import os
6 | import struct
7 | import time
8 |
9 | import pysparkling
10 |
11 | N_CONNECTIONS = (100, 1000, 2000, 3000, 3500, 4000, 4500, 5000,
12 | 6000, 7000, 8000)
13 | N_CONNECTIONS_1K = (10, 20, 30, 40, 45, 50, 60, 70, 80, 90, 100)
14 |
15 |
16 | class Server:
17 | def __init__(self, pause=60, values=1, start_port=8123, processes=2):
18 | self.pause = pause
19 | self.values = values
20 | self.port = start_port
21 | self.processes = processes
22 |
23 | def client(self, n=2000, format_='hello'):
24 | for _ in range(self.processes):
25 | os.system(
26 | f'python tests/tcpperf_client.py -n {int(n / self.processes)}'
27 | f' --port {self.port} --format {format_} --values {self.values}'
28 | f' &'
29 | )
30 |
31 | def _run_process(self, n, to_kv, format_):
32 | c = pysparkling.Context()
33 | stream_c = pysparkling.streaming.StreamingContext(c, 1.0)
34 |
35 | counts = []
36 | sensor_sums = defaultdict(float)
37 | sensor_squares = defaultdict(float)
38 | sensor_counts = defaultdict(int)
39 | if format_ not in ('bello', 'struct'):
40 | t = stream_c.socketTextStream('localhost', self.port)
41 | else:
42 | length = {'bello': 5, 'struct': 8}[format_]
43 | t = stream_c.socketBinaryStream('localhost', self.port, length)
44 | t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0]))
45 | if to_kv is not None:
46 | def update(rdd):
47 | for k, v in rdd.collect():
48 | sensor_sums[k] += sum(v)
49 | sensor_squares[k] += sum(vv ** 2 for vv in v)
50 | sensor_counts[k] += len(v)
51 |
52 | t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd))
53 |
54 | self.client(n, format_=format_)
55 |
56 | stream_c.start()
57 | stream_c.awaitTermination(timeout=5.0)
58 |
59 | return (
60 | counts,
61 | sensor_sums,
62 | sensor_squares,
63 | sensor_counts
64 | )
65 |
66 | def run(self, n=2000, to_kv=None, format_='hello'):
67 | counts, sensor_sums, sensor_squares, sensor_counts = self._run_process(n, to_kv, format_)
68 |
69 | result = max(counts) if counts else 0
70 | sensor_expectations = {
71 | # expectation of X and X^2
72 | k: (sensor_sums[k] / v, sensor_squares[k] / v)
73 | for k, v in sensor_counts.items()
74 | }
75 | sensors = {
76 | k: (ex_ex2[0], math.sqrt(ex_ex2[1] - ex_ex2[0] ** 2))
77 | for k, ex_ex2 in sensor_expectations.items()
78 | }
79 | print(f'run: n = {n}, counts = {counts}, result = {result}')
80 | print(f'sensors = {sensors}')
81 | time.sleep(self.pause)
82 | self.port += 1
83 | return result
84 |
85 |
86 | def main():
87 | logging.basicConfig(level=logging.WARNING)
88 |
89 | def kv_from_text(text):
90 | k, _, v = text.partition('|')
91 | return k, float(v)
92 |
93 | def kv_from_json(text):
94 | j = json.loads(text)
95 | return list(j.items())[0]
96 |
97 | def kv_from_struct(b):
98 | s, v = struct.unpack('If', b)
99 | return f'sensor{s}', v
100 |
101 | with open('tests/tcpperf_messages.csv', 'w', encoding='utf8') as f:
102 | f.write('# messages, hello, text, json, bello, struct\n')
103 | server_1k = Server(pause=2, values=1000, processes=5)
104 | for n in reversed(N_CONNECTIONS_1K):
105 | data = (
106 | n * 1000,
107 | server_1k.run(n),
108 | server_1k.run(n, None, 'bello'),
109 | server_1k.run(n, kv_from_text, 'text'),
110 | server_1k.run(n, kv_from_json, 'json'),
111 | server_1k.run(n, kv_from_struct, 'struct'),
112 | )
113 | f.write(', '.join(f'{d}' for d in data) + '\n')
114 |
115 | with open('tests/tcpperf_connections.csv', 'w', encoding='utf8') as f:
116 | f.write('# messages, hello, text, json, bello, struct\n')
117 | server = Server()
118 | for n in reversed(N_CONNECTIONS):
119 | data = (
120 | n,
121 | server.run(n),
122 | server.run(n, None, 'bello'),
123 | server.run(n, kv_from_text, 'text'),
124 | server.run(n, kv_from_json, 'json'),
125 | server.run(n, kv_from_struct, 'struct'),
126 | )
127 | f.write(', '.join(f'{d}' for d in data) + '\n')
128 |
129 |
130 | if __name__ == '__main__':
131 | main()
132 |
--------------------------------------------------------------------------------
/pysparkling/sql/expressions/aggregate/collectors.py:
--------------------------------------------------------------------------------
1 | from .aggregations import Aggregation
2 |
3 |
4 | class CollectList(Aggregation):
5 | pretty_name = "collect_list"
6 |
7 | def __init__(self, column):
8 | super().__init__(column)
9 | self.column = column
10 | self.items = []
11 |
12 | def merge(self, row, schema):
13 | self.items.append(self.column.eval(row, schema))
14 |
15 | def mergeStats(self, other, schema):
16 | self.items += other.items
17 |
18 | def eval(self, row, schema):
19 | return self.items
20 |
21 | def args(self):
22 | return (self.column,)
23 |
24 |
25 | class CollectSet(Aggregation):
26 | pretty_name = "collect_set"
27 |
28 | def __init__(self, column):
29 | super().__init__(column)
30 | self.column = column
31 | self.items = set()
32 |
33 | def merge(self, row, schema):
34 | self.items.add(self.column.eval(row, schema))
35 |
36 | def mergeStats(self, other, schema):
37 | self.items |= other.items
38 |
39 | def eval(self, row, schema):
40 | return list(self.items)
41 |
42 | def args(self):
43 | return (self.column,)
44 |
45 |
46 | class SumDistinct(Aggregation):
47 | pretty_name = "sum_distinct"
48 |
49 | def __init__(self, column):
50 | super().__init__(column)
51 | self.column = column
52 | self.items = set()
53 |
54 | def merge(self, row, schema):
55 | self.items.add(self.column.eval(row, schema))
56 |
57 | def mergeStats(self, other, schema):
58 | self.items |= other.items
59 |
60 | def eval(self, row, schema):
61 | return sum(self.items)
62 |
63 | def args(self):
64 | return (self.column,)
65 |
66 |
67 | class First(Aggregation):
68 | pretty_name = "first"
69 | _sentinel = object()
70 |
71 | def __init__(self, column, ignore_nulls):
72 | super().__init__(column)
73 | self.column = column
74 | self.value = self._sentinel
75 | self.ignore_nulls = ignore_nulls.get_literal_value()
76 |
77 | def merge(self, row, schema):
78 | if self.value is First._sentinel or (self.ignore_nulls and self.value is None):
79 | self.value = self.column.eval(row, schema)
80 |
81 | def mergeStats(self, other, schema):
82 | if self.value is First._sentinel or (self.ignore_nulls and self.value is None):
83 | self.value = other.value
84 |
85 | def eval(self, row, schema):
86 | return self.value if self.value is not First._sentinel else None
87 |
88 | def args(self):
89 | return (
90 | self.column,
91 | str(self.ignore_nulls).lower()
92 | )
93 |
94 |
95 | class Last(Aggregation):
96 | pretty_name = "last"
97 | _sentinel = object()
98 |
99 | def __init__(self, column, ignore_nulls):
100 | super().__init__(column)
101 | self.column = column
102 | self.value = None
103 | self.ignore_nulls = ignore_nulls.get_literal_value()
104 |
105 | def merge(self, row, schema):
106 | new_value = self.column.eval(row, schema)
107 | if not (self.ignore_nulls and new_value is None):
108 | self.value = new_value
109 |
110 | def mergeStats(self, other, schema):
111 | if not (self.ignore_nulls and other.value is None):
112 | self.value = other.value
113 |
114 | def eval(self, row, schema):
115 | return self.value
116 |
117 | def args(self):
118 | return (
119 | self.column,
120 | str(self.ignore_nulls).lower()
121 | )
122 |
123 |
124 | class CountDistinct(Aggregation):
125 | pretty_name = "count"
126 |
127 | def __init__(self, columns):
128 | super().__init__(columns)
129 | self.columns = columns
130 | self.items = set()
131 |
132 | def merge(self, row, schema):
133 | self.items.add(tuple(
134 | col.eval(row, schema) for col in self.columns
135 | ))
136 |
137 | def mergeStats(self, other, schema):
138 | self.items |= other.items
139 |
140 | def eval(self, row, schema):
141 | return len(self.items)
142 |
143 | def args(self):
144 | return f"DISTINCT {','.join(self.columns)}"
145 |
146 |
147 | class ApproxCountDistinct(Aggregation):
148 | pretty_name = "approx_count_distinct"
149 |
150 | def __init__(self, column):
151 | super().__init__(column)
152 | self.column = column
153 | self.items = set()
154 |
155 | def merge(self, row, schema):
156 | self.items.add(self.column.eval(row, schema))
157 |
158 | def mergeStats(self, other, schema):
159 | self.items |= other.items
160 |
161 | def eval(self, row, schema):
162 | return len(self.items)
163 |
164 | def args(self):
165 | return (self.column,)
166 |
167 |
168 | __all__ = [
169 | "SumDistinct", "ApproxCountDistinct", "CollectList", "CollectSet",
170 | "First", "CountDistinct", "Last"
171 | ]
172 |
--------------------------------------------------------------------------------
/pysparkling/sql/internal_utils/readers/jsonreader.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import itertools
3 | import json
4 |
5 | from ...casts import get_struct_caster
6 | from ...internal_utils.options import Options
7 | from ...internal_utils.readers.utils import get_records, resolve_partitions
8 | from ...schema_utils import infer_schema_from_rdd
9 | from ...types import create_row, row_from_keyed_values, StructType
10 |
11 |
12 | class JSONReader:
13 | default_options = dict(
14 | primitivesAsString=False,
15 | prefersDecimal=False,
16 | allowComments=False,
17 | allowUnquotedFieldNames=False,
18 | allowSingleQuotes=True,
19 | allowNumericLeadingZero=False,
20 | allowBackslashEscapingAnyCharacter=False,
21 | mode="PERMISSIVE",
22 | columnNameOfCorruptRecord="",
23 | dateFormat="yyyy-MM-dd",
24 | timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSXXX",
25 | multiLine=False,
26 | allowUnquotedControlChars=False,
27 | encoding=None,
28 | lineSep=None,
29 | samplingRatio=1.0,
30 | dropFieldIfAllNull=False,
31 | locale="en-US",
32 | )
33 |
34 | def __init__(self, spark, paths, schema, options):
35 | self.spark = spark
36 | self.paths = paths
37 | self.schema = schema
38 | self.options = Options(self.default_options, options)
39 |
40 | def read(self):
41 | sc = self.spark._sc
42 | paths = self.paths
43 |
44 | partitions, partition_schema = resolve_partitions(paths)
45 |
46 | rdd_filenames = sc.parallelize(sorted(partitions.keys()), len(partitions))
47 | rdd = rdd_filenames.flatMap(partial(
48 | parse_json_file,
49 | partitions,
50 | partition_schema,
51 | self.schema,
52 | self.options
53 | ))
54 |
55 | inferred_schema = infer_schema_from_rdd(rdd)
56 |
57 | schema = self.schema if self.schema is not None else inferred_schema
58 | schema_fields = {
59 | field.name: field
60 | for field in schema.fields
61 | }
62 |
63 | # Field order is defined by fields in the record, not by the given schema
64 | # Field type is defined by the given schema or inferred
65 | full_schema = StructType(
66 | fields=[
67 | schema_fields.get(field.name, field)
68 | for field in inferred_schema.fields
69 | ]
70 | )
71 |
72 | cast_row = get_struct_caster(inferred_schema, full_schema, options=self.options)
73 | casted_rdd = rdd.map(cast_row)
74 | casted_rdd._name = paths
75 |
76 | # pylint: disable=import-outside-toplevel, cyclic-import
77 | from ...internals import DataFrameInternal
78 |
79 | return DataFrameInternal(
80 | sc,
81 | casted_rdd,
82 | schema=full_schema
83 | )
84 |
85 |
86 | def parse_json_file(partitions, partition_schema, schema, options, file_name):
87 | records = get_records(file_name, options.linesep, options.encoding)
88 | rows = []
89 | for record in records:
90 | partition = partitions[file_name]
91 | row = parse_record(record, schema, partition, partition_schema, options)
92 | row.set_input_file_name(file_name)
93 | rows.append(row)
94 | return rows
95 |
96 |
97 | def parse_record(record, schema, partition, partition_schema, options):
98 | raw_record_value = json.loads(record, encoding=options.encoding)
99 | if not isinstance(raw_record_value, dict):
100 | raise NotImplementedError(
101 | "Top level items should be JSON objects (dicts),"
102 | f" got {type(raw_record_value)} with {raw_record_value}"
103 | )
104 | record_value = decode_record(raw_record_value)
105 | if schema is not None:
106 | record_fields = record_value.__fields__
107 | available_names = tuple(partition_schema.names) + record_fields
108 | field_names = [name for name in record_fields if name in schema.names] + [
109 | f.name for f in schema.fields if f.name not in available_names
110 | ]
111 | else:
112 | field_names = list(record_value.__fields__)
113 | record_values = [
114 | record_value[field_name] if field_name in record_value.__fields__ else None
115 | for field_name in field_names
116 | ]
117 | partition_field_names = [f.name for f in partition_schema.fields] if partition_schema else []
118 | # pylint: disable=W0511
119 | # todo: handle nested rows
120 | row = create_row(
121 | itertools.chain(field_names, partition_field_names),
122 | itertools.chain(record_values, partition)
123 | )
124 | return row
125 |
126 |
127 | def decode_record(item):
128 | if isinstance(item, list):
129 | return [decode_record(e) for e in item]
130 | if isinstance(item, dict):
131 | return row_from_keyed_values(
132 | (key, decode_record(value))
133 | for key, value in item.items()
134 | )
135 | return item
136 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/test_session.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | import pytest
4 |
5 | from pysparkling import Context, StorageLevel
6 | from pysparkling.sql.session import SparkSession
7 | from pysparkling.sql.types import (
8 | ArrayType, DoubleType, IntegerType, LongType, MapType, Row, row_from_keyed_values, StringType, StructField,
9 | StructType
10 | )
11 | from pysparkling.sql.utils import require_minimum_pandas_version
12 |
13 | try:
14 | require_minimum_pandas_version()
15 | has_pandas = True
16 | except ImportError:
17 | has_pandas = False
18 |
19 |
20 | class SessionTests(TestCase):
21 | spark = SparkSession(sparkContext=Context())
22 |
23 | def test_session_range(self):
24 | df = self.spark.range(3)
25 | self.assertEqual(df.count(), 3)
26 | self.assertListEqual(df.collect(), [Row(id=0), Row(id=1), Row(id=2)])
27 | self.assertEqual(list(df.toLocalIterator()), [Row(id=0), Row(id=1), Row(id=2)])
28 |
29 | def test_session_create_data_frame_from_rdd(self):
30 | df = self.spark.createDataFrame(self.spark.sparkContext.parallelize([
31 | (1, "one"),
32 | (2, "two"),
33 | (3, "three"),
34 | ]))
35 | self.assertEqual(df.count(), 3)
36 | self.assertListEqual(
37 | df.collect(),
38 | [Row(_1=1, _2='one'),
39 | Row(_1=2, _2='two'),
40 | Row(_1=3, _2='three')])
41 | self.assertEqual(
42 | df.schema,
43 | StructType([StructField("_1", LongType(), True), StructField("_2", StringType(), True)])
44 | )
45 |
46 | def test_session_create_data_frame_from_list(self):
47 | df = self.spark.createDataFrame([
48 | (1, "one"),
49 | (2, "two"),
50 | (3, "three"),
51 | ])
52 | self.assertEqual(df.count(), 3)
53 | self.assertListEqual(
54 | df.collect(),
55 | [Row(_1=1, _2='one'),
56 | Row(_1=2, _2='two'),
57 | Row(_1=3, _2='three')])
58 | self.assertEqual(
59 | df.schema,
60 | StructType([StructField("_1", LongType(), True), StructField("_2", StringType(), True)])
61 | )
62 |
63 | @pytest.mark.skipif(not has_pandas, reason='pandas is not installed')
64 | def test_session_create_data_frame_from_pandas_data_frame(self):
65 | try:
66 | # Pandas is an optional dependency
67 | # pylint: disable=import-outside-toplevel
68 | import pandas as pd
69 | except ImportError as e:
70 | raise ImportError("pandas is not importable") from e
71 |
72 | pdf = pd.DataFrame([
73 | (1, "one"),
74 | (2, "two"),
75 | (3, "three")
76 | ])
77 |
78 | df = self.spark.createDataFrame(pdf)
79 |
80 | self.assertEqual(df.count(), 3)
81 | self.assertListEqual(
82 | df.collect(),
83 | [Row(**{"0": 1, "1": 'one'}),
84 | Row(**{"0": 2, "1": 'two'}),
85 | Row(**{"0": 3, "2": 'three'})])
86 | self.assertEqual(
87 | df.schema,
88 | StructType([StructField("0", LongType(), True), StructField("1", StringType(), True)])
89 | )
90 |
91 | def test_session_create_data_frame_from_list_with_col_names(self):
92 | df = self.spark.createDataFrame([(0.0, [1.0, 0.8]),
93 | (1.0, [0.0, 0.0]),
94 | (2.0, [0.5, 0.5])],
95 | ["label", "features"])
96 | self.assertEqual(df.count(), 3)
97 | self.assertListEqual(
98 | df.collect(),
99 | [
100 | row_from_keyed_values([("label", 0.0), ("features", [1.0, 0.8])]),
101 | row_from_keyed_values([("label", 1.0), ("features", [0.0, 0.0])]),
102 | row_from_keyed_values([("label", 2.0), ("features", [0.5, 0.5])]),
103 | ]
104 | )
105 |
106 | self.assertEqual(
107 | df.schema,
108 | StructType([
109 | StructField("label", DoubleType(), True),
110 | StructField("features", ArrayType(DoubleType(), True), True)
111 | ])
112 | )
113 |
114 | def test_session_create_data_frame_from_list_with_schema(self):
115 | schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
116 | df = self.spark.createDataFrame([({'a': 1},)], schema=schema)
117 | self.assertEqual(df.count(), 1)
118 | self.assertListEqual(
119 | df.collect(),
120 | [Row(map={'a': 1})]
121 | )
122 | self.assertEqual(df.schema, schema)
123 |
124 | def test_session_storage_level(self):
125 | spark = SparkSession(Context())
126 | df = spark.range(4, numPartitions=2)
127 | self.assertEqual(repr(df.storageLevel), repr(StorageLevel(False, False, False, False, 1)))
128 | persisted_df = df.persist()
129 | self.assertEqual(persisted_df.is_cached, True)
130 | self.assertEqual(repr(persisted_df.storageLevel), repr(StorageLevel.MEMORY_ONLY))
131 |
--------------------------------------------------------------------------------
/pysparkling/sql/tests/test_write.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import shutil
4 | from unittest import TestCase
5 |
6 | from dateutil.tz import tzlocal
7 |
8 | from pysparkling import Context, Row
9 | from pysparkling.sql.session import SparkSession
10 | from pysparkling.sql.utils import AnalysisException
11 |
12 | spark = SparkSession(Context())
13 |
14 |
15 | def get_folder_content(folder_path):
16 | folder_content = {}
17 | for root, _, files in os.walk(folder_path):
18 | relative_path = root[len(folder_path):]
19 | for file in files:
20 | file_path = os.path.join(root, file)
21 | with open(file_path, 'r', encoding='utf8') as file_content:
22 | folder_content[os.path.join(relative_path, file)] = file_content.readlines()
23 | return folder_content
24 |
25 |
26 | class DataFrameWriterTests(TestCase):
27 | maxDiff = None
28 |
29 | @staticmethod
30 | def clean():
31 | if os.path.exists(".tmp"):
32 | shutil.rmtree(".tmp")
33 |
34 | def setUp(self):
35 | self.clean()
36 |
37 | tz = datetime.datetime.now().astimezone().strftime('%z') # +0100
38 | self.tz = f'{tz[:3]}:{tz[3:]}' # --> +01:00
39 |
40 | def tearDown(self):
41 | self.clean()
42 |
43 | def test_write_to_csv(self):
44 | df = spark.createDataFrame(
45 | [Row(age=2, name='Alice', time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()), ),
46 | Row(age=5, name='Bob', time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()))]
47 | )
48 | df.write.csv(".tmp/wonderland/")
49 | self.assertDictEqual(
50 | get_folder_content(".tmp/wonderland"),
51 | {
52 | '_SUCCESS': [],
53 | 'part-00000-8447389540241120843.csv': [
54 | f'2,Alice,2017-01-01T00:00:00.000{self.tz}\n',
55 | f'5,Bob,2014-03-02T00:00:00.000{self.tz}\n'
56 | ]
57 | }
58 | )
59 |
60 | def test_write_to_csv_with_custom_options(self):
61 | df = spark.createDataFrame(
62 | [
63 | Row(age=2, name='Alice', occupation=None),
64 | Row(age=5, name='Bob', occupation=""),
65 | ]
66 | )
67 | df.write.csv(".tmp/wonderland/", sep="^", emptyValue="", nullValue="null", header=True)
68 | self.assertDictEqual(
69 | get_folder_content(".tmp/wonderland"),
70 | {
71 | '_SUCCESS': [],
72 | 'part-00000-4061950540148431296.csv': [
73 | 'age^name^occupation\n',
74 | '2^Alice^null\n',
75 | '5^Bob^\n',
76 | ],
77 | }
78 | )
79 |
80 | def test_write_to_csv_fail_when_overwrite(self):
81 | df = spark.createDataFrame(
82 | [Row(age=2, name='Alice'),
83 | Row(age=5, name='Bob')]
84 | )
85 | df.write.csv(".tmp/wonderland/")
86 | with self.assertRaises(AnalysisException) as ctx:
87 | df.write.csv(".tmp/wonderland/")
88 | self.assertEqual(ctx.exception.args[0], 'path .tmp/wonderland already exists.;')
89 | self.assertDictEqual(
90 | get_folder_content(".tmp/wonderland"),
91 | {
92 | '_SUCCESS': [],
93 | 'part-00000-3434325560268771971.csv': [
94 | '2,Alice\n',
95 | '5,Bob\n',
96 | ],
97 | }
98 | )
99 |
100 | def test_write_to_json(self):
101 | df = spark.createDataFrame(
102 | [Row(age=2, name='Alice', time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()), ),
103 | Row(age=5, name='Bob', time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()))]
104 | )
105 | df.write.json(".tmp/wonderland/")
106 | self.assertDictEqual(
107 | get_folder_content(".tmp/wonderland"),
108 | {
109 | '_SUCCESS': [],
110 | 'part-00000-8447389540241120843.json': [
111 | f'{{"age":2,"name":"Alice","time":"2017-01-01T00:00:00.000{self.tz}"}}\n',
112 | f'{{"age":5,"name":"Bob","time":"2014-03-02T00:00:00.000{self.tz}"}}\n',
113 | ],
114 | }
115 | )
116 |
117 | def test_write_nested_rows_to_json(self):
118 | df = spark.createDataFrame([
119 | Row(age=2, name='Alice', animals=[
120 | Row(name="Chessur", type="cat"),
121 | Row(name="The White Rabbit", type="Rabbit")]),
122 | Row(age=5, name='Bob', animals=[])
123 | ])
124 | df.write.json(".tmp/wonderland/")
125 | self.assertDictEqual(
126 | get_folder_content(".tmp/wonderland"),
127 | {
128 | '_SUCCESS': [],
129 | 'part-00000-2819354714706678872.json': [
130 | '{"age":2,"animals":['
131 | '{"name":"Chessur","type":"cat"},'
132 | '{"name":"The White Rabbit","type":"Rabbit"}'
133 | '],"name":"Alice"}\n',
134 | '{"age":5,"animals":[],"name":"Bob"}\n',
135 | ],
136 | }
137 | )
138 |
--------------------------------------------------------------------------------
/pysparkling/tests/test_resolve_filenames.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 |
5 | from pysparkling.fileio import File
6 |
7 | CURRENT_FILE_LOCATION = __file__
8 |
9 |
10 | class MockedHdfsClient:
11 | def list(self, path, status):
12 | if path == "/user/username/":
13 | return [
14 | ("input", {"type": "DIRECTORY"}),
15 | ("output", {"type": "DIRECTORY"})
16 | ]
17 | if path in ('/user/username/input', '/user/username/input/'):
18 | return [
19 | ("part-00001.gz", {"type": "FILE"}),
20 | ("part-00002.gz", {"type": "FILE"}),
21 | ("_SUCCESS", {"type": "FILE"})
22 | ]
23 | raise NotImplementedError(f"Return value not mocked for '{path}'")
24 |
25 |
26 | class MockedS3Bucket:
27 | def list(self, *args, **kwargs):
28 | return [
29 | MockedS3Key("user/username/input/part-00001.gz"),
30 | MockedS3Key("user/username/input/part-00002.gz"),
31 | MockedS3Key("user/username/input/_SUCCESS"),
32 | ]
33 |
34 |
35 | class MockedS3Connection:
36 | def get_bucket(self, *args, **kwargs):
37 | return MockedS3Bucket()
38 |
39 |
40 | class MockedS3Key:
41 | def __init__(self, name):
42 | self.name = name
43 |
44 |
45 | def test_local_1():
46 | filenames = File.resolve_filenames(
47 | f'{os.path.dirname(CURRENT_FILE_LOCATION)}{os.path.sep}*'
48 | )
49 | assert CURRENT_FILE_LOCATION in filenames
50 |
51 |
52 | def test_local_2():
53 | filenames = File.resolve_filenames(CURRENT_FILE_LOCATION)
54 | assert filenames == [CURRENT_FILE_LOCATION]
55 |
56 |
57 | @pytest.mark.skipif(not os.getenv('AWS_ACCESS_KEY_ID'), reason='no AWS env')
58 | def test_s3_1():
59 | filenames = File.resolve_filenames(
60 | 's3n://aws-publicdatasets/common-crawl/'
61 | 'crawl-data/CC-MAIN-2015-11/warc.paths.*'
62 | )
63 | print(filenames)
64 | assert ('s3n://aws-publicdatasets/common-crawl/'
65 | 'crawl-data/CC-MAIN-2015-11/warc.paths.gz' in filenames)
66 |
67 |
68 | def test_hdfs_resolve_filenames_with_wildcard():
69 | # hdfs is an optional dependency
70 | # pylint: disable=import-outside-toplevel
71 | from pysparkling.fileio.fs import Hdfs
72 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
73 |
74 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/part-*.gz")
75 | print(filenames)
76 | assert filenames == [
77 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz',
78 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz'
79 | ]
80 |
81 |
82 | def test_hdfs_resolve_filenames_with_folder_path():
83 | # hdfs is an optional dependency
84 | # pylint: disable=import-outside-toplevel
85 | from pysparkling.fileio.fs import Hdfs
86 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
87 |
88 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input")
89 | print(filenames)
90 | assert filenames == [
91 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz',
92 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz'
93 | ]
94 |
95 |
96 | def test_hdfs_resolve_filenames_with_folder_path_and_trailing_slash():
97 | # hdfs is an optional dependency
98 | # pylint: disable=import-outside-toplevel
99 | from pysparkling.fileio.fs import Hdfs
100 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
101 |
102 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/")
103 | print(filenames)
104 | assert filenames == [
105 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz',
106 | 'hdfs://hdfs-cluster.com/user/username/input/part-00002.gz'
107 | ]
108 |
109 |
110 | def test_hdfs_resolve_filenames_with_file_path():
111 | # hdfs is an optional dependency
112 | # pylint: disable=import-outside-toplevel
113 | from pysparkling.fileio.fs import Hdfs
114 | Hdfs.client_and_path = staticmethod(lambda *args, **kwargs: (MockedHdfsClient(), "unused_path"))
115 |
116 | filenames = Hdfs.resolve_filenames("hdfs://hdfs-cluster.com/user/username/input/part-00001.gz")
117 | print(filenames)
118 | assert filenames == [
119 | 'hdfs://hdfs-cluster.com/user/username/input/part-00001.gz'
120 | ]
121 |
122 |
123 | def test_s3_resolve_filenames():
124 | # boto is an optional dependency
125 | # pylint: disable=import-outside-toplevel
126 | from pysparkling.fileio.fs import S3
127 | S3._get_conn = classmethod(lambda *args, **kwargs: MockedS3Connection())
128 |
129 | filenames = S3.resolve_filenames("s3://bucket-name/user/username/input/part-*.gz")
130 | print(filenames)
131 | assert filenames == [
132 | 's3://bucket-name/user/username/input/part-00001.gz',
133 | 's3://bucket-name/user/username/input/part-00002.gz'
134 | ]
135 |
136 |
137 | if __name__ == '__main__':
138 | test_local_1()
139 | test_local_2()
140 | test_s3_1()
141 | test_hdfs_resolve_filenames_with_folder_path()
142 | test_hdfs_resolve_filenames_with_folder_path_and_trailing_slash()
143 | test_hdfs_resolve_filenames_with_file_path()
144 | test_hdfs_resolve_filenames_with_wildcard()
145 | test_s3_resolve_filenames()
146 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/gs.py:
--------------------------------------------------------------------------------
1 | from fnmatch import fnmatch
2 | from io import BytesIO, StringIO
3 | import logging
4 |
5 | from ...exceptions import FileSystemNotSupported
6 | from ...utils import parse_file_uri, Tokenizer
7 | from .file_system import FileSystem
8 |
9 | log = logging.getLogger(__name__)
10 |
11 | try:
12 | from gcloud import storage
13 | except ImportError:
14 | storage = None
15 |
16 |
17 | class GS(FileSystem):
18 | """:class:`.FileSystem` implementation for Google Storage.
19 |
20 | Paths are of the form `gs://bucket_name/file_path` or
21 | `gs://project_name:bucket_name/file_path`.
22 | """
23 |
24 | #: Set a default project name.
25 | project_name = None
26 |
27 | #: Default mime type.
28 | mime_type = 'text/plain'
29 |
30 | _clients = {}
31 |
32 | def __init__(self, file_name):
33 | if storage is None:
34 | raise FileSystemNotSupported(
35 | 'Google Storage is not supported. Install "gcloud".'
36 | )
37 |
38 | super().__init__(file_name)
39 |
40 | # obtain key
41 | t = Tokenizer(self.file_name)
42 | t.get_next('://') # skip scheme
43 | bucket_name = t.get_next('/')
44 | if ':' in bucket_name:
45 | project_name, _, bucket_name = bucket_name.partition(':')
46 | else:
47 | project_name = GS.project_name
48 | blob_name = t.get_next()
49 |
50 | client = GS._get_client(project_name)
51 | bucket = client.get_bucket(bucket_name)
52 | self.blob = bucket.get_blob(blob_name)
53 | if not self.blob:
54 | self.blob = bucket.blob(blob_name)
55 |
56 | @staticmethod
57 | def _get_client(project_name):
58 | if project_name not in GS._clients:
59 | if storage is None:
60 | raise FileSystemNotSupported(
61 | 'Google Storage is not supported. Install "gcloud".'
62 | )
63 | GS._clients[project_name] = storage.Client(project_name)
64 | return GS._clients[project_name]
65 |
66 | @staticmethod
67 | def resolve_filenames(expr):
68 | files = []
69 |
70 | t = Tokenizer(expr)
71 | scheme = t.get_next('://')
72 | bucket_name = t.get_next('/')
73 | if ':' in bucket_name:
74 | project_name, _, bucket_name = bucket_name.partition(':')
75 | else:
76 | project_name = GS.project_name
77 | prefix = t.get_next(['*', '?'])
78 |
79 | bucket = GS._get_client(project_name).get_bucket(bucket_name)
80 | expr_s = len(scheme) + 3 + len(project_name) + 1 + len(bucket_name) + 1
81 | expr = expr[expr_s:]
82 | for k in bucket.list_blobs(prefix=prefix):
83 | if fnmatch(k.name, expr) or fnmatch(k.name, expr + '/part*'):
84 | files.append(f'{scheme}://{project_name}:{bucket_name}/{k.name}')
85 | return files
86 |
87 | @staticmethod
88 | def resolve_content(expr):
89 | scheme, raw_bucket_name, folder_path, pattern = parse_file_uri(expr)
90 |
91 | if ':' in raw_bucket_name:
92 | project_name, _, bucket_name = raw_bucket_name.partition(':')
93 | else:
94 | project_name = GS.project_name
95 | bucket_name = raw_bucket_name
96 |
97 | folder_path = folder_path[1:] # Remove leading slash
98 |
99 | expr = f"{folder_path}{pattern}"
100 | # Match all files inside folders that match expr
101 | pattern_expr = f"{expr}{'' if expr.endswith('/') else '/'}*"
102 |
103 | bucket = GS._get_client(project_name).get_bucket(bucket_name)
104 |
105 | files = []
106 | for k in bucket.list_blobs(prefix=folder_path):
107 | if not k.name.endswith("/") and (
108 | fnmatch(k.name, expr) or fnmatch(k.name, pattern_expr)
109 | ):
110 | files.append(
111 | f'{scheme}://{raw_bucket_name}/{k.name}'
112 | )
113 | return files
114 |
115 | def exists(self):
116 | t = Tokenizer(self.file_name)
117 | t.get_next('//') # skip scheme
118 | bucket_name = t.get_next('/')
119 | if ':' in bucket_name:
120 | project_name, _, bucket_name = bucket_name.partition(':')
121 | else:
122 | project_name = GS.project_name
123 | blob_name = t.get_next()
124 | bucket = GS._get_client(project_name).get_bucket(bucket_name)
125 | return (bucket.get_blob(blob_name)
126 | or list(bucket.list_blobs(prefix=f'{blob_name}/')))
127 |
128 | def load(self):
129 | log.debug('Loading %s with size %s.', self.blob.name, self.blob.size)
130 | return BytesIO(self.blob.download_as_string())
131 |
132 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
133 | log.debug('Loading %s with size %s.', self.blob.name, self.blob.size)
134 | return StringIO(
135 | self.blob.download_as_string().decode(
136 | encoding, encoding_errors
137 | )
138 | )
139 |
140 | def dump(self, stream):
141 | log.debug('Dumping to %s.', self.blob.name)
142 | self.blob.upload_from_string(stream.read(),
143 | content_type=self.mime_type)
144 | return self
145 |
146 | def make_public(self, recursive=False):
147 | self.blob.make_public(recursive)
148 | return self
149 |
--------------------------------------------------------------------------------
/pysparkling/accumulators.py:
--------------------------------------------------------------------------------
1 | # A large part of this module is extracted from its PySpark counterpart at
2 | # https://spark.apache.org/docs/1.5.0/api/python/_modules/pyspark/accumulators.html
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | """
21 | >>> from pysparkling import Context
22 | >>> sc = Context()
23 | >>> a = sc.accumulator(1)
24 | >>> a.value
25 | 1
26 | >>> a.value = 2
27 | >>> a.value
28 | 2
29 | >>> a += 5
30 | >>> a.value
31 | 7
32 |
33 | >>> sc.accumulator(1.0).value
34 | 1.0
35 |
36 | >>> sc.accumulator(1j).value
37 | 1j
38 |
39 | >>> rdd = sc.parallelize([1,2,3])
40 | >>> def f(x):
41 | ... global a
42 | ... a += x
43 | >>> rdd.foreach(f)
44 | >>> a.value
45 | 13
46 |
47 | >>> b = sc.accumulator(0)
48 | >>> def g(x):
49 | ... b.add(x)
50 | >>> rdd.foreach(g)
51 | >>> b.value
52 | 6
53 |
54 | >>> from pysparkling import AccumulatorParam
55 | >>> class VectorAccumulatorParam(AccumulatorParam):
56 | ... def zero(self, value):
57 | ... return [0.0] * len(value)
58 | ... def addInPlace(self, val1, val2):
59 | ... for i in range(len(val1)):
60 | ... val1[i] += val2[i]
61 | ... return val1
62 | >>> va = sc.accumulator([1.0, 2.0, 3.0], VectorAccumulatorParam())
63 | >>> va.value
64 | [1.0, 2.0, 3.0]
65 | >>> def g(x):
66 | ... global va
67 | ... va += [x] * 3
68 | >>> rdd.foreach(g)
69 | >>> va.value
70 | [7.0, 8.0, 9.0]
71 |
72 | >>> sc.accumulator([1.0, 2.0, 3.0]) # doctest: +IGNORE_EXCEPTION_DETAIL
73 | Traceback (most recent call last):
74 | ...
75 | TypeError: No default accumulator param for type
76 | """
77 |
78 |
79 | __all__ = ['Accumulator', 'AccumulatorParam']
80 |
81 |
82 | class Accumulator:
83 | """
84 | A shared variable that can be accumulated, i.e., has a commutative and associative "add"
85 | operation. Tasks can add values to an Accumulator with the ``+=`` operator.
86 |
87 | The API supports accumulators for primitive data types like ``int`` and
88 | ``float``; users can also define accumulators for custom types by providing a custom
89 | ``AccumulatorParam`` object. Refer to the doctest of this module for an example.
90 | """
91 |
92 | def __init__(self, value, accum_param):
93 | """Create a new Accumulator with a given initial value and AccumulatorParam object"""
94 | self.accum_param = accum_param
95 | self._value = value
96 |
97 | @property
98 | def value(self):
99 | return self._value
100 |
101 | @value.setter
102 | def value(self, value):
103 | self._value = value
104 |
105 | def add(self, term):
106 | """Adds a term to this accumulator's value"""
107 | self._value = self.accum_param.addInPlace(self._value, term)
108 |
109 | def __iadd__(self, term):
110 | """The += operator; adds a term to this accumulator's value"""
111 | self.add(term)
112 | return self
113 |
114 | def __str__(self):
115 | return str(self._value)
116 |
117 | def __repr__(self):
118 | return f"Accumulator<value={self._value}>"
119 |
120 |
121 | class AccumulatorParam:
122 | """
123 | Helper object that defines how to accumulate values of a given type.
124 | """
125 | def zero(self, value):
126 | """
127 | Provide a "zero value" for the type, compatible in dimensions with the
128 | provided ``value`` (e.g., a zero vector)
129 | """
130 | raise NotImplementedError
131 |
132 | def addInPlace(self, value1, value2):
133 | """
134 | Add two values of the accumulator's data type, returning a new value;
135 | for efficiency, can also update ``value1`` in place and return it.
136 | """
137 | raise NotImplementedError
138 |
139 |
140 | class AddingAccumulatorParam(AccumulatorParam):
141 | """
142 | An AccumulatorParam that uses the + operator to add values. Designed for simple types
143 | such as integers, floats, and lists. Requires the zero value for the underlying type
144 | as a parameter.
145 | """
146 | def __init__(self, zero_value):
147 | self.zero_value = zero_value
148 |
149 | def zero(self, value):
150 | return self.zero_value
151 |
152 | def addInPlace(self, value1, value2):
153 | value1 += value2
154 | return value1
155 |
156 |
157 | # Singleton accumulator params for some standard types
158 | INT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0)
159 | FLOAT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0)
160 | COMPLEX_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0j)
161 |
162 |
163 | if __name__ == "__main__":
164 | #
165 | # Execute doctests with
166 | #
167 | # $ python -m pysparkling.accumulators -v
168 | #
169 | import doctest
170 | import sys
171 |
172 | failure_count, _ = doctest.testmod()
173 | if failure_count:
174 | sys.exit(-1)
175 |
--------------------------------------------------------------------------------
/docs/sphinx/parallel.rst:
--------------------------------------------------------------------------------
1 | .. _parallel:
2 |
3 |
4 | Parallelization
5 | ===============
6 |
7 | Pysparkling supports parallelizations on the local machine and across clusters
8 | of computers.
9 |
10 |
11 | Processes and Threads
12 | ---------------------
13 |
14 | Single machine parallelization with
15 | ``concurrent.futures.ThreadPoolExecutor``,
16 | ``concurrent.futures.ProcessPoolExecutor`` or
17 | ``multiprocessing.Pool`` is supported. Use ``cloudpickle`` instead of ``pickle`` for
18 | serialization to support lambda functions (and more) for data transformations.
19 |
20 |
21 | .. code-block:: python
22 |
23 | import cloudpickle
24 | import concurrent.futures
25 | import pysparkling
26 |
27 | sc = pysparkling.Context(
28 | pool=concurrent.futures.ProcessPoolExecutor(4),
29 | serializer=cloudpickle.dumps,
30 | deserializer=cloudpickle.loads,
31 | )
32 |
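Once the context is configured, RDD operations dispatch their per-partition tasks to
the pool. Below is a minimal usage sketch, assuming the ``ProcessPoolExecutor`` setup
above; with process pools, guard the entry point with ``if __name__ == '__main__':``
on platforms that spawn rather than fork worker processes.

.. code-block:: python

    # lambdas are fine here because cloudpickle serializes the tasks
    rdd = sc.parallelize(range(10), 4)          # 4 partitions, one task each
    print(rdd.map(lambda x: x ** 2).collect())  # [0, 1, 4, 9, ..., 81]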
33 |
34 |
35 | Experimental
36 | ------------
37 |
38 | The following are experimental notes. Most of them do not yet include examples of how to
39 | use these techniques with pysparkling.
40 |
41 | ipcluster and IPython.parallel
42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
43 |
44 | Local test setup:
45 |
46 | .. code-block:: bash
47 |
48 | ipcluster start --n=2
49 |
50 | .. code-block:: python
51 |
52 | from IPython.parallel import Client
53 |
54 | c = Client()
55 | print(c[:].map(lambda _: 'hello world', range(2)).get())
56 |
57 | which should print ``['hello world', 'hello world']``.
58 |
59 | To run on a cluster, create a profile:
60 |
61 | .. code-block:: bash
62 |
63 | ipython profile create --parallel --profile=smallcluster
64 |
65 | # start controller:
66 | # Creates ~/.ipython/profile_smallcluster/security/ipcontroller-engine.json
67 | # which is used by the engines to identify the location of this controller.
68 | # This is the local-only IP address. Substitute it with the machine's IP
69 | # address so that the engines can find it.
70 | ipcontroller --ip=127.0.0.1 --port=7123 --profile=smallcluster
71 |
72 | # start engines (assuming they have access to the
73 | # ipcontroller-engine.json file)
74 | ipengine --profile=smallcluster
75 |
76 | Test it in Python:
77 |
78 | .. code-block:: python
79 |
80 | from IPython.parallel import Client
81 |
82 | c = Client(profile='smallcluster')
83 | print(c[:].map(lambda _: 'hello world', range(2)).get())
84 |
85 | If you don't want to start the engines manually, ``ipcluster`` comes with
86 | "Launchers" that can start them for you:
87 | https://ipython.org/ipython-doc/dev/parallel/parallel_process.html#using-ipcluster-in-ssh-mode
88 |
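For example (a sketch based on that documentation; the hostnames and engine counts
below are placeholders, not part of this project), the SSH launcher can be enabled in
the profile's ``ipcluster_config.py``:

.. code-block:: python

    # ~/.ipython/profile_smallcluster/ipcluster_config.py
    c = get_config()

    # launch engines over SSH instead of on the local machine
    c.IPClusterEngines.engine_launcher_class = 'SSHEngineSetLauncher'

    # map each remote host to the number of engines to start there
    c.SSHEngineSetLauncher.engines = {
        'worker1.example.com': 4,
        'worker2.example.com': 4,
    }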
89 |
90 | StarCluster
91 | ~~~~~~~~~~~
92 |
93 | Setting up StarCluster was an experiment. However, it does not integrate well
94 | with the rest of our EC2 infrastructure, so we switched to a Chef-based setup
95 | where we use ``ipcluster`` directly. A blocker was that the number of engines
96 | per node is not configurable, while many of our map jobs simply wait on external
97 | responses.
98 |
99 | Setup:
100 |
101 | .. code-block:: bash
102 |
103 | # install
104 | pip install starcluster
105 |
106 | # create configuration
107 | starcluster help # choose the option to create a sample config file
108 |
109 | # add your user id, aws_access_key_id and aws_secret_access_key to config
110 |
111 | # create an ssh key (this creates a new key just for starcluster)
112 | # and registers it with AWS
113 | starcluster createkey starclusterkey -o ~/.ssh/starclusterkey.rsa
114 |
115 | # add this key to config:
116 | [key starclusterkey]
117 | KEY_LOCATION=~/.ssh/starclusterkey.rsa
118 | # and use this key in the cluster setup:
119 | KEYNAME = starclusterkey
120 |
121 | # disable the queue, Sun Grid Engine
122 | # (unnecessary for pysparkling and takes time during setup)
123 | DISABLE_QUEUE=True
124 |
125 | # to enable IPython parallel support, uncomment these lines in config:
126 | [plugin ipcluster]
127 | SETUP_CLASS = starcluster.plugins.ipcluster.IPCluster
128 |
129 | # and make sure you have this line inside the cluster section
130 | [cluster smallcluster]
131 | PLUGINS = ipcluster
132 |
133 | # start the cluster
134 | starcluster start smallcluster
135 |
136 | # check it has started
137 | starcluster listclusters
138 |
139 | We currently use ``ami-da180db2`` (Ubuntu 14.04 with 100 GB EBS) on
140 | ``m3.medium`` instances.
141 |
142 | Workarounds:
143 |
144 | .. code-block:: bash
145 |
146 | # this seems to be a dependency that does not get installed
147 | pip install pexpect
148 |
149 | # to validate the ssh host, you need to log in once manually, to add it
150 | # to the list of known hosts
151 | starcluster sshmaster smallcluster
152 |
153 | In Python, you should now be able to run
154 |
155 | .. code-block:: python
156 |
157 | from IPython.parallel import Client
158 |
159 | # the exact command is printed after the cluster started
160 | rc = Client('/Users/sven/.starcluster/ipcluster/SecurityGroup:@sc-smallcluster-us-east-1.json',
161 | sshkey='/Users/sven/.ssh/starclusterkey.rsa', packer='pickle')
162 |
163 | view = rc[:]
164 | results = view.map(lambda x: x**30, range(8))
165 | print(results.get())
166 |
167 | which is also in ``scripts/starcluster_simple.py``.
168 |
169 |
170 | Install your own software that is not on PyPI:
171 |
172 | .. code-block:: bash
173 |
174 | pip install wheel
175 | python setup.py bdist_wheel # add --universal for Python 2 and 3 packages
176 | starcluster put smallcluster dist/your_package_name.whl /home/sgeadmin/your_package_name.whl
177 |
178 | # ssh into remote machine
179 | starcluster sshmaster smallcluster
180 | > pip install --upgrade pip
181 | > pip install wheel
182 | > pip2.7 install /home/sgeadmin/your_package_name.whl
183 |
184 |
185 |
--------------------------------------------------------------------------------
/pysparkling/fileio/fs/hdfs.py:
--------------------------------------------------------------------------------
1 | from fnmatch import fnmatch
2 | from io import BytesIO, StringIO
3 | import logging
4 |
5 | from ...exceptions import FileSystemNotSupported
6 | from ...utils import format_file_uri, parse_file_uri
7 | from .file_system import FileSystem
8 |
9 | log = logging.getLogger(__name__)
10 |
11 | try:
12 | import hdfs
13 | except ImportError:
14 | hdfs = None
15 |
16 |
17 | class Hdfs(FileSystem):
18 | """:class:`.FileSystem` implementation for HDFS."""
19 |
20 | _conn = {}
21 |
22 | def __init__(self, file_name):
23 | if hdfs is None:
24 | raise FileSystemNotSupported(
25 | 'hdfs not supported. Install the python package "hdfs".'
26 | )
27 |
28 | super().__init__(file_name)
29 |
30 | @staticmethod
31 | def client_and_path(path):
32 | _, domain, folder_path, file_pattern = parse_file_uri(path)
33 |
34 | if ':' not in domain:
35 | port = 50070
36 | else:
37 | domain, port = domain.split(':')
38 | port = int(port)
39 | cache_id = domain + '__' + str(port)
40 |
41 | if cache_id not in Hdfs._conn:
42 | if hdfs is None:
43 | raise FileSystemNotSupported(
44 | 'hdfs not supported. Install the python package "hdfs".'
45 | )
46 | Hdfs._conn[cache_id] = hdfs.InsecureClient( # pylint: disable=no-member
47 | f'http://{domain}:{port}'
48 | )
49 | return Hdfs._conn[cache_id], folder_path + file_pattern
50 |
51 | def exists(self):
52 | c, p = Hdfs.client_and_path(self.file_name)
53 | try:
54 | c.status(p)
55 | except hdfs.util.HdfsError: # pylint: disable=no-member
56 | return False
57 | return True
58 |
59 | @staticmethod
60 | def resolve_filenames(expr):
61 | c, _ = Hdfs.client_and_path(expr)
62 |
63 | scheme, domain, folder_path, _ = parse_file_uri(expr)
64 |
65 | files = []
66 | for fn, file_status in c.list(folder_path, status=True):
67 | file_local_path = f'{folder_path}{fn}'
68 | file_path = format_file_uri(scheme, domain, file_local_path)
69 | part_file_expr = expr + ("" if expr.endswith("/") else "/") + 'part*'
70 |
71 | if fnmatch(file_path, expr):
72 | if file_status["type"] != "DIRECTORY":
73 | files.append(file_path)
74 | else:
75 | files += Hdfs._get_folder_part_files(
76 | c,
77 | scheme,
78 | domain,
79 | file_local_path,
80 | part_file_expr
81 | )
82 | elif fnmatch(file_path, part_file_expr):
83 | files.append(file_path)
84 | return files
85 |
86 | @staticmethod
87 | def _get_folder_part_files(c, scheme, domain, folder_local_path, expr_with_part):
88 | files = []
89 | for fn, file_status in c.list(folder_local_path, status=True):
90 | sub_file_path = format_file_uri(scheme, domain, folder_local_path, fn)
91 | if fnmatch(sub_file_path, expr_with_part) and file_status["type"] != "DIRECTORY":
92 | files.append(sub_file_path)
93 | return files
94 |
95 | @classmethod
96 | def _get_folder_files_by_expr(cls, c, scheme, domain, folder_path, expr=None):
97 | """
98 | Using client c, retrieves all files located in the folder `folder_path` that match `expr`
99 |
100 | :param c: An HDFS client
101 | :param scheme: a scheme such as hdfs
102 | :param domain: a DFS web server
103 | :param folder_path: a folder path without patterns
104 | :param expr: a pattern
105 |
106 | :return: list of matching files' absolute paths, prefixed with the scheme and domain
107 | """
108 | file_paths = []
109 | for fn, file_status in c.list(folder_path, status=True):
110 | file_local_path = f'{folder_path}{fn}'
111 | if expr is None or fnmatch(file_local_path, expr):
112 | if file_status["type"] == "DIRECTORY":
113 | file_paths += cls._get_folder_files_by_expr(
114 | c,
115 | scheme,
116 | domain,
117 | file_local_path + "/",
118 | expr=None
119 | )
120 | else:
121 | file_path = format_file_uri(scheme, domain, file_local_path)
122 | file_paths.append(file_path)
123 | elif file_status["type"] == "DIRECTORY":
124 | file_paths += cls._get_folder_files_by_expr(
125 | c, scheme, domain, file_local_path + "/", expr
126 | )
127 | return file_paths
128 |
129 | @classmethod
130 | def resolve_content(cls, expr):
131 | c, _ = cls.client_and_path(expr)
132 |
133 | scheme, domain, folder_path, pattern = parse_file_uri(expr)
134 |
135 | expr = folder_path + pattern
136 |
137 | return cls._get_folder_files_by_expr(c, scheme, domain, folder_path, expr)
138 |
139 | def load(self):
140 | log.debug('Hdfs read for %s.', self.file_name)
141 | c, path = Hdfs.client_and_path(self.file_name)
142 |
143 | with c.read(path) as reader:
144 | r = BytesIO(reader.read())
145 |
146 | return r
147 |
148 | def load_text(self, encoding='utf8', encoding_errors='ignore'):
149 | log.debug('Hdfs text read for %s.', self.file_name)
150 | c, path = Hdfs.client_and_path(self.file_name)
151 |
152 | with c.read(path) as reader:
153 | r = StringIO(reader.read().decode(encoding, encoding_errors))
154 |
155 | return r
156 |
157 | def dump(self, stream):
158 | log.debug('Dump to %s with hdfs write.', self.file_name)
159 | c, path = Hdfs.client_and_path(self.file_name)
160 | c.write(path, stream)
161 | return self
162 |
--------------------------------------------------------------------------------