├── tests ├── __init__.py ├── conftest.py ├── test_dataclass.py └── test_pydantic.py ├── pyspark_types ├── __init__.py ├── auxiliary.py ├── dataclass.py └── pydantic.py ├── .gitignore ├── pytest.ini ├── pyproject.toml ├── .github └── workflows │ └── python-publish.yml ├── README.md └── poetry.lock /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyspark_types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .vscode/ 3 | dist/ 4 | *.pyc 5 | __pycache__/ 6 | **/__pycache__/ -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --verbose 3 | log_level = INFO 4 | spark_options = 5 | spark.sql.catalogImplementation: in-memory -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pyspark-types" 3 | version = "0.0.3" 4 | description = "`pyspark_types` is a Python library that provides a simple way to map Python dataclasses to PySpark StructTypes" 5 | authors = ["Dan"] 6 | readme = "README.md" 7 | packages = [{include = "pyspark_types"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = 
"^3.10" 11 | pyspark = "^3.3.2" 12 | pytest = "^7.2.1" 13 | black = "^23.1.0" 14 | pydantic = "^2.6.1" 15 | 16 | 17 | [build-system] 18 | requires = ["poetry-core"] 19 | build-backend = "poetry.core.masonry.api" 20 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from pyspark.sql import SparkSession 4 | 5 | 6 | @pytest.fixture 7 | def spark_local(): 8 | os.environ['PYSPARK_GATEWAY_ENABLED'] = '0' 9 | spark = ( 10 | SparkSession.builder.master("local[1]") 11 | .appName("local-tests") 12 | .config("spark.executor.cores", "1") 13 | .config("spark.executor.instances", "1") 14 | .config("spark.sql.shuffle.partitions", "1") 15 | .config("spark.driver.bindAddress", "127.0.0.1") 16 | .getOrCreate() 17 | ) 18 | yield spark 19 | spark.stop() 20 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | permissions: 12 | contents: read 13 | 14 | on: 15 | push: 16 | branches: 17 | - main 18 | 19 | jobs: 20 | build-and-publish: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout code 24 | uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: "3.x" 29 | - name: Install dependencies 30 | run: | 31 | pip install poetry 32 | poetry install --no-dev 33 | - name: Build package 34 | run: poetry build 35 | - name: Publish to PyPI 36 | env: 37 | TWINE_USERNAME: __token__ 38 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 39 | run: | 40 | pip install twine 41 | twine upload dist/* 42 | 43 | -------------------------------------------------------------------------------- /pyspark_types/auxiliary.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from decimal import Decimal 3 | 4 | 5 | class LongT(int): 6 | def __repr__(self): 7 | return f"LongT({super().__repr__()})" 8 | 9 | 10 | class ShortT(int): 11 | def __repr__(self): 12 | return f"ShortT({super().__repr__()})" 13 | 14 | 15 | class ByteT(int): 16 | def __repr__(self): 17 | return f"ByteT({super().__repr__()})" 18 | 19 | 20 | class BoundDecimal(Decimal): 21 | """ 22 | Custom data type that represents a decimal with a specific scale and precision. 23 | """ 24 | 25 | def __new__(cls, value: str, precision: int, scale: int): 26 | obj = super().__new__(cls, value) 27 | obj.precision = precision 28 | obj.scale = scale 29 | return obj 30 | 31 | def __repr__(self) -> str: 32 | return f"BoundDecimal('{str(self)}', precision={self.precision}, scale={self.scale})" 33 | 34 | 35 | def create_bound_decimal_type(precision: int, scale: int) -> Type[BoundDecimal]: 36 | """ 37 | Factory method that creates a new BoundDecimal type with the specified precision and scale. 
38 | """ 39 | 40 | class _BoundDecimal(BoundDecimal): 41 | def __new__(cls, value): 42 | return super().__new__(cls, value, precision=precision, scale=scale) 43 | 44 | _BoundDecimal.__name__ = f"BoundDecimal_{precision}_{scale}" 45 | _BoundDecimal.precision = precision 46 | _BoundDecimal.scale = scale 47 | 48 | return _BoundDecimal 49 | -------------------------------------------------------------------------------- /tests/test_dataclass.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from pyspark.sql.types import ( 4 | StructType, 5 | StructField, 6 | StringType, 7 | IntegerType, 8 | LongType, 9 | DecimalType, 10 | ArrayType, 11 | DateType, 12 | ) 13 | from typing import List, Optional 14 | from dataclasses import dataclass 15 | 16 | from pyspark_types.dataclass import map_dataclass_to_struct, LongT 17 | from pyspark_types.auxiliary import create_bound_decimal_type 18 | 19 | decimal = create_bound_decimal_type(10, 2) 20 | 21 | 22 | @dataclass 23 | class InnerDataClass: 24 | id: Optional[LongT] 25 | name: Optional[str] 26 | price: decimal 27 | 28 | 29 | @dataclass 30 | class OuterDataClass: 31 | field1: int 32 | field2: Optional[str] 33 | field3: List[InnerDataClass] 34 | field4: datetime.date 35 | 36 | 37 | def test_map_simple_dataclass(): 38 | expected_struct = StructType( 39 | [ 40 | StructField("id", LongType(), True), 41 | StructField("name", StringType(), True), 42 | StructField("price", DecimalType(10, 2), False), 43 | ] 44 | ) 45 | result_struct = map_dataclass_to_struct(InnerDataClass) 46 | assert result_struct == expected_struct 47 | 48 | 49 | def test_map_dataclass_with_list_of_dataclasses(): 50 | expected_struct = StructType( 51 | [ 52 | StructField("field1", IntegerType(), False), 53 | StructField("field2", StringType(), True), 54 | StructField( 55 | "field3", 56 | ArrayType( 57 | StructType( 58 | [ 59 | StructField("id", LongType(), True), 60 | StructField("name", StringType(), True), 61 | StructField("price", DecimalType(10, 2), False), 62 | ] 63 | ) 64 | ), 65 | False, 66 | ), 67 | StructField("field4", DateType(), False), 68 | ] 69 | ) 70 | result_struct = map_dataclass_to_struct(OuterDataClass) 71 | assert result_struct == expected_struct 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark Types 2 | 3 | `pyspark_types` is a Python library that provides a simple way to map Python dataclasses to PySpark StructTypes. 4 | 5 | ## Usage 6 | 7 | ### Pydantic 8 | PySparkBaseModel is a base class for PySpark models that provides methods for converting between PySpark Rows and Pydantic models. 9 | 10 | Here's an example of a Pydantic model that will be used to create a PySpark DataFrame: 11 | 12 | ```python 13 | from pyspark_types.auxiliary import BoundDecimal 14 | from pyspark_types.pydantic import PySparkBaseModel 15 | 16 | 17 | class Person(PySparkBaseModel): 18 | name: str 19 | age: int 20 | addresses: dict[str, str] 21 | salary: BoundDecimal 22 | 23 | ``` 24 | 25 | To create a PySpark DataFrame from a list of Person Pydantic models, we can use PySparkBaseModel.create_spark_dataframe() method. 
26 | 27 | ```python 28 | from pyspark.sql import SparkSession 29 | 30 | spark = SparkSession.builder.appName("MyApp").getOrCreate() 31 | 32 | # create a list of Pydantic models 33 | data = [ 34 | Person( 35 | name="Alice", 36 | age=25, 37 | addresses={"home": "123 Main St", "work": "456 Pine St"}, 38 | salary=BoundDecimal("5000.00", precision=10, scale=2), 39 | ), 40 | Person( 41 | name="Bob", 42 | age=30, 43 | addresses={"home": "789 Elm St", "work": "321 Oak St"}, 44 | salary=BoundDecimal("6000.50", precision=10, scale=2), 45 | ), 46 | ] 47 | 48 | # create a PySpark DataFrame from the list of Pydantic models 49 | df = Person.create_spark_dataframe(data, spark) 50 | 51 | # show the contents of the DataFrame 52 | df.show() 53 | 54 | ``` 55 | 56 | Output: 57 | ```bash 58 | +---+-----+--------------------+------+ 59 | |age| name| addresses|salary| 60 | +---+-----+--------------------+------+ 61 | | 25|Alice|[home -> 123 Main...|5000.00| 62 | | 30| Bob|[home -> 789 Elm ...|6000.50| 63 | +---+-----+--------------------+------+ 64 | 65 | ``` 66 | 67 | The PySparkBaseModel.create_spark_dataframe() method converts each Pydantic model into a row of field values and then creates a PySpark DataFrame from those rows, using the schema generated from the Pydantic model. 68 | 69 | You can also generate a schema for a Pydantic model by calling the schema() class method it inherits from PySparkBaseModel: 70 | ```python 71 | schema = Person.schema() 72 | 73 | ``` 74 | 75 | This creates a PySpark schema for the Person Pydantic model. 76 | 77 | Note that if you have custom types of your own beyond the bundled ones (such as BoundDecimal), you will need to add support for them in PySparkBaseModel. For example, the PySparkBaseModel.dict() method already extracts BoundDecimal values so they can be mapped to DecimalType; you can extend it in the same way for your own types. 78 | ### Dataclasses 79 | 80 | To use pyspark_types, you first need to define a Python data class with the fields you want to map to PySpark. For example: 81 | 82 | ```python 83 | from dataclasses import dataclass 84 | 85 | @dataclass 86 | class Person: 87 | name: str 88 | age: int 89 | is_student: bool 90 | 91 | ``` 92 | To map this data class to a PySpark StructType, you can use the map_dataclass_to_struct() function: 93 | 94 | ```python 95 | from pyspark_types.dataclass import map_dataclass_to_struct 96 | 97 | person_struct = map_dataclass_to_struct(Person) 98 | ``` 99 | 100 | This will return a PySpark StructType that corresponds to the Person data class. 101 | 102 | You can also use the apply_nullability() function to set the nullable flag for a given PySpark DataType: 103 | 104 | ```python 105 | from pyspark.sql.types import StringType 106 | from pyspark_types.dataclass import apply_nullability 107 | 108 | nullable_string_type = apply_nullability(StringType(), True) 109 | ``` 110 | 111 | This will return a new PySpark StringType with the nullable flag set to True. 
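Below is a small, self-contained sketch (modeled on the bundled tests; the `Item`/`Order` dataclasses are illustrative names, not part of the library) showing how nested dataclasses, `Optional` fields, typed lists, and the bounded decimal helper map to a PySpark schema:

```python
from dataclasses import dataclass
from typing import List, Optional

from pyspark_types.auxiliary import create_bound_decimal_type
from pyspark_types.dataclass import map_dataclass_to_struct, LongT

# Decimal subtype carrying precision/scale; mapped to DecimalType(10, 2)
Price = create_bound_decimal_type(10, 2)


@dataclass
class Item:
    id: Optional[LongT]    # Optional -> nullable LongType field
    name: Optional[str]    # Optional -> nullable StringType field
    price: Price           # DecimalType(10, 2), not nullable


@dataclass
class Order:
    order_id: LongT        # LongType, not nullable
    items: List[Item]      # ArrayType of the nested Item struct


# Prints a StructType with a nested ArrayType(StructType(...)) for `items`
print(map_dataclass_to_struct(Order))
```

Nullability is driven entirely by `Optional[...]` in the type hints: fields annotated without `Optional` come out as non-nullable in the generated struct.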
112 | -------------------------------------------------------------------------------- /tests/test_pydantic.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import ( 2 | DecimalType, 3 | Row, 4 | BooleanType, 5 | StructType, 6 | StructField, 7 | StringType, 8 | DoubleType, 9 | LongType, 10 | ArrayType, 11 | ) 12 | from typing import List, Optional 13 | from pyspark_types.auxiliary import create_bound_decimal_type 14 | 15 | from pyspark_types.dataclass import LongT 16 | from pyspark_types.pydantic import PySparkBaseModel 17 | import pytest 18 | import os 19 | 20 | 21 | decimal = create_bound_decimal_type(10, 2) 22 | 23 | os.environ['PYSPARK_GATEWAY_ENABLED'] = '0' 24 | 25 | 26 | class Address(PySparkBaseModel): 27 | street: str 28 | city: str 29 | state: str 30 | zip: str 31 | 32 | 33 | class Person(PySparkBaseModel): 34 | name: Optional[str] 35 | age: LongT 36 | height: float 37 | is_employed: bool 38 | salary: float 39 | decimal: decimal 40 | addresses: List[Address] 41 | 42 | 43 | def test_person_spark_schema(): 44 | schema = Person.schema() 45 | expected_schema = StructType( 46 | [ 47 | StructField("name", StringType(), True), 48 | StructField("age", LongType(), False), 49 | StructField("height", DoubleType(), False), 50 | StructField("is_employed", BooleanType(), False), 51 | StructField("salary", DoubleType(), False), 52 | StructField("decimal", DecimalType(10, 2), False), 53 | StructField( 54 | "addresses", 55 | ArrayType( 56 | StructType( 57 | [ 58 | StructField("street", StringType(), False), 59 | StructField("city", StringType(), False), 60 | StructField("state", StringType(), False), 61 | StructField("zip", StringType(), False), 62 | ] 63 | ), 64 | True, 65 | ), 66 | True, 67 | ), 68 | ] 69 | ) 70 | 71 | assert schema == expected_schema 72 | 73 | @pytest.mark.usefixtures("spark_local") 74 | def test_to_dataframe_from_pydantic(spark_local): 75 | data = [ 76 | Person( 77 | name="John", 78 | age=30, 79 | height=1.80, 80 | is_employed=True, 81 | salary=1000.00, 82 | decimal=decimal(100.00), 83 | addresses=[ 84 | Address(street="123 Main St", city="Anytown", state="NY", zip="12345"), 85 | Address(street="456 Main St", city="Anytown", state="NY", zip="12345"), 86 | ], 87 | ), 88 | Person( 89 | name="Jane", 90 | age=25, 91 | height=1.60, 92 | is_employed=False, 93 | salary=0.00, 94 | decimal=decimal(0.00), 95 | addresses=[], 96 | ), 97 | ] 98 | df = Person.create_spark_dataframe(data, spark_local) 99 | 100 | expected_schema = StructType( 101 | [ 102 | StructField("name", StringType(), True), 103 | StructField("age", LongType(), False), 104 | StructField("height", DoubleType(), False), 105 | StructField("is_employed", BooleanType(), False), 106 | StructField("salary", DoubleType(), False), 107 | StructField("decimal", DecimalType(10, 2), False), 108 | StructField( 109 | "addresses", 110 | ArrayType( 111 | StructType( 112 | [ 113 | StructField("street", StringType(), False), 114 | StructField("city", StringType(), False), 115 | StructField("state", StringType(), False), 116 | StructField("zip", StringType(), False), 117 | ] 118 | ), 119 | True, 120 | ), 121 | True, 122 | ), 123 | ] 124 | ) 125 | 126 | assert df.schema == expected_schema 127 | assert df.count() == 2 128 | -------------------------------------------------------------------------------- /pyspark_types/dataclass.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from pyspark.sql.types import ( 4 | 
StructType, 5 | StructField, 6 | StringType, 7 | IntegerType, 8 | ArrayType, 9 | DoubleType, 10 | DecimalType, 11 | BooleanType, 12 | LongType, 13 | DataType, 14 | ShortType, 15 | ByteType, 16 | MapType, 17 | DateType, 18 | TimestampType, 19 | ) 20 | from typing import Type, get_type_hints, Union 21 | from dataclasses import is_dataclass, fields 22 | from pyspark_types.auxiliary import LongT, ShortT, ByteT, BoundDecimal 23 | 24 | 25 | def map_dataclass_to_struct(dataclass_type: Type) -> StructType: 26 | """ 27 | Map a Python data class to a PySpark struct. 28 | 29 | :param dataclass_type: The Python data class to be mapped. 30 | :return: A PySpark struct that corresponds to the data class. 31 | """ 32 | fields_list = [] 33 | hints = get_type_hints(dataclass_type) 34 | 35 | for field in fields(dataclass_type): 36 | field_name = field.name 37 | field_type = field.type 38 | 39 | if is_dataclass(field_type): 40 | # Recursively map nested data classes to PySpark structs 41 | sub_struct = map_dataclass_to_struct(field_type) 42 | nullable = is_field_nullable(field_name, hints) 43 | fields_list.append(StructField(field_name, sub_struct, nullable)) 44 | elif hasattr(field_type, "__origin__") and field_type.__origin__ is list: 45 | # Handle lists of elements 46 | elem_type = field_type.__args__[0] 47 | if is_dataclass(elem_type): 48 | # Handle lists of data classes 49 | sub_struct = map_dataclass_to_struct(elem_type) 50 | nullable = is_field_nullable(field_name, hints) 51 | fields_list.append( 52 | StructField(field_name, ArrayType(sub_struct), nullable) 53 | ) 54 | else: 55 | # Handle lists of primitive types and dicts 56 | spark_type = get_spark_type(elem_type) 57 | nullable = is_field_nullable(field_name, hints) 58 | if spark_type == MapType(StringType(), StringType()): 59 | # Special case for dictionaries with any value type 60 | fields_list.append(StructField(field_name, spark_type, nullable)) 61 | else: 62 | fields_list.append( 63 | StructField(field_name, ArrayType(spark_type), nullable) 64 | ) 65 | elif hasattr(field_type, "__origin__") and field_type.__origin__ is dict: 66 | # Handle dictionaries 67 | key_type, value_type = field_type.__args__ 68 | if is_dataclass(value_type): 69 | sub_struct = map_dataclass_to_struct(value_type) 70 | nullable = is_field_nullable(field_name, hints) 71 | fields_list.append( 72 | StructField( 73 | field_name, 74 | MapType(get_spark_type(key_type), sub_struct), 75 | nullable, 76 | ) 77 | ) 78 | else: 79 | spark_type = get_spark_type(value_type) 80 | nullable = is_field_nullable(field_name, hints) 81 | fields_list.append( 82 | StructField( 83 | field_name, 84 | MapType(get_spark_type(key_type), spark_type), 85 | nullable, 86 | ) 87 | ) 88 | else: 89 | # Handle primitive types and BoundDecimal custom type 90 | spark_type = get_spark_type(field_type) 91 | nullable = is_field_nullable(field_name, hints) 92 | fields_list.append(StructField(field_name, spark_type, nullable)) 93 | 94 | return StructType(fields_list) 95 | 96 | 97 | def get_spark_type(py_type: Type) -> DataType: 98 | """ 99 | Creates a mapping from a python type to a pyspark data type 100 | :param py_type: 101 | :return: 102 | """ 103 | if py_type == str: 104 | return StringType() 105 | elif py_type == int: 106 | return IntegerType() 107 | elif py_type == LongT: 108 | return LongType() 109 | elif py_type == ShortT: 110 | return ShortType() 111 | elif py_type == ByteT: 112 | return ByteType() 113 | elif py_type == float: 114 | return DoubleType() 115 | elif py_type == datetime.datetime: 116 | return 
TimestampType() 117 | elif py_type == datetime.date: 118 | return DateType() 119 | elif py_type == bool: 120 | return BooleanType() 121 | elif isinstance(py_type, type) and issubclass(py_type, BoundDecimal): 122 | return DecimalType(precision=py_type.precision, scale=py_type.scale) 123 | elif is_optional_type(py_type): 124 | elem_type = py_type.__args__[0] 125 | spark_type = get_spark_type(elem_type) 126 | return spark_type 127 | else: 128 | raise Exception(f"Type {py_type} is not supported by PySpark") 129 | 130 | 131 | def is_field_nullable(field_name: str, hints: dict) -> bool: 132 | """ 133 | Returns True if the given field name is nullable, based on the type hint for the field in the given hints dictionary. 134 | """ 135 | if field_name not in hints: 136 | return True 137 | field_type = hints[field_name] 138 | if is_optional_type(field_type): 139 | return True 140 | return False 141 | 142 | 143 | def apply_nullability(dtype: DataType, is_nullable: bool) -> DataType: 144 | """ 145 | Returns a new PySpark DataType with the nullable flag set to the given value. 146 | """ 147 | if is_nullable: 148 | if isinstance(dtype, StructType): 149 | # Wrap the nullable field in a struct with a single field 150 | return StructType([StructField("value", dtype, True)]) 151 | elif hasattr(dtype, "add_nullable"): 152 | return dtype.add_nullable() 153 | else: 154 | raise TypeError(f"Type {dtype} does not support nullability") 155 | else: 156 | return dtype 157 | 158 | def is_optional_type(py_type: Type) -> bool: 159 | """ 160 | Returns True if the given type is an Optional type. 161 | """ 162 | if hasattr(py_type, "__origin__") and py_type.__origin__ is Union: 163 | args = py_type.__args__ 164 | if len(args) == 2 and args[1] is type(None): 165 | return True 166 | return False 167 | -------------------------------------------------------------------------------- /pyspark_types/pydantic.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import datetime 3 | from typing import Type, Union, get_type_hints, get_origin, get_args, List 4 | from pydantic import BaseModel 5 | from pyspark.sql.types import * 6 | from pyspark_types.dataclass import is_field_nullable, is_optional_type 7 | from pyspark_types.auxiliary import LongT, ShortT, ByteT, BoundDecimal 8 | from pyspark.sql import SparkSession, DataFrame 9 | from decimal import Decimal 10 | 11 | 12 | class PySparkBaseModel(BaseModel): 13 | """ 14 | Base class for PySpark models. Provides methods for converting between PySpark Rows and Pydantic models. 
15 | """ 16 | 17 | class Config: 18 | arbitrary_types_allowed = True 19 | validate_assignment = True 20 | validate_all = True 21 | 22 | 23 | def dict(self, *args, **kwargs): 24 | """ 25 | Override Pydantic's dict() method to return a dictionary with PySparkBaseModel field values 26 | instead of Pydantic field values 27 | """ 28 | result = super().dict(*args, **kwargs) 29 | new_result = {} 30 | for k, v in result.items(): 31 | if isinstance(v, PySparkBaseModel): 32 | new_result[k] = v.dict() 33 | elif isinstance(v, BoundDecimal): 34 | new_result[k] = Decimal(v) 35 | else: 36 | new_result[k] = v 37 | return new_result 38 | 39 | @classmethod 40 | def is_pyspark_basemodel_type(cls, t: Type) -> bool: 41 | if isinstance(t, type): 42 | if t.__module__ == 'builtins': 43 | return False 44 | else: 45 | return issubclass(t, PySparkBaseModel) 46 | else: 47 | return False 48 | 49 | @classmethod 50 | def is_optional_pyspark_basemodel_type(cls, t: Type) -> bool: 51 | return ( 52 | get_origin(t) is Union 53 | and len(get_args(t)) == 2 54 | and get_args(t)[1] is type(None) 55 | and cls.is_pyspark_basemodel_type(get_args(t)[0]) 56 | ) 57 | 58 | @classmethod 59 | def _get_struct_field( 60 | cls, field_name: str, field_type: Type, hints: typing.Dict[str, Type] 61 | ) -> StructField: 62 | # Handle PySparkBaseModel and Optional[PySparkBaseModel] fields 63 | if cls.is_pyspark_basemodel_type( 64 | field_type 65 | ) or cls.is_optional_pyspark_basemodel_type(field_type): 66 | if cls.is_optional_pyspark_basemodel_type(field_type): 67 | field_type = field_type.__args__[0] 68 | sub_struct = cls._schema(field_type) 69 | nullable = ( 70 | field_type.__config__.allow_population_by_field_name 71 | or is_field_nullable(field_name, hints) 72 | ) 73 | return StructField(field_name, sub_struct, nullable) 74 | 75 | # Handle list fields 76 | elif get_origin(field_type) is list: 77 | elem_type = get_args(field_type)[0] 78 | struct_field = cls._get_list_struct_field(field_name, elem_type, hints) 79 | return struct_field 80 | 81 | # Handle dict fields 82 | elif get_origin(field_type) is dict: 83 | key_type, value_type = get_args(field_type) 84 | struct_field = cls._get_dict_struct_field( 85 | field_name, key_type, value_type, hints 86 | ) 87 | return struct_field 88 | 89 | # Handle all other types 90 | else: 91 | spark_type = cls.get_spark_type(field_type) 92 | nullable = is_field_nullable(field_name, hints) 93 | return StructField(field_name, spark_type, nullable) 94 | 95 | @classmethod 96 | def _get_list_struct_field( 97 | cls, field_name: str, elem_type: Type, hints: typing.Dict[str, Type] 98 | ) -> StructField: 99 | if cls.is_pyspark_basemodel_type( 100 | elem_type 101 | ) or cls.is_optional_pyspark_basemodel_type(elem_type): 102 | sub_struct = cls._schema(elem_type) 103 | nullable = True 104 | return StructField(field_name, ArrayType(sub_struct), nullable) 105 | 106 | else: 107 | spark_type = cls.get_spark_type(elem_type) 108 | nullable = is_field_nullable(field_name, hints) 109 | return StructField(field_name, ArrayType(spark_type), nullable) 110 | 111 | @classmethod 112 | def _get_dict_struct_field( 113 | cls, 114 | field_name: str, 115 | key_type: Type, 116 | value_type: Type, 117 | hints: typing.Dict[str, Type], 118 | ) -> StructField: 119 | if cls.is_pyspark_basemodel_type( 120 | value_type 121 | ) or cls.is_optional_pyspark_basemodel_type(value_type): 122 | sub_struct = cls._schema(value_type) 123 | spark_type = MapType(cls.get_spark_type(key_type), sub_struct) 124 | nullable = True 125 | return StructField(field_name, 
spark_type, nullable) 126 | 127 | else: 128 | spark_type = MapType( 129 | cls.get_spark_type(key_type), cls.get_spark_type(value_type) 130 | ) 131 | nullable = is_field_nullable(field_name, hints) 132 | return StructField(field_name, spark_type, nullable) 133 | 134 | @classmethod 135 | def schema(cls: Type[BaseModel]) -> StructType: 136 | """ 137 | Map a Pydantic model to a PySpark struct. 138 | 139 | :param model_type: The Pydantic model to be mapped. 140 | :return: A PySpark struct that corresponds to the Pydantic model. 141 | """ 142 | return cls._schema(cls) 143 | 144 | @classmethod 145 | def _schema(cls, t: Type[BaseModel]) -> StructType: 146 | """ 147 | Map a Pydantic model to a PySpark struct. 148 | 149 | :param model_type: The Pydantic model to be mapped. 150 | :return: A PySpark struct that corresponds to the Pydantic model. 151 | """ 152 | fields_list = [] 153 | hints = get_type_hints(t) 154 | 155 | for field_name, _ in t.__fields__.items(): 156 | field_type = hints.get(field_name) 157 | struct_field = cls._get_struct_field(field_name, field_type, hints) 158 | fields_list.append(struct_field) 159 | 160 | return StructType(fields_list) 161 | 162 | @classmethod 163 | def get_spark_type(cls, py_type: Type) -> DataType: 164 | """ 165 | Creates a mapping from a python type to a pyspark data type 166 | :param py_type: 167 | :return: 168 | """ 169 | if py_type == str: 170 | return StringType() 171 | elif py_type == int: 172 | return IntegerType() 173 | elif py_type == LongT: 174 | return LongType() 175 | elif py_type == ShortT: 176 | return ShortType() 177 | elif py_type == ByteT: 178 | return ByteType() 179 | elif py_type == float: 180 | return DoubleType() 181 | elif py_type == datetime.datetime: 182 | return TimestampType() 183 | elif py_type == datetime.date: 184 | return DateType() 185 | elif py_type == bool: 186 | return BooleanType() 187 | elif isinstance(py_type, type) and issubclass(py_type, BoundDecimal): 188 | return DecimalType(precision=py_type.precision, scale=py_type.scale) 189 | elif is_optional_type(py_type): 190 | elem_type = py_type.__args__[0] 191 | spark_type = cls.get_spark_type(elem_type) 192 | return spark_type 193 | else: 194 | raise Exception(f"Type {py_type} is not supported by PySpark") 195 | 196 | 197 | @classmethod 198 | def create_spark_dataframe(cls, data: List['PySparkBaseModel'], spark: SparkSession) -> DataFrame: 199 | """ 200 | Creates a PySpark DataFrame from a list of Pydantic models. 201 | 202 | :param data: A list of Pydantic models. 203 | :param spark: A PySpark SparkSession. 204 | :return: A PySpark DataFrame. 205 | """ 206 | # Generate schema based on base model 207 | schema = cls.schema() 208 | 209 | # Convert Pydantic models to dictionaries 210 | data_rows = [tuple(item.dict().values()) for item in data] 211 | 212 | # Create Spark DataFrame from list of dictionaries and schema 213 | df = spark.createDataFrame(data_rows, schema) 214 | 215 | return df -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
2 | 3 | [[package]] 4 | name = "annotated-types" 5 | version = "0.6.0" 6 | description = "Reusable constraint types to use with typing.Annotated" 7 | optional = false 8 | python-versions = ">=3.8" 9 | files = [ 10 | {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, 11 | {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, 12 | ] 13 | 14 | [[package]] 15 | name = "attrs" 16 | version = "22.2.0" 17 | description = "Classes Without Boilerplate" 18 | optional = false 19 | python-versions = ">=3.6" 20 | files = [ 21 | {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, 22 | {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, 23 | ] 24 | 25 | [package.extras] 26 | cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] 27 | dev = ["attrs[docs,tests]"] 28 | docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] 29 | tests = ["attrs[tests-no-zope]", "zope.interface"] 30 | tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] 31 | 32 | [[package]] 33 | name = "black" 34 | version = "23.1.0" 35 | description = "The uncompromising code formatter." 36 | optional = false 37 | python-versions = ">=3.7" 38 | files = [ 39 | {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, 40 | {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, 41 | {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, 42 | {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, 43 | {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, 44 | {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, 45 | {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, 46 | {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, 47 | {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, 48 | {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, 49 | {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, 50 | {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, 
51 | {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, 52 | {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, 53 | {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, 54 | {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, 55 | {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, 56 | {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, 57 | {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, 58 | {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, 59 | {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, 60 | {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, 61 | {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, 62 | {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, 63 | {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, 64 | ] 65 | 66 | [package.dependencies] 67 | click = ">=8.0.0" 68 | mypy-extensions = ">=0.4.3" 69 | packaging = ">=22.0" 70 | pathspec = ">=0.9.0" 71 | platformdirs = ">=2" 72 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 73 | 74 | [package.extras] 75 | colorama = ["colorama (>=0.4.3)"] 76 | d = ["aiohttp (>=3.7.4)"] 77 | jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] 78 | uvloop = ["uvloop (>=0.15.2)"] 79 | 80 | [[package]] 81 | name = "click" 82 | version = "8.1.3" 83 | description = "Composable command line interface toolkit" 84 | optional = false 85 | python-versions = ">=3.7" 86 | files = [ 87 | {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, 88 | {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, 89 | ] 90 | 91 | [package.dependencies] 92 | colorama = {version = "*", markers = "platform_system == \"Windows\""} 93 | 94 | [[package]] 95 | name = "colorama" 96 | version = "0.4.6" 97 | description = "Cross-platform colored terminal text." 
98 | optional = false 99 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 100 | files = [ 101 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 102 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 103 | ] 104 | 105 | [[package]] 106 | name = "exceptiongroup" 107 | version = "1.1.0" 108 | description = "Backport of PEP 654 (exception groups)" 109 | optional = false 110 | python-versions = ">=3.7" 111 | files = [ 112 | {file = "exceptiongroup-1.1.0-py3-none-any.whl", hash = "sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e"}, 113 | {file = "exceptiongroup-1.1.0.tar.gz", hash = "sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23"}, 114 | ] 115 | 116 | [package.extras] 117 | test = ["pytest (>=6)"] 118 | 119 | [[package]] 120 | name = "iniconfig" 121 | version = "2.0.0" 122 | description = "brain-dead simple config-ini parsing" 123 | optional = false 124 | python-versions = ">=3.7" 125 | files = [ 126 | {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, 127 | {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, 128 | ] 129 | 130 | [[package]] 131 | name = "mypy-extensions" 132 | version = "1.0.0" 133 | description = "Type system extensions for programs checked with the mypy type checker." 134 | optional = false 135 | python-versions = ">=3.5" 136 | files = [ 137 | {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, 138 | {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, 139 | ] 140 | 141 | [[package]] 142 | name = "packaging" 143 | version = "23.0" 144 | description = "Core utilities for Python packages" 145 | optional = false 146 | python-versions = ">=3.7" 147 | files = [ 148 | {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, 149 | {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, 150 | ] 151 | 152 | [[package]] 153 | name = "pathspec" 154 | version = "0.11.0" 155 | description = "Utility library for gitignore style pattern matching of file paths." 156 | optional = false 157 | python-versions = ">=3.7" 158 | files = [ 159 | {file = "pathspec-0.11.0-py3-none-any.whl", hash = "sha256:3a66eb970cbac598f9e5ccb5b2cf58930cd8e3ed86d393d541eaf2d8b1705229"}, 160 | {file = "pathspec-0.11.0.tar.gz", hash = "sha256:64d338d4e0914e91c1792321e6907b5a593f1ab1851de7fc269557a21b30ebbc"}, 161 | ] 162 | 163 | [[package]] 164 | name = "platformdirs" 165 | version = "3.0.0" 166 | description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
167 | optional = false 168 | python-versions = ">=3.7" 169 | files = [ 170 | {file = "platformdirs-3.0.0-py3-none-any.whl", hash = "sha256:b1d5eb14f221506f50d6604a561f4c5786d9e80355219694a1b244bcd96f4567"}, 171 | {file = "platformdirs-3.0.0.tar.gz", hash = "sha256:8a1228abb1ef82d788f74139988b137e78692984ec7b08eaa6c65f1723af28f9"}, 172 | ] 173 | 174 | [package.extras] 175 | docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] 176 | test = ["appdirs (==1.4.4)", "covdefaults (>=2.2.2)", "pytest (>=7.2.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] 177 | 178 | [[package]] 179 | name = "pluggy" 180 | version = "1.0.0" 181 | description = "plugin and hook calling mechanisms for python" 182 | optional = false 183 | python-versions = ">=3.6" 184 | files = [ 185 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 186 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 187 | ] 188 | 189 | [package.extras] 190 | dev = ["pre-commit", "tox"] 191 | testing = ["pytest", "pytest-benchmark"] 192 | 193 | [[package]] 194 | name = "py4j" 195 | version = "0.10.9.5" 196 | description = "Enables Python programs to dynamically access arbitrary Java objects" 197 | optional = false 198 | python-versions = "*" 199 | files = [ 200 | {file = "py4j-0.10.9.5-py2.py3-none-any.whl", hash = "sha256:52d171a6a2b031d8a5d1de6efe451cf4f5baff1a2819aabc3741c8406539ba04"}, 201 | {file = "py4j-0.10.9.5.tar.gz", hash = "sha256:276a4a3c5a2154df1860ef3303a927460e02e97b047dc0a47c1c3fb8cce34db6"}, 202 | ] 203 | 204 | [[package]] 205 | name = "pydantic" 206 | version = "2.6.1" 207 | description = "Data validation using Python type hints" 208 | optional = false 209 | python-versions = ">=3.8" 210 | files = [ 211 | {file = "pydantic-2.6.1-py3-none-any.whl", hash = "sha256:0b6a909df3192245cb736509a92ff69e4fef76116feffec68e93a567347bae6f"}, 212 | {file = "pydantic-2.6.1.tar.gz", hash = "sha256:4fd5c182a2488dc63e6d32737ff19937888001e2a6d86e94b3f233104a5d1fa9"}, 213 | ] 214 | 215 | [package.dependencies] 216 | annotated-types = ">=0.4.0" 217 | pydantic-core = "2.16.2" 218 | typing-extensions = ">=4.6.1" 219 | 220 | [package.extras] 221 | email = ["email-validator (>=2.0.0)"] 222 | 223 | [[package]] 224 | name = "pydantic-core" 225 | version = "2.16.2" 226 | description = "" 227 | optional = false 228 | python-versions = ">=3.8" 229 | files = [ 230 | {file = "pydantic_core-2.16.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3fab4e75b8c525a4776e7630b9ee48aea50107fea6ca9f593c98da3f4d11bf7c"}, 231 | {file = "pydantic_core-2.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8bde5b48c65b8e807409e6f20baee5d2cd880e0fad00b1a811ebc43e39a00ab2"}, 232 | {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2924b89b16420712e9bb8192396026a8fbd6d8726224f918353ac19c4c043d2a"}, 233 | {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:16aa02e7a0f539098e215fc193c8926c897175d64c7926d00a36188917717a05"}, 234 | {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:936a787f83db1f2115ee829dd615c4f684ee48ac4de5779ab4300994d8af325b"}, 235 | {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:459d6be6134ce3b38e0ef76f8a672924460c455d45f1ad8fdade36796df1ddc8"}, 236 | {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9ee4febb249c591d07b2d4dd36ebcad0ccd128962aaa1801508320896575ef"}, 237 | {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:40a0bd0bed96dae5712dab2aba7d334a6c67cbcac2ddfca7dbcc4a8176445990"}, 238 | {file = "pydantic_core-2.16.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:870dbfa94de9b8866b37b867a2cb37a60c401d9deb4a9ea392abf11a1f98037b"}, 239 | {file = "pydantic_core-2.16.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:308974fdf98046db28440eb3377abba274808bf66262e042c412eb2adf852731"}, 240 | {file = "pydantic_core-2.16.2-cp310-none-win32.whl", hash = "sha256:a477932664d9611d7a0816cc3c0eb1f8856f8a42435488280dfbf4395e141485"}, 241 | {file = "pydantic_core-2.16.2-cp310-none-win_amd64.whl", hash = "sha256:8f9142a6ed83d90c94a3efd7af8873bf7cefed2d3d44387bf848888482e2d25f"}, 242 | {file = "pydantic_core-2.16.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:406fac1d09edc613020ce9cf3f2ccf1a1b2f57ab00552b4c18e3d5276c67eb11"}, 243 | {file = "pydantic_core-2.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce232a6170dd6532096cadbf6185271e4e8c70fc9217ebe105923ac105da9978"}, 244 | {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a90fec23b4b05a09ad988e7a4f4e081711a90eb2a55b9c984d8b74597599180f"}, 245 | {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8aafeedb6597a163a9c9727d8a8bd363a93277701b7bfd2749fbefee2396469e"}, 246 | {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9957433c3a1b67bdd4c63717eaf174ebb749510d5ea612cd4e83f2d9142f3fc8"}, 247 | {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0d7a9165167269758145756db43a133608a531b1e5bb6a626b9ee24bc38a8f7"}, 248 | {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dffaf740fe2e147fedcb6b561353a16243e654f7fe8e701b1b9db148242e1272"}, 249 | {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8ed79883b4328b7f0bd142733d99c8e6b22703e908ec63d930b06be3a0e7113"}, 250 | {file = "pydantic_core-2.16.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:cf903310a34e14651c9de056fcc12ce090560864d5a2bb0174b971685684e1d8"}, 251 | {file = "pydantic_core-2.16.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:46b0d5520dbcafea9a8645a8164658777686c5c524d381d983317d29687cce97"}, 252 | {file = "pydantic_core-2.16.2-cp311-none-win32.whl", hash = "sha256:70651ff6e663428cea902dac297066d5c6e5423fda345a4ca62430575364d62b"}, 253 | {file = "pydantic_core-2.16.2-cp311-none-win_amd64.whl", hash = "sha256:98dc6f4f2095fc7ad277782a7c2c88296badcad92316b5a6e530930b1d475ebc"}, 254 | {file = "pydantic_core-2.16.2-cp311-none-win_arm64.whl", hash = "sha256:ef6113cd31411eaf9b39fc5a8848e71c72656fd418882488598758b2c8c6dfa0"}, 255 | {file = "pydantic_core-2.16.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:88646cae28eb1dd5cd1e09605680c2b043b64d7481cdad7f5003ebef401a3039"}, 256 | {file = "pydantic_core-2.16.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7b883af50eaa6bb3299780651e5be921e88050ccf00e3e583b1e92020333304b"}, 257 | {file = 
"pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bf26c2e2ea59d32807081ad51968133af3025c4ba5753e6a794683d2c91bf6e"}, 258 | {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:99af961d72ac731aae2a1b55ccbdae0733d816f8bfb97b41909e143de735f522"}, 259 | {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02906e7306cb8c5901a1feb61f9ab5e5c690dbbeaa04d84c1b9ae2a01ebe9379"}, 260 | {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5362d099c244a2d2f9659fb3c9db7c735f0004765bbe06b99be69fbd87c3f15"}, 261 | {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ac426704840877a285d03a445e162eb258924f014e2f074e209d9b4ff7bf380"}, 262 | {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b94cbda27267423411c928208e89adddf2ea5dd5f74b9528513f0358bba019cb"}, 263 | {file = "pydantic_core-2.16.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6db58c22ac6c81aeac33912fb1af0e930bc9774166cdd56eade913d5f2fff35e"}, 264 | {file = "pydantic_core-2.16.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:396fdf88b1b503c9c59c84a08b6833ec0c3b5ad1a83230252a9e17b7dfb4cffc"}, 265 | {file = "pydantic_core-2.16.2-cp312-none-win32.whl", hash = "sha256:7c31669e0c8cc68400ef0c730c3a1e11317ba76b892deeefaf52dcb41d56ed5d"}, 266 | {file = "pydantic_core-2.16.2-cp312-none-win_amd64.whl", hash = "sha256:a3b7352b48fbc8b446b75f3069124e87f599d25afb8baa96a550256c031bb890"}, 267 | {file = "pydantic_core-2.16.2-cp312-none-win_arm64.whl", hash = "sha256:a9e523474998fb33f7c1a4d55f5504c908d57add624599e095c20fa575b8d943"}, 268 | {file = "pydantic_core-2.16.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:ae34418b6b389d601b31153b84dce480351a352e0bb763684a1b993d6be30f17"}, 269 | {file = "pydantic_core-2.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:732bd062c9e5d9582a30e8751461c1917dd1ccbdd6cafb032f02c86b20d2e7ec"}, 270 | {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b52776a2e3230f4854907a1e0946eec04d41b1fc64069ee774876bbe0eab55"}, 271 | {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ef551c053692b1e39e3f7950ce2296536728871110e7d75c4e7753fb30ca87f4"}, 272 | {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ebb892ed8599b23fa8f1799e13a12c87a97a6c9d0f497525ce9858564c4575a4"}, 273 | {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa6c8c582036275997a733427b88031a32ffa5dfc3124dc25a730658c47a572f"}, 274 | {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4ba0884a91f1aecce75202473ab138724aa4fb26d7707f2e1fa6c3e68c84fbf"}, 275 | {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7924e54f7ce5d253d6160090ddc6df25ed2feea25bfb3339b424a9dd591688bc"}, 276 | {file = "pydantic_core-2.16.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:69a7b96b59322a81c2203be537957313b07dd333105b73db0b69212c7d867b4b"}, 277 | {file = "pydantic_core-2.16.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7e6231aa5bdacda78e96ad7b07d0c312f34ba35d717115f4b4bff6cb87224f0f"}, 278 | {file = "pydantic_core-2.16.2-cp38-none-win32.whl", 
hash = "sha256:41dac3b9fce187a25c6253ec79a3f9e2a7e761eb08690e90415069ea4a68ff7a"}, 279 | {file = "pydantic_core-2.16.2-cp38-none-win_amd64.whl", hash = "sha256:f685dbc1fdadb1dcd5b5e51e0a378d4685a891b2ddaf8e2bba89bd3a7144e44a"}, 280 | {file = "pydantic_core-2.16.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:55749f745ebf154c0d63d46c8c58594d8894b161928aa41adbb0709c1fe78b77"}, 281 | {file = "pydantic_core-2.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b30b0dd58a4509c3bd7eefddf6338565c4905406aee0c6e4a5293841411a1286"}, 282 | {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18de31781cdc7e7b28678df7c2d7882f9692ad060bc6ee3c94eb15a5d733f8f7"}, 283 | {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5864b0242f74b9dd0b78fd39db1768bc3f00d1ffc14e596fd3e3f2ce43436a33"}, 284 | {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8f9186ca45aee030dc8234118b9c0784ad91a0bb27fc4e7d9d6608a5e3d386c"}, 285 | {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc6f6c9be0ab6da37bc77c2dda5f14b1d532d5dbef00311ee6e13357a418e646"}, 286 | {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa057095f621dad24a1e906747179a69780ef45cc8f69e97463692adbcdae878"}, 287 | {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ad84731a26bcfb299f9eab56c7932d46f9cad51c52768cace09e92a19e4cf55"}, 288 | {file = "pydantic_core-2.16.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:3b052c753c4babf2d1edc034c97851f867c87d6f3ea63a12e2700f159f5c41c3"}, 289 | {file = "pydantic_core-2.16.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e0f686549e32ccdb02ae6f25eee40cc33900910085de6aa3790effd391ae10c2"}, 290 | {file = "pydantic_core-2.16.2-cp39-none-win32.whl", hash = "sha256:7afb844041e707ac9ad9acad2188a90bffce2c770e6dc2318be0c9916aef1469"}, 291 | {file = "pydantic_core-2.16.2-cp39-none-win_amd64.whl", hash = "sha256:9da90d393a8227d717c19f5397688a38635afec89f2e2d7af0df037f3249c39a"}, 292 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f60f920691a620b03082692c378661947d09415743e437a7478c309eb0e4f82"}, 293 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:47924039e785a04d4a4fa49455e51b4eb3422d6eaacfde9fc9abf8fdef164e8a"}, 294 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6294e76b0380bb7a61eb8a39273c40b20beb35e8c87ee101062834ced19c545"}, 295 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe56851c3f1d6f5384b3051c536cc81b3a93a73faf931f404fef95217cf1e10d"}, 296 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9d776d30cde7e541b8180103c3f294ef7c1862fd45d81738d156d00551005784"}, 297 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:72f7919af5de5ecfaf1eba47bf9a5d8aa089a3340277276e5636d16ee97614d7"}, 298 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:4bfcbde6e06c56b30668a0c872d75a7ef3025dc3c1823a13cf29a0e9b33f67e8"}, 299 | {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-win_amd64.whl", hash = 
"sha256:ff7c97eb7a29aba230389a2661edf2e9e06ce616c7e35aa764879b6894a44b25"}, 300 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9b5f13857da99325dcabe1cc4e9e6a3d7b2e2c726248ba5dd4be3e8e4a0b6d0e"}, 301 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a7e41e3ada4cca5f22b478c08e973c930e5e6c7ba3588fb8e35f2398cdcc1545"}, 302 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60eb8ceaa40a41540b9acae6ae7c1f0a67d233c40dc4359c256ad2ad85bdf5e5"}, 303 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7beec26729d496a12fd23cf8da9944ee338c8b8a17035a560b585c36fe81af20"}, 304 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:22c5f022799f3cd6741e24f0443ead92ef42be93ffda0d29b2597208c94c3753"}, 305 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:eca58e319f4fd6df004762419612122b2c7e7d95ffafc37e890252f869f3fb2a"}, 306 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ed957db4c33bc99895f3a1672eca7e80e8cda8bd1e29a80536b4ec2153fa9804"}, 307 | {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:459c0d338cc55d099798618f714b21b7ece17eb1a87879f2da20a3ff4c7628e2"}, 308 | {file = "pydantic_core-2.16.2.tar.gz", hash = "sha256:0ba503850d8b8dcc18391f10de896ae51d37fe5fe43dbfb6a35c5c5cad271a06"}, 309 | ] 310 | 311 | [package.dependencies] 312 | typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" 313 | 314 | [[package]] 315 | name = "pyspark" 316 | version = "3.3.2" 317 | description = "Apache Spark Python API" 318 | optional = false 319 | python-versions = ">=3.7" 320 | files = [ 321 | {file = "pyspark-3.3.2.tar.gz", hash = "sha256:0dfd5db4300c1f6cc9c16d8dbdfb82d881b4b172984da71344ede1a9d4893da8"}, 322 | ] 323 | 324 | [package.dependencies] 325 | py4j = "0.10.9.5" 326 | 327 | [package.extras] 328 | ml = ["numpy (>=1.15)"] 329 | mllib = ["numpy (>=1.15)"] 330 | pandas-on-spark = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=1.0.0)"] 331 | sql = ["pandas (>=1.0.5)", "pyarrow (>=1.0.0)"] 332 | 333 | [[package]] 334 | name = "pytest" 335 | version = "7.2.1" 336 | description = "pytest: simple powerful testing with Python" 337 | optional = false 338 | python-versions = ">=3.7" 339 | files = [ 340 | {file = "pytest-7.2.1-py3-none-any.whl", hash = "sha256:c7c6ca206e93355074ae32f7403e8ea12163b1163c976fee7d4d84027c162be5"}, 341 | {file = "pytest-7.2.1.tar.gz", hash = "sha256:d45e0952f3727241918b8fd0f376f5ff6b301cc0777c6f9a556935c92d8a7d42"}, 342 | ] 343 | 344 | [package.dependencies] 345 | attrs = ">=19.2.0" 346 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 347 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 348 | iniconfig = "*" 349 | packaging = "*" 350 | pluggy = ">=0.12,<2.0" 351 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 352 | 353 | [package.extras] 354 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 355 | 356 | [[package]] 357 | name = "tomli" 358 | version = "2.0.1" 359 | description = "A lil' TOML parser" 360 | optional = false 361 | python-versions = ">=3.7" 362 | files = [ 363 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 364 | 
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 365 | ] 366 | 367 | [[package]] 368 | name = "typing-extensions" 369 | version = "4.9.0" 370 | description = "Backported and Experimental Type Hints for Python 3.8+" 371 | optional = false 372 | python-versions = ">=3.8" 373 | files = [ 374 | {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"}, 375 | {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, 376 | ] 377 | 378 | [metadata] 379 | lock-version = "2.0" 380 | python-versions = "^3.10" 381 | content-hash = "2d10f2056d3bcf2684a0e39533ee192baee7614dc9faa4153e29114b76048abc" 382 | --------------------------------------------------------------------------------