├── .gitignore ├── LICENSE ├── README.md ├── pyspark_utilities ├── __init__.py ├── dimension_utilities │ ├── __init__.py │ └── dim_datetime_utilities.py ├── pandas_udfs │ ├── __init__.py │ ├── datetime_udfs.py │ ├── datetime_udfs_base_functions.py │ ├── fuzzy_match_udfs.py │ ├── general_udfs.py │ ├── general_udfs_base_functions.py │ └── general_udfs_base_functions_test.py ├── spark_udfs │ ├── __init__.py │ └── spark_udfs.py └── spark_utilities │ ├── __init__.py │ └── spark_utilities.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyspark-utilities 2 | ETL-focused utilities library for PySpark 3 | 4 | ## Package Contents 5 | * `spark_utilities` - general PySpark utilities to develop and run Spark applications 6 | * `pandas_udfs` - Spark UDFs written using the [__Pandas UDF__](https://docs.databricks.com/spark/latest/spark-sql/udf-python-pandas.html) feature [added in Spark 2.3](https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html) 7 | * `spark_udfs` - Python class containing Spark UDFs written in Scala, accessed via JAR passed to SparkContext at initialization 8 | * `dimension_utilities` - functions to generate dimension tables as Spark DataFrames 9 | 10 | ## Setup 11 | 1. Install library 12 | ```bash 13 | pip install git+https://github.com/zaksamalik/pyspark-utilities 14 | ``` 15 | 2. Follow instructions in [__spark-etl-utilities__](https://github.com/zaksamalik/spark-etl-utilities) 16 | repo to build `spark-etl-utilities` JAR 17 | * __Note__: steps 2 & 3 are optional and __only__ required in order to use the Spark UDFs written in Scala 18 | 3. Load resulting JAR file in Spark session (example in Spark Utilities --> Methods section) 19 | 20 | ## Spark Utilities 21 | _General PySpark utilities to develop and run Spark applications._ 22 | ### Methods 23 | __General__ 24 | * `start_spark` - instantiate SparkSession 25 | * arguments: 26 | * `config` (_SparkConf_) - SparkConf() with set parameters (default = `SparkConf()`). 27 | * `app_name` (_str_) - name of Spark application (default = None). 28 | * `env` (_str_) - where Spark application is running (default = 'cluster'). Known values: `local`, `cluster`. 29 | * `enable_hive` (_bool_) - if `True`: adds Hive support via `enableHiveSupport()` (default = False). 30 | * `source_aws_credentials_file` (_bool_) - whether to source AWS credentials file (default = False). 31 | * `aws_profile` (_str_) - name of profile to use for interacting with AWS services (default = None). 32 | Only used if `env` is `local`. 33 | * example usage 34 | ```py 35 | from os.path import expanduser 36 | from pyspark import SparkConf 37 | from pyspark_utilities.spark_utilities import start_spark 38 | 39 | config = (SparkConf().setAll([ 40 | ('spark.driver.extraClassPath', expanduser('/path/to/jars/*')), 41 | ('spark.executor.extraClassPath', expanduser('/path/to/jars/*')) 42 | ])) 43 | spark = start_spark(config=config, app_name='example_app', env='local') 44 | ``` 45 | 46 | ## Pandas UDFs 47 | _User-defined functions written using the Pandas UDF feature added in Spark 2.3._ 48 | 49 | A good introduction to Pandas UDFs can be found [here](https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html), but in short: Pandas UDFs are vectorized and use Apache Arrow to transfer data from Spark to Pandas and back, delivering much faster performance than one-row-at-a-time Python UDFs, which are notorious 50 | bottlenecks in PySpark application development. 51 | 52 | __Note__: all Pandas UDFs in this library start with a `pd_` prefix.
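To make the batching concrete: a scalar Pandas UDF receives and returns whole `pandas.Series` objects (one Arrow batch at a time) rather than individual values. Below is a minimal sketch of the pattern this library follows — it is not part of the package itself, and `df` plus the `messy_text` column are placeholders:

```py
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
from pyspark.sql.types import StringType


@pandas_udf(StringType(), PandasUDFType.SCALAR)
def pd_upper_case(target_col):
    # `target_col` arrives as a pandas.Series holding a whole batch of rows
    return target_col.str.upper()


df_upper = df.withColumn('upper_text', pd_upper_case(col('messy_text')))
```

Because each call processes a full batch of rows in Pandas, the per-row Python overhead of traditional UDFs is largely avoided.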
53 | 54 | ### Methods 55 | * __General UDFs__ 56 | * `pd_clean_string` - remove ISO control characters from, and trim, input string column. 57 | * returns: _StringType_ 58 | * `pd_empty_string_to_null` - check if input values in string column are empty, and return null if so. 59 | * returns: _StringType_ 60 | * `pd_generate_uuid` - generate UUID v4. 61 | * returns: _StringType_ 62 | * Pandas UDFs require at least one input column, so a column must be passed to this function even though no operations will actually be applied to that column. 63 | * __Important note__: because Spark is lazily evaluated, UUID values can change each time the plan is re-evaluated (e.g. when the DataFrame is written, collected, or converted to a Pandas DataFrame). Do not rely on this UUID as a stable key throughout a Spark application. 64 | * `pd_map_booleans_ynu` - map boolean values to `Y`, `N`, `Unknown`. 65 | * returns: _StringType_ 66 | * `pd_string_to_double_pfd` - convert string column to double where PERIOD represents DECIMAL place. 67 | * returns: _DoubleType_ 68 | * `pd_string_to_double_cfd` - convert string column to double where COMMA represents DECIMAL place. 69 | * returns: _DoubleType_ 70 | * `pd_string_is_number` - check whether values in string column can be converted to numbers. 71 | * returns: _BooleanType_ 72 | * __Datetime UDFs__ 73 | * `pd_is_holiday_usa` - check whether values in date column are US holidays (from [__holidays__](https://pypi.org/project/holidays/) package). 74 | * returns: _StringType_ (`Y`, `N`, `Unknown`) 75 | * `pd_normalize_date_md` - convert column with dates as strings to dates 76 | (MONTH before DAY). 77 | * returns: _DateType_ 78 | * `pd_normalize_date_dm` - convert column with dates as strings to dates 79 | (DAY before MONTH). 80 | * returns: _DateType_ 81 | * `pd_normalize_timestamp_md` - convert column with timestamps as strings to 82 | timestamps (MONTH before DAY). 83 | * returns: _TimestampType_ 84 | * `pd_normalize_timestamp_dm` - convert column with timestamps as strings to timestamps (DAY before MONTH).
85 | * returns: _TimestampType_ 86 | * example usage 87 | ```py 88 | from pyspark_utilities.pandas_udfs import pd_clean_string, pd_generate_uuid 89 | from pyspark.sql.functions import col, lit 90 | df_clean = (df 91 | .withColumn('clean_text', pd_clean_string(col('messy_text'))) 92 | .withColumn('uuid', pd_generate_uuid(lit('')))) 93 | ``` 94 | 95 | * __Fuzzy String Matching UDFs__ (methods from [__fuzzywuzzy__](https://github.com/seatgeek/fuzzywuzzy) and [__jellyfish__](https://github.com/jamesturk/jellyfish) packages) 96 | * `pd_fuzz_ratio` - simple ratio (`fuzz.ratio`) 97 | * returns: _IntegerType_ 98 | * `pd_fuzz_partial_ratio` - partial ratio (`fuzz.partial_ratio`) 99 | * returns: _IntegerType_ 100 | * `pd_fuzz_token_set_ratio` - token set ratio (`fuzz.token_set_ratio`) 101 | * returns: _IntegerType_ 102 | * `pd_fuzz_partial_token_set_ratio` - partial token set ratio (`fuzz.partial_token_set_ratio`) 103 | * returns: _IntegerType_ 104 | * `pd_fuzz_token_sort_ratio` - token sort ratio (`fuzz.token_sort_ratio`) 105 | * returns: _IntegerType_ 106 | * `pd_fuzz_partial_token_sort_ratio` - partial token sort ratio (`fuzz.partial_token_sort_ratio`) 107 | * returns: _IntegerType_ 108 | * `pd_damerau_levenshtein_distance` - Damerau-Levenshtein distance (`jellyfish.damerau_levenshtein_distance`) 109 | * returns: _IntegerType_ 110 | * `pd_hamming_distance` - Hamming distance (`jellyfish.hamming_distance`) 111 | * returns: _IntegerType_ 112 | * `pd_jaro_distance` - Jaro distance (`jellyfish.jaro_distance`) 113 | * returns: _DoubleType_ 114 | * `pd_jaro_winkler` - Jaro-Winkler (`jellyfish.jaro_winkler`) 115 | * returns: _DoubleType_ 116 | * `pd_match_rating_codex` - match rating codex (`jellyfish.match_rating_codex`) 117 | * returns: _StringType_ 118 | * `pd_match_rating_comparison` - match rating comparison (`jellyfish.match_rating_comparison`) 119 | * returns: _BooleanType_ 120 | * `pd_metaphone` - metaphone (`jellyfish.metaphone`) 121 | * returns: _StringType_ 122 | * `pd_nysiis` - NYSIIS (`jellyfish.nysiis`) 123 | * returns: _StringType_ 124 | * `pd_porter_stem` - Porter stem (`jellyfish.porter_stem`) 125 | * returns: _StringType_ 126 | 127 | 128 | * example usage 129 | ```py 130 | from pyspark.sql.functions import col 131 | from pyspark_utilities.pandas_udfs import pd_clean_string, pd_fuzz_partial_ratio 132 | 133 | df_fuzzy_ratio = (df 134 | .withColumn('clean_text1', pd_clean_string(col('messy_text'))) 135 | .withColumn('clean_text2', pd_clean_string(col('messy_text'))) 136 | .withColumn('partial_fuzzy_ratio', pd_fuzz_partial_ratio(col('clean_text1'), col('clean_text2')))) 137 | ``` 138 | ## Spark UDFs (Scala) 139 | _Spark UDFs written in Scala exposed to Python._ 140 | 141 | __Important Note__: all Scala Spark UDF functionality also exists as Pandas UDFs. 142 | Because they are Scala-native, Spark UDFs should be more performant than Pandas UDFs (in most cases), but they require an external JAR. For pure Python ETL implementations, use Pandas UDFs instead. All other functionality in this package works fine without the Spark UDFs JAR.
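To use these UDFs, the `spark-etl-utilities` JAR must be on the driver and executor classpaths when the SparkSession is created — the same mechanism shown in the Spark Utilities example above, pointed at the built JAR. A sketch (the JAR path is a placeholder):

```py
from pyspark import SparkConf
from pyspark_utilities.spark_utilities import start_spark

config = (SparkConf().setAll([
    ('spark.driver.extraClassPath', '/path/to/spark-etl-utilities.jar'),
    ('spark.executor.extraClassPath', '/path/to/spark-etl-utilities.jar')
]))
spark = start_spark(config=config, app_name='scala_udfs_example', env='local')
```

With the session created this way, the example in the next section can wrap it with `SparkUDFs`.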
143 | 144 | ### Example 145 | ```py 146 | from pyspark.sql.functions import col 147 | from pyspark_utilities.spark_udfs import SparkUDFs 148 | # `spark` = instantiated SparkSession 149 | udfs = SparkUDFs(spark) 150 | # apply UDF 151 | df_with_uuid = (df 152 | .withColumn('uuid', udfs.generate_uuid()) 153 | .withColumn('clean_string', udfs.clean_string(col('messy_text')))) 154 | ``` 155 | ### Methods 156 | * __General UDFs__ 157 | * `clean_string` - remove Java ISO control characters from, and trim, string 158 | * returns: _string_ (nullable) 159 | * `empty_string_to_null` - convert empty strings to null values 160 | * returns: _string_ (nullable) 161 | * `generate_uuid` - generate V4 UUID 162 | * returns: _string_ 163 | * `map_booleans_ynu` - map boolean values to `Y`, `N`, `Unknown` 164 | * returns: _string_ 165 | * `string_to_double_pfd` - convert string to double (where `.` represents decimal place) 166 | * returns: _double_ (nullable) 167 | * `string_to_double_cfd` - convert string to double (where `,` represents decimal place) 168 | * returns: _double_ (nullable) 169 | * `string_is_number` - validate whether passed string could be converted to a number. 170 | * returns: _boolean_ 171 | * __Datetime UDFs__ 172 | * `normalize_date_md` - normalize string to date with MONTH before DAY 173 | * returns: _date_ (nullable) 174 | * `normalize_date_dm` - normalize string to date with DAY before MONTH 175 | * returns: _date_ (nullable) 176 | * `normalize_timestamp_md` - normalize string to timestamp with MONTH before DAY 177 | * returns: _timestamp_ (nullable) 178 | * `normalize_timestamp_dm` - normalize string to timestamp with DAY before MONTH 179 | * returns: _timestamp_ (nullable) 180 | 181 | ## Dimension Utilities 182 | _Functions to generate dimension ("dim") tables as Spark DataFrames._ 183 | ### Methods 184 | __Datetime__ 185 | * `generate_dim_date` - generate Spark DataFrame with various date dimensions 186 | * arguments: 187 | * `spark` - instantiated SparkSession 188 | * `start_year` - starting (minimum) year for dim_date table 189 | * `number_years_out_from_start` - number of years out from `start_year` to generate 190 | * example usage 191 | ```py 192 | from pyspark_utilities.spark_utilities import start_spark 193 | from pyspark_utilities.dimension_utilities import generate_dim_date 194 | 195 | spark = start_spark(env='local') 196 | dim_date_df = generate_dim_date(spark=spark, start_year=1901, number_years_out_from_start=300) 197 | ``` 198 | -------------------------------------------------------------------------------- /pyspark_utilities/__init__.py: -------------------------------------------------------------------------------- 1 | name = "pyspark-utilities" 2 | -------------------------------------------------------------------------------- /pyspark_utilities/dimension_utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from .dim_datetime_utilities import generate_dim_date 2 | -------------------------------------------------------------------------------- /pyspark_utilities/dimension_utilities/dim_datetime_utilities.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pyspark.sql.functions import (col, concat, date_format, datediff, dayofmonth, dayofweek, dayofyear, expr, 3 | last_day, lit, lpad, quarter, regexp_replace, to_date, weekofyear, when) 4 | from ..pandas_udfs.datetime_udfs import pd_is_holiday_usa 5 | 6 | 7 | def generate_dim_date(spark, 
start_year=1901, number_years_out_from_start=300): 8 | """Create `dim_date` table containing various date feature columns. 9 | 10 | Args: 11 | spark (SparkSession): Instantiated SparkSession 12 | start_year (int): starting year for dim_date table. 13 | number_years_out_from_start (int): number out from `start_year` to increment. 14 | 15 | Returns: 16 | Spark DataFrame. 17 | """ 18 | years = [start_year + i for i in range(number_years_out_from_start + 1)] 19 | months = [i for i in range(1, 13)] 20 | days = [i for i in range(1, 32)] 21 | 22 | years_df = spark.createDataFrame(pd.DataFrame({'year': years, 'temp_join_key': '1'})) 23 | months_df = spark.createDataFrame(pd.DataFrame({'month': months, 'temp_join_key': '1'})) 24 | days_df = spark.createDataFrame(pd.DataFrame({'day_of_month': days, 'temp_join_key': '1'})) 25 | 26 | years_months_df = (years_df 27 | .join(months_df, 28 | ['temp_join_key'], 29 | how='inner')) 30 | 31 | years_month_days_df = (years_months_df 32 | .join(days_df, 33 | ['temp_join_key'], 34 | how='inner')) 35 | 36 | date_keys = (years_month_days_df 37 | .withColumn('date', to_date(concat(col('year'), 38 | lpad(col('month'), 2, '0'), 39 | lpad(col('day_of_month'), 2, '0')), 'yyyyMMdd')) 40 | # remove invalid dates 41 | .filter("date IS NOT NULL") 42 | .withColumn('date_key', regexp_replace(col('date').cast('string'), '-', '').cast('integer'))) 43 | 44 | date_features = (date_keys 45 | # get `week` and `quarter` 46 | .withColumn('week', weekofyear(col('date'))) 47 | .withColumn('quarter', quarter(col('date'))) 48 | # get `day_name` and `month_name` 49 | .withColumn('day_name', date_format(col('date'), 'EEEE')) 50 | .withColumn('month_name', date_format(col('date'), 'MMMM')) 51 | # get `date_year`, `date_quarter`, `date_month`, `date_week` 52 | .withColumn('date_week', expr("MIN(date) OVER(PARTITION BY week, year)")) 53 | .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01')) 54 | .withColumn('date_quarter', expr("MIN(date) OVER(PARTITION BY quarter, year)")) 55 | .withColumn('date_year', date_format(col('date'), 'yyyy-01-01')) 56 | # get `day_of_week`, `day_of_quarter`, `day_of_year` 57 | .withColumn('day_of_week', dayofweek(col('date'))) 58 | .withColumn('day_of_quarter', datediff(col('date'), col('date_quarter')) + lit(1)) 59 | .withColumn('day_of_year', dayofyear(col('date'))) 60 | # get `weekend_flag`, `us_holiday_flag`, `business_day_flag`, `leap_year_flag`, 61 | # `month_start_flag`, `month_end_flag` 62 | .withColumn('weekend_flag', when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N')) 63 | .withColumn('us_holiday_flag', pd_is_holiday_usa(col('date').cast('timestamp'))) 64 | .withColumn('us_biz_day_flag', when((col('weekend_flag') == lit('Y')) | 65 | (col('us_holiday_flag') == lit('Y')), 'Y').otherwise('N')) 66 | .withColumn('leap_year_flag', 67 | when(dayofmonth(last_day(concat(col('year'), lit('-02-01')).cast('date'))) == 29, 'Y') 68 | .otherwise('N')) 69 | .withColumn('month_start_flag', when(col('day_of_month') == lit(1), 'Y').otherwise('N')) 70 | .withColumn('month_end_flag', when(col('date') == last_day(col('date')), 'Y').otherwise('N')) 71 | # get `pct_into_month`, `pct_into_quarter`, `pct_into_year` 72 | .withColumn('pct_into_month', 73 | (col('day_of_month') / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)')) 74 | .withColumn('date_quarter_end', 75 | when(col('quarter') == lit(1), concat(col('year'), lit('-03-31'))) 76 | .when(col('quarter') == lit(2), concat(col('year'), lit('-06-30'))) 77 | .when(col('quarter') == lit(3), 
concat(col('year'), lit('-09-30'))) 78 | .when(col('quarter') == lit(4), concat(col('year'), lit('-12-31'))) 79 | .otherwise(None) 80 | .cast('date')) 81 | .withColumn('days_in_quarter', datediff(col('date_quarter_end'), col('date_quarter')) + lit(1)) 82 | .withColumn('pct_into_quarter', 83 | (col('day_of_quarter') / col('days_in_quarter')).cast('decimal(7, 6)')) 84 | .withColumn('pct_into_year', 85 | (col('day_of_year') / when(col('leap_year_flag') == lit('Y'), 366.0).otherwise(365.0)) 86 | .cast('decimal(7, 6)')) 87 | # get seasons 88 | .withColumn('season_northern', 89 | when(col('month').isin(12, 1, 2), 'Winter') 90 | .when(col('month').isin(3, 4, 5), 'Spring') 91 | .when(col('month').isin(6, 7, 8), 'Summer') 92 | .when(col('month').isin(9, 10, 11), 'Fall') 93 | .otherwise('UNKNOWN')) 94 | .withColumn('season_southern', 95 | when(col('month').isin(6, 7, 8), 'Winter') 96 | .when(col('month').isin(9, 10, 11), 'Spring') 97 | .when(col('month').isin(12, 1, 2), 'Summer') 98 | .when(col('month').isin(3, 4, 5), 'Fall') 99 | .otherwise('UNKNOWN'))) 100 | 101 | dim_date = (date_features 102 | .sort('date') 103 | .select(['date_key', 104 | 'date', 105 | 'date_week', 106 | 'date_month', 107 | 'date_quarter', 108 | 'date_year', 109 | 'day_of_week', 110 | 'day_of_month', 111 | 'day_of_quarter', 112 | 'day_of_year', 113 | 'week', 114 | 'month', 115 | 'quarter', 116 | 'year', 117 | 'days_in_quarter', 118 | 'day_name', 119 | 'month_name', 120 | 'season_northern', 121 | 'season_southern', 122 | 'weekend_flag', 123 | 'us_holiday_flag', 124 | 'us_biz_day_flag', 125 | 'month_start_flag', 126 | 'month_end_flag', 127 | 'leap_year_flag', 128 | 'pct_into_month', 129 | 'pct_into_quarter', 130 | 'pct_into_year'])) 131 | return dim_date 132 | 133 | 134 | # TODO: `generate_dim_time` (seconds) 135 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/__init__.py: -------------------------------------------------------------------------------- 1 | from .datetime_udfs import (pd_is_holiday_usa, pd_normalize_date_md, pd_normalize_date_dm, pd_normalize_timestamp_md, 2 | pd_normalize_timestamp_dm) 3 | from .fuzzy_match_udfs import (pd_fuzz_ratio, pd_fuzz_partial_ratio, pd_fuzz_token_set_ratio, 4 | pd_fuzz_partial_token_set_ratio, pd_fuzz_token_sort_ratio, 5 | pd_fuzz_partial_token_sort_ratio, pd_damerau_levenshtein_distance, pd_hamming_distance, 6 | pd_jaro_distance, pd_jaro_winkler, pd_match_rating_codex, pd_match_rating_comparison, 7 | pd_metaphone, pd_nysiis, pd_porter_stem) 8 | from .general_udfs import (pd_clean_string, pd_empty_string_to_null, pd_map_booleans_ynu, pd_generate_uuid, 9 | pd_string_to_double_pfd, pd_string_to_double_cfd) 10 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/datetime_udfs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pyspark.sql.functions import pandas_udf, PandasUDFType 3 | from pyspark.sql.types import DateType, StringType, TimestampType 4 | from .datetime_udfs_base_functions import is_holiday_usa, to_datetime_md, to_datetime_dm 5 | 6 | 7 | # noinspection PyArgumentList 8 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 9 | def pd_is_holiday_usa(target_col): 10 | """ Apply `is_holiday_usa` to Spark column. 11 | 12 | Args: 13 | target_col (Spark Column): containing dates or timestamps to check holiday status. 14 | 15 | Returns: 16 | Spark Column (StringType): `Y`, `N`, or `Unknown`. 
17 | """ 18 | return pd.Series(target_col.apply(lambda x: is_holiday_usa(x))) 19 | 20 | 21 | # noinspection PyArgumentList 22 | @pandas_udf(DateType(), PandasUDFType.SCALAR) 23 | def pd_normalize_date_md(target_col): 24 | """ Convert column with dates as strings to dates (MONTH BEFORE DAY). 25 | 26 | Args: 27 | target_col (Spark Column): containing dates as strings. 28 | 29 | Returns: 30 | Spark Column (DateType): containing dates extracted from strings. 31 | """ 32 | return pd.Series(target_col.apply(lambda x: to_datetime_md(x))) 33 | 34 | 35 | # noinspection PyArgumentList 36 | @pandas_udf(DateType(), PandasUDFType.SCALAR) 37 | def pd_normalize_date_dm(target_col): 38 | """ Convert column with dates as strings to dates (DAY BEFORE MONTH). 39 | 40 | Args: 41 | target_col (Spark Column): containing dates as strings. 42 | 43 | Returns: 44 | Spark Column (DateType): containing dates extracted from strings. 45 | """ 46 | return pd.Series(target_col.apply(lambda x: to_datetime_dm(x))) 47 | 48 | 49 | # noinspection PyArgumentList 50 | @pandas_udf(TimestampType(), PandasUDFType.SCALAR) 51 | def pd_normalize_timestamp_md(target_col): 52 | """ Convert column with timestamps as strings to timestamps (MONTH BEFORE DAY). 53 | 54 | Args: 55 | target_col (Spark Column): containing dates as strings. 56 | 57 | Returns: 58 | Spark Column (TimestampType): containing dates extracted from strings. 59 | """ 60 | return pd.Series(target_col.apply(lambda x: to_datetime_md(x))) 61 | 62 | 63 | # noinspection PyArgumentList 64 | @pandas_udf(TimestampType(), PandasUDFType.SCALAR) 65 | def pd_normalize_timestamp_dm(target_col): 66 | """ Convert column with timestamps as strings to timestamps (DAY BEFORE MONTH). 67 | 68 | Args: 69 | target_col (Spark Column): containing dates as strings. 70 | 71 | Returns: 72 | Spark Column (TimestampType): containing dates extracted from strings. 73 | """ 74 | return pd.Series(target_col.apply(lambda x: to_datetime_dm(x))) 75 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/datetime_udfs_base_functions.py: -------------------------------------------------------------------------------- 1 | import holidays 2 | import pandas as pd 3 | from .general_udfs_base_functions import clean_string 4 | 5 | 6 | def is_holiday_usa(dt): 7 | """ Check whether a given date is a US holiday. 8 | 9 | Args: 10 | dt (Date, Timestamp): date to check for holiday status. 11 | 12 | Returns: 13 | str: `Y`, `N`, or `Unknown` 14 | """ 15 | if dt is None: 16 | return 'Unknown' 17 | elif pd.to_datetime(dt) in holidays.US(): 18 | return 'Y' 19 | else: 20 | return 'N' 21 | 22 | 23 | def to_datetime_md(dt_str): 24 | """ Apply `pd.to_datetime` with inferring datetime and null handling (MONTH comes BEFORE DAY). 25 | 26 | Args: 27 | dt_str (str): target str to parse to datetime. 28 | 29 | Returns: 30 | Timestamp: parsed from string. 31 | """ 32 | if clean_string(dt_str) is None: 33 | return None 34 | else: 35 | return pd.to_datetime(dt_str, infer_datetime_format=True, dayfirst=False) 36 | 37 | 38 | def to_datetime_dm(dt_str): 39 | """ Apply `pd.to_datetime` with inferring datetime and null handling (DAY comes BEFORE MONTH). 40 | 41 | Args: 42 | dt_str (str): target str to parse to datetime. 43 | 44 | Returns: 45 | Timestamp: parsed from string. 
46 | """ 47 | if clean_string(dt_str) is None: 48 | return None 49 | else: 50 | return pd.to_datetime(dt_str, infer_datetime_format=True, dayfirst=True) 51 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/fuzzy_match_udfs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from fuzzywuzzy import fuzz 3 | import jellyfish 4 | from pyspark.sql.functions import pandas_udf, PandasUDFType 5 | from pyspark.sql.types import BooleanType, DoubleType, IntegerType, StringType 6 | 7 | 8 | # noinspection PyArgumentList 9 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 10 | def pd_fuzz_ratio(col1, col2): 11 | """ Calculate "simple" ratio (`fuzz.ratio`) between two text columns. 12 | 13 | Args: 14 | col1 (Spark Column): 1st text column 15 | col2 (Spark Column): 2nd text column 16 | 17 | Returns: 18 | Spark Column (IntegerType): result of `fuzz.ratio` calculation. 19 | """ 20 | return pd.Series(map(fuzz.ratio, col1.astype(str), col2.astype(str))) 21 | 22 | 23 | # noinspection PyArgumentList 24 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 25 | def pd_fuzz_partial_ratio(col1, col2): 26 | """ Calculate "partial" ratio (`fuzz.partial_ratio`) between two text columns. 27 | 28 | Args: 29 | col1 (Spark Column): 1st text column 30 | col2 (Spark Column): 2nd text column 31 | 32 | Returns: 33 | Spark Column (IntegerType): result of `fuzz.partial_ratio` calculation. 34 | """ 35 | return pd.Series(map(fuzz.partial_ratio, col1.astype(str), col2.astype(str))) 36 | 37 | 38 | # noinspection PyArgumentList 39 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 40 | def pd_fuzz_token_set_ratio(col1, col2): 41 | """ Calculate "token set" ratio (`fuzz.token_set_ratio`) between two text columns. 42 | 43 | Args: 44 | col1 (Spark Column): 1st text column 45 | col2 (Spark Column): 2nd text column 46 | 47 | Returns: 48 | Spark Column (IntegerType): result of `fuzz.token_set_ratio` calculation. 49 | """ 50 | return pd.Series(map(fuzz.token_set_ratio, col1.astype(str), col2.astype(str))) 51 | 52 | 53 | # noinspection PyArgumentList 54 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 55 | def pd_fuzz_partial_token_set_ratio(col1, col2): 56 | """ Calculate "partial token set" ratio (`fuzz.partial_token_set_ratio`) between two text columns. 57 | 58 | Args: 59 | col1 (Spark Column): 1st text column 60 | col2 (Spark Column): 2nd text column 61 | 62 | Returns: 63 | Spark Column (IntegerType): result of `fuzz.partial_token_set_ratio` calculation. 64 | """ 65 | return pd.Series(map(fuzz.partial_token_set_ratio, col1.astype(str), col2.astype(str))) 66 | 67 | 68 | # noinspection PyArgumentList 69 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 70 | def pd_fuzz_token_sort_ratio(col1, col2): 71 | """ Calculate "token sort" ratio (`fuzz.token_sort_ratio`) between two text columns. 72 | 73 | Args: 74 | col1 (Spark Column): 1st text column 75 | col2 (Spark Column): 2nd text column 76 | 77 | Returns: 78 | Spark Column (IntegerType): result of `fuzz.token_sort_ratio` calculation. 79 | """ 80 | return pd.Series(map(fuzz.token_sort_ratio, col1.astype(str), col2.astype(str))) 81 | 82 | 83 | # noinspection PyArgumentList 84 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 85 | def pd_fuzz_partial_token_sort_ratio(col1, col2): 86 | """ Calculate "partial token sort" ratio (`fuzz.partial_token_sort_ratio`) between two text columns. 
87 | 88 | Args: 89 | col1 (Spark Column): 1st text column 90 | col2 (Spark Column): 2nd text column 91 | 92 | Returns: 93 | Spark Column (IntegerType): result of `fuzz.partial_token_sort_ratio` calculation. 94 | """ 95 | return pd.Series(map(fuzz.partial_token_sort_ratio, col1.astype(str), col2.astype(str))) 96 | 97 | # TODO: `process` function 98 | 99 | 100 | # noinspection PyArgumentList 101 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 102 | def pd_damerau_levenshtein_distance(col1, col2): 103 | """ Calculate Damerau Levenshtein distance between two text columns. 104 | 105 | Args: 106 | col1 (Spark Column): 1st text column 107 | col2 (Spark Column): 2nd text column 108 | 109 | Returns: 110 | Spark Column (IntegerType): with Damerau Levenshtein distances. 111 | """ 112 | return pd.Series(map(jellyfish.damerau_levenshtein_distance, col1.astype(str), col2.astype(str))) 113 | 114 | 115 | # jellyfish.hamming_distance 116 | # noinspection PyArgumentList 117 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 118 | def pd_hamming_distance(col1, col2): 119 | """ Calculate hamming distance between two text columns. 120 | 121 | Args: 122 | col1 (Spark Column): 1st text column 123 | col2 (Spark Column): 2nd text column 124 | 125 | Returns: 126 | Spark Column (IntegerType): with hamming distances. 127 | """ 128 | return pd.Series(map(jellyfish.hamming_distance, col1.astype(str), col2.astype(str))) 129 | 130 | 131 | # noinspection PyArgumentList 132 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 133 | def pd_jaro_distance(col1, col2): 134 | """ Calculate `jaro_distance` between two text columns. 135 | 136 | Args: 137 | col1 (Spark Column): 1st text column 138 | col2 (Spark Column): 2nd text column 139 | 140 | Returns: 141 | Spark Column (DoubleType): with Jaro distances. 142 | """ 143 | return pd.Series(map(jellyfish.jaro_distance, col1.astype(str), col2.astype(str))) 144 | 145 | 146 | # noinspection PyArgumentList 147 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 148 | def pd_jaro_winkler(col1, col2): 149 | """ Calculate `jellyfish.jaro_winkler` between two text columns. 150 | 151 | Args: 152 | col1 (Spark Column): 1st text column 153 | col2 (Spark Column): 2nd text column 154 | 155 | Returns: 156 | Spark Column (DoubleType): with Jaro Winkler scores. 157 | """ 158 | return pd.Series(map(jellyfish.jaro_winkler, col1.astype(str), col2.astype(str))) 159 | 160 | 161 | # noinspection PyArgumentList 162 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 163 | def pd_match_rating_codex(target_col): 164 | """ Apply `jellyfish.match_rating_codex` to text column. 165 | 166 | Args: 167 | target_col (Spark Column): text column. 168 | 169 | Returns: 170 | Spark Column (StringType): with match rating codex. 171 | """ 172 | return pd.Series(target_col.apply(lambda x: jellyfish.match_rating_codex(str(x)))) 173 | 174 | 175 | # noinspection PyArgumentList 176 | @pandas_udf(BooleanType(), PandasUDFType.SCALAR) 177 | def pd_match_rating_comparison(col1, col2): 178 | """ Calculate `jellyfish.match_rating_comparison` between two text columns. 179 | 180 | Args: 181 | col1 (Spark Column): 1st text column 182 | col2 (Spark Column): 2nd text column 183 | 184 | Returns: 185 | Spark Column (BooleanType, nullable): True / False / None matching results. 
186 | """ 187 | return pd.Series(map(jellyfish.match_rating_comparison, col1.astype(str), col2.astype(str))) 188 | 189 | 190 | # noinspection PyArgumentList 191 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 192 | def pd_metaphone(target_col): 193 | """ Apply `jellyfish.metaphone` to text column. 194 | 195 | Args: 196 | target_col (Spark Column): text column. 197 | 198 | Returns: 199 | Spark Column (StringType): metaphone encodings. 200 | """ 201 | return pd.Series(target_col.apply(lambda x: jellyfish.metaphone(str(x)))) 202 | 203 | 204 | # noinspection PyArgumentList 205 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 206 | def pd_nysiis(target_col): 207 | """ Apply `jellyfish.nysiis` to text column. 208 | 209 | Args: 210 | target_col (Spark Column): text column. 211 | 212 | Returns: 213 | Spark Column (StringType): NYSIIS encodings. 214 | """ 215 | return pd.Series(target_col.apply(lambda x: jellyfish.nysiis(str(x)))) 216 | 217 | 218 | # noinspection PyArgumentList 219 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 220 | def pd_porter_stem(target_col): 221 | """ Apply `jellyfish.porter_stem` to text column. 222 | 223 | Args: 224 | target_col (Spark Column): text column. 225 | 226 | Returns: 227 | Spark Column (StringType): porter stems. 228 | """ 229 | return pd.Series(target_col.apply(lambda x: jellyfish.porter_stem(str(x)))) 230 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/general_udfs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from uuid import uuid4 3 | from pyspark.sql.functions import pandas_udf, PandasUDFType 4 | from pyspark.sql.types import DoubleType, StringType 5 | from .general_udfs_base_functions import (clean_string, empty_string_to_null, map_booleans_ynu, 6 | string_to_double_pfd, string_to_double_cfd) 7 | 8 | 9 | # noinspection PyArgumentList 10 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 11 | def pd_clean_string(target_col): 12 | """ Apply `clean_string` over Spark Column. 13 | 14 | Args: 15 | target_col (Spark Column): containing strings to clean. 16 | 17 | Returns: 18 | Spark Column (StringType): cleaned version of input strings. 19 | """ 20 | return pd.Series(target_col.apply(lambda x: clean_string(x))) 21 | 22 | 23 | # noinspection PyArgumentList 24 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 25 | def pd_empty_string_to_null(target_col): 26 | """ Apply `empty_string_to_null` to Spark Column. 27 | 28 | Args: 29 | target_col (Spark Column): containing strings to convert empties --> nulls 30 | 31 | Returns: 32 | Spark Column: where empty strings replaced with nulls. 33 | """ 34 | return pd.Series(target_col.apply(lambda x: empty_string_to_null(x))) 35 | 36 | 37 | # noinspection PyArgumentList 38 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 39 | def pd_generate_uuid(target_col): 40 | """ Generate UUID v4. 41 | 42 | Args: 43 | target_col (Spark Column): any column, not actually used... Pandas UDFs require input column. 44 | 45 | Returns: 46 | Spark Column (StringType): UUID v4. 47 | """ 48 | return pd.Series(target_col.apply(lambda x: uuid4().__str__())) 49 | 50 | 51 | # noinspection PyArgumentList 52 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 53 | def pd_map_booleans_ynu(target_col): 54 | """ Apply `map_booleans_ynu` over Spark Column. 55 | 56 | Args: 57 | target_col (Spark Column): containing values to check if they represent booleans / indicators. 
58 | 59 | Returns: 60 | Spark Column (StringType): `Y`, `N`, `Unknown`. 61 | """ 62 | return pd.Series(target_col.apply(lambda x: map_booleans_ynu(x))) 63 | 64 | 65 | # noinspection PyArgumentList 66 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 67 | def pd_string_to_double_pfd(target_col): 68 | """ Apply `string_to_double_pfd` to Spark Column (PERIOD for DECIMAL place). 69 | 70 | Args: 71 | target_col (Spark Column): containing double values as strings. 72 | 73 | Returns: 74 | Spark Column (DoubleType): doubles converted from strings. 75 | """ 76 | return pd.Series(target_col.apply(lambda x: string_to_double_pfd(x))) 77 | 78 | 79 | # noinspection PyArgumentList 80 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 81 | def pd_string_to_double_cfd(target_col): 82 | """ Apply `string_to_double_cfd` to Spark Column (COMMA for DECIMAL place). 83 | 84 | Args: 85 | target_col (Spark Column): containing double values as strings. 86 | 87 | Returns: 88 | Spark Column (DoubleType): doubles converted from strings. 89 | """ 90 | return pd.Series(target_col.apply(lambda x: string_to_double_cfd(x))) 91 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/general_udfs_base_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def clean_string(target_str): 5 | """ Remove ISO control characters and trim input string. Returns None if cleaned string is empty. 6 | 7 | Args: 8 | target_str (str): string to be cleaned. 9 | 10 | Returns: 11 | str: cleaned input string. 12 | """ 13 | if target_str is None: 14 | return None 15 | else: 16 | string_clean = re.sub(r'[\x00-\x1F]+', '', target_str).strip() 17 | if string_clean == '': 18 | return None 19 | else: 20 | return string_clean 21 | 22 | 23 | def empty_string_to_null(target_str): 24 | """ Check if input string is empty, and return null if so (otherwise return input string). 25 | 26 | Args: 27 | target_str (str): string to check for emptiness. 28 | 29 | Returns: 30 | str: null if input string is empty else input string. 31 | """ 32 | if target_str is None: 33 | return None 34 | elif re.sub(r'[\x00-\x1F]+', '', target_str).strip() == '': 35 | return None 36 | else: 37 | return target_str 38 | 39 | 40 | def map_booleans_ynu(target_val): 41 | """ Map boolean values to `Y`, `N`, `Unknown`. 42 | 43 | Args: 44 | target_val (any): value to check if it represents a boolean / indicator. 45 | 46 | Returns: 47 | str: `Y`, `N`, `Unknown` 48 | """ 49 | if target_val in [False, 0, '0', 'f', 'F', 'false', 'False', 'FALSE', 'n', 'N', 'no', 'No', 'NO']: 50 | return 'N' 51 | elif target_val in [True, 1, '1', 't', 'T', 'true', 'True', 'TRUE', 'y', 'Y', 'yes', 'Yes', 'YES']: 52 | return 'Y' 53 | else: 54 | return 'Unknown' 55 | 56 | 57 | def string_to_double_pfd(target_str): 58 | return string_to_float(target_str, comma_for_decimal=False) 59 | 60 | 61 | def string_to_double_cfd(target_str): 62 | return string_to_float(target_str, comma_for_decimal=True) 63 | 64 | 65 | def string_to_float(target_str, comma_for_decimal=False): 66 | """ Convert string to float. 67 | 68 | Args: 69 | target_str (str): target str to convert to double. 70 | comma_for_decimal (bool): whether commas represent decimal in passed string. 71 | 72 | Returns: 73 | float: converted from input string. 
74 | """ 75 | if not string_is_number(target_str): 76 | return None 77 | else: 78 | if comma_for_decimal: 79 | string_clean = re.sub(',', '.', re.sub('[^0-9,-]', '', target_str.strip())) 80 | else: 81 | string_clean = re.sub('[^0-9.-]', '', target_str.strip()) 82 | number_match = extract_number_from_string(string_clean) 83 | if re.match('\\(.*\\)', target_str): 84 | return number_match * -1.0 85 | else: 86 | return number_match 87 | 88 | 89 | def extract_number_from_string(target_str): 90 | """Extract number from string. 91 | 92 | Args: 93 | target_str (str): containing number in string format. 94 | 95 | Returns: 96 | float: parsed from string. 97 | """ 98 | number_pattern = '(\\-?[0-9]+(\\.[0-9]+)?)' 99 | matches = re.search(number_pattern, target_str) 100 | if matches: 101 | return float(matches.group(0)) 102 | else: 103 | raise ValueError(f"ERROR: Bad number passing. Could not parse {target_str}.") 104 | 105 | 106 | def string_is_number(target_str): 107 | """ Check whether passed string can accurately be converted to a number. 108 | 109 | Args: 110 | target_str (str): string to validate if parsable to number. 111 | 112 | Returns: 113 | bool 114 | """ 115 | if target_str is None: 116 | return False 117 | else: 118 | return bool(re.fullmatch('^\\d+$', re.sub('[^0-9]', '', target_str))) 119 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/general_udfs_base_functions_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from .general_udfs_base_functions import (clean_string, empty_string_to_null, map_booleans_ynu, 3 | string_to_double_pfd, string_to_double_cfd) 4 | 5 | 6 | class TestGeneralUDFBaseFunctions(unittest.TestCase): 7 | 8 | def test_clean_string(self): 9 | self.assertIsNone(clean_string('\u0000')) 10 | self.assertIsNone(clean_string(None)) 11 | self.assertIsNone(clean_string('')) 12 | self.assertEqual(clean_string('abc '), 'abc') 13 | self.assertEqual(clean_string('abc 123\u0000'), 'abc 123') 14 | 15 | def test_empty_string_to_null(self): 16 | self.assertIsNone(empty_string_to_null('\u0000')) 17 | # noinspection PyTypeChecker 18 | self.assertIsNone(empty_string_to_null(None)) 19 | self.assertIsNone(empty_string_to_null('')) 20 | self.assertEqual(empty_string_to_null('abc '), 'abc ') 21 | self.assertEqual(empty_string_to_null('abc 123\u0000'), 'abc 123\u0000') 22 | 23 | def test_map_booleans_ynu(self): 24 | # `N` 25 | self.assertEqual(map_booleans_ynu(False), 'N') 26 | self.assertEqual(map_booleans_ynu(0), 'N') 27 | self.assertEqual(map_booleans_ynu('0'), 'N') 28 | self.assertEqual(map_booleans_ynu('f'), 'N') 29 | self.assertEqual(map_booleans_ynu('F'), 'N') 30 | self.assertEqual(map_booleans_ynu('false'), 'N') 31 | self.assertEqual(map_booleans_ynu('False'), 'N') 32 | self.assertEqual(map_booleans_ynu('FALSE'), 'N') 33 | self.assertEqual(map_booleans_ynu('n'), 'N') 34 | self.assertEqual(map_booleans_ynu('N'), 'N') 35 | self.assertEqual(map_booleans_ynu('no'), 'N') 36 | self.assertEqual(map_booleans_ynu('No'), 'N') 37 | self.assertEqual(map_booleans_ynu('NO'), 'N') 38 | # `Y` 39 | self.assertEqual(map_booleans_ynu(True), 'Y') 40 | self.assertEqual(map_booleans_ynu(1), 'Y') 41 | self.assertEqual(map_booleans_ynu('1'), 'Y') 42 | self.assertEqual(map_booleans_ynu('t'), 'Y') 43 | self.assertEqual(map_booleans_ynu('T'), 'Y') 44 | self.assertEqual(map_booleans_ynu('true'), 'Y') 45 | self.assertEqual(map_booleans_ynu('True'), 'Y') 46 | 
self.assertEqual(map_booleans_ynu('TRUE'), 'Y') 47 | self.assertEqual(map_booleans_ynu('y'), 'Y') 48 | self.assertEqual(map_booleans_ynu('Y'), 'Y') 49 | self.assertEqual(map_booleans_ynu('yes'), 'Y') 50 | self.assertEqual(map_booleans_ynu('Yes'), 'Y') 51 | self.assertEqual(map_booleans_ynu('YES'), 'Y') 52 | # `Unknown` 53 | self.assertEqual(map_booleans_ynu(''), 'Unknown') 54 | self.assertEqual(map_booleans_ynu(' '), 'Unknown') 55 | self.assertEqual(map_booleans_ynu(3), 'Unknown') 56 | self.assertEqual(map_booleans_ynu(3.0), 'Unknown') 57 | self.assertEqual(map_booleans_ynu(None), 'Unknown') 58 | self.assertEqual(map_booleans_ynu('foo'), 'Unknown') 59 | self.assertEqual(map_booleans_ynu('BAR'), 'Unknown') 60 | 61 | def test_string_to_double(self): 62 | self.assertEqual(string_to_double_pfd("100"), 100.00) 63 | self.assertEqual(string_to_double_pfd("100"), 100.00) 64 | self.assertEqual(string_to_double_pfd("-100"), -100.00) 65 | self.assertEqual(string_to_double_pfd("(100)"), -100.00) 66 | self.assertEqual(string_to_double_pfd("$100"), 100.00) 67 | self.assertEqual(string_to_double_pfd("-$100"), -100.00) 68 | self.assertEqual(string_to_double_pfd("($100)"), -100.00) 69 | self.assertEqual(string_to_double_pfd("100%"), 100.00) 70 | self.assertEqual(string_to_double_pfd("-100%"), -100.00) 71 | self.assertEqual(string_to_double_pfd("(100%)"), -100.00) 72 | self.assertEqual(string_to_double_pfd("100.00"), 100.00) 73 | self.assertEqual(string_to_double_pfd("-100.00"), -100.00) 74 | self.assertEqual(string_to_double_pfd("(100.00)"), -100.00) 75 | self.assertEqual(string_to_double_pfd("$100.00"), 100.00) 76 | self.assertEqual(string_to_double_pfd("-$100.00"), -100.00) 77 | self.assertEqual(string_to_double_pfd("($100.00)"), -100.00) 78 | self.assertEqual(string_to_double_pfd("100.00%"), 100.00) 79 | self.assertEqual(string_to_double_pfd("-100.00%"), -100.00) 80 | self.assertEqual(string_to_double_pfd("(100.00%)"), -100.00) 81 | # 82 | self.assertEqual(string_to_double_pfd("100 Apples"), 100.00) 83 | self.assertEqual(string_to_double_pfd("$3.14/lbs."), 3.14) 84 | # 85 | self.assertEqual(string_to_double_cfd("4 294 967 295,000"), 4294967295.00) 86 | self.assertEqual(string_to_double_cfd("4 294 967.295,000"), 4294967295.00) 87 | self.assertEqual(string_to_double_cfd("4.294.967.295,000"), 4294967295.00) 88 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_udfs/__init__.py: -------------------------------------------------------------------------------- 1 | from .spark_udfs import SparkUDFs 2 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_udfs/spark_udfs.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import Column, SparkSession 2 | # noinspection PyUnresolvedReferences, PyProtectedMember 3 | from pyspark.sql.column import _to_seq, _to_java_column 4 | 5 | 6 | class SparkUDFs: 7 | def __init__(self, spark): 8 | """ 9 | 10 | Args: 11 | spark (SparkSession): instantiated SparkSession. 12 | """ 13 | self.spark = spark 14 | 15 | def clean_string(self, target_col): 16 | """ Remove Java ISO control characters from, and trim, string. 17 | 18 | Args: 19 | target_col (Spark Column): target column to be cleaned. 20 | 21 | Returns: 22 | Spark Column (StringType): cleaned version of input column. 
23 | """ 24 | sc = self.spark.sparkContext 25 | # noinspection PyUnresolvedReferences, PyProtectedMember 26 | _clean_string = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.cleanString_UDF() 27 | return Column(_clean_string.apply(_to_seq(sc, [target_col], _to_java_column))) 28 | 29 | def empty_string_to_null(self, target_col): 30 | """ Convert empty strings to nulls. 31 | 32 | Args: 33 | target_col (Spark Column): target column to convert. 34 | 35 | Returns: 36 | Spark Column (StringType): target column with empty values converted to nulls. 37 | """ 38 | sc = self.spark.sparkContext 39 | # noinspection PyUnresolvedReferences, PyProtectedMember 40 | _empty_string_to_null = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.emptyStringToNull_UDF() 41 | return Column(_empty_string_to_null.apply(_to_seq(sc, [target_col], _to_java_column))) 42 | 43 | def generate_uuid(self): 44 | """ Generate V4 UUID. 45 | 46 | Returns: 47 | Spark Column (StringType): containing v4 UUIDs. 48 | """ 49 | sc = self.spark.sparkContext 50 | # noinspection PyUnresolvedReferences, PyProtectedMember 51 | _generate_uuid = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.generateUUID_UDF() 52 | return Column(_generate_uuid.apply(_to_seq(sc, [], _to_java_column))) 53 | 54 | def map_booleans_ynu(self, target_col): 55 | """ Map boolean values to `Y`, `N`, `Unknown` 56 | 57 | Args: 58 | target_col (Spark Column): target column containing boolean values to map. 59 | 60 | Returns: 61 | Spark Column (StringType): mapped values (`Y`, `N`, `Unknown`) 62 | """ 63 | sc = self.spark.sparkContext 64 | # noinspection PyUnresolvedReferences, PyProtectedMember 65 | _map_booleans_ynu = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.mapBooleansYNU_UDF() 66 | return Column(_map_booleans_ynu.apply(_to_seq(sc, [target_col], _to_java_column))) 67 | 68 | def string_to_double_pfd(self, target_col): 69 | """ Convert string to doubles where period represents decimal places (`pfd`). 70 | 71 | Args: 72 | target_col (Spark Column): containing double values in string format. 73 | 74 | Returns: 75 | Spark Column (DoubleType): containing double values converted from strings. 76 | """ 77 | sc = self.spark.sparkContext 78 | # noinspection PyUnresolvedReferences, PyProtectedMember 79 | _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoublePeriodForDecimal_UDF() 80 | return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column))) 81 | 82 | def string_to_double_cfd(self, target_col): 83 | """ Convert string to doubles where commas represents decimal places (`cfd`). 84 | 85 | Args: 86 | target_col (Spark Column): containing double values in string format. 87 | 88 | Returns: 89 | Spark Column (DoubleType): containing double values converted from strings. 90 | """ 91 | sc = self.spark.sparkContext 92 | # noinspection PyUnresolvedReferences, PyProtectedMember 93 | _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoubleCommaForDecimal_UDF() 94 | return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column))) 95 | 96 | def string_is_number(self, target_col): 97 | """ Return boolean if string can be converted to a number. 98 | 99 | Args: 100 | target_col (Spark Column): containing string to check for convertability to number. 101 | 102 | Returns: 103 | Spark Column (BooleanType): whether string can converted to a number. 
104 | """ 105 | sc = self.spark.sparkContext 106 | # noinspection PyUnresolvedReferences, PyProtectedMember 107 | _string_is_number = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringIsNumber_UDF() 108 | return Column(_string_is_number.apply(_to_seq(sc, [target_col], _to_java_column))) 109 | 110 | def normalize_date_md(self, target_col): 111 | """ Convert string to date where MONTH is BEFORE DAY. 112 | 113 | Args: 114 | target_col (Spark Column): containing strings representing dates. 115 | 116 | Returns: 117 | Spark Column (DateType): containing dates converted from strings. 118 | """ 119 | sc = self.spark.sparkContext 120 | # noinspection PyUnresolvedReferences, PyProtectedMember 121 | _normalize_date_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateMD_UDF() 122 | return Column(_normalize_date_md.apply(_to_seq(sc, [target_col], _to_java_column))) 123 | 124 | def normalize_date_dm(self, target_col): 125 | """ Convert string to date where DAY is BEFORE MONTH. 126 | 127 | Args: 128 | target_col (Spark Column): containing strings representing dates. 129 | 130 | Returns: 131 | Spark Column (DateType): containing dates converted from strings. 132 | """ 133 | sc = self.spark.sparkContext 134 | # noinspection PyUnresolvedReferences, PyProtectedMember 135 | _normalize_date_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateDM_UDF() 136 | return Column(_normalize_date_dm.apply(_to_seq(sc, [target_col], _to_java_column))) 137 | 138 | def normalize_timestamp_md(self, target_col): 139 | """ Convert string to timestamp where MONTH is BEFORE DAY. 140 | 141 | Args: 142 | target_col (Spark Column): containing strings representing timestamps. 143 | 144 | Returns: 145 | Spark Column (TimestampType): containing timestamps converted from strings. 146 | """ 147 | sc = self.spark.sparkContext 148 | # noinspection PyUnresolvedReferences, PyProtectedMember 149 | _normalize_timestamp_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampMD_UDF() 150 | return Column(_normalize_timestamp_md.apply(_to_seq(sc, [target_col], _to_java_column))) 151 | 152 | def normalize_timestamp_dm(self, target_col): 153 | """ Convert string to timestamp where DAY is BEFORE MONTH. 154 | 155 | Args: 156 | target_col (Spark Column): containing strings representing timestamps. 157 | 158 | Returns: 159 | Spark Column (TimestampType): containing timestamps converted from strings. 
160 | """ 161 | sc = self.spark.sparkContext 162 | # noinspection PyUnresolvedReferences, PyProtectedMember 163 | _normalize_timestamp_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampDM_UDF() 164 | return Column(_normalize_timestamp_dm.apply(_to_seq(sc, [target_col], _to_java_column))) 165 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from .spark_utilities import start_spark 2 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_utilities/spark_utilities.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from os.path import expanduser 3 | from warnings import warn 4 | 5 | from pyspark import SparkConf 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | def start_spark(config=SparkConf(), 10 | app_name=None, 11 | env='cluster', 12 | enable_hive=False, 13 | source_aws_credentials_file=False, 14 | aws_profile=None): 15 | """Instantiate SparkSession. 16 | 17 | Args: 18 | config (SparkConf): SparkConf() with set parameters (optional). 19 | app_name (str): Name of Spark application (optional). 20 | env (str): Where Spark application is running (required). Known values: `local`, `cluster`. 21 | enable_hive (bool): If `True`: adds Hive support via `enableHiveSupport()` 22 | source_aws_credentials_file (bool): Whether to source AWS credentials file. 23 | aws_profile (str): Name of profile to use for interacting with AWS services. Only used if `env` is `local`. 24 | 25 | Returns: 26 | Instantiated SparkSession. 27 | """ 28 | # validate inputs 29 | ok_envs = ['local', 'cluster'] 30 | ok_envs_str = ', '.join(['`' + e + '`' for e in ok_envs]) 31 | assert env in ok_envs, f'Invalid value passed to `env` argument: `{env}`. Acceptable values: {ok_envs_str}.' 32 | 33 | # start SparkSession builder 34 | if env == 'local': 35 | if app_name is None: 36 | app_name = 'some_app' 37 | session_builder = (SparkSession 38 | .builder 39 | .master('local') 40 | .config(conf=config) 41 | .appName(app_name)) 42 | else: 43 | session_builder = SparkSession.builder.config(conf=config) 44 | 45 | # enable Hive support 46 | if enable_hive: 47 | session_builder = session_builder.enableHiveSupport() 48 | 49 | # instantiate SparkSession 50 | spark = session_builder.getOrCreate() 51 | 52 | # get credentials for AWS profile when running Spark locally 53 | if source_aws_credentials_file: 54 | if aws_profile is None: 55 | warn("`aws_profile` is None with `source_aws_credentials_file` set to true. 
Using `default` AWS profile.") 56 | aws_profile = 'default' 57 | cfp = configparser.ConfigParser() 58 | cfp.read(expanduser("~/.aws/credentials")) 59 | access_id = cfp.get(aws_profile, "aws_access_key_id") 60 | access_key = cfp.get(aws_profile, "aws_secret_access_key") 61 | # noinspection PyProtectedMember, PyUnresolvedReferences 62 | hadoop_conf = spark._jsc.hadoopConfiguration() 63 | hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 64 | hadoop_conf.set("fs.s3a.access.key", access_id) 65 | hadoop_conf.set("fs.s3a.secret.key", access_key) 66 | 67 | return spark 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | holidays>=0.9.10 2 | fuzzywuzzy>=0.17.0 3 | jellyfish>=0.7.2 4 | numpy==1.14.5 5 | pandas>=0.24.0 6 | pyarrow==0.13.0 7 | pyspark>=2.4.0 8 | python-Levenshtein>=0.12.0 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="pyspark-utilities", 8 | version="0.0.1", 9 | author="Sam Zakalik", 10 | description="ETL focused utilities library for PySpark", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | url="https://github.com/zaksamalik/pyspark-utilities", 14 | packages=setuptools.find_packages(), 15 | install_requires=['holidays>=0.9.10', 16 | 'fuzzywuzzy>=0.17.0', 17 | 'jellyfish>=0.7.2', 18 | 'numpy==1.14.5', 19 | 'pandas>=0.24.0', 20 | 'pyarrow==0.13.0', 21 | 'pyspark>=2.4.0', 22 | 'python-Levenshtein>=0.12.0'], 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | ], 28 | ) 29 | --------------------------------------------------------------------------------