├── .gitignore ├── LICENSE ├── README.md ├── pyspark_utilities ├── __init__.py ├── dimension_utilities │ ├── __init__.py │ └── dim_datetime_utilities.py ├── pandas_udfs │ ├── __init__.py │ ├── datetime_udfs.py │ ├── datetime_udfs_base_functions.py │ ├── fuzzy_match_udfs.py │ ├── general_udfs.py │ ├── general_udfs_base_functions.py │ └── general_udfs_base_functions_test.py ├── spark_udfs │ ├── __init__.py │ └── spark_udfs.py └── spark_utilities │ ├── __init__.py │ └── spark_utilities.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyspark-utilities 2 | ETL-focused utilities library for PySpark 3 | 4 | ## Package Contents 5 | * `spark_utilities` - general PySpark utilities to develop and run Spark applications 6 | * `pandas_udfs` - Spark UDFs written using the [__Pandas UDF__](https://docs.databricks.com/spark/latest/spark-sql/udf-python-pandas.html) feature [added in Spark 2.3](https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html) 7 | * `spark_udfs` - Python class containing Spark UDFs written in Scala, accessed via JAR passed to SparkContext at initialization 8 | * `dimension_utilities` - functions to generate dimension tables as Spark DataFrames 9 | 10 | ## Setup 11 | 1. Install library 12 | ```bash 13 | pip install git+https://github.com/zaksamalik/pyspark-utilities 14 | ``` 15 | 2. Follow instructions in [__spark-etl-utilities__](https://github.com/zaksamalik/spark-etl-utilities) 16 | repo to build `spark-etl-utilities` JAR 17 | * __Note__: steps 2 & 3 are optional and __only__ required in order to use the Spark UDFs written in Scala 18 | 3. Load resulting JAR file in Spark session (example in Spark Utilities --> Methods section) 19 | 20 | ## Spark Utilities 21 | _General PySpark utilities to develop and run Spark applications._ 22 | ### Methods 23 | __General__ 24 | * `start_spark` - instantiate SparkSession 25 | * arguments: 26 | * `config` (_SparkConf_) - SparkConf() with set parameters (default = `SparkConf()`). 27 | * `app_name` (_str_) - name of Spark application (default = None). 28 | * `env` (_str_) - where Spark application is running (default = 'cluster'). Known values: `local`, `cluster`. 29 | * `enable_hive` (_bool_) - if `True`: adds Hive support via `enableHiveSupport()` (default = False). 30 | * `source_aws_credentials_file` (_bool_) - whether to source AWS credentials file (default = False). 31 | * `aws_profile` (_str_) - name of profile to use for interacting with AWS services (default = None). 32 | Only used if `env` is `local`. 33 | * example usage 34 | ```py 35 | from os.path import expanduser 36 | from pyspark import SparkConf 37 | from pyspark_utilities.spark_utilities import start_spark 38 | 39 | config = (SparkConf().setAll([ 40 | ('spark.driver.extraClassPath', expanduser('/path/to/jars/*')), 41 | ('spark.executor.extraClassPath', expanduser('/path/to/jars/*')) 42 | ])) 43 | spark = start_spark(config=config, app_name='example_app', env='local') 44 | ``` 45 | 46 | ## Pandas UDFs 47 | _User-defined functions written using the Pandas UDF feature added in Spark 2.3._ 48 | 49 | A good introduction to Pandas UDFs can be found [here](https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html), but in short: Pandas UDFs are vectorized and use Apache Arrow to transfer data from Spark to Pandas and back, delivering much faster performance than one-row-at-a-time Python UDFs, which are notorious 50 | bottlenecks in PySpark application development. 51 | 52 | __Note__: all Pandas UDFs in this library start with a `pd_` prefix.
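To make the batching concrete: a scalar Pandas UDF receives and returns whole `pandas.Series` objects (one Arrow batch at a time) rather than individual values. Below is a minimal sketch of the pattern this library follows — it is not part of the package itself, and `df` plus the `messy_text` column are placeholders:

```py
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
from pyspark.sql.types import StringType


@pandas_udf(StringType(), PandasUDFType.SCALAR)
def pd_upper_case(target_col):
    # `target_col` arrives as a pandas.Series holding a whole batch of rows
    return target_col.str.upper()


df_upper = df.withColumn('upper_text', pd_upper_case(col('messy_text')))
```

Because each call processes a full batch of rows in Pandas, the per-row Python overhead of traditional UDFs is largely avoided.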
53 | 54 | ### Methods 55 | * __General UDFs__ 56 | * `pd_clean_string` - remove ISO control characters from, and trim, input string column. 57 | * returns: _StringType_ 58 | * `pd_empty_string_to_null` - check if input values in string column are empty, and return null if so. 59 | * returns: _StringType_ 60 | * `pd_generate_uuid` - generate UUID v4. 61 | * returns: _StringType_ 62 | * Pandas UDFs require at least one input column, so a column must be passed to this function even though no operations will actually be applied to that column. 63 | * __Important note__: because Spark is lazily evaluated, UUID values can change each time the plan is re-evaluated (e.g. when the DataFrame is written, collected, or converted to a Pandas DataFrame). Do not rely on this UUID as a stable key throughout a Spark application. 64 | * `pd_map_booleans_ynu` - map boolean values to `Y`, `N`, `Unknown`. 65 | * returns: _StringType_ 66 | * `pd_string_to_double_pfd` - convert string column to double where PERIOD represents DECIMAL place. 67 | * returns: _DoubleType_ 68 | * `pd_string_to_double_cfd` - convert string column to double where COMMA represents DECIMAL place. 69 | * returns: _DoubleType_ 70 | * `pd_string_is_number` - check whether values in string column can be converted to numbers. 71 | * returns: _BooleanType_ 72 | * __Datetime UDFs__ 73 | * `pd_is_holiday_usa` - check whether values in date column are US holidays (from [__holidays__](https://pypi.org/project/holidays/) package). 74 | * returns: _StringType_ (`Y`, `N`, `Unknown`) 75 | * `pd_normalize_date_md` - convert column with dates as strings to dates 76 | (MONTH before DAY). 77 | * returns: _DateType_ 78 | * `pd_normalize_date_dm` - convert column with dates as strings to dates 79 | (DAY before MONTH). 80 | * returns: _DateType_ 81 | * `pd_normalize_timestamp_md` - convert column with timestamps as strings to 82 | timestamps (MONTH before DAY). 83 | * returns: _TimestampType_ 84 | * `pd_normalize_timestamp_dm` - convert column with timestamps as strings to timestamps (DAY before MONTH).
85 | * returns: _TimestampType_ 86 | * example usage 87 | ```py 88 | from pyspark_utilities.pandas_udfs import pd_clean_string, pd_generate_uuid 89 | from pyspark.sql.functions import col, lit 90 | df_clean = (df 91 | .withColumn('clean_text', pd_clean_string(col('messy_text'))) 92 | .withColumn('uuid', pd_generate_uuid(lit('')))) 93 | ``` 94 | 95 | * __Fuzzy String Matching UDFs__ (methods from [__fuzzywuzzy__](https://github.com/seatgeek/fuzzywuzzy) and [__jellyfish__](https://github.com/jamesturk/jellyfish) packages) 96 | * `pd_fuzz_ratio` - simple ratio (`fuzz.ratio`) 97 | * returns: _IntegerType_ 98 | * `pd_fuzz_partial_ratio` - partial ratio (`fuzz.partial_ratio`) 99 | * returns: _IntegerType_ 100 | * `pd_fuzz_token_set_ratio` - token set ratio (`fuzz.token_set_ratio`) 101 | * returns: _IntegerType_ 102 | * `pd_fuzz_partial_token_set_ratio` - partial token set ratio (`fuzz.partial_token_set_ratio`) 103 | * returns: _IntegerType_ 104 | * `pd_fuzz_token_sort_ratio` - token sort ratio (`fuzz.token_sort_ratio`) 105 | * returns: _IntegerType_ 106 | * `pd_fuzz_partial_token_sort_ratio` - partial token sort ratio (`fuzz.partial_token_sort_ratio`) 107 | * returns: _IntegerType_ 108 | * `pd_damerau_levenshtein_distance` - Damerau-Levenshtein distance (`jellyfish.damerau_levenshtein_distance`) 109 | * returns: _IntegerType_ 110 | * `pd_hamming_distance` - Hamming distance (`jellyfish.hamming_distance`) 111 | * returns: _IntegerType_ 112 | * `pd_jaro_distance` - Jaro distance (`jellyfish.jaro_distance`) 113 | * returns: _DoubleType_ 114 | * `pd_jaro_winkler` - Jaro-Winkler (`jellyfish.jaro_winkler`) 115 | * returns: _DoubleType_ 116 | * `pd_match_rating_codex` - match rating codex (`jellyfish.match_rating_codex`) 117 | * returns: _StringType_ 118 | * `pd_match_rating_comparison` - match rating comparison (`jellyfish.match_rating_comparison`) 119 | * returns: _BooleanType_ 120 | * `pd_metaphone` - metaphone (`jellyfish.metaphone`) 121 | * returns: _StringType_ 122 | * `pd_nysiis` - NYSIIS (`jellyfish.nysiis`) 123 | * returns: _StringType_ 124 | * `pd_porter_stem` - Porter stem (`jellyfish.porter_stem`) 125 | * returns: _StringType_ 126 | 127 | 128 | * example usage 129 | ```py 130 | from pyspark.sql.functions import col 131 | from pyspark_utilities.pandas_udfs import pd_clean_string, pd_fuzz_partial_ratio 132 | 133 | df_fuzzy_ratio = (df 134 | .withColumn('clean_text1', pd_clean_string(col('messy_text'))) 135 | .withColumn('clean_text2', pd_clean_string(col('messy_text'))) 136 | .withColumn('partial_fuzzy_ratio', pd_fuzz_partial_ratio(col('clean_text1'), col('clean_text2')))) 137 | ``` 138 | ## Spark UDFs (Scala) 139 | _Spark UDFs written in Scala exposed to Python._ 140 | 141 | __Important Note__: all Scala Spark UDF functionality also exists as Pandas UDFs. 142 | Because they are Scala-native, Spark UDFs should be more performant than Pandas UDFs (in most cases), but they require an external JAR. For pure Python ETL implementations, use Pandas UDFs instead. All other functionality in this package works fine without the Spark UDFs JAR.
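To use these UDFs, the `spark-etl-utilities` JAR must be on the driver and executor classpaths when the SparkSession is created — the same mechanism shown in the Spark Utilities example above, pointed at the built JAR. A sketch (the JAR path is a placeholder):

```py
from pyspark import SparkConf
from pyspark_utilities.spark_utilities import start_spark

config = (SparkConf().setAll([
    ('spark.driver.extraClassPath', '/path/to/spark-etl-utilities.jar'),
    ('spark.executor.extraClassPath', '/path/to/spark-etl-utilities.jar')
]))
spark = start_spark(config=config, app_name='scala_udfs_example', env='local')
```

With the session created this way, the example in the next section can wrap it with `SparkUDFs`.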
143 | 144 | ### Example 145 | ```py 146 | from pyspark.sql.functions import col 147 | from pyspark_utilities.spark_udfs import SparkUDFs 148 | # `spark` = instantiated SparkSession 149 | udfs = SparkUDFs(spark) 150 | # apply UDF 151 | df_with_uuid = (df 152 | .withColumn('uuid', udfs.generate_uuid()) 153 | .withColumn('clean_string', udfs.clean_string(col('messy_text')))) 154 | ``` 155 | ### Methods 156 | * __General UDFs__ 157 | * `clean_string` - remove Java ISO control characters from, and trim, string 158 | * returns: _string_ (nullable) 159 | * `empty_string_to_null` - convert empty strings to null values 160 | * returns: _string_ (nullable) 161 | * `generate_uuid` - generate V4 UUID 162 | * returns: _string_ 163 | * `map_booleans_ynu` - map boolean values to `Y`, `N`, `Unknown` 164 | * returns: _string_ 165 | * `string_to_double_pfd` - convert string to double (where `.` represents decimal place) 166 | * returns: _double_ (nullable) 167 | * `string_to_double_cfd` - convert string to double (where `,` represents decimal place) 168 | * returns: _double_ (nullable) 169 | * `string_is_number` - validate whether passed string could be converted to a number. 170 | * returns: _boolean_ 171 | * __Datetime UDFs__ 172 | * `normalize_date_md` - normalize string to date with MONTH before DAY 173 | * returns: _date_ (nullable) 174 | * `normalize_date_dm` - normalize string to date with DAY before MONTH 175 | * returns: _date_ (nullable) 176 | * `normalize_timestamp_md` - normalize string to timestamp with MONTH before DAY 177 | * returns: _timestamp_ (nullable) 178 | * `normalize_timestamp_dm` - normalize string to timestamp with DAY before MONTH 179 | * returns: _timestamp_ (nullable) 180 | 181 | ## Dimension Utilities 182 | _Functions to generate dimension ("dim") tables as Spark DataFrames._ 183 | ### Methods 184 | __Datetime__ 185 | * `generate_dim_date` - generate Spark DataFrame with various date dimensions 186 | * arguments: 187 | * `spark` - instantiated SparkSession 188 | * `start_year` - starting (minimum) year for dim_date table 189 | * `number_years_out_from_start` - number of years out from `start_year` to generate 190 | * example usage 191 | ```py 192 | from pyspark_utilities.spark_utilities import start_spark 193 | from pyspark_utilities.dimension_utilities import generate_dim_date 194 | 195 | spark = start_spark(env='local') 196 | dim_date_df = generate_dim_date(spark=spark, start_year=1901, number_years_out_from_start=300) 197 | ``` 198 | -------------------------------------------------------------------------------- /pyspark_utilities/__init__.py: -------------------------------------------------------------------------------- 1 | name = "pyspark-utilities" 2 | -------------------------------------------------------------------------------- /pyspark_utilities/dimension_utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from .dim_datetime_utilities import generate_dim_date 2 | -------------------------------------------------------------------------------- /pyspark_utilities/dimension_utilities/dim_datetime_utilities.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pyspark.sql.functions import (col, concat, date_format, datediff, dayofmonth, dayofweek, dayofyear, expr, 3 | last_day, lit, lpad, quarter, regexp_replace, to_date, weekofyear, when) 4 | from ..pandas_udfs.datetime_udfs import pd_is_holiday_usa 5 | 6 | 7 | def generate_dim_date(spark, 
start_year=1901, number_years_out_from_start=300): 8 | """Create `dim_date` table containing various date feature columns. 9 | 10 | Args: 11 | spark (SparkSession): Instantiated SparkSession 12 | start_year (int): starting year for dim_date table. 13 | number_years_out_from_start (int): number out from `start_year` to increment. 14 | 15 | Returns: 16 | Spark DataFrame. 17 | """ 18 | years = [start_year + i for i in range(number_years_out_from_start + 1)] 19 | months = [i for i in range(1, 13)] 20 | days = [i for i in range(1, 32)] 21 | 22 | years_df = spark.createDataFrame(pd.DataFrame({'year': years, 'temp_join_key': '1'})) 23 | months_df = spark.createDataFrame(pd.DataFrame({'month': months, 'temp_join_key': '1'})) 24 | days_df = spark.createDataFrame(pd.DataFrame({'day_of_month': days, 'temp_join_key': '1'})) 25 | 26 | years_months_df = (years_df 27 | .join(months_df, 28 | ['temp_join_key'], 29 | how='inner')) 30 | 31 | years_month_days_df = (years_months_df 32 | .join(days_df, 33 | ['temp_join_key'], 34 | how='inner')) 35 | 36 | date_keys = (years_month_days_df 37 | .withColumn('date', to_date(concat(col('year'), 38 | lpad(col('month'), 2, '0'), 39 | lpad(col('day_of_month'), 2, '0')), 'yyyyMMdd')) 40 | # remove invalid dates 41 | .filter("date IS NOT NULL") 42 | .withColumn('date_key', regexp_replace(col('date').cast('string'), '-', '').cast('integer'))) 43 | 44 | date_features = (date_keys 45 | # get `week` and `quarter` 46 | .withColumn('week', weekofyear(col('date'))) 47 | .withColumn('quarter', quarter(col('date'))) 48 | # get `day_name` and `month_name` 49 | .withColumn('day_name', date_format(col('date'), 'EEEE')) 50 | .withColumn('month_name', date_format(col('date'), 'MMMM')) 51 | # get `date_year`, `date_quarter`, `date_month`, `date_week` 52 | .withColumn('date_week', expr("MIN(date) OVER(PARTITION BY week, year)")) 53 | .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01')) 54 | .withColumn('date_quarter', expr("MIN(date) OVER(PARTITION BY quarter, year)")) 55 | .withColumn('date_year', date_format(col('date'), 'yyyy-01-01')) 56 | # get `day_of_week`, `day_of_quarter`, `day_of_year` 57 | .withColumn('day_of_week', dayofweek(col('date'))) 58 | .withColumn('day_of_quarter', datediff(col('date'), col('date_quarter')) + lit(1)) 59 | .withColumn('day_of_year', dayofyear(col('date'))) 60 | # get `weekend_flag`, `us_holiday_flag`, `business_day_flag`, `leap_year_flag`, 61 | # `month_start_flag`, `month_end_flag` 62 | .withColumn('weekend_flag', when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N')) 63 | .withColumn('us_holiday_flag', pd_is_holiday_usa(col('date').cast('timestamp'))) 64 | .withColumn('us_biz_day_flag', when((col('weekend_flag') == lit('Y')) | 65 | (col('us_holiday_flag') == lit('Y')), 'Y').otherwise('N')) 66 | .withColumn('leap_year_flag', 67 | when(dayofmonth(last_day(concat(col('year'), lit('-02-01')).cast('date'))) == 29, 'Y') 68 | .otherwise('N')) 69 | .withColumn('month_start_flag', when(col('day_of_month') == lit(1), 'Y').otherwise('N')) 70 | .withColumn('month_end_flag', when(col('date') == last_day(col('date')), 'Y').otherwise('N')) 71 | # get `pct_into_month`, `pct_into_quarter`, `pct_into_year` 72 | .withColumn('pct_into_month', 73 | (col('day_of_month') / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)')) 74 | .withColumn('date_quarter_end', 75 | when(col('quarter') == lit(1), concat(col('year'), lit('-03-31'))) 76 | .when(col('quarter') == lit(2), concat(col('year'), lit('-06-30'))) 77 | .when(col('quarter') == lit(3), 
concat(col('year'), lit('-09-30'))) 78 | .when(col('quarter') == lit(4), concat(col('year'), lit('-12-31'))) 79 | .otherwise(None) 80 | .cast('date')) 81 | .withColumn('days_in_quarter', datediff(col('date_quarter_end'), col('date_quarter')) + lit(1)) 82 | .withColumn('pct_into_quarter', 83 | (col('day_of_quarter') / col('days_in_quarter')).cast('decimal(7, 6)')) 84 | .withColumn('pct_into_year', 85 | (col('day_of_year') / when(col('leap_year_flag') == lit('Y'), 366.0).otherwise(365.0)) 86 | .cast('decimal(7, 6)')) 87 | # get seasons 88 | .withColumn('season_northern', 89 | when(col('month').isin(12, 1, 2), 'Winter') 90 | .when(col('month').isin(3, 4, 5), 'Spring') 91 | .when(col('month').isin(6, 7, 8), 'Summer') 92 | .when(col('month').isin(9, 10, 11), 'Fall') 93 | .otherwise('UNKNOWN')) 94 | .withColumn('season_southern', 95 | when(col('month').isin(6, 7, 8), 'Winter') 96 | .when(col('month').isin(9, 10, 11), 'Spring') 97 | .when(col('month').isin(12, 1, 2), 'Summer') 98 | .when(col('month').isin(3, 4, 5), 'Fall') 99 | .otherwise('UNKNOWN'))) 100 | 101 | dim_date = (date_features 102 | .sort('date') 103 | .select(['date_key', 104 | 'date', 105 | 'date_week', 106 | 'date_month', 107 | 'date_quarter', 108 | 'date_year', 109 | 'day_of_week', 110 | 'day_of_month', 111 | 'day_of_quarter', 112 | 'day_of_year', 113 | 'week', 114 | 'month', 115 | 'quarter', 116 | 'year', 117 | 'days_in_quarter', 118 | 'day_name', 119 | 'month_name', 120 | 'season_northern', 121 | 'season_southern', 122 | 'weekend_flag', 123 | 'us_holiday_flag', 124 | 'us_biz_day_flag', 125 | 'month_start_flag', 126 | 'month_end_flag', 127 | 'leap_year_flag', 128 | 'pct_into_month', 129 | 'pct_into_quarter', 130 | 'pct_into_year'])) 131 | return dim_date 132 | 133 | 134 | # TODO: `generate_dim_time` (seconds) 135 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/__init__.py: -------------------------------------------------------------------------------- 1 | from .datetime_udfs import (pd_is_holiday_usa, pd_normalize_date_md, pd_normalize_date_dm, pd_normalize_timestamp_md, 2 | pd_normalize_timestamp_dm) 3 | from .fuzzy_match_udfs import (pd_fuzz_ratio, pd_fuzz_partial_ratio, pd_fuzz_token_set_ratio, 4 | pd_fuzz_partial_token_set_ratio, pd_fuzz_token_sort_ratio, 5 | pd_fuzz_partial_token_sort_ratio, pd_damerau_levenshtein_distance, pd_hamming_distance, 6 | pd_jaro_distance, pd_jaro_winkler, pd_match_rating_codex, pd_match_rating_comparison, 7 | pd_metaphone, pd_nysiis, pd_porter_stem) 8 | from .general_udfs import (pd_clean_string, pd_empty_string_to_null, pd_map_booleans_ynu, pd_generate_uuid, 9 | pd_string_to_double_pfd, pd_string_to_double_cfd) 10 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/datetime_udfs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pyspark.sql.functions import pandas_udf, PandasUDFType 3 | from pyspark.sql.types import DateType, StringType, TimestampType 4 | from .datetime_udfs_base_functions import is_holiday_usa, to_datetime_md, to_datetime_dm 5 | 6 | 7 | # noinspection PyArgumentList 8 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 9 | def pd_is_holiday_usa(target_col): 10 | """ Apply `is_holiday_usa` to Spark column. 11 | 12 | Args: 13 | target_col (Spark Column): containing dates or timestamps to check holiday status. 14 | 15 | Returns: 16 | Spark Column (StringType): `Y`, `N`, or `Unknown`. 
17 | """ 18 | return pd.Series(target_col.apply(lambda x: is_holiday_usa(x))) 19 | 20 | 21 | # noinspection PyArgumentList 22 | @pandas_udf(DateType(), PandasUDFType.SCALAR) 23 | def pd_normalize_date_md(target_col): 24 | """ Convert column with dates as strings to dates (MONTH BEFORE DAY). 25 | 26 | Args: 27 | target_col (Spark Column): containing dates as strings. 28 | 29 | Returns: 30 | Spark Column (DateType): containing dates extracted from strings. 31 | """ 32 | return pd.Series(target_col.apply(lambda x: to_datetime_md(x))) 33 | 34 | 35 | # noinspection PyArgumentList 36 | @pandas_udf(DateType(), PandasUDFType.SCALAR) 37 | def pd_normalize_date_dm(target_col): 38 | """ Convert column with dates as strings to dates (DAY BEFORE MONTH). 39 | 40 | Args: 41 | target_col (Spark Column): containing dates as strings. 42 | 43 | Returns: 44 | Spark Column (DateType): containing dates extracted from strings. 45 | """ 46 | return pd.Series(target_col.apply(lambda x: to_datetime_dm(x))) 47 | 48 | 49 | # noinspection PyArgumentList 50 | @pandas_udf(TimestampType(), PandasUDFType.SCALAR) 51 | def pd_normalize_timestamp_md(target_col): 52 | """ Convert column with timestamps as strings to timestamps (MONTH BEFORE DAY). 53 | 54 | Args: 55 | target_col (Spark Column): containing dates as strings. 56 | 57 | Returns: 58 | Spark Column (TimestampType): containing dates extracted from strings. 59 | """ 60 | return pd.Series(target_col.apply(lambda x: to_datetime_md(x))) 61 | 62 | 63 | # noinspection PyArgumentList 64 | @pandas_udf(TimestampType(), PandasUDFType.SCALAR) 65 | def pd_normalize_timestamp_dm(target_col): 66 | """ Convert column with timestamps as strings to timestamps (DAY BEFORE MONTH). 67 | 68 | Args: 69 | target_col (Spark Column): containing dates as strings. 70 | 71 | Returns: 72 | Spark Column (TimestampType): containing dates extracted from strings. 73 | """ 74 | return pd.Series(target_col.apply(lambda x: to_datetime_dm(x))) 75 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/datetime_udfs_base_functions.py: -------------------------------------------------------------------------------- 1 | import holidays 2 | import pandas as pd 3 | from .general_udfs_base_functions import clean_string 4 | 5 | 6 | def is_holiday_usa(dt): 7 | """ Check whether a given date is a US holiday. 8 | 9 | Args: 10 | dt (Date, Timestamp): date to check for holiday status. 11 | 12 | Returns: 13 | str: `Y`, `N`, or `Unknown` 14 | """ 15 | if dt is None: 16 | return 'Unknown' 17 | elif pd.to_datetime(dt) in holidays.US(): 18 | return 'Y' 19 | else: 20 | return 'N' 21 | 22 | 23 | def to_datetime_md(dt_str): 24 | """ Apply `pd.to_datetime` with inferring datetime and null handling (MONTH comes BEFORE DAY). 25 | 26 | Args: 27 | dt_str (str): target str to parse to datetime. 28 | 29 | Returns: 30 | Timestamp: parsed from string. 31 | """ 32 | if clean_string(dt_str) is None: 33 | return None 34 | else: 35 | return pd.to_datetime(dt_str, infer_datetime_format=True, dayfirst=False) 36 | 37 | 38 | def to_datetime_dm(dt_str): 39 | """ Apply `pd.to_datetime` with inferring datetime and null handling (DAY comes BEFORE MONTH). 40 | 41 | Args: 42 | dt_str (str): target str to parse to datetime. 43 | 44 | Returns: 45 | Timestamp: parsed from string. 
46 | """ 47 | if clean_string(dt_str) is None: 48 | return None 49 | else: 50 | return pd.to_datetime(dt_str, infer_datetime_format=True, dayfirst=True) 51 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/fuzzy_match_udfs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from fuzzywuzzy import fuzz 3 | import jellyfish 4 | from pyspark.sql.functions import pandas_udf, PandasUDFType 5 | from pyspark.sql.types import BooleanType, DoubleType, IntegerType, StringType 6 | 7 | 8 | # noinspection PyArgumentList 9 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 10 | def pd_fuzz_ratio(col1, col2): 11 | """ Calculate "simple" ratio (`fuzz.ratio`) between two text columns. 12 | 13 | Args: 14 | col1 (Spark Column): 1st text column 15 | col2 (Spark Column): 2nd text column 16 | 17 | Returns: 18 | Spark Column (IntegerType): result of `fuzz.ratio` calculation. 19 | """ 20 | return pd.Series(map(fuzz.ratio, col1.astype(str), col2.astype(str))) 21 | 22 | 23 | # noinspection PyArgumentList 24 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 25 | def pd_fuzz_partial_ratio(col1, col2): 26 | """ Calculate "partial" ratio (`fuzz.partial_ratio`) between two text columns. 27 | 28 | Args: 29 | col1 (Spark Column): 1st text column 30 | col2 (Spark Column): 2nd text column 31 | 32 | Returns: 33 | Spark Column (IntegerType): result of `fuzz.partial_ratio` calculation. 34 | """ 35 | return pd.Series(map(fuzz.partial_ratio, col1.astype(str), col2.astype(str))) 36 | 37 | 38 | # noinspection PyArgumentList 39 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 40 | def pd_fuzz_token_set_ratio(col1, col2): 41 | """ Calculate "token set" ratio (`fuzz.token_set_ratio`) between two text columns. 42 | 43 | Args: 44 | col1 (Spark Column): 1st text column 45 | col2 (Spark Column): 2nd text column 46 | 47 | Returns: 48 | Spark Column (IntegerType): result of `fuzz.token_set_ratio` calculation. 49 | """ 50 | return pd.Series(map(fuzz.token_set_ratio, col1.astype(str), col2.astype(str))) 51 | 52 | 53 | # noinspection PyArgumentList 54 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 55 | def pd_fuzz_partial_token_set_ratio(col1, col2): 56 | """ Calculate "partial token set" ratio (`fuzz.partial_token_set_ratio`) between two text columns. 57 | 58 | Args: 59 | col1 (Spark Column): 1st text column 60 | col2 (Spark Column): 2nd text column 61 | 62 | Returns: 63 | Spark Column (IntegerType): result of `fuzz.partial_token_set_ratio` calculation. 64 | """ 65 | return pd.Series(map(fuzz.partial_token_set_ratio, col1.astype(str), col2.astype(str))) 66 | 67 | 68 | # noinspection PyArgumentList 69 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 70 | def pd_fuzz_token_sort_ratio(col1, col2): 71 | """ Calculate "token sort" ratio (`fuzz.token_sort_ratio`) between two text columns. 72 | 73 | Args: 74 | col1 (Spark Column): 1st text column 75 | col2 (Spark Column): 2nd text column 76 | 77 | Returns: 78 | Spark Column (IntegerType): result of `fuzz.token_sort_ratio` calculation. 79 | """ 80 | return pd.Series(map(fuzz.token_sort_ratio, col1.astype(str), col2.astype(str))) 81 | 82 | 83 | # noinspection PyArgumentList 84 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 85 | def pd_fuzz_partial_token_sort_ratio(col1, col2): 86 | """ Calculate "partial token sort" ratio (`fuzz.partial_token_sort_ratio`) between two text columns. 
87 | 88 | Args: 89 | col1 (Spark Column): 1st text column 90 | col2 (Spark Column): 2nd text column 91 | 92 | Returns: 93 | Spark Column (IntegerType): result of `fuzz.partial_token_sort_ratio` calculation. 94 | """ 95 | return pd.Series(map(fuzz.partial_token_sort_ratio, col1.astype(str), col2.astype(str))) 96 | 97 | # TODO: `process` function 98 | 99 | 100 | # noinspection PyArgumentList 101 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 102 | def pd_damerau_levenshtein_distance(col1, col2): 103 | """ Calculate Damerau Levenshtein distance between two text columns. 104 | 105 | Args: 106 | col1 (Spark Column): 1st text column 107 | col2 (Spark Column): 2nd text column 108 | 109 | Returns: 110 | Spark Column (IntegerType): with Damerau Levenshtein distances. 111 | """ 112 | return pd.Series(map(jellyfish.damerau_levenshtein_distance, col1.astype(str), col2.astype(str))) 113 | 114 | 115 | # jellyfish.hamming_distance 116 | # noinspection PyArgumentList 117 | @pandas_udf(IntegerType(), PandasUDFType.SCALAR) 118 | def pd_hamming_distance(col1, col2): 119 | """ Calculate hamming distance between two text columns. 120 | 121 | Args: 122 | col1 (Spark Column): 1st text column 123 | col2 (Spark Column): 2nd text column 124 | 125 | Returns: 126 | Spark Column (IntegerType): with hamming distances. 127 | """ 128 | return pd.Series(map(jellyfish.hamming_distance, col1.astype(str), col2.astype(str))) 129 | 130 | 131 | # noinspection PyArgumentList 132 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 133 | def pd_jaro_distance(col1, col2): 134 | """ Calculate `jaro_distance` between two text columns. 135 | 136 | Args: 137 | col1 (Spark Column): 1st text column 138 | col2 (Spark Column): 2nd text column 139 | 140 | Returns: 141 | Spark Column (DoubleType): with Jaro distances. 142 | """ 143 | return pd.Series(map(jellyfish.jaro_distance, col1.astype(str), col2.astype(str))) 144 | 145 | 146 | # noinspection PyArgumentList 147 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 148 | def pd_jaro_winkler(col1, col2): 149 | """ Calculate `jellyfish.jaro_winkler` between two text columns. 150 | 151 | Args: 152 | col1 (Spark Column): 1st text column 153 | col2 (Spark Column): 2nd text column 154 | 155 | Returns: 156 | Spark Column (DoubleType): with Jaro Winkler scores. 157 | """ 158 | return pd.Series(map(jellyfish.jaro_winkler, col1.astype(str), col2.astype(str))) 159 | 160 | 161 | # noinspection PyArgumentList 162 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 163 | def pd_match_rating_codex(target_col): 164 | """ Apply `jellyfish.match_rating_codex` to text column. 165 | 166 | Args: 167 | target_col (Spark Column): text column. 168 | 169 | Returns: 170 | Spark Column (StringType): with match rating codex. 171 | """ 172 | return pd.Series(target_col.apply(lambda x: jellyfish.match_rating_codex(str(x)))) 173 | 174 | 175 | # noinspection PyArgumentList 176 | @pandas_udf(BooleanType(), PandasUDFType.SCALAR) 177 | def pd_match_rating_comparison(col1, col2): 178 | """ Calculate `jellyfish.match_rating_comparison` between two text columns. 179 | 180 | Args: 181 | col1 (Spark Column): 1st text column 182 | col2 (Spark Column): 2nd text column 183 | 184 | Returns: 185 | Spark Column (BooleanType, nullable): True / False / None matching results. 
186 | """ 187 | return pd.Series(map(jellyfish.match_rating_comparison, col1.astype(str), col2.astype(str))) 188 | 189 | 190 | # noinspection PyArgumentList 191 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 192 | def pd_metaphone(target_col): 193 | """ Apply `jellyfish.metaphone` to text column. 194 | 195 | Args: 196 | target_col (Spark Column): text column. 197 | 198 | Returns: 199 | Spark Column (StringType): metaphone encodings. 200 | """ 201 | return pd.Series(target_col.apply(lambda x: jellyfish.metaphone(str(x)))) 202 | 203 | 204 | # noinspection PyArgumentList 205 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 206 | def pd_nysiis(target_col): 207 | """ Apply `jellyfish.nysiis` to text column. 208 | 209 | Args: 210 | target_col (Spark Column): text column. 211 | 212 | Returns: 213 | Spark Column (StringType): NYSIIS encodings. 214 | """ 215 | return pd.Series(target_col.apply(lambda x: jellyfish.nysiis(str(x)))) 216 | 217 | 218 | # noinspection PyArgumentList 219 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 220 | def pd_porter_stem(target_col): 221 | """ Apply `jellyfish.porter_stem` to text column. 222 | 223 | Args: 224 | target_col (Spark Column): text column. 225 | 226 | Returns: 227 | Spark Column (StringType): porter stems. 228 | """ 229 | return pd.Series(target_col.apply(lambda x: jellyfish.porter_stem(str(x)))) 230 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/general_udfs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from uuid import uuid4 3 | from pyspark.sql.functions import pandas_udf, PandasUDFType 4 | from pyspark.sql.types import DoubleType, StringType 5 | from .general_udfs_base_functions import (clean_string, empty_string_to_null, map_booleans_ynu, 6 | string_to_double_pfd, string_to_double_cfd) 7 | 8 | 9 | # noinspection PyArgumentList 10 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 11 | def pd_clean_string(target_col): 12 | """ Apply `clean_string` over Spark Column. 13 | 14 | Args: 15 | target_col (Spark Column): containing strings to clean. 16 | 17 | Returns: 18 | Spark Column (StringType): cleaned version of input strings. 19 | """ 20 | return pd.Series(target_col.apply(lambda x: clean_string(x))) 21 | 22 | 23 | # noinspection PyArgumentList 24 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 25 | def pd_empty_string_to_null(target_col): 26 | """ Apply `empty_string_to_null` to Spark Column. 27 | 28 | Args: 29 | target_col (Spark Column): containing strings to convert empties --> nulls 30 | 31 | Returns: 32 | Spark Column: where empty strings replaced with nulls. 33 | """ 34 | return pd.Series(target_col.apply(lambda x: empty_string_to_null(x))) 35 | 36 | 37 | # noinspection PyArgumentList 38 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 39 | def pd_generate_uuid(target_col): 40 | """ Generate UUID v4. 41 | 42 | Args: 43 | target_col (Spark Column): any column, not actually used... Pandas UDFs require input column. 44 | 45 | Returns: 46 | Spark Column (StringType): UUID v4. 47 | """ 48 | return pd.Series(target_col.apply(lambda x: uuid4().__str__())) 49 | 50 | 51 | # noinspection PyArgumentList 52 | @pandas_udf(StringType(), PandasUDFType.SCALAR) 53 | def pd_map_booleans_ynu(target_col): 54 | """ Apply `map_booleans_ynu` over Spark Column. 55 | 56 | Args: 57 | target_col (Spark Column): containing values to check if they represent booleans / indicators. 
58 | 59 | Returns: 60 | Spark Column (StringType): `Y`, `N`, `Unknown`. 61 | """ 62 | return pd.Series(target_col.apply(lambda x: map_booleans_ynu(x))) 63 | 64 | 65 | # noinspection PyArgumentList 66 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 67 | def pd_string_to_double_pfd(target_col): 68 | """ Apply `string_to_double_pfd` to Spark Column (PERIOD for DECIMAL place). 69 | 70 | Args: 71 | target_col (Spark Column): containing double values as strings. 72 | 73 | Returns: 74 | Spark Column (DoubleType): doubles converted from strings. 75 | """ 76 | return pd.Series(target_col.apply(lambda x: string_to_double_pfd(x))) 77 | 78 | 79 | # noinspection PyArgumentList 80 | @pandas_udf(DoubleType(), PandasUDFType.SCALAR) 81 | def pd_string_to_double_cfd(target_col): 82 | """ Apply `string_to_double_cfd` to Spark Column (COMMA for DECIMAL place). 83 | 84 | Args: 85 | target_col (Spark Column): containing double values as strings. 86 | 87 | Returns: 88 | Spark Column (DoubleType): doubles converted from strings. 89 | """ 90 | return pd.Series(target_col.apply(lambda x: string_to_double_cfd(x))) 91 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/general_udfs_base_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def clean_string(target_str): 5 | """ Remove ISO control characters and trim input string. Returns None if cleaned string is empty. 6 | 7 | Args: 8 | target_str (str): string to be cleaned. 9 | 10 | Returns: 11 | str: cleaned input string. 12 | """ 13 | if target_str is None: 14 | return None 15 | else: 16 | string_clean = re.sub(r'[\x00-\x1F]+', '', target_str).strip() 17 | if string_clean == '': 18 | return None 19 | else: 20 | return string_clean 21 | 22 | 23 | def empty_string_to_null(target_str): 24 | """ Check if input string is empty, and return null if so (otherwise return input string). 25 | 26 | Args: 27 | target_str (str): string to check for emptiness. 28 | 29 | Returns: 30 | str: null if input string is empty else input string. 31 | """ 32 | if target_str is None: 33 | return None 34 | elif re.sub(r'[\x00-\x1F]+', '', target_str).strip() == '': 35 | return None 36 | else: 37 | return target_str 38 | 39 | 40 | def map_booleans_ynu(target_val): 41 | """ Map boolean values to `Y`, `N`, `Unknown`. 42 | 43 | Args: 44 | target_val (any): value to check if it represents a boolean / indicator. 45 | 46 | Returns: 47 | str: `Y`, `N`, `Unknown` 48 | """ 49 | if target_val in [False, 0, '0', 'f', 'F', 'false', 'False', 'FALSE', 'n', 'N', 'no', 'No', 'NO']: 50 | return 'N' 51 | elif target_val in [True, 1, '1', 't', 'T', 'true', 'True', 'TRUE', 'y', 'Y', 'yes', 'Yes', 'YES']: 52 | return 'Y' 53 | else: 54 | return 'Unknown' 55 | 56 | 57 | def string_to_double_pfd(target_str): 58 | return string_to_float(target_str, comma_for_decimal=False) 59 | 60 | 61 | def string_to_double_cfd(target_str): 62 | return string_to_float(target_str, comma_for_decimal=True) 63 | 64 | 65 | def string_to_float(target_str, comma_for_decimal=False): 66 | """ Convert string to float. 67 | 68 | Args: 69 | target_str (str): target str to convert to double. 70 | comma_for_decimal (bool): whether commas represent decimal in passed string. 71 | 72 | Returns: 73 | float: converted from input string. 
74 | """ 75 | if not string_is_number(target_str): 76 | return None 77 | else: 78 | if comma_for_decimal: 79 | string_clean = re.sub(',', '.', re.sub('[^0-9,-]', '', target_str.strip())) 80 | else: 81 | string_clean = re.sub('[^0-9.-]', '', target_str.strip()) 82 | number_match = extract_number_from_string(string_clean) 83 | if re.match('\\(.*\\)', target_str): 84 | return number_match * -1.0 85 | else: 86 | return number_match 87 | 88 | 89 | def extract_number_from_string(target_str): 90 | """Extract number from string. 91 | 92 | Args: 93 | target_str (str): containing number in string format. 94 | 95 | Returns: 96 | float: parsed from string. 97 | """ 98 | number_pattern = '(\\-?[0-9]+(\\.[0-9]+)?)' 99 | matches = re.search(number_pattern, target_str) 100 | if matches: 101 | return float(matches.group(0)) 102 | else: 103 | raise ValueError(f"ERROR: Bad number passing. Could not parse {target_str}.") 104 | 105 | 106 | def string_is_number(target_str): 107 | """ Check whether passed string can accurately be converted to a number. 108 | 109 | Args: 110 | target_str (str): string to validate if parsable to number. 111 | 112 | Returns: 113 | bool 114 | """ 115 | if target_str is None: 116 | return False 117 | else: 118 | return bool(re.fullmatch('^\\d+$', re.sub('[^0-9]', '', target_str))) 119 | -------------------------------------------------------------------------------- /pyspark_utilities/pandas_udfs/general_udfs_base_functions_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from .general_udfs_base_functions import (clean_string, empty_string_to_null, map_booleans_ynu, 3 | string_to_double_pfd, string_to_double_cfd) 4 | 5 | 6 | class TestGeneralUDFBaseFunctions(unittest.TestCase): 7 | 8 | def test_clean_string(self): 9 | self.assertIsNone(clean_string('\u0000')) 10 | self.assertIsNone(clean_string(None)) 11 | self.assertIsNone(clean_string('')) 12 | self.assertEqual(clean_string('abc '), 'abc') 13 | self.assertEqual(clean_string('abc 123\u0000'), 'abc 123') 14 | 15 | def test_empty_string_to_null(self): 16 | self.assertIsNone(empty_string_to_null('\u0000')) 17 | # noinspection PyTypeChecker 18 | self.assertIsNone(empty_string_to_null(None)) 19 | self.assertIsNone(empty_string_to_null('')) 20 | self.assertEqual(empty_string_to_null('abc '), 'abc ') 21 | self.assertEqual(empty_string_to_null('abc 123\u0000'), 'abc 123\u0000') 22 | 23 | def test_map_booleans_ynu(self): 24 | # `N` 25 | self.assertEqual(map_booleans_ynu(False), 'N') 26 | self.assertEqual(map_booleans_ynu(0), 'N') 27 | self.assertEqual(map_booleans_ynu('0'), 'N') 28 | self.assertEqual(map_booleans_ynu('f'), 'N') 29 | self.assertEqual(map_booleans_ynu('F'), 'N') 30 | self.assertEqual(map_booleans_ynu('false'), 'N') 31 | self.assertEqual(map_booleans_ynu('False'), 'N') 32 | self.assertEqual(map_booleans_ynu('FALSE'), 'N') 33 | self.assertEqual(map_booleans_ynu('n'), 'N') 34 | self.assertEqual(map_booleans_ynu('N'), 'N') 35 | self.assertEqual(map_booleans_ynu('no'), 'N') 36 | self.assertEqual(map_booleans_ynu('No'), 'N') 37 | self.assertEqual(map_booleans_ynu('NO'), 'N') 38 | # `Y` 39 | self.assertEqual(map_booleans_ynu(True), 'Y') 40 | self.assertEqual(map_booleans_ynu(1), 'Y') 41 | self.assertEqual(map_booleans_ynu('1'), 'Y') 42 | self.assertEqual(map_booleans_ynu('t'), 'Y') 43 | self.assertEqual(map_booleans_ynu('T'), 'Y') 44 | self.assertEqual(map_booleans_ynu('true'), 'Y') 45 | self.assertEqual(map_booleans_ynu('True'), 'Y') 46 | 
self.assertEqual(map_booleans_ynu('TRUE'), 'Y') 47 | self.assertEqual(map_booleans_ynu('y'), 'Y') 48 | self.assertEqual(map_booleans_ynu('Y'), 'Y') 49 | self.assertEqual(map_booleans_ynu('yes'), 'Y') 50 | self.assertEqual(map_booleans_ynu('Yes'), 'Y') 51 | self.assertEqual(map_booleans_ynu('YES'), 'Y') 52 | # `Unknown` 53 | self.assertEqual(map_booleans_ynu(''), 'Unknown') 54 | self.assertEqual(map_booleans_ynu(' '), 'Unknown') 55 | self.assertEqual(map_booleans_ynu(3), 'Unknown') 56 | self.assertEqual(map_booleans_ynu(3.0), 'Unknown') 57 | self.assertEqual(map_booleans_ynu(None), 'Unknown') 58 | self.assertEqual(map_booleans_ynu('foo'), 'Unknown') 59 | self.assertEqual(map_booleans_ynu('BAR'), 'Unknown') 60 | 61 | def test_string_to_double(self): 62 | self.assertEqual(string_to_double_pfd("100"), 100.00) 63 | self.assertEqual(string_to_double_pfd("100"), 100.00) 64 | self.assertEqual(string_to_double_pfd("-100"), -100.00) 65 | self.assertEqual(string_to_double_pfd("(100)"), -100.00) 66 | self.assertEqual(string_to_double_pfd("$100"), 100.00) 67 | self.assertEqual(string_to_double_pfd("-$100"), -100.00) 68 | self.assertEqual(string_to_double_pfd("($100)"), -100.00) 69 | self.assertEqual(string_to_double_pfd("100%"), 100.00) 70 | self.assertEqual(string_to_double_pfd("-100%"), -100.00) 71 | self.assertEqual(string_to_double_pfd("(100%)"), -100.00) 72 | self.assertEqual(string_to_double_pfd("100.00"), 100.00) 73 | self.assertEqual(string_to_double_pfd("-100.00"), -100.00) 74 | self.assertEqual(string_to_double_pfd("(100.00)"), -100.00) 75 | self.assertEqual(string_to_double_pfd("$100.00"), 100.00) 76 | self.assertEqual(string_to_double_pfd("-$100.00"), -100.00) 77 | self.assertEqual(string_to_double_pfd("($100.00)"), -100.00) 78 | self.assertEqual(string_to_double_pfd("100.00%"), 100.00) 79 | self.assertEqual(string_to_double_pfd("-100.00%"), -100.00) 80 | self.assertEqual(string_to_double_pfd("(100.00%)"), -100.00) 81 | # 82 | self.assertEqual(string_to_double_pfd("100 Apples"), 100.00) 83 | self.assertEqual(string_to_double_pfd("$3.14/lbs."), 3.14) 84 | # 85 | self.assertEqual(string_to_double_cfd("4 294 967 295,000"), 4294967295.00) 86 | self.assertEqual(string_to_double_cfd("4 294 967.295,000"), 4294967295.00) 87 | self.assertEqual(string_to_double_cfd("4.294.967.295,000"), 4294967295.00) 88 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_udfs/__init__.py: -------------------------------------------------------------------------------- 1 | from .spark_udfs import SparkUDFs 2 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_udfs/spark_udfs.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import Column, SparkSession 2 | # noinspection PyUnresolvedReferences, PyProtectedMember 3 | from pyspark.sql.column import _to_seq, _to_java_column 4 | 5 | 6 | class SparkUDFs: 7 | def __init__(self, spark): 8 | """ 9 | 10 | Args: 11 | spark (SparkSession): instantiated SparkSession. 12 | """ 13 | self.spark = spark 14 | 15 | def clean_string(self, target_col): 16 | """ Remove Java ISO control characters from, and trim, string. 17 | 18 | Args: 19 | target_col (Spark Column): target column to be cleaned. 20 | 21 | Returns: 22 | Spark Column (StringType): cleaned version of input column. 
23 | """ 24 | sc = self.spark.sparkContext 25 | # noinspection PyUnresolvedReferences, PyProtectedMember 26 | _clean_string = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.cleanString_UDF() 27 | return Column(_clean_string.apply(_to_seq(sc, [target_col], _to_java_column))) 28 | 29 | def empty_string_to_null(self, target_col): 30 | """ Convert empty strings to nulls. 31 | 32 | Args: 33 | target_col (Spark Column): target column to convert. 34 | 35 | Returns: 36 | Spark Column (StringType): target column with empty values converted to nulls. 37 | """ 38 | sc = self.spark.sparkContext 39 | # noinspection PyUnresolvedReferences, PyProtectedMember 40 | _empty_string_to_null = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.emptyStringToNull_UDF() 41 | return Column(_empty_string_to_null.apply(_to_seq(sc, [target_col], _to_java_column))) 42 | 43 | def generate_uuid(self): 44 | """ Generate V4 UUID. 45 | 46 | Returns: 47 | Spark Column (StringType): containing v4 UUIDs. 48 | """ 49 | sc = self.spark.sparkContext 50 | # noinspection PyUnresolvedReferences, PyProtectedMember 51 | _generate_uuid = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.generateUUID_UDF() 52 | return Column(_generate_uuid.apply(_to_seq(sc, [], _to_java_column))) 53 | 54 | def map_booleans_ynu(self, target_col): 55 | """ Map boolean values to `Y`, `N`, `Unknown` 56 | 57 | Args: 58 | target_col (Spark Column): target column containing boolean values to map. 59 | 60 | Returns: 61 | Spark Column (StringType): mapped values (`Y`, `N`, `Unknown`) 62 | """ 63 | sc = self.spark.sparkContext 64 | # noinspection PyUnresolvedReferences, PyProtectedMember 65 | _map_booleans_ynu = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.mapBooleansYNU_UDF() 66 | return Column(_map_booleans_ynu.apply(_to_seq(sc, [target_col], _to_java_column))) 67 | 68 | def string_to_double_pfd(self, target_col): 69 | """ Convert string to doubles where period represents decimal places (`pfd`). 70 | 71 | Args: 72 | target_col (Spark Column): containing double values in string format. 73 | 74 | Returns: 75 | Spark Column (DoubleType): containing double values converted from strings. 76 | """ 77 | sc = self.spark.sparkContext 78 | # noinspection PyUnresolvedReferences, PyProtectedMember 79 | _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoublePeriodForDecimal_UDF() 80 | return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column))) 81 | 82 | def string_to_double_cfd(self, target_col): 83 | """ Convert string to doubles where commas represents decimal places (`cfd`). 84 | 85 | Args: 86 | target_col (Spark Column): containing double values in string format. 87 | 88 | Returns: 89 | Spark Column (DoubleType): containing double values converted from strings. 90 | """ 91 | sc = self.spark.sparkContext 92 | # noinspection PyUnresolvedReferences, PyProtectedMember 93 | _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoubleCommaForDecimal_UDF() 94 | return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column))) 95 | 96 | def string_is_number(self, target_col): 97 | """ Return boolean if string can be converted to a number. 98 | 99 | Args: 100 | target_col (Spark Column): containing string to check for convertability to number. 101 | 102 | Returns: 103 | Spark Column (BooleanType): whether string can converted to a number. 
104 | """ 105 | sc = self.spark.sparkContext 106 | # noinspection PyUnresolvedReferences, PyProtectedMember 107 | _string_is_number = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringIsNumber_UDF() 108 | return Column(_string_is_number.apply(_to_seq(sc, [target_col], _to_java_column))) 109 | 110 | def normalize_date_md(self, target_col): 111 | """ Convert string to date where MONTH is BEFORE DAY. 112 | 113 | Args: 114 | target_col (Spark Column): containing strings representing dates. 115 | 116 | Returns: 117 | Spark Column (DateType): containing dates converted from strings. 118 | """ 119 | sc = self.spark.sparkContext 120 | # noinspection PyUnresolvedReferences, PyProtectedMember 121 | _normalize_date_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateMD_UDF() 122 | return Column(_normalize_date_md.apply(_to_seq(sc, [target_col], _to_java_column))) 123 | 124 | def normalize_date_dm(self, target_col): 125 | """ Convert string to date where DAY is BEFORE MONTH. 126 | 127 | Args: 128 | target_col (Spark Column): containing strings representing dates. 129 | 130 | Returns: 131 | Spark Column (DateType): containing dates converted from strings. 132 | """ 133 | sc = self.spark.sparkContext 134 | # noinspection PyUnresolvedReferences, PyProtectedMember 135 | _normalize_date_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateDM_UDF() 136 | return Column(_normalize_date_dm.apply(_to_seq(sc, [target_col], _to_java_column))) 137 | 138 | def normalize_timestamp_md(self, target_col): 139 | """ Convert string to timestamp where MONTH is BEFORE DAY. 140 | 141 | Args: 142 | target_col (Spark Column): containing strings representing timestamps. 143 | 144 | Returns: 145 | Spark Column (TimestampType): containing timestamps converted from strings. 146 | """ 147 | sc = self.spark.sparkContext 148 | # noinspection PyUnresolvedReferences, PyProtectedMember 149 | _normalize_timestamp_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampMD_UDF() 150 | return Column(_normalize_timestamp_md.apply(_to_seq(sc, [target_col], _to_java_column))) 151 | 152 | def normalize_timestamp_dm(self, target_col): 153 | """ Convert string to timestamp where DAY is BEFORE MONTH. 154 | 155 | Args: 156 | target_col (Spark Column): containing strings representing timestamps. 157 | 158 | Returns: 159 | Spark Column (TimestampType): containing timestamps converted from strings. 
160 | """ 161 | sc = self.spark.sparkContext 162 | # noinspection PyUnresolvedReferences, PyProtectedMember 163 | _normalize_timestamp_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampDM_UDF() 164 | return Column(_normalize_timestamp_dm.apply(_to_seq(sc, [target_col], _to_java_column))) 165 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from .spark_utilities import start_spark 2 | -------------------------------------------------------------------------------- /pyspark_utilities/spark_utilities/spark_utilities.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from os.path import expanduser 3 | from warnings import warn 4 | 5 | from pyspark import SparkConf 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | def start_spark(config=SparkConf(), 10 | app_name=None, 11 | env='cluster', 12 | enable_hive=False, 13 | source_aws_credentials_file=False, 14 | aws_profile=None): 15 | """Instantiate SparkSession. 16 | 17 | Args: 18 | config (SparkConf): SparkConf() with set parameters (optional). 19 | app_name (str): Name of Spark application (optional). 20 | env (str): Where Spark application is running (required). Known values: `local`, `cluster`. 21 | enable_hive (bool): If `True`: adds Hive support via `enableHiveSupport()` 22 | source_aws_credentials_file (bool): Whether to source AWS credentials file. 23 | aws_profile (str): Name of profile to use for interacting with AWS services. Only used if `env` is `local`. 24 | 25 | Returns: 26 | Instantiated SparkSession. 27 | """ 28 | # validate inputs 29 | ok_envs = ['local', 'cluster'] 30 | ok_envs_str = ', '.join(['`' + e + '`' for e in ok_envs]) 31 | assert env in ok_envs, f'Invalid value passed to `env` argument: `{env}`. Acceptable values: {ok_envs_str}.' 32 | 33 | # start SparkSession builder 34 | if env == 'local': 35 | if app_name is None: 36 | app_name = 'some_app' 37 | session_builder = (SparkSession 38 | .builder 39 | .master('local') 40 | .config(conf=config) 41 | .appName(app_name)) 42 | else: 43 | session_builder = SparkSession.builder.config(conf=config) 44 | 45 | # enable Hive support 46 | if enable_hive: 47 | session_builder = session_builder.enableHiveSupport() 48 | 49 | # instantiate SparkSession 50 | spark = session_builder.getOrCreate() 51 | 52 | # get credentials for AWS profile when running Spark locally 53 | if source_aws_credentials_file: 54 | if aws_profile is None: 55 | warn("`aws_profile` is None with `source_aws_credentials_file` set to true. 
Using `default` AWS profile.") 56 | aws_profile = 'default' 57 | cfp = configparser.ConfigParser() 58 | cfp.read(expanduser("~/.aws/credentials")) 59 | access_id = cfp.get(aws_profile, "aws_access_key_id") 60 | access_key = cfp.get(aws_profile, "aws_secret_access_key") 61 | # noinspection PyProtectedMember, PyUnresolvedReferences 62 | hadoop_conf = spark._jsc.hadoopConfiguration() 63 | hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 64 | hadoop_conf.set("fs.s3a.access.key", access_id) 65 | hadoop_conf.set("fs.s3a.secret.key", access_key) 66 | 67 | return spark 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | holidays>=0.9.10 2 | fuzzywuzzy>=0.17.0 3 | jellyfish>=0.7.2 4 | numpy==1.14.5 5 | pandas>=0.24.0 6 | pyarrow==0.13.0 7 | pyspark>=2.4.0 8 | python-Levenshtein>=0.12.0 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="pyspark-utilities", 8 | version="0.0.1", 9 | author="Sam Zakalik", 10 | description="ETL focused utilities library for PySpark", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | url="https://github.com/zaksamalik/pyspark-utilities", 14 | packages=setuptools.find_packages(), 15 | install_requires=['holidays>=0.9.10', 16 | 'fuzzywuzzy>=0.17.0', 17 | 'jellyfish>=0.7.2', 18 | 'numpy==1.14.5', 19 | 'pandas>=0.24.0', 20 | 'pyarrow==0.13.0', 21 | 'pyspark>=2.4.0', 22 | 'python-Levenshtein>=0.12.0'], 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | ], 28 | ) 29 | --------------------------------------------------------------------------------