├── .gitignore
├── Makefile
├── README.md
├── config
│   ├── example_modeler_app_config.yaml
│   └── example_scorer_app_config.yaml
├── environment.yml
├── src
│   ├── jobs
│   │   ├── __init__.py
│   │   ├── prophet_modeler.py
│   │   └── prophet_scorer.py
│   ├── modeler_spark_driver.py
│   └── scorer_spark_driver.py
└── tests
    ├── fixtures
    │   └── model-input
    │       └── series_id=751
    │           └── sample-model-input.csv
    └── unit
        ├── prophet_modeler_test.py
        └── prophet_scorer_test.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
.pytest_cache/

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# DotEnv configuration
.env

# Database
*.db
*.rdb

# Pycharm/IntelliJ
*.iml
.idea

# VS Code
.vscode/

# Spyder
.spyproject/

# Jupyter NB Checkpoints
.ipynb_checkpoints/

# Mac OS-specific storage files
.DS_Store

# Gradle
.gradle

# Spark
spark-warehouse

# exclude dirs from source control by default
/data/
/output/
/models/
/out/
/target/
/build

# exclude Excel temp files
~$*.xlsx

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

clean: ## Clean build dir
	rm -rf ./build
	rm -rf ./.pytest_cache
	find . -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete

build: clean ## Build project
	mkdir -p ./build/dist
	cp ./src/*spark_driver.py ./build/dist
	cd ./src && zip -x *spark_driver.py -x \*__pycache__\* -r ../build/dist/app.zip .

test: build ## Run tests
	pytest tests/unit

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Time Series Forecasting with FB Prophet and Apache Spark
========================================================

# Use Case
If you have a time series you would like to forecast, [Facebook's Prophet](https://facebook.github.io/prophet/docs/quick_start.html#python-api)
library is fantastic. It robustly handles seasonality, missing data, and trends, and it
trains and scores quickly.

But what if you have a large number of different time series you need to forecast?
With the help of [Apache Spark](https://spark.apache.org/) for large-scale analytics processing,
you can train and predict many time series at once and scale the processing
horizontally by resizing the Spark cluster.

[PySpark](https://spark.apache.org/docs/latest/api/python/index.html) is needed in order to use [Pandas user defined functions (UDFs)](https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs),
which convert between Spark dataframes and Pandas dataframes
with the help of [Apache Arrow](https://arrow.apache.org/).

If the historical time series are segmented by different dimensions, grouping
by those dimensions allows a separate time series model to be trained per group.

# Example data
In `tests/fixtures/model-input`, the example data set has two non-temporal
dimensions. `series_id` happens to be a partition column, and within the CSV
`dim_id` is another dimension. The timestamp and quantity correspond to the
time and y values for the time series. The schema for the CSV is defined in
`src/jobs/prophet_modeler.py`.

# Modeling
When Spark reads in the input data and a grouping clause is
applied on the non-temporal dimensions, each group's time series is
handed to the defined UDF as a Pandas dataframe. Using FB Prophet,
a model is trained on the historical data, and the model itself is pickled
and returned in a new dataframe. Spark collects all of the models per set
of dimensions and persists them for scoring.
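
At the heart of both jobs is a grouped-map Pandas UDF. The following is a minimal,
illustrative sketch of the modeling side (the two-column output schema and the
`input_df` with `series_id`/`ds`/`y` columns are assumptions for the example, not
the project's exact code; see `src/jobs/prophet_modeler.py` for the real version):

<pre>
import pickle

import pandas as pd
from fbprophet import Prophet
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import BinaryType, IntegerType, StructField, StructType

# The output schema of a grouped-map UDF must be declared up front
schema = StructType([StructField("series_id", IntegerType()),
                     StructField("model", BinaryType())])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def train_udf(pdf):
    # pdf holds one group's rows as a plain Pandas dataframe
    model = Prophet().fit(pdf[["ds", "y"]])
    return pd.DataFrame({"series_id": [int(pdf.iloc[0]["series_id"])],
                         "model": [pickle.dumps(model)]})

# input_df: a Spark dataframe with columns series_id, ds, y (assumed)
# One trained, pickled model per distinct series_id
model_df = input_df.groupby("series_id").apply(train_udf)
</pre>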

# Scoring
The models are read back in as a Spark dataframe. As in modeling, a Pandas UDF
is used to take each trained model and create forecast predictions based
on the given configuration. These are returned and collected in a Spark
dataframe with forecast predictions per set of dimensions.

# Developer Notes
This project uses conda and PySpark, so some extra configuration is needed.

## Conda environment
The `environment.yml` file defines what is used for the Python code. To use
this in a Spark cluster, you should bootstrap the conda installation onto each
worker node or use a machine image with the conda environment preconfigured.

## Spark
Pandas UDFs require binary serialization between Spark and Python, so Spark 2.4+
is required along with pyarrow > 0.10.

### PYSPARK_PYTHON
This environment variable should point at the Python executable of the conda env:
`export PYSPARK_PYTHON=/path/to/conda-env/bin/python`

### PYTHONPATH
Be sure to define the env var PYTHONPATH to include py4j and pyspark:
`export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-xxx-src.zip:$PYTHONPATH`

Also add the `src` directory of the project to PYTHONPATH in order to run the unit tests.
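
Putting the pieces together, a typical local setup might look like this (the
paths and the py4j version are placeholders to adjust for your machine):

<pre>
conda env create -f environment.yml
conda activate time-series-spark
export SPARK_HOME=/path/to/spark
export PYSPARK_PYTHON=$(which python)
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-xxx-src.zip:$(pwd)/src:$PYTHONPATH
</pre>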

## Unit testing
To run the tests, set PYTHONPATH as above, activate the conda environment, and run
`make test`.

## Build
To build the archive, `make build` will create app.zip containing the Python modules in the
`jobs` subdirectory. The Spark driver for PySpark has to be in a separate file outside the zip archive
(see `src/modeler_spark_driver.py` and `src/scorer_spark_driver.py`).

The app.zip and the drivers end up in `build/dist/`.

## App Config
The app functionality has been split into `prophet_modeler`, which trains and persists the Prophet models, and
`prophet_scorer`, which makes forecasts from the models.

There needs to be a minimum number of observations in a time series in order for the modeler to work properly.

The modeler application configuration is a YAML file (`config/*_modeler_app_config.yaml`) that needs to specify the following:

<pre>
io:
  input: [input data location]
  models: [model output location]
model:
  floor: [min value for the forecast values]
  cap_multiplier: [multiplier over the max prior values for the logistic Prophet model]
</pre>

The scorer application configuration is a YAML file (`config/*_scorer_app_config.yaml`) that needs to specify the following:

<pre>
io:
  models: [model location]
  forecasts: [output location for forecasts in CSV]
forecast:
  periods: [number of periods to forecast]
  frequency: [frequency of forecasts (use 15min for quarter hour, for example)]
</pre>
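
For example, after `make build`, both jobs can be run end to end in local mode
(an illustrative invocation; adjust the master and paths as needed):

<pre>
spark-submit --master "local[*]" --py-files build/dist/app.zip \
  build/dist/modeler_spark_driver.py config/example_modeler_app_config.yaml

spark-submit --master "local[*]" --py-files build/dist/app.zip \
  build/dist/scorer_spark_driver.py config/example_scorer_app_config.yaml
</pre>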

## Spark Cluster Config
The Spark cluster should not be configured for max resource allocation; the
executor settings are tuned manually instead (see Spark Submit Config below). Only the
YARN virtual memory check should be disabled, with the following config:
<pre>
{
  "Classification": "yarn-site",
  "Properties":
  {
    "yarn.nodemanager.vmem-check-enabled": "false"
  }
}
</pre>
This needs to be disabled because YARN may otherwise decide that a container exceeds its
virtual memory limit and kill the container.
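
The JSON above is in the classification format used by Amazon EMR. Assuming EMR is
the YARN cluster in use (an assumption; other YARN distributions set this property in
`yarn-site.xml` instead), it can be supplied at cluster creation, for example (file
name illustrative):

<pre>
aws emr create-cluster ... --configurations file://yarn-config.json
</pre>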

## Spark Submit Config
Since most of the processing is done by the Prophet modeling itself, the max resource
allocation and dynamic allocation that would typically be used for a
Spark application are not applicable here. The Spark app configuration needs to be manually tuned:

* spark.dynamicAllocation.enabled should be set to false since we are not using dynamic allocation
* spark.executor.instances should be set to the number of CORE nodes in the cluster
* spark.executor.cores should be set to the number of vCPUs in a node minus 1 for the driver
* spark.sql.shuffle.partitions should be set to the number of executor cores * the number of executor instances
* spark.speculation should be true to relaunch any task that takes too long compared to the other tasks
* spark.speculation.multiplier set to 2: a task twice as slow as the median is considered for speculation
* spark.speculation.quantile set to 0.90: the fraction of tasks that must be complete before speculation is considered

About 70 to 80% of each node's memory can be used for Spark + Python.

* spark.executor.memory is the heap memory for the Spark app in each executor
* spark.executor.memoryOverhead is the off-heap memory not used by Spark (Python in this case)
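
As a worked example, assume CORE nodes with 16 vCPUs and roughly 30 GB of memory
available to YARN (illustrative numbers, chosen to be consistent with the submit
example below):

<pre>
spark.executor.cores          = 16 - 1 = 15   (leave one vCPU for the driver)
spark.executor.memory         = 8g            (Spark JVM heap)
spark.executor.memoryOverhead = 14g           (Python: Prophet/pystan processes)

8g + 14g = 22g per node, i.e. about 73% of the 30 GB, within the 70-80% guideline
</pre>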

Example spark submit:
<pre>
spark-submit \
  --deploy-mode client \
  --master yarn \
  --conf spark.dynamicAllocation.enabled=false \
  --conf spark.sql.shuffle.partitions=2000 \
  --conf spark.executor.instances=30 \
  --conf spark.executor.cores=15 \
  --conf spark.executor.memory=8g \
  --conf spark.executor.memoryOverhead=14g \
  --conf spark.speculation=true \
  --conf spark.speculation.multiplier=2 \
  --conf spark.speculation.quantile=0.90 \
  --py-files /path/to/app.zip \
  /path/to/spark_driver.py \
  /path/to/app_config.yaml
</pre>

--------------------------------------------------------------------------------
/config/example_modeler_app_config.yaml:
--------------------------------------------------------------------------------
io:
  input: "/path/to/model-input-data"
  models: "/path/to/write-models"
model:
  floor: 0
  cap_multiplier: 1.1

--------------------------------------------------------------------------------
/config/example_scorer_app_config.yaml:
--------------------------------------------------------------------------------
io:
  models: "/path/to/read-models"
  forecasts: "/path/to/write-forecasts"
forecast:
  periods: 960 # 10 days
  frequency: 15min

--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
name: time-series-spark

dependencies:
  - python=3.7.5
  - pytest=5.3.1
  - pandas=0.25.3
  - pyyaml=5.1.2
  - pyarrow=0.13.0
  - pyspark=2.4.4
  - pip=19.3.1
  - pip:
      - pystan==2.19.1.1
      - fbprophet==0.5

--------------------------------------------------------------------------------
/src/jobs/__init__.py:
--------------------------------------------------------------------------------
(empty file)

--------------------------------------------------------------------------------
/src/jobs/prophet_modeler.py:
--------------------------------------------------------------------------------
import logging
import pickle
import time

import pandas as pd
from fbprophet import Prophet
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import BinaryType, FloatType, StructType, StructField, IntegerType, TimestampType

# Modify the input schema as needed if using CSV
MODEL_INPUT_SCHEMA = StructType([
    StructField("series_id", IntegerType(), True),
    StructField("dim_id", IntegerType(), True),
    StructField("start_time", TimestampType(), True),
    StructField("quantity", IntegerType(), True)
])
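
# Note (illustrative): rows in the sample fixture CSV carry only three values, e.g.
#   91,2001-01-05 11:15:00,36445  ->  dim_id, start_time, quantity
# series_id does not appear in the file itself; Spark derives it from the
# partition directory name (tests/fixtures/model-input/series_id=751/).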


# Use function currying to set the parameters.
# Print statements are needed to log since Python can't interface to the Spark logger easily.
def model_time_series(config):
    """
    Model a time series per dimensions (series_id, dim_id).
    Be sure there is sufficient data to model each time series, otherwise the model may not converge.

    :param config:
    :return:
    """

    # Pandas UDF requires the output pandas dataframe schema to be defined
    output_schema = StructType([
        StructField('series_id', IntegerType(), True),
        StructField('dim_id', IntegerType(), True),
        StructField('floor', FloatType(), True),
        StructField('cap', FloatType(), True),
        StructField('model', BinaryType(), True)
    ])

    @pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
    def model_time_series_udf(pdf):
        """
        User defined function applied to each grouped Spark sub-dataframe after conversion to pandas.
        Input/output are both a pandas.DataFrame. This cannot be an instance method.
        Be sure the order of columns and types match the defined output schema!

        :param pdf: Input pandas dataframe
        :return: Output pandas dataframe
        """
        try:
            execution_time = time.time()
            series_id = int(pdf.iloc[0]['series_id'])
            dim_id = int(pdf.iloc[0]['dim_id'])

            floor = config['model']['floor']
            pdf['floor'] = floor

            cap = pdf['y'].max() * config['model']['cap_multiplier']
            pdf['cap'] = cap

            print(f"Modeling series_id: {series_id}, dim_id: {dim_id}"
                  f" with {len(pdf.index)} modeling rows")

            model = Prophet(growth='logistic', seasonality_mode='multiplicative')
            model.fit(pdf)

            # Save the trained model by pickling it into the output dataframe
            data = {'series_id': [series_id],
                    'dim_id': [dim_id],
                    'floor': [floor],
                    'cap': [cap],
                    'model': [pickle.dumps(model)]}

            output_df = pd.DataFrame(data)
            print(f"Output df series_id: {series_id}, dim_id: {dim_id}"
                  f" trained in {time.time() - execution_time:.1f}s")

            return output_df

        # Prophet raises ValueError when a series has too few non-NaN rows to fit
        except (RuntimeError, ValueError) as err:
            print(f"Error {err} for series_id: {series_id}, "
                  f"dim_id: {dim_id}")
            return pd.DataFrame(
                columns=['series_id', 'dim_id', 'floor', 'cap', 'model'])

    return model_time_series_udf


class ProphetModeler:
    """
    Create models to forecast quantities using the Facebook Prophet model.

    Each time series has its own model. The collection of trained models is stored
    in a Spark dataframe for easy export/import using Spark.
    """

    def __init__(self, config, logger=None):
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        self.config = config

    def read_input_dataframe(self, spark: SparkSession):
        """
        Reads the modeling input data.
        :param spark: spark session
        :return: dataframe with input data
        """

        # Prophet expects the time column to be named 'ds' and the value column 'y'
        input_df = spark \
            .read \
            .csv(self.config['io']['input'], schema=MODEL_INPUT_SCHEMA) \
            .select('series_id', 'dim_id', 'start_time', 'quantity') \
            .withColumnRenamed("start_time", "ds") \
            .withColumnRenamed("quantity", "y")

        return input_df

    def persist_models(self, model_df: DataFrame):
        """
        Persist the models from the Spark dataframe as Parquet for easy export.
        :param model_df: Spark dataframe with trained time series models
        """
        model_df \
            .write \
            .parquet(self.config['io']['models'], mode='overwrite')

    @staticmethod
    def model(spark_session, config):
        """
        Create the trained time series models.
        :param spark_session:
        :param config: Dict of config
        """
        spark_session.conf.set("spark.sql.execution.arrow.enabled",
                               "true")  # needed to convert to/from Pandas dataframes
        modeler = ProphetModeler(config)
        input_df = modeler.read_input_dataframe(spark_session)

        model_df = input_df \
            .groupby('series_id', 'dim_id') \
            .apply(model_time_series(modeler.config))

        modeler.persist_models(model_df)

--------------------------------------------------------------------------------
/src/jobs/prophet_scorer.py:
--------------------------------------------------------------------------------
import logging
import pickle
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit, pandas_udf, PandasUDFType, udf
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType


# UDFs need to be standalone functions.

# Use function currying to set the parameters.
# Logging within Python doesn't work on Spark executors, so print statements are used;
# they show up in stderr on the individual workers.

def forecast_time_series(config):
    """
    Forecast using the trained time series model per dimensions (series_id, dim_id).

    :param config:
    :return:
    """

    # Pandas UDF requires the output pandas dataframe schema to be defined
    output_schema = StructType([
        StructField('series_id', IntegerType(), True),
        StructField('dim_id', IntegerType(), True),
        StructField('ds', TimestampType(), True),
        StructField('yhat', IntegerType(), True)
    ])

    @pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
    def forecast_time_series_udf(pdf):
        """
        User defined function applied to each grouped Spark sub-dataframe after conversion to pandas.
        Input/output are both a pandas.DataFrame. This cannot be an instance method.
        Be sure the order of columns and types match the defined output schema!
        :param pdf: Input pandas dataframe
        :return: Output pandas dataframe
        """
        try:
            series_id = int(pdf.iloc[0]['series_id'])
            dim_id = int(pdf.iloc[0]['dim_id'])
            floor = float(pdf.iloc[0]['floor'])
            cap = float(pdf.iloc[0]['cap'])
            model = pickle.loads(pdf.iloc[0]['model'])

            # If the model is missing, return an empty dataframe for the forecast
            if model is None:
                print(f"For series_id: {series_id}, "
                      f"dim_id: {dim_id},"
                      f" no model found")
                return pd.DataFrame(columns=['series_id', 'dim_id', 'ds', 'yhat'])

            frequency = config['forecast']['frequency']

            # We want the weeks to be offset on the appropriate weekday;
            # using 'W' for pandas date_range would anchor them to Sundays
            if frequency == 'W':
                frequency = pd.offsets.Week()
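
            # For example (illustrative), starting from Wednesday 2015-07-01:
            #   pd.date_range('2015-07-01', periods=2, freq='W')
            #       -> 2015-07-05, 2015-07-12 (snapped to Sundays)
            #   pd.date_range('2015-07-01', periods=2, freq=pd.offsets.Week())
            #       -> 2015-07-01, 2015-07-08 (stays on Wednesdays)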

            future_df = model.make_future_dataframe(periods=config['forecast']['periods'],
                                                    freq=frequency,
                                                    include_history=False)
            future_df['floor'] = floor
            future_df['cap'] = cap

            forecast_df = model.predict(future_df)

            # Be sure to cast yhat to integer since yhat is a float
            forecast_df = forecast_df.astype({"yhat": int})

            # Log below-floor values, if any
            negatives = np.where(forecast_df["yhat"] < floor)
            if len(negatives[0]) > 0:
                print(f"Below-floor forecast values found for series_id: {series_id}, "
                      f"dim_id: {dim_id}")

            # Clip forecast values that are less than the floor
            forecast_df["yhat"] = np.where(forecast_df["yhat"] < floor,
                                           floor,
                                           forecast_df["yhat"])

            # Copy so the added columns don't mutate a slice of forecast_df
            output_df = forecast_df[['ds', 'yhat']].copy()
            output_df['series_id'] = series_id
            output_df['dim_id'] = dim_id

            output_df = output_df[['series_id', 'dim_id', 'ds', 'yhat']]
            print(f"series_id: {series_id}; dim_id: {dim_id}; "
                  f"floor: {floor}; cap: {cap}; "
                  f"future min: {future_df['ds'].min()}; future max: {future_df['ds'].max()}; "
                  f"forecast min: {forecast_df['ds'].min()}; forecast max: {forecast_df['ds'].max()}; "
                  f"output min: {output_df['ds'].min()}; output max: {output_df['ds'].max()};")

            return output_df

        except RuntimeError as err:
            print(f"Runtime error {err} for series_id: {series_id}, "
                  f"dim_id: {dim_id}")
            return pd.DataFrame(columns=['series_id', 'dim_id', 'ds', 'yhat'])

    return forecast_time_series_udf


def extract_date(datetimestamp: datetime):
    return datetimestamp.date().strftime("%Y-%m-%d")


extract_date_udf = udf(extract_date)


class ProphetScorer:
    """
    Forecast quantities using the trained Facebook Prophet models.
    """

    def __init__(self, config, logger=None):
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        self.config = config

    def read_model_dataframe(self, spark: SparkSession):
        model_df = spark \
            .read \
            .parquet(self.config['io']['models'])

        return model_df

    @staticmethod
    def convert_forecasts(forecast_df: DataFrame):
        created_timestamp = datetime.now(timezone.utc) \
            .replace(microsecond=0) \
            .isoformat()

        return forecast_df \
            .withColumn("created_timestamp", lit(created_timestamp)) \
            .select("created_timestamp",
                    "series_id",
                    "dim_id",
                    extract_date_udf("ds").alias("forecast_date"),  # include a date without the time
                    "ds",
                    "yhat") \
            .withColumnRenamed("ds", "forecast_timestamp") \
            .withColumnRenamed("yhat", "forecast_quantity")
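
    # Illustrative converted row, borrowing values from tests/unit/prophet_scorer_test.py
    # (created_timestamp is simply whatever UTC time the job ran):
    #   created_timestamp         series_id dim_id forecast_date forecast_timestamp  forecast_quantity
    #   2020-01-01T00:00:00+00:00 101       66     2015-07-05    2015-07-05 10:15:00 873242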
117 | """ 118 | 119 | def __init__(self, config, logger=None): 120 | self.logger = logger or logging.getLogger(self.__class__.__name__) 121 | self.config = config 122 | 123 | def read_model_dataframe(self, spark: SparkSession): 124 | model_df = spark \ 125 | .read \ 126 | .parquet(self.config['io']['models']) 127 | 128 | return model_df 129 | 130 | @staticmethod 131 | def convert_forecasts(forecast_df: DataFrame): 132 | created_timestamp = datetime.now(timezone.utc) \ 133 | .replace(microsecond=0) \ 134 | .isoformat() 135 | 136 | return forecast_df \ 137 | .withColumn("created_timestamp", lit(created_timestamp)) \ 138 | .select("created_timestamp", 139 | "series_id", 140 | "dim_id", 141 | extract_date_udf("ds").alias("forecast_date"), # include a date without time 142 | "ds", 143 | "yhat") \ 144 | .withColumnRenamed("ds", "forecast_timestamp") \ 145 | .withColumnRenamed("yhat", "forecast_quantity") 146 | 147 | def write_forecasts(self, output_df: DataFrame): 148 | output_df \ 149 | .write \ 150 | .csv(self.config['io']['forecasts'], mode='overwrite', header=True) 151 | 152 | @staticmethod 153 | def score(spark_session, config): 154 | spark_session.conf.set("spark.sql.execution.arrow.enabled", 155 | "true") # needed to convert to/from Pandas dataframe 156 | scorer = ProphetScorer(config) 157 | model_df = scorer.read_model_dataframe(spark_session) 158 | 159 | forecast_df = model_df \ 160 | .groupby('series_id', 'dim_id') \ 161 | .apply(forecast_time_series(scorer.config)) 162 | 163 | converted_df = scorer.convert_forecasts(forecast_df) 164 | 165 | scorer.write_forecasts(converted_df) 166 | -------------------------------------------------------------------------------- /src/modeler_spark_driver.py: -------------------------------------------------------------------------------- 1 | from jobs.prophet_modeler import ProphetModeler 2 | from pyspark.sql import SparkSession 3 | import sys 4 | import yaml 5 | 6 | if __name__ == '__main__': 7 | spark_session = SparkSession.builder.appName('TimeSeriesForecastModeler').getOrCreate() 8 | 9 | if len(sys.argv) != 2: 10 | print("arg1 must be the config YAML") 11 | exit(1) 12 | 13 | with open(sys.argv[1]) as file: 14 | config = yaml.safe_load(file) 15 | 16 | print(f"config: {config}") 17 | 18 | ProphetModeler.model(spark_session, config) 19 | 20 | print("closing spark session") 21 | spark_session.stop() 22 | 23 | -------------------------------------------------------------------------------- /src/scorer_spark_driver.py: -------------------------------------------------------------------------------- 1 | from jobs.prophet_scorer import ProphetScorer 2 | from pyspark.sql import SparkSession 3 | import sys 4 | import yaml 5 | 6 | if __name__ == '__main__': 7 | spark_session = SparkSession.builder.appName('TimeSeriesForecastScorer').getOrCreate() 8 | 9 | if len(sys.argv) != 2: 10 | print("arg1 must be the config YAML") 11 | exit(1) 12 | 13 | with open(sys.argv[1]) as file: 14 | config = yaml.safe_load(file) 15 | 16 | print(f"config: {config}") 17 | 18 | ProphetScorer.score(spark_session, config) 19 | 20 | print("closing spark session") 21 | spark_session.stop() 22 | -------------------------------------------------------------------------------- /tests/fixtures/model-input/series_id=751/sample-model-input.csv: -------------------------------------------------------------------------------- 1 | 91,2001-01-05 11:15:00,36445 2 | 91,2001-01-05 21:45:00,45724 3 | 91,2001-01-07 11:15:00,40891 4 | 91,2001-01-07 21:45:00,38598 5 | 91,2001-01-12 11:15:00,21823 6 | 
91,2001-01-12 21:45:00,66162 7 | 91,2001-01-14 11:15:00,22251 8 | 91,2001-01-14 21:45:00,27663 9 | 91,2001-01-19 11:15:00,41971 10 | 91,2001-01-19 21:45:00,65254 11 | 91,2001-01-21 11:15:00,43467 12 | 91,2001-01-21 21:45:00,78034 13 | 91,2001-01-26 11:15:00,48042 14 | 91,2001-01-26 21:45:00,51967 15 | 91,2001-02-02 11:15:00,20871 16 | 91,2001-02-02 21:45:00,57492 17 | 91,2001-02-04 11:15:00,23932 18 | 91,2001-02-04 21:45:00,69640 19 | 91,2001-02-09 11:15:00,40401 20 | 91,2001-02-09 21:45:00,74112 21 | 91,2001-02-11 11:15:00,14379 22 | 91,2001-02-11 21:45:00,24779 23 | 91,2001-02-16 11:15:00,27079 24 | 91,2001-02-16 21:45:00,89082 25 | 91,2001-02-18 11:15:00,19567 26 | 91,2001-02-18 21:45:00,53830 27 | 91,2001-02-23 11:15:00,73004 28 | 91,2001-02-23 21:45:00,94174 29 | 91,2001-02-25 11:15:00,4597 30 | 91,2001-02-25 21:45:00,37117 31 | 91,2001-03-01 11:15:00,38493 32 | 91,2001-03-01 21:45:00,88510 33 | 91,2001-03-03 11:15:00,21451 34 | 91,2001-03-03 21:45:00,23728 35 | 91,2001-03-08 11:15:00,45463 36 | 91,2001-03-08 21:45:00,46318 37 | 91,2001-03-10 11:15:00,28366 38 | 91,2001-03-10 21:45:00,52082 39 | 91,2001-03-15 11:15:00,41617 40 | 91,2001-03-15 21:45:00,41544 41 | 91,2001-03-17 11:15:00,35316 42 | 91,2001-03-17 21:45:00,38839 43 | 91,2001-03-22 11:15:00,36006 44 | 91,2001-03-22 21:45:00,53785 45 | 91,2001-03-24 11:15:00,35939 46 | 91,2001-03-24 21:45:00,46446 47 | 91,2001-03-29 11:15:00,24116 48 | 91,2001-03-29 21:45:00,60230 49 | 91,2001-03-31 11:15:00,22322 50 | 91,2001-03-31 21:45:00,67151 51 | 91,2001-04-05 11:15:00,50135 52 | 91,2001-04-05 21:45:00,70147 53 | 91,2001-04-07 11:15:00,25075 54 | 91,2001-04-07 21:45:00,57819 55 | 91,2001-04-12 11:15:00,37735 56 | 91,2001-04-12 21:45:00,41320 57 | 91,2001-04-14 11:15:00,24265 58 | 91,2001-04-14 21:45:00,69948 59 | 91,2001-04-19 11:15:00,24272 60 | 91,2001-04-19 21:45:00,46014 61 | 91,2001-04-21 11:15:00,8831 62 | 91,2001-04-21 21:45:00,50994 63 | 91,2001-04-26 11:15:00,24417 64 | 91,2001-04-26 21:45:00,43985 65 | 91,2001-04-28 11:15:00,21357 66 | 91,2001-04-28 21:45:00,40104 67 | 91,2001-05-03 11:15:00,21921 68 | 91,2001-05-03 21:45:00,48765 69 | 91,2001-05-05 11:15:00,39914 70 | 91,2001-05-05 21:45:00,47584 71 | 91,2001-05-10 11:15:00,21628 72 | 91,2001-05-10 21:45:00,40713 73 | 91,2001-05-12 11:15:00,36413 74 | 91,2001-05-12 21:45:00,48834 75 | 91,2001-05-17 11:15:00,22638 76 | 91,2001-05-17 21:45:00,35891 77 | 91,2001-05-19 11:15:00,25321 78 | 91,2001-05-19 21:45:00,37890 79 | 91,2001-05-24 11:15:00,45577 80 | 91,2001-05-24 21:45:00,36964 81 | 91,2001-05-26 11:15:00,25931 82 | 91,2001-05-26 21:45:00,16763 83 | 91,2001-05-31 11:15:00,26824 84 | 91,2001-05-31 21:45:00,33766 85 | 91,2001-06-02 11:15:00,44606 86 | 91,2001-06-02 21:45:00,32453 87 | 91,2001-06-07 11:15:00,20121 88 | 91,2001-06-07 21:45:00,57270 89 | 91,2001-06-09 11:15:00,23307 90 | 91,2001-06-09 21:45:00,26673 91 | 91,2001-06-14 11:15:00,28635 92 | 91,2001-06-14 21:45:00,39151 93 | 91,2001-06-16 11:15:00,19480 94 | 91,2001-06-16 21:45:00,31806 95 | 91,2001-06-21 11:15:00,25035 96 | 91,2001-06-21 21:45:00,46247 97 | 91,2001-06-23 11:15:00,23578 98 | 91,2001-06-23 21:45:00,26755 99 | 91,2001-06-28 11:15:00,21714 100 | 91,2001-06-28 21:45:00,47806 101 | 91,2001-06-30 11:15:00,34080 102 | 91,2001-06-30 21:45:00,39258 103 | 91,2001-07-05 11:15:00,18045 104 | 91,2001-07-05 21:45:00,62693 105 | 91,2001-07-07 11:15:00,22567 106 | 91,2001-07-07 21:45:00,34251 107 | 91,2001-07-12 11:15:00,21305 108 | 91,2001-07-12 21:45:00,35910 109 | 91,2001-07-14 11:15:00,25308 110 | 
91,2001-07-14 21:45:00,18350 111 | 91,2001-07-19 11:15:00,36281 112 | 91,2001-07-19 21:45:00,31472 113 | 91,2001-07-21 11:15:00,43686 114 | 91,2001-07-21 21:45:00,28472 115 | 91,2001-07-26 11:15:00,36646 116 | 91,2001-07-26 21:45:00,40556 117 | 91,2001-07-28 11:15:00,28966 118 | 91,2001-07-28 21:45:00,34096 119 | 91,2001-08-02 11:15:00,40002 120 | 91,2001-08-02 21:45:00,26874 121 | 91,2001-08-04 11:15:00,24951 122 | 91,2001-08-04 21:45:00,20656 123 | 91,2001-08-09 11:15:00,10173 124 | 91,2001-08-09 21:45:00,35239 125 | 91,2001-08-11 11:15:00,34420 126 | 91,2001-08-11 21:45:00,8168 127 | 91,2001-08-16 11:15:00,27422 128 | 91,2001-08-16 21:45:00,30065 129 | 91,2001-08-18 11:15:00,11033 130 | 91,2001-08-18 21:45:00,27083 131 | 91,2001-08-23 11:15:00,21043 132 | 91,2001-08-23 21:45:00,21620 133 | 91,2001-08-25 11:15:00,15618 134 | 91,2001-08-25 21:45:00,20564 135 | 91,2001-08-30 11:15:00,20128 136 | 91,2001-08-30 21:45:00,41981 137 | 91,2001-09-01 11:15:00,31125 138 | 91,2001-09-01 21:45:00,27130 139 | 91,2001-09-06 11:15:00,18949 140 | 91,2001-09-06 21:45:00,43783 141 | 91,2001-09-08 11:15:00,24134 142 | 91,2001-09-08 21:45:00,36398 143 | 91,2001-09-13 11:15:00,30022 144 | 91,2001-09-13 21:45:00,43283 145 | 91,2001-09-15 11:15:00,30295 146 | 91,2001-09-15 21:45:00,31854 147 | 91,2001-09-20 11:15:00,20853 148 | 91,2001-09-20 21:45:00,19783 149 | 91,2001-09-22 11:15:00,24796 150 | 91,2001-09-22 21:45:00,21741 151 | 91,2001-09-27 11:15:00,24632 152 | 91,2001-09-27 21:45:00,24239 153 | 91,2001-09-29 11:15:00,30279 154 | 91,2001-09-29 21:45:00,27009 155 | 91,2001-10-04 11:15:00,36132 156 | 91,2001-10-04 21:45:00,39291 157 | 91,2001-10-06 11:15:00,15676 158 | 91,2001-10-06 21:45:00,24646 159 | 91,2001-10-11 11:15:00,17277 160 | 91,2001-10-11 21:45:00,25673 161 | 91,2001-10-13 11:15:00,9854 162 | 91,2001-10-13 21:45:00,30027 163 | 91,2001-10-18 11:15:00,33085 164 | 91,2001-10-18 21:45:00,39714 165 | 91,2001-10-20 11:15:00,5896 166 | 91,2001-10-20 21:45:00,19681 167 | 91,2001-10-25 11:15:00,11271 168 | 91,2001-10-25 21:45:00,23135 169 | 91,2001-10-27 11:15:00,12205 170 | 91,2001-10-27 21:45:00,14825 171 | 91,2001-11-01 11:15:00,37000 172 | 91,2001-11-01 21:45:00,44628 173 | 91,2001-11-03 11:15:00,8906 174 | 91,2001-11-03 21:45:00,25971 175 | 91,2001-11-08 11:15:00,23585 176 | 91,2001-11-08 21:45:00,12089 177 | 91,2001-11-10 11:15:00,34457 178 | 91,2001-11-10 21:45:00,7422 179 | 91,2001-11-15 11:15:00,41470 180 | 91,2001-11-15 21:45:00,30664 181 | 91,2001-11-17 11:15:00,20418 182 | 91,2001-11-17 21:45:00,26153 183 | 91,2001-11-22 11:15:00,24652 184 | 91,2001-11-22 21:45:00,43923 185 | 91,2001-11-24 11:15:00,1594 186 | 91,2001-11-24 21:45:00,24324 187 | 91,2001-11-29 11:15:00,24340 188 | 91,2001-11-29 21:45:00,35080 189 | 91,2001-12-01 11:15:00,12223 190 | 91,2001-12-01 21:45:00,25693 191 | 91,2001-12-06 11:15:00,37306 192 | 91,2001-12-06 21:45:00,43944 193 | 91,2001-12-08 11:15:00,9804 194 | 91,2001-12-08 21:45:00,18679 195 | 91,2001-12-13 11:15:00,14830 196 | 91,2001-12-13 21:45:00,28011 197 | 91,2001-12-15 11:15:00,4741 198 | 91,2001-12-15 21:45:00,7650 199 | 91,2001-12-20 11:15:00,20580 200 | 91,2001-12-20 21:45:00,26661 201 | 91,2001-12-22 11:15:00,27715 202 | 91,2001-12-22 21:45:00,25029 203 | 91,2001-12-27 11:15:00,11975 204 | 91,2001-12-27 21:45:00,37147 205 | 91,2001-12-29 11:15:00,23825 206 | 91,2001-12-29 21:45:00,19599 207 | 91,2002-01-03 11:15:00,28622 208 | 91,2002-01-03 21:45:00,32035 209 | 91,2002-01-05 11:15:00,14997 210 | 91,2002-01-05 21:45:00,42943 211 | 91,2002-01-10 11:15:00,40949 
212 | 91,2002-01-10 21:45:00,36729 213 | 91,2002-01-12 11:15:00,5436 214 | 91,2002-01-12 21:45:00,22979 215 | 91,2002-01-17 11:15:00,15643 216 | 91,2002-01-17 21:45:00,14861 217 | 91,2002-01-19 11:15:00,12344 218 | 91,2002-01-19 21:45:00,15079 219 | 91,2002-01-24 11:15:00,18217 220 | 91,2002-01-24 21:45:00,23966 221 | 91,2002-01-26 11:15:00,19930 222 | 91,2002-01-26 21:45:00,31981 223 | 91,2002-01-31 11:15:00,34765 224 | 91,2002-01-31 21:45:00,53567 225 | 91,2002-02-02 11:15:00,9234 226 | 91,2002-02-02 21:45:00,27676 227 | 91,2002-02-07 11:15:00,38054 228 | 91,2002-02-07 21:45:00,36133 229 | 91,2002-02-14 11:15:00,25139 230 | 91,2002-02-14 21:45:00,27524 231 | 91,2002-02-16 11:15:00,13604 232 | 91,2002-02-16 21:45:00,40562 233 | 91,2002-02-21 11:15:00,5964 234 | 91,2002-02-21 21:45:00,37265 235 | 91,2002-02-23 11:15:00,13688 236 | 91,2002-02-23 21:45:00,45976 237 | 91,2002-02-28 11:15:00,20804 238 | 91,2002-02-28 21:45:00,60194 239 | 91,2002-03-02 11:15:00,27063 240 | 91,2002-03-02 21:45:00,24445 241 | 91,2002-03-07 11:15:00,27321 242 | 91,2002-03-07 21:45:00,35197 243 | 91,2002-03-09 11:15:00,25220 244 | 91,2002-03-09 21:45:00,25808 245 | 91,2002-03-14 11:15:00,26083 246 | 91,2002-03-14 21:45:00,68252 247 | 91,2002-03-16 11:15:00,7489 248 | 91,2002-03-16 21:45:00,4041 249 | 91,2002-03-21 11:15:00,8659 250 | 91,2002-03-21 21:45:00,39718 251 | 91,2002-03-23 11:15:00,12432 252 | 91,2002-03-23 21:45:00,35504 253 | 91,2002-03-28 11:15:00,28580 254 | 91,2002-03-28 21:45:00,30540 255 | 91,2002-03-30 11:15:00,20112 256 | 91,2002-03-30 21:45:00,14447 257 | 91,2002-04-04 11:15:00,18262 258 | 91,2002-04-04 21:45:00,38505 259 | 91,2002-04-06 11:15:00,9802 260 | 91,2002-04-06 21:45:00,17291 261 | 91,2002-04-11 11:15:00,40675 262 | 91,2002-04-11 21:45:00,43640 263 | 91,2002-04-13 11:15:00,24442 264 | 91,2002-04-13 21:45:00,28946 265 | 91,2002-04-18 11:15:00,35476 266 | 91,2002-04-18 21:45:00,22319 267 | 91,2002-04-20 11:15:00,13188 268 | 91,2002-04-20 21:45:00,24329 269 | 91,2002-04-25 11:15:00,31794 270 | 91,2002-04-25 21:45:00,25451 271 | 91,2002-04-27 11:15:00,23765 272 | 91,2002-04-27 21:45:00,11286 273 | 91,2002-05-02 11:15:00,6341 274 | 91,2002-05-02 21:45:00,35603 275 | 91,2002-05-04 11:15:00,25045 276 | 91,2002-05-04 21:45:00,17024 277 | 91,2002-05-09 11:15:00,26785 278 | 91,2002-05-09 21:45:00,24333 279 | 91,2002-05-11 11:15:00,3563 280 | 91,2002-05-11 21:45:00,17584 281 | 91,2002-05-16 11:15:00,13716 282 | 91,2002-05-16 21:45:00,22165 283 | 91,2002-05-18 11:15:00,19687 284 | 91,2002-05-18 21:45:00,6856 285 | 91,2002-05-23 11:15:00,27117 286 | 91,2002-05-23 21:45:00,35752 287 | 91,2002-05-25 11:15:00,26160 288 | 91,2002-05-25 21:45:00,22879 289 | 91,2002-05-30 11:15:00,20514 290 | 91,2002-05-30 21:45:00,16189 291 | 91,2002-06-01 11:15:00,19041 292 | 91,2002-06-01 21:45:00,16696 293 | 91,2002-06-06 11:15:00,25059 294 | 91,2002-06-06 21:45:00,20922 295 | 91,2002-06-08 11:15:00,20665 296 | 91,2002-06-08 21:45:00,17423 297 | 91,2002-06-13 11:15:00,3908 298 | 91,2002-06-13 21:45:00,33827 299 | 91,2002-06-15 11:15:00,9938 300 | 91,2002-06-15 21:45:00,40020 301 | 91,2002-06-20 11:15:00,17388 302 | 91,2002-06-20 21:45:00,14358 303 | 91,2002-06-22 11:15:00,13968 304 | 91,2002-06-22 21:45:00,35079 305 | 91,2002-06-27 11:15:00,38418 306 | 91,2002-06-27 21:45:00,34435 307 | 91,2002-06-29 11:15:00,8002 308 | 91,2002-06-29 21:45:00,15928 309 | 91,2002-07-04 11:15:00,14754 310 | 91,2002-07-04 21:45:00,18912 311 | 91,2002-07-06 11:15:00,13159 312 | 91,2002-07-06 21:45:00,24498 313 | 91,2002-07-11 
11:15:00,23921 314 | 91,2002-07-11 21:45:00,31337 315 | 91,2002-07-13 11:15:00,28059 316 | 91,2002-07-13 21:45:00,18429 317 | 91,2002-07-18 11:15:00,22195 318 | 91,2002-07-18 21:45:00,21239 319 | 91,2002-07-20 11:15:00,15259 320 | 91,2002-07-20 21:45:00,19636 321 | 91,2002-07-25 11:15:00,29279 322 | 91,2002-07-25 21:45:00,36344 323 | 91,2002-07-27 11:15:00,8153 324 | 91,2002-07-27 21:45:00,18183 325 | 91,2002-08-01 11:15:00,22066 326 | 91,2002-08-01 21:45:00,22358 327 | 91,2002-08-03 11:15:00,9754 328 | 91,2002-08-03 21:45:00,20418 329 | 91,2002-08-08 11:15:00,7865 330 | 91,2002-08-08 21:45:00,9133 331 | 91,2002-08-10 11:15:00,23567 332 | 91,2002-08-10 21:45:00,5894 333 | 91,2002-08-15 11:15:00,16018 334 | 91,2002-08-15 21:45:00,7985 335 | 91,2002-08-17 11:15:00,18098 336 | 91,2002-08-17 21:45:00,28770 337 | 91,2002-08-22 11:15:00,11046 338 | 91,2002-08-22 21:45:00,22970 339 | 91,2002-08-24 11:15:00,12181 340 | 91,2002-08-24 21:45:00,20678 341 | 91,2002-08-29 11:15:00,14893 342 | 91,2002-08-29 21:45:00,12246 343 | 91,2002-08-31 11:15:00,5884 344 | 91,2002-08-31 21:45:00,20540 345 | 91,2002-09-05 11:15:00,1846 346 | 91,2002-09-05 21:45:00,21171 347 | 91,2002-09-07 11:15:00,12188 348 | 91,2002-09-07 21:45:00,33484 349 | 91,2002-09-12 11:15:00,2075 350 | 91,2002-09-12 21:45:00,33688 351 | 91,2002-09-14 11:15:00,8313 352 | 91,2002-09-14 21:45:00,11863 353 | 91,2002-09-19 11:15:00,28486 354 | 91,2002-09-19 21:45:00,47206 355 | 91,2002-09-21 11:15:00,5032 356 | 91,2002-09-21 21:45:00,9533 357 | 91,2002-09-26 11:15:00,20036 358 | 91,2002-09-26 21:45:00,54346 359 | 91,2002-10-03 11:15:00,9811 360 | 91,2002-10-03 21:45:00,16758 361 | 91,2002-10-05 11:15:00,9084 362 | 91,2002-10-05 21:45:00,44899 363 | 91,2002-10-10 11:15:00,25392 364 | 91,2002-10-10 21:45:00,27966 365 | 91,2002-10-12 11:15:00,7011 366 | 91,2002-10-12 21:45:00,43894 367 | 91,2002-10-17 11:15:00,11833 368 | 91,2002-10-17 21:45:00,16352 369 | 91,2002-10-19 11:15:00,22622 370 | 91,2002-10-19 21:45:00,13215 371 | 91,2002-10-24 11:15:00,25672 372 | 91,2002-10-24 21:45:00,33190 373 | 91,2002-10-26 11:15:00,4290 374 | 91,2002-10-26 21:45:00,27019 375 | 91,2002-10-31 11:15:00,11876 376 | 91,2002-10-31 21:45:00,18089 377 | 91,2002-11-02 11:15:00,15072 378 | 91,2002-11-02 21:45:00,27975 379 | 91,2002-11-07 11:15:00,11524 380 | 91,2002-11-07 21:45:00,35711 381 | 91,2002-11-09 11:15:00,16570 382 | 91,2002-11-09 21:45:00,16307 383 | 91,2002-11-14 11:15:00,30377 384 | 91,2002-11-14 21:45:00,18593 385 | 91,2002-11-16 11:15:00,6081 386 | 91,2002-11-16 21:45:00,17141 387 | 91,2002-11-21 11:15:00,18077 388 | 91,2002-11-21 21:45:00,34452 389 | 91,2002-11-23 11:15:00,25290 390 | 91,2002-11-23 21:45:00,7860 391 | 91,2002-11-28 11:15:00,21380 392 | 91,2002-11-28 21:45:00,32118 393 | 91,2002-11-30 11:15:00,4522 394 | 91,2002-11-30 21:45:00,25127 395 | 91,2002-12-12 11:15:00,22800 396 | 91,2002-12-12 11:15:00,22800 397 | 91,2002-12-12 21:45:00,20536 398 | 91,2002-12-12 21:45:00,20536 399 | 91,2002-12-14 11:15:00,7854 400 | 91,2002-12-14 11:15:00,7854 401 | 91,2002-12-14 21:45:00,15454 402 | 91,2002-12-14 21:45:00,15454 403 | 91,2002-12-19 11:15:00,18251 404 | 91,2002-12-19 21:45:00,24406 405 | 91,2002-12-21 11:15:00,11551 406 | 91,2002-12-21 21:45:00,9024 407 | 91,2002-12-26 11:15:00,45343 408 | 91,2002-12-26 21:45:00,38407 409 | 91,2002-12-28 11:15:00,11481 410 | 91,2002-12-28 21:45:00,9223 411 | 155,2001-01-05 11:15:00,47160 412 | 155,2001-01-05 21:45:00,79145 413 | 155,2001-01-07 11:15:00,28556 414 | 155,2001-01-07 21:45:00,44767 415 | 155,2001-01-12 
11:15:00,22247 416 | 155,2001-01-12 21:45:00,57113 417 | 155,2001-01-14 11:15:00,25489 418 | 155,2001-01-14 21:45:00,29332 419 | 155,2001-01-19 11:15:00,53915 420 | 155,2001-01-19 21:45:00,74347 421 | 155,2001-01-21 11:15:00,26968 422 | 155,2001-01-21 21:45:00,74567 423 | 155,2001-01-26 11:15:00,21522 424 | 155,2001-01-26 21:45:00,45837 425 | 155,2001-02-02 11:15:00,27172 426 | 155,2001-02-02 21:45:00,52341 427 | 155,2001-02-04 11:15:00,25381 428 | 155,2001-02-04 21:45:00,37220 429 | 155,2001-02-09 11:15:00,42818 430 | 155,2001-02-09 21:45:00,75960 431 | 155,2001-02-11 11:15:00,10505 432 | 155,2001-02-11 21:45:00,27725 433 | 155,2001-02-16 11:15:00,23650 434 | 155,2001-02-16 21:45:00,61938 435 | 155,2001-02-18 11:15:00,13298 436 | 155,2001-02-18 21:45:00,25214 437 | 155,2001-02-23 11:15:00,41355 438 | 155,2001-02-23 21:45:00,99310 439 | 155,2001-02-25 11:15:00,52462 440 | 155,2001-02-25 21:45:00,47029 441 | 155,2001-03-01 11:15:00,27606 442 | 155,2001-03-01 21:45:00,60153 443 | 155,2001-03-03 11:15:00,17431 444 | 155,2001-03-03 21:45:00,34277 445 | 155,2001-03-08 11:15:00,32507 446 | 155,2001-03-08 21:45:00,98225 447 | 155,2001-03-10 11:15:00,13529 448 | 155,2001-03-10 21:45:00,59719 449 | 155,2001-03-15 11:15:00,45699 450 | 155,2001-03-15 21:45:00,81882 451 | 155,2001-03-17 11:15:00,22303 452 | 155,2001-03-17 21:45:00,54697 453 | 155,2001-03-22 11:15:00,63869 454 | 155,2001-03-22 21:45:00,59625 455 | 155,2001-03-24 11:15:00,23878 456 | 155,2001-03-24 21:45:00,57520 457 | 155,2001-03-29 11:15:00,25564 458 | 155,2001-03-29 21:45:00,60518 459 | 155,2001-03-31 11:15:00,46884 460 | 155,2001-03-31 21:45:00,50539 461 | 155,2001-04-05 11:15:00,29264 462 | 155,2001-04-05 21:45:00,66833 463 | 155,2001-04-07 11:15:00,46394 464 | 155,2001-04-07 21:45:00,83511 465 | 155,2001-04-12 11:15:00,36101 466 | 155,2001-04-12 21:45:00,63117 467 | 155,2001-04-14 11:15:00,35112 468 | 155,2001-04-14 21:45:00,79289 469 | 155,2001-04-19 11:15:00,45884 470 | 155,2001-04-19 21:45:00,54673 471 | 155,2001-04-21 11:15:00,14909 472 | 155,2001-04-21 21:45:00,58232 473 | 155,2001-04-26 11:15:00,25369 474 | 155,2001-04-26 21:45:00,58654 475 | 155,2001-04-28 11:15:00,31886 476 | 155,2001-04-28 21:45:00,36885 477 | 155,2001-05-03 11:15:00,9376 478 | 155,2001-05-03 21:45:00,30407 479 | 155,2001-05-05 11:15:00,14027 480 | 155,2001-05-05 21:45:00,52893 481 | 155,2001-05-10 11:15:00,16558 482 | 155,2001-05-10 21:45:00,76662 483 | 155,2001-05-12 11:15:00,20462 484 | 155,2001-05-12 21:45:00,55767 485 | 155,2001-05-17 11:15:00,31514 486 | 155,2001-05-17 21:45:00,62208 487 | 155,2001-05-19 11:15:00,19076 488 | 155,2001-05-19 21:45:00,39989 489 | 155,2001-05-24 11:15:00,36241 490 | 155,2001-05-24 21:45:00,39279 491 | 155,2001-05-26 11:15:00,28568 492 | 155,2001-05-26 21:45:00,39174 493 | 155,2001-05-31 11:15:00,25074 494 | 155,2001-05-31 21:45:00,46620 495 | 155,2001-06-02 11:15:00,41491 496 | 155,2001-06-02 21:45:00,41291 497 | 155,2001-06-07 11:15:00,22774 498 | 155,2001-06-07 21:45:00,50928 499 | 155,2001-06-09 11:15:00,31219 500 | 155,2001-06-09 21:45:00,22728 501 | 155,2001-06-14 11:15:00,43415 502 | 155,2001-06-14 21:45:00,63540 503 | 155,2001-06-16 11:15:00,21249 504 | 155,2001-06-16 21:45:00,40849 505 | 155,2001-06-21 11:15:00,48760 506 | 155,2001-06-21 21:45:00,81860 507 | 155,2001-06-23 11:15:00,44529 508 | 155,2001-06-23 21:45:00,66362 509 | 155,2001-06-28 11:15:00,32718 510 | 155,2001-06-28 21:45:00,47446 511 | 155,2001-06-30 11:15:00,29634 512 | 155,2001-06-30 21:45:00,38638 513 | 155,2001-07-05 11:15:00,39456 514 | 
155,2001-07-05 21:45:00,77100 515 | 155,2001-07-07 11:15:00,13870 516 | 155,2001-07-07 21:45:00,56203 517 | 155,2001-07-12 11:15:00,46392 518 | 155,2001-07-12 21:45:00,37438 519 | 155,2001-07-14 11:15:00,25586 520 | 155,2001-07-14 21:45:00,13469 521 | 155,2001-07-19 11:15:00,30838 522 | 155,2001-07-19 21:45:00,28566 523 | 155,2001-07-21 11:15:00,30584 524 | 155,2001-07-21 21:45:00,43256 525 | 155,2001-07-26 11:15:00,31799 526 | 155,2001-07-26 21:45:00,87748 527 | 155,2001-07-28 11:15:00,15149 528 | 155,2001-07-28 21:45:00,52908 529 | 155,2001-08-02 11:15:00,26134 530 | 155,2001-08-02 21:45:00,32303 531 | 155,2001-08-04 11:15:00,27381 532 | 155,2001-08-04 21:45:00,42345 533 | 155,2001-08-09 11:15:00,25117 534 | 155,2001-08-09 21:45:00,46062 535 | 155,2001-08-11 11:15:00,32478 536 | 155,2001-08-11 21:45:00,43279 537 | 155,2001-08-16 11:15:00,27900 538 | 155,2001-08-16 21:45:00,71976 539 | 155,2001-08-18 11:15:00,20572 540 | 155,2001-08-18 21:45:00,40490 541 | 155,2001-08-23 11:15:00,35076 542 | 155,2001-08-23 21:45:00,52676 543 | 155,2001-08-25 11:15:00,17380 544 | 155,2001-08-25 21:45:00,46913 545 | 155,2001-08-30 11:15:00,30934 546 | 155,2001-08-30 21:45:00,119007 547 | 155,2001-09-01 11:15:00,25038 548 | 155,2001-09-01 21:45:00,70484 549 | 155,2001-09-06 11:15:00,20296 550 | 155,2001-09-06 21:45:00,68376 551 | 155,2001-09-08 11:15:00,39674 552 | 155,2001-09-08 21:45:00,51128 553 | 155,2001-09-13 11:15:00,33335 554 | 155,2001-09-13 21:45:00,57007 555 | 155,2001-09-15 11:15:00,41197 556 | 155,2001-09-15 21:45:00,69643 557 | 155,2001-09-20 11:15:00,35542 558 | 155,2001-09-20 21:45:00,51868 559 | 155,2001-09-22 11:15:00,14775 560 | 155,2001-09-22 21:45:00,36246 561 | 155,2001-09-27 11:15:00,39445 562 | 155,2001-09-27 21:45:00,59610 563 | 155,2001-09-29 11:15:00,13120 564 | 155,2001-09-29 21:45:00,50651 565 | 155,2001-10-04 11:15:00,38468 566 | 155,2001-10-04 21:45:00,67526 567 | 155,2001-10-06 11:15:00,26560 568 | 155,2001-10-06 21:45:00,64617 569 | 155,2001-10-11 11:15:00,10619 570 | 155,2001-10-11 21:45:00,24220 571 | 155,2001-10-13 11:15:00,12291 572 | 155,2001-10-13 21:45:00,46453 573 | 155,2001-10-18 11:15:00,25500 574 | 155,2001-10-18 21:45:00,88530 575 | 155,2001-10-20 11:15:00,22302 576 | 155,2001-10-20 21:45:00,34156 577 | 155,2001-10-25 11:15:00,12799 578 | 155,2001-10-25 21:45:00,36165 579 | 155,2001-10-27 11:15:00,33056 580 | 155,2001-10-27 21:45:00,42997 581 | 155,2001-11-01 11:15:00,59764 582 | 155,2001-11-01 21:45:00,80346 583 | 155,2001-11-03 11:15:00,26249 584 | 155,2001-11-03 21:45:00,19786 585 | 155,2001-11-08 11:15:00,46280 586 | 155,2001-11-08 21:45:00,46632 587 | 155,2001-11-10 11:15:00,27986 588 | 155,2001-11-10 21:45:00,35418 589 | 155,2001-11-15 11:15:00,33701 590 | 155,2001-11-15 21:45:00,100492 591 | 155,2001-11-17 11:15:00,11008 592 | 155,2001-11-17 21:45:00,41364 593 | 155,2001-11-22 11:15:00,19919 594 | 155,2001-11-22 21:45:00,108506 595 | 155,2001-11-24 11:15:00,11466 596 | 155,2001-11-24 21:45:00,26303 597 | 155,2001-11-29 11:15:00,34608 598 | 155,2001-11-29 21:45:00,66799 599 | 155,2001-12-01 11:15:00,36585 600 | 155,2001-12-01 21:45:00,39367 601 | 155,2001-12-06 11:15:00,58185 602 | 155,2001-12-06 21:45:00,56685 603 | 155,2001-12-08 11:15:00,15953 604 | 155,2001-12-08 21:45:00,41716 605 | 155,2001-12-13 11:15:00,23780 606 | 155,2001-12-13 21:45:00,54777 607 | 155,2001-12-15 11:15:00,28346 608 | 155,2001-12-15 21:45:00,30267 609 | 155,2001-12-20 11:15:00,17987 610 | 155,2001-12-20 21:45:00,53592 611 | 155,2001-12-22 11:15:00,14796 612 | 155,2001-12-22 
21:45:00,51911 613 | 155,2001-12-27 11:15:00,23721 614 | 155,2001-12-27 21:45:00,80647 615 | 155,2001-12-29 11:15:00,23631 616 | 155,2001-12-29 21:45:00,57509 617 | 155,2002-01-03 11:15:00,46025 618 | 155,2002-01-03 21:45:00,76333 619 | 155,2002-01-05 11:15:00,14878 620 | 155,2002-01-05 21:45:00,44175 621 | 155,2002-01-10 11:15:00,50535 622 | 155,2002-01-10 21:45:00,73147 623 | 155,2002-01-12 11:15:00,18213 624 | 155,2002-01-12 21:45:00,49091 625 | 155,2002-01-17 11:15:00,28559 626 | 155,2002-01-17 21:45:00,69585 627 | 155,2002-01-19 11:15:00,14629 628 | 155,2002-01-19 21:45:00,74604 629 | 155,2002-01-24 11:15:00,39554 630 | 155,2002-01-24 21:45:00,56264 631 | 155,2002-01-26 11:15:00,26463 632 | 155,2002-01-26 21:45:00,45483 633 | 155,2002-01-31 11:15:00,29566 634 | 155,2002-01-31 21:45:00,127322 635 | 155,2002-02-02 11:15:00,30568 636 | 155,2002-02-02 21:45:00,63657 637 | 155,2002-02-07 11:15:00,51733 638 | 155,2002-02-07 21:45:00,53442 639 | 155,2002-02-14 11:15:00,35365 640 | 155,2002-02-14 21:45:00,52901 641 | 155,2002-02-16 11:15:00,28502 642 | 155,2002-02-16 21:45:00,17340 643 | 155,2002-02-21 11:15:00,35378 644 | 155,2002-02-21 21:45:00,81922 645 | 155,2002-02-23 11:15:00,26591 646 | 155,2002-02-23 21:45:00,38141 647 | 155,2002-02-28 11:15:00,39223 648 | 155,2002-02-28 21:45:00,61799 649 | 155,2002-03-02 11:15:00,21661 650 | 155,2002-03-02 21:45:00,26651 651 | 155,2002-03-07 11:15:00,24898 652 | 155,2002-03-07 21:45:00,66425 653 | 155,2002-03-09 11:15:00,11629 654 | 155,2002-03-09 21:45:00,45727 655 | 155,2002-03-14 11:15:00,18354 656 | 155,2002-03-14 21:45:00,74475 657 | 155,2002-03-16 11:15:00,15828 658 | 155,2002-03-16 21:45:00,56993 659 | 155,2002-03-21 11:15:00,57501 660 | 155,2002-03-21 21:45:00,44223 661 | 155,2002-03-23 11:15:00,9902 662 | 155,2002-03-23 21:45:00,40780 663 | 155,2002-03-28 11:15:00,49527 664 | 155,2002-03-28 21:45:00,51841 665 | 155,2002-03-30 11:15:00,28272 666 | 155,2002-03-30 21:45:00,25346 667 | 155,2002-04-04 11:15:00,28262 668 | 155,2002-04-04 21:45:00,65081 669 | 155,2002-04-06 11:15:00,7648 670 | 155,2002-04-06 21:45:00,17067 671 | 155,2002-04-11 11:15:00,61328 672 | 155,2002-04-11 21:45:00,46403 673 | 155,2002-04-13 11:15:00,21971 674 | 155,2002-04-13 21:45:00,52832 675 | 155,2002-04-18 11:15:00,56265 676 | 155,2002-04-18 21:45:00,63298 677 | 155,2002-04-20 11:15:00,25280 678 | 155,2002-04-20 21:45:00,9841 679 | 155,2002-04-25 11:15:00,55619 680 | 155,2002-04-25 21:45:00,28003 681 | 155,2002-04-27 11:15:00,30943 682 | 155,2002-04-27 21:45:00,32108 683 | 155,2002-05-02 11:15:00,23441 684 | 155,2002-05-02 21:45:00,63216 685 | 155,2002-05-04 11:15:00,44246 686 | 155,2002-05-04 21:45:00,37654 687 | 155,2002-05-09 11:15:00,23782 688 | 155,2002-05-09 21:45:00,35986 689 | 155,2002-05-11 11:15:00,26588 690 | 155,2002-05-11 21:45:00,50344 691 | 155,2002-05-16 11:15:00,18303 692 | 155,2002-05-16 21:45:00,26610 693 | 155,2002-05-18 11:15:00,21812 694 | 155,2002-05-18 21:45:00,12481 695 | 155,2002-05-23 11:15:00,47154 696 | 155,2002-05-23 21:45:00,51113 697 | 155,2002-05-25 11:15:00,29322 698 | 155,2002-05-25 21:45:00,16905 699 | 155,2002-05-30 11:15:00,37798 700 | 155,2002-05-30 21:45:00,63267 701 | 155,2002-06-01 11:15:00,24901 702 | 155,2002-06-01 21:45:00,32972 703 | 155,2002-06-06 11:15:00,56873 704 | 155,2002-06-06 21:45:00,51542 705 | 155,2002-06-08 11:15:00,30199 706 | 155,2002-06-08 21:45:00,42197 707 | 155,2002-06-13 11:15:00,36657 708 | 155,2002-06-13 21:45:00,56763 709 | 155,2002-06-15 11:15:00,15616 710 | 155,2002-06-15 21:45:00,61732 711 | 
155,2002-06-20 11:15:00,37168 712 | 155,2002-06-20 21:45:00,49352 713 | 155,2002-06-22 11:15:00,15282 714 | 155,2002-06-22 21:45:00,34328 715 | 155,2002-06-27 11:15:00,43994 716 | 155,2002-06-27 21:45:00,36183 717 | 155,2002-06-29 11:15:00,14257 718 | 155,2002-06-29 21:45:00,47385 719 | 155,2002-07-04 11:15:00,27885 720 | 155,2002-07-04 21:45:00,46133 721 | 155,2002-07-06 11:15:00,16643 722 | 155,2002-07-06 21:45:00,23292 723 | 155,2002-07-11 11:15:00,18303 724 | 155,2002-07-11 21:45:00,65505 725 | 155,2002-07-13 11:15:00,48652 726 | 155,2002-07-13 21:45:00,31875 727 | 155,2002-07-18 11:15:00,19188 728 | 155,2002-07-18 21:45:00,39646 729 | 155,2002-07-20 11:15:00,43831 730 | 155,2002-07-20 21:45:00,30483 731 | 155,2002-07-25 11:15:00,45441 732 | 155,2002-07-25 21:45:00,63240 733 | 155,2002-07-27 11:15:00,28574 734 | 155,2002-07-27 21:45:00,23596 735 | 155,2002-08-01 11:15:00,21197 736 | 155,2002-08-01 21:45:00,52841 737 | 155,2002-08-03 11:15:00,14047 738 | 155,2002-08-03 21:45:00,60289 739 | 155,2002-08-08 11:15:00,34957 740 | 155,2002-08-08 21:45:00,30751 741 | 155,2002-08-10 11:15:00,20037 742 | 155,2002-08-10 21:45:00,66834 743 | 155,2002-08-15 11:15:00,37907 744 | 155,2002-08-15 21:45:00,37676 745 | 155,2002-08-17 11:15:00,21647 746 | 155,2002-08-17 21:45:00,47694 747 | 155,2002-08-22 11:15:00,15658 748 | 155,2002-08-22 21:45:00,37566 749 | 155,2002-08-24 11:15:00,6382 750 | 155,2002-08-24 21:45:00,25552 751 | 155,2002-08-29 11:15:00,13945 752 | 155,2002-08-29 21:45:00,25045 753 | 155,2002-08-31 11:15:00,46877 754 | 155,2002-08-31 21:45:00,47260 755 | 155,2002-09-05 11:15:00,9946 756 | 155,2002-09-05 21:45:00,62600 757 | 155,2002-09-07 11:15:00,11786 758 | 155,2002-09-07 21:45:00,29597 759 | 155,2002-09-12 11:15:00,23993 760 | 155,2002-09-12 21:45:00,62210 761 | 155,2002-09-14 11:15:00,17723 762 | 155,2002-09-14 21:45:00,32499 763 | 155,2002-09-19 11:15:00,4580 764 | 155,2002-09-19 21:45:00,19836 765 | 155,2002-09-21 11:15:00,17779 766 | 155,2002-09-21 21:45:00,47976 767 | 155,2002-09-26 11:15:00,51844 768 | 155,2002-09-26 21:45:00,45191 769 | 155,2002-10-03 11:15:00,19679 770 | 155,2002-10-03 21:45:00,39990 771 | 155,2002-10-05 11:15:00,10319 772 | 155,2002-10-05 21:45:00,39537 773 | 155,2002-10-10 11:15:00,19477 774 | 155,2002-10-10 21:45:00,46900 775 | 155,2002-10-12 11:15:00,12031 776 | 155,2002-10-12 21:45:00,21471 777 | 155,2002-10-17 11:15:00,22960 778 | 155,2002-10-17 21:45:00,40154 779 | 155,2002-10-19 11:15:00,5348 780 | 155,2002-10-19 21:45:00,30237 781 | 155,2002-10-24 11:15:00,22789 782 | 155,2002-10-24 21:45:00,55010 783 | 155,2002-10-26 11:15:00,15351 784 | 155,2002-10-26 21:45:00,39595 785 | 155,2002-10-31 11:15:00,14519 786 | 155,2002-10-31 21:45:00,33043 787 | 155,2002-11-02 11:15:00,25853 788 | 155,2002-11-02 21:45:00,27839 789 | 155,2002-11-07 11:15:00,20881 790 | 155,2002-11-07 21:45:00,44755 791 | 155,2002-11-09 11:15:00,5250 792 | 155,2002-11-09 21:45:00,34903 793 | 155,2002-11-14 11:15:00,29225 794 | 155,2002-11-14 21:45:00,34545 795 | 155,2002-11-16 11:15:00,25539 796 | 155,2002-11-16 21:45:00,61767 797 | 155,2002-11-21 11:15:00,34971 798 | 155,2002-11-21 21:45:00,87959 799 | 155,2002-11-23 11:15:00,43600 800 | 155,2002-11-23 21:45:00,83211 801 | 155,2002-11-28 11:15:00,21938 802 | 155,2002-11-28 21:45:00,36940 803 | 155,2002-11-30 11:15:00,10336 804 | 155,2002-11-30 21:45:00,31425 805 | 155,2002-12-12 11:15:00,22621 806 | 155,2002-12-12 21:45:00,70786 807 | 155,2002-12-14 11:15:00,24884 808 | 155,2002-12-14 21:45:00,23938 809 | 155,2002-12-19 11:15:00,25227 
810 | 155,2002-12-19 21:45:00,62650 811 | 155,2002-12-21 11:15:00,38959 812 | 155,2002-12-21 21:45:00,26640 813 | 155,2002-12-26 11:15:00,83510 814 | 155,2002-12-26 21:45:00,69198 815 | 155,2002-12-28 11:15:00,34616 816 | 155,2002-12-28 21:45:00,20118 817 | -------------------------------------------------------------------------------- /tests/unit/prophet_modeler_test.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | from pyspark.sql import SparkSession 4 | 5 | from jobs.prophet_modeler import ProphetModeler, model_time_series 6 | 7 | 8 | def suppress_py4j_logging(): 9 | logger = logging.getLogger('py4j') 10 | logger.setLevel(logging.WARN) 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def spark_session(request): 15 | """ fixture for creating a spark session 16 | Args: 17 | request: pytest.FixtureRequest object 18 | """ 19 | global spark 20 | spark = SparkSession.builder.master("local[*]").appName("TestModeler").getOrCreate() 21 | request.addfinalizer(lambda: spark.stop()) 22 | 23 | suppress_py4j_logging() 24 | return spark 25 | 26 | 27 | @pytest.fixture() 28 | def config(): 29 | config = { 30 | 'io': { 31 | 'input': 'tests/fixtures/model-input', 32 | 'models': 'build/models', 33 | }, 34 | 'model': { 35 | 'floor': 0, 36 | 'cap_multiplier': 1.1 37 | }} 38 | return config 39 | 40 | 41 | @pytest.fixture() 42 | def setup(config): 43 | global modeler 44 | modeler = ProphetModeler(config) 45 | 46 | 47 | @pytest.fixture() 48 | def spark_input_df(spark_session): 49 | return modeler.read_input_dataframe(spark_session) 50 | 51 | 52 | def test_read_dataframe(setup, spark_input_df): 53 | assert (spark_input_df.columns == ['series_id', 'dim_id', 'ds', 'y']) 54 | assert (spark_input_df.select("series_id").distinct().count() == 1) 55 | assert (spark_input_df.select("dim_id").distinct().count() == 2) 56 | assert (spark_input_df.count() == 816) 57 | 58 | 59 | @pytest.mark.dependency() 60 | def test_model_time_series(setup, spark_input_df): 61 | output_df = spark_input_df \ 62 | .groupby('series_id', 'dim_id') \ 63 | .apply(model_time_series(modeler.config)) 64 | 65 | assert (output_df.count() == 2) 66 | assert (output_df.columns == ['series_id', 'dim_id', 'floor', 'cap', 'model']) 67 | assert (output_df.filter('series_id = 751 and dim_id = 91').count() == 1) 68 | assert (output_df.filter('series_id = 751 and dim_id = 155').count() == 1) 69 | 70 | modeler.persist_models(output_df) 71 | 72 | model_df = spark.read.parquet('./build/models') 73 | 74 | assert (model_df.count() == 2) 75 | assert (model_df.columns == ['series_id', 'dim_id', 'floor', 'cap', 'model']) 76 | -------------------------------------------------------------------------------- /tests/unit/prophet_scorer_test.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from datetime import datetime 4 | import pytest 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import * 7 | 8 | from jobs.prophet_scorer import ProphetScorer, forecast_time_series 9 | 10 | 11 | def suppress_py4j_logging(): 12 | logger = logging.getLogger('py4j') 13 | logger.setLevel(logging.WARN) 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def spark_session(request): 18 | """ fixture for creating a spark session 19 | Args: 20 | request: pytest.FixtureRequest object 21 | """ 22 | global spark 23 | spark = SparkSession.builder.master("local[*]").appName("TestScorer").getOrCreate() 24 | 
request.addfinalizer(lambda: spark.stop()) 25 | 26 | suppress_py4j_logging() 27 | return spark 28 | 29 | 30 | @pytest.fixture() 31 | def config(): 32 | config = { 33 | 'io': { 34 | 'models': 'build/models', 35 | 'forecasts': 'build/forecasts', 36 | }, 37 | 'forecast': { 38 | 'periods': 40, 39 | 'frequency': '15min' 40 | }} 41 | return config 42 | 43 | 44 | @pytest.fixture() 45 | def setup(config): 46 | global scorer 47 | scorer = ProphetScorer(config) 48 | 49 | 50 | @pytest.fixture() 51 | def spark_model_df(spark_session): 52 | return scorer.read_model_dataframe(spark_session) 53 | 54 | 55 | @pytest.fixture() 56 | def spark_forecast_df(spark_session): 57 | schema = StructType([StructField("series_id", IntegerType()), 58 | StructField("dim_id", IntegerType()), 59 | StructField("ds", TimestampType()), 60 | StructField("yhat", IntegerType()) 61 | ]) 62 | 63 | test_list = [ 64 | (101, 66, datetime.strptime('2015-07-05 10:15:00', '%Y-%m-%d %H:%M:%S'), 873242) 65 | ] 66 | 67 | return spark_session.createDataFrame(data=test_list, schema=schema) 68 | 69 | 70 | def test_convert_forecasts(setup, spark_forecast_df): 71 | output_df = scorer.convert_forecasts(spark_forecast_df) 72 | 73 | timestamp_regex = re.compile(r'^([0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T' 74 | r'(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\+00:00)$') 75 | assert (timestamp_regex.match(output_df.collect()[0][0])) 76 | assert (output_df.collect()[0][1] == 101) 77 | assert (output_df.collect()[0][2] == 66) 78 | assert (output_df.collect()[0][3] == '2015-07-05') 79 | assert (output_df.collect()[0][4] == datetime(2015, 7, 5, 10, 15)) 80 | assert (output_df.collect()[0][5] == 873242) 81 | 82 | 83 | def test_read_model_dataframe(setup, spark_model_df): 84 | assert (spark_model_df.columns == ['series_id', 'dim_id', 'floor', 'cap', 85 | 'model']) 86 | assert (spark_model_df.select("series_id").distinct().count() == 1) 87 | assert (spark_model_df.select("dim_id").distinct().count() == 2) 88 | assert (spark_model_df.count() == 2) 89 | 90 | 91 | @pytest.mark.dependency(depends=["test_model_time_series"]) 92 | def test_forecast_time_series(setup, spark_model_df): 93 | output_df = spark_model_df \ 94 | .groupby('series_id', 'dim_id') \ 95 | .apply(forecast_time_series(scorer.config)) 96 | 97 | assert (output_df.count() == 80) 98 | assert (output_df.columns == ['series_id', 'dim_id', 'ds', 'yhat']) 99 | assert (output_df.filter('series_id = 751 and dim_id = 91').count() == 40) 100 | assert (output_df.filter('series_id = 751 and dim_id = 155').count() == 40) 101 | 102 | converted_df = scorer.convert_forecasts(output_df) 103 | 104 | scorer.write_forecasts(converted_df) 105 | 106 | read_output_df = spark.read.csv('./build/forecasts', header=True) 107 | 108 | assert (read_output_df.columns == ['created_timestamp', 109 | 'series_id', 110 | 'dim_id', 111 | 'forecast_date', 112 | 'forecast_timestamp', 113 | 'forecast_quantity']) 114 | assert (read_output_df.count() == 80) 115 | --------------------------------------------------------------------------------