├── .gitignore
├── Makefile
├── README.md
├── config
│   ├── example_modeler_app_config.yaml
│   └── example_scorer_app_config.yaml
├── environment.yml
├── src
│   ├── jobs
│   │   ├── __init__.py
│   │   ├── prophet_modeler.py
│   │   └── prophet_scorer.py
│   ├── modeler_spark_driver.py
│   └── scorer_spark_driver.py
└── tests
    ├── fixtures
    │   └── model-input
    │       └── series_id=751
    │           └── sample-model-input.csv
    └── unit
        ├── prophet_modeler_test.py
        └── prophet_scorer_test.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
.pytest_cache/

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# DotEnv configuration
.env

# Database
*.db
*.rdb

# Pycharm/IntelliJ
*.iml
.idea

# VS Code
.vscode/

# Spyder
.spyproject/

# Jupyter NB Checkpoints
.ipynb_checkpoints/

# Mac OS-specific storage files
.DS_Store

# Gradle
.gradle

# Spark
spark-warehouse

# exclude dirs from source control by default
/data/
/output/
/models/
/out/
/target/
/build

# exclude Excel temp files
~$*.xlsx

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

clean: ## Clean build dir
	rm -rf ./build
	rm -rf ./.pytest_cache
	find . -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete

build: clean ## Build project
	mkdir -p ./build/dist
	cp ./src/*spark_driver.py ./build/dist
	cd ./src && zip -x *spark_driver.py -x \*__pycache__\* -r ../build/dist/app.zip .

test: build ## Run tests
	pytest tests/unit

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Time Series Forecasting with FB Prophet and Apache Spark
========================================================

# Use Case
If you have a time series you would like to forecast, [Facebook's Prophet](https://facebook.github.io/prophet/docs/quick_start.html#python-api)
library is fantastic. It robustly handles seasonality, missing data, and trends, and it
trains and scores quickly.

But what if you have a large number of different time series you need to forecast?
With the help of [Apache Spark](https://spark.apache.org/) for large-scale analytics processing,
you can train and predict many time series at once and scale the processing
horizontally by resizing the Spark cluster.

[PySpark](https://spark.apache.org/docs/latest/api/python/index.html) is needed in order to use [Pandas user defined functions (UDFs)](https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs),
which convert between Spark dataframes and Pandas dataframes
with the help of [Apache Arrow](https://arrow.apache.org/).

If the historical time series are segmented by different dimensions, grouping
by those dimensions allows a separate time series model to be trained per group.

# Example data
In `tests/fixtures/model-input`, the example data set has two non-temporal
dimensions. `series_id` happens to be a partition column, and within the CSV
`dim_id` is another dimension. The timestamp and quantity correspond to the
time and y values for the time series. The schema for the CSV is defined in
`src/jobs/prophet_modeler.py`.

# Modeling
When Spark reads in the input data and a grouping clause is
applied on the non-temporal dimensions, each group's time series is
handed to the defined UDF as a Pandas dataframe. Using FB Prophet,
a model is trained on the historical data, and the model itself is pickled
and returned in a new dataframe. Spark collects all of the models per set
of dimensions and persists them for scoring.
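
At the heart of both jobs is a grouped-map Pandas UDF. The following is a minimal,
illustrative sketch of the modeling side (the two-column output schema and the
`input_df` with `series_id`/`ds`/`y` columns are assumptions for the example, not
the project's exact code; see `src/jobs/prophet_modeler.py` for the real version):

<pre>
import pickle

import pandas as pd
from fbprophet import Prophet
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import BinaryType, IntegerType, StructField, StructType

# The output schema of a grouped-map UDF must be declared up front
schema = StructType([StructField("series_id", IntegerType()),
                     StructField("model", BinaryType())])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def train_udf(pdf):
    # pdf holds one group's rows as a plain Pandas dataframe
    model = Prophet().fit(pdf[["ds", "y"]])
    return pd.DataFrame({"series_id": [int(pdf.iloc[0]["series_id"])],
                         "model": [pickle.dumps(model)]})

# input_df: a Spark dataframe with columns series_id, ds, y (assumed)
# One trained, pickled model per distinct series_id
model_df = input_df.groupby("series_id").apply(train_udf)
</pre>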

# Scoring
The models are read back in as a Spark dataframe. As in modeling, a Pandas UDF
is used to take each trained model and create forecast predictions based
on the given configuration. These are returned and collected in a Spark
dataframe with forecast predictions per set of dimensions.

# Developer Notes
This project uses conda and PySpark, so some extra configuration is needed.

## Conda environment
The `environment.yml` file defines what is used for the Python code. To use
this in a Spark cluster, you should bootstrap the conda installation onto each
worker node or use a machine image with the conda environment preconfigured.

## Spark
Pandas UDFs require binary serialization between Spark and Python, so Spark 2.4+
is required along with pyarrow > 0.10.

### PYSPARK_PYTHON
This environment variable should point at the Python executable of the conda env:
`export PYSPARK_PYTHON=/path/to/conda-env/bin/python`

### PYTHONPATH
Be sure to define the env var PYTHONPATH to include py4j and pyspark:
`export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-xxx-src.zip:$PYTHONPATH`

Also add the `src` directory of the project to PYTHONPATH in order to run the unit tests.
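
Putting the pieces together, a typical local setup might look like this (the
paths and the py4j version are placeholders to adjust for your machine):

<pre>
conda env create -f environment.yml
conda activate time-series-spark
export SPARK_HOME=/path/to/spark
export PYSPARK_PYTHON=$(which python)
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-xxx-src.zip:$(pwd)/src:$PYTHONPATH
</pre>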

## Unit testing
To run the tests, set PYTHONPATH as above, activate the conda environment, and run
`make test`.

## Build
To build the archive, `make build` will create app.zip containing the Python modules in the
`jobs` subdirectory. The Spark driver for PySpark has to be in a separate file outside the zip archive
(see `src/modeler_spark_driver.py` and `src/scorer_spark_driver.py`).

The app.zip and the drivers end up in `build/dist/`.

## App Config
The app functionality has been split into `prophet_modeler`, which trains and persists the Prophet models, and
`prophet_scorer`, which makes forecasts from the models.

There needs to be a minimum number of observations in a time series in order for the modeler to work properly.

The modeler application configuration is a YAML file (`config/*_modeler_app_config.yaml`) that needs to specify the following:

<pre>
io:
  input: [input data location]
  models: [model output location]
model:
  floor: [min value for the forecast values]
  cap_multiplier: [multiplier over the max prior values for the logistic Prophet model]
</pre>

The scorer application configuration is a YAML file (`config/*_scorer_app_config.yaml`) that needs to specify the following:

<pre>
io:
  models: [model location]
  forecasts: [output location for forecasts in CSV]
forecast:
  periods: [number of periods to forecast]
  frequency: [frequency of forecasts (use 15min for quarter hour, for example)]
</pre>
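
For example, after `make build`, both jobs can be run end to end in local mode
(an illustrative invocation; adjust the master and paths as needed):

<pre>
spark-submit --master "local[*]" --py-files build/dist/app.zip \
  build/dist/modeler_spark_driver.py config/example_modeler_app_config.yaml

spark-submit --master "local[*]" --py-files build/dist/app.zip \
  build/dist/scorer_spark_driver.py config/example_scorer_app_config.yaml
</pre>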

## Spark Cluster Config
The Spark cluster should not be configured for max resource allocation; the
executor settings are tuned manually instead (see Spark Submit Config below). Only the
YARN virtual memory check should be disabled, with the following config:
<pre>
{
  "Classification": "yarn-site",
  "Properties":
  {
    "yarn.nodemanager.vmem-check-enabled": "false"
  }
}
</pre>
This needs to be disabled because YARN may otherwise decide that a container exceeds its
virtual memory limit and kill the container.
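
The JSON above is in the classification format used by Amazon EMR. Assuming EMR is
the YARN cluster in use (an assumption; other YARN distributions set this property in
`yarn-site.xml` instead), it can be supplied at cluster creation, for example (file
name illustrative):

<pre>
aws emr create-cluster ... --configurations file://yarn-config.json
</pre>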

## Spark Submit Config
Since most of the processing is done by the Prophet modeling itself, the max resource
allocation and dynamic allocation that would typically be used for a
Spark application are not applicable here. The Spark app configuration needs to be manually tuned:

* spark.dynamicAllocation.enabled should be set to false since we are not using dynamic allocation
* spark.executor.instances should be set to the number of CORE nodes in the cluster
* spark.executor.cores should be set to the number of vCPUs in a node minus 1 for the driver
* spark.sql.shuffle.partitions should be set to the number of executor cores * the number of executor instances
* spark.speculation should be true to relaunch any task that takes too long compared to the other tasks
* spark.speculation.multiplier set to 2: a task twice as slow as the median is considered for speculation
* spark.speculation.quantile set to 0.90: the fraction of tasks that must be complete before speculation is considered

About 70 to 80% of each node's memory can be used for Spark + Python.

* spark.executor.memory is the heap memory for the Spark app in each executor
* spark.executor.memoryOverhead is the off-heap memory not used by Spark (Python in this case)
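
As a worked example, assume CORE nodes with 16 vCPUs and roughly 30 GB of memory
available to YARN (illustrative numbers, chosen to be consistent with the submit
example below):

<pre>
spark.executor.cores          = 16 - 1 = 15   (leave one vCPU for the driver)
spark.executor.memory         = 8g            (Spark JVM heap)
spark.executor.memoryOverhead = 14g           (Python: Prophet/pystan processes)

8g + 14g = 22g per node, i.e. about 73% of the 30 GB, within the 70-80% guideline
</pre>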

Example spark submit:
<pre>
spark-submit \
  --deploy-mode client \
  --master yarn \
  --conf spark.dynamicAllocation.enabled=false \
  --conf spark.sql.shuffle.partitions=2000 \
  --conf spark.executor.instances=30 \
  --conf spark.executor.cores=15 \
  --conf spark.executor.memory=8g \
  --conf spark.executor.memoryOverhead=14g \
  --conf spark.speculation=true \
  --conf spark.speculation.multiplier=2 \
  --conf spark.speculation.quantile=0.90 \
  --py-files /path/to/app.zip \
  /path/to/spark_driver.py \
  /path/to/app_config.yaml
</pre>

--------------------------------------------------------------------------------
/config/example_modeler_app_config.yaml:
--------------------------------------------------------------------------------
io:
  input: "/path/to/model-input-data"
  models: "/path/to/write-models"
model:
  floor: 0
  cap_multiplier: 1.1

--------------------------------------------------------------------------------
/config/example_scorer_app_config.yaml:
--------------------------------------------------------------------------------
io:
  models: "/path/to/read-models"
  forecasts: "/path/to/write-forecasts"
forecast:
  periods: 960 # 10 days
  frequency: 15min

--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
name: time-series-spark

dependencies:
  - python=3.7.5
  - pytest=5.3.1
  - pandas=0.25.3
  - pyyaml=5.1.2
  - pyarrow=0.13.0
  - pyspark=2.4.4
  - pip=19.3.1
  - pip:
      - pystan==2.19.1.1
      - fbprophet==0.5

--------------------------------------------------------------------------------
/src/jobs/__init__.py:
--------------------------------------------------------------------------------
(empty file)

--------------------------------------------------------------------------------
/src/jobs/prophet_modeler.py:
--------------------------------------------------------------------------------
import logging
import pickle
import time

import pandas as pd
from fbprophet import Prophet
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import BinaryType, FloatType, StructType, StructField, IntegerType, TimestampType

# Modify the input schema as needed if using CSV
MODEL_INPUT_SCHEMA = StructType([
    StructField("series_id", IntegerType(), True),
    StructField("dim_id", IntegerType(), True),
    StructField("start_time", TimestampType(), True),
    StructField("quantity", IntegerType(), True)
])
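
# Note (illustrative): rows in the sample fixture CSV carry only three values, e.g.
#   91,2001-01-05 11:15:00,36445  ->  dim_id, start_time, quantity
# series_id does not appear in the file itself; Spark derives it from the
# partition directory name (tests/fixtures/model-input/series_id=751/).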


# Use function currying to set the parameters.
# Print statements are needed to log since Python can't interface to the Spark logger easily.
def model_time_series(config):
    """
    Model a time series per dimensions (series_id, dim_id).
    Be sure there is sufficient data to model each time series, otherwise the model may not converge.

    :param config:
    :return:
    """

    # Pandas UDF requires the output pandas dataframe schema to be defined
    output_schema = StructType([
        StructField('series_id', IntegerType(), True),
        StructField('dim_id', IntegerType(), True),
        StructField('floor', FloatType(), True),
        StructField('cap', FloatType(), True),
        StructField('model', BinaryType(), True)
    ])

    @pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
    def model_time_series_udf(pdf):
        """
        User defined function applied to each grouped Spark sub-dataframe after conversion to pandas.
        Input/output are both a pandas.DataFrame. This cannot be an instance method.
        Be sure the order of columns and types match the defined output schema!

        :param pdf: Input pandas dataframe
        :return: Output pandas dataframe
        """
        try:
            execution_time = time.time()
            series_id = int(pdf.iloc[0]['series_id'])
            dim_id = int(pdf.iloc[0]['dim_id'])

            floor = config['model']['floor']
            pdf['floor'] = floor

            cap = pdf['y'].max() * config['model']['cap_multiplier']
            pdf['cap'] = cap

            print(f"Modeling series_id: {series_id}, dim_id: {dim_id}"
                  f" with {len(pdf.index)} modeling rows")

            model = Prophet(growth='logistic', seasonality_mode='multiplicative')
            model.fit(pdf)

            # Save the trained model by pickling it into the output dataframe
            data = {'series_id': [series_id],
                    'dim_id': [dim_id],
                    'floor': [floor],
                    'cap': [cap],
                    'model': [pickle.dumps(model)]}

            output_df = pd.DataFrame(data)
            print(f"Output df series_id: {series_id}, dim_id: {dim_id}"
                  f" trained in {time.time() - execution_time:.1f}s")

            return output_df

        # Prophet raises ValueError when a series has too few non-NaN rows to fit
        except (RuntimeError, ValueError) as err:
            print(f"Error {err} for series_id: {series_id}, "
                  f"dim_id: {dim_id}")
            return pd.DataFrame(
                columns=['series_id', 'dim_id', 'floor', 'cap', 'model'])

    return model_time_series_udf


class ProphetModeler:
    """
    Create models to forecast quantities using the Facebook Prophet model.

    Each time series has its own model. The collection of trained models is stored
    in a Spark dataframe for easy export/import using Spark.
    """

    def __init__(self, config, logger=None):
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        self.config = config

    def read_input_dataframe(self, spark: SparkSession):
        """
        Reads the modeling input data.
        :param spark: spark session
        :return: dataframe with input data
        """

        # Prophet expects the time column to be named 'ds' and the value column 'y'
        input_df = spark \
            .read \
            .csv(self.config['io']['input'], schema=MODEL_INPUT_SCHEMA) \
            .select('series_id', 'dim_id', 'start_time', 'quantity') \
            .withColumnRenamed("start_time", "ds") \
            .withColumnRenamed("quantity", "y")

        return input_df

    def persist_models(self, model_df: DataFrame):
        """
        Persist the models from the Spark dataframe as Parquet for easy export.
        :param model_df: Spark dataframe with trained time series models
        """
        model_df \
            .write \
            .parquet(self.config['io']['models'], mode='overwrite')

    @staticmethod
    def model(spark_session, config):
        """
        Create the trained time series models.
        :param spark_session:
        :param config: Dict of config
        """
        spark_session.conf.set("spark.sql.execution.arrow.enabled",
                               "true")  # needed to convert to/from Pandas dataframes
        modeler = ProphetModeler(config)
        input_df = modeler.read_input_dataframe(spark_session)

        model_df = input_df \
            .groupby('series_id', 'dim_id') \
            .apply(model_time_series(modeler.config))

        modeler.persist_models(model_df)

--------------------------------------------------------------------------------
/src/jobs/prophet_scorer.py:
--------------------------------------------------------------------------------
import logging
import pickle
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit, pandas_udf, PandasUDFType, udf
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType


# UDFs need to be standalone functions.

# Use function currying to set the parameters.
# Logging within Python doesn't work on Spark executors, so print statements are used;
# they show up in stderr on the individual workers.

def forecast_time_series(config):
    """
    Forecast using the trained time series model per dimensions (series_id, dim_id).

    :param config:
    :return:
    """

    # Pandas UDF requires the output pandas dataframe schema to be defined
    output_schema = StructType([
        StructField('series_id', IntegerType(), True),
        StructField('dim_id', IntegerType(), True),
        StructField('ds', TimestampType(), True),
        StructField('yhat', IntegerType(), True)
    ])

    @pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
    def forecast_time_series_udf(pdf):
        """
        User defined function applied to each grouped Spark sub-dataframe after conversion to pandas.
        Input/output are both a pandas.DataFrame. This cannot be an instance method.
        Be sure the order of columns and types match the defined output schema!
        :param pdf: Input pandas dataframe
        :return: Output pandas dataframe
        """
        try:
            series_id = int(pdf.iloc[0]['series_id'])
            dim_id = int(pdf.iloc[0]['dim_id'])
            floor = float(pdf.iloc[0]['floor'])
            cap = float(pdf.iloc[0]['cap'])
            model = pickle.loads(pdf.iloc[0]['model'])

            # If the model is missing, return an empty dataframe for the forecast
            if model is None:
                print(f"For series_id: {series_id}, "
                      f"dim_id: {dim_id},"
                      f" no model found")
                return pd.DataFrame(columns=['series_id', 'dim_id', 'ds', 'yhat'])

            frequency = config['forecast']['frequency']

            # We want the weeks to be offset on the appropriate weekday;
            # using 'W' for pandas date_range would anchor them to Sundays
            if frequency == 'W':
                frequency = pd.offsets.Week()
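
            # For example (illustrative), starting from Wednesday 2015-07-01:
            #   pd.date_range('2015-07-01', periods=2, freq='W')
            #       -> 2015-07-05, 2015-07-12 (snapped to Sundays)
            #   pd.date_range('2015-07-01', periods=2, freq=pd.offsets.Week())
            #       -> 2015-07-01, 2015-07-08 (stays on Wednesdays)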

            future_df = model.make_future_dataframe(periods=config['forecast']['periods'],
                                                    freq=frequency,
                                                    include_history=False)
            future_df['floor'] = floor
            future_df['cap'] = cap

            forecast_df = model.predict(future_df)

            # Be sure to cast yhat to integer since yhat is a float
            forecast_df = forecast_df.astype({"yhat": int})

            # Log below-floor values, if any
            negatives = np.where(forecast_df["yhat"] < floor)
            if len(negatives[0]) > 0:
                print(f"Below-floor forecast values found for series_id: {series_id}, "
                      f"dim_id: {dim_id}")

            # Clip forecast values that are less than the floor
            forecast_df["yhat"] = np.where(forecast_df["yhat"] < floor,
                                           floor,
                                           forecast_df["yhat"])

            # Copy so the added columns don't mutate a slice of forecast_df
            output_df = forecast_df[['ds', 'yhat']].copy()
            output_df['series_id'] = series_id
            output_df['dim_id'] = dim_id

            output_df = output_df[['series_id', 'dim_id', 'ds', 'yhat']]
            print(f"series_id: {series_id}; dim_id: {dim_id}; "
                  f"floor: {floor}; cap: {cap}; "
                  f"future min: {future_df['ds'].min()}; future max: {future_df['ds'].max()}; "
                  f"forecast min: {forecast_df['ds'].min()}; forecast max: {forecast_df['ds'].max()}; "
                  f"output min: {output_df['ds'].min()}; output max: {output_df['ds'].max()};")

            return output_df

        except RuntimeError as err:
            print(f"Runtime error {err} for series_id: {series_id}, "
                  f"dim_id: {dim_id}")
            return pd.DataFrame(columns=['series_id', 'dim_id', 'ds', 'yhat'])

    return forecast_time_series_udf


def extract_date(datetimestamp: datetime):
    return datetimestamp.date().strftime("%Y-%m-%d")


extract_date_udf = udf(extract_date)


class ProphetScorer:
    """
    Forecast quantities using the trained Facebook Prophet models.
    """

    def __init__(self, config, logger=None):
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        self.config = config

    def read_model_dataframe(self, spark: SparkSession):
        model_df = spark \
            .read \
            .parquet(self.config['io']['models'])

        return model_df

    @staticmethod
    def convert_forecasts(forecast_df: DataFrame):
        created_timestamp = datetime.now(timezone.utc) \
            .replace(microsecond=0) \
            .isoformat()

        return forecast_df \
            .withColumn("created_timestamp", lit(created_timestamp)) \
            .select("created_timestamp",
                    "series_id",
                    "dim_id",
                    extract_date_udf("ds").alias("forecast_date"),  # include a date without the time
                    "ds",
                    "yhat") \
            .withColumnRenamed("ds", "forecast_timestamp") \
            .withColumnRenamed("yhat", "forecast_quantity")
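
    # Illustrative converted row, borrowing values from tests/unit/prophet_scorer_test.py
    # (created_timestamp is simply whatever UTC time the job ran):
    #   created_timestamp         series_id dim_id forecast_date forecast_timestamp  forecast_quantity
    #   2020-01-01T00:00:00+00:00 101       66     2015-07-05    2015-07-05 10:15:00 873242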
117 | """ 118 | 119 | def __init__(self, config, logger=None): 120 | self.logger = logger or logging.getLogger(self.__class__.__name__) 121 | self.config = config 122 | 123 | def read_model_dataframe(self, spark: SparkSession): 124 | model_df = spark \ 125 | .read \ 126 | .parquet(self.config['io']['models']) 127 | 128 | return model_df 129 | 130 | @staticmethod 131 | def convert_forecasts(forecast_df: DataFrame): 132 | created_timestamp = datetime.now(timezone.utc) \ 133 | .replace(microsecond=0) \ 134 | .isoformat() 135 | 136 | return forecast_df \ 137 | .withColumn("created_timestamp", lit(created_timestamp)) \ 138 | .select("created_timestamp", 139 | "series_id", 140 | "dim_id", 141 | extract_date_udf("ds").alias("forecast_date"), # include a date without time 142 | "ds", 143 | "yhat") \ 144 | .withColumnRenamed("ds", "forecast_timestamp") \ 145 | .withColumnRenamed("yhat", "forecast_quantity") 146 | 147 | def write_forecasts(self, output_df: DataFrame): 148 | output_df \ 149 | .write \ 150 | .csv(self.config['io']['forecasts'], mode='overwrite', header=True) 151 | 152 | @staticmethod 153 | def score(spark_session, config): 154 | spark_session.conf.set("spark.sql.execution.arrow.enabled", 155 | "true") # needed to convert to/from Pandas dataframe 156 | scorer = ProphetScorer(config) 157 | model_df = scorer.read_model_dataframe(spark_session) 158 | 159 | forecast_df = model_df \ 160 | .groupby('series_id', 'dim_id') \ 161 | .apply(forecast_time_series(scorer.config)) 162 | 163 | converted_df = scorer.convert_forecasts(forecast_df) 164 | 165 | scorer.write_forecasts(converted_df) 166 | -------------------------------------------------------------------------------- /src/modeler_spark_driver.py: -------------------------------------------------------------------------------- 1 | from jobs.prophet_modeler import ProphetModeler 2 | from pyspark.sql import SparkSession 3 | import sys 4 | import yaml 5 | 6 | if __name__ == '__main__': 7 | spark_session = SparkSession.builder.appName('TimeSeriesForecastModeler').getOrCreate() 8 | 9 | if len(sys.argv) != 2: 10 | print("arg1 must be the config YAML") 11 | exit(1) 12 | 13 | with open(sys.argv[1]) as file: 14 | config = yaml.safe_load(file) 15 | 16 | print(f"config: {config}") 17 | 18 | ProphetModeler.model(spark_session, config) 19 | 20 | print("closing spark session") 21 | spark_session.stop() 22 | 23 | -------------------------------------------------------------------------------- /src/scorer_spark_driver.py: -------------------------------------------------------------------------------- 1 | from jobs.prophet_scorer import ProphetScorer 2 | from pyspark.sql import SparkSession 3 | import sys 4 | import yaml 5 | 6 | if __name__ == '__main__': 7 | spark_session = SparkSession.builder.appName('TimeSeriesForecastScorer').getOrCreate() 8 | 9 | if len(sys.argv) != 2: 10 | print("arg1 must be the config YAML") 11 | exit(1) 12 | 13 | with open(sys.argv[1]) as file: 14 | config = yaml.safe_load(file) 15 | 16 | print(f"config: {config}") 17 | 18 | ProphetScorer.score(spark_session, config) 19 | 20 | print("closing spark session") 21 | spark_session.stop() 22 | -------------------------------------------------------------------------------- /tests/fixtures/model-input/series_id=751/sample-model-input.csv: -------------------------------------------------------------------------------- 1 | 91,2001-01-05 11:15:00,36445 2 | 91,2001-01-05 21:45:00,45724 3 | 91,2001-01-07 11:15:00,40891 4 | 91,2001-01-07 21:45:00,38598 5 | 91,2001-01-12 11:15:00,21823 6 | 
91,2001-01-12 21:45:00,66162 7 | 91,2001-01-14 11:15:00,22251 8 | 91,2001-01-14 21:45:00,27663 9 | 91,2001-01-19 11:15:00,41971 10 | 91,2001-01-19 21:45:00,65254 11 | 91,2001-01-21 11:15:00,43467 12 | 91,2001-01-21 21:45:00,78034 13 | 91,2001-01-26 11:15:00,48042 14 | 91,2001-01-26 21:45:00,51967 15 | 91,2001-02-02 11:15:00,20871 16 | 91,2001-02-02 21:45:00,57492 17 | 91,2001-02-04 11:15:00,23932 18 | 91,2001-02-04 21:45:00,69640 19 | 91,2001-02-09 11:15:00,40401 20 | 91,2001-02-09 21:45:00,74112 21 | 91,2001-02-11 11:15:00,14379 22 | 91,2001-02-11 21:45:00,24779 23 | 91,2001-02-16 11:15:00,27079 24 | 91,2001-02-16 21:45:00,89082 25 | 91,2001-02-18 11:15:00,19567 26 | 91,2001-02-18 21:45:00,53830 27 | 91,2001-02-23 11:15:00,73004 28 | 91,2001-02-23 21:45:00,94174 29 | 91,2001-02-25 11:15:00,4597 30 | 91,2001-02-25 21:45:00,37117 31 | 91,2001-03-01 11:15:00,38493 32 | 91,2001-03-01 21:45:00,88510 33 | 91,2001-03-03 11:15:00,21451 34 | 91,2001-03-03 21:45:00,23728 35 | 91,2001-03-08 11:15:00,45463 36 | 91,2001-03-08 21:45:00,46318 37 | 91,2001-03-10 11:15:00,28366 38 | 91,2001-03-10 21:45:00,52082 39 | 91,2001-03-15 11:15:00,41617 40 | 91,2001-03-15 21:45:00,41544 41 | 91,2001-03-17 11:15:00,35316 42 | 91,2001-03-17 21:45:00,38839 43 | 91,2001-03-22 11:15:00,36006 44 | 91,2001-03-22 21:45:00,53785 45 | 91,2001-03-24 11:15:00,35939 46 | 91,2001-03-24 21:45:00,46446 47 | 91,2001-03-29 11:15:00,24116 48 | 91,2001-03-29 21:45:00,60230 49 | 91,2001-03-31 11:15:00,22322 50 | 91,2001-03-31 21:45:00,67151 51 | 91,2001-04-05 11:15:00,50135 52 | 91,2001-04-05 21:45:00,70147 53 | 91,2001-04-07 11:15:00,25075 54 | 91,2001-04-07 21:45:00,57819 55 | 91,2001-04-12 11:15:00,37735 56 | 91,2001-04-12 21:45:00,41320 57 | 91,2001-04-14 11:15:00,24265 58 | 91,2001-04-14 21:45:00,69948 59 | 91,2001-04-19 11:15:00,24272 60 | 91,2001-04-19 21:45:00,46014 61 | 91,2001-04-21 11:15:00,8831 62 | 91,2001-04-21 21:45:00,50994 63 | 91,2001-04-26 11:15:00,24417 64 | 91,2001-04-26 21:45:00,43985 65 | 91,2001-04-28 11:15:00,21357 66 | 91,2001-04-28 21:45:00,40104 67 | 91,2001-05-03 11:15:00,21921 68 | 91,2001-05-03 21:45:00,48765 69 | 91,2001-05-05 11:15:00,39914 70 | 91,2001-05-05 21:45:00,47584 71 | 91,2001-05-10 11:15:00,21628 72 | 91,2001-05-10 21:45:00,40713 73 | 91,2001-05-12 11:15:00,36413 74 | 91,2001-05-12 21:45:00,48834 75 | 91,2001-05-17 11:15:00,22638 76 | 91,2001-05-17 21:45:00,35891 77 | 91,2001-05-19 11:15:00,25321 78 | 91,2001-05-19 21:45:00,37890 79 | 91,2001-05-24 11:15:00,45577 80 | 91,2001-05-24 21:45:00,36964 81 | 91,2001-05-26 11:15:00,25931 82 | 91,2001-05-26 21:45:00,16763 83 | 91,2001-05-31 11:15:00,26824 84 | 91,2001-05-31 21:45:00,33766 85 | 91,2001-06-02 11:15:00,44606 86 | 91,2001-06-02 21:45:00,32453 87 | 91,2001-06-07 11:15:00,20121 88 | 91,2001-06-07 21:45:00,57270 89 | 91,2001-06-09 11:15:00,23307 90 | 91,2001-06-09 21:45:00,26673 91 | 91,2001-06-14 11:15:00,28635 92 | 91,2001-06-14 21:45:00,39151 93 | 91,2001-06-16 11:15:00,19480 94 | 91,2001-06-16 21:45:00,31806 95 | 91,2001-06-21 11:15:00,25035 96 | 91,2001-06-21 21:45:00,46247 97 | 91,2001-06-23 11:15:00,23578 98 | 91,2001-06-23 21:45:00,26755 99 | 91,2001-06-28 11:15:00,21714 100 | 91,2001-06-28 21:45:00,47806 101 | 91,2001-06-30 11:15:00,34080 102 | 91,2001-06-30 21:45:00,39258 103 | 91,2001-07-05 11:15:00,18045 104 | 91,2001-07-05 21:45:00,62693 105 | 91,2001-07-07 11:15:00,22567 106 | 91,2001-07-07 21:45:00,34251 107 | 91,2001-07-12 11:15:00,21305 108 | 91,2001-07-12 21:45:00,35910 109 | 91,2001-07-14 11:15:00,25308 110 | 
91,2001-07-14 21:45:00,18350 111 | 91,2001-07-19 11:15:00,36281 112 | 91,2001-07-19 21:45:00,31472 113 | 91,2001-07-21 11:15:00,43686 114 | 91,2001-07-21 21:45:00,28472 115 | 91,2001-07-26 11:15:00,36646 116 | 91,2001-07-26 21:45:00,40556 117 | 91,2001-07-28 11:15:00,28966 118 | 91,2001-07-28 21:45:00,34096 119 | 91,2001-08-02 11:15:00,40002 120 | 91,2001-08-02 21:45:00,26874 121 | 91,2001-08-04 11:15:00,24951 122 | 91,2001-08-04 21:45:00,20656 123 | 91,2001-08-09 11:15:00,10173 124 | 91,2001-08-09 21:45:00,35239 125 | 91,2001-08-11 11:15:00,34420 126 | 91,2001-08-11 21:45:00,8168 127 | 91,2001-08-16 11:15:00,27422 128 | 91,2001-08-16 21:45:00,30065 129 | 91,2001-08-18 11:15:00,11033 130 | 91,2001-08-18 21:45:00,27083 131 | 91,2001-08-23 11:15:00,21043 132 | 91,2001-08-23 21:45:00,21620 133 | 91,2001-08-25 11:15:00,15618 134 | 91,2001-08-25 21:45:00,20564 135 | 91,2001-08-30 11:15:00,20128 136 | 91,2001-08-30 21:45:00,41981 137 | 91,2001-09-01 11:15:00,31125 138 | 91,2001-09-01 21:45:00,27130 139 | 91,2001-09-06 11:15:00,18949 140 | 91,2001-09-06 21:45:00,43783 141 | 91,2001-09-08 11:15:00,24134 142 | 91,2001-09-08 21:45:00,36398 143 | 91,2001-09-13 11:15:00,30022 144 | 91,2001-09-13 21:45:00,43283 145 | 91,2001-09-15 11:15:00,30295 146 | 91,2001-09-15 21:45:00,31854 147 | 91,2001-09-20 11:15:00,20853 148 | 91,2001-09-20 21:45:00,19783 149 | 91,2001-09-22 11:15:00,24796 150 | 91,2001-09-22 21:45:00,21741 151 | 91,2001-09-27 11:15:00,24632 152 | 91,2001-09-27 21:45:00,24239 153 | 91,2001-09-29 11:15:00,30279 154 | 91,2001-09-29 21:45:00,27009 155 | 91,2001-10-04 11:15:00,36132 156 | 91,2001-10-04 21:45:00,39291 157 | 91,2001-10-06 11:15:00,15676 158 | 91,2001-10-06 21:45:00,24646 159 | 91,2001-10-11 11:15:00,17277 160 | 91,2001-10-11 21:45:00,25673 161 | 91,2001-10-13 11:15:00,9854 162 | 91,2001-10-13 21:45:00,30027 163 | 91,2001-10-18 11:15:00,33085 164 | 91,2001-10-18 21:45:00,39714 165 | 91,2001-10-20 11:15:00,5896 166 | 91,2001-10-20 21:45:00,19681 167 | 91,2001-10-25 11:15:00,11271 168 | 91,2001-10-25 21:45:00,23135 169 | 91,2001-10-27 11:15:00,12205 170 | 91,2001-10-27 21:45:00,14825 171 | 91,2001-11-01 11:15:00,37000 172 | 91,2001-11-01 21:45:00,44628 173 | 91,2001-11-03 11:15:00,8906 174 | 91,2001-11-03 21:45:00,25971 175 | 91,2001-11-08 11:15:00,23585 176 | 91,2001-11-08 21:45:00,12089 177 | 91,2001-11-10 11:15:00,34457 178 | 91,2001-11-10 21:45:00,7422 179 | 91,2001-11-15 11:15:00,41470 180 | 91,2001-11-15 21:45:00,30664 181 | 91,2001-11-17 11:15:00,20418 182 | 91,2001-11-17 21:45:00,26153 183 | 91,2001-11-22 11:15:00,24652 184 | 91,2001-11-22 21:45:00,43923 185 | 91,2001-11-24 11:15:00,1594 186 | 91,2001-11-24 21:45:00,24324 187 | 91,2001-11-29 11:15:00,24340 188 | 91,2001-11-29 21:45:00,35080 189 | 91,2001-12-01 11:15:00,12223 190 | 91,2001-12-01 21:45:00,25693 191 | 91,2001-12-06 11:15:00,37306 192 | 91,2001-12-06 21:45:00,43944 193 | 91,2001-12-08 11:15:00,9804 194 | 91,2001-12-08 21:45:00,18679 195 | 91,2001-12-13 11:15:00,14830 196 | 91,2001-12-13 21:45:00,28011 197 | 91,2001-12-15 11:15:00,4741 198 | 91,2001-12-15 21:45:00,7650 199 | 91,2001-12-20 11:15:00,20580 200 | 91,2001-12-20 21:45:00,26661 201 | 91,2001-12-22 11:15:00,27715 202 | 91,2001-12-22 21:45:00,25029 203 | 91,2001-12-27 11:15:00,11975 204 | 91,2001-12-27 21:45:00,37147 205 | 91,2001-12-29 11:15:00,23825 206 | 91,2001-12-29 21:45:00,19599 207 | 91,2002-01-03 11:15:00,28622 208 | 91,2002-01-03 21:45:00,32035 209 | 91,2002-01-05 11:15:00,14997 210 | 91,2002-01-05 21:45:00,42943 211 | 91,2002-01-10 11:15:00,40949 
212 | 91,2002-01-10 21:45:00,36729 213 | 91,2002-01-12 11:15:00,5436 214 | 91,2002-01-12 21:45:00,22979 215 | 91,2002-01-17 11:15:00,15643 216 | 91,2002-01-17 21:45:00,14861 217 | 91,2002-01-19 11:15:00,12344 218 | 91,2002-01-19 21:45:00,15079 219 | 91,2002-01-24 11:15:00,18217 220 | 91,2002-01-24 21:45:00,23966 221 | 91,2002-01-26 11:15:00,19930 222 | 91,2002-01-26 21:45:00,31981 223 | 91,2002-01-31 11:15:00,34765 224 | 91,2002-01-31 21:45:00,53567 225 | 91,2002-02-02 11:15:00,9234 226 | 91,2002-02-02 21:45:00,27676 227 | 91,2002-02-07 11:15:00,38054 228 | 91,2002-02-07 21:45:00,36133 229 | 91,2002-02-14 11:15:00,25139 230 | 91,2002-02-14 21:45:00,27524 231 | 91,2002-02-16 11:15:00,13604 232 | 91,2002-02-16 21:45:00,40562 233 | 91,2002-02-21 11:15:00,5964 234 | 91,2002-02-21 21:45:00,37265 235 | 91,2002-02-23 11:15:00,13688 236 | 91,2002-02-23 21:45:00,45976 237 | 91,2002-02-28 11:15:00,20804 238 | 91,2002-02-28 21:45:00,60194 239 | 91,2002-03-02 11:15:00,27063 240 | 91,2002-03-02 21:45:00,24445 241 | 91,2002-03-07 11:15:00,27321 242 | 91,2002-03-07 21:45:00,35197 243 | 91,2002-03-09 11:15:00,25220 244 | 91,2002-03-09 21:45:00,25808 245 | 91,2002-03-14 11:15:00,26083 246 | 91,2002-03-14 21:45:00,68252 247 | 91,2002-03-16 11:15:00,7489 248 | 91,2002-03-16 21:45:00,4041 249 | 91,2002-03-21 11:15:00,8659 250 | 91,2002-03-21 21:45:00,39718 251 | 91,2002-03-23 11:15:00,12432 252 | 91,2002-03-23 21:45:00,35504 253 | 91,2002-03-28 11:15:00,28580 254 | 91,2002-03-28 21:45:00,30540 255 | 91,2002-03-30 11:15:00,20112 256 | 91,2002-03-30 21:45:00,14447 257 | 91,2002-04-04 11:15:00,18262 258 | 91,2002-04-04 21:45:00,38505 259 | 91,2002-04-06 11:15:00,9802 260 | 91,2002-04-06 21:45:00,17291 261 | 91,2002-04-11 11:15:00,40675 262 | 91,2002-04-11 21:45:00,43640 263 | 91,2002-04-13 11:15:00,24442 264 | 91,2002-04-13 21:45:00,28946 265 | 91,2002-04-18 11:15:00,35476 266 | 91,2002-04-18 21:45:00,22319 267 | 91,2002-04-20 11:15:00,13188 268 | 91,2002-04-20 21:45:00,24329 269 | 91,2002-04-25 11:15:00,31794 270 | 91,2002-04-25 21:45:00,25451 271 | 91,2002-04-27 11:15:00,23765 272 | 91,2002-04-27 21:45:00,11286 273 | 91,2002-05-02 11:15:00,6341 274 | 91,2002-05-02 21:45:00,35603 275 | 91,2002-05-04 11:15:00,25045 276 | 91,2002-05-04 21:45:00,17024 277 | 91,2002-05-09 11:15:00,26785 278 | 91,2002-05-09 21:45:00,24333 279 | 91,2002-05-11 11:15:00,3563 280 | 91,2002-05-11 21:45:00,17584 281 | 91,2002-05-16 11:15:00,13716 282 | 91,2002-05-16 21:45:00,22165 283 | 91,2002-05-18 11:15:00,19687 284 | 91,2002-05-18 21:45:00,6856 285 | 91,2002-05-23 11:15:00,27117 286 | 91,2002-05-23 21:45:00,35752 287 | 91,2002-05-25 11:15:00,26160 288 | 91,2002-05-25 21:45:00,22879 289 | 91,2002-05-30 11:15:00,20514 290 | 91,2002-05-30 21:45:00,16189 291 | 91,2002-06-01 11:15:00,19041 292 | 91,2002-06-01 21:45:00,16696 293 | 91,2002-06-06 11:15:00,25059 294 | 91,2002-06-06 21:45:00,20922 295 | 91,2002-06-08 11:15:00,20665 296 | 91,2002-06-08 21:45:00,17423 297 | 91,2002-06-13 11:15:00,3908 298 | 91,2002-06-13 21:45:00,33827 299 | 91,2002-06-15 11:15:00,9938 300 | 91,2002-06-15 21:45:00,40020 301 | 91,2002-06-20 11:15:00,17388 302 | 91,2002-06-20 21:45:00,14358 303 | 91,2002-06-22 11:15:00,13968 304 | 91,2002-06-22 21:45:00,35079 305 | 91,2002-06-27 11:15:00,38418 306 | 91,2002-06-27 21:45:00,34435 307 | 91,2002-06-29 11:15:00,8002 308 | 91,2002-06-29 21:45:00,15928 309 | 91,2002-07-04 11:15:00,14754 310 | 91,2002-07-04 21:45:00,18912 311 | 91,2002-07-06 11:15:00,13159 312 | 91,2002-07-06 21:45:00,24498 313 | 91,2002-07-11 
11:15:00,23921 314 | 91,2002-07-11 21:45:00,31337 315 | 91,2002-07-13 11:15:00,28059 316 | 91,2002-07-13 21:45:00,18429 317 | 91,2002-07-18 11:15:00,22195 318 | 91,2002-07-18 21:45:00,21239 319 | 91,2002-07-20 11:15:00,15259 320 | 91,2002-07-20 21:45:00,19636 321 | 91,2002-07-25 11:15:00,29279 322 | 91,2002-07-25 21:45:00,36344 323 | 91,2002-07-27 11:15:00,8153 324 | 91,2002-07-27 21:45:00,18183 325 | 91,2002-08-01 11:15:00,22066 326 | 91,2002-08-01 21:45:00,22358 327 | 91,2002-08-03 11:15:00,9754 328 | 91,2002-08-03 21:45:00,20418 329 | 91,2002-08-08 11:15:00,7865 330 | 91,2002-08-08 21:45:00,9133 331 | 91,2002-08-10 11:15:00,23567 332 | 91,2002-08-10 21:45:00,5894 333 | 91,2002-08-15 11:15:00,16018 334 | 91,2002-08-15 21:45:00,7985 335 | 91,2002-08-17 11:15:00,18098 336 | 91,2002-08-17 21:45:00,28770 337 | 91,2002-08-22 11:15:00,11046 338 | 91,2002-08-22 21:45:00,22970 339 | 91,2002-08-24 11:15:00,12181 340 | 91,2002-08-24 21:45:00,20678 341 | 91,2002-08-29 11:15:00,14893 342 | 91,2002-08-29 21:45:00,12246 343 | 91,2002-08-31 11:15:00,5884 344 | 91,2002-08-31 21:45:00,20540 345 | 91,2002-09-05 11:15:00,1846 346 | 91,2002-09-05 21:45:00,21171 347 | 91,2002-09-07 11:15:00,12188 348 | 91,2002-09-07 21:45:00,33484 349 | 91,2002-09-12 11:15:00,2075 350 | 91,2002-09-12 21:45:00,33688 351 | 91,2002-09-14 11:15:00,8313 352 | 91,2002-09-14 21:45:00,11863 353 | 91,2002-09-19 11:15:00,28486 354 | 91,2002-09-19 21:45:00,47206 355 | 91,2002-09-21 11:15:00,5032 356 | 91,2002-09-21 21:45:00,9533 357 | 91,2002-09-26 11:15:00,20036 358 | 91,2002-09-26 21:45:00,54346 359 | 91,2002-10-03 11:15:00,9811 360 | 91,2002-10-03 21:45:00,16758 361 | 91,2002-10-05 11:15:00,9084 362 | 91,2002-10-05 21:45:00,44899 363 | 91,2002-10-10 11:15:00,25392 364 | 91,2002-10-10 21:45:00,27966 365 | 91,2002-10-12 11:15:00,7011 366 | 91,2002-10-12 21:45:00,43894 367 | 91,2002-10-17 11:15:00,11833 368 | 91,2002-10-17 21:45:00,16352 369 | 91,2002-10-19 11:15:00,22622 370 | 91,2002-10-19 21:45:00,13215 371 | 91,2002-10-24 11:15:00,25672 372 | 91,2002-10-24 21:45:00,33190 373 | 91,2002-10-26 11:15:00,4290 374 | 91,2002-10-26 21:45:00,27019 375 | 91,2002-10-31 11:15:00,11876 376 | 91,2002-10-31 21:45:00,18089 377 | 91,2002-11-02 11:15:00,15072 378 | 91,2002-11-02 21:45:00,27975 379 | 91,2002-11-07 11:15:00,11524 380 | 91,2002-11-07 21:45:00,35711 381 | 91,2002-11-09 11:15:00,16570 382 | 91,2002-11-09 21:45:00,16307 383 | 91,2002-11-14 11:15:00,30377 384 | 91,2002-11-14 21:45:00,18593 385 | 91,2002-11-16 11:15:00,6081 386 | 91,2002-11-16 21:45:00,17141 387 | 91,2002-11-21 11:15:00,18077 388 | 91,2002-11-21 21:45:00,34452 389 | 91,2002-11-23 11:15:00,25290 390 | 91,2002-11-23 21:45:00,7860 391 | 91,2002-11-28 11:15:00,21380 392 | 91,2002-11-28 21:45:00,32118 393 | 91,2002-11-30 11:15:00,4522 394 | 91,2002-11-30 21:45:00,25127 395 | 91,2002-12-12 11:15:00,22800 396 | 91,2002-12-12 11:15:00,22800 397 | 91,2002-12-12 21:45:00,20536 398 | 91,2002-12-12 21:45:00,20536 399 | 91,2002-12-14 11:15:00,7854 400 | 91,2002-12-14 11:15:00,7854 401 | 91,2002-12-14 21:45:00,15454 402 | 91,2002-12-14 21:45:00,15454 403 | 91,2002-12-19 11:15:00,18251 404 | 91,2002-12-19 21:45:00,24406 405 | 91,2002-12-21 11:15:00,11551 406 | 91,2002-12-21 21:45:00,9024 407 | 91,2002-12-26 11:15:00,45343 408 | 91,2002-12-26 21:45:00,38407 409 | 91,2002-12-28 11:15:00,11481 410 | 91,2002-12-28 21:45:00,9223 411 | 155,2001-01-05 11:15:00,47160 412 | 155,2001-01-05 21:45:00,79145 413 | 155,2001-01-07 11:15:00,28556 414 | 155,2001-01-07 21:45:00,44767 415 | 155,2001-01-12 
11:15:00,22247 416 | 155,2001-01-12 21:45:00,57113 417 | 155,2001-01-14 11:15:00,25489 418 | 155,2001-01-14 21:45:00,29332 419 | 155,2001-01-19 11:15:00,53915 420 | 155,2001-01-19 21:45:00,74347 421 | 155,2001-01-21 11:15:00,26968 422 | 155,2001-01-21 21:45:00,74567 423 | 155,2001-01-26 11:15:00,21522 424 | 155,2001-01-26 21:45:00,45837 425 | 155,2001-02-02 11:15:00,27172 426 | 155,2001-02-02 21:45:00,52341 427 | 155,2001-02-04 11:15:00,25381 428 | 155,2001-02-04 21:45:00,37220 429 | 155,2001-02-09 11:15:00,42818 430 | 155,2001-02-09 21:45:00,75960 431 | 155,2001-02-11 11:15:00,10505 432 | 155,2001-02-11 21:45:00,27725 433 | 155,2001-02-16 11:15:00,23650 434 | 155,2001-02-16 21:45:00,61938 435 | 155,2001-02-18 11:15:00,13298 436 | 155,2001-02-18 21:45:00,25214 437 | 155,2001-02-23 11:15:00,41355 438 | 155,2001-02-23 21:45:00,99310 439 | 155,2001-02-25 11:15:00,52462 440 | 155,2001-02-25 21:45:00,47029 441 | 155,2001-03-01 11:15:00,27606 442 | 155,2001-03-01 21:45:00,60153 443 | 155,2001-03-03 11:15:00,17431 444 | 155,2001-03-03 21:45:00,34277 445 | 155,2001-03-08 11:15:00,32507 446 | 155,2001-03-08 21:45:00,98225 447 | 155,2001-03-10 11:15:00,13529 448 | 155,2001-03-10 21:45:00,59719 449 | 155,2001-03-15 11:15:00,45699 450 | 155,2001-03-15 21:45:00,81882 451 | 155,2001-03-17 11:15:00,22303 452 | 155,2001-03-17 21:45:00,54697 453 | 155,2001-03-22 11:15:00,63869 454 | 155,2001-03-22 21:45:00,59625 455 | 155,2001-03-24 11:15:00,23878 456 | 155,2001-03-24 21:45:00,57520 457 | 155,2001-03-29 11:15:00,25564 458 | 155,2001-03-29 21:45:00,60518 459 | 155,2001-03-31 11:15:00,46884 460 | 155,2001-03-31 21:45:00,50539 461 | 155,2001-04-05 11:15:00,29264 462 | 155,2001-04-05 21:45:00,66833 463 | 155,2001-04-07 11:15:00,46394 464 | 155,2001-04-07 21:45:00,83511 465 | 155,2001-04-12 11:15:00,36101 466 | 155,2001-04-12 21:45:00,63117 467 | 155,2001-04-14 11:15:00,35112 468 | 155,2001-04-14 21:45:00,79289 469 | 155,2001-04-19 11:15:00,45884 470 | 155,2001-04-19 21:45:00,54673 471 | 155,2001-04-21 11:15:00,14909 472 | 155,2001-04-21 21:45:00,58232 473 | 155,2001-04-26 11:15:00,25369 474 | 155,2001-04-26 21:45:00,58654 475 | 155,2001-04-28 11:15:00,31886 476 | 155,2001-04-28 21:45:00,36885 477 | 155,2001-05-03 11:15:00,9376 478 | 155,2001-05-03 21:45:00,30407 479 | 155,2001-05-05 11:15:00,14027 480 | 155,2001-05-05 21:45:00,52893 481 | 155,2001-05-10 11:15:00,16558 482 | 155,2001-05-10 21:45:00,76662 483 | 155,2001-05-12 11:15:00,20462 484 | 155,2001-05-12 21:45:00,55767 485 | 155,2001-05-17 11:15:00,31514 486 | 155,2001-05-17 21:45:00,62208 487 | 155,2001-05-19 11:15:00,19076 488 | 155,2001-05-19 21:45:00,39989 489 | 155,2001-05-24 11:15:00,36241 490 | 155,2001-05-24 21:45:00,39279 491 | 155,2001-05-26 11:15:00,28568 492 | 155,2001-05-26 21:45:00,39174 493 | 155,2001-05-31 11:15:00,25074 494 | 155,2001-05-31 21:45:00,46620 495 | 155,2001-06-02 11:15:00,41491 496 | 155,2001-06-02 21:45:00,41291 497 | 155,2001-06-07 11:15:00,22774 498 | 155,2001-06-07 21:45:00,50928 499 | 155,2001-06-09 11:15:00,31219 500 | 155,2001-06-09 21:45:00,22728 501 | 155,2001-06-14 11:15:00,43415 502 | 155,2001-06-14 21:45:00,63540 503 | 155,2001-06-16 11:15:00,21249 504 | 155,2001-06-16 21:45:00,40849 505 | 155,2001-06-21 11:15:00,48760 506 | 155,2001-06-21 21:45:00,81860 507 | 155,2001-06-23 11:15:00,44529 508 | 155,2001-06-23 21:45:00,66362 509 | 155,2001-06-28 11:15:00,32718 510 | 155,2001-06-28 21:45:00,47446 511 | 155,2001-06-30 11:15:00,29634 512 | 155,2001-06-30 21:45:00,38638 513 | 155,2001-07-05 11:15:00,39456 514 | 
155,2001-07-05 21:45:00,77100 515 | 155,2001-07-07 11:15:00,13870 516 | 155,2001-07-07 21:45:00,56203 517 | 155,2001-07-12 11:15:00,46392 518 | 155,2001-07-12 21:45:00,37438 519 | 155,2001-07-14 11:15:00,25586 520 | 155,2001-07-14 21:45:00,13469 521 | 155,2001-07-19 11:15:00,30838 522 | 155,2001-07-19 21:45:00,28566 523 | 155,2001-07-21 11:15:00,30584 524 | 155,2001-07-21 21:45:00,43256 525 | 155,2001-07-26 11:15:00,31799 526 | 155,2001-07-26 21:45:00,87748 527 | 155,2001-07-28 11:15:00,15149 528 | 155,2001-07-28 21:45:00,52908 529 | 155,2001-08-02 11:15:00,26134 530 | 155,2001-08-02 21:45:00,32303 531 | 155,2001-08-04 11:15:00,27381 532 | 155,2001-08-04 21:45:00,42345 533 | 155,2001-08-09 11:15:00,25117 534 | 155,2001-08-09 21:45:00,46062 535 | 155,2001-08-11 11:15:00,32478 536 | 155,2001-08-11 21:45:00,43279 537 | 155,2001-08-16 11:15:00,27900 538 | 155,2001-08-16 21:45:00,71976 539 | 155,2001-08-18 11:15:00,20572 540 | 155,2001-08-18 21:45:00,40490 541 | 155,2001-08-23 11:15:00,35076 542 | 155,2001-08-23 21:45:00,52676 543 | 155,2001-08-25 11:15:00,17380 544 | 155,2001-08-25 21:45:00,46913 545 | 155,2001-08-30 11:15:00,30934 546 | 155,2001-08-30 21:45:00,119007 547 | 155,2001-09-01 11:15:00,25038 548 | 155,2001-09-01 21:45:00,70484 549 | 155,2001-09-06 11:15:00,20296 550 | 155,2001-09-06 21:45:00,68376 551 | 155,2001-09-08 11:15:00,39674 552 | 155,2001-09-08 21:45:00,51128 553 | 155,2001-09-13 11:15:00,33335 554 | 155,2001-09-13 21:45:00,57007 555 | 155,2001-09-15 11:15:00,41197 556 | 155,2001-09-15 21:45:00,69643 557 | 155,2001-09-20 11:15:00,35542 558 | 155,2001-09-20 21:45:00,51868 559 | 155,2001-09-22 11:15:00,14775 560 | 155,2001-09-22 21:45:00,36246 561 | 155,2001-09-27 11:15:00,39445 562 | 155,2001-09-27 21:45:00,59610 563 | 155,2001-09-29 11:15:00,13120 564 | 155,2001-09-29 21:45:00,50651 565 | 155,2001-10-04 11:15:00,38468 566 | 155,2001-10-04 21:45:00,67526 567 | 155,2001-10-06 11:15:00,26560 568 | 155,2001-10-06 21:45:00,64617 569 | 155,2001-10-11 11:15:00,10619 570 | 155,2001-10-11 21:45:00,24220 571 | 155,2001-10-13 11:15:00,12291 572 | 155,2001-10-13 21:45:00,46453 573 | 155,2001-10-18 11:15:00,25500 574 | 155,2001-10-18 21:45:00,88530 575 | 155,2001-10-20 11:15:00,22302 576 | 155,2001-10-20 21:45:00,34156 577 | 155,2001-10-25 11:15:00,12799 578 | 155,2001-10-25 21:45:00,36165 579 | 155,2001-10-27 11:15:00,33056 580 | 155,2001-10-27 21:45:00,42997 581 | 155,2001-11-01 11:15:00,59764 582 | 155,2001-11-01 21:45:00,80346 583 | 155,2001-11-03 11:15:00,26249 584 | 155,2001-11-03 21:45:00,19786 585 | 155,2001-11-08 11:15:00,46280 586 | 155,2001-11-08 21:45:00,46632 587 | 155,2001-11-10 11:15:00,27986 588 | 155,2001-11-10 21:45:00,35418 589 | 155,2001-11-15 11:15:00,33701 590 | 155,2001-11-15 21:45:00,100492 591 | 155,2001-11-17 11:15:00,11008 592 | 155,2001-11-17 21:45:00,41364 593 | 155,2001-11-22 11:15:00,19919 594 | 155,2001-11-22 21:45:00,108506 595 | 155,2001-11-24 11:15:00,11466 596 | 155,2001-11-24 21:45:00,26303 597 | 155,2001-11-29 11:15:00,34608 598 | 155,2001-11-29 21:45:00,66799 599 | 155,2001-12-01 11:15:00,36585 600 | 155,2001-12-01 21:45:00,39367 601 | 155,2001-12-06 11:15:00,58185 602 | 155,2001-12-06 21:45:00,56685 603 | 155,2001-12-08 11:15:00,15953 604 | 155,2001-12-08 21:45:00,41716 605 | 155,2001-12-13 11:15:00,23780 606 | 155,2001-12-13 21:45:00,54777 607 | 155,2001-12-15 11:15:00,28346 608 | 155,2001-12-15 21:45:00,30267 609 | 155,2001-12-20 11:15:00,17987 610 | 155,2001-12-20 21:45:00,53592 611 | 155,2001-12-22 11:15:00,14796 612 | 155,2001-12-22 
21:45:00,51911 613 | 155,2001-12-27 11:15:00,23721 614 | 155,2001-12-27 21:45:00,80647 615 | 155,2001-12-29 11:15:00,23631 616 | 155,2001-12-29 21:45:00,57509 617 | 155,2002-01-03 11:15:00,46025 618 | 155,2002-01-03 21:45:00,76333 619 | 155,2002-01-05 11:15:00,14878 620 | 155,2002-01-05 21:45:00,44175 621 | 155,2002-01-10 11:15:00,50535 622 | 155,2002-01-10 21:45:00,73147 623 | 155,2002-01-12 11:15:00,18213 624 | 155,2002-01-12 21:45:00,49091 625 | 155,2002-01-17 11:15:00,28559 626 | 155,2002-01-17 21:45:00,69585 627 | 155,2002-01-19 11:15:00,14629 628 | 155,2002-01-19 21:45:00,74604 629 | 155,2002-01-24 11:15:00,39554 630 | 155,2002-01-24 21:45:00,56264 631 | 155,2002-01-26 11:15:00,26463 632 | 155,2002-01-26 21:45:00,45483 633 | 155,2002-01-31 11:15:00,29566 634 | 155,2002-01-31 21:45:00,127322 635 | 155,2002-02-02 11:15:00,30568 636 | 155,2002-02-02 21:45:00,63657 637 | 155,2002-02-07 11:15:00,51733 638 | 155,2002-02-07 21:45:00,53442 639 | 155,2002-02-14 11:15:00,35365 640 | 155,2002-02-14 21:45:00,52901 641 | 155,2002-02-16 11:15:00,28502 642 | 155,2002-02-16 21:45:00,17340 643 | 155,2002-02-21 11:15:00,35378 644 | 155,2002-02-21 21:45:00,81922 645 | 155,2002-02-23 11:15:00,26591 646 | 155,2002-02-23 21:45:00,38141 647 | 155,2002-02-28 11:15:00,39223 648 | 155,2002-02-28 21:45:00,61799 649 | 155,2002-03-02 11:15:00,21661 650 | 155,2002-03-02 21:45:00,26651 651 | 155,2002-03-07 11:15:00,24898 652 | 155,2002-03-07 21:45:00,66425 653 | 155,2002-03-09 11:15:00,11629 654 | 155,2002-03-09 21:45:00,45727 655 | 155,2002-03-14 11:15:00,18354 656 | 155,2002-03-14 21:45:00,74475 657 | 155,2002-03-16 11:15:00,15828 658 | 155,2002-03-16 21:45:00,56993 659 | 155,2002-03-21 11:15:00,57501 660 | 155,2002-03-21 21:45:00,44223 661 | 155,2002-03-23 11:15:00,9902 662 | 155,2002-03-23 21:45:00,40780 663 | 155,2002-03-28 11:15:00,49527 664 | 155,2002-03-28 21:45:00,51841 665 | 155,2002-03-30 11:15:00,28272 666 | 155,2002-03-30 21:45:00,25346 667 | 155,2002-04-04 11:15:00,28262 668 | 155,2002-04-04 21:45:00,65081 669 | 155,2002-04-06 11:15:00,7648 670 | 155,2002-04-06 21:45:00,17067 671 | 155,2002-04-11 11:15:00,61328 672 | 155,2002-04-11 21:45:00,46403 673 | 155,2002-04-13 11:15:00,21971 674 | 155,2002-04-13 21:45:00,52832 675 | 155,2002-04-18 11:15:00,56265 676 | 155,2002-04-18 21:45:00,63298 677 | 155,2002-04-20 11:15:00,25280 678 | 155,2002-04-20 21:45:00,9841 679 | 155,2002-04-25 11:15:00,55619 680 | 155,2002-04-25 21:45:00,28003 681 | 155,2002-04-27 11:15:00,30943 682 | 155,2002-04-27 21:45:00,32108 683 | 155,2002-05-02 11:15:00,23441 684 | 155,2002-05-02 21:45:00,63216 685 | 155,2002-05-04 11:15:00,44246 686 | 155,2002-05-04 21:45:00,37654 687 | 155,2002-05-09 11:15:00,23782 688 | 155,2002-05-09 21:45:00,35986 689 | 155,2002-05-11 11:15:00,26588 690 | 155,2002-05-11 21:45:00,50344 691 | 155,2002-05-16 11:15:00,18303 692 | 155,2002-05-16 21:45:00,26610 693 | 155,2002-05-18 11:15:00,21812 694 | 155,2002-05-18 21:45:00,12481 695 | 155,2002-05-23 11:15:00,47154 696 | 155,2002-05-23 21:45:00,51113 697 | 155,2002-05-25 11:15:00,29322 698 | 155,2002-05-25 21:45:00,16905 699 | 155,2002-05-30 11:15:00,37798 700 | 155,2002-05-30 21:45:00,63267 701 | 155,2002-06-01 11:15:00,24901 702 | 155,2002-06-01 21:45:00,32972 703 | 155,2002-06-06 11:15:00,56873 704 | 155,2002-06-06 21:45:00,51542 705 | 155,2002-06-08 11:15:00,30199 706 | 155,2002-06-08 21:45:00,42197 707 | 155,2002-06-13 11:15:00,36657 708 | 155,2002-06-13 21:45:00,56763 709 | 155,2002-06-15 11:15:00,15616 710 | 155,2002-06-15 21:45:00,61732 711 | 
155,2002-06-20 11:15:00,37168 712 | 155,2002-06-20 21:45:00,49352 713 | 155,2002-06-22 11:15:00,15282 714 | 155,2002-06-22 21:45:00,34328 715 | 155,2002-06-27 11:15:00,43994 716 | 155,2002-06-27 21:45:00,36183 717 | 155,2002-06-29 11:15:00,14257 718 | 155,2002-06-29 21:45:00,47385 719 | 155,2002-07-04 11:15:00,27885 720 | 155,2002-07-04 21:45:00,46133 721 | 155,2002-07-06 11:15:00,16643 722 | 155,2002-07-06 21:45:00,23292 723 | 155,2002-07-11 11:15:00,18303 724 | 155,2002-07-11 21:45:00,65505 725 | 155,2002-07-13 11:15:00,48652 726 | 155,2002-07-13 21:45:00,31875 727 | 155,2002-07-18 11:15:00,19188 728 | 155,2002-07-18 21:45:00,39646 729 | 155,2002-07-20 11:15:00,43831 730 | 155,2002-07-20 21:45:00,30483 731 | 155,2002-07-25 11:15:00,45441 732 | 155,2002-07-25 21:45:00,63240 733 | 155,2002-07-27 11:15:00,28574 734 | 155,2002-07-27 21:45:00,23596 735 | 155,2002-08-01 11:15:00,21197 736 | 155,2002-08-01 21:45:00,52841 737 | 155,2002-08-03 11:15:00,14047 738 | 155,2002-08-03 21:45:00,60289 739 | 155,2002-08-08 11:15:00,34957 740 | 155,2002-08-08 21:45:00,30751 741 | 155,2002-08-10 11:15:00,20037 742 | 155,2002-08-10 21:45:00,66834 743 | 155,2002-08-15 11:15:00,37907 744 | 155,2002-08-15 21:45:00,37676 745 | 155,2002-08-17 11:15:00,21647 746 | 155,2002-08-17 21:45:00,47694 747 | 155,2002-08-22 11:15:00,15658 748 | 155,2002-08-22 21:45:00,37566 749 | 155,2002-08-24 11:15:00,6382 750 | 155,2002-08-24 21:45:00,25552 751 | 155,2002-08-29 11:15:00,13945 752 | 155,2002-08-29 21:45:00,25045 753 | 155,2002-08-31 11:15:00,46877 754 | 155,2002-08-31 21:45:00,47260 755 | 155,2002-09-05 11:15:00,9946 756 | 155,2002-09-05 21:45:00,62600 757 | 155,2002-09-07 11:15:00,11786 758 | 155,2002-09-07 21:45:00,29597 759 | 155,2002-09-12 11:15:00,23993 760 | 155,2002-09-12 21:45:00,62210 761 | 155,2002-09-14 11:15:00,17723 762 | 155,2002-09-14 21:45:00,32499 763 | 155,2002-09-19 11:15:00,4580 764 | 155,2002-09-19 21:45:00,19836 765 | 155,2002-09-21 11:15:00,17779 766 | 155,2002-09-21 21:45:00,47976 767 | 155,2002-09-26 11:15:00,51844 768 | 155,2002-09-26 21:45:00,45191 769 | 155,2002-10-03 11:15:00,19679 770 | 155,2002-10-03 21:45:00,39990 771 | 155,2002-10-05 11:15:00,10319 772 | 155,2002-10-05 21:45:00,39537 773 | 155,2002-10-10 11:15:00,19477 774 | 155,2002-10-10 21:45:00,46900 775 | 155,2002-10-12 11:15:00,12031 776 | 155,2002-10-12 21:45:00,21471 777 | 155,2002-10-17 11:15:00,22960 778 | 155,2002-10-17 21:45:00,40154 779 | 155,2002-10-19 11:15:00,5348 780 | 155,2002-10-19 21:45:00,30237 781 | 155,2002-10-24 11:15:00,22789 782 | 155,2002-10-24 21:45:00,55010 783 | 155,2002-10-26 11:15:00,15351 784 | 155,2002-10-26 21:45:00,39595 785 | 155,2002-10-31 11:15:00,14519 786 | 155,2002-10-31 21:45:00,33043 787 | 155,2002-11-02 11:15:00,25853 788 | 155,2002-11-02 21:45:00,27839 789 | 155,2002-11-07 11:15:00,20881 790 | 155,2002-11-07 21:45:00,44755 791 | 155,2002-11-09 11:15:00,5250 792 | 155,2002-11-09 21:45:00,34903 793 | 155,2002-11-14 11:15:00,29225 794 | 155,2002-11-14 21:45:00,34545 795 | 155,2002-11-16 11:15:00,25539 796 | 155,2002-11-16 21:45:00,61767 797 | 155,2002-11-21 11:15:00,34971 798 | 155,2002-11-21 21:45:00,87959 799 | 155,2002-11-23 11:15:00,43600 800 | 155,2002-11-23 21:45:00,83211 801 | 155,2002-11-28 11:15:00,21938 802 | 155,2002-11-28 21:45:00,36940 803 | 155,2002-11-30 11:15:00,10336 804 | 155,2002-11-30 21:45:00,31425 805 | 155,2002-12-12 11:15:00,22621 806 | 155,2002-12-12 21:45:00,70786 807 | 155,2002-12-14 11:15:00,24884 808 | 155,2002-12-14 21:45:00,23938 809 | 155,2002-12-19 11:15:00,25227 
810 | 155,2002-12-19 21:45:00,62650 811 | 155,2002-12-21 11:15:00,38959 812 | 155,2002-12-21 21:45:00,26640 813 | 155,2002-12-26 11:15:00,83510 814 | 155,2002-12-26 21:45:00,69198 815 | 155,2002-12-28 11:15:00,34616 816 | 155,2002-12-28 21:45:00,20118 817 | -------------------------------------------------------------------------------- /tests/unit/prophet_modeler_test.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | from pyspark.sql import SparkSession 4 | 5 | from jobs.prophet_modeler import ProphetModeler, model_time_series 6 | 7 | 8 | def suppress_py4j_logging(): 9 | logger = logging.getLogger('py4j') 10 | logger.setLevel(logging.WARN) 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def spark_session(request): 15 | """ fixture for creating a spark session 16 | Args: 17 | request: pytest.FixtureRequest object 18 | """ 19 | global spark 20 | spark = SparkSession.builder.master("local[*]").appName("TestModeler").getOrCreate() 21 | request.addfinalizer(lambda: spark.stop()) 22 | 23 | suppress_py4j_logging() 24 | return spark 25 | 26 | 27 | @pytest.fixture() 28 | def config(): 29 | config = { 30 | 'io': { 31 | 'input': 'tests/fixtures/model-input', 32 | 'models': 'build/models', 33 | }, 34 | 'model': { 35 | 'floor': 0, 36 | 'cap_multiplier': 1.1 37 | }} 38 | return config 39 | 40 | 41 | @pytest.fixture() 42 | def setup(config): 43 | global modeler 44 | modeler = ProphetModeler(config) 45 | 46 | 47 | @pytest.fixture() 48 | def spark_input_df(spark_session): 49 | return modeler.read_input_dataframe(spark_session) 50 | 51 | 52 | def test_read_dataframe(setup, spark_input_df): 53 | assert (spark_input_df.columns == ['series_id', 'dim_id', 'ds', 'y']) 54 | assert (spark_input_df.select("series_id").distinct().count() == 1) 55 | assert (spark_input_df.select("dim_id").distinct().count() == 2) 56 | assert (spark_input_df.count() == 816) 57 | 58 | 59 | @pytest.mark.dependency() 60 | def test_model_time_series(setup, spark_input_df): 61 | output_df = spark_input_df \ 62 | .groupby('series_id', 'dim_id') \ 63 | .apply(model_time_series(modeler.config)) 64 | 65 | assert (output_df.count() == 2) 66 | assert (output_df.columns == ['series_id', 'dim_id', 'floor', 'cap', 'model']) 67 | assert (output_df.filter('series_id = 751 and dim_id = 91').count() == 1) 68 | assert (output_df.filter('series_id = 751 and dim_id = 155').count() == 1) 69 | 70 | modeler.persist_models(output_df) 71 | 72 | model_df = spark.read.parquet('./build/models') 73 | 74 | assert (model_df.count() == 2) 75 | assert (model_df.columns == ['series_id', 'dim_id', 'floor', 'cap', 'model']) 76 | -------------------------------------------------------------------------------- /tests/unit/prophet_scorer_test.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from datetime import datetime 4 | import pytest 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import * 7 | 8 | from jobs.prophet_scorer import ProphetScorer, forecast_time_series 9 | 10 | 11 | def suppress_py4j_logging(): 12 | logger = logging.getLogger('py4j') 13 | logger.setLevel(logging.WARN) 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def spark_session(request): 18 | """ fixture for creating a spark session 19 | Args: 20 | request: pytest.FixtureRequest object 21 | """ 22 | global spark 23 | spark = SparkSession.builder.master("local[*]").appName("TestScorer").getOrCreate() 24 | 
request.addfinalizer(lambda: spark.stop()) 25 | 26 | suppress_py4j_logging() 27 | return spark 28 | 29 | 30 | @pytest.fixture() 31 | def config(): 32 | config = { 33 | 'io': { 34 | 'models': 'build/models', 35 | 'forecasts': 'build/forecasts', 36 | }, 37 | 'forecast': { 38 | 'periods': 40, 39 | 'frequency': '15min' 40 | }} 41 | return config 42 | 43 | 44 | @pytest.fixture() 45 | def setup(config): 46 | global scorer 47 | scorer = ProphetScorer(config) 48 | 49 | 50 | @pytest.fixture() 51 | def spark_model_df(spark_session): 52 | return scorer.read_model_dataframe(spark_session) 53 | 54 | 55 | @pytest.fixture() 56 | def spark_forecast_df(spark_session): 57 | schema = StructType([StructField("series_id", IntegerType()), 58 | StructField("dim_id", IntegerType()), 59 | StructField("ds", TimestampType()), 60 | StructField("yhat", IntegerType()) 61 | ]) 62 | 63 | test_list = [ 64 | (101, 66, datetime.strptime('2015-07-05 10:15:00', '%Y-%m-%d %H:%M:%S'), 873242) 65 | ] 66 | 67 | return spark_session.createDataFrame(data=test_list, schema=schema) 68 | 69 | 70 | def test_convert_forecasts(setup, spark_forecast_df): 71 | output_df = scorer.convert_forecasts(spark_forecast_df) 72 | 73 | timestamp_regex = re.compile(r'^([0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T' 74 | r'(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\+00:00)$') 75 | assert (timestamp_regex.match(output_df.collect()[0][0])) 76 | assert (output_df.collect()[0][1] == 101) 77 | assert (output_df.collect()[0][2] == 66) 78 | assert (output_df.collect()[0][3] == '2015-07-05') 79 | assert (output_df.collect()[0][4] == datetime(2015, 7, 5, 10, 15)) 80 | assert (output_df.collect()[0][5] == 873242) 81 | 82 | 83 | def test_read_model_dataframe(setup, spark_model_df): 84 | assert (spark_model_df.columns == ['series_id', 'dim_id', 'floor', 'cap', 85 | 'model']) 86 | assert (spark_model_df.select("series_id").distinct().count() == 1) 87 | assert (spark_model_df.select("dim_id").distinct().count() == 2) 88 | assert (spark_model_df.count() == 2) 89 | 90 | 91 | @pytest.mark.dependency(depends=["test_model_time_series"]) 92 | def test_forecast_time_series(setup, spark_model_df): 93 | output_df = spark_model_df \ 94 | .groupby('series_id', 'dim_id') \ 95 | .apply(forecast_time_series(scorer.config)) 96 | 97 | assert (output_df.count() == 80) 98 | assert (output_df.columns == ['series_id', 'dim_id', 'ds', 'yhat']) 99 | assert (output_df.filter('series_id = 751 and dim_id = 91').count() == 40) 100 | assert (output_df.filter('series_id = 751 and dim_id = 155').count() == 40) 101 | 102 | converted_df = scorer.convert_forecasts(output_df) 103 | 104 | scorer.write_forecasts(converted_df) 105 | 106 | read_output_df = spark.read.csv('./build/forecasts', header=True) 107 | 108 | assert (read_output_df.columns == ['created_timestamp', 109 | 'series_id', 110 | 'dim_id', 111 | 'forecast_date', 112 | 'forecast_timestamp', 113 | 'forecast_quantity']) 114 | assert (read_output_df.count() == 80) 115 | --------------------------------------------------------------------------------