├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── STL-usage-example.ipynb ├── setup.py └── stldecompose ├── __init__.py ├── __version__.py ├── forecast_funcs.py └── stl.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Created by https://www.gitignore.io/api/python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # End of https://www.gitignore.io/api/python 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Josh Montague 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst LICENSE 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | STL Decompose 2 | ============= 3 | 4 | This is a relatively naive Python implementation of a seasonal and trend decomposition using Loess smoothing. Commonly referred to as an "STL decomposition", Cleveland's 1990 paper is the canonical reference. 5 | 6 | This implementation is a variation of (and takes inspiration from) the implementation of the ``seasonal_decompose`` method `in statsmodels `_. In this implementation, the trend component is calculated by substituting a configurable `Loess regression `_ for the convolutional method used in ``seasonal_decompose``. It also extends the existing ``DecomposeResult`` from ``statsmodels`` to allow for forecasting based on the calculated decomposition. 7 | 8 | 9 | Usage 10 | ----- 11 | 12 | The ``stldecompose`` package is relatively lightweight. It uses ``pandas.Dataframe`` for inputs and outputs, and exposes only a couple of primary methods - ``decompose()`` and ``forecast()`` - as well as a handful of built-in forecasting functions. 13 | 14 | See `the included IPython notebook `_ for more details and usage examples. 15 | 16 | 17 | Installation 18 | ------------ 19 | 20 | A Python 3 virtual environment is recommended. 21 | 22 | The preferred method of installation is via ``pip``:: 23 | 24 | (env) $ pip install stldecompose 25 | 26 | If you'd like the bleeding-edge version, you can also install from this Github repo:: 27 | 28 | (env) $ git clone git@github.com:jrmontag/STLDecompose.git 29 | (env) $ cd STLDecompose; pip install . 30 | 31 | 32 | More Resources 33 | -------------- 34 | 35 | - ``statsmodels`` `Time Series analysis `_ package 36 | - Hyndman's `OTexts reference on STL decomposition `_ 37 | - Cleveland et al. 1990 [`pdf `_] 38 | -------------------------------------------------------------------------------- /STL-usage-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import statsmodels.api as sm\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "\n", 14 | "# the main library has a small set of functionality\n", 15 | "from stldecompose import decompose, forecast\n", 16 | "from stldecompose.forecast_funcs import (naive,\n", 17 | " drift, \n", 18 | " mean, \n", 19 | " seasonal_naive)\n", 20 | "\n", 21 | "\n", 22 | "%load_ext autoreload\n", 23 | "%autoreload 2" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "We'll use some of the data that comes pre-packaged with `statsmodels` to demonstrate the library functionality. The data set below comprises incomplete, daily measurements of CO2 levels in Hawaii.\n", 31 | "\n", 32 | "**Note:** at the time of this writing, the current release of `statsmodels` includes [a utility method](http://www.statsmodels.org/stable/datasets/index.html#loading-data-as-pandas-objects) for loading these datasets as a `pandas.DataFrame` which appears to be broken. Below is a short hack inspired by the current master branch on the `statsmodels` GitHub page. " 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "def get_statsmodels_df():\n", 42 | " \"\"\"Return packaged data in a pandas.DataFrame\"\"\"\n", 43 | " # some hijinks to get around outdated statsmodels code\n", 44 | " dataset = sm.datasets.co2.load()\n", 45 | " start = dataset.data['date'][0].decode('utf-8')\n", 46 | " index = pd.date_range(start=start, periods=len(dataset.data), freq='W-SAT')\n", 47 | " obs = pd.DataFrame(dataset.data['co2'], index=index, columns=['co2'])\n", 48 | " return obs" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
\n", 60 | "\n", 73 | "\n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | "
co2
1958-03-29316.1
1958-04-05317.3
1958-04-12317.6
1958-04-19317.5
1958-04-26316.4
\n", 103 | "
" 104 | ], 105 | "text/plain": [ 106 | " co2\n", 107 | "1958-03-29 316.1\n", 108 | "1958-04-05 317.3\n", 109 | "1958-04-12 317.6\n", 110 | "1958-04-19 317.5\n", 111 | "1958-04-26 316.4" 112 | ] 113 | }, 114 | "execution_count": 3, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "obs = get_statsmodels_df()\n", 121 | "obs.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Because it's based on some existing `statsmodels` functionality, `STLDecompose` requires two things of the input dataframe:\n", 129 | "1. continuous observations (no missing data points)\n", 130 | "2. a `pandas` `DateTimeIndex`\n", 131 | "\n", 132 | "Since these are both very situation-dependent, we leave it to the user to define how they want to acheive these goals - `pandas` provides a number of ways [to work with missing data](https://pandas.pydata.org/pandas-docs/stable/missing_data.html). In particular, the functions shown below make these steps relatively straightforward. Below, we add use linear interpolation, and resample to daily observations. The resulting frame meets both of our criteria. " 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 4, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/html": [ 143 | "
\n", 144 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | "
co2
1958-03-29316.100000
1958-03-30316.271429
1958-03-31316.442857
1958-04-01316.614286
1958-04-02316.785714
1958-04-03316.957143
1958-04-04317.128571
1958-04-05317.300000
1958-04-06317.342857
1958-04-07317.385714
\n", 207 | "
" 208 | ], 209 | "text/plain": [ 210 | " co2\n", 211 | "1958-03-29 316.100000\n", 212 | "1958-03-30 316.271429\n", 213 | "1958-03-31 316.442857\n", 214 | "1958-04-01 316.614286\n", 215 | "1958-04-02 316.785714\n", 216 | "1958-04-03 316.957143\n", 217 | "1958-04-04 317.128571\n", 218 | "1958-04-05 317.300000\n", 219 | "1958-04-06 317.342857\n", 220 | "1958-04-07 317.385714" 221 | ] 222 | }, 223 | "execution_count": 4, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "obs = (obs\n", 230 | " .resample('D')\n", 231 | " .mean()\n", 232 | " .interpolate('linear'))\n", 233 | "\n", 234 | "obs.head(10)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 5, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "DatetimeIndex(['1958-03-29', '1958-03-30', '1958-03-31', '1958-04-01',\n", 246 | " '1958-04-02', '1958-04-03', '1958-04-04', '1958-04-05',\n", 247 | " '1958-04-06', '1958-04-07',\n", 248 | " ...\n", 249 | " '2001-12-20', '2001-12-21', '2001-12-22', '2001-12-23',\n", 250 | " '2001-12-24', '2001-12-25', '2001-12-26', '2001-12-27',\n", 251 | " '2001-12-28', '2001-12-29'],\n", 252 | " dtype='datetime64[ns]', length=15982, freq='D')" 253 | ] 254 | }, 255 | "execution_count": 5, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "obs.index" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 6, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "" 273 | ] 274 | }, 275 | "execution_count": 6, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | }, 279 | { 280 | "data": { 281 | "image/png": "\n", 282 | "text/plain": [ 283 | "
" 284 | ] 285 | }, 286 | "metadata": { 287 | "needs_background": "light" 288 | }, 289 | "output_type": "display_data" 290 | } 291 | ], 292 | "source": [ 293 | "obs.head(1000).plot()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "# Decompose\n", 301 | "\n", 302 | "One of the primary pieces of functionality is the STL decomposition. The associated method requires the observation frame, and the primary (largest) period of seasonality. *This `period` is specified in terms of index positions,* and so care is needed for the user to correctly specify the periodicity in terms of their observations.\n", 303 | "\n", 304 | "For example, with daily observations and large annual cycles, `period=365`. For hourly observations with large daily cycles, `period=24`. Some inspection, and trial and error may be helpful." 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 7, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/plain": [ 315 | "" 316 | ] 317 | }, 318 | "execution_count": 7, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "decomp = decompose(obs, period=365)\n", 325 | "\n", 326 | "decomp" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "The resulting object is an extended version of the `statsmodels.tsa.seasonal.DecomposeResult`. Like the `statsmodels` object, the arrays of values are available on the object (the observations; and the trend, seasonal, and residual components). An extra attribute (the average seasonal cycle) has been added for the purpose of forecasting. \n", 334 | "\n", 335 | "We inherit the built-in `.plot()` method on the object." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 8, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "image/png": "\n", 346 | "text/plain": [ 347 | "
" 348 | ] 349 | }, 350 | "metadata": { 351 | "needs_background": "light" 352 | }, 353 | "output_type": "display_data" 354 | } 355 | ], 356 | "source": [ 357 | "decomp.plot();" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "# Forecast\n", 365 | "\n", 366 | "While the STL decomposition is interesting on its own, `STLDecompose` also provides some relatively naive capabilities for using the decomposition to forecast based on our observations. \n", 367 | "\n", 368 | "We'll use the same data set, but pretend that we only had the first two third of observations. Then we can compare our forecast to the real observation data. " 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 9, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "15982" 380 | ] 381 | }, 382 | "execution_count": 9, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "len(obs)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 10, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "short_obs = obs.head(10000)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 11, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "data": { 407 | "text/plain": [ 408 | "" 409 | ] 410 | }, 411 | "execution_count": 11, 412 | "metadata": {}, 413 | "output_type": "execute_result" 414 | } 415 | ], 416 | "source": [ 417 | "# apply the decomp to the truncated observation\n", 418 | "short_decomp = decompose(short_obs, period=365)\n", 419 | "\n", 420 | "short_decomp" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "The `forecast()` method requires the following arguments:\n", 428 | "- the previously fit `DecomposeResult`\n", 429 | "- the number of steps forward for which we'd like the forecast\n", 430 | "- the specific forecasting function to be applied to the decomposition\n", 431 | "\n", 432 | "There are a handful of predefined functions that can be imported from the `stldecompose.forecast_funcs` module. These implementations are based on [Hyndman's online textbook](https://www.otexts.org/fpp/2/3). The user can also define their own forecast function, following the patterns demonstrated in the predefined functions. \n", 433 | "\n", 434 | "The return type of the `forecast()` method is a `pandas.Dataframe` with a column name that represents the forecast function and an appropriate `DatetimeIndex`." 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 12, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/html": [ 445 | "
\n", 446 | "\n", 459 | "\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | "
drift
1985-08-14345.985881
1985-08-15345.989898
1985-08-16345.993915
1985-08-17345.997933
1985-08-18346.001950
\n", 489 | "
" 490 | ], 491 | "text/plain": [ 492 | " drift\n", 493 | "1985-08-14 345.985881\n", 494 | "1985-08-15 345.989898\n", 495 | "1985-08-16 345.993915\n", 496 | "1985-08-17 345.997933\n", 497 | "1985-08-18 346.001950" 498 | ] 499 | }, 500 | "execution_count": 12, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "fcast = forecast(short_decomp, steps=8000, fc_func=drift)\n", 507 | "\n", 508 | "fcast.head()" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "If desired, we can then plot the corresponding components of the observation and forecast to check and verify the results." 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 13, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "image/png": "\n", 526 | "text/plain": [ 527 | "
" 528 | ] 529 | }, 530 | "metadata": { 531 | "needs_background": "light" 532 | }, 533 | "output_type": "display_data" 534 | } 535 | ], 536 | "source": [ 537 | "plt.plot(obs, '--', label='truth')\n", 538 | "plt.plot(short_obs, '--', label='obs')\n", 539 | "plt.plot(short_decomp.trend, ':', label='decomp.trend')\n", 540 | "plt.plot(fcast, '-', label=fcast.columns[0])\n", 541 | "\n", 542 | "plt.xlim('1970','2004'); plt.ylim(330,380);\n", 543 | "plt.legend();" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "To include the estimated seasonal component in the forecast, use the boolean `seasonal` keyword." 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 14, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "image/png": "\n", 561 | "text/plain": [ 562 | "
" 563 | ] 564 | }, 565 | "metadata": { 566 | "needs_background": "light" 567 | }, 568 | "output_type": "display_data" 569 | } 570 | ], 571 | "source": [ 572 | "fcast = forecast(short_decomp, steps=8000, fc_func=drift, seasonal=True)\n", 573 | "\n", 574 | "plt.plot(obs, '--', label='truth')\n", 575 | "plt.plot(short_obs, '--', label='obs')\n", 576 | "plt.plot(short_decomp.trend, ':', label='decomp.trend')\n", 577 | "plt.plot(fcast, '-', label=fcast.columns[0])\n", 578 | "\n", 579 | "plt.xlim('1970','2004'); plt.ylim(330,380);\n", 580 | "plt.legend();" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 15, 586 | "metadata": {}, 587 | "outputs": [ 588 | { 589 | "data": { 590 | "text/html": [ 591 | "
\n", 592 | "\n", 605 | "\n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | "
drift+seasonal
1985-08-14344.855165
1985-08-15344.800451
1985-08-16344.736606
1985-08-17344.683039
1985-08-18344.632911
\n", 635 | "
" 636 | ], 637 | "text/plain": [ 638 | " drift+seasonal\n", 639 | "1985-08-14 344.855165\n", 640 | "1985-08-15 344.800451\n", 641 | "1985-08-16 344.736606\n", 642 | "1985-08-17 344.683039\n", 643 | "1985-08-18 344.632911" 644 | ] 645 | }, 646 | "execution_count": 15, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | } 650 | ], 651 | "source": [ 652 | "fcast.head()" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "Enjoy." 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [] 668 | } 669 | ], 670 | "metadata": { 671 | "kernelspec": { 672 | "display_name": "Python 3", 673 | "language": "python", 674 | "name": "python3" 675 | }, 676 | "language_info": { 677 | "codemirror_mode": { 678 | "name": "ipython", 679 | "version": 3 680 | }, 681 | "file_extension": ".py", 682 | "mimetype": "text/x-python", 683 | "name": "python", 684 | "nbconvert_exporter": "python", 685 | "pygments_lexer": "ipython3", 686 | "version": "3.7.2" 687 | } 688 | }, 689 | "nbformat": 4, 690 | "nbformat_minor": 2 691 | } 692 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Appreciatively cargo-culted from: https://github.com/kennethreitz/setup.py 5 | 6 | # Note: To use the 'upload' functionality of this file, you must: 7 | # $ pip install twine 8 | # and then you can upload to PyPI via: 9 | # (env) STLDecompose [master] $ python setup.py upload 10 | 11 | import io 12 | import os 13 | import sys 14 | from shutil import rmtree 15 | 16 | from setuptools import find_packages, setup, Command 17 | 18 | # Package meta-data. 19 | NAME = 'stldecompose' 20 | DESCRIPTION = 'A Python implementation of seasonal trend with Loess (STL) time series decomposition' 21 | URL = 'https://github.com/jrmontag/STLDecompose' 22 | EMAIL = 'joshua.montague@gmail.com' 23 | AUTHOR = 'Josh Montague' 24 | 25 | # What packages are required for this module to be executed? 26 | REQUIRED = [ 27 | 'pandas', 28 | 'numpy', 29 | 'scipy', 30 | 'statsmodels', 31 | 'matplotlib', 32 | ] 33 | 34 | # The rest you shouldn't have to touch too much :) 35 | # ------------------------------------------------ 36 | # Except, perhaps the License and Trove Classifiers! 37 | # If you do change the License, remember to change the Trove Classifier for that! 38 | 39 | here = os.path.abspath(os.path.dirname(__file__)) 40 | 41 | # Import the README and use it as the long-description. 42 | # Note: this will only work if 'README.rst' is present in your MANIFEST.in file! 43 | with io.open(os.path.join(here, 'README.rst'), encoding='utf-8') as f: 44 | long_description = '\n' + f.read() 45 | 46 | # Load the package's __version__.py module as a dictionary. 47 | about = {} 48 | with open(os.path.join(here, NAME, '__version__.py')) as f: 49 | exec(f.read(), about) 50 | 51 | 52 | class UploadCommand(Command): 53 | """Support setup.py upload.""" 54 | 55 | description = 'Build and publish the package.' 56 | user_options = [] 57 | 58 | @staticmethod 59 | def status(s): 60 | """Prints things in bold.""" 61 | print('\033[1m{0}\033[0m'.format(s)) 62 | 63 | def initialize_options(self): 64 | pass 65 | 66 | def finalize_options(self): 67 | pass 68 | 69 | def run(self): 70 | try: 71 | self.status('Removing previous builds…') 72 | rmtree(os.path.join(here, 'dist')) 73 | except OSError: 74 | pass 75 | 76 | self.status('Building Source and Wheel (universal) distribution…') 77 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 78 | 79 | self.status('Uploading the package to PyPi via Twine…') 80 | os.system('twine upload dist/*') 81 | 82 | sys.exit() 83 | 84 | 85 | # Where the magic happens: 86 | setup( 87 | name=NAME, 88 | version=about['__version__'], 89 | description=DESCRIPTION, 90 | long_description=long_description, 91 | author=AUTHOR, 92 | author_email=EMAIL, 93 | url=URL, 94 | packages=find_packages(exclude=('tests',)), 95 | # If your package is a single module, use this instead of 'packages': 96 | # py_modules=['mypackage'], 97 | 98 | # entry_points={ 99 | # 'console_scripts': ['mycli=mymodule:cli'], 100 | # }, 101 | install_requires=REQUIRED, 102 | include_package_data=True, 103 | license='MIT', 104 | classifiers=[ 105 | # Trove classifiers 106 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 107 | 'License :: OSI Approved :: MIT License', 108 | 'Programming Language :: Python', 109 | 'Programming Language :: Python :: 2.7', 110 | 'Programming Language :: Python :: 3', 111 | 'Programming Language :: Python :: 3.3', 112 | 'Programming Language :: Python :: 3.4', 113 | 'Programming Language :: Python :: 3.5', 114 | 'Programming Language :: Python :: 3.6', 115 | 'Programming Language :: Python :: Implementation :: CPython', 116 | 'Programming Language :: Python :: Implementation :: PyPy', 117 | 'Development Status :: 3 - Alpha', 118 | 'Topic :: Scientific/Engineering', 119 | ], 120 | # $ setup.py publish support. 121 | cmdclass={ 122 | 'upload': UploadCommand, 123 | }, 124 | ) 125 | -------------------------------------------------------------------------------- /stldecompose/__init__.py: -------------------------------------------------------------------------------- 1 | from .stl import decompose, forecast 2 | -------------------------------------------------------------------------------- /stldecompose/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.5' 2 | -------------------------------------------------------------------------------- /stldecompose/forecast_funcs.py: -------------------------------------------------------------------------------- 1 | # example one-step-ahead forecasting functions 2 | # implementations based on https://www.otexts.org/fpp 3 | 4 | import numpy as np 5 | 6 | 7 | def naive(data, **kwargs): 8 | """The naive forecast for the next point is the value of the previous point. 9 | 10 | In most forecasting situations, the naive model is a good baseline due to it's simplicity 11 | and low computational overhead. 12 | 13 | Args: 14 | data (np.array): Observed data, presumed to be ordered in time. 15 | 16 | Returns: 17 | float: a single-valued forecast for the next value in the series. 18 | """ 19 | forecast = data[-1] 20 | return forecast 21 | 22 | 23 | def seasonal_naive(data, n=7, **kwargs): 24 | """The seasonal naive forecast for the next point is the value observed ``n`` points 25 | prior in the series. 26 | 27 | The seasonal parameter (``n``) does not have units of time, but rather units of one 28 | observation. For example, to account for weekly cycles within daily observations, ``n=7``. 29 | 30 | Args: 31 | data (np.array): Observed data, presumed to be ordered in time. 32 | n (int): period of data seasonality 33 | 34 | Returns: 35 | float: a single-valued forecast for the next value in the series. 36 | """ 37 | forecast = data[-n] 38 | return forecast 39 | 40 | 41 | def mean(data, n=3, **kwargs): 42 | """The mean forecast for the next point is the mean value of the previous ``n`` points in 43 | the series. 44 | 45 | Args: 46 | data (np.array): Observed data, presumed to be ordered in time. 47 | n (int): period over which to calculate the mean 48 | 49 | Returns: 50 | float: a single-valued forecast for the next value in the series. 51 | """ 52 | # don't start averaging until we've seen n points 53 | if len(data[-n:]) < n: 54 | forecast = np.nan 55 | else: 56 | # nb: we'll keep the forecast as a float 57 | forecast = np.mean(data[-n:]) 58 | return forecast 59 | 60 | 61 | def drift(data, n=3, **kwargs): 62 | """The drift forecast for the next point is a linear extrapolation from the previous ``n`` 63 | points in the series. 64 | 65 | Args: 66 | data (np.array): Observed data, presumed to be ordered in time. 67 | n (int): period over which to calculate linear model for extrapolation 68 | 69 | Returns: 70 | float: a single-valued forecast for the next value in the series. 71 | """ 72 | yi = data[-n] 73 | yf = data[-1] 74 | slope = (yf - yi) / (n - 1) 75 | forecast = yf + slope 76 | return forecast 77 | -------------------------------------------------------------------------------- /stldecompose/stl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pandas.core.nanops import nanmean as pd_nanmean 4 | from statsmodels.tsa.seasonal import DecomposeResult 5 | from statsmodels.tsa.filters._utils import _maybe_get_pandas_wrapper_freq 6 | import statsmodels.api as sm 7 | 8 | 9 | def decompose(df, period=365, lo_frac=0.6, lo_delta=0.01): 10 | """Create a seasonal-trend (with Loess, aka "STL") decomposition of observed time series data. 11 | 12 | This implementation is modeled after the ``statsmodels.tsa.seasonal_decompose`` method 13 | but substitutes a Lowess regression for a convolution in its trend estimation. 14 | 15 | This is an additive model, Y[t] = T[t] + S[t] + e[t] 16 | 17 | For more details on lo_frac and lo_delta, see: 18 | `statsmodels.nonparametric.smoothers_lowess.lowess()` 19 | 20 | Args: 21 | df (pandas.Dataframe): Time series of observed counts. This DataFrame must be continuous (no 22 | gaps or missing data), and include a ``pandas.DatetimeIndex``. 23 | period (int, optional): Most significant periodicity in the observed time series, in units of 24 | 1 observation. Ex: to accomodate strong annual periodicity within years of daily 25 | observations, ``period=365``. 26 | lo_frac (float, optional): Fraction of data to use in fitting Lowess regression. 27 | lo_delta (float, optional): Fractional distance within which to use linear-interpolation 28 | instead of weighted regression. Using non-zero ``lo_delta`` significantly decreases 29 | computation time. 30 | 31 | Returns: 32 | `statsmodels.tsa.seasonal.DecomposeResult`: An object with DataFrame attributes for the 33 | seasonal, trend, and residual components, as well as the average seasonal cycle. 34 | 35 | """ 36 | # use some existing pieces of statsmodels 37 | lowess = sm.nonparametric.lowess 38 | _pandas_wrapper, _ = _maybe_get_pandas_wrapper_freq(df) 39 | 40 | # get plain np array 41 | observed = np.asanyarray(df).squeeze() 42 | 43 | # calc trend, remove from observation 44 | trend = lowess(observed, [x for x in range(len(observed))], 45 | frac=lo_frac, 46 | delta=lo_delta * len(observed), 47 | return_sorted=False) 48 | detrended = observed - trend 49 | 50 | # period must not be larger than size of series to avoid introducing NaNs 51 | period = min(period, len(observed)) 52 | 53 | # calc one-period seasonality, remove tiled array from detrended 54 | period_averages = np.array([pd_nanmean(detrended[i::period]) for i in range(period)]) 55 | # 0-center the period avgs 56 | period_averages -= np.mean(period_averages) 57 | seasonal = np.tile(period_averages, len(observed) // period + 1)[:len(observed)] 58 | resid = detrended - seasonal 59 | 60 | # convert the arrays back to appropriate dataframes, stuff them back into 61 | # the statsmodel object 62 | results = list(map(_pandas_wrapper, [seasonal, trend, resid, observed])) 63 | dr = DecomposeResult(seasonal=results[0], 64 | trend=results[1], 65 | resid=results[2], 66 | observed=results[3], 67 | period_averages=period_averages) 68 | return dr 69 | 70 | 71 | def forecast(stl, fc_func, steps=10, seasonal=False, **fc_func_kwargs): 72 | """Forecast the given decomposition ``stl`` forward by ``steps`` steps using the forecasting 73 | function ``fc_func``, optionally including the calculated seasonality. 74 | 75 | This is an additive model, Y[t] = T[t] + S[t] + e[t] 76 | 77 | Args: 78 | stl (a modified statsmodels.tsa.seasonal.DecomposeResult): STL decomposition of observed time 79 | series created using the ``stldecompose.decompose()`` method. 80 | fc_func (function): Function which takes an array of observations and returns a single 81 | valued forecast for the next point. 82 | steps (int, optional): Number of forward steps to include in the forecast 83 | seasonal (bool, optional): Include seasonal component in forecast 84 | fc_func_kwargs: keyword arguments 85 | All remaining arguments are passed to the forecasting function ``fc_func`` 86 | 87 | Returns: 88 | forecast_frame (pd.Dataframe): A ``pandas.Dataframe`` containing forecast values and a 89 | DatetimeIndex matching the observed index. 90 | """ 91 | # container for forecast values 92 | forecast_array = np.array([]) 93 | 94 | # forecast trend 95 | # unpack precalculated trend array stl frame 96 | trend_array = stl.trend 97 | 98 | # iteratively forecast trend ("seasonally adjusted") component 99 | # note: this loop can be slow 100 | for step in range(steps): 101 | # make this prediction on all available data 102 | pred = fc_func(np.append(trend_array, forecast_array), **fc_func_kwargs) 103 | # add this prediction to current array 104 | forecast_array = np.append(forecast_array, pred) 105 | col_name = fc_func.__name__ 106 | 107 | # forecast start and index are determined by observed data 108 | observed_timedelta = stl.observed.index[-1] - stl.observed.index[-2] 109 | forecast_idx_start = stl.observed.index[-1] + observed_timedelta 110 | forecast_idx = pd.date_range(start=forecast_idx_start, 111 | periods=steps, 112 | freq=pd.tseries.frequencies.to_offset(observed_timedelta)) 113 | 114 | # (optionally) forecast seasonal & combine 115 | if seasonal: 116 | # track index and value of max correlation 117 | seasonal_ix = 0 118 | max_correlation = -np.inf 119 | # loop over indexes=length of period avgs 120 | detrended_array = np.asanyarray(stl.observed - stl.trend).squeeze() 121 | for i, x in enumerate(stl.period_averages): 122 | # work slices backward from end of detrended observations 123 | if i == 0: 124 | # slicing w/ [x:-0] doesn't work 125 | detrended_slice = detrended_array[-len(stl.period_averages):] 126 | else: 127 | detrended_slice = detrended_array[-(len(stl.period_averages) + i):-i] 128 | # calculate corr b/w period_avgs and detrend_slice 129 | this_correlation = np.correlate(detrended_slice, stl.period_averages)[0] 130 | if this_correlation > max_correlation: 131 | # update ix and max correlation 132 | max_correlation = this_correlation 133 | seasonal_ix = i 134 | # roll seasonal signal to matching phase 135 | rolled_period_averages = np.roll(stl.period_averages, -seasonal_ix) 136 | # tile as many time as needed to reach "steps", then truncate 137 | tiled_averages = np.tile(rolled_period_averages, 138 | (steps // len(stl.period_averages) + 1))[:steps] 139 | # add seasonal values to previous forecast 140 | forecast_array += tiled_averages 141 | col_name += '+seasonal' 142 | 143 | # combine data array with index into named dataframe 144 | forecast_frame = pd.DataFrame(data=forecast_array, index=forecast_idx) 145 | forecast_frame.columns = [col_name] 146 | return forecast_frame 147 | --------------------------------------------------------------------------------