├── .github └── workflows │ └── google-cloudrun-source.yml ├── .gitignore ├── .mypy.ini ├── .pylintrc ├── CHANGELOG ├── LICENSE ├── MANIFEST.in ├── README.md ├── auto_ts ├── .gitattributes ├── .gitignore ├── __init__.py ├── __version__.py ├── models │ ├── __init__.py │ ├── ar_based │ │ ├── __init__.py │ │ ├── build_arima.py │ │ ├── build_arima_base.py │ │ ├── build_autoarimax.py │ │ ├── build_sarimax.py │ │ ├── build_var.py │ │ └── param_finder.py │ ├── build_base.py │ ├── build_ml.py │ ├── build_prophet.py │ ├── build_pyflux.py │ └── ml_models.py ├── py.typed ├── test │ ├── __init__.py │ ├── test_auto_sarimax.py │ ├── test_auto_ts.py │ └── test_var.py └── utils │ ├── __init__.py │ ├── colors.py │ ├── eda.py │ ├── etl.py │ ├── logging.py │ ├── metrics.py │ ├── my_encoders.py │ └── val.py ├── cloud_run.txt ├── example_datasets ├── Sales_and_Marketing.csv └── ts_2.csv ├── example_notebooks ├── Auto_TS_Test_AV_Hack_TS_Rank_600.ipynb ├── autots_multivariate_example.ipynb └── autots_univariate_example.ipynb ├── images ├── add_fb_prophet.png ├── install_auto_ts.png └── logo.png ├── requirements.txt ├── setup.py └── updates.md /.github/workflows/google-cloudrun-source.yml: -------------------------------------------------------------------------------- 1 | # This workflow will deploy source code on Cloud Run when a commit is pushed to the "master" branch 2 | # 3 | # Overview: 4 | # 5 | # 1. Authenticate to Google Cloud 6 | # 2. Deploy it to Cloud Run 7 | # 8 | # To configure this workflow: 9 | # 10 | # 1. Ensure the required Google Cloud APIs are enabled: 11 | # 12 | # Cloud Run run.googleapis.com 13 | # Cloud Build cloudbuild.googleapis.com 14 | # Artifact Registry artifactregistry.googleapis.com 15 | # 16 | # 2. Create and configure Workload Identity Federation for GitHub (https://github.com/google-github-actions/auth#setting-up-workload-identity-federation) 17 | # 18 | # 3. Ensure the required IAM permissions are granted 19 | # 20 | # Cloud Run 21 | # roles/run.admin 22 | # roles/iam.serviceAccountUser (to act as the Cloud Run runtime service account) 23 | # 24 | # Cloud Build 25 | # roles/cloudbuild.builds.editor 26 | # 27 | # Cloud Storage 28 | # roles/storage.objectAdmin 29 | # 30 | # Artifact Registry 31 | # roles/artifactregistry.admin (project or repository level) 32 | # 33 | # NOTE: You should always follow the principle of least privilege when assigning IAM roles 34 | # 35 | # 4. Create GitHub secrets for WIF_PROVIDER and WIF_SERVICE_ACCOUNT 36 | # 37 | # 5. Change the values for the SERVICE and REGION environment variables (below). 
38 | # 39 | # For more support on how to run this workflow, please visit https://github.com/marketplace/actions/deploy-to-cloud-run 40 | # 41 | # Further reading: 42 | # Cloud Run runtime service account - https://cloud.google.com/run/docs/securing/service-identity 43 | # Cloud Run IAM permissions - https://cloud.google.com/run/docs/deploying-source-code#permissions_required_to_deploy 44 | # Cloud Run builds from source - https://cloud.google.com/run/docs/deploying-source-code 45 | # Principle of least privilege - https://cloud.google.com/blog/products/identity-security/dont-get-pwned-practicing-the-principle-of-least-privilege 46 | 47 | name: Deploy to Cloud Run from Source 48 | 49 | on: 50 | push: 51 | branches: [ "master" ] 52 | 53 | env: 54 | PROJECT_ID: YOUR_PROJECT_ID # TODO: update Google Cloud project id 55 | SERVICE: YOUR_SERVICE_NAME # TODO: update Cloud Run service name 56 | REGION: YOUR_SERVICE_REGION # TODO: update Cloud Run service region 57 | 58 | jobs: 59 | deploy: 60 | # Add 'id-token' with the intended permissions for workload identity federation 61 | permissions: 62 | contents: 'read' 63 | id-token: 'write' 64 | 65 | runs-on: ubuntu-latest 66 | steps: 67 | - name: Checkout 68 | uses: actions/checkout@v2 69 | 70 | - name: Google Auth 71 | id: auth 72 | uses: 'google-github-actions/auth@v0' 73 | with: 74 | workload_identity_provider: '${{ secrets.WIF_PROVIDER }}' # e.g. - projects/123456789/locations/global/workloadIdentityPools/my-pool/providers/my-provider 75 | service_account: '${{ secrets.WIF_SERVICE_ACCOUNT }}' # e.g. - my-service-account@my-project.iam.gserviceaccount.com 76 | 77 | # NOTE: Alternative option - authentication via credentials json 78 | # - name: Google Auth 79 | # id: auth 80 | # uses: 'google-github-actions/auth@v0' 81 | # with: 82 | # credentials_json: '${{ secrets.GCP_CREDENTIALS }}' 83 | 84 | - name: Deploy to Cloud Run 85 | id: deploy 86 | uses: google-github-actions/deploy-cloudrun@v0 87 | with: 88 | service: ${{ env.SERVICE }} 89 | region: ${{ env.REGION }} 90 | # NOTE: If required, update to the appropriate source folder 91 | source: ./ 92 | 93 | # If required, use the Cloud Run url output in later steps 94 | - name: Show Output 95 | run: echo ${{ steps.deploy.outputs.url }} 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # VSCode project settings 118 | .vscode 119 | *.code-workspace 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /.mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist=lxml.etree 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python module names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 
34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 63 | disable=print-statement, 64 | parameter-unpacking, 65 | unpacking-in-except, 66 | old-raise-syntax, 67 | backtick, 68 | long-suffix, 69 | old-ne-operator, 70 | old-octal-literal, 71 | import-star-module-level, 72 | non-ascii-bytes-literal, 73 | raw-checker-failed, 74 | bad-inline-option, 75 | locally-disabled, 76 | file-ignored, 77 | suppressed-message, 78 | useless-suppression, 79 | deprecated-pragma, 80 | use-symbolic-message-instead, 81 | apply-builtin, 82 | basestring-builtin, 83 | buffer-builtin, 84 | cmp-builtin, 85 | coerce-builtin, 86 | execfile-builtin, 87 | file-builtin, 88 | long-builtin, 89 | raw_input-builtin, 90 | reduce-builtin, 91 | standarderror-builtin, 92 | unicode-builtin, 93 | xrange-builtin, 94 | coerce-method, 95 | delslice-method, 96 | getslice-method, 97 | setslice-method, 98 | no-absolute-import, 99 | old-division, 100 | dict-iter-method, 101 | dict-view-method, 102 | next-method-called, 103 | metaclass-assignment, 104 | indexing-exception, 105 | raising-string, 106 | reload-builtin, 107 | oct-method, 108 | hex-method, 109 | nonzero-method, 110 | cmp-method, 111 | input-builtin, 112 | round-builtin, 113 | intern-builtin, 114 | unichr-builtin, 115 | map-builtin-not-iterating, 116 | zip-builtin-not-iterating, 117 | range-builtin-not-iterating, 118 | filter-builtin-not-iterating, 119 | using-cmp-argument, 120 | eq-without-hash, 121 | div-method, 122 | idiv-method, 123 | rdiv-method, 124 | exception-message-attribute, 125 | invalid-str-codec, 126 | sys-max-int, 127 | bad-python3-import, 128 | deprecated-string-function, 129 | deprecated-str-translate-call, 130 | deprecated-itertools-function, 131 | deprecated-types-field, 132 | next-method-defined, 133 | dict-items-not-iterating, 134 | dict-keys-not-iterating, 135 | dict-values-not-iterating, 136 | deprecated-operator-function, 137 | deprecated-urllib-function, 138 | xreadlines-attribute, 139 | deprecated-sys-function, 140 | exception-escape, 141 | comprehension-escape 142 | 143 | # Enable the message, report, category or checker with the given id(s). 
You can 144 | # either give multiple identifier separated by comma (,) or put this option 145 | # multiple time (only on the command line, not in the configuration file where 146 | # it should appear only once). See also the "--disable" option for examples. 147 | enable=c-extension-no-member 148 | 149 | 150 | [REPORTS] 151 | 152 | # Python expression which should return a score less than or equal to 10. You 153 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 154 | # which contain the number of messages in each category, as well as 'statement' 155 | # which is the total number of statements analyzed. This score is used by the 156 | # global evaluation report (RP0004). 157 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 158 | 159 | # Template used to display messages. This is a python new-style format string 160 | # used to format the message information. See doc for all details. 161 | #msg-template= 162 | 163 | # Set the output format. Available formats are text, parseable, colorized, json 164 | # and msvs (visual studio). You can also give a reporter class, e.g. 165 | # mypackage.mymodule.MyReporterClass. 166 | output-format=text 167 | 168 | # Tells whether to display a full report or only the messages. 169 | reports=no 170 | 171 | # Activate the evaluation score. 172 | score=yes 173 | 174 | 175 | [REFACTORING] 176 | 177 | # Maximum number of nested blocks for function / method body 178 | max-nested-blocks=5 179 | 180 | # Complete name of functions that never returns. When checking for 181 | # inconsistent-return-statements if a never returning function is called then 182 | # it will be considered as an explicit return statement and no message will be 183 | # printed. 184 | never-returning-functions=sys.exit 185 | 186 | 187 | [STRING] 188 | 189 | # This flag controls whether the implicit-str-concat-in-sequence should 190 | # generate a warning on implicit string concatenation in sequences defined over 191 | # several lines. 192 | check-str-concat-over-line-jumps=no 193 | 194 | 195 | [SPELLING] 196 | 197 | # Limits count of emitted suggestions for spelling mistakes. 198 | max-spelling-suggestions=4 199 | 200 | # Spelling dictionary name. Available dictionaries: none. To make it work, 201 | # install the python-enchant package. 202 | spelling-dict= 203 | 204 | # List of comma separated words that should not be checked. 205 | spelling-ignore-words= 206 | 207 | # A path to a file that contains the private dictionary; one word per line. 208 | spelling-private-dict-file= 209 | 210 | # Tells whether to store unknown words to the private dictionary (see the 211 | # --spelling-private-dict-file option) instead of raising a message. 212 | spelling-store-unknown-words=no 213 | 214 | 215 | [TYPECHECK] 216 | 217 | # List of decorators that produce context managers, such as 218 | # contextlib.contextmanager. Add to this list to register other decorators that 219 | # produce valid context managers. 220 | contextmanager-decorators=contextlib.contextmanager 221 | 222 | # List of members which are set dynamically and missed by pylint inference 223 | # system, and so shouldn't trigger E1101 when accessed. Python regular 224 | # expressions are accepted. 225 | generated-members= 226 | 227 | # Tells whether missing members accessed in mixin class should be ignored. A 228 | # mixin class is detected if its name ends with "mixin" (case insensitive). 
229 | ignore-mixin-members=yes 230 | 231 | # Tells whether to warn about missing members when the owner of the attribute 232 | # is inferred to be None. 233 | ignore-none=yes 234 | 235 | # This flag controls whether pylint should warn about no-member and similar 236 | # checks whenever an opaque object is returned when inferring. The inference 237 | # can return multiple potential results while evaluating a Python object, but 238 | # some branches might not be evaluated, which results in partial inference. In 239 | # that case, it might be useful to still emit no-member and other checks for 240 | # the rest of the inferred objects. 241 | ignore-on-opaque-inference=yes 242 | 243 | # List of class names for which member attributes should not be checked (useful 244 | # for classes with dynamically set attributes). This supports the use of 245 | # qualified names. 246 | ignored-classes=optparse.Values,thread._local,_thread._local 247 | 248 | # List of module names for which member attributes should not be checked 249 | # (useful for modules/projects where namespaces are manipulated during runtime 250 | # and thus existing member attributes cannot be deduced by static analysis). It 251 | # supports qualified module names, as well as Unix pattern matching. 252 | ignored-modules= 253 | 254 | # Show a hint with possible names when a member name was not found. The aspect 255 | # of finding the hint is based on edit distance. 256 | missing-member-hint=yes 257 | 258 | # The minimum edit distance a name should have in order to be considered a 259 | # similar match for a missing member name. 260 | missing-member-hint-distance=1 261 | 262 | # The total number of similar names that should be taken in consideration when 263 | # showing a hint for a missing member. 264 | missing-member-max-choices=1 265 | 266 | # List of decorators that change the signature of a decorated function. 267 | signature-mutators= 268 | 269 | 270 | [BASIC] 271 | 272 | # Naming style matching correct argument names. 273 | argument-naming-style=snake_case 274 | 275 | # Regular expression matching correct argument names. Overrides argument- 276 | # naming-style. 277 | #argument-rgx= 278 | 279 | # Naming style matching correct attribute names. 280 | attr-naming-style=snake_case 281 | 282 | # Regular expression matching correct attribute names. Overrides attr-naming- 283 | # style. 284 | #attr-rgx= 285 | 286 | # Bad variable names which should always be refused, separated by a comma. 287 | bad-names=foo, 288 | bar, 289 | baz, 290 | toto, 291 | tutu, 292 | tata 293 | 294 | # Naming style matching correct class attribute names. 295 | class-attribute-naming-style=any 296 | 297 | # Regular expression matching correct class attribute names. Overrides class- 298 | # attribute-naming-style. 299 | #class-attribute-rgx= 300 | 301 | # Naming style matching correct class names. 302 | class-naming-style=PascalCase 303 | 304 | # Regular expression matching correct class names. Overrides class-naming- 305 | # style. 306 | #class-rgx= 307 | 308 | # Naming style matching correct constant names. 309 | const-naming-style=UPPER_CASE 310 | 311 | # Regular expression matching correct constant names. Overrides const-naming- 312 | # style. 313 | #const-rgx= 314 | 315 | # Minimum line length for functions/classes that require docstrings, shorter 316 | # ones are exempt. 317 | docstring-min-length=-1 318 | 319 | # Naming style matching correct function names. 
320 | function-naming-style=snake_case 321 | 322 | # Regular expression matching correct function names. Overrides function- 323 | # naming-style. 324 | #function-rgx= 325 | 326 | # Good variable names which should always be accepted, separated by a comma. 327 | good-names=i, 328 | j, 329 | k, 330 | ex, 331 | Run, 332 | _ 333 | 334 | # Include a hint for the correct naming format with invalid-name. 335 | include-naming-hint=no 336 | 337 | # Naming style matching correct inline iteration names. 338 | inlinevar-naming-style=any 339 | 340 | # Regular expression matching correct inline iteration names. Overrides 341 | # inlinevar-naming-style. 342 | #inlinevar-rgx= 343 | 344 | # Naming style matching correct method names. 345 | method-naming-style=snake_case 346 | 347 | # Regular expression matching correct method names. Overrides method-naming- 348 | # style. 349 | #method-rgx= 350 | 351 | # Naming style matching correct module names. 352 | module-naming-style=snake_case 353 | 354 | # Regular expression matching correct module names. Overrides module-naming- 355 | # style. 356 | #module-rgx= 357 | 358 | # Colon-delimited sets of names that determine each other's naming style when 359 | # the name regexes allow several styles. 360 | name-group= 361 | 362 | # Regular expression which should only match function or class names that do 363 | # not require a docstring. 364 | no-docstring-rgx=^_ 365 | 366 | # List of decorators that produce properties, such as abc.abstractproperty. Add 367 | # to this list to register other decorators that produce valid properties. 368 | # These decorators are taken in consideration only for invalid-name. 369 | property-classes=abc.abstractproperty 370 | 371 | # Naming style matching correct variable names. 372 | variable-naming-style=snake_case 373 | 374 | # Regular expression matching correct variable names. Overrides variable- 375 | # naming-style. 376 | #variable-rgx= 377 | 378 | 379 | [SIMILARITIES] 380 | 381 | # Ignore comments when computing similarities. 382 | ignore-comments=yes 383 | 384 | # Ignore docstrings when computing similarities. 385 | ignore-docstrings=yes 386 | 387 | # Ignore imports when computing similarities. 388 | ignore-imports=no 389 | 390 | # Minimum lines number of a similarity. 391 | min-similarity-lines=4 392 | 393 | 394 | [MISCELLANEOUS] 395 | 396 | # List of note tags to take in consideration, separated by a comma. 397 | notes=FIXME, 398 | XXX, 399 | TODO 400 | 401 | 402 | [LOGGING] 403 | 404 | # Format style used to check logging format string. `old` means using % 405 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. 406 | logging-format-style=old 407 | 408 | # Logging modules to check that the string format arguments are in logging 409 | # function parameter format. 410 | logging-modules=logging 411 | 412 | 413 | [VARIABLES] 414 | 415 | # List of additional names supposed to be defined in builtins. Remember that 416 | # you should avoid defining new builtins when possible. 417 | additional-builtins= 418 | 419 | # Tells whether unused global variables should be treated as a violation. 420 | allow-global-unused-variables=yes 421 | 422 | # List of strings which can identify a callback function by name. A callback 423 | # name must start or end with one of those strings. 424 | callbacks=cb_, 425 | _cb 426 | 427 | # A regular expression matching the name of dummy variables (i.e. expected to 428 | # not be used). 
429 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 430 | 431 | # Argument names that match this expression will be ignored. Default to name 432 | # with leading underscore. 433 | ignored-argument-names=_.*|^ignored_|^unused_ 434 | 435 | # Tells whether we should check for unused import in __init__ files. 436 | init-import=no 437 | 438 | # List of qualified module names which can have objects that can redefine 439 | # builtins. 440 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 441 | 442 | 443 | [FORMAT] 444 | 445 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 446 | expected-line-ending-format= 447 | 448 | # Regexp for a line that is allowed to be longer than the limit. 449 | ignore-long-lines=^\s*(# )??$ 450 | 451 | # Number of spaces of indent required inside a hanging or continued line. 452 | indent-after-paren=4 453 | 454 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 455 | # tab). 456 | indent-string=' ' 457 | 458 | # Maximum number of characters on a single line. 459 | max-line-length=120 460 | 461 | # Maximum number of lines in a module. 462 | max-module-lines=1000 463 | 464 | # List of optional constructs for which whitespace checking is disabled. `dict- 465 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 466 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 467 | # `empty-line` allows space-only lines. 468 | no-space-check=trailing-comma, 469 | dict-separator 470 | 471 | # Allow the body of a class to be on the same line as the declaration if body 472 | # contains single statement. 473 | single-line-class-stmt=no 474 | 475 | # Allow the body of an if to be on the same line as the test if there is no 476 | # else. 477 | single-line-if-stmt=no 478 | 479 | 480 | [IMPORTS] 481 | 482 | # List of modules that can be imported at any level, not just the top level 483 | # one. 484 | allow-any-import-level= 485 | 486 | # Allow wildcard imports from modules that define __all__. 487 | allow-wildcard-with-all=no 488 | 489 | # Analyse import fallback blocks. This can be used to support both Python 2 and 490 | # 3 compatible code, which means that the block might have code that exists 491 | # only in one or another interpreter, leading to false positives when analysed. 492 | analyse-fallback-blocks=no 493 | 494 | # Deprecated modules which should not be used, separated by a comma. 495 | deprecated-modules=optparse,tkinter.tix 496 | 497 | # Create a graph of external dependencies in the given file (report RP0402 must 498 | # not be disabled). 499 | ext-import-graph= 500 | 501 | # Create a graph of every (i.e. internal and external) dependencies in the 502 | # given file (report RP0402 must not be disabled). 503 | import-graph= 504 | 505 | # Create a graph of internal dependencies in the given file (report RP0402 must 506 | # not be disabled). 507 | int-import-graph= 508 | 509 | # Force import order to recognize a module as part of the standard 510 | # compatibility libraries. 511 | known-standard-library= 512 | 513 | # Force import order to recognize a module as part of a third party library. 514 | known-third-party=enchant 515 | 516 | # Couples of modules and preferred modules, separated by a comma. 517 | preferred-modules= 518 | 519 | 520 | [DESIGN] 521 | 522 | # Maximum number of arguments for function / method. 523 | max-args=12 524 | 525 | # Maximum number of attributes for a class (see R0902). 
526 | max-attributes=7 527 | 528 | # Maximum number of boolean expressions in an if statement (see R0916). 529 | max-bool-expr=5 530 | 531 | # Maximum number of branch for function / method body. 532 | max-branches=12 533 | 534 | # Maximum number of locals for function / method body. 535 | max-locals=15 536 | 537 | # Maximum number of parents for a class (see R0901). 538 | max-parents=7 539 | 540 | # Maximum number of public methods for a class (see R0904). 541 | max-public-methods=20 542 | 543 | # Maximum number of return / yield for function / method body. 544 | max-returns=6 545 | 546 | # Maximum number of statements in function / method body. 547 | max-statements=100 548 | 549 | # Minimum number of public methods for a class (see R0903). 550 | min-public-methods=2 551 | 552 | 553 | [CLASSES] 554 | 555 | # List of method names used to declare (i.e. assign) instance attributes. 556 | defining-attr-methods=__init__, 557 | __new__, 558 | setUp, 559 | __post_init__ 560 | 561 | # List of member names, which should be excluded from the protected access 562 | # warning. 563 | exclude-protected=_asdict, 564 | _fields, 565 | _replace, 566 | _source, 567 | _make 568 | 569 | # List of valid names for the first argument in a class method. 570 | valid-classmethod-first-arg=cls 571 | 572 | # List of valid names for the first argument in a metaclass class method. 573 | valid-metaclass-classmethod-first-arg=cls 574 | 575 | 576 | [EXCEPTIONS] 577 | 578 | # Exceptions that will emit a warning when being caught. Defaults to 579 | # "BaseException, Exception". 580 | overgeneral-exceptions=BaseException, 581 | Exception 582 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | ------------------------------- 2 | version_number = '0.0.24.b2' 3 | ------------------------------- 4 | Fixed bug with Prophet Multivariate Prediction where it needed to pass forecast_period in 5 | addition to X_exogen. Only X_exogen is needed now. Forecast Period is calculated based on the 6 | number of observations in the X_exogen data. 7 | 8 | TODO: Make sure all predict functions are consistent (Prophet now has an Optional 9 | return if things go wrong. Others should do the same.) 10 | 11 | ------------------------------- 12 | version_number = '0.0.24' 13 | ------------------------------- 14 | Added 'auto_arima' capability from pmdarima library 15 | 16 | ------------------------------- 17 | version_number = '0.0.23.b4' 18 | ------------------------------- 19 | Changed default argument for 'sep' in fit function to be 'None' (treated as ',' internally). 20 | 21 | Fixed bug with predict function in auto_ts 22 | Dataframe index for X_exogen needed to be set before passing to predict since we were doing the same 23 | while fitting. Without this, it was causing issues with ML models where we are internally 24 | constructing the 'future dataframe': if, while fitting, the dataframe had a datetime index and, while 25 | predicting, X_exogen had an integer index (the index was still in a dataframe column in X_exogen), it was 26 | causing issues while adding time series features (could not get time series features from integers). 27 | 28 | 29 | ------------------------------- 30 | version_number = '0.0.23.b3' 31 | ------------------------------- 32 | More time series engineered features included in ML models 33 | Example, 'dayofweek', 'quarter', 'month', 'year', 'dayofyear', 'weekofyear', 'weekend', etc.
34 | 35 | 36 | ------------------------------- 37 | version_number = '0.0.23.b2' 38 | ------------------------------- 39 | Fixed bug in Prophet rolling window horizon calculation 40 | 41 | 42 | ------------------------------- 43 | version_number = '0.0.23' 44 | ------------------------------- 45 | Prophet now includes multivariate modeling capability with rolling window 46 | SARIMAX also includes multivariate modeling capability with rolling window -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include auto_ts/py.typed # marker file for PEP 561 3 | 4 | include CHANGELOG.md 5 | include LICENSE 6 | include CITATION.cff 7 | include *.cff # citation info 8 | 9 | include MANIFEST.in 10 | include pyproject.toml 11 | include setup.py 12 | include setup.cfg 13 | 14 | include requirements.txt 15 | 16 | recursive-exclude tests * 17 | recursive-exclude docs * 18 | recursive-exclude site * 19 | recursive-exclude example_datasets * 20 | recursive-exclude example_notebooks * 21 | recursive-exclude .github * 22 | 23 | exclude .flake8 24 | exclude .gitignore 25 | exclude .mypy.ini 26 | exclude .pre-commit-config.yaml 27 | exclude .pylintrc 28 | exclude Makefile 29 | exclude updates.md 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Auto_TS: Auto_TimeSeries

2 |

Automatically build multiple Time Series models using a Single Line of Code. Now updated with Dask. 3 | 4 | ![auto-ts](images/logo.png) 5 | 6 | `auto_timeseries` is a complex model building utility for time series data. Since it automates many 7 | tasks involved in a complex endeavor, it assumes many intelligent defaults, but you can change them. 8 | Auto_Timeseries will rapidly build predictive models based on Statsmodels ARIMA, Seasonal ARIMA, Prophet 9 | and Scikit-Learn ML. It will automatically select the best model based on the score type you specify. 10 | 11 | # Table of Contents 12 |

24 | 25 | ## Latest 26 | If you are looking for the latest and greatest updates about our library, check out our [updates page](https://github.com/AutoViML/Auto_TS/blob/master/updates.md). 27 |
28 | 29 | ## Citation 30 | If you use Auto_TS in your research project or paper, please use the following format for citations: 31 | 32 | "Seshadri, Ram (2020). GitHub - AutoViML/Auto_TS: enables you to build and deploy multiple time series models using ML and statistical techniques with a single line of code. Source code: https://github.com/AutoViML/Auto_TS" 33 | 34 |

## Introduction

35 | 36 | Auto_TS (Auto_TimeSeries) enables you to build and select multiple time series models using techniques such as ARIMA, SARIMAX, VAR, decomposable (trend+seasonality+holidays) models, and ensemble machine learning models. 37 | 38 | Auto_TimeSeries is an Automated ML library for time series data. Auto_TimeSeries was initially conceived and developed by [Ram Seshadri](https://www.linkedin.com/in/ram-seshadri-nyc-nj/) and was significantly expanded in functionality and scope, and upgraded to its present status, by [Nikhil Gupta](https://github.com/ngupta23). 39 | 40 | auto-ts.Auto_TimeSeries is the main function that you will call with your train data. You can then choose what kind of models you want: stats, ML, or Prophet-based models. You can also tell it to automatically select the best model based on the scoring parameter you specify. It will return the best model and a dictionary containing predictions for the number of forecast_periods you specify (default=2). 41 | 42 | ## Install 43 | 44 | ```bash 45 | pip install auto-ts 46 | ``` 47 | 48 | Use `pip3 install auto-ts` if the above doesn't work, or install directly from GitHub: 49 | 50 | ```bash 51 | pip install git+https://github.com/AutoViML/Auto_TS.git 52 | ``` 53 | 54 | ### Installing on Colab 55 | If you are using a Colab or Kaggle kernel and want to install auto_ts, please use the following steps (otherwise you will get an error!): 56 | 57 | ``` 58 | !pip install auto-ts --no-deps --ignore-installed 59 | !pip install 'fsspec>=0.3.3' 60 | !pip install statsmodels --upgrade 61 | !pip install pmdarima 62 | ``` 63 | 64 | ![auto_ts_colab](images/install_auto_ts.png) 65 | 66 | ### Installing on Windows 67 | 68 | Windows users may experience difficulties with the Prophet and pystan dependency installations. Because of this, we recommend installing Prophet using instructions from the [Prophet documentation page](https://facebook.github.io/prophet/docs/installation.html) prior to installing auto-ts. For Anaconda users, this can be accomplished via: 69 | ```bash 70 | conda install -c conda-forge prophet 71 | pip install auto-ts 72 | ``` 73 | 74 |

## Usage

75 | 76 | ### First, you need to import auto_timeseries from the auto_ts library:
77 | 78 | ```py 79 | from auto_ts import auto_timeseries 80 | ``` 81 | 82 | ### Second, initialize an auto_timeseries model object which will hold all your parameters: 83 | 84 | ```py 85 | model = auto_timeseries( 86 | score_type='rmse', 87 | time_interval='Month', 88 | non_seasonal_pdq=None, seasonality=False, 89 | seasonal_period=12, 90 | model_type=['Prophet'], 91 | verbose=2, 92 | ) 93 | ``` 94 | 95 | #### Here is how the input parameters are defined: 96 | 97 | - **score_type (default='rmse')**: The metric used for scoring the models. Type is string. 98 | Currently only the following two types are supported: 99 | 1. "rmse": Root Mean Squared Error (RMSE) 100 | 1. "normalized_rmse": Ratio of RMSE to the standard deviation of actuals 101 | - **time_interval (default is None)**: Used to indicate the frequency at which the data is collected. 102 | This is used for two purposes: (1) to build the Prophet model and (2) to impute the seasonal period for SARIMAX in case it is not provided by the user (None). Type is String. We use the following [pandas date range frequency](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases) aliases that Prophet uses to make the prediction dataframe.

Hence, please note that this is the list of allowed aliases for frequency: 103 | `['B','C','D','W','M','SM','BM','CBM', 104 | 'MS','SMS','BMS','CBMS','Q','BQ','QS','BQS', 105 | 'A,Y','BA,BY','AS,YS','BAS,BYS','BH', 106 | 'H','T,min','S','L,ms','U,us','N']` 107 | To start, you can try the following codes for your data and see how the results look (or you can leave it as None and auto_timeseries will try to impute it for you): 108 | - `'MS', 'M', 'SM', 'BM', 'CBM', 'SMS', 'BMS'` for monthly frequency data 109 | - `'D', 'B', 'C'` for daily frequency data 110 | - `'W'` for weekly frequency data 111 | - `'Q', 'BQ', 'QS', 'BQS'` for quarterly frequency data 112 | - `'A,Y', 'BA,BY', 'AS,YS', 'BAS,BYS'` for yearly frequency data 113 | - `'BH', 'H', 'h'` for hourly frequency data 114 | - `'T,min'` for minute frequency data 115 | - `'S', 'L,milliseconds', 'U,microseconds', 'N,nanoseconds'` for second and sub-second frequency data 116 | - **non_seasonal_pdq (default=(3,1,3))**: Indicates the maximum value of (p, d, q) to be used in the search for statistical ARIMA models. 117 | If None, then the following values are assumed: `max_p = 3, max_d = 1, max_q = 3`. Type is Tuple. 118 | - **seasonality (default=False)**: Used in the building of the SARIMAX model only at this time. True or False. Type is bool. 119 | - **seasonal_period (default is None)**: Indicates the seasonal period in your data. This depends on the peak (or valley) period that occurs regularly in your data. 120 | Used in the building of the SARIMAX model only at this time. 121 | There is no impact of this argument if seasonality is set to False. 122 | If None, the program will try to infer this from the time_interval (frequency) of the data. 123 | We assume the following as defaults, but feel free to change them: 124 | 1. If frequency is Monthly, then seasonal_period is assumed to be 12 125 | 1. If frequency is Daily, then seasonal_period is assumed to be 30 (but it could be 7) 126 | 1. If frequency is Weekly, then seasonal_period is assumed to be 52 127 | 1. If frequency is Quarterly, then seasonal_period is assumed to be 4 128 | 1. If frequency is Yearly, then seasonal_period is assumed to be 1 129 | 1. If frequency is Hourly, then seasonal_period is assumed to be 24 130 | 1. If frequency is Minutes, then seasonal_period is assumed to be 60 131 | 1. If frequency is Seconds, then seasonal_period is assumed to be 60 132 | Type is integer. 133 | - **conf_int (default=0.95)**: Confidence Interval for building the Prophet model. Default: 0.95. Type is float. 134 | - **model_type (default='stats')**: The type(s) of model to build. Defaults to building only statistical models. If a list is provided, then only those models will be built. Can be a string or a list of models. Allowed values are: 135 | `'best', 'prophet', 'stats', 'ARIMA', 'SARIMAX', 'VAR', 'ML'`. 136 | - `"prophet"` will build a model using Prophet -> this means you must have Prophet installed 137 | - `"stats"` will build statsmodels-based ARIMA, SARIMAX and VAR models 138 | - `"ML"` will build a machine learning model using Random Forests, provided explanatory vars are given 139 | - `"best"` will try to build all models and pick the best one 140 | - **verbose (default=0)**: Indicates the verbosity of printing. Type is integer. 141 | 142 | WARNING: "best" might take some time for large data sets. We recommend that you 143 | choose a small sample from your data set before attempting to run the entire data set.
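For instance, for daily data with weekly seasonality, the same constructor could be configured as in the sketch below. The values shown are illustrative assumptions only, not recommendations:

```py
from auto_ts import auto_timeseries

# Illustrative settings for daily data (assumed values; adjust to your own series)
model = auto_timeseries(
    score_type='rmse',            # or 'normalized_rmse'
    time_interval='D',            # daily frequency alias
    non_seasonal_pdq=None,        # search up to max_p=3, max_d=1, max_q=3
    seasonality=True,
    seasonal_period=7,            # weekly seasonality for daily data
    model_type=['SARIMAX', 'ML'],
    verbose=1,
)
```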
144 | 145 | ### The next step after defining the model object is to fit it with some real data: 146 | 147 | ```py 148 | model.fit( 149 | traindata=train_data, 150 | ts_column=ts_column, 151 | target=target, 152 | cv=5, 153 | sep="," 154 | ) 155 | ``` 156 | 157 | Here is how the parameters are defined: 158 | - **traindata (required)**: It can be either a dataframe or a file. If it is a file, you must give its name along with its path. It also accepts a pandas dataframe in case you already have a dataframe loaded in your notebook. 159 | - **ts_column (required)**: name of the datetime column in your dataset (it can be the column name or its index number in the columns index). 160 | - **target (required)**: name of the column you are trying to predict. Target could also be the only column in your dataset. 161 | - **cv (default=5)**: You can enter any integer for the number of folds you want in your cross validation data set. 162 | - **sep (default=",")**: Sep is the separator in your traindata file. If your separator is ",", "\t", or ";", make sure you enter it here. If not, it is ignored. 163 | 164 | ### The next step after training the model object is to make some predictions with test data: 165 | 166 | ```py 167 | predictions = model.predict( 168 | testdata = ..., # can be either a dataframe or an integer standing for the forecast_period, 169 | model = 'best' # or any other string that stands for the trained model 170 | ) 171 | ``` 172 | 173 | Here is how the parameters are defined. You can either send testdata in the form of a dataframe or send an integer for the number of periods you want to forecast. You only need to provide: 174 | - **testdata (required)**: It can be either a dataframe containing test data or an integer standing for the forecast_period you want. 175 | - **model (optional, default='best')**: The name of the model you want to use among the many different models you have trained. Remember that the default is the best model, but you can choose any model that you want to forecast with. Type is String. 176 | 177 |
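Putting the pieces above together, a minimal end-to-end run might look like the sketch below. The CSV file name and the 'Date'/'Sales' column names are hypothetical placeholders used only for illustration:

```py
import pandas as pd

from auto_ts import auto_timeseries

# Hypothetical data set: 'sales.csv' with a 'Date' column and a 'Sales' target
train_data = pd.read_csv('sales.csv')

model = auto_timeseries(
    score_type='rmse',
    time_interval='M',      # monthly data
    model_type='best',      # try all model types and keep the best one
    verbose=0,
)
model.fit(traindata=train_data, ts_column='Date', target='Sales', cv=5)

# Forecast the next 12 periods using the best model found during fit
predictions = model.predict(testdata=12, model='best')
print(predictions)
```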

## Requirements

178 | dask, scikit-learn, prophet, statsmodels, pmdarima, XGBoost 179 | 180 |

## License

181 | Apache License 2.0 182 | 183 |

## Tips

184 | 185 | - We recommend that you choose a small sample from your data set before attempting to run the entire data set, and that you specify the evaluation metric so that auto-ts can select the best model. Currently, models within “stats” are compared using AIC and BIC. However, models across different types are compared using RMSE. The results of models are shown using RMSE and Normalized RMSE (ratio of RMSE to the standard deviation of actuals). 186 | - You must clean the data and not have any missing values. Make sure the target variable is numeric, otherwise it won’t run. If there is more than one target variable in your data set, just specify one for now, and if you know the time interval that is in your data, you can specify it. Otherwise, auto-ts will try to infer the time interval on its own. 187 | - If you give Auto_Timeseries a different time interval than what the data has, it will automatically resample the data to the given time interval and use the mean of the target for the resampled period. 188 | - Notice that except for the filename and ts_column input arguments, which are required, all other arguments are optional. 189 | - Note that optionally you can give a separator for the data in your file. Default is comma (","). 190 | - “time_interval” options are any of the codes that you can find on the page below: 191 | [Pandas date-range frequency aliases](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases) 192 | - Optionally, you can give seasonal_period as any integer that measures the seasonality in the data. If not given, seasonal_period is assumed automatically as follows: 193 | - Months = 12, 194 | - Days = 30, 195 | - Weeks = 52, 196 | - Qtr = 4, 197 | - Year = 1, 198 | - Hours = 24, 199 | - Minutes = 60 and 200 | - Seconds = 60. 201 | - If you want to give your own non-seasonal order, please input it as non_seasonal_pdq, and for the seasonal order, use seasonal_PDQ as the input. Use tuples. For example, `seasonal_PDQ = (2,1,2)` and `non_seasonal_pdq = (0,0,3)`. It will accept only tuples. The default is None and Auto_Timeseries will automatically search for the best p,d,q (for Non Seasonal) and P, D, Q (for Seasonal) orders by searching for all parameters from 0 to 12 for each value of p,d,q, 0-3 for each P, Q, and 0-1 for D (see the sketch after this list). 202 | 203 |
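As an illustration of the last tip above, the orders can be passed as tuples when constructing the model. This is only a sketch with arbitrary example values, assuming seasonal_PDQ is accepted by the constructor as described in that tip:

```py
from auto_ts import auto_timeseries

# Arbitrary example orders; leave them as None to let Auto_Timeseries search for them
model = auto_timeseries(
    time_interval='M',
    seasonality=True,
    seasonal_period=12,
    non_seasonal_pdq=(0, 0, 3),   # maximum non-seasonal (p, d, q) to search
    seasonal_PDQ=(2, 1, 2),       # seasonal (P, D, Q) order
    model_type=['SARIMAX'],
)
```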

## DISCLAIMER

204 | 205 | This is not an Officially supported Google project. 206 | 207 | 208 | 209 | © Google 210 | -------------------------------------------------------------------------------- /auto_ts/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /auto_ts/.gitignore: -------------------------------------------------------------------------------- 1 | # Windows thumbnail cache files 2 | Thumbs.db 3 | ehthumbs.db 4 | ehthumbs_vista.db 5 | 6 | # Folder config file 7 | Desktop.ini 8 | 9 | # Recycle Bin used on file shares 10 | $RECYCLE.BIN/ 11 | 12 | # Windows Installer files 13 | *.cab 14 | *.msi 15 | *.msm 16 | *.msp 17 | 18 | # Windows shortcuts 19 | *.lnk 20 | 21 | # ========================= 22 | # Operating System Files 23 | # ========================= 24 | -------------------------------------------------------------------------------- /auto_ts/__version__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Specifies the version of the Auto_TS package.""" 3 | 4 | __title__ = "Auto_TS" 5 | __author__ = "Ram Seshadri" 6 | __description__ = "Build time series models for any data set, any size. Now using dask." 7 | __url__ = "https://github.com/Auto_ViML/Auto_TS.git" 8 | __version__ = "0.0.92" 9 | __license__ = "Apache License 2.0" 10 | __copyright__ = "2020-22 Google" 11 | -------------------------------------------------------------------------------- /auto_ts/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ar_based import BuildArima, BuildSarimax, BuildAutoSarimax, BuildVAR 2 | from .build_base import BuildBase 3 | from .build_ml import BuildML 4 | from .build_prophet import BuildProphet 5 | from .build_pyflux import build_pyflux_model 6 | -------------------------------------------------------------------------------- /auto_ts/models/ar_based/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_arima import BuildArima 2 | from .build_autoarimax import BuildAutoSarimax 3 | from .build_sarimax import BuildSarimax 4 | from .build_var import BuildVAR 5 | -------------------------------------------------------------------------------- /auto_ts/models/ar_based/build_arima.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | import operator 4 | import warnings 5 | from typing import Optional 6 | 7 | import matplotlib.pyplot as plt # type: ignore 8 | import numpy as np # type: ignore 9 | import pandas as pd # type: ignore 10 | import seaborn as sns # type: ignore 11 | from pandas.core.generic import NDFrame # type:ignore 12 | 13 | sns.set(style="white", color_codes=True) 14 | 15 | # imported ARIMA from statsmodels pkg 16 | from statsmodels.tsa.arima_model import ARIMA # type: ignore 17 | 18 | # helper functions 19 | from ...utils import print_static_rmse, print_dynamic_rmse 20 | 
from ...models.ar_based.param_finder import find_lowest_pq 21 | 22 | 23 | class BuildArima(): 24 | def __init__(self, metric='aic', p_max=3, d_max=1, q_max=3, forecast_period=2, method='mle', verbose=0): 25 | """ 26 | Automatically build an ARIMA Model 27 | """ 28 | self.metric = metric 29 | self.p_max = p_max 30 | self.d_max = d_max 31 | self.q_max = q_max 32 | self.forecast_period = forecast_period 33 | self.method = method 34 | self.verbose = verbose 35 | self.model = None 36 | 37 | def fit(self, ts_df): 38 | """ 39 | Build a Time Series Model using SARIMAX from statsmodels. 40 | 41 | This builds a Non Seasonal ARIMA model given a Univariate time series dataframe with time 42 | as the Index, ts_df can be a dataframe with one column only or a single array. Dont send 43 | Multiple Columns!!! Include only that variable that is a Time Series. DO NOT include 44 | Non-Stationary data. Make sure your Time Series is "Stationary"!! If not, this 45 | will give spurious results, since it automatically builds a Non-Seasonal model, 46 | you need not give it a Seasonal True/False flag. 47 | "metric": You can give it any of the following metrics as criteria: AIC, BIC, Deviance, 48 | Log-likelihood. Optionally, you can give it a fit method as one of the following: 49 | {'css-mle','mle','css'} 50 | """ 51 | 52 | solver = 'lbfgs' # default 53 | 54 | p_min = 0 55 | d_min = 0 56 | q_min = 0 57 | # Initialize a DataFrame to store the results 58 | iteration = 0 59 | results_dict = {} 60 | 61 | ################################################################################ 62 | ####### YOU MUST Absolutely set this parameter correctly as "levels". If not, 63 | #### YOU WILL GET DIFFERENCED PREDICTIONS WHICH ARE FIENDISHLY DIFFICULT TO UNDO. 64 | #### If you set this to levels, then you can do any order of differencing and 65 | #### ARIMA will give you predictions in the same level as orignal values. 66 | ################################################################################ 67 | pred_type = 'levels' 68 | ######################################################################### 69 | ts_train = ts_df[:-self.forecast_period] 70 | ts_test = ts_df[-self.forecast_period:] 71 | if self.verbose == 1: 72 | print('Data Set split into train %s and test %s for Cross Validation Purposes' 73 | % (ts_train.shape, ts_test.shape)) 74 | ######################################################################### 75 | if ts_train.dtype == 'int64': 76 | ts_train = ts_train.astype(float) 77 | for d_val in range(d_min, self.d_max+1): 78 | print('\nDifferencing = %d' % d_val) 79 | results_bic = pd.DataFrame( 80 | index=['AR{}'.format(i) for i in range(p_min, self.p_max+1)], 81 | columns=['MA{}'.format(i) for i in range(q_min, self.q_max+1)] 82 | ) 83 | for p_val, q_val in itertools.product(range(p_min, self.p_max+1), range(q_min, self.q_max+1)): 84 | if p_val == 0 and d_val == 0 and q_val == 0: 85 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = np.nan 86 | continue 87 | else: 88 | try: 89 | model = ARIMA(ts_train, order=(p_val, d_val, q_val)) 90 | results = model.fit(transparams=False, method=self.method, solver=solver, disp=False) 91 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = eval('results.' + self.metric) 92 | if iteration % 10 == 0: 93 | print(' Iteration %d completed...' 
% iteration) 94 | iteration += 1 95 | if iteration >= 100: 96 | print(' Ending Iterations at %d' % iteration) 97 | break 98 | except: 99 | iteration += 1 100 | continue 101 | results_bic = results_bic[results_bic.columns].astype(float) 102 | interim_d = copy.deepcopy(d_val) 103 | interim_p, interim_q, interim_bic = find_lowest_pq(results_bic) 104 | if self.verbose == 1: 105 | _, ax = plt.subplots(figsize=(20, 10)) 106 | ax = sns.heatmap(results_bic, 107 | mask=results_bic.isnull(), 108 | ax=ax, 109 | annot=True, 110 | fmt='.0f') 111 | ax.set_title(self.metric) 112 | results_dict[str(interim_p) + ' ' + str(interim_d) + ' ' + str(interim_q)] = interim_bic 113 | best_bic = min(results_dict.items(), key=operator.itemgetter(1))[1] 114 | best_pdq = min(results_dict.items(), key=operator.itemgetter(1))[0] 115 | best_p = int(best_pdq.split(' ')[0]) 116 | best_d = int(best_pdq.split(' ')[1]) 117 | best_q = int(best_pdq.split(' ')[2]) 118 | print('\nBest model is: Non Seasonal ARIMA(%d,%d,%d), %s = %0.3f' % (best_p, best_d, best_q, self.metric, best_bic)) 119 | bestmodel = ARIMA(ts_train, order=(best_p, best_d, best_q)) 120 | print('#### Fitting best model for full data set now. Will take time... ######') 121 | try: 122 | self.model = bestmodel.fit(transparams=True, method=self.method, solver=solver, disp=False) 123 | except: 124 | self.model = bestmodel.fit(transparams=False, method=self.method, solver=solver, disp=False) 125 | ### this is needed for static forecasts #################### 126 | y_truth = ts_train[:] 127 | y_forecasted = self.model.predict(typ='levels') 128 | concatenated = pd.concat([y_truth, y_forecasted], axis=1, keys=['original', 'predicted']) 129 | if best_d == 0: 130 | #### Do this for ARIMA only ###### 131 | ### If there is no differencing DO NOT use predict_type since it will give an error = do not use "linear". 132 | print('Static Forecasts:') 133 | print_static_rmse(concatenated['original'].values, concatenated['predicted'].values, best_d) 134 | start_date = ts_df.index[-self.forecast_period] 135 | end_date = ts_df.index[-1] 136 | pred_dynamic = self.model.predict(start=start_date, end=end_date, dynamic=True) 137 | if self.verbose == 1: 138 | ax = concatenated[['original', 'predicted']][best_d:].plot() 139 | pred_dynamic.plot(label='Dynamic Forecast', ax=ax, figsize=(15, 5)) 140 | print('Dynamic %d-period Forecasts:' % (self.forecast_period,)) 141 | plt.legend() 142 | plt.show(block=False) 143 | else: 144 | #### Do this for ARIMA only ###### 145 | #### If there is differencing, you must use "levels" as the predict type to get original levels as actuals 146 | pred_type = 'levels' 147 | print('Static Forecasts:') 148 | print_static_rmse(y_truth[best_d:], y_forecasted) 149 | ########### Dynamic One Step Ahead Forecast ########################### 150 | ### Dynamic Forecasts are a better representation of true predictive power 151 | ## since they only use information from the time series up to a certain point, 152 | ## and after that, forecasts are generated using values from previous forecasted 153 | ## time points. 154 | ################################################################################# 155 | 156 | # TODO: Check if this can be changed to use predict function directly. 
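            # The block below runs a dynamic hold-out check over the last
            # `forecast_period` observations: predict() is called with
            # dynamic=True, so every step after `start_date` is conditioned on
            # previously forecasted values rather than the observed ones, and
            # typ='levels' keeps the output in the original (undifferenced) scale.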
157 | start_date = ts_df.index[-self.forecast_period] 158 | end_date = ts_df.index[-1] 159 | pred_dynamic = self.model.predict(typ=pred_type, start=start_date, end=end_date, dynamic=True) 160 | try: 161 | pred_dynamic[pd.to_datetime((pred_dynamic.index-best_d).values[0])] = \ 162 | y_truth[pd.to_datetime((pred_dynamic.index-best_d).values[0])] 163 | except: 164 | print('Dynamic predictions erroring but continuing...') 165 | pred_dynamic.sort_index(inplace=True) 166 | print('\nDynamic %d-period Forecasts:' % self.forecast_period) 167 | if self.verbose == 1: 168 | ax = concatenated.plot() 169 | pred_dynamic.plot(label='Dynamic Forecast', ax=ax, figsize=(15, 5)) 170 | ax.set_xlabel('Date') 171 | ax.set_ylabel('Values') 172 | plt.legend() 173 | plt.show(block=False) 174 | if self.verbose == 1: 175 | try: 176 | self.model.plot_diagnostics(figsize=(16, 12)) 177 | except: 178 | pass 179 | print(self.model.summary()) 180 | 181 | res_frame = self.predict(simple=False) 182 | 183 | if self.verbose == 1: 184 | print('Model Forecast(s):\n', res_frame) 185 | rmse, norm_rmse = print_dynamic_rmse(ts_test, pred_dynamic, ts_train) 186 | return self.model, res_frame, rmse, norm_rmse 187 | 188 | def predict( 189 | self, 190 | testdata: Optional[pd.DataFrame]=None, 191 | forecast_period: Optional[int] = None, 192 | simple: bool = True) -> NDFrame: 193 | """ 194 | Return the predictions 195 | # TODO: Check if the series can be converted to a dataframe for all models. 196 | :rtype cam be Pandas Series (simple), pandas dataframe (simple = False) or None 197 | """ 198 | 199 | # TODO: Add processing of 'simple' argument and return type 200 | 201 | if testdata is not None: 202 | warnings.warn( 203 | "You have passed exogenous variables to make predictions for a ARIMA model." + 204 | "ARIMA models are univariate models and hence these exogenous variables will be ignored for these predictions." 205 | ) 206 | 207 | # TODO: Predictions coming from ARIMA include extra information compared to SARIMAX and VAR. 208 | # Need to make it consistent 209 | # Extract the dynamic predicted and true values of our time series 210 | if forecast_period is None: 211 | # use the forecast period used during training 212 | forecast_period = self.forecast_period 213 | 214 | y_forecasted = self.model.forecast(forecast_period) 215 | 216 | 217 | # TODO: Check if the datetime index can be obtained as in the case of SARIMAX. 218 | # Currently it is just a text index, e.g. Forecast_1, ... 
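        # Illustrative sketch of the frame assembled below (values are made up):
        # with simple=False the result looks like
        #                 mean  mean_se  mean_ci_lower  mean_ci_upper
        #   Forecast_1  105.2      3.1           99.1          111.3
        #   Forecast_2  107.8      3.4          101.1          114.5
        # while simple=True keeps only the mean forecasts, squeezed into a Series.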
219 | if simple: 220 | res_frame = pd.DataFrame([ 221 | y_forecasted[0], # Mean Forecast 222 | ], 223 | index=['mean'], 224 | columns=['Forecast_' + str(x) for x in range(1, forecast_period+1)] 225 | ).T 226 | res_frame = res_frame.squeeze() # Convert to a pandas series object 227 | else: 228 | res_frame = pd.DataFrame([ 229 | y_forecasted[0], # Mean Forecast 230 | y_forecasted[1], # Std Error 231 | y_forecasted[2], # Lower and Upper CI 232 | ], 233 | index=['mean','mean_se','mean_ci'], 234 | columns=['Forecast_' + str(x) for x in range(1, forecast_period+1)] 235 | ).T 236 | 237 | res_frame['mean_ci_lower'] = res_frame['mean_ci'].map(lambda x: x[0]) 238 | res_frame['mean_ci_upper'] = res_frame['mean_ci'].map(lambda x: x[1]) 239 | res_frame.drop('mean_ci', axis=1, inplace=True) 240 | 241 | return res_frame 242 | -------------------------------------------------------------------------------- /auto_ts/models/ar_based/build_arima_base.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Optional 3 | 4 | warnings.filterwarnings(action='ignore') 5 | from abc import abstractmethod 6 | import copy 7 | 8 | import numpy as np # type: ignore 9 | import pandas as pd # type: ignore 10 | from pandas.core.generic import NDFrame # type:ignore 11 | import dask 12 | 13 | import matplotlib.pyplot as plt # type: ignore 14 | 15 | #from tscv import GapWalkForward # type: ignore 16 | from sklearn.model_selection import TimeSeriesSplit 17 | 18 | # imported SARIMAX from statsmodels pkg 19 | from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore 20 | 21 | from ..build_base import BuildBase 22 | 23 | # helper functions 24 | from ...utils import colorful, print_static_rmse, print_ts_model_stats 25 | 26 | 27 | class BuildArimaBase(BuildBase): 28 | def __init__(self, scoring, seasonality=False, seasonal_period=None, p_max=12, 29 | d_max=2, q_max=12, forecast_period=5, verbose=0): 30 | """ 31 | Base class for building any ARIMA model 32 | Definitely applicable to SARIMAX and auto_arima with seasonality 33 | Check later if same can be reused for ARIMA (most likely yes) 34 | """ 35 | super().__init__( 36 | scoring=scoring, 37 | forecast_period=forecast_period, 38 | verbose=verbose 39 | ) 40 | 41 | self.seasonality = seasonality 42 | self.seasonal_period = seasonal_period 43 | self.p_max = p_max 44 | self.d_max = d_max 45 | self.q_max = q_max 46 | 47 | self.best_p = None 48 | self.best_d = None 49 | self.best_q = None 50 | self.best_P = None 51 | self.best_D = None 52 | self.best_Q = None 53 | 54 | 55 | def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int]=None): 56 | """ 57 | Build a Time Series Model using SARIMAX from statsmodels. 58 | """ 59 | 60 | self.original_target_col = target_col 61 | self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]] 62 | 63 | if len(self.original_preds) == 0: 64 | self.univariate = True 65 | else: 66 | self.univariate = False 67 | 68 | 69 | ########################################## 70 | #### Find best pdq and PDQ parameters #### 71 | ########################################## 72 | 73 | # NOTE: We use the entire dataset to compute the pdq and PDQ parameters. 
74 | # Then we use the selected "best" parameters to check how well it 75 | # generalizes across the various folds (which may even be 1) 76 | 77 | # ## Added temporarily 78 | # ts_train = ts_df.iloc[:-self.forecast_period] 79 | # self.find_best_parameters(data = ts_train) 80 | 81 | if self.seasonal_period <= 1: 82 | self.seasonal_period = 2 ### Sarimax cannot have seasonal period 1 or below. 83 | 84 | if self.verbose >= 1: 85 | print(f"\n\nBest Parameters:") 86 | print(f"p: {self.best_p}, d: {self.best_d}, q: {self.best_q}") 87 | print(f"P: {self.best_P}, D: {self.best_D}, Q: {self.best_Q}") 88 | print(f"Seasonality: {self.seasonality}\nSeasonal Period: {self.seasonal_period}") 89 | 90 | ####################################### 91 | #### Cross Validation across Folds #### 92 | ####################################### 93 | 94 | rmse_folds = [] 95 | norm_rmse_folds = [] 96 | forecast_df_folds = [] 97 | 98 | ### Creating a new way to skip cross validation when trying to run auto-ts multiple times. ### 99 | if cv == 0: 100 | cv_in = 0 101 | else: 102 | cv_in = copy.deepcopy(cv) 103 | NFOLDS = self.get_num_folds_from_cv(cv) 104 | 105 | ######################################################################### 106 | if type(ts_df) == dask.dataframe.core.DataFrame: 107 | num_obs = ts_df.shape[0].compute() 108 | else: 109 | num_obs = ts_df.shape[0] 110 | 111 | if self.forecast_period <= 5: 112 | #### Set a minimum of 5 for the number of rows in test! 113 | self.forecast_period = 5 114 | ### In case the number of forecast_period is too high, just reduce it so it can fit into num_obs 115 | if NFOLDS*self.forecast_period > num_obs: 116 | self.forecast_period = int(num_obs/(NFOLDS+1)) 117 | print('Lowering forecast period to %d to enable cross_validation' %self.forecast_period) 118 | ######################################################################### 119 | extra_concatenated = pd.DataFrame() 120 | concatenated = pd.DataFrame() 121 | norm_rmse_folds2 = [] 122 | 123 | max_trainsize = len(ts_df) - self.forecast_period 124 | try: 125 | cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period) ### this works only sklearn v 0.0.24] 126 | except: 127 | cv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size = max_trainsize) 128 | 129 | if type(ts_df) == dask.dataframe.core.DataFrame: 130 | ts_df = dft.head(len(ts_df)) ### this converts dask into a pandas dataframe 131 | 132 | if cv_in == 0: 133 | print('Skipping cross validation steps since cross_validation = %s' %cv_in) 134 | else: 135 | for fold_number, (train_index, test_index) in enumerate(cv.split(ts_df)): 136 | dftx = ts_df.head(len(train_index)+len(test_index)) 137 | ts_train = dftx.head(len(train_index)) ## now train will be the first segment of dftx 138 | ts_test = dftx.tail(len(test_index)) ### now test will be right after train in dftx 139 | 140 | 141 | if self.verbose >= 1: 142 | print(f"\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape[0]} Test Shape: {ts_test.shape[0]}") 143 | 144 | ### this is needed for static forecasts #################### 145 | # TODO: Check if this needs to be fixed to pick usimg self.original_target_col 146 | y_truth = ts_train[:] # TODO: Note that this is only univariate analysis 147 | 148 | if len(self.original_preds) == 0: 149 | exog = None 150 | elif len(self.original_preds) == 1: 151 | exog = ts_test[self.original_preds[0]].values.reshape(-1, 1) 152 | else: 153 | exog = ts_test[self.original_preds].values 154 | 155 | auto_arima_model = self.find_best_parameters(data = ts_train) 156 | 
self.model = auto_arima_model 157 | y_forecasted = self.model.predict(ts_test.shape[0],exog) 158 | 159 | if fold_number == 0: 160 | concatenated = pd.DataFrame(np.c_[ts_test[self.original_target_col].values, 161 | y_forecasted], columns=['original', 'predicted'],index=ts_test.index) 162 | extra_concatenated = copy.deepcopy(concatenated) 163 | else: 164 | concatenated = pd.DataFrame(np.c_[ts_test[self.original_target_col].values, 165 | y_forecasted], columns=['original', 'predicted'],index=ts_test.index) 166 | extra_concatenated = extra_concatenated.append(concatenated) 167 | 168 | ### for SARIMAX and Auto_ARIMA, you don't have to restore differences since it predicts like actuals.### 169 | y_true = concatenated['original'] 170 | y_pred = concatenated['predicted'] 171 | 172 | if self.verbose >= 1: 173 | print('Static Forecasts:') 174 | # Since you are differencing the data, some original data points will not be available 175 | # Hence taking from first available value. 176 | print_static_rmse(y_true.values, y_pred.values, verbose=self.verbose) 177 | #quick_ts_plot(y_true, y_pred) 178 | 179 | # Extract the dynamic predicted and true values of our time series 180 | forecast_df = copy.deepcopy(y_forecasted) 181 | forecast_df_folds.append(forecast_df) 182 | 183 | 184 | rmse, norm_rmse = print_static_rmse(y_true.values, y_pred.values, verbose=0) ## don't print this time 185 | rmse_folds.append(rmse) 186 | norm_rmse_folds.append(norm_rmse) 187 | 188 | # TODO: Convert rmse_folds, rmse_norm_folds, forecasts_folds into base class attributes 189 | # TODO: Add gettes and seters for these class attributes. 190 | # This will ensure consistency across various model build types. 191 | 192 | 193 | # This is taking the std of entire dataset and using that to normalize 194 | # vs. other approach that was using std of individual folds to standardize. 195 | # Technically this is not correct, but in order to do Apples:Aples compatison with ML 196 | # (sklearn) based cross_val_score, we need to do this since we dont get individual folds 197 | # back for cross_val_score. If at a later point in time, we can get this, then, 198 | # we can revert back to dividing by individual fold std values. 199 | norm_rmse_folds2 = rmse_folds/ts_df[self.original_target_col].values.std() # Same as what was there in print_dynamic_rmse() 200 | 201 | print(f"\nSARIMAX RMSE (all folds): {np.mean(rmse_folds):.4f}") 202 | print(f"SARIMAX Norm RMSE (all folds): {(np.mean(norm_rmse_folds2)*100):.0f}%\n") 203 | try: 204 | print_ts_model_stats(extra_concatenated['original'],extra_concatenated['predicted'], "auto_SARIMAX") 205 | except: 206 | print('Unable to print model stats. 
Continuing...') 207 | 208 | ############################################### 209 | #### Refit the model on the entire dataset #### 210 | ############################################### 211 | auto_arima_model = self.find_best_parameters(data = ts_df) 212 | self.model = auto_arima_model 213 | self.refit(ts_df=ts_df) 214 | 215 | print(self.model.summary()) 216 | 217 | # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds 218 | return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2 219 | 220 | def refit(self, ts_df: pd.DataFrame) -> object: 221 | """ 222 | Refits an already trained model using a new dataset 223 | Useful when fitting to the full data after testing with cross validation 224 | :param ts_df The time series data to be used for fitting the model 225 | :type ts_df pd.DataFrame 226 | :rtype object 227 | """ 228 | 229 | bestmodel = self.get_best_model(ts_df) 230 | 231 | print(colorful.BOLD + 'Refitting data with previously found best parameters' + colorful.END) 232 | try: 233 | self.model = bestmodel.fit(disp=False) 234 | print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring))) 235 | except Exception as e: 236 | print(e) 237 | 238 | return self 239 | 240 | @abstractmethod 241 | def find_best_parameters(self, data: pd.DataFrame): 242 | """ 243 | Given a dataset, finds the best parameters using the settings in the class 244 | Need to set the following parameters in the child class 245 | self.best_p, self.best_d, self.best_q 246 | self.best_P, self.best_D, self.best_Q 247 | """ 248 | 249 | 250 | 251 | def get_best_model(self, data: pd.DataFrame): 252 | """ 253 | Returns the 'unfit' SARIMAX model with the given dataset and the 254 | selected best parameters. This can be used to fit or refit the model. 255 | """ 256 | 257 | # In order to get forecasts to be in the same value ranges of the orig_endogs, you 258 | # must set the simple_differencing = False and the start_params to be the same as ARIMA. 
259 | # That is the only way to ensure that the output of this model iscomparable to other ARIMA models 260 | 261 | if not self.seasonality: 262 | if self.univariate: 263 | bestmodel = SARIMAX( 264 | endog=data[self.original_target_col], 265 | # exog=data[self.original_preds], ###if it is univariate, no preds needed 266 | order=(self.best_p, self.best_d, self.best_q), 267 | enforce_stationarity=False, 268 | enforce_invertibility=False, 269 | trend='ct', 270 | start_params=[0, 0, 0, 1], 271 | simple_differencing=False) 272 | else: 273 | bestmodel = SARIMAX( 274 | endog=data[self.original_target_col], 275 | exog=data[self.original_preds], ## if it is multivariate, preds are needed 276 | order=(self.best_p, self.best_d, self.best_q), 277 | enforce_stationarity=False, 278 | enforce_invertibility=False, 279 | trend='ct', 280 | start_params=[0, 0, 0, 1], 281 | simple_differencing=False) 282 | else: 283 | if self.univariate: 284 | bestmodel = SARIMAX( 285 | endog=data[self.original_target_col], 286 | # exog=data[self.original_preds], ### if univariate, no preds are needed 287 | order=(self.best_p, self.best_d, self.best_q), 288 | seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period), 289 | enforce_stationarity=False, 290 | enforce_invertibility=False, 291 | trend='ct', 292 | start_params=[0, 0, 0, 1], 293 | simple_differencing=False 294 | ) 295 | else: 296 | bestmodel = SARIMAX( 297 | endog=data[self.original_target_col], 298 | exog=data[self.original_preds], ### if multivariate, preds are needed 299 | order=(self.best_p, self.best_d, self.best_q), 300 | seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period), 301 | enforce_stationarity=False, 302 | enforce_invertibility=False, 303 | trend='ct', 304 | start_params=[0, 0, 0, 1], 305 | simple_differencing=False 306 | ) 307 | 308 | return bestmodel 309 | 310 | def predict( 311 | self, 312 | testdata: Optional[pd.DataFrame]=None, 313 | forecast_period: Optional[int] = None, 314 | simple: bool = True) -> NDFrame: 315 | """ 316 | Return the predictions 317 | """ 318 | # Extract the dynamic predicted and true values of our time series 319 | if self.univariate: 320 | if isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series): 321 | # use the forecast period used during training 322 | forecast_period = testdata.shape[0] 323 | self.forecast_period = testdata.shape[0] 324 | else: 325 | if testdata is None: 326 | raise ValueError("SARIMAX needs testdata to make predictions, but this was not provided. Please provide to proceed.") 327 | forecast_period = self.forecast_period 328 | elif isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series): 329 | if forecast_period != testdata.shape[0]: 330 | warnings.warn("Forecast Period is not equal to the number of observations in testdata. The forecast period will be assumed to be the number of observations in testdata.") 331 | forecast_period = testdata.shape[0] 332 | self.forecast_period = forecast_period 333 | try: 334 | testdata = testdata[self.original_preds] 335 | except Exception as e: 336 | print(e) 337 | print("Model was trained with train dataframe. Please make sure you are passing a test data frame.") 338 | return 339 | elif isinstance(testdata, int): 340 | if forecast_period != testdata: 341 | print("Forecast Period is not equal to the number of observations in testdata. 
The forecast period will be assumed to be the number of observations in testdata.") 342 | 343 | forecast_period = testdata 344 | self.forecast_period = forecast_period 345 | 346 | if self.univariate: 347 | res = self.model.get_forecast(self.forecast_period) 348 | else: 349 | if isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series): 350 | res = self.model.get_forecast(self.forecast_period, exog=testdata) 351 | else: 352 | try: 353 | res = self.model.get_forecast(self.forecast_period) 354 | except Exception as e: 355 | print(e) 356 | print("Model was trained with train dataframe. Please make sure you are passing a test data frame.") 357 | return 358 | 359 | res_frame = res.summary_frame() 360 | res_frame.rename(columns = {'mean':'yhat'}, inplace=True) 361 | 362 | if simple: 363 | res_frame = res_frame['yhat'] 364 | res_frame = res_frame.squeeze() # Convert to a pandas series object 365 | else: 366 | # Pass as is 367 | pass 368 | 369 | return res_frame 370 | -------------------------------------------------------------------------------- /auto_ts/models/ar_based/build_autoarimax.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np # type: ignore 4 | import pandas as pd # type: ignore 5 | # TODO: Resolve which one we want to use 6 | # from pmdarima.arima.auto import auto_arima # type: ignore 7 | from pmdarima.arima import auto_arima # type: ignore 8 | 9 | from .build_arima_base import BuildArimaBase 10 | # helper functions 11 | from ...utils import colorful 12 | 13 | 14 | class BuildAutoSarimax(BuildArimaBase): 15 | 16 | def find_best_parameters(self, data: pd.DataFrame): 17 | """ 18 | Given a dataset, finds the best parameters using the settings in the class 19 | """ 20 | 21 | if self.verbose >= 1: 22 | print(colorful.BOLD + '\n Finding the best parameters using AutoArima:' + colorful.END) 23 | if len(self.original_preds) == 0: 24 | exog = None 25 | elif len(self.original_preds) == 1: 26 | exog = data[self.original_preds[0]].values.reshape(-1, 1) 27 | else: 28 | exog = data[self.original_preds].values 29 | 30 | ### for large datasets, speed is of the essence. 
Hence reduce max size of PDQ 31 | if self.seasonal_period <= 1: 32 | m_min = 2 33 | else: 34 | m_min = self.seasonal_period 35 | if data.shape[0] > 1000: 36 | print(' Using smaller parameters for larger dataset with greater than 1000 samples') 37 | out_of_sample_size = int(0.01*data.shape[0]) 38 | arima_model = auto_arima( 39 | y = data[self.original_target_col], 40 | exogenous=exog, ## these variables must be given in predictions as well 41 | start_p = 0, start_q = 0, start_P = 0, start_Q = 0, 42 | max_p = 2, max_q = 2, max_P = 2, max_Q = 2, 43 | D = 1, max_D = 1, 44 | out_of_sample_size=out_of_sample_size, # use a small amount 45 | information_criterion=self.scoring, # AIC 46 | scoring='mse', # only supports 'mse' or 'mae' 47 | m=m_min, seasonal=self.seasonality, 48 | stepwise = True, random_state=42, n_fits = 10, n_jobs=-1, 49 | error_action = 'ignore') 50 | else: 51 | arima_model = auto_arima( 52 | y = data[self.original_target_col], 53 | exogenous=exog, ## these variables must be given in predictions as well 54 | out_of_sample_size=0, # use whole dataset to compute metrics 55 | information_criterion=self.scoring, # AIC 56 | scoring='mse', # only supports 'mse' or 'mae' 57 | # TODO: Check if we can go higher on max p and q (till seasonality) 58 | start_p=0, d=None, start_q=0, max_p=self.p_max, max_d=self.d_max, max_q=self.q_max, # AR Parameters 59 | start_P=0, D=None, start_Q=0, max_P=self.p_max, max_D=self.d_max, max_Q=self.q_max, # Seasonal Parameters (1) 60 | m=m_min, seasonal=self.seasonality, # Seasonal Parameters (2) 61 | stepwise = True, random_state=42, n_fits = 50, n_jobs=-1, # Hyperparameer Search 62 | error_action='warn', trace = True, supress_warnings=True 63 | ) 64 | 65 | self.best_p, self.best_d, self.best_q = arima_model.order # example (0, 1, 1) 66 | self.best_P, self.best_D, self.best_Q, _ = arima_model.seasonal_order # example (2, 1, 1, 12) 67 | 68 | metric_value = math.nan 69 | 70 | if self.scoring.lower() == 'aic': 71 | metric_value = arima_model.aic() 72 | elif self.scoring.lower() == 'aicc': 73 | metric_value = arima_model.aicc() 74 | elif self.scoring.lower() == 'bic': 75 | metric_value = arima_model.bic() 76 | else: 77 | print("Error: Metric must be 'aic', 'aicc', or 'bic'. 
Continuing with 'bic' as default") 78 | metric_value = arima_model.bic() 79 | self.scoring = 'bic' 80 | 81 | if self.verbose >= 1: 82 | print( 83 | '\nBest model is a Seasonal SARIMAX(%d,%d,%d)*(%d,%d,%d,%d), %s = %0.3f' % ( 84 | self.best_p, self.best_d, self.best_q, 85 | self.best_P, self.best_D, self.best_Q, 86 | m_min, self.scoring, metric_value) 87 | ) 88 | return arima_model 89 | -------------------------------------------------------------------------------- /auto_ts/models/ar_based/build_sarimax.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt # type: ignore 2 | import numpy as np # type: ignore 3 | import pandas as pd # type: ignore 4 | from pandas.core.generic import NDFrame # type:ignore 5 | # imported SARIMAX from statsmodels pkg 6 | from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore 7 | 8 | from .build_arima_base import BuildArimaBase 9 | from ...models.ar_based.param_finder import find_best_pdq_or_PDQ 10 | # helper functions 11 | from ...utils import colorful 12 | 13 | 14 | # from tscv import GapWalkForward # type: ignore 15 | 16 | 17 | # class BuildSarimax(BuildBase): 18 | class BuildSarimax(BuildArimaBase): 19 | # def __init__(self, scoring, seasonality=False, seasonal_period=None, p_max=12, d_max=2, q_max=12, forecast_period=2, verbose=0): 20 | # """ 21 | # Automatically build a SARIMAX Model 22 | # """ 23 | # super().__init__( 24 | # scoring=scoring, 25 | # forecast_period=forecast_period, 26 | # verbose=verbose 27 | # ) 28 | 29 | # self.seasonality = seasonality 30 | # self.seasonal_period = seasonal_period 31 | # self.p_max = p_max 32 | # self.d_max = d_max 33 | # self.q_max = q_max 34 | 35 | # self.best_p = None 36 | # self.best_d = None 37 | # self.best_q = None 38 | # self.best_P = None 39 | # self.best_D = None 40 | # self.best_Q = None 41 | 42 | 43 | # def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int]=None): 44 | # """ 45 | # Build a Time Series Model using SARIMAX from statsmodels. 46 | # """ 47 | 48 | # self.original_target_col = target_col 49 | # self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]] 50 | 51 | # if len(self.original_preds) == 0: 52 | # self.univariate = True 53 | # else: 54 | # self.univariate = False 55 | 56 | 57 | # ########################################## 58 | # #### Find best pdq and PDQ parameters #### 59 | # ########################################## 60 | 61 | # # NOTE: We use the entire dataset to compute the pdq and PDQ parameters. 
62 | # # Then we use the selected "best" parameters to check how well it 63 | # # generalizes across the various folds (which may even be 1) 64 | 65 | # # ## Added temporarily 66 | # # ts_train = ts_df.iloc[:-self.forecast_period] 67 | # # self.find_best_parameters(data = ts_train) 68 | # self.find_best_parameters(data = ts_df) 69 | 70 | # if self.verbose >= 1: 71 | # print(f"\n\nBest Parameters:") 72 | # print(f"p: {self.best_p}, d: {self.best_d}, q: {self.best_q}") 73 | # print(f"P: {self.best_P}, D: {self.best_D}, Q: {self.best_Q}") 74 | # print(f"Seasonality: {self.seasonality} Seasonal Period: {self.seasonal_period}") 75 | 76 | 77 | # ####################################### 78 | # #### Cross Validation across Folds #### 79 | # ####################################### 80 | 81 | # rmse_folds = [] 82 | # norm_rmse_folds = [] 83 | # forecast_df_folds = [] 84 | 85 | # NFOLDS = self.get_num_folds_from_cv(cv) 86 | # cv = GapWalkForward(n_splits=NFOLDS, gap_size=0, test_size=self.forecast_period) 87 | # for fold_number, (train, test) in enumerate(cv.split(ts_df)): 88 | # ts_train = ts_df.iloc[train] 89 | # ts_test = ts_df.iloc[test] 90 | 91 | # if self.verbose >= 1: 92 | # print(f"\n\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape} Test Shape: {ts_test.shape}") 93 | 94 | 95 | # ######################################### 96 | # #### Define the model with fold data #### 97 | # ######################################### 98 | 99 | # bestmodel = self.get_best_model(ts_train) 100 | 101 | # ###################################### 102 | # #### Fit the model with fold data #### 103 | # ###################################### 104 | 105 | # if self.verbose >= 1: 106 | # print(colorful.BOLD + 'Fitting best SARIMAX model' + colorful.END) 107 | 108 | # try: 109 | # self.model = bestmodel.fit(disp=False) 110 | # if self.verbose >= 1: 111 | # print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring))) 112 | # except Exception as e: 113 | # print(e) 114 | # print('Error: Getting Singular Matrix. Please try using other PDQ parameters or turn off Seasonality') 115 | # return bestmodel, None, np.inf, np.inf 116 | 117 | # if self.verbose >= 1: 118 | # try: 119 | # self.model.plot_diagnostics(figsize=(16, 12)) 120 | # except: 121 | # print('Error: SARIMAX plot diagnostic. Continuing...') 122 | 123 | # ### this is needed for static forecasts #################### 124 | # # TODO: Check if this needs to be fixed to pick usimg self.original_target_col 125 | # y_truth = ts_train[:] # TODO: Note that this is only univariate analysis 126 | 127 | # if self.univariate: 128 | # y_forecasted = self.model.predict(dynamic=False) 129 | # else: 130 | # y_forecasted = self.model.predict(dynamic=False, exog=ts_test[self.original_preds]) 131 | 132 | # concatenated = pd.concat([y_truth, y_forecasted], axis=1, keys=['original', 'predicted']) 133 | 134 | # ### for SARIMAX, you don't have to restore differences since it predicts like actuals.### 135 | # if self.verbose >= 1: 136 | # print('Static Forecasts:') 137 | # # Since you are differencing the data, some original data points will not be available 138 | # # Hence taking from first available value. 
139 | # print_static_rmse( 140 | # concatenated['original'].values[self.best_d:], 141 | # concatenated['predicted'].values[self.best_d:], 142 | # verbose=self.verbose 143 | # ) 144 | 145 | # ########### Dynamic One Step Ahead Forecast ########################### 146 | # ### Dynamic Forecats are a better representation of true predictive power 147 | # ## since they only use information from the time series up to a certain point, 148 | # ## and after that, forecasts are generated using values from previous forecasted 149 | # ## time points. 150 | # ################################################################################# 151 | # # Now do dynamic forecast plotting for the last X steps of the data set ###### 152 | 153 | # if self.verbose >= 1: 154 | # ax = concatenated[['original', 'predicted']][self.best_d:].plot(figsize=(16, 12)) 155 | # startdate = ts_df.index[-self.forecast_period-1] 156 | # pred_dynamic = self.model.get_prediction(start=startdate, dynamic=True, full_results=True) 157 | # pred_dynamic_ci = pred_dynamic.conf_int() 158 | # pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax) 159 | # try: 160 | # ax.fill_between(pred_dynamic_ci.index, pred_dynamic_ci.iloc[:, 0], 161 | # pred_dynamic_ci.iloc[:, 1], color='k', alpha=.25) 162 | # ax.fill_betweenx(ax.get_ylim(), startdate, ts_train.index[-1], alpha=.1, zorder=-1) 163 | # except: 164 | # pass 165 | # ax.set_xlabel('Date') 166 | # ax.set_ylabel('Levels') 167 | # plt.legend() 168 | # plt.show(block=False) 169 | 170 | # # Extract the dynamic predicted and true values of our time series 171 | # forecast_df = self.predict(testdata=ts_test[self.original_preds], simple=False) 172 | # forecast_df_folds.append(forecast_df) 173 | 174 | # # Extract Metrics 175 | # if self.verbose >= 1: 176 | # print('Dynamic %d-Period Forecast:' % (self.forecast_period)) 177 | 178 | # rmse, norm_rmse = print_dynamic_rmse(ts_test[self.original_target_col], forecast_df['mean'].values, ts_train[self.original_target_col], toprint=self.verbose) 179 | # rmse_folds.append(rmse) 180 | # norm_rmse_folds.append(norm_rmse) 181 | 182 | # # TODO: Convert rmse_folds, rmse_norm_folds, forecasts_folds into base class attributes 183 | # # TODO: Add gettes and seters for these class attributes. 184 | # # This will ensure consistency across various model build types. 185 | 186 | 187 | # # This is taking the std of entire dataset and using that to normalize 188 | # # vs. other approach that was using std of individual folds to stansardize. 189 | # # Technically this is not correct, but in order to do Apples:Aples compatison with ML 190 | # # (sklearn) based cross_val_score, we need to do this since we dont get indicidual folds 191 | # # back for cross_val_score. If at a later point in time, we can get this, then, 192 | # # we can revert back to dividing by individual fold std values. 
193 | # norm_rmse_folds2 = rmse_folds/ts_df[self.original_target_col].values.std() # Same as what was there in print_dynamic_rmse() 194 | 195 | # # print(f"SARIMAX Norm RMSE (Original): {norm_rmse_folds}") 196 | # # print(f"SARIMAX Norm RMSE (New): {norm_rmse_folds2}") 197 | 198 | # ############################################### 199 | # #### Refit the model on the entire dataset #### 200 | # ############################################### 201 | # self.refit(ts_df=ts_df) 202 | 203 | # if self.verbose >= 1: 204 | # print(self.model.summary()) 205 | 206 | # # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds 207 | # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2 208 | 209 | # def refit(self, ts_df: pd.DataFrame) -> object: 210 | # """ 211 | # Refits an already trained model using a new dataset 212 | # Useful when fitting to the full data after testing with cross validation 213 | # :param ts_df The time series data to be used for fitting the model 214 | # :type ts_df pd.DataFrame 215 | # :rtype object 216 | # """ 217 | 218 | # bestmodel = self.get_best_model(ts_df) 219 | 220 | # print(colorful.BOLD + 'Refitting data with previously found best parameters' + colorful.END) 221 | # try: 222 | # self.model = bestmodel.fit(disp=False) 223 | # print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring))) 224 | # except Exception as e: 225 | # print(e) 226 | 227 | # return self 228 | 229 | 230 | def find_best_parameters(self, data: pd.DataFrame): 231 | """ 232 | Given a dataset, finds the best parameters using the settings in the class 233 | """ 234 | 235 | if not self.seasonality: 236 | if self.verbose >= 1: 237 | print('Building a Non Seasonal Model...') 238 | print('\nFinding best Non Seasonal Parameters:') 239 | # TODO: Check if we need to also pass the exogenous variables here and 240 | # change the functionality of find_best_pdq_or_PDQ to incorporate these 241 | # exogenoug variables. 242 | self.best_p, self.best_d, self.best_q, best_bic, _ = find_best_pdq_or_PDQ( 243 | ts_df=data[self.original_target_col], 244 | scoring=self.scoring, 245 | p_max=self.p_max, d_max=self.d_max, q_max=self.q_max, 246 | non_seasonal_pdq=None, 247 | seasonal_period=None, 248 | seasonality=False, 249 | verbose=self.verbose 250 | ) 251 | 252 | if self.verbose >= 1: 253 | print('\nBest model is: Non Seasonal SARIMAX(%d,%d,%d), %s = %0.3f' % ( 254 | self.best_p, self.best_d, self.best_q, self.scoring, best_bic)) 255 | else: 256 | if self.verbose >= 1: 257 | print(colorful.BOLD + 'Building a Seasonal Model...'+colorful.END) 258 | print(colorful.BOLD + '\n Finding best Non-Seasonal pdq Parameters:' + colorful.END) 259 | # TODO: Check if we need to also pass the exogenous variables here and 260 | # change the functionality of find_best_pdq_or_PDQ to incorporate these 261 | # exogenoug variables. 262 | self.best_p, self.best_d, self.best_q, _, _ = find_best_pdq_or_PDQ( 263 | ts_df=data[self.original_target_col], 264 | scoring=self.scoring, 265 | p_max=self.p_max, d_max=self.d_max, q_max=self.q_max, 266 | non_seasonal_pdq=None, # we need to figure this out ... 
267 | seasonal_period=None, 268 | seasonality=False, # setting seasonality = False for p, d, q 269 | verbose=self.verbose 270 | ) 271 | 272 | if self.verbose >= 1: 273 | print(colorful.BOLD + '\n Finding best Seasonal PDQ Model Parameters:' + colorful.END) 274 | # TODO: Check if we need to also pass the exogenous variables here and 275 | # change the functionality of find_best_pdq_or_PDQ to incorporate these 276 | # exogenoug variables. 277 | self.best_P, self.best_D, self.best_Q, best_bic, self.seasonality = find_best_pdq_or_PDQ( 278 | ts_df=data[self.original_target_col], 279 | scoring=self.scoring, 280 | p_max=self.p_max, d_max=self.d_max, q_max=self.q_max, 281 | non_seasonal_pdq=(self.best_p, self.best_d, self.best_q), # found previously ... 282 | seasonal_period=self.seasonal_period, # passing seasonal period 283 | seasonality=True, # setting seasonality = True for P, D, Q 284 | verbose=self.verbose 285 | ) 286 | 287 | if self.seasonality: 288 | if self.verbose >= 1: 289 | print('\nBest model is a Seasonal SARIMAX(%d,%d,%d)*(%d,%d,%d,%d), %s = %0.3f' % ( 290 | self.best_p, self.best_d, self.best_q, 291 | self.best_P, self.best_D, self.best_Q, 292 | self.seasonal_period, self.scoring, best_bic)) 293 | else: 294 | if self.verbose >= 1: 295 | print('\nEven though seasonality has been set to True, the best model is a Non Seasonal SARIMAX(%d,%d,%d)' % ( 296 | self.best_p, self.best_d, self.best_q)) 297 | 298 | 299 | 300 | 301 | 302 | # def get_best_model(self, data: pd.DataFrame): 303 | # """ 304 | # Returns the 'unfit' SARIMAX model with the given dataset and the 305 | # selected best parameters. This can be used to fit or refit the model. 306 | # """ 307 | 308 | # # In order to get forecasts to be in the same value ranges of the orig_endogs, you 309 | # # must set the simple_differencing = False and the start_params to be the same as ARIMA. 
310 | # # That is the only way to ensure that the output of this model iscomparable to other ARIMA models 311 | 312 | # if not self.seasonality: 313 | # if self.univariate: 314 | # bestmodel = SARIMAX( 315 | # endog=data[self.original_target_col], 316 | # # exog=data[self.original_preds], 317 | # order=(self.best_p, self.best_d, self.best_q), 318 | # enforce_stationarity=False, 319 | # enforce_invertibility=False, 320 | # trend='ct', 321 | # start_params=[0, 0, 0, 1], 322 | # simple_differencing=False) 323 | # else: 324 | # bestmodel = SARIMAX( 325 | # endog=data[self.original_target_col], 326 | # exog=data[self.original_preds], 327 | # order=(self.best_p, self.best_d, self.best_q), 328 | # enforce_stationarity=False, 329 | # enforce_invertibility=False, 330 | # trend='ct', 331 | # start_params=[0, 0, 0, 1], 332 | # simple_differencing=False) 333 | # else: 334 | # if self.univariate: 335 | # bestmodel = SARIMAX( 336 | # endog=data[self.original_target_col], 337 | # # exog=data[self.original_preds], 338 | # order=(self.best_p, self.best_d, self.best_q), 339 | # seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period), 340 | # enforce_stationarity=False, 341 | # enforce_invertibility=False, 342 | # trend='ct', 343 | # start_params=[0, 0, 0, 1], 344 | # simple_differencing=False 345 | # ) 346 | # else: 347 | # bestmodel = SARIMAX( 348 | # endog=data[self.original_target_col], 349 | # exog=data[self.original_preds], 350 | # order=(self.best_p, self.best_d, self.best_q), 351 | # seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period), 352 | # enforce_stationarity=False, 353 | # enforce_invertibility=False, 354 | # trend='ct', 355 | # start_params=[0, 0, 0, 1], 356 | # simple_differencing=False 357 | # ) 358 | 359 | # return bestmodel 360 | 361 | # def predict( 362 | # self, 363 | # testdata: Optional[pd.DataFrame]=None, 364 | # forecast_period: Optional[int] = None, 365 | # simple: bool = True) -> NDFrame: 366 | # """ 367 | # Return the predictions 368 | # """ 369 | # # Extract the dynamic predicted and true values of our time series 370 | 371 | # if self.univariate: 372 | # if forecast_period is None: 373 | # # use the forecast period used during training 374 | # forecast_period = self.forecast_period 375 | # else: 376 | # if testdata is None: 377 | # raise ValueError("SARIMAX needs testdata to make predictions, but this was not provided. Please provide to proceed.") 378 | 379 | # if forecast_period != testdata.shape[0]: 380 | # warnings.warn("Forecast Period is not equal to the number of observations in testdata. The forecast period will be assumed to be the number of observations in testdata.") 381 | 382 | # forecast_period = testdata.shape[0] 383 | 384 | # try: 385 | # testdata = testdata[self.original_preds] 386 | # except Exception as e: 387 | # print(e) 388 | # raise ValueError("Some exogenous columns that were used during training are missing in testdata. 
Please make sure you are passing the correct exogenous columns.") 389 | 390 | # if self.univariate: 391 | # res = self.model.get_forecast(forecast_period) 392 | # else: 393 | # res = self.model.get_forecast(forecast_period, exog=testdata) 394 | 395 | # res_frame = res.summary_frame() 396 | 397 | # if simple: 398 | # res_frame = res_frame['mean'] 399 | # res_frame = res_frame.squeeze() # Convert to a pandas series object 400 | # else: 401 | # # Pass as is 402 | # pass 403 | 404 | # return res_frame 405 | -------------------------------------------------------------------------------- /auto_ts/models/ar_based/build_var.py: -------------------------------------------------------------------------------- 1 | """Module to build a VAR model 2 | """ 3 | import copy 4 | import itertools 5 | import operator 6 | import warnings 7 | from typing import Optional 8 | 9 | import dask 10 | import matplotlib.pyplot as plt # type: ignore 11 | import numpy as np # type: ignore 12 | import pandas as pd # type: ignore 13 | import seaborn as sns # type: ignore 14 | from pandas.core.generic import NDFrame # type:ignore 15 | 16 | sns.set(style="white", color_codes=True) 17 | 18 | from statsmodels.tsa.statespace.varmax import VARMAX # type: ignore 19 | 20 | #from tscv import GapWalkForward # type: ignore 21 | from sklearn.model_selection import TimeSeriesSplit 22 | 23 | # helper functions 24 | from ...utils import print_dynamic_rmse 25 | from ...models.ar_based.param_finder import find_lowest_pq 26 | from ..build_base import BuildBase 27 | 28 | 29 | class BuildVAR(BuildBase): 30 | """Class to build a VAR model 31 | """ 32 | def __init__(self, scoring, forecast_period=2, p_max=3, q_max=3, verbose=0): 33 | """ 34 | Automatically build a VAR Model 35 | 36 | Since it automatically builds a VAR model, you need to give it a Criteria (scoring) to optimize 37 | on. You can give it any of the following metrics as scoring options: 38 | AIC, BIC, Deviance, Log-likelihood. 39 | You can give the highest order values for p and q. Default is set to 3 for both. 40 | """ 41 | super().__init__( 42 | scoring=scoring, 43 | forecast_period=forecast_period, 44 | verbose=verbose 45 | ) 46 | self.p_max = p_max 47 | self.q_max = q_max 48 | self.best_p = None 49 | self.best_d = None 50 | self.best_q = None 51 | 52 | # def fit(self, ts_df): 53 | def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int] = None) -> object: 54 | """ 55 | This builds a VAR model given a multivariate time series data frame with time as the Index. 56 | 57 | :param ts_df The time series data to be used for fitting the model. Note that the input can be 58 | a data frame with one column or multiple cols or a multivariate array. However, the first column 59 | must be the target variable. You must include only Time Series data in it. DO NOT include 60 | "Non-Stationary" or "Trendy" data. Make sure your Time Series is "Stationary" before you send 61 | it in!! If not, this will give spurious results. 62 | :type ts_df pd.DataFrame 63 | 64 | :param target_col The column name of the target time series that needs to be modeled. 65 | All other columns will be considered as exogenous variables (if applicable to method) 66 | :type target_col str 67 | 68 | :param cv: Number of folds to use for cross validation. 
69 | Number of observations in the Validation set for each fold = forecast period 70 | If None, a single fold is used 71 | :type cv Optional[int] 72 | 73 | :rtype object 74 | """ 75 | self.original_target_col = target_col 76 | self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]] 77 | 78 | ts_df = ts_df[[self.original_target_col] + self.original_preds] 79 | 80 | ####################################### 81 | #### Cross Validation across Folds #### 82 | ####################################### 83 | 84 | rmse_folds = [] 85 | norm_rmse_folds = [] 86 | forecast_df_folds = [] 87 | norm_rmse_folds2 = [] 88 | 89 | ### Creating a new way to skip cross validation when trying to run auto-ts multiple times. ### 90 | if not cv: 91 | cv_in = 0 92 | else: 93 | cv_in = copy.deepcopy(cv) 94 | NFOLDS = self.get_num_folds_from_cv(cv) 95 | #cv = GapWalkForward(n_splits=NFOLDS, gap_size=0, test_size=self.forecast_period) 96 | #cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period) ### sklearn version 0.0.24 97 | max_trainsize = len(ts_df) - self.forecast_period 98 | try: 99 | cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period) ### this works only sklearn v 0.0.24] 100 | except: 101 | cv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size = max_trainsize) 102 | 103 | if type(ts_df) == dask.dataframe.core.DataFrame: 104 | ts_df = dft.head(len(ts_df)) ### this converts dask into a pandas dataframe 105 | 106 | if cv_in == 0: 107 | print('Skipping cross validation steps since cross_validation = %s' %cv_in) 108 | self.find_best_parameters(data = ts_df) 109 | y_train = ts_df.iloc[:, [0, self.best_d]] 110 | bestmodel = self.get_best_model(y_train) 111 | self.model = bestmodel.fit(disp=False) 112 | else: 113 | for fold_number, (train_index, test_index) in enumerate(cv.split(ts_df)): 114 | dftx = ts_df.head(len(train_index)+len(test_index)) 115 | ts_train = dftx.head(len(train_index)) ## now train will be the first segment of dftx 116 | ts_test = dftx.tail(len(test_index)) ### now test will be right after train in dftx 117 | 118 | print(f"\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape[0]} Test Shape: {ts_test.shape[0]}") 119 | self.find_best_parameters(data = ts_train) 120 | 121 | ######################################### 122 | #### Define the model with fold data #### 123 | ######################################### 124 | y_train = ts_train.iloc[:, [0, self.best_d]] 125 | bestmodel = self.get_best_model(y_train) 126 | 127 | ###################################### 128 | #### Fit the model with fold data #### 129 | ###################################### 130 | 131 | if self.verbose >= 1: 132 | print(f'Fitting best VAR model on Fold: {fold_number+1}') 133 | try: 134 | self.model = bestmodel.fit(disp=False) 135 | except Exception as e: 136 | print(e) 137 | print(f'Error: VAR Fit on Fold: {fold_number+1} unsuccessful.') 138 | return bestmodel, None, np.inf, np.inf 139 | 140 | forecast_df = self.predict(ts_test.shape[0],simple=False) 141 | forecast_df_folds.append(forecast_df['yhat'].values) 142 | 143 | rmse, norm_rmse = print_dynamic_rmse(ts_test.iloc[:, 0].values, forecast_df['yhat'].values, 144 | ts_train.iloc[:, 0].values) 145 | rmse_folds.append(rmse) 146 | norm_rmse_folds.append(norm_rmse) 147 | 148 | norm_rmse_folds2 = rmse_folds/ts_df[self.original_target_col].values.std() # Same as what was there in print_dynamic_rmse() 149 | self.model.plot_diagnostics(figsize=(16, 12)) 150 | axis = self.model.impulse_responses(12, 
orthogonalized=True).plot(figsize=(12, 4)) 151 | axis.set(xlabel='Time Steps', title='VAR model Impulse Response Functions') 152 | 153 | ############################################### 154 | #### Refit the model on the entire dataset #### 155 | ############################################### 156 | y_train = ts_df.iloc[:, [0, self.best_d]] 157 | self.refit(ts_df=y_train) 158 | 159 | # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds 160 | return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2 161 | 162 | def predict( 163 | self, 164 | testdata: Optional[pd.DataFrame] = None, 165 | forecast_period: Optional[int] = None, 166 | simple: bool = True 167 | ) -> NDFrame: 168 | """ 169 | Return the predictions 170 | """ 171 | 172 | if testdata is not None: 173 | warnings.warn( 174 | "You have passed exogenous variables to make predictions for a VAR model. " + 175 | "VAR model will predict all exogenous variables automatically, " + 176 | "hence your passed values will not be used." 177 | ) 178 | if isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series): 179 | if len(testdata) != self.forecast_period: 180 | self.forecast_period = testdata.shape[0] 181 | elif isinstance(testdata, int): 182 | self.forecast_period = testdata 183 | 184 | forecast_period = self.forecast_period 185 | 186 | # Extract the dynamic predicted and true values of our time series 187 | if forecast_period is None: 188 | # use the forecast period used during training 189 | forecast_period = self.forecast_period 190 | 191 | # y_forecasted = self.model.forecast(forecast_period) 192 | 193 | res = self.model.get_forecast(forecast_period) 194 | res_frame = res.summary_frame() 195 | 196 | res_frame.rename(columns={'mean':'yhat'},inplace=True) 197 | 198 | if simple: 199 | res_frame = res_frame['yhat'] 200 | res_frame = res_frame.squeeze() # Convert to a pandas series object 201 | else: 202 | # Pass as is 203 | pass 204 | 205 | return res_frame 206 | 207 | 208 | def find_best_parameters(self, data: pd.DataFrame): 209 | """ 210 | Given a dataset, finds the best parameters using the settings in the class 211 | """ 212 | #### dmax here means the column number of the data frame: it serves as a placeholder for columns 213 | dmax = data.shape[1] 214 | ############################################################################################### 215 | cols = data.columns.tolist() 216 | # TODO: #14 Make sure that we have a way to not rely on column order to determine the target 217 | # It is assumed that the first column of the dataframe is the target variable #### 218 | ### make sure that is the case before doing this program #################### 219 | i = 1 220 | results_dict = {} 221 | 222 | for d_val in range(1, dmax): 223 | # Takes the target column and one other endogenous column at a time 224 | # and makes a prediction based on that. Then selects the best 225 | # exogenous column at the end. 
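            # Note: `d_val` here is a column position, not a differencing order.
            # The `best_d` recorded at the end of this search is therefore the
            # index of the companion column that, paired with the target
            # (column 0), produced the lowest value of the scoring criterion.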
226 | y_train = data.iloc[:, [0, d_val]] 227 | print('\nAdditional Variable in VAR model = %s' % cols[d_val]) 228 | info_criteria = pd.DataFrame( 229 | index=['AR{}'.format(i) for i in range(0, self.p_max+1)], 230 | columns=['MA{}'.format(i) for i in range(0, self.q_max+1)] 231 | ) 232 | for p_val, q_val in itertools.product(range(0, self.p_max+1), range(0, self.q_max+1)): 233 | if p_val == 0 and q_val == 0: 234 | info_criteria.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = np.nan 235 | print(' Iteration %d completed' % i) 236 | i += 1 237 | else: 238 | try: 239 | model = VARMAX(y_train, order=(p_val, q_val), trend='c') 240 | model = model.fit(max_iter=1000, disp=False) 241 | info_criteria.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = eval('model.' + self.scoring) 242 | print(' Iteration %d completed' % i) 243 | i += 1 244 | except Exception: 245 | i += 1 246 | print(' Iteration %d completed' % i) 247 | info_criteria = info_criteria[info_criteria.columns].astype(float) 248 | interim_d = copy.deepcopy(d_val) 249 | interim_p, interim_q, interim_bic = find_lowest_pq(info_criteria) 250 | if self.verbose == 1: 251 | _, axis = plt.subplots(figsize=(20, 10)) 252 | axis = sns.heatmap( 253 | info_criteria, 254 | mask=info_criteria.isnull(), 255 | ax=axis, 256 | annot=True, 257 | fmt='.0f' 258 | ) 259 | axis.set_title(self.scoring) 260 | results_dict[str(interim_p) + ' ' + str(interim_d) + ' ' + str(interim_q)] = interim_bic 261 | best_bic = min(results_dict.items(), key=operator.itemgetter(1))[1] 262 | best_pdq = min(results_dict.items(), key=operator.itemgetter(1))[0] 263 | self.best_p = int(best_pdq.split(' ')[0]) 264 | self.best_d = int(best_pdq.split(' ')[1]) 265 | self.best_q = int(best_pdq.split(' ')[2]) 266 | 267 | print('Best variable selected for VAR: %s' % data.columns.tolist()[self.best_d]) 268 | 269 | def refit(self, ts_df: pd.DataFrame) -> object: 270 | """ 271 | Refits an already trained model using a new dataset 272 | Useful when fitting to the full data after testing with cross validation 273 | :param ts_df The time series data to be used for fitting the model 274 | :type ts_df pd.DataFrame 275 | :rtype object 276 | """ 277 | bestmodel = self.get_best_model(ts_df) 278 | print('Refitting data with previously found best parameters') 279 | try: 280 | self.model = bestmodel.fit(disp=False) 281 | print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring))) 282 | except Exception as exception: 283 | print(exception) 284 | 285 | return self 286 | 287 | 288 | def get_best_model(self, data: pd.DataFrame): 289 | """ 290 | Returns the 'unfit' SARIMAX model with the given dataset and the 291 | selected best parameters. This can be used to fit or refit the model. 292 | """ 293 | bestmodel = VARMAX(data, order=(self.best_p, self.best_q), trend='c') 294 | return bestmodel 295 | -------------------------------------------------------------------------------- /auto_ts/models/ar_based/param_finder.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | import operator 4 | 5 | import matplotlib.pyplot as plt # type: ignore 6 | import numpy as np # type: ignore 7 | import pandas as pd # type: ignore 8 | import seaborn as sns # type: ignore 9 | 10 | # This gives an error when running from a python script. 11 | # Maybe, this should be set in the jupyter notebook directly. 
12 | # get_ipython().magic('matplotlib inline') 13 | sns.set(style="white", color_codes=True) 14 | # imported SARIMAX from statsmodels pkg for find_best_pdq_or_PDQ 15 | from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore 16 | 17 | 18 | def find_lowest_pq(df): 19 | """ 20 | This is an auto-ARIMA function that iterates through parameters pdq and finds the best 21 | based on aan eval metric sent in as input. 22 | 23 | This finds the row and column numbers of the lowest or highest value in a dataframe. All it needs is numeric values. 24 | It will return the row and column together as a string, you will have to split it into two. 25 | It will also return the lowest value in the dataframe by default but you can change it to "max". 26 | """ 27 | dicti = {} 28 | for ma in list(df): 29 | try: 30 | dicti[ma + ' ' + df[ma].idxmin()] = df[ma].sort_values()[0] 31 | except: 32 | pass 33 | lowest_bic = min(dicti.items(), key=operator.itemgetter(1))[1] 34 | lowest_pq = min(dicti.items(), key=operator.itemgetter(1))[0] 35 | ma_q = int(lowest_pq.split(' ')[0][2:]) 36 | ar_p = int(lowest_pq.split(' ')[1][2:]) 37 | print(' Best AR order p = %d, MA order q = %d, Interim metric = %0.3f' % (ar_p, ma_q, lowest_bic)) 38 | return ar_p, ma_q, lowest_bic 39 | 40 | 41 | def find_best_pdq_or_PDQ(ts_df, scoring, p_max, d_max, q_max, non_seasonal_pdq, 42 | seasonal_period, seasonality=False, verbose=0): 43 | p_min = 0 44 | d_min = 0 45 | q_min = 0 46 | if seasonality: 47 | ns_p = non_seasonal_pdq[0] 48 | ns_d = non_seasonal_pdq[1] 49 | ns_q = non_seasonal_pdq[2] 50 | # Initialize a DataFrame to store the results 51 | iteration = 0 52 | results_dict = {} 53 | seasonality_dict = {} 54 | for d_val in range(d_min, d_max+1): 55 | print(f"\nDifferencing = {d_val} with Seasonality = {seasonality}") 56 | results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max+1)], 57 | columns=['MA{}'.format(i) for i in range(q_min, q_max+1)]) 58 | for p_val, q_val in itertools.product(range(p_min,p_max+1), range(q_min, q_max+1)): 59 | if p_val == 0 and d_val == 0 and q_val == 0: 60 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = np.nan 61 | continue 62 | try: 63 | if seasonality: 64 | # In order to get forecasts to be in the same value ranges of the 65 | # orig_endogs, you must set the simple_differencing = False and 66 | # the start_params to be the same as ARIMA. 67 | # That is the only way to ensure that the output of this 68 | # model is comparable to other ARIMA models 69 | 70 | model = SARIMAX( 71 | ts_df, 72 | order=(ns_p, ns_d, ns_q), 73 | seasonal_order=(p_val, d_val, q_val, seasonal_period), 74 | enforce_stationarity=False, 75 | enforce_invertibility=False, 76 | trend='ct', 77 | start_params=[0, 0, 0, 1], 78 | simple_differencing=False 79 | ) 80 | else: 81 | model = SARIMAX( 82 | ts_df, 83 | order=(p_val, d_val, q_val), 84 | enforce_stationarity=False, 85 | enforce_invertibility=False, 86 | trend='ct', 87 | start_params=[0, 0, 0, 1], 88 | simple_differencing=False 89 | ) 90 | 91 | results = model.fit(disp=False) 92 | 93 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = eval('results.' + scoring) 94 | if iteration % 10 == 0: 95 | print(' Iteration %d completed...' 
% iteration) 96 | iteration += 1 97 | elif iteration >= 100: 98 | print(' Ending Iterations at %d' % iteration) 99 | break 100 | except: 101 | iteration += 1 102 | continue 103 | results_bic = results_bic[results_bic.columns].astype(float) 104 | 105 | # # TODO: Print if needed 106 | # print("Inside find_best_pdq_or_PDQ --> results_bic") 107 | # print(results_bic) 108 | 109 | interim_d = d_val 110 | if results_bic.isnull().all().all(): 111 | print(' D = %d results in an empty ARMA set. Setting Seasonality to False since model might overfit' %d_val) 112 | #### Set Seasonality to False if this empty condition happens repeatedly #### 113 | seasonality_dict[d_val] = False 114 | # TODO: This should not be set to False for all future d values, but without this ARIMA is giving large errors (overfitting) 115 | seasonality = False 116 | continue 117 | else: 118 | seasonality_dict[d_val] = True 119 | # TODO: This should not be set to False for all future d values, but without this ARIMA is giving large errors (overfitting) 120 | seasonality = True 121 | interim_p, interim_q, interim_bic = find_lowest_pq(results_bic) 122 | if verbose == 1: 123 | _, ax = plt.subplots(figsize=(20, 10)) 124 | ax = sns.heatmap(results_bic, mask=results_bic.isnull(), ax=ax, annot=True, fmt='.0f') 125 | ax.set_title(scoring) 126 | results_dict[str(interim_p)+' '+str(interim_d)+' '+str(interim_q)] = interim_bic 127 | try: 128 | best_bic = min(results_dict.items(), key=operator.itemgetter(1))[1] 129 | best_pdq = min(results_dict.items(), key=operator.itemgetter(1))[0] 130 | best_p = int(best_pdq.split(' ')[0]) 131 | best_d = int(best_pdq.split(' ')[1]) 132 | best_q = int(best_pdq.split(' ')[2]) 133 | except: 134 | best_p = copy.deepcopy(p_val) 135 | best_q = copy.deepcopy(q_val) 136 | best_d = copy.deepcopy(d_val) 137 | best_bic = 0 138 | 139 | # # TODO: Print if needed 140 | # print(f"Seasonal Dictionary: {seasonality_dict}") 141 | 142 | # return best_p, best_d, best_q, best_bic, seasonality 143 | return best_p, best_d, best_q, best_bic, seasonality_dict.get(best_d) 144 | -------------------------------------------------------------------------------- /auto_ts/models/build_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, List 3 | 4 | import pandas as pd # type: ignore 5 | from pandas.core.generic import NDFrame # type:ignore 6 | 7 | 8 | class BuildBase(ABC): 9 | """ 10 | Base Class for Building a model 11 | """ 12 | 13 | def __init__(self, scoring: str, forecast_period: int, verbose: int, 14 | **kwargs 15 | ): 16 | self.scoring = scoring 17 | self.forecast_period = forecast_period 18 | self.verbose = verbose 19 | self.kwargs = kwargs 20 | self.model = None 21 | self.original_target_col: str = "" 22 | self.original_preds: List[str] = [] 23 | self.strf_time_format: str = "" 24 | self.num_boost_rounds: int = 250 25 | 26 | 27 | @abstractmethod 28 | def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int] = None) -> object: 29 | """ 30 | Fits the model to the data 31 | 32 | :param ts_df The time series data to be used for fitting the model 33 | :type ts_df pd.DataFrame 34 | 35 | :param target_col The column name of the target time series that needs to be modeled. 36 | All other columns will be considered as exogenous variables (if applicable to method) 37 | :type target_col str 38 | 39 | :param cv: Number of folds to use for cross validation. 
40 | Number of observations in the Validation set for each fold = forecast period 41 | If None, a single fold is used 42 | :type cv Optional[int] 43 | 44 | :rtype object 45 | """ 46 | 47 | 48 | @abstractmethod 49 | def refit(self, ts_df: pd.DataFrame) -> object: 50 | """ 51 | Refits an already trained model using a new dataset 52 | Useful when fitting to the full data after testing with cross validation 53 | :param ts_df The time series data to be used for fitting the model 54 | :type ts_df pd.DataFrame 55 | :rtype object 56 | """ 57 | 58 | @abstractmethod 59 | def predict( 60 | self, 61 | testdata: Optional[pd.DataFrame]=None, 62 | forecast_period: Optional[int] = None, 63 | simple: bool = True) -> NDFrame: 64 | """ 65 | Return the predictions 66 | :param testdata The test dataframe containing the exogenous varaiables to be used for predicton. 67 | :type testdata Optional[pd.DataFrame] 68 | :param forecast_period The number of periods to make a prediction for. 69 | :type forecast_period Optional[int] 70 | :param simple If True, this method just returns the predictions. 71 | If False, it will return the standard error, lower and upper confidence interval (if available) 72 | :type simple bool 73 | :rtype NDFrame 74 | """ 75 | 76 | def check_model_built(self): 77 | if self.model is None: 78 | raise AttributeError( 79 | "You are trying to perform an operation that requires the model to have been fit."+ 80 | "However the model has not been fit yet. Please fit the model once before you try this operation." 81 | ) 82 | 83 | def get_num_folds_from_cv(self, cv): 84 | if cv is None: 85 | NFOLDS = 2 86 | elif cv == 0: 87 | NFOLDS = 2 88 | else: 89 | NFOLDS = cv 90 | 91 | return NFOLDS 92 | -------------------------------------------------------------------------------- /auto_ts/models/build_pyflux.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import operator 3 | 4 | import numpy as np # type: ignore 5 | import pandas as pd # type: ignore 6 | 7 | # helper functions 8 | from ..utils import print_static_rmse, print_dynamic_rmse 9 | 10 | 11 | ######################################################### 12 | def build_pyflux_model(df, target, ar=3, ma=3,integ=1, forecast_period=2, 13 | fitmethod='MLE', nsims=100, score_type='rmse', verbose=0): 14 | """ 15 | Build a quick pyflux model with default parameters for AR, MA and I terms in ARIMA. 16 | You can build a rolling forecast using the rolling_forecast parameter. 17 | PyFlux is a fiendishly complicated program with very poor documentation. 
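    A typical call might look like this (illustrative only; the 'Sales' column is hypothetical):
        model, forecast_df, rmse, norm_rmse = build_pyflux_model(df, target='Sales', ar=3, ma=3,
                                                                  integ=1, forecast_period=8,
                                                                  fitmethod='MLE', score_type='rmse')
    If pyflux is not importable, the function prints a message and returns the string 'error'
    four times instead of a fitted model.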
18 | I had to dig deep into the API to figure these things out especially the 19 | """ 20 | try: 21 | # imported pyflux pkg 22 | import pyflux as pf # type: ignore 23 | except: 24 | print('Pyflux is not installed - hence not running PyFlux model') 25 | return 'error','error','error','error' 26 | ts_df = df[:] 27 | ############################################################################## 28 | ts_train = ts_df[:-forecast_period] 29 | ts_test = ts_df[-forecast_period:] 30 | if verbose == 1: 31 | print('Data Set split into train %s and test %s for Cross Validation Purposes' 32 | % (ts_train.shape, ts_test.shape)) 33 | ##################################################################################################### 34 | if integ > 1: 35 | print(' Setting "integration"=1 since differenced predictions > 1 are difficult to interpret') 36 | integ = 1 37 | if fitmethod == 'M-H': 38 | print(' Assuming number of simulations = %d' % nsims) 39 | #################################################################################################### 40 | ###### define p,d,q parameters here #################### 41 | p = range(0, ar+1) 42 | q = range(0, ma+1) 43 | d = range(0, integ+1) ### dont do much more than 1 differencing in PyFlux models since its hard to undo 44 | #### Generate all different combinations of p,d,q triplets ###### 45 | pdq = list(itertools.product(p, d, q)) 46 | eval_metrics = {} 47 | print('Cycling through various (p,d,q) parameters') 48 | for param in pdq: 49 | if verbose == 1: 50 | print('.', end="") 51 | model = pf.ARIMA(data=ts_train, ar=param[0], integ=param[1], ma=param[2], target=target) 52 | try: 53 | if fitmethod == 'MLE': 54 | x = model.fit() 55 | elif fitmethod == 'M-H': 56 | x = model.fit('M-H', nsims=nsims) 57 | except: 58 | x = model.fit('MLE') 59 | mu, actuals = model._model(model.latent_variables.get_z_values()) 60 | predicted = model.link(mu) 61 | rmse, norm_rmse = print_static_rmse(actuals,predicted) 62 | if score_type == 'rmse': 63 | eval_metrics[param] = rmse 64 | else: 65 | eval_metrics[param] = norm_rmse 66 | bestpdq = min(eval_metrics.items(), key=operator.itemgetter(1))[0] 67 | print('\nBest Params Selected (based on %s): %s' % (score_type, bestpdq)) 68 | bestmodel = pf.ARIMA(data=ts_train, ar=bestpdq[0], integ=bestpdq[1], ma=bestpdq[2], target=target) 69 | x = bestmodel.fit() 70 | if verbose == 1: 71 | bestmodel.plot_fit(figsize=(15, 5)) 72 | #model.plot_predict_is(h=forecast_period,fit_once=False,fit_method=fitmethod) 73 | if verbose == 1: 74 | x.summary() 75 | n = int(0.5*len(df)) 76 | bestmodel.plot_predict(h=forecast_period, past_values=n, intervals=True, figsize=(15, 5)) 77 | forecast_df = bestmodel.predict(forecast_period, intervals=True) 78 | mu, actuals = bestmodel._model(bestmodel.latent_variables.get_z_values()) 79 | predicted = bestmodel.link(mu) 80 | print('Dynamic %d-period Forecasts:' % forecast_period) 81 | if bestpdq[1] == 1: 82 | mod_target = 'Differenced ' + target 83 | res = restore_differenced_predictions(ts_test[target].values, forecast_df[mod_target], 84 | ts_train[target][-1:]) 85 | rmse, norm_rmse = print_dynamic_rmse(ts_test[target].values, res, ts_train[target]) 86 | else: 87 | rmse, norm_rmse = print_dynamic_rmse(ts_test[target].values,forecast_df[target].values, ts_train[target]) 88 | return bestmodel, forecast_df, rmse, norm_rmse 89 | 90 | 91 | def restore_differenced_predictions(actuals, predicted, start_value, func=None, periods=1, diff_yes=True): 92 | try: 93 | restored = pd.Series(index=start_value.index) 94 | 
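        # Sketch of the intent of the steps below (an assumed reading of the original logic):
        # seed the series with the last observed training value(s), overlay the differenced
        # forecasts, then cumulative-sum back to the original level, roughly
        # restored_t = start_value + sum(predicted_1..t).
        # Note that the .ix indexer used here was removed in pandas 1.0; .loc is the modern equivalent.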
restored.ix[start_value.ix[:periods].index] = start_value.values[:periods] 95 | rest = restored.ix[predicted.index] 96 | restored = pd.Series(np.r_[restored, rest], index=np.r_[start_value.index, rest.index]) 97 | restored.ix[predicted.index] = predicted.values 98 | restored = restored[(periods-1):].cumsum() 99 | if func: 100 | restored = eval('np.' + func + '(restored)') 101 | return restored[periods:] 102 | except: 103 | restored = start_value.values+predicted 104 | if func: 105 | restored = eval('np.' + func + '(restored)') 106 | return restored 107 | -------------------------------------------------------------------------------- /auto_ts/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/auto_ts/py.typed -------------------------------------------------------------------------------- /auto_ts/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/auto_ts/test/__init__.py -------------------------------------------------------------------------------- /auto_ts/test/test_auto_sarimax.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit Tests for BuildAutoSarimax 3 | 4 | ---------------------- 5 | Total Combinations: 8 6 | ---------------------- 7 | Seasonality: Seasonal, Non-Seasonal (2) 8 | Univariate, Multivariate (2) 9 | CV: Yes, No (2) 10 | """ 11 | 12 | import unittest 13 | 14 | import numpy as np # type: ignore 15 | import pandas as pd # type: ignore 16 | from pandas.testing import assert_frame_equal # type: ignore 17 | from pandas.testing import assert_series_equal # type: ignore 18 | from statsmodels.tsa.statespace.sarimax import SARIMAXResultsWrapper # type: ignore 19 | 20 | 21 | class TestAutoSarimax(unittest.TestCase): 22 | 23 | def setUp(self): 24 | # Pre Release 25 | import sys 26 | import os 27 | sys.path.append(os.environ['DEV_AUTOTS']) 28 | import pandas as pd # type: ignore 29 | 30 | datapath = 'example_datasets/' 31 | filename1 = 'Sales_and_Marketing.csv' 32 | dft = pd.read_csv(datapath+filename1,index_col=None) 33 | 34 | self.ts_column = 'Time Period' 35 | self.sep = ',' 36 | self.target = 'Sales' 37 | self.preds = [x for x in list(dft) if x not in [self.ts_column, self.target]] # Exogenous variable names 38 | 39 | self.train_multivar = dft[:40] 40 | self.test_multivar = dft[40:] 41 | 42 | self.train_univar = dft[:40][[self.ts_column, self.target]] 43 | self.test_univar = dft[40:][[self.ts_column, self.target]] 44 | 45 | self.forecast_period = 8 46 | 47 | self.expected_pred_col_names = np.array(['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper']) 48 | 49 | ######################## 50 | #### Golden Results #### 51 | ######################## 52 | 53 | # TODO: Add to each individual test 54 | ## For each of the 8 combinations, we need the following 55 | # Internal Validation results (for each fold) 56 | # Internal Validation RMSE (overall and for each fold) 57 | 58 | # External Test results (various combinations of prediction windows - same as forecast period OR not same) 59 | # External Test RMSE 60 | 61 | 62 | def test_seasonal_univar_noCV(self): 63 | """ 64 | Test 1: Seasonal Univariate Without CV 65 | """ 66 | pass 67 | 68 | def test_seasonal_univar_CV(self): 69 | """ 70 | Test 2: Seasonal Univariate With CV 71 | """ 72 | pass 73 | 74 | def 
test_seasonal_multivar_noCV(self): 75 | """ 76 | Test 3: Seasonal Multivariate Without CV 77 | """ 78 | pass 79 | 80 | def test_seasonal_multivar_CV(self): 81 | """ 82 | Test 4: Seasonal Multivariate With CV 83 | """ 84 | pass 85 | 86 | def test_nonseasonal_univar_noCV(self): 87 | """ 88 | Test 5: Non Seasonal Univariate Without CV 89 | """ 90 | pass 91 | 92 | def test_nonseasonal_univar_CV(self): 93 | """ 94 | Test 6: Non Seasonal Univariate With CV 95 | """ 96 | pass 97 | 98 | def test_nonseasonal_multivar_noCV(self): 99 | """ 100 | Test 7: Non Seasonal Multivariate Without CV 101 | """ 102 | pass 103 | 104 | def test_nonseasonal_multivar_CV(self): 105 | """ 106 | Test 8: Non Seasonal Multivariate With CV 107 | """ 108 | pass 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /auto_ts/test/test_var.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit Tests for VAR Models 3 | 4 | ---------------------- 5 | Total Combinations: 4 6 | ---------------------- 7 | Seasonality: NA 8 | Univariate, Multivariate: Simple Independent Test for Univariate (1) 9 | CV: Yes, No (2) 10 | """ 11 | 12 | import math 13 | import os 14 | import sys 15 | import unittest 16 | 17 | import numpy as np # type: ignore 18 | import pandas as pd # type: ignore 19 | from pandas.testing import assert_frame_equal # type: ignore 20 | from pandas.testing import assert_series_equal # type: ignore 21 | from statsmodels.tsa.statespace.sarimax import SARIMAXResultsWrapper # type: ignore 22 | 23 | sys.path.append(os.environ['DEV_AUTOTS']) 24 | from auto_ts import auto_timeseries as ATS 25 | 26 | class TestVAR(unittest.TestCase): 27 | 28 | def setUp(self): 29 | # Pre Release 30 | 31 | 32 | datapath = 'example_datasets/' 33 | filename1 = 'Sales_and_Marketing.csv' 34 | dft = pd.read_csv(datapath + filename1, index_col = None) 35 | 36 | self.ts_column = 'Time Period' 37 | self.sep = ',' 38 | self.target = 'Sales' 39 | self.preds = [x for x in list(dft) if x not in [self.ts_column, self.target]] # Exogenous variable names 40 | 41 | self.train_multivar = dft[:40] 42 | self.test_multivar = dft[40:] 43 | 44 | self.train_univar = dft[:40][[self.ts_column, self.target]] 45 | self.test_univar = dft[40:][[self.ts_column, self.target]] 46 | 47 | self.forecast_period = 8 48 | 49 | self.expected_pred_col_names = np.array(['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper']) 50 | 51 | ######################## 52 | #### Golden Results #### 53 | ######################## 54 | 55 | # TODO: Add to each individual test 56 | ## For each of the 8 combinations, we need the following 57 | # Internal Validation results (for each fold) 58 | # Internal Validation RMSE (overall and for each fold) 59 | 60 | # External Test results (various combinations of prediction windows - same as forecast period OR not same) 61 | # External Test RMSE 62 | 63 | 64 | ############################ 65 | #### VAR Golden Results #### 66 | ############################ 67 | 68 | #### UNIVARIATE #### 69 | self.forecast_gold_var_univar = None 70 | self.rmse_gold_var_univar = math.inf 71 | self.forecast_gold_var_univar_series = None 72 | self.forecast_gold_var_univar_series_10 = None 73 | 74 | #### MULTIVARIATE #### 75 | 76 | # Internal (to AutoML) validation set results 77 | self.forecast_gold_var_multivar_internal_val_cv_fold1 = np.array([ 78 | 510.302336, 531.109224, 536.878513, 534.311164, 79 | 529.305887, 525.199071, 523.015255, 522.445215 80 | ]) 81 | 82 | 
self.forecast_gold_var_multivar_internal_val_cv_fold2 = np.array([ 83 | 741.377909, 676.233419, 615.538721, 571.797729, 84 | 546.952783, 537.342231, 537.474487, 542.307393 85 | ]) 86 | 87 | self.rmse_gold_var_multivar_cv_fold1 = 155.21757611 88 | self.rmse_gold_var_multivar_cv_fold2 = 112.4770318 # Without CV gets this result 89 | 90 | ## External Test Set results 91 | results = [ 92 | 675.899931, 622.204059, 578.38291, 553.067517, 93 | 543.612945, 543.696406, 547.604403, 551.762352 94 | ] 95 | index = pd.to_datetime([ 96 | '2014-05-01', '2014-06-01', '2014-07-01', '2014-08-01', 97 | '2014-09-01', '2014-10-01', '2014-11-01', '2014-12-01' 98 | ]) 99 | self.forecast_gold_var_multivar = np.array(results) 100 | 101 | self.forecast_gold_var_multivar_series = pd.Series(data=results, index=index) 102 | self.forecast_gold_var_multivar_series.name = 'mean' 103 | 104 | results = results + [554.643756, 556.055009] 105 | index = pd.to_datetime([ 106 | '2014-05-01', '2014-06-01', '2014-07-01', '2014-08-01', 107 | '2014-09-01', '2014-10-01', '2014-11-01', '2014-12-01', 108 | '2015-01-01', '2015-02-01' 109 | ]) 110 | self.forecast_gold_var_multivar_series_10 = pd.Series(data=results, index=index) 111 | self.forecast_gold_var_multivar_series_10.name = 'mean' 112 | 113 | 114 | def test_noCV(self): 115 | """ 116 | Test 1: VAR without CV 117 | """ 118 | print("\n\n" + "*"*50) 119 | print("Performing Unit Test: 'test_noCV'") 120 | print("*"*50 + "\n\n") 121 | 122 | automl_model = ATS( 123 | score_type='rmse', forecast_period=self.forecast_period, time_interval='Month', 124 | non_seasonal_pdq=None, seasonality=False, seasonal_period=12, 125 | model_type='VAR', 126 | verbose=0) 127 | automl_model.fit( 128 | traindata=self.train_multivar, 129 | ts_column=self.ts_column, 130 | target=self.target, 131 | cv=None, 132 | sep=self.sep) 133 | 134 | ml_dict = automl_model.get_ml_dict() 135 | 136 | ###################### 137 | ## External Results ## 138 | ###################### 139 | 140 | # Simple forecast with forecast window = the one used in training 141 | # Using named model 142 | test_predictions = automl_model.predict( 143 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR 144 | forecast_period=self.forecast_period, 145 | model="VAR" 146 | ) 147 | assert_series_equal( 148 | test_predictions['mean'].round(6), 149 | self.forecast_gold_var_multivar_series 150 | ) 151 | 152 | # Simple forecast with forecast window != the one used in training 153 | # Using named model 154 | test_predictions = automl_model.predict( 155 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR 156 | forecast_period=10, 157 | model="VAR" 158 | ) 159 | assert_series_equal(test_predictions['mean'].round(6), self.forecast_gold_var_multivar_series_10) 160 | 161 | # Complex forecasts (returns confidence intervals, etc.) 
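        # With simple=False the prediction is expected to include the standard error and
        # confidence-interval columns ['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper']
        # (compared against self.expected_pred_col_names from setUp below).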
162 | test_predictions = automl_model.predict( 163 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR 164 | forecast_period=self.forecast_period, 165 | model="VAR", 166 | simple=False 167 | ) 168 | self.assertIsNone( 169 | np.testing.assert_array_equal( 170 | test_predictions.columns.values, self.expected_pred_col_names 171 | ) 172 | ) 173 | 174 | ################### 175 | ## ML Dictionary ## 176 | ################### 177 | self.assertIsNone( 178 | np.testing.assert_array_equal( 179 | np.round(ml_dict.get('VAR').get('forecast')[0]['mean'].values.astype(np.double), 6), 180 | self.forecast_gold_var_multivar_internal_val_cv_fold2 181 | ), 182 | "(Multivar Test) VAR Forecast does not match up with expected values." 183 | ) 184 | 185 | self.assertEqual( 186 | round(ml_dict.get('VAR').get('rmse')[0], 8), self.rmse_gold_var_multivar_cv_fold2, 187 | "(Multivar Test) VAR RMSE does not match up with expected values.") 188 | 189 | def test_CV(self): 190 | """ 191 | Test 2: VAR with CV 192 | """ 193 | print("\n\n" + "*"*50) 194 | print("Performing Unit Test: 'test_CV'") 195 | print("*"*50 + "\n\n") 196 | 197 | automl_model = ATS( 198 | score_type='rmse', forecast_period=self.forecast_period, time_interval='Month', 199 | non_seasonal_pdq=None, seasonality=False, seasonal_period=12, 200 | model_type='VAR', 201 | verbose=0) 202 | automl_model.fit( 203 | traindata=self.train_multivar, 204 | ts_column=self.ts_column, 205 | target=self.target, 206 | cv=2, 207 | sep=self.sep) 208 | 209 | ml_dict = automl_model.get_ml_dict() 210 | 211 | ###################### 212 | ## External Results ## 213 | ###################### 214 | 215 | # Simple forecast with forecast window = the one used in training 216 | # Using named model 217 | test_predictions = automl_model.predict( 218 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR 219 | forecast_period=self.forecast_period, 220 | model="VAR" 221 | ) 222 | assert_series_equal( 223 | test_predictions['mean'].round(6), 224 | self.forecast_gold_var_multivar_series 225 | ) 226 | 227 | # Simple forecast with forecast window != the one used in training 228 | # Using named model 229 | test_predictions = automl_model.predict( 230 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR 231 | forecast_period=10, 232 | model="VAR" 233 | ) 234 | assert_series_equal(test_predictions['mean'].round(6), self.forecast_gold_var_multivar_series_10) 235 | 236 | # Complex forecasts (returns confidence intervals, etc.) 237 | test_predictions = automl_model.predict( 238 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR 239 | forecast_period=self.forecast_period, 240 | model="VAR", 241 | simple=False 242 | ) 243 | self.assertIsNone( 244 | np.testing.assert_array_equal( 245 | test_predictions.columns.values, self.expected_pred_col_names 246 | ) 247 | ) 248 | 249 | ################### 250 | ## ML Dictionary ## 251 | ################### 252 | self.assertIsNone( 253 | np.testing.assert_array_equal( 254 | np.round(ml_dict.get('VAR').get('forecast')[0]['mean'].values.astype(np.double), 6), 255 | self.forecast_gold_var_multivar_internal_val_cv_fold1, 256 | 257 | ), 258 | "(Multivar Test) VAR Forecast does not match up with expected values." 
259 | ) 260 | self.assertIsNone( 261 | np.testing.assert_array_equal( 262 | np.round(ml_dict.get('VAR').get('forecast')[1]['mean'].values.astype(np.double), 6), 263 | self.forecast_gold_var_multivar_internal_val_cv_fold2 264 | ), 265 | "(Multivar Test) VAR Forecast does not match up with expected values." 266 | ) 267 | 268 | self.assertEqual( 269 | round(ml_dict.get('VAR').get('rmse')[0], 8), self.rmse_gold_var_multivar_cv_fold1, 270 | "(Multivar Test) VAR RMSE does not match up with expected values.") 271 | self.assertEqual( 272 | round(ml_dict.get('VAR').get('rmse')[1], 8), self.rmse_gold_var_multivar_cv_fold2, 273 | "(Multivar Test) VAR RMSE does not match up with expected values.") 274 | 275 | 276 | def test_univar(self): 277 | """ 278 | Test 3: Univariate VAR 279 | """ 280 | print("\n\n" + "*"*50) 281 | print("Performing Unit Test: 'test_univar'") 282 | print("*"*50 + "\n\n") 283 | 284 | automl_model = ATS( 285 | score_type='rmse', forecast_period=self.forecast_period, time_interval='Month', 286 | non_seasonal_pdq=None, seasonality=False, seasonal_period=12, 287 | model_type='VAR', 288 | verbose=0) 289 | automl_model.fit( 290 | traindata=self.train_univar, 291 | ts_column=self.ts_column, 292 | target=self.target, 293 | cv=None 294 | ) 295 | ml_dict = automl_model.get_ml_dict() 296 | 297 | self.assertIsNone(automl_model.get_model_build('VAR'), "Expected Univar VAR model to be None but did not get None.") 298 | 299 | # Simple forecast with forecast window = one used in training 300 | # Using named model 301 | test_predictions = automl_model.predict( 302 | forecast_period=self.forecast_period, 303 | model="VAR" 304 | ) 305 | self.assertIsNone(test_predictions) 306 | 307 | # Simple forecast with forecast window != one used in training 308 | # Using named model 309 | test_predictions = automl_model.predict( 310 | forecast_period=10, 311 | model="VAR" 312 | ) 313 | self.assertIsNone(test_predictions) 314 | 315 | # Complex forecasts (returns confidence intervals, etc.) 316 | test_predictions = automl_model.predict( 317 | forecast_period=self.forecast_period, 318 | model="VAR", 319 | simple=False 320 | ) 321 | self.assertIsNone(test_predictions) 322 | 323 | ################### 324 | ## ML Dictionary ## 325 | ################### 326 | self.assertEqual( 327 | ml_dict.get('VAR').get('forecast'), self.forecast_gold_var_univar, 328 | "(Univar Test) VAR Forecast does not match up with expected values." 
329 | ) 330 | 331 | self.assertEqual( 332 | round(ml_dict.get('VAR').get('rmse'), 8), self.rmse_gold_var_univar, 333 | "(Univar Test) VAR RMSE does not match up with expected values.") -------------------------------------------------------------------------------- /auto_ts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .colors import colorful 2 | from .eda import time_series_plot, top_correlation_to_name, test_stationarity 3 | from .etl import load_ts_data, convert_timeseries_dataframe_to_supervised, \ 4 | time_series_split, find_max_min_value_in_a_dataframe, left_subtract, \ 5 | change_to_datetime_index, change_to_datetime_index_test, reduce_mem_usage, load_test_data 6 | from .metrics import print_static_rmse, print_dynamic_rmse, print_normalized_rmse, \ 7 | print_ts_model_stats 8 | from .my_encoders import My_LabelEncoder, My_LabelEncoder_Pipe 9 | from .val import cross_validation_time_series, rolling_validation_time_series, \ 10 | ts_model_validation, quick_ts_plot 11 | -------------------------------------------------------------------------------- /auto_ts/utils/colors.py: -------------------------------------------------------------------------------- 1 | class colorful: 2 | PURPLE = '\033[95m' 3 | CYAN = '\033[96m' 4 | DARKCYAN = '\033[36m' 5 | BLUE = '\033[94m' 6 | GREEN = '\033[92m' 7 | YELLOW = '\033[93m' 8 | RED = '\033[91m' 9 | BOLD = '\033[1m' 10 | UNDERLINE = '\033[4m' 11 | END = '\033[0m' 12 | -------------------------------------------------------------------------------- /auto_ts/utils/eda.py: -------------------------------------------------------------------------------- 1 | import numpy as np # type: ignore 2 | import numpy as np # type: ignore 3 | import pandas as pd # type: ignore 4 | import seaborn as sns # type: ignore 5 | from itertools import cycle 6 | import matplotlib.pyplot as plt 7 | 8 | # This gives an error when running from a python script. 9 | # Maybe, this should be set in the jupyter notebook directly. 10 | # get_ipython().magic('matplotlib inline') 11 | sns.set(style="white", color_codes=True) 12 | # TSA from Statsmodels 13 | import statsmodels.tsa.api as smt # type: ignore 14 | import dask 15 | from .colors import colorful 16 | def time_series_plot(y, lags=31, title='Original Time Series', chart_type='line', 17 | chart_freq='years'): 18 | """ 19 | Plot a Time Series along with how it will look after differencing and what its 20 | AR/MA lags will be by viewing the ACF and PACF, along with its histogram. 21 | You just need to provide the time series (y) as a Series. Index is assumed 22 | to be Pandas datetime. It assumes that you want to see default lags of 31. 23 | But you can modify it to suit. 24 | """ 25 | 26 | y = copy.deepcopy(y) 27 | if chart_freq in ['MS', 'M', 'SM', 'BM', 'CBM', 'SMS', 'BMS']: 28 | chart_time = 'months' 29 | elif chart_freq in ['D', 'B', 'C']: 30 | chart_time = 'days' 31 | elif chart_freq in ['W']: 32 | chart_time = 'weeks' 33 | elif chart_freq in ['Q', 'BQ', 'QS', 'BQS']: 34 | chart_time = 'quarters' 35 | elif chart_freq in ['A,Y', 'BA,BY', 'AS,YS', 'BAS,YAS']: 36 | chart_time = 'years' 37 | elif chart_freq in ['BH', 'H', 'h']: 38 | chart_time = 'hours' 39 | elif chart_freq in ['T,min']: 40 | chart_time = 'minutes' 41 | elif chart_freq in ['S', 'L,milliseconds', 'U,microseconds', 'N,nanoseconds']: 42 | chart_time = 'seconds' 43 | else: 44 | print('chart frequency not known. 
Continuing...') 45 | return 46 | colors = cycle('byrcmgkbyrcmgkbyrcmgkbyrcmgkbyr') 47 | fig = plt.figure(figsize=(20, 20)) 48 | grid = plt.GridSpec(3, 2, wspace=0.5, hspace=0.5) 49 | fig.subplots_adjust(hspace=1) 50 | ########## Use the gridspec function ############## 51 | ts_ax = plt.subplot(grid[0, 0:]) 52 | diff_ax = plt.subplot(grid[1, 0]) 53 | hist_ax = plt.subplot(grid[1, 1]) 54 | acf_ax = plt.subplot(grid[2, 0]) 55 | pacf_ax = plt.subplot(grid[2, 1]) 56 | ### Draw multiple kinds of graphs here to each subplot axis ### 57 | 58 | if type(y) == dask.dataframe.core.DataFrame or type(y) == dask.dataframe.core.Series: 59 | y = y.head(len(y)) ## this converts it into a pandas Series 60 | if chart_type == 'line': 61 | y.plot(ax=ts_ax, color=next(colors)) 62 | else: 63 | if chart_time == 'years': 64 | majors = mdates.YearLocator() # every year 65 | minors = mdates.MonthLocator() # every month 66 | majorsFmt = mdates.DateFormatter('%Y') 67 | elif chart_time == 'months': 68 | majors = mdates.YearLocator() # every year 69 | minors = mdates.MonthLocator() # every month 70 | majorsFmt = mdates.DateFormatter('\n\n\n%b\n%Y') 71 | elif chart_time == 'weeks': 72 | majors = mdates.MonthLocator() 73 | minors = mdates.WeekdayLocator(byweekday=(1), interval=1) 74 | majorsFmt = mdates.DateFormatter('\n\n\n%b\n%Y') 75 | elif chart_time == 'days': 76 | majors = mdates.DayLocator(bymonthday=None, interval=1, tz=None) 77 | minors = mdates.HourLocator(byhour=None, interval=1, tz=None) 78 | majorsFmt = mdates.DateFormatter('\n\n\n%d\n%b') 79 | else: 80 | majors = mdates.YearLocator() # every year 81 | minors = mdates.MonthLocator() # every month 82 | majorsFmt = mdates.DateFormatter('\n\n\n%b\n%Y') 83 | try: 84 | #### this works in most cases but in some cases, it gives an error 85 | ts_ax.bar(y.index, height=y, width=20, color=list((y>0).astype(int).map({1:'g',0:'r'}).values)) 86 | except: 87 | #### In some cases where y is a dataframe, this might work. 88 | yindex = y.index 89 | yvalues = y.values.ravel() 90 | ts_ax.bar(yindex, height=yvalues, width=20, color=list(using_where((yvalues>0).astype(int)).ravel())) 91 | ts_ax.xaxis.set_major_locator(majors) 92 | ts_ax.xaxis.set_major_formatter(majorsFmt) 93 | ts_ax.xaxis.set_minor_locator(minors) 94 | ts_ax.format_xdata = mdates.DateFormatter('%Y-%m-%d') 95 | ts_ax.grid(True) 96 | #### Now draw the ACF and PACF charts 97 | ts_ax.set_title(title) 98 | y.diff(1).plot(ax=diff_ax, color=next(colors)) 99 | diff_ax.set_title('After Differencing = 1') 100 | y.plot(ax=hist_ax, kind='hist', bins=25, color=next(colors)) 101 | hist_ax.set_title('Histogram for Original Series') 102 | try: 103 | if len(y) < lags: 104 | lags = int(len(y) - 1) 105 | smt.graphics.plot_acf(y, lags=lags, ax=acf_ax) 106 | acf_ax.set_title('ACF for Original Series') 107 | except: 108 | acf_ax.set_title('Data Error: Could not draw ACF for Original Series') 109 | try: 110 | ### the number of lags cannot be greater than 50% of len of y. So limit it. 
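        # Illustrative arithmetic: with 30 observations and the default lags=31, the ACF
        # branch above first reduces lags to 29, and this check then caps it at
        # int(30 * 0.5 - 1) = 14 before the PACF is drawn.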
111 | if lags >= len(y)*0.5: 112 | lags = int(len(y)*0.5 - 1) 113 | smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax) 114 | pacf_ax.set_title('PACF for Original Series') 115 | except: 116 | pacf_ax.set_title('Data Error: Could not draw PACF for Original Series') 117 | [ax.set_xlim(0) for ax in [acf_ax, pacf_ax]] 118 | plt.show(block=False) 119 | 120 | def using_where(x): 121 | return np.where(x == 1, 'g', 'r') 122 | ################################################################################# 123 | 124 | def top_correlation_to_name(stocks, column_name, searchstring, top=5): 125 | """ 126 | #################################################################################### 127 | This function draws a correlation chart of the top "x" rows of a data frame that are highly 128 | correlated to a selected row in the dataframe. You can think of the rows of the input 129 | dataframe as containing stock prices or fund flows or product sales and the columns should 130 | contain time series data of prices or flows or sales over multiple time periods. 131 | Now this program will allow you to select the top 5 or 10 rows that are highly correlated 132 | to a given row selected by the column: column_name and using a search string "searchstring". 133 | The program will search for the search string in that column column_name and return a list 134 | of 5 or 10 rows that are the most correlated to that selected row. If you give "top" as 135 | a float ratio then it will use the ratio as the cut off point in the correlation 136 | coefficient to select rows. 137 | #################################################################################### 138 | """ 139 | #### First increment top by 1 since you are asking for top X names in addition to the one you have, top += 1 140 | incl = [x for x in list(stocks) if x not in column_name] 141 | ### First drop all NA rows since they will mess up your correlations, stocks.dropna(inplace=True) 142 | if stocks.empty: 143 | print('After dropping NaNs, the data frame has become empty.') 144 | return 145 | ### Now find the highest correlated rows to the selected row ### 146 | try: 147 | index_val = search_string(stocks, column_name,searchstring).index[0] 148 | except: 149 | print('Not able to find the search string in the column.') 150 | return 151 | ### Bring that selected Row to the top of the Data Frame 152 | df = stocks[:] 153 | # TODO: Undefined variable 'l' 154 | df["new"] = range(l, len(df)+l) 155 | df.loc[index_val,"new"] = 0 156 | stocks = df.sort_values("new").drop("new",axis=1) 157 | stocks.reset_index(inplace=True,drop=True) 158 | ##### Now calculate the correlation coefficients of other rows with the Top row 159 | try: 160 | cordf = pd.DataFrame(stocks[incl].T.corr().sort_values(0, ascending=False)) 161 | except: 162 | print('Cannot calculate Correlations since Dataframe contains string values or objects.') 163 | return 164 | try: 165 | cordf = stocks[column_name].join(cordf) 166 | except: 167 | cordf = pd.concat((stocks[column_name], cordf), axis=1) 168 | #### Visualizing the top 5 or 10 or whatever cut-off they have given for Corr Coeff 169 | if top >= 1: 170 | top10index = cordf.sort_values(0, ascending=False).iloc[:top, :3].index 171 | top10names = cordf.sort_values(0, ascending=False).iloc[:top, :3][column_name] 172 | top10values = cordf.sort_values(0, ascending=False)[0].values[:top] 173 | else: 174 | top10index = cordf.sort_values(0, ascending=False)[ 175 | cordf.sort_values(0, ascending=False)[0].values >= top].index 176 | top10names = 
cordf.sort_values(0, ascending=False)[cordf.sort_values( 177 | 0, ascending=False)[0].values >= top][column_name] 178 | top10alues = cordf.sort_values(0, ascending=False)[cordf.sort_values( 179 | 0, ascending=False)[0].values >= top][0] 180 | print(top10names, top10values) 181 | #### Now plot the top rows that are highly correlated based on condition above 182 | stocksloc = stocks.iloc[top10index] 183 | #### Visualizing using Matplotlib ### 184 | stocksloc = stocksloc.T 185 | stocksloc = stocksloc.reset_index(drop=True) 186 | stocksloc.columns = stocksloc.iloc[0].values.tolist() 187 | stocksloc.drop(0).plot(subplots=True, figsize=(15, 10), legend=False, 188 | title="Top %s Correlations to %s" % (top, searchstring)) 189 | [ax.legend(loc=1) for ax in plt.gcf().axes] 190 | plt.tight_layout() 191 | plt.show(block=False) 192 | 193 | ################################################################################ 194 | def pretty_print_table(dfo): 195 | from io import StringIO 196 | import prettytable 197 | output = StringIO() 198 | dfo.to_csv(output) 199 | output.seek(0) 200 | pt = prettytable.from_csv(output) 201 | print(pt) 202 | 203 | import copy 204 | def test_stationarity(time_df, maxlag=31, regression='c', autolag='BIC', 205 | window=None, plot=False, verbose=False, var_only=False): 206 | """ 207 | Check unit root stationarity of a time series array or an entire dataframe. 208 | Note that you must send in a dataframe as df.values.ravel() - otherwise ERROR. 209 | Null hypothesis: the series is non-stationary. 210 | If p >= alpha, the series is non-stationary. 211 | If p < alpha, reject the null hypothesis (has unit root stationarity). 212 | Original source: http://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/ 213 | Function: http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.stattools.adfuller.html 214 | window argument is only required for plotting rolling functions. Default=4. 215 | """ 216 | time_df = copy.deepcopy(time_df) 217 | if len(time_df) <= int(2.5*maxlag): 218 | maxlag = 5 219 | print('setting maxlag to a low number = %s' %maxlag) 220 | # set defaults (from function page) 221 | if type(time_df) == pd.DataFrame: 222 | #print('modifying time series dataframe into an array to test') 223 | timeseries = time_df.values.ravel() 224 | else: 225 | timeseries = copy.deepcopy(time_df) 226 | if regression is None: 227 | regression = 'c' 228 | if verbose: 229 | print('\nRunning Augmented Dickey-Fuller test with paramters:') 230 | print(' maxlag: {}'.format(maxlag),'regression: {}'.format(regression),'autolag: {}'.format(autolag)) 231 | alpha = 0.05 232 | if plot: 233 | try: 234 | if window is None: 235 | window = 4 236 | # Determing rolling statistics 237 | rolmean = timeseries.rolling(window=window, center=False).mean() 238 | rolstd = timeseries.rolling(window=window, center=False).std() 239 | # Plot rolling statistics: 240 | orig = plt.plot(timeseries, color='blue', label='Original') 241 | mean = plt.plot(rolmean, color='red', label='Rolling Mean ({})'.format(window)) 242 | std = plt.plot(rolstd, color='black', label='Rolling Std ({})'.format(window)) 243 | plt.legend(loc='best') 244 | plt.title('Rolling Mean & Standard Deviation') 245 | plt.show(block=False) 246 | except: 247 | print('Data must have date-time as index to plot!') 248 | return 249 | # Perform Augmented Dickey-Fuller test: 250 | if var_only: 251 | ### In VAR models, check all_vars for stationarity 252 | ### if it is 1, then all vars are stationary. 
If not difference it once and try again! 253 | ### Use Statsmodels for tests ########### 254 | diff_limit = 0 255 | for i in range(3): 256 | stationary_test = check_each_var_for_stationarity(time_df, autolag, verbose) 257 | if stationary_test: 258 | if i == 0: 259 | print('Data is already stationary') 260 | diff_limit = 0 261 | break 262 | elif i == 1: 263 | print('Data is stationary after one differencing') 264 | diff_limit = 1 265 | break 266 | elif i == 2: 267 | diff_limit = 2 268 | print('Data is stationary after two differencing') 269 | break 270 | else: 271 | if i == 2: 272 | print('Alert! Data is not stationary even after two differencing. Continuing...') 273 | diff_limit = 0 274 | break 275 | else: 276 | time_df = time_df.diff(1).dropna() 277 | continue 278 | return diff_limit 279 | else: 280 | ### In non-VAR models you need to test only the target variable for stationarity ## 281 | dftest = smt.adfuller(timeseries, maxlag=maxlag, regression=regression, autolag=autolag) 282 | dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 283 | 'p-value', 284 | '#Lags Used', 285 | 'Number of Observations Used', 286 | ],name='Dickey-Fuller Augmented Test') 287 | for key, value in dftest[4].items(): 288 | dfoutput['Critical Value (%s)' % key] = value 289 | if verbose: 290 | print('Results of Augmented Dickey-Fuller Test:') 291 | pretty_print_table(dfoutput) 292 | if dftest[1] >= alpha: 293 | print(' this series is non-stationary. Trying test again after differencing...') 294 | timeseries = pd.Series(timeseries).diff(1).dropna().values 295 | dftest = smt.adfuller(timeseries, maxlag=maxlag, regression=regression, autolag=autolag) 296 | dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 297 | 'p-value', 298 | '#Lags Used', 299 | 'Number of Observations Used', 300 | ],name='Dickey-Fuller Augmented Test') 301 | for key, value in dftest[4].items(): 302 | dfoutput['Critical Value (%s)' % key] = value 303 | if verbose: 304 | print('After differencing=1, results of Augmented Dickey-Fuller Test:') 305 | pretty_print_table(dfoutput) 306 | if dftest[1] >= alpha: 307 | print(colorful.BOLD +'this series is NOT stationary' + colorful.END) 308 | return False 309 | else: 310 | print(colorful.BOLD +'this series is stationary' + colorful.END) 311 | return True 312 | else: 313 | print(colorful.BOLD +'this series is stationary' + colorful.END) 314 | return True 315 | ################################################################################ 316 | def adjust(val, length= 6): 317 | return str(val).ljust(length) 318 | def check_each_var_for_stationarity(time_df, autolag, verbose=0): 319 | alpha = 0.05 320 | all_vars = 1 321 | copy_cols = time_df.columns.tolist() 322 | for each_var in copy_cols: 323 | timeseries = time_df[each_var].values 324 | dftest = smt.adfuller(timeseries, autolag=autolag) 325 | if verbose >= 2: 326 | ############################ Print Summary ##################### 327 | output = {'test_statistic':round(dftest[0], 4), 'pvalue':round(dftest[1], 4), 'n_lags':round(dftest[2], 4), 'n_obs':dftest[3]} 328 | p_value = output['pvalue'] 329 | print(f' Augmented Dickey-Fuller Test on "{each_var}"', "\n ", '-'*47) 330 | print(f' Null Hypothesis: Data has unit root. Non-Stationary.') 331 | print(f' Significance Level = {alpha}') 332 | print(f' Test Statistic = {output["test_statistic"]}') 333 | print(f' No. 
Lags Chosen = {output["n_lags"]}') 334 | 335 | for key,val in dftest[4].items(): 336 | print(f' Critical value {adjust(key)} = {round(val, 3)}') 337 | 338 | if p_value <= alpha: 339 | print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.") 340 | print(f" => Series is Stationary.") 341 | else: 342 | print(f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis.") 343 | print(f" => Series is Non-Stationary.") 344 | #################################################################### 345 | if dftest[1] < alpha: 346 | all_vars = 1*all_vars 347 | else: 348 | all_vars = 0*all_vars 349 | return all_vars 350 | ################################################################################## -------------------------------------------------------------------------------- /auto_ts/utils/logging.py: -------------------------------------------------------------------------------- 1 | 2 | """Utilities that affect logging 3 | """ 4 | 5 | import os 6 | 7 | # https://github.com/facebook/prophet/issues/223#issuecomment-326455744 8 | class SuppressStdoutStderr(): 9 | ''' 10 | A context manager for doing a "deep suppression" of stdout and stderr in 11 | Python, i.e. will suppress all print, even if the print originates in a 12 | compiled C/Fortran sub-function. 13 | This will not suppress raised exceptions, since exceptions are printed 14 | to stderr just before a script exits, and after the context manager has 15 | exited (at least, I think that is why it lets exceptions through). 16 | 17 | ''' 18 | def __init__(self): 19 | # Open a pair of null files 20 | self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] 21 | # Save the actual stdout (1) and stderr (2) file descriptors. 22 | self.save_fds = [os.dup(1), os.dup(2)] 23 | 24 | def __enter__(self): 25 | # Assign the null pointers to stdout and stderr. 26 | os.dup2(self.null_fds[0], 1) 27 | os.dup2(self.null_fds[1], 2) 28 | 29 | def __exit__(self, *_): 30 | # Re-assign the real stdout/stderr back to (1) and (2) 31 | os.dup2(self.save_fds[0], 1) 32 | os.dup2(self.save_fds[1], 2) 33 | # Close the null files 34 | for fd in self.null_fds + self.save_fds: 35 | os.close(fd) 36 | -------------------------------------------------------------------------------- /auto_ts/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np # type: ignore 2 | import pandas as pd # type: ignore 3 | import matplotlib.pyplot as plt # type: ignore 4 | from sklearn.metrics import mean_absolute_error, mean_squared_error # type: ignore 5 | import matplotlib.pyplot as plt # type: ignore 6 | import numpy as np # type: ignore 7 | import pandas as pd # type: ignore 8 | from sklearn.metrics import mean_absolute_error, mean_squared_error # type: ignore 9 | 10 | 11 | def print_static_rmse(actual: np.array, predicted: np.array, start_from: int=0, verbose: int=0): 12 | """ 13 | this calculates the ratio of the rmse error to the standard deviation of the actuals. 14 | This ratio should be below 1 for a model to be considered useful. 15 | The comparison starts from the row indicated in the "start_from" variable. 
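    For example (illustrative numbers only):
        rmse, ratio = print_static_rmse(np.array([10., 12., 14., 16.]), np.array([11., 12., 13., 17.]), verbose=1)
        # ratio = rmse / standard deviation of the actuals; values below 1 suggest the
        # forecast errors are smaller than the natural variation in the series.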
16 | """ 17 | rmse = np.sqrt(mean_squared_error(actual[start_from:], predicted[start_from:])) 18 | std_dev = actual[start_from:].std() 19 | if verbose >= 1: 20 | print(' RMSE = %0.2f' %rmse) 21 | print(' Std Deviation of Actuals = %0.2f' %(std_dev)) 22 | print(' Normalized RMSE (as pct of std dev) = %0.1f%%' %(rmse*100/std_dev)) 23 | return rmse, rmse/std_dev 24 | 25 | 26 | def print_dynamic_rmse(actuals: np.array, predicted: np.array, original: np.array, toprint: bool = True): 27 | """ 28 | This utility calculates rmse between actuals and predicted. However, it does one more. 29 | Since in dynamic forecast, we need the longer original, it calculates Normalized RMSE 30 | using the original array's std deviation. That way, the forecast of 2 values does not 31 | result in a larger Normalized RMSE since the std deviation of 2 values will be v small. 32 | """ 33 | rmse = np.sqrt(np.mean((actuals - predicted)**2)) 34 | norm_rmse = rmse/original.std() 35 | if toprint: 36 | print(' RMSE = {:,.2f}'.format(rmse)) 37 | print(' Std Deviation of actuals = {:,.2f}'.format(actuals.std())) 38 | print(' Normalized RMSE (as pct of std dev) = %0.0f%%' %(100*norm_rmse)) 39 | return rmse, norm_rmse 40 | 41 | 42 | def print_normalized_rmse(actuals: np.array, predicted: np.array, start_from: int=0): 43 | """ 44 | This utility calculates rmse between actuals and predicted. However, it does one more. 45 | If the original is given, it calculates Normalized RMSE using the original array's std deviation. 46 | """ 47 | actuals = actuals[start_from:] 48 | predicted = predicted[start_from:] 49 | rmse = np.sqrt(np.mean(mean_squared_error(actuals,predicted))) 50 | norm_rmse = rmse/actuals.std() 51 | print('RMSE = {:,.2f}'.format(rmse)) 52 | print('Std Deviation of Actuals = {:,.2f}'.format(actuals.std())) 53 | print('Normalized RMSE = %0.0f%%' %(100*norm_rmse)) 54 | return rmse, norm_rmse 55 | 56 | 57 | def print_rmse(y: np.array, y_hat: np.array): 58 | """ 59 | Calculating Root Mean Square Error https://en.wikipedia.org/wiki/Root-mean-square_deviation 60 | """ 61 | mse = np.mean((y - y_hat)**2) 62 | return np.sqrt(mse) 63 | 64 | 65 | def print_mape(y: np.array, y_hat: np.array): 66 | """ 67 | Calculating Mean Absolute Percent Error https://en.wikipedia.org/wiki/Mean_absolute_percentage_error 68 | """ 69 | try: 70 | perc_err = (100*(y - y_hat))/y 71 | return np.mean(abs(perc_err)) 72 | except: 73 | return np.nan 74 | 75 | 76 | def print_ts_model_stats(actuals: np.array, predicted: np.array, title="Model"): 77 | """ 78 | This program prints and returns MAE, RMSE, MAPE. 79 | If you like the MAE and RMSE as a percentage of something, just give that number 80 | in the input as "number_as_percentage" and it will return the MAE and RMSE as a 81 | ratio of that number. 
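    A minimal usage sketch (illustrative names):
        rmse, rmse_pct_of_std = print_ts_model_stats(val_actuals, val_forecasts, title='VAR')
        # the second value is the RMSE expressed as a percentage of the standard deviation of the actuals.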
Returns MAE, MAE_as_percentage, and RMSE_as_percentage 82 | """ 83 | try: 84 | number_as_percentage = actuals.std() 85 | if (predicted.index == actuals.index).all(): 86 | dfplot = pd.DataFrame(actuals).join(pd.DataFrame(predicted)) 87 | else: 88 | dfplot = pd.DataFrame([actuals.values, predicted.values]).T 89 | dfplot.columns = ['Actual','Forecast'] 90 | dfplot = dfplot.sort_index() 91 | plt.figure(figsize=(15,8)) 92 | plt.plot(dfplot) 93 | plt.legend(['original','predicted']) 94 | plt.title('%s: Actual vs Forecast in expanding (training) Window Cross Validation' %title, fontsize=20) 95 | except: 96 | pass 97 | print('\n-------------------------------------------') 98 | print('Model Cross Validation Results:') 99 | print('-------------------------------------------') 100 | mae = mean_absolute_error(actuals, predicted) 101 | mse = mean_squared_error(actuals, predicted) 102 | print(' MAE (Mean Absolute Error = %0.2f' %mae) 103 | rmse = np.sqrt(mean_squared_error(actuals,predicted)) 104 | print(' MSE (Mean Squared Error = %0.2f' %mse) 105 | mape = print_mape(actuals, predicted) 106 | print(" MAPE (Mean Absolute Percent Error) = %0.0f%%" %(mape)) 107 | print(" RMSE (Root Mean Squared Error) = %0.04f" %(rmse)) 108 | # Normalized RMSE print('RMSE = {:,.Of}'.format(rmse)) 109 | print(' Normalized RMSE (MinMax) = %0.0f%%' %(100*rmse/abs(actuals.max()-actuals.min()))) 110 | rmse_asp = (np.sqrt(mean_squared_error(actuals,predicted))/number_as_percentage)*100 111 | print(' Normalized RMSE (as Std Dev of Actuals)= %0.0f%%' %rmse_asp) 112 | return rmse, rmse_asp 113 | -------------------------------------------------------------------------------- /auto_ts/utils/my_encoders.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.preprocessing import OneHotEncoder 6 | from sklearn.base import BaseEstimator #gives fit_transform method for free 7 | import pdb 8 | from sklearn.base import TransformerMixin 9 | from collections import defaultdict 10 | #################################################################################################### 11 | class My_LabelEncoder(BaseEstimator, TransformerMixin): 12 | """ 13 | ################################################################################################ 14 | ###### The My_LabelEncoder class was developed by Ram Seshadri for AutoViML ######### 15 | ###### The My_LabelEncoder class works just like sklearn's Label Encoder but better! ####### 16 | ##### It label encodes any cat var in your dataset. It also handles NaN's in your dataset! #### 17 | ## The beauty of this function is that it takes care of NaN's and unknown (future) values.##### 18 | ##################### This is the BEST working version - don't mess with it!! ################## 19 | ################################################################################################ 20 | Usage: 21 | le = My_LabelEncoder() 22 | le.fit_transform(train[column]) ## this will give your transformed values as an array 23 | le.transform(test[column]) ### this will give your transformed values as an array 24 | 25 | Usage in Column Transformers and Pipelines: 26 | No. It cannot be used in pipelines since it need to produce two columns for the next stage in pipeline. 27 | See my other module called My_LabelEncoder_Pipe() to see how it can be used in Pipelines. 
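    Illustrative fitted state (labels are hypothetical):
        le = My_LabelEncoder()
        le.fit(pd.Series(['NY', 'SF', np.nan, 'NY']))
        # le.transformer then holds {nan: -1, 'NY': 0, 'SF': 1} and le.inverse_transformer the
        # reverse mapping; labels first seen at transform() time are added with a new integer code.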
28 | """ 29 | def __init__(self): 30 | self.transformer = defaultdict(str) 31 | self.inverse_transformer = defaultdict(str) 32 | self.max_val = 0 33 | 34 | def fit(self,testx, y=None): 35 | if isinstance(testx, pd.Series): 36 | pass 37 | elif isinstance(testx, np.ndarray): 38 | testx = pd.Series(testx) 39 | else: 40 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ### 41 | ### But if it is a one-dimensional dataframe, convert it into a Series 42 | if testx.shape[1] == 1: 43 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0]) 44 | else: 45 | #### Since it is multi-dimensional, So in this case, just return the data as is 46 | return self 47 | ins = np.unique(testx.factorize()[1]).tolist() 48 | outs = np.unique(testx.factorize()[0]).tolist() 49 | #ins = testx.value_counts(dropna=False).index 50 | if -1 in outs: 51 | # it already has nan if -1 is in outs. No need to add it. 52 | if not np.nan in ins: 53 | ins.insert(0,np.nan) 54 | self.transformer = dict(zip(ins,outs)) 55 | self.inverse_transformer = dict(zip(outs,ins)) 56 | return self 57 | 58 | def transform(self, testx, y=None): 59 | if isinstance(testx, pd.Series): 60 | pass 61 | elif isinstance(testx, np.ndarray): 62 | testx = pd.Series(testx) 63 | else: 64 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ### 65 | ### But if it is a one-dimensional dataframe, convert it into a Series 66 | if testx.shape[1] == 1: 67 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0]) 68 | else: 69 | #### Since it is multi-dimensional, So in this case, just return the data as is 70 | return testx, y 71 | ### now convert the input to transformer dictionary values 72 | new_ins = np.unique(testx.factorize()[1]).tolist() 73 | missing = [x for x in new_ins if x not in self.transformer.keys()] 74 | if len(missing) > 0: 75 | for each_missing in missing: 76 | self.transformer[each_missing] = int(self.max_val + 1) 77 | self.inverse_transformer[int(self.max_val+1)] = each_missing 78 | self.max_val = int(self.max_val+1) 79 | else: 80 | self.max_val = np.max(list(self.transformer.values())) 81 | outs = testx.map(self.transformer).values.astype(int) 82 | ### To handle category dtype you must do the next step ##### 83 | testk = testx.map(self.transformer) ## this must be still a pd.Series 84 | if testx.dtype not in [np.int16, np.int32, np.int64, float, bool, object]: 85 | if testx.isnull().sum().sum() > 0: 86 | fillval = self.transformer[np.nan] 87 | testk = testk.cat.add_categories([fillval]) 88 | testk = testk.fillna(fillval) 89 | testk = testk.astype(int) 90 | return testk, y 91 | else: 92 | testk = testk.astype(int) 93 | return testk, y 94 | else: 95 | return outs 96 | 97 | def inverse_transform(self, testx, y=None): 98 | ### now convert the input to transformer dictionary values 99 | if isinstance(testx, pd.Series): 100 | outs = testx.map(self.inverse_transformer).values 101 | elif isinstance(testx, np.ndarray): 102 | outs = pd.Series(testx).map(self.inverse_transformer).values 103 | else: 104 | outs = testx[:] 105 | return outs 106 | ################################################################################# 107 | class My_LabelEncoder_Pipe(BaseEstimator, TransformerMixin): 108 | """ 109 | ################################################################################################ 110 | ###### The My_LabelEncoder_Pipe class was developed by Ram Seshadri for Auto_TS ##### 111 | ###### The My_LabelEncoder_Pipe class works just like 
sklearn's Label Encoder but better! ##### 112 | ##### It label encodes any cat var in your dataset. But it can also be used in Pipelines! ##### 113 | ## The beauty of this function is that it takes care of NaN's and unknown (future) values.##### 114 | ##### Since it produces an unused second column it can be used in sklearn's Pipelines. ##### 115 | ##### But for that you need to add a drop_second_col() function to this My_LabelEncoder_Pipe ## 116 | ##### and then feed the whole pipeline to a Column_Transformer function. It is very easy. ##### 117 | ##################### This is the BEST working version - don't mess with it!! ################## 118 | ################################################################################################ 119 | Usage in pipelines: 120 | le = My_LabelEncoder_Pipe() 121 | le.fit_transform(train[column]) ## this will give you two columns - beware! 122 | le.transform(test[column]) ### this will give you two columns - beware! 123 | 124 | Usage in Column Transformers: 125 | def drop_second_col(Xt): 126 | ### This deletes the 2nd column. Hence col number=1 and axis=1 ### 127 | return np.delete(Xt, 1, 1) 128 | 129 | drop_second_col_func = FunctionTransformer(drop_second_col) 130 | 131 | le_one = make_pipeline(le, drop_second_col_func) 132 | 133 | ct = make_column_transformer( 134 | (le_one, catvars[0]), 135 | (le_one, catvars[1]), 136 | (imp, numvars), 137 | remainder=remainder) 138 | 139 | """ 140 | def __init__(self): 141 | self.transformer = defaultdict(str) 142 | self.inverse_transformer = defaultdict(str) 143 | self.max_val = 0 144 | 145 | def fit(self,testx, y=None): 146 | if isinstance(testx, pd.Series): 147 | pass 148 | elif isinstance(testx, np.ndarray): 149 | testx = pd.Series(testx) 150 | else: 151 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ### 152 | ### But if it is a one-dimensional dataframe, convert it into a Series 153 | if testx.shape[1] == 1: 154 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0]) 155 | else: 156 | #### Since it is multi-dimensional, So in this case, just return the data as is 157 | return self 158 | ins = np.unique(testx.factorize()[1]).tolist() 159 | outs = np.unique(testx.factorize()[0]).tolist() 160 | #ins = testx.value_counts(dropna=False).index 161 | if -1 in outs: 162 | # it already has nan if -1 is in outs. No need to add it. 
163 | if not np.nan in ins: 164 | ins.insert(0,np.nan) 165 | self.transformer = dict(zip(ins,outs)) 166 | self.inverse_transformer = dict(zip(outs,ins)) 167 | return self 168 | 169 | def transform(self, testx, y=None): 170 | if isinstance(testx, pd.Series): 171 | pass 172 | elif isinstance(testx, np.ndarray): 173 | testx = pd.Series(testx) 174 | else: 175 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ### 176 | ### But if it is a one-dimensional dataframe, convert it into a Series 177 | if testx.shape[1] == 1: 178 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0]) 179 | else: 180 | #### Since it is multi-dimensional, So in this case, just return the data as is 181 | return testx, y 182 | ### now convert the input to transformer dictionary values 183 | new_ins = np.unique(testx.factorize()[1]).tolist() 184 | missing = [x for x in new_ins if x not in self.transformer.keys()] 185 | if len(missing) > 0: 186 | for each_missing in missing: 187 | self.transformer[each_missing] = int(self.max_val + 1) 188 | self.inverse_transformer[int(self.max_val+1)] = each_missing 189 | self.max_val = int(self.max_val+1) 190 | else: 191 | self.max_val = np.max(list(self.transformer.values())) 192 | outs = testx.map(self.transformer).values 193 | testk = testx.map(self.transformer) 194 | if testx.dtype not in [np.int16, np.int32, np.int64, float, bool, object]: 195 | if testx.isnull().sum().sum() > 0: 196 | fillval = self.transformer[np.nan] 197 | testk = testk.cat.add_categories([fillval]) 198 | testk = testk.fillna(fillval) 199 | testk = testk.astype(int) 200 | return testk, y 201 | else: 202 | testk = testk.astype(int) 203 | return testk, y 204 | else: 205 | return np.c_[outs,np.zeros(shape=outs.shape)].astype(int) 206 | 207 | def inverse_transform(self, testx, y=None): 208 | ### now convert the input to transformer dictionary values 209 | if isinstance(testx, pd.Series): 210 | outs = testx.map(self.inverse_transformer).values 211 | elif isinstance(testx, np.ndarray): 212 | outs = pd.Series(testx).map(self.inverse_transformer).values 213 | else: 214 | outs = testx[:] 215 | return outs 216 | ################################################################################# 217 | -------------------------------------------------------------------------------- /auto_ts/utils/val.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt # type: ignore 2 | import numpy as np # type: ignore 3 | import pandas as pd # type: ignore 4 | import seaborn as sns # type: ignore 5 | 6 | # This gives an error when running from a python script. 7 | # Maybe, this should be set in the jupyter notebook directly. 8 | # get_ipython().magic('matplotlib inline') 9 | sns.set(style="white", color_codes=True) 10 | 11 | from sklearn.model_selection import TimeSeriesSplit # type: ignore 12 | from sklearn.model_selection import GridSearchCV # type: ignore 13 | 14 | ######################################################### 15 | def cross_validation_time_series(model, df, preds, target,n_times=10,verbose=0): 16 | """ 17 | This splits a time series data frame "n" times as specified in the input (default=10) 18 | Initially it will start with a certain number of rows in train but it will gradually 19 | increase train size in steps (which it will calculate automatically) while the 20 | number of test rows will remain the same (though their content will vary). 
21 | This utility is based on sklearn's TimeSeriesSplit() 22 | """ 23 | if n_times > 10: 24 | print('More than 10 splits is not recommended. Setting n_times to 10') 25 | n_times = 10 26 | splits = TimeSeriesSplit(n_splits=n_times) 27 | index = 0 28 | X = df[preds].values 29 | y = df[target].values 30 | non_df = {} 31 | rmse_list = [] 32 | for train_index, test_index in splits.split(X): 33 | X_train = X[train_index] 34 | y_train = y[train_index] 35 | X_test = X[test_index] 36 | y_test = y[test_index] 37 | if verbose == 1: 38 | print('Iteration %d: Total Observations = %d' %(index,len(X_train)+len(X_test))) 39 | print(' Training Index %d Observations: %s' %(len(train_index),train_index)) 40 | print(' Testing Index %d Observations: %s' %(len(test_index),test_index)) 41 | model.fit(X_train, y_train) 42 | # TODO: Check print_rmse is not defined or loaded 43 | rmse = print_rmse(y_test, model.predict(X_test)) 44 | rmse_list.append(rmse) 45 | norm_rmse = rmse/y_test.std() 46 | print(' Split %d: Normalized RMSE = %0.2f' %(index, norm_rmse)) 47 | non_df[index] = norm_rmse 48 | index += 1 49 | non_df = pd.Series(non_df) 50 | non_df.plot() 51 | ave_norm_rmse = np.mean(rmse_list)/y.std() 52 | print('Normalized RMSE over entire data after %d splits = %0.2f' %(index,ave_norm_rmse)) 53 | return ave_norm_rmse 54 | ########################################################## 55 | def rolling_validation_time_series(model, df, preds, target,train_size=0, 56 | test_size=0, verbose=0): 57 | """ 58 | This utility uses a Walk Forward or Rolling Period time series cross validation method. 59 | Initially it will start with a minimum number of observations to train the model. 60 | It then gradually increases the train size in steps (which it will calculate automatically) 61 | while keeping the number of test rows the same (though their content will vary). 62 | Once the train+test series exceeds the number of rows in the data set, it stops. 63 | It does not use sklearn's TimeSeriesSplit. You need to provide the initial sizes 64 | of train and test and it will take care of the rest.
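    As a rough illustration (assuming 100 rows and the default sizes, which this function computes itself):
    train_size starts at 50 and the split boundaries work out to roughly [50, 62, 74, 86, 99], so the
    four folds are approximately:
        fold 1: train rows 0-49,  test rows 50-61
        fold 2: train rows 0-61,  test rows 62-73
        fold 3: train rows 0-73,  test rows 74-85
        fold 4: train rows 0-85,  test rows 86-99  (the last fold is stretched to cover the end of the data)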
65 | """ 66 | df = df[:] 67 | index = 0 68 | X = df[preds].values 69 | y = df[target].values 70 | non_df = {} 71 | # rmse_list = [] # # TODO: Unused (check) 72 | if train_size == 0: 73 | train_size = int(np.ceil(len(y)/2)) 74 | if test_size == 0: 75 | test_size = int(np.ceil(len(y)/4)) 76 | # step_size = int(np.ceil(test_size/10)) # TODO: Unused (check) 77 | n_records = len(X) 78 | ### This contains the start point of test size for each K-Fold in time series 79 | test_list = np.floor(np.linspace(train_size,n_records-1,5)).tolist() 80 | for i in range(4): 81 | train_size = int(test_list[i]) 82 | test_size = int(test_list[i+1] - test_list[i]) 83 | X_train, X_test = X[:train_size],X[train_size:train_size+test_size] 84 | y_train, y_test = y[:train_size],y[train_size:train_size+test_size] 85 | model.fit(X_train, y_train) 86 | if i == 0: 87 | ### Since both start and end points are included, you have to subtract 1 from index in this 88 | df.loc[:train_size-1,'predictions'] = y[:train_size] 89 | df.loc[train_size:train_size+test_size-1,'predictions'] = model.predict(X_test) 90 | elif i == 3: 91 | test_size = int(len(X) - train_size) 92 | X_train, X_test = X[:train_size],X[train_size:train_size+test_size] 93 | y_train, y_test = y[:train_size],y[train_size:train_size+test_size] 94 | df.loc[train_size:train_size+test_size,'predictions'] = model.predict(X_test) 95 | else: 96 | df.loc[train_size:train_size+test_size-1,'predictions'] = model.predict(X_test) 97 | if len(y_train) + len(y_test) >= df.shape[0]: 98 | if verbose: 99 | print('Iteration %d: Observations:%d' %(index+1,len(X_train)+len(X_test))) 100 | print(' Train Size=%d, Test Size=%d' %(len(y_train),len(y_test))) 101 | # TODO: Check print_rmse is not defined or loaded 102 | rmse = print_rmse(y_test, model.predict(X_test)) 103 | norm_rmse = rmse/y_test.std() 104 | non_df[i] = rmse 105 | if verbose: 106 | print('Normalized RMSE = %0.2f' %norm_rmse) 107 | non_df = pd.Series(non_df) 108 | weighted_ave_rmse = np.average(non_df.values,weights=non_df.index,axis=0) 109 | print('\nWeighted Average of RMSE (%d iterations) = %0.2f\n Normalized Wtd Aver. RMSE (using std dev) = %0.2f' 110 | %(index+1, weighted_ave_rmse,weighted_ave_rmse/y[:].std())) 111 | ############################# 112 | if verbose == 1 or verbose == 2: 113 | fig, ax1 = plt.subplots(nrows=1,ncols=1,figsize=(12,8)) 114 | ax1.plot(df[target],label='In-Sample Data', linestyle='-') 115 | ax1.plot(df['predictions'],'g',alpha=0.6,label='Rolling Forecast') 116 | ax1.set_xlabel('Time') 117 | ax1.set_ylabel('Values') 118 | ax1.legend(loc='best') 119 | return weighted_ave_rmse, weighted_ave_rmse/y[:].std(), df 120 | else: 121 | if verbose: 122 | print('Iteration %d: Observations:%d' %(index+1,len(X_train)+len(X_test))) 123 | print(' Train Size=%d, Test Size=%d' %(len(y_train),len(y_test))) 124 | # TODO: Check print_rmse is not defined or loaded 125 | rmse = print_rmse(y_test, model.predict(X_test)) 126 | norm_rmse = rmse/y_test.std() 127 | non_df[i] = rmse 128 | if verbose: 129 | print('Normalized RMSE = %0.2f' %norm_rmse) 130 | index += 1 131 | 132 | 133 | ################################################### 134 | # Re-run the above statistical tests, and more. To be used when selecting viable models. 135 | def ts_model_validation(model_results): 136 | """ 137 | Once you have built a time series model, this utility attempts to validate it. 138 | This is only done on SARIMAX models from statsmodels. Don't try it on other models.
The input is model_results which is the variable assigned to the model.fit() method. 140 | """ 141 | het_method='breakvar' 142 | norm_method='jarquebera' 143 | sercor_method='ljungbox' 144 | ######################## 145 | (het_stat, het_p) = model_results.test_heteroskedasticity(het_method)[0] 146 | norm_stat, norm_p, skew, kurtosis = model_results.test_normality(norm_method)[0] 147 | sercor_stat, sercor_p = model_results.test_serial_correlation(method=sercor_method)[0] 148 | sercor_stat = sercor_stat[-1] # last number for the largest lag 149 | sercor_p = sercor_p[-1] # last number for the largest lag 150 | 151 | # Run Durbin-Watson test on the standardized residuals. 152 | # The statistic is approximately equal to 2*(1-r), where r is the sample autocorrelation of the residuals. 153 | # Thus, for r == 0, indicating no serial correlation, the test statistic equals 2. 154 | # This statistic will always be between 0 and 4. The closer to 0 the statistic, 155 | # the more evidence for positive serial correlation. The closer to 4, 156 | # the more evidence for negative serial correlation. 157 | # Essentially, below 1 or above 3 is bad. 158 | 159 | from statsmodels.stats.stattools import durbin_watson # statsmodels is not imported at module level, so import the test locally 160 | dw = durbin_watson(model_results.filter_results.standardized_forecasts_error[0, model_results.loglikelihood_burn:]) 161 | 162 | # check whether roots are outside the unit circle (we want them to be); 163 | # will be True when AR is not used (i.e., AR order = 0) 164 | arroots_outside_unit_circle = np.all(np.abs(model_results.arroots) > 1) 165 | # will be True when MA is not used (i.e., MA order = 0) 166 | maroots_outside_unit_circle = np.all(np.abs(model_results.maroots) > 1) 167 | 168 | print('Test heteroskedasticity of residuals ({}): stat={:.3f}, p={:.3f}'.format(het_method, het_stat, het_p)) 169 | print('\nTest normality of residuals ({}): stat={:.3f}, p={:.3f}'.format(norm_method, norm_stat, norm_p)) 170 | print('\nTest serial correlation of residuals ({}): stat={:.3f}, p={:.3f}'.format(sercor_method, sercor_stat, sercor_p)) 171 | print('\nDurbin-Watson test on residuals: d={:.2f}\n\t(NB: 2 means no serial correlation, 0=pos, 4=neg)'.format(dw)) 172 | print('\nTest for all AR roots outside unit circle (>1): {}'.format(arroots_outside_unit_circle)) 173 | print('\nTest for all MA roots outside unit circle (>1): {}'.format(maroots_outside_unit_circle)) 174 | ############################################################################################################ 175 | def quick_ts_plot(y_true, y_pred, modelname='Prophet'): 176 | fig,ax = plt.subplots(figsize=(15,7)) 177 | labels = ['actual','forecast'] 178 | y_true.plot(ax=ax) 179 | y_pred.plot(ax=ax) 180 | ax.legend(labels) 181 | plt.title('%s: Actual vs Forecast in expanding (training) window Cross Validation' %modelname, fontsize=20) 182 | ############################################################################################## 183 | -------------------------------------------------------------------------------- /cloud_run.txt: -------------------------------------------------------------------------------- 1 | # This workflow will deploy source code on Cloud Run when a commit is pushed to the "master" branch 2 | # 3 | # Overview: 4 | # 5 | # 1. Authenticate to Google Cloud 6 | # 2. Deploy it to Cloud Run 7 | # 8 | # To configure this workflow: 9 | # 10 |
Ensure the required Google Cloud APIs are enabled: 11 | # 12 | # Cloud Run run.googleapis.com 13 | # Cloud Build cloudbuild.googleapis.com 14 | # Artifact Registry artifactregistry.googleapis.com 15 | # 16 | # 2. Create and configure Workload Identity Federation for GitHub (https://github.com/google-github-actions/auth#setting-up-workload-identity-federation) 17 | # 18 | # 3. Ensure the required IAM permissions are granted 19 | # 20 | # Cloud Run 21 | # roles/run.admin 22 | # roles/iam.serviceAccountUser (to act as the Cloud Run runtime service account) 23 | # 24 | # Cloud Build 25 | # roles/cloudbuild.builds.editor 26 | # 27 | # Cloud Storage 28 | # roles/storage.objectAdmin 29 | # 30 | # Artifact Registry 31 | # roles/artifactregistry.admin (project or repository level) 32 | # 33 | # NOTE: You should always follow the principle of least privilege when assigning IAM roles 34 | # 35 | # 4. Create GitHub secrets for WIF_PROVIDER and WIF_SERVICE_ACCOUNT 36 | # 37 | # 5. Change the values for the SERVICE and REGION environment variables (below). 38 | # 39 | # For more support on how to run this workflow, please visit https://github.com/marketplace/actions/deploy-to-cloud-run 40 | # 41 | # Further reading: 42 | # Cloud Run runtime service account - https://cloud.google.com/run/docs/securing/service-identity 43 | # Cloud Run IAM permissions - https://cloud.google.com/run/docs/deploying-source-code#permissions_required_to_deploy 44 | # Cloud Run builds from source - https://cloud.google.com/run/docs/deploying-source-code 45 | # Principle of least privilege - https://cloud.google.com/blog/products/identity-security/dont-get-pwned-practicing-the-principle-of-least-privilege -------------------------------------------------------------------------------- /example_datasets/Sales_and_Marketing.csv: -------------------------------------------------------------------------------- 1 | Time Period,Sales,Marketing Expense 2 | 2011-01-01,397,486.64 3 | 2011-02-01,400,501.8 4 | 2011-03-01,498,437.09 5 | 2011-04-01,536,565.16 6 | 2011-05-01,596,744.15 7 | 2011-06-01,591,548.74 8 | 2011-07-01,651,650.21 9 | 2011-08-01,654,777.51 10 | 2011-09-01,509,547.11 11 | 2011-10-01,437,382.81 12 | 2011-11-01,406,551.56 13 | 2011-12-01,470,401.69 14 | 2012-01-01,428,370.97 15 | 2012-02-01,423,318.39 16 | 2012-03-01,507,477.39 17 | 2012-04-01,536,418.66 18 | 2012-05-01,610,429.68 19 | 2012-06-01,609,713.24 20 | 2012-07-01,687,658.22 21 | 2012-08-01,707,800.52 22 | 2012-09-01,509,640.45 23 | 2012-10-01,452,606.49 24 | 2012-11-01,412,426.88 25 | 2012-12-01,472,513.48 26 | 2013-01-01,454,300.29 27 | 2013-02-01,455,330.84 28 | 2013-03-01,568,444.04 29 | 2013-04-01,610,628.82 30 | 2013-05-01,706,620.36 31 | 2013-06-01,661,682.6 32 | 2013-07-01,767,684.64 33 | 2013-08-01,783,748.47 34 | 2013-09-01,583,668.46 35 | 2013-10-01,513,499.31 36 | 2013-11-01,481,401.92 37 | 2013-12-01,567,605.06 38 | 2014-01-01,525,429.73 39 | 2014-02-01,520,602.86 40 | 2014-03-01,587,596.15 41 | 2014-04-01,710,619.39 42 | 2014-05-01,793,758.31 43 | 2014-06-01,749,980.16 44 | 2014-07-01,871,905.1 45 | 2014-08-01,848,784.62 46 | 2014-09-01,640,718.98 47 | 2014-10-01,581,570.3 48 | 2014-11-01,519,527.6 49 | 2014-12-01,605,559.75 50 | -------------------------------------------------------------------------------- /example_datasets/ts_2.csv: -------------------------------------------------------------------------------- 1 | DATE,UMCSENT 2 | 1978-01-01,83.7 3 | 1978-02-01,84.3 4 | 1978-03-01,78.8 5 | 1978-04-01,81.6 6 | 1978-05-01,82.9 7 | 1978-06-01,80.0 
8 | 1978-07-01,82.4 9 | 1978-08-01,78.4 10 | 1978-09-01,80.4 11 | 1978-10-01,79.3 12 | 1978-11-01,75.0 13 | 1978-12-01,66.1 14 | 1979-01-01,72.1 15 | 1979-02-01,73.9 16 | 1979-03-01,68.4 17 | 1979-04-01,66.0 18 | 1979-05-01,68.1 19 | 1979-06-01,65.8 20 | 1979-07-01,60.4 21 | 1979-08-01,64.5 22 | 1979-09-01,66.7 23 | 1979-10-01,62.1 24 | 1979-11-01,63.3 25 | 1979-12-01,61.0 26 | 1980-01-01,67.0 27 | 1980-02-01,66.9 28 | 1980-03-01,56.5 29 | 1980-04-01,52.7 30 | 1980-05-01,51.7 31 | 1980-06-01,58.7 32 | 1980-07-01,62.3 33 | 1980-08-01,67.3 34 | 1980-09-01,73.7 35 | 1980-10-01,75.0 36 | 1980-11-01,76.7 37 | 1980-12-01,64.5 38 | 1981-01-01,71.4 39 | 1981-02-01,66.9 40 | 1981-03-01,66.5 41 | 1981-04-01,72.4 42 | 1981-05-01,76.3 43 | 1981-06-01,73.1 44 | 1981-07-01,74.1 45 | 1981-08-01,77.2 46 | 1981-09-01,73.1 47 | 1981-10-01,70.3 48 | 1981-11-01,62.5 49 | 1981-12-01,64.3 50 | 1982-01-01,71.0 51 | 1982-02-01,66.5 52 | 1982-03-01,62.0 53 | 1982-04-01,65.5 54 | 1982-05-01,67.5 55 | 1982-06-01,65.7 56 | 1982-07-01,65.4 57 | 1982-08-01,65.4 58 | 1982-09-01,69.3 59 | 1982-10-01,73.4 60 | 1982-11-01,72.1 61 | 1982-12-01,71.9 62 | 1983-01-01,70.4 63 | 1983-02-01,74.6 64 | 1983-03-01,80.8 65 | 1983-04-01,89.1 66 | 1983-05-01,93.3 67 | 1983-06-01,92.2 68 | 1983-07-01,92.8 69 | 1983-08-01,90.9 70 | 1983-09-01,89.9 71 | 1983-10-01,89.3 72 | 1983-11-01,91.1 73 | 1983-12-01,94.2 74 | 1984-01-01,100.1 75 | 1984-02-01,97.4 76 | 1984-03-01,101.0 77 | 1984-04-01,96.1 78 | 1984-05-01,98.1 79 | 1984-06-01,95.5 80 | 1984-07-01,96.6 81 | 1984-08-01,99.1 82 | 1984-09-01,100.9 83 | 1984-10-01,96.3 84 | 1984-11-01,95.7 85 | 1984-12-01,92.9 86 | 1985-01-01,96.0 87 | 1985-02-01,93.7 88 | 1985-03-01,93.7 89 | 1985-04-01,94.6 90 | 1985-05-01,91.8 91 | 1985-06-01,96.5 92 | 1985-07-01,94.0 93 | 1985-08-01,92.4 94 | 1985-09-01,92.1 95 | 1985-10-01,88.4 96 | 1985-11-01,90.9 97 | 1985-12-01,93.9 98 | 1986-01-01,95.6 99 | 1986-02-01,95.9 100 | 1986-03-01,95.1 101 | 1986-04-01,96.2 102 | 1986-05-01,94.8 103 | 1986-06-01,99.3 104 | 1986-07-01,97.7 105 | 1986-08-01,94.9 106 | 1986-09-01,91.9 107 | 1986-10-01,95.6 108 | 1986-11-01,91.4 109 | 1986-12-01,89.1 110 | 1987-01-01,90.4 111 | 1987-02-01,90.2 112 | 1987-03-01,90.8 113 | 1987-04-01,92.8 114 | 1987-05-01,91.1 115 | 1987-06-01,91.5 116 | 1987-07-01,93.7 117 | 1987-08-01,94.4 118 | 1987-09-01,93.6 119 | 1987-10-01,89.3 120 | 1987-11-01,83.1 121 | 1987-12-01,86.8 122 | 1988-01-01,90.8 123 | 1988-02-01,91.6 124 | 1988-03-01,94.6 125 | 1988-04-01,91.2 126 | 1988-05-01,94.8 127 | 1988-06-01,94.7 128 | 1988-07-01,93.4 129 | 1988-08-01,97.4 130 | 1988-09-01,97.3 131 | 1988-10-01,94.1 132 | 1988-11-01,93.0 133 | 1988-12-01,91.9 134 | 1989-01-01,97.9 135 | 1989-02-01,95.4 136 | 1989-03-01,94.3 137 | 1989-04-01,91.5 138 | 1989-05-01,90.7 139 | 1989-06-01,90.6 140 | 1989-07-01,92.0 141 | 1989-08-01,89.6 142 | 1989-09-01,95.8 143 | 1989-10-01,93.9 144 | 1989-11-01,90.9 145 | 1989-12-01,90.5 146 | 1990-01-01,93.0 147 | 1990-02-01,89.5 148 | 1990-03-01,91.3 149 | 1990-04-01,93.9 150 | 1990-05-01,90.6 151 | 1990-06-01,88.3 152 | 1990-07-01,88.2 153 | 1990-08-01,76.4 154 | 1990-09-01,72.8 155 | 1990-10-01,63.9 156 | 1990-11-01,66.0 157 | 1990-12-01,65.5 158 | 1991-01-01,66.8 159 | 1991-02-01,70.4 160 | 1991-03-01,87.7 161 | 1991-04-01,81.8 162 | 1991-05-01,78.3 163 | 1991-06-01,82.1 164 | 1991-07-01,82.9 165 | 1991-08-01,82.0 166 | 1991-09-01,83.0 167 | 1991-10-01,78.3 168 | 1991-11-01,69.1 169 | 1991-12-01,68.2 170 | 1992-01-01,67.5 171 | 1992-02-01,68.8 172 | 1992-03-01,76.0 173 | 
1992-04-01,77.2 174 | 1992-05-01,79.2 175 | 1992-06-01,80.4 176 | 1992-07-01,76.6 177 | 1992-08-01,76.1 178 | 1992-09-01,75.6 179 | 1992-10-01,73.3 180 | 1992-11-01,85.3 181 | 1992-12-01,91.0 182 | 1993-01-01,89.3 183 | 1993-02-01,86.6 184 | 1993-03-01,85.9 185 | 1993-04-01,85.6 186 | 1993-05-01,80.3 187 | 1993-06-01,81.5 188 | 1993-07-01,77.0 189 | 1993-08-01,77.3 190 | 1993-09-01,77.9 191 | 1993-10-01,82.7 192 | 1993-11-01,81.2 193 | 1993-12-01,88.2 194 | 1994-01-01,94.3 195 | 1994-02-01,93.2 196 | 1994-03-01,91.5 197 | 1994-04-01,92.6 198 | 1994-05-01,92.8 199 | 1994-06-01,91.2 200 | 1994-07-01,89.0 201 | 1994-08-01,91.7 202 | 1994-09-01,91.5 203 | 1994-10-01,92.7 204 | 1994-11-01,91.6 205 | 1994-12-01,95.1 206 | 1995-01-01,97.6 207 | 1995-02-01,95.1 208 | 1995-03-01,90.3 209 | 1995-04-01,92.5 210 | 1995-05-01,89.8 211 | 1995-06-01,92.7 212 | 1995-07-01,94.4 213 | 1995-08-01,96.2 214 | 1995-09-01,88.9 215 | 1995-10-01,90.2 216 | 1995-11-01,88.2 217 | 1995-12-01,91.0 218 | 1996-01-01,89.3 219 | 1996-02-01,88.5 220 | 1996-03-01,93.7 221 | 1996-04-01,92.7 222 | 1996-05-01,89.4 223 | 1996-06-01,92.4 224 | 1996-07-01,94.7 225 | 1996-08-01,95.3 226 | 1996-09-01,94.7 227 | 1996-10-01,96.5 228 | 1996-11-01,99.2 229 | 1996-12-01,96.9 230 | 1997-01-01,97.4 231 | 1997-02-01,99.7 232 | 1997-03-01,100.0 233 | 1997-04-01,101.4 234 | 1997-05-01,103.2 235 | 1997-06-01,104.5 236 | 1997-07-01,107.1 237 | 1997-08-01,104.4 238 | 1997-09-01,106.0 239 | 1997-10-01,105.6 240 | 1997-11-01,107.2 241 | 1997-12-01,102.1 242 | 1998-01-01,106.6 243 | 1998-02-01,110.4 244 | 1998-03-01,106.5 245 | 1998-04-01,108.7 246 | 1998-05-01,106.5 247 | 1998-06-01,105.6 248 | 1998-07-01,105.2 249 | 1998-08-01,104.4 250 | 1998-09-01,100.9 251 | 1998-10-01,97.4 252 | 1998-11-01,102.7 253 | 1998-12-01,100.5 254 | 1999-01-01,103.9 255 | 1999-02-01,108.1 256 | 1999-03-01,105.7 257 | 1999-04-01,104.6 258 | 1999-05-01,106.8 259 | 1999-06-01,107.3 260 | 1999-07-01,106.0 261 | 1999-08-01,104.5 262 | 1999-09-01,107.2 263 | 1999-10-01,103.2 264 | 1999-11-01,107.2 265 | 1999-12-01,105.4 266 | 2000-01-01,112.0 267 | 2000-02-01,111.3 268 | 2000-03-01,107.1 269 | 2000-04-01,109.2 270 | 2000-05-01,110.7 271 | 2000-06-01,106.4 272 | 2000-07-01,108.3 273 | 2000-08-01,107.3 274 | 2000-09-01,106.8 275 | 2000-10-01,105.8 276 | 2000-11-01,107.6 277 | 2000-12-01,98.4 278 | 2001-01-01,94.7 279 | 2001-02-01,90.6 280 | 2001-03-01,91.5 281 | 2001-04-01,88.4 282 | 2001-05-01,92.0 283 | 2001-06-01,92.6 284 | 2001-07-01,92.4 285 | 2001-08-01,91.5 286 | 2001-09-01,81.8 287 | 2001-10-01,82.7 288 | 2001-11-01,83.9 289 | 2001-12-01,88.8 290 | 2002-01-01,93.0 291 | 2002-02-01,90.7 292 | 2002-03-01,95.7 293 | 2002-04-01,93.0 294 | 2002-05-01,96.9 295 | 2002-06-01,92.4 296 | 2002-07-01,88.1 297 | 2002-08-01,87.6 298 | 2002-09-01,86.1 299 | 2002-10-01,80.6 300 | 2002-11-01,84.2 301 | 2002-12-01,86.7 302 | 2003-01-01,82.4 303 | 2003-02-01,79.9 304 | 2003-03-01,77.6 305 | 2003-04-01,86.0 306 | 2003-05-01,92.1 307 | 2003-06-01,89.7 308 | 2003-07-01,90.9 309 | 2003-08-01,89.3 310 | 2003-09-01,87.7 311 | 2003-10-01,89.6 312 | 2003-11-01,93.7 313 | 2003-12-01,92.6 314 | 2004-01-01,103.8 315 | 2004-02-01,94.4 316 | 2004-03-01,95.8 317 | 2004-04-01,94.2 318 | 2004-05-01,90.2 319 | 2004-06-01,95.6 320 | 2004-07-01,96.7 321 | 2004-08-01,95.9 322 | 2004-09-01,94.2 323 | 2004-10-01,91.7 324 | 2004-11-01,92.8 325 | 2004-12-01,97.1 326 | 2005-01-01,95.5 327 | 2005-02-01,94.1 328 | 2005-03-01,92.6 329 | 2005-04-01,87.7 330 | 2005-05-01,86.9 331 | 2005-06-01,96.0 332 | 
2005-07-01,96.5 333 | 2005-08-01,89.1 334 | 2005-09-01,76.9 335 | 2005-10-01,74.2 336 | 2005-11-01,81.6 337 | 2005-12-01,91.5 338 | 2006-01-01,91.2 339 | 2006-02-01,86.7 340 | 2006-03-01,88.9 341 | 2006-04-01,87.4 342 | 2006-05-01,79.1 343 | 2006-06-01,84.9 344 | 2006-07-01,84.7 345 | 2006-08-01,82.0 346 | 2006-09-01,85.4 347 | 2006-10-01,93.6 348 | 2006-11-01,92.1 349 | 2006-12-01,91.7 350 | 2007-01-01,96.9 351 | 2007-02-01,91.3 352 | 2007-03-01,88.4 353 | 2007-04-01,87.1 354 | 2007-05-01,88.3 355 | 2007-06-01,85.3 356 | 2007-07-01,90.4 357 | 2007-08-01,83.4 358 | 2007-09-01,83.4 359 | 2007-10-01,80.9 360 | 2007-11-01,76.1 361 | 2007-12-01,75.5 362 | 2008-01-01,78.4 363 | 2008-02-01,70.8 364 | 2008-03-01,69.5 365 | 2008-04-01,62.6 366 | 2008-05-01,59.8 367 | 2008-06-01,56.4 368 | 2008-07-01,61.2 369 | 2008-08-01,63.0 370 | 2008-09-01,70.3 371 | 2008-10-01,57.6 372 | 2008-11-01,55.3 373 | 2008-12-01,60.1 374 | 2009-01-01,61.2 375 | 2009-02-01,56.3 376 | 2009-03-01,57.3 377 | 2009-04-01,65.1 378 | 2009-05-01,68.7 379 | 2009-06-01,70.8 380 | 2009-07-01,66.0 381 | 2009-08-01,65.7 382 | 2009-09-01,73.5 383 | 2009-10-01,70.6 384 | 2009-11-01,67.4 385 | 2009-12-01,72.5 386 | 2010-01-01,74.4 387 | 2010-02-01,73.6 388 | 2010-03-01,73.6 389 | 2010-04-01,72.2 390 | 2010-05-01,73.6 391 | 2010-06-01,76.0 392 | 2010-07-01,67.8 393 | 2010-08-01,68.9 394 | 2010-09-01,68.2 395 | 2010-10-01,67.7 396 | 2010-11-01,71.6 397 | 2010-12-01,74.5 398 | 2011-01-01,74.2 399 | 2011-02-01,77.5 400 | 2011-03-01,67.5 401 | 2011-04-01,69.8 402 | 2011-05-01,74.3 403 | 2011-06-01,71.5 404 | 2011-07-01,63.7 405 | 2011-08-01,55.8 406 | 2011-09-01,59.5 407 | 2011-10-01,60.8 408 | 2011-11-01,63.7 409 | 2011-12-01,69.9 410 | 2012-01-01,75.0 411 | 2012-02-01,75.3 412 | 2012-03-01,76.2 413 | 2012-04-01,76.4 414 | 2012-05-01,79.3 415 | 2012-06-01,73.2 416 | 2012-07-01,72.3 417 | 2012-08-01,74.3 418 | 2012-09-01,78.3 419 | 2012-10-01,82.6 420 | 2012-11-01,82.7 421 | 2012-12-01,72.9 422 | 2013-01-01,73.8 423 | 2013-02-01,77.6 424 | 2013-03-01,78.6 425 | 2013-04-01,76.4 426 | 2013-05-01,84.5 427 | 2013-06-01,84.1 428 | 2013-07-01,85.1 429 | 2013-08-01,82.1 430 | 2013-09-01,77.5 431 | 2013-10-01,73.2 432 | 2013-11-01,75.1 433 | 2013-12-01,82.5 434 | 2014-01-01,81.2 435 | 2014-02-01,81.6 436 | 2014-03-01,80.0 437 | 2014-04-01,84.1 438 | 2014-05-01,81.9 439 | 2014-06-01,82.5 440 | 2014-07-01,81.8 441 | 2014-08-01,82.5 442 | 2014-09-01,84.6 443 | 2014-10-01,86.9 444 | 2014-11-01,88.8 445 | 2014-12-01,93.6 446 | 2015-01-01,98.1 447 | 2015-02-01,95.4 448 | 2015-03-01,93.0 449 | 2015-04-01,95.9 450 | 2015-05-01,90.7 451 | 2015-06-01,96.1 452 | 2015-07-01,93.1 453 | 2015-08-01,91.9 454 | 2015-09-01,87.2 455 | 2015-10-01,90.0 456 | 2015-11-01,91.3 457 | 2015-12-01,92.6 458 | 2016-01-01,92 459 | 2016-02-01,91.7 460 | 2016-03-01,91 461 | 2016-04-01,89 462 | 2016-05-01,94.7 463 | 2016-06-01,93.5 464 | 2016-07-01,90 465 | 2016-08-01,89.8 466 | 2016-09-01,91.2 467 | 2016-10-01,87.2 468 | 2016-11-01,93.8 469 | 2016-12-01,98.2 470 | 2017-01-01,98.5 471 | 2017-02-01,96.3 472 | 2017-03-01,96.9 473 | 2017-04-01,97 474 | 2017-05-01,97.1 475 | 2017-06-01,95 476 | 2017-07-01,93.4 477 | 2017-08-01,96.8 478 | 2017-09-01,95.1 479 | 2017-10-01,100.7 480 | 2017-11-01,98.5 481 | 2017-12-01,95.9 482 | 2018-01-01,95.7 483 | 2018-02-01,99.7 484 | 2018-03-01,101.4 485 | 2018-04-01,98.8 486 | 2018-05-01,98 487 | 2018-06-01,98.2 488 | -------------------------------------------------------------------------------- /images/add_fb_prophet.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/images/add_fb_prophet.png -------------------------------------------------------------------------------- /images/install_auto_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/images/install_auto_ts.png -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/images/logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Library dependencies for the python code. You need to install these with 2 | 3 | # conda create -n python=3.6 (or 3.7) 4 | # source activate 5 | # run requirements 6 | # python -m ipykernel install --user --name --display-name "" 7 | 8 | # `pip install -U -r requirements.txt` before you can run this. 9 | 10 | # Base libraries 11 | numpy 12 | pandas 13 | xlrd 14 | scipy 15 | prettytable 16 | xgboost>=2.0.0 # with GPU support 17 | GPUtil 18 | dask>=2022.2.0 19 | distributed>=2022.2.0 20 | GPUtil>=1.4.0 21 | pyyaml>=5.4.1 22 | 23 | # Viz libs 24 | matplotlib 25 | seaborn 26 | 27 | # Stats libraries 28 | scikit-learn>=0.24.0 29 | statsmodels 30 | 31 | # Auto-Arima 32 | pmdarima 33 | 34 | # Facebook Prophet 35 | prophet 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="auto_ts", 8 | version="0.0.92", 9 | author="Ram Seshadri", 10 | # author_email="author@example.com", 11 | description="Automatically Build Multiple Time Series models fast - now with Facebook Prophet!", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | license='Apache License 2.0', 15 | url="https://github.com/AutoViML/Auto_TS", 16 | packages=setuptools.find_packages(exclude=("auto_ts/test",)), 17 | install_requires=[ 18 | "ipython", 19 | "jupyter", 20 | "pmdarima", 21 | "numpy", 22 | "xlrd", 23 | "pandas", 24 | "matplotlib", 25 | "seaborn", 26 | "prophet", 27 | "scikit-learn>=0.24.0", 28 | "statsmodels", 29 | "xgboost>=2.0", 30 | "prettytable", 31 | "dask>=2022.1.0", 32 | "pyyaml>=5.4.1", 33 | "GPUtil>=1.4.0", 34 | "distributed>=2022.2.0", 35 | ], 36 | classifiers=[ 37 | "Programming Language :: Python :: 3", 38 | "Operating System :: OS Independent", 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /updates.md: -------------------------------------------------------------------------------- 1 |
Latest Updates about Auto_TS library Page: 2 | 3 | January 2024 Update: 4 | We have now added `XGBoost with GPU` support to Auto_TS. Auto_TS will automatically detect if there is a GPU in your Kaggle kernel or your local machine and will run XGBoost with GPU support. Hope this speeds up your computations! 5 | 6 | November 2023 Update: 7 | We have now added `Google Cloud Run` support to Auto_TS. You can simply use the instructions on this page to deploy Auto_TS models on Google Cloud Run. Many thanks to abdulrahman305 for providing a Pull Request to add this functionality to Auto_TS. 8 | 9 | March 2023 Update: 10 | We have now upgraded `FB Prophet` to the latest version, which is simply called `prophet`. 11 | 12 | Aug 2022 Update: 13 | You can now add FB Prophet arguments directly into Auto_TimeSeries using the kwargs argument. See the example below: 14 | 15 | ![fb-prophet](images/add_fb_prophet.png) 16 |
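Since that example is only available as a screenshot, here is a rough text sketch of the same idea. Note: the argument names below (`score_type`, `time_interval`, `model_type`, and the Prophet keyword passed through) are assumptions based on the library's usual usage, not a verbatim copy of the screenshot - check the README and the example notebooks for the exact signature.

```python
# Hypothetical sketch: forward an FB Prophet argument through auto_timeseries' kwargs.
import pandas as pd
from auto_ts import auto_timeseries

# sample dataset shipped with this repo
train_df = pd.read_csv('example_datasets/Sales_and_Marketing.csv')

model = auto_timeseries(
    score_type='rmse',
    time_interval='Month',
    model_type=['Prophet'],
    seasonality_mode='multiplicative',  # example Prophet keyword forwarded via kwargs
)
model.fit(traindata=train_df, ts_column='Time Period', target='Sales')
```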
17 | Jan 2022 Update: 18 | New since version 0.0.35: You can now load your file into a Dask dataframe automatically. Just provide the name of your file and if it is too large to fit into a pandas dataframe, Auto_TS will automatically detect and load it into a Dask dataframe. 19 | 20 | --------------------------------------------------------------------------------