├── .github
│   └── workflows
│       └── google-cloudrun-source.yml
├── .gitignore
├── .mypy.ini
├── .pylintrc
├── CHANGELOG
├── LICENSE
├── MANIFEST.in
├── README.md
├── auto_ts
│   ├── .gitattributes
│   ├── .gitignore
│   ├── __init__.py
│   ├── __version__.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── ar_based
│   │   │   ├── __init__.py
│   │   │   ├── build_arima.py
│   │   │   ├── build_arima_base.py
│   │   │   ├── build_autoarimax.py
│   │   │   ├── build_sarimax.py
│   │   │   ├── build_var.py
│   │   │   └── param_finder.py
│   │   ├── build_base.py
│   │   ├── build_ml.py
│   │   ├── build_prophet.py
│   │   ├── build_pyflux.py
│   │   └── ml_models.py
│   ├── py.typed
│   ├── test
│   │   ├── __init__.py
│   │   ├── test_auto_sarimax.py
│   │   ├── test_auto_ts.py
│   │   └── test_var.py
│   └── utils
│       ├── __init__.py
│       ├── colors.py
│       ├── eda.py
│       ├── etl.py
│       ├── logging.py
│       ├── metrics.py
│       ├── my_encoders.py
│       └── val.py
├── cloud_run.txt
├── example_datasets
│   ├── Sales_and_Marketing.csv
│   └── ts_2.csv
├── example_notebooks
│   ├── Auto_TS_Test_AV_Hack_TS_Rank_600.ipynb
│   ├── autots_multivariate_example.ipynb
│   └── autots_univariate_example.ipynb
├── images
│   ├── add_fb_prophet.png
│   ├── install_auto_ts.png
│   └── logo.png
├── requirements.txt
├── setup.py
└── updates.md
/.github/workflows/google-cloudrun-source.yml:
--------------------------------------------------------------------------------
1 | # This workflow will deploy source code on Cloud Run when a commit is pushed to the "master" branch
2 | #
3 | # Overview:
4 | #
5 | # 1. Authenticate to Google Cloud
6 | # 2. Deploy it to Cloud Run
7 | #
8 | # To configure this workflow:
9 | #
10 | # 1. Ensure the required Google Cloud APIs are enabled:
11 | #
12 | # Cloud Run run.googleapis.com
13 | # Cloud Build cloudbuild.googleapis.com
14 | # Artifact Registry artifactregistry.googleapis.com
15 | #
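#    For example, a sketch of enabling these APIs with the gcloud CLI (assuming the
#    Google Cloud SDK is installed and authenticated against the target project):
#
#      gcloud services enable run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com
#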
16 | # 2. Create and configure Workload Identity Federation for GitHub (https://github.com/google-github-actions/auth#setting-up-workload-identity-federation)
17 | #
18 | # 3. Ensure the required IAM permissions are granted
19 | #
20 | # Cloud Run
21 | # roles/run.admin
22 | # roles/iam.serviceAccountUser (to act as the Cloud Run runtime service account)
23 | #
24 | # Cloud Build
25 | # roles/cloudbuild.builds.editor
26 | #
27 | # Cloud Storage
28 | # roles/storage.objectAdmin
29 | #
30 | # Artifact Registry
31 | # roles/artifactregistry.admin (project or repository level)
32 | #
33 | # NOTE: You should always follow the principle of least privilege when assigning IAM roles
34 | #
35 | # 4. Create GitHub secrets for WIF_PROVIDER and WIF_SERVICE_ACCOUNT
36 | #
37 | # 5. Change the values for the SERVICE and REGION environment variables (below).
38 | #
39 | # For more support on how to run this workflow, please visit https://github.com/marketplace/actions/deploy-to-cloud-run
40 | #
41 | # Further reading:
42 | # Cloud Run runtime service account - https://cloud.google.com/run/docs/securing/service-identity
43 | # Cloud Run IAM permissions - https://cloud.google.com/run/docs/deploying-source-code#permissions_required_to_deploy
44 | # Cloud Run builds from source - https://cloud.google.com/run/docs/deploying-source-code
45 | # Principle of least privilege - https://cloud.google.com/blog/products/identity-security/dont-get-pwned-practicing-the-principle-of-least-privilege
46 |
47 | name: Deploy to Cloud Run from Source
48 |
49 | on:
50 | push:
51 | branches: [ "master" ]
52 |
53 | env:
54 | PROJECT_ID: YOUR_PROJECT_ID # TODO: update Google Cloud project id
55 | SERVICE: YOUR_SERVICE_NAME # TODO: update Cloud Run service name
56 | REGION: YOUR_SERVICE_REGION # TODO: update Cloud Run service region
57 |
58 | jobs:
59 | deploy:
60 | # Add 'id-token' with the intended permissions for workload identity federation
61 | permissions:
62 | contents: 'read'
63 | id-token: 'write'
64 |
65 | runs-on: ubuntu-latest
66 | steps:
67 | - name: Checkout
68 | uses: actions/checkout@v2
69 |
70 | - name: Google Auth
71 | id: auth
72 | uses: 'google-github-actions/auth@v0'
73 | with:
74 | workload_identity_provider: '${{ secrets.WIF_PROVIDER }}' # e.g. - projects/123456789/locations/global/workloadIdentityPools/my-pool/providers/my-provider
75 | service_account: '${{ secrets.WIF_SERVICE_ACCOUNT }}' # e.g. - my-service-account@my-project.iam.gserviceaccount.com
76 |
77 | # NOTE: Alternative option - authentication via credentials json
78 | # - name: Google Auth
79 | # id: auth
80 | # uses: 'google-github-actions/auth@v0'
81 | # with:
82 | # credentials_json: '${{ secrets.GCP_CREDENTIALS }}'
83 |
84 | - name: Deploy to Cloud Run
85 | id: deploy
86 | uses: google-github-actions/deploy-cloudrun@v0
87 | with:
88 | service: ${{ env.SERVICE }}
89 | region: ${{ env.REGION }}
90 | # NOTE: If required, update to the appropriate source folder
91 | source: ./
92 |
93 | # If required, use the Cloud Run url output in later steps
94 | - name: Show Output
95 | run: echo ${{ steps.deploy.outputs.url }}
96 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # VSCode project settings
118 | .vscode
119 | *.code-workspace
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
--------------------------------------------------------------------------------
/.mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 |
3 | # A comma-separated list of package or module names from where C extensions may
4 | # be loaded. Extensions are loading into the active Python interpreter and may
5 | # run arbitrary code.
6 | extension-pkg-whitelist=lxml.etree
7 |
8 | # Add files or directories to the blacklist. They should be base names, not
9 | # paths.
10 | ignore=CVS
11 |
12 | # Add files or directories matching the regex patterns to the blacklist. The
13 | # regex matches against base names, not paths.
14 | ignore-patterns=
15 |
16 | # Python code to execute, usually for sys.path manipulation such as
17 | # pygtk.require().
18 | #init-hook=
19 |
20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
21 | # number of processors available to use.
22 | jobs=1
23 |
24 | # Control the amount of potential inferred values when inferring a single
25 | # object. This can help the performance when dealing with large functions or
26 | # complex, nested conditions.
27 | limit-inference-results=100
28 |
29 | # List of plugins (as comma separated values of python module names) to load,
30 | # usually to register additional checkers.
31 | load-plugins=
32 |
33 | # Pickle collected data for later comparisons.
34 | persistent=yes
35 |
36 | # Specify a configuration file.
37 | #rcfile=
38 |
39 | # When enabled, pylint would attempt to guess common misconfiguration and emit
40 | # user-friendly hints instead of false-positive error messages.
41 | suggestion-mode=yes
42 |
43 | # Allow loading of arbitrary C extensions. Extensions are imported into the
44 | # active Python interpreter and may run arbitrary code.
45 | unsafe-load-any-extension=no
46 |
47 |
48 | [MESSAGES CONTROL]
49 |
50 | # Only show warnings with the listed confidence levels. Leave empty to show
51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
52 | confidence=
53 |
54 | # Disable the message, report, category or checker with the given id(s). You
55 | # can either give multiple identifiers separated by comma (,) or put this
56 | # option multiple times (only on the command line, not in the configuration
57 | # file where it should appear only once). You can also use "--disable=all" to
58 | # disable everything first and then reenable specific checks. For example, if
59 | # you want to run only the similarities checker, you can use "--disable=all
60 | # --enable=similarities". If you want to run only the classes checker, but have
61 | # no Warning level messages displayed, use "--disable=all --enable=classes
62 | # --disable=W".
63 | disable=print-statement,
64 | parameter-unpacking,
65 | unpacking-in-except,
66 | old-raise-syntax,
67 | backtick,
68 | long-suffix,
69 | old-ne-operator,
70 | old-octal-literal,
71 | import-star-module-level,
72 | non-ascii-bytes-literal,
73 | raw-checker-failed,
74 | bad-inline-option,
75 | locally-disabled,
76 | file-ignored,
77 | suppressed-message,
78 | useless-suppression,
79 | deprecated-pragma,
80 | use-symbolic-message-instead,
81 | apply-builtin,
82 | basestring-builtin,
83 | buffer-builtin,
84 | cmp-builtin,
85 | coerce-builtin,
86 | execfile-builtin,
87 | file-builtin,
88 | long-builtin,
89 | raw_input-builtin,
90 | reduce-builtin,
91 | standarderror-builtin,
92 | unicode-builtin,
93 | xrange-builtin,
94 | coerce-method,
95 | delslice-method,
96 | getslice-method,
97 | setslice-method,
98 | no-absolute-import,
99 | old-division,
100 | dict-iter-method,
101 | dict-view-method,
102 | next-method-called,
103 | metaclass-assignment,
104 | indexing-exception,
105 | raising-string,
106 | reload-builtin,
107 | oct-method,
108 | hex-method,
109 | nonzero-method,
110 | cmp-method,
111 | input-builtin,
112 | round-builtin,
113 | intern-builtin,
114 | unichr-builtin,
115 | map-builtin-not-iterating,
116 | zip-builtin-not-iterating,
117 | range-builtin-not-iterating,
118 | filter-builtin-not-iterating,
119 | using-cmp-argument,
120 | eq-without-hash,
121 | div-method,
122 | idiv-method,
123 | rdiv-method,
124 | exception-message-attribute,
125 | invalid-str-codec,
126 | sys-max-int,
127 | bad-python3-import,
128 | deprecated-string-function,
129 | deprecated-str-translate-call,
130 | deprecated-itertools-function,
131 | deprecated-types-field,
132 | next-method-defined,
133 | dict-items-not-iterating,
134 | dict-keys-not-iterating,
135 | dict-values-not-iterating,
136 | deprecated-operator-function,
137 | deprecated-urllib-function,
138 | xreadlines-attribute,
139 | deprecated-sys-function,
140 | exception-escape,
141 | comprehension-escape
142 |
143 | # Enable the message, report, category or checker with the given id(s). You can
144 | # either give multiple identifier separated by comma (,) or put this option
145 | # multiple time (only on the command line, not in the configuration file where
146 | # it should appear only once). See also the "--disable" option for examples.
147 | enable=c-extension-no-member
148 |
149 |
150 | [REPORTS]
151 |
152 | # Python expression which should return a score less than or equal to 10. You
153 | # have access to the variables 'error', 'warning', 'refactor', and 'convention'
154 | # which contain the number of messages in each category, as well as 'statement'
155 | # which is the total number of statements analyzed. This score is used by the
156 | # global evaluation report (RP0004).
157 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
158 |
159 | # Template used to display messages. This is a python new-style format string
160 | # used to format the message information. See doc for all details.
161 | #msg-template=
162 |
163 | # Set the output format. Available formats are text, parseable, colorized, json
164 | # and msvs (visual studio). You can also give a reporter class, e.g.
165 | # mypackage.mymodule.MyReporterClass.
166 | output-format=text
167 |
168 | # Tells whether to display a full report or only the messages.
169 | reports=no
170 |
171 | # Activate the evaluation score.
172 | score=yes
173 |
174 |
175 | [REFACTORING]
176 |
177 | # Maximum number of nested blocks for function / method body
178 | max-nested-blocks=5
179 |
180 | # Complete name of functions that never returns. When checking for
181 | # inconsistent-return-statements if a never returning function is called then
182 | # it will be considered as an explicit return statement and no message will be
183 | # printed.
184 | never-returning-functions=sys.exit
185 |
186 |
187 | [STRING]
188 |
189 | # This flag controls whether the implicit-str-concat-in-sequence should
190 | # generate a warning on implicit string concatenation in sequences defined over
191 | # several lines.
192 | check-str-concat-over-line-jumps=no
193 |
194 |
195 | [SPELLING]
196 |
197 | # Limits count of emitted suggestions for spelling mistakes.
198 | max-spelling-suggestions=4
199 |
200 | # Spelling dictionary name. Available dictionaries: none. To make it work,
201 | # install the python-enchant package.
202 | spelling-dict=
203 |
204 | # List of comma separated words that should not be checked.
205 | spelling-ignore-words=
206 |
207 | # A path to a file that contains the private dictionary; one word per line.
208 | spelling-private-dict-file=
209 |
210 | # Tells whether to store unknown words to the private dictionary (see the
211 | # --spelling-private-dict-file option) instead of raising a message.
212 | spelling-store-unknown-words=no
213 |
214 |
215 | [TYPECHECK]
216 |
217 | # List of decorators that produce context managers, such as
218 | # contextlib.contextmanager. Add to this list to register other decorators that
219 | # produce valid context managers.
220 | contextmanager-decorators=contextlib.contextmanager
221 |
222 | # List of members which are set dynamically and missed by pylint inference
223 | # system, and so shouldn't trigger E1101 when accessed. Python regular
224 | # expressions are accepted.
225 | generated-members=
226 |
227 | # Tells whether missing members accessed in mixin class should be ignored. A
228 | # mixin class is detected if its name ends with "mixin" (case insensitive).
229 | ignore-mixin-members=yes
230 |
231 | # Tells whether to warn about missing members when the owner of the attribute
232 | # is inferred to be None.
233 | ignore-none=yes
234 |
235 | # This flag controls whether pylint should warn about no-member and similar
236 | # checks whenever an opaque object is returned when inferring. The inference
237 | # can return multiple potential results while evaluating a Python object, but
238 | # some branches might not be evaluated, which results in partial inference. In
239 | # that case, it might be useful to still emit no-member and other checks for
240 | # the rest of the inferred objects.
241 | ignore-on-opaque-inference=yes
242 |
243 | # List of class names for which member attributes should not be checked (useful
244 | # for classes with dynamically set attributes). This supports the use of
245 | # qualified names.
246 | ignored-classes=optparse.Values,thread._local,_thread._local
247 |
248 | # List of module names for which member attributes should not be checked
249 | # (useful for modules/projects where namespaces are manipulated during runtime
250 | # and thus existing member attributes cannot be deduced by static analysis). It
251 | # supports qualified module names, as well as Unix pattern matching.
252 | ignored-modules=
253 |
254 | # Show a hint with possible names when a member name was not found. The aspect
255 | # of finding the hint is based on edit distance.
256 | missing-member-hint=yes
257 |
258 | # The minimum edit distance a name should have in order to be considered a
259 | # similar match for a missing member name.
260 | missing-member-hint-distance=1
261 |
262 | # The total number of similar names that should be taken in consideration when
263 | # showing a hint for a missing member.
264 | missing-member-max-choices=1
265 |
266 | # List of decorators that change the signature of a decorated function.
267 | signature-mutators=
268 |
269 |
270 | [BASIC]
271 |
272 | # Naming style matching correct argument names.
273 | argument-naming-style=snake_case
274 |
275 | # Regular expression matching correct argument names. Overrides argument-
276 | # naming-style.
277 | #argument-rgx=
278 |
279 | # Naming style matching correct attribute names.
280 | attr-naming-style=snake_case
281 |
282 | # Regular expression matching correct attribute names. Overrides attr-naming-
283 | # style.
284 | #attr-rgx=
285 |
286 | # Bad variable names which should always be refused, separated by a comma.
287 | bad-names=foo,
288 | bar,
289 | baz,
290 | toto,
291 | tutu,
292 | tata
293 |
294 | # Naming style matching correct class attribute names.
295 | class-attribute-naming-style=any
296 |
297 | # Regular expression matching correct class attribute names. Overrides class-
298 | # attribute-naming-style.
299 | #class-attribute-rgx=
300 |
301 | # Naming style matching correct class names.
302 | class-naming-style=PascalCase
303 |
304 | # Regular expression matching correct class names. Overrides class-naming-
305 | # style.
306 | #class-rgx=
307 |
308 | # Naming style matching correct constant names.
309 | const-naming-style=UPPER_CASE
310 |
311 | # Regular expression matching correct constant names. Overrides const-naming-
312 | # style.
313 | #const-rgx=
314 |
315 | # Minimum line length for functions/classes that require docstrings, shorter
316 | # ones are exempt.
317 | docstring-min-length=-1
318 |
319 | # Naming style matching correct function names.
320 | function-naming-style=snake_case
321 |
322 | # Regular expression matching correct function names. Overrides function-
323 | # naming-style.
324 | #function-rgx=
325 |
326 | # Good variable names which should always be accepted, separated by a comma.
327 | good-names=i,
328 | j,
329 | k,
330 | ex,
331 | Run,
332 | _
333 |
334 | # Include a hint for the correct naming format with invalid-name.
335 | include-naming-hint=no
336 |
337 | # Naming style matching correct inline iteration names.
338 | inlinevar-naming-style=any
339 |
340 | # Regular expression matching correct inline iteration names. Overrides
341 | # inlinevar-naming-style.
342 | #inlinevar-rgx=
343 |
344 | # Naming style matching correct method names.
345 | method-naming-style=snake_case
346 |
347 | # Regular expression matching correct method names. Overrides method-naming-
348 | # style.
349 | #method-rgx=
350 |
351 | # Naming style matching correct module names.
352 | module-naming-style=snake_case
353 |
354 | # Regular expression matching correct module names. Overrides module-naming-
355 | # style.
356 | #module-rgx=
357 |
358 | # Colon-delimited sets of names that determine each other's naming style when
359 | # the name regexes allow several styles.
360 | name-group=
361 |
362 | # Regular expression which should only match function or class names that do
363 | # not require a docstring.
364 | no-docstring-rgx=^_
365 |
366 | # List of decorators that produce properties, such as abc.abstractproperty. Add
367 | # to this list to register other decorators that produce valid properties.
368 | # These decorators are taken in consideration only for invalid-name.
369 | property-classes=abc.abstractproperty
370 |
371 | # Naming style matching correct variable names.
372 | variable-naming-style=snake_case
373 |
374 | # Regular expression matching correct variable names. Overrides variable-
375 | # naming-style.
376 | #variable-rgx=
377 |
378 |
379 | [SIMILARITIES]
380 |
381 | # Ignore comments when computing similarities.
382 | ignore-comments=yes
383 |
384 | # Ignore docstrings when computing similarities.
385 | ignore-docstrings=yes
386 |
387 | # Ignore imports when computing similarities.
388 | ignore-imports=no
389 |
390 | # Minimum lines number of a similarity.
391 | min-similarity-lines=4
392 |
393 |
394 | [MISCELLANEOUS]
395 |
396 | # List of note tags to take in consideration, separated by a comma.
397 | notes=FIXME,
398 | XXX,
399 | TODO
400 |
401 |
402 | [LOGGING]
403 |
404 | # Format style used to check logging format string. `old` means using %
405 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings.
406 | logging-format-style=old
407 |
408 | # Logging modules to check that the string format arguments are in logging
409 | # function parameter format.
410 | logging-modules=logging
411 |
412 |
413 | [VARIABLES]
414 |
415 | # List of additional names supposed to be defined in builtins. Remember that
416 | # you should avoid defining new builtins when possible.
417 | additional-builtins=
418 |
419 | # Tells whether unused global variables should be treated as a violation.
420 | allow-global-unused-variables=yes
421 |
422 | # List of strings which can identify a callback function by name. A callback
423 | # name must start or end with one of those strings.
424 | callbacks=cb_,
425 | _cb
426 |
427 | # A regular expression matching the name of dummy variables (i.e. expected to
428 | # not be used).
429 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
430 |
431 | # Argument names that match this expression will be ignored. Default to name
432 | # with leading underscore.
433 | ignored-argument-names=_.*|^ignored_|^unused_
434 |
435 | # Tells whether we should check for unused import in __init__ files.
436 | init-import=no
437 |
438 | # List of qualified module names which can have objects that can redefine
439 | # builtins.
440 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
441 |
442 |
443 | [FORMAT]
444 |
445 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
446 | expected-line-ending-format=
447 |
448 | # Regexp for a line that is allowed to be longer than the limit.
449 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
450 |
451 | # Number of spaces of indent required inside a hanging or continued line.
452 | indent-after-paren=4
453 |
454 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
455 | # tab).
456 | indent-string=' '
457 |
458 | # Maximum number of characters on a single line.
459 | max-line-length=120
460 |
461 | # Maximum number of lines in a module.
462 | max-module-lines=1000
463 |
464 | # List of optional constructs for which whitespace checking is disabled. `dict-
465 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
466 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
467 | # `empty-line` allows space-only lines.
468 | no-space-check=trailing-comma,
469 | dict-separator
470 |
471 | # Allow the body of a class to be on the same line as the declaration if body
472 | # contains single statement.
473 | single-line-class-stmt=no
474 |
475 | # Allow the body of an if to be on the same line as the test if there is no
476 | # else.
477 | single-line-if-stmt=no
478 |
479 |
480 | [IMPORTS]
481 |
482 | # List of modules that can be imported at any level, not just the top level
483 | # one.
484 | allow-any-import-level=
485 |
486 | # Allow wildcard imports from modules that define __all__.
487 | allow-wildcard-with-all=no
488 |
489 | # Analyse import fallback blocks. This can be used to support both Python 2 and
490 | # 3 compatible code, which means that the block might have code that exists
491 | # only in one or another interpreter, leading to false positives when analysed.
492 | analyse-fallback-blocks=no
493 |
494 | # Deprecated modules which should not be used, separated by a comma.
495 | deprecated-modules=optparse,tkinter.tix
496 |
497 | # Create a graph of external dependencies in the given file (report RP0402 must
498 | # not be disabled).
499 | ext-import-graph=
500 |
501 | # Create a graph of every (i.e. internal and external) dependencies in the
502 | # given file (report RP0402 must not be disabled).
503 | import-graph=
504 |
505 | # Create a graph of internal dependencies in the given file (report RP0402 must
506 | # not be disabled).
507 | int-import-graph=
508 |
509 | # Force import order to recognize a module as part of the standard
510 | # compatibility libraries.
511 | known-standard-library=
512 |
513 | # Force import order to recognize a module as part of a third party library.
514 | known-third-party=enchant
515 |
516 | # Couples of modules and preferred modules, separated by a comma.
517 | preferred-modules=
518 |
519 |
520 | [DESIGN]
521 |
522 | # Maximum number of arguments for function / method.
523 | max-args=12
524 |
525 | # Maximum number of attributes for a class (see R0902).
526 | max-attributes=7
527 |
528 | # Maximum number of boolean expressions in an if statement (see R0916).
529 | max-bool-expr=5
530 |
531 | # Maximum number of branch for function / method body.
532 | max-branches=12
533 |
534 | # Maximum number of locals for function / method body.
535 | max-locals=15
536 |
537 | # Maximum number of parents for a class (see R0901).
538 | max-parents=7
539 |
540 | # Maximum number of public methods for a class (see R0904).
541 | max-public-methods=20
542 |
543 | # Maximum number of return / yield for function / method body.
544 | max-returns=6
545 |
546 | # Maximum number of statements in function / method body.
547 | max-statements=100
548 |
549 | # Minimum number of public methods for a class (see R0903).
550 | min-public-methods=2
551 |
552 |
553 | [CLASSES]
554 |
555 | # List of method names used to declare (i.e. assign) instance attributes.
556 | defining-attr-methods=__init__,
557 | __new__,
558 | setUp,
559 | __post_init__
560 |
561 | # List of member names, which should be excluded from the protected access
562 | # warning.
563 | exclude-protected=_asdict,
564 | _fields,
565 | _replace,
566 | _source,
567 | _make
568 |
569 | # List of valid names for the first argument in a class method.
570 | valid-classmethod-first-arg=cls
571 |
572 | # List of valid names for the first argument in a metaclass class method.
573 | valid-metaclass-classmethod-first-arg=cls
574 |
575 |
576 | [EXCEPTIONS]
577 |
578 | # Exceptions that will emit a warning when being caught. Defaults to
579 | # "BaseException, Exception".
580 | overgeneral-exceptions=BaseException,
581 | Exception
582 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | -------------------------------
2 | version_number = '0.0.24.b2'
3 | -------------------------------
4 | Fixed bug with Prophet Multivariate Prediction where it needed to pass forecast_period in
5 | addition to X_exogen. Only X_exogen is needed now. The forecast period is calculated based on the
6 | number of observations in the X_exogen data.
7 |
8 | TODO: Make sure all predict functions are consistent (Prophet now has an Optional
9 | return if things go wrong. Others should do the same.)
10 |
11 | -------------------------------
12 | version_number = '0.0.24'
13 | -------------------------------
14 | Added 'auto_arima' capability from the pmdarima library
15 |
16 | -------------------------------
17 | version_number = '0.0.23.b4'
18 | -------------------------------
19 | Changed default argument for 'sep' in fit function to be 'None' (treated as ',' internally).
20 |
21 | Fixed bug with the predict function in auto_ts.
22 | The dataframe index for X_exogen needed to be set before passing it to predict, since we were doing the same
23 | while fitting. Without this, it was causing issues with ML models where we internally
24 | construct the 'future dataframe': if the dataframe had a datetime index while fitting but
25 | X_exogen had an integer index while predicting (the index was still in a dataframe column in X_exogen), it
26 | caused issues while adding time series features (time series features cannot be derived from integers).
27 |
28 |
29 | -------------------------------
30 | version_number = '0.0.23.b3'
31 | -------------------------------
32 | More time series engineered features included in ML models
33 | Example, 'dayofweek', 'quarter', 'month', 'year', 'dayofyear', 'weekofyear', 'weekend', etc.
34 |
35 |
36 | -------------------------------
37 | version_number = '0.0.23.b2'
38 | -------------------------------
39 | Fixed bug in Prophet rolling window horizon calculation
40 |
41 |
42 | -------------------------------
43 | version_number = '0.0.23'
44 | -------------------------------
45 | Prophet now includes multivariate modeling capability with rolling window
46 | SARIMAX also includes multivariate modeling capability with rolling window
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include auto_ts/py.typed # marker file for PEP 561
3 |
4 | include CHANGELOG.md
5 | include LICENSE
6 | include CITATION.cff
7 | include *.cff # citation info
8 |
9 | include MANIFEST.in
10 | include pyproject.toml
11 | include setup.py
12 | include setup.cfg
13 |
14 | include requirements.txt
15 |
16 | recursive-exclude tests *
17 | recursive-exclude docs *
18 | recursive-exclude site *
19 | recursive-exclude example_datasets *
20 | recursive-exclude example_notebooks *
21 | recursive-exclude .github *
22 |
23 | exclude .flake8
24 | exclude .gitignore
25 | exclude .mypy.ini
26 | exclude .pre-commit-config.yaml
27 | exclude .pylintrc
28 | exclude Makefile
29 | exclude updates.md
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Auto_TS: Auto_TimeSeries
2 | Automatically build multiple Time Series models using a Single Line of Code. Now updated with Dask.
3 |
4 | 
5 |
6 | `auto_timeseries` is a complex model building utility for time series data. Since it automates many
7 | tasks involved in a complex endeavor, it assumes many intelligent defaults. But you can change them.
8 | Auto_Timeseries will rapidly build predictive models based on Statsmodels ARIMA, Seasonal ARIMA, Prophet
9 | and Scikit-Learn ML. It will automatically select the model that gives the best score on the metric you specify.
10 |
11 | # Table of Contents
12 |
24 |
25 | ## Latest
26 | If you are looking for the latest and greatest updates about our library, check out our [updates page](https://github.com/AutoViML/Auto_TS/blob/master/updates.md).
27 |
28 |
29 | ## Citation
30 | If you use Auto_TS in your research project or paper, please use the following format for citations:
31 |
32 | "Seshadri, Ram (2020). GitHub - AutoViML/Auto_TS: enables you to build and deploy multiple time series models using ML and statistical techniques with a single line of code. Source code: https://github.com/AutoViML/Auto_TS"
33 |
34 | ## Introduction
35 |
36 | Auto_TS (Auto_TimeSeries) enables you to build and select multiple time series models using techniques such as ARIMA, SARIMAX, VAR, decomposable (trend+seasonality+holidays) models, and ensemble machine learning models.
37 |
38 | Auto_TimeSeries is an Automated ML library for time series data. Auto_TimeSeries was initially conceived and developed by [Ram Seshadri](https://www.linkedin.com/in/ram-seshadri-nyc-nj/) and was significantly expanded in functionality and scope and upgraded to its present status by [Nikhil Gupta](https://github.com/ngupta23).
39 |
40 | auto_ts.auto_timeseries is the main interface that you will call with your train data. You can then choose what kind of models you want: stats, ML, or Prophet-based models. You can also tell it to automatically select the best model based on the scoring parameter you specify. It will return the best model and a dictionary containing predictions for the number of forecast_periods you request (default=2).
41 |
42 | ## Install
43 |
44 | ```bash
45 | pip install auto-ts
46 | ```
47 |
48 | Use `pip3 install auto-ts` if the above doesn’t work. To install the latest version directly from GitHub:
49 |
50 | ```bash
51 | pip install git+https://github.com/AutoViML/Auto_TS.git
52 | ```
53 |
54 | ### Installing on Colab
55 | If you are using Colab or Kaggle kernel and want to install auto_ts, please use the following steps (otherwise you will get an error!):
56 |
57 | ```
58 | !pip install auto-ts --no-deps --ignore-installed
59 | !pip install 'fsspec>=0.3.3'
60 | !pip install statsmodels --upgrade
61 | !pip install pmdarima
62 | ```
63 |
64 | 
65 |
66 | ### Installing on Windows
67 |
68 | Windows users may experience difficulties with the Prophet and pystan dependency installations. Because of this, we recommend installing Prophet using instructions from the [Prophet documentation page](https://facebook.github.io/prophet/docs/installation.html) prior to installing auto-ts. For Anaconda users, this can be accomplished via:
69 | ```bash
70 | conda install -c conda-forge prophet
71 | pip install auto-ts
72 | ```
73 |
74 | ## Usage
75 |
76 | ### First, you need to import auto_timeseries from the auto_ts library:
77 |
78 | ```py
79 | from auto_ts import auto_timeseries
80 | ```
81 |
82 | ### Second, Initialize an auto_timeseries model object which will hold all your parameters:
83 |
84 | ```py
85 | model = auto_timeseries(
86 | score_type='rmse',
87 | time_interval='Month',
88 | non_seasonal_pdq=None, seasonality=False,
89 | seasonal_period=12,
90 | model_type=['Prophet'],
91 | verbose=2,
92 | )
93 | ```
94 |
95 | #### Here is how the input parameters are defined (a combined example follows this list):
96 |
97 | - **score_type (default='rmse')**: The metric used for scoring the models. Type is string.
98 | Currently only the following two types are supported:
99 | 1. "rmse": Root Mean Squared Error (RMSE)
100 | 1. "normalized_rmse": Ratio of RMSE to the standard deviation of actuals
101 | - **time_interval (default is None)**: Used to indicate the frequency at which the data is collected.
102 | This is used for two purposes (1) in building the Prophet model and (2) used to impute the seasonal period for SARIMAX in case it is not provided by the user (None). Type is String. We use the following [pandas date range frequency](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases) aliases that Prophet uses to make the prediction dataframe. Hence, please note that these are the list of allowed aliases for frequency:
103 | `['B','C','D','W','M','SM','BM','CBM',
104 | 'MS','SMS','BMS','CBMS','Q','BQ','QS','BQS',
105 | 'A,Y','BA,BY','AS,YS','BAS,BYS','BH',
106 | 'H','T,min','S','L,ms','U,us','N']`
107 | To start, you can test the following aliases on your data and see how the results look (or you can leave it as None and auto_timeseries will try to impute it for you):
108 | - `'MS', 'M', 'SM', 'BM', 'CBM', 'SMS', 'BMS'` for monthly frequency data
109 | - `'D', 'B', 'C'` for daily frequency data
110 | - `'W'` for weekly frequency data
111 | - `'Q', 'BQ', 'QS', 'BQS'` for quarterly frequency data
112 | - `'A,Y', 'BA,BY', 'AS,YS', 'BAS,BYS'` for yearly frequency data
113 | - `'BH', 'H', 'h'` for hourly frequency data
114 | - `'T,min'` for minute frequency data
115 | - `'S', 'L,milliseconds', 'U,microseconds', 'N,nanoseconds'` for second and sub-second frequency data
116 | - **non_seasonal_pdq (default = (3,1,3))**: Indicates the maximum value of (p, d, q) to be used in the search for statistical ARIMA models.
117 | If None, then the following values are assumed `max_p = 3, max_d = 1, max_q = 3`. Type is Tuple.
118 | - **seasonality (default=False)**: Used in the building of the SARIMAX model only at this time. True or False. Type is bool.
119 | - **seasonal_period (default is None)**: Indicates the seasonal period in your data. This depends on the peak (or valley) period that occurs regularly in your data.
120 | Used in the building of the SARIMAX model only at this time.
121 | There is no impact of this argument if seasonality is set to False
122 | If None, the program will try to infer this from the time_interval (frequency) of the data
123 | We assume the following as defaults but feel free to change them.
124 | 1. If frequency is Monthly, then seasonal_period is assumed to be 12
125 | 1. If frequency is Daily, then seasonal_period is assumed to be 30 (but it could be 7)
126 | 1. If frequency is Weekly, then seasonal_period is assumed to be 52
127 | 1. If frequency is Quarterly, then seasonal_period is assumed to be 4
128 | 1. If frequency is Yearly, then seasonal_period is assumed to be 1
129 | 1. If frequency is Hourly, then seasonal_period is assumed to be 24
130 | 1. If frequency is Minutes, then seasonal_period is assumed to be 60
131 | 1. If frequency is Seconds, then seasonal_period is assumed to be 60
132 | Type is integer
133 | - **conf_int (default=0.95)**: Confidence Interval for building the Prophet model. Default: 0.95. Type is float.
134 | - **model_type (default: 'stats')**: The type(s) of model to build. Defaults to building only statistical models. If a list is provided, then only those models will be built. Can be a string or a list of models. Allowed values are:
135 | `'best', 'prophet', 'stats', 'ARIMA', 'SARIMAX', 'VAR', 'ML'`.
136 | - `"prophet"` will build a model using Prophet -> this means you must have Prophet installed
137 | - `"stats"` will build statsmodels based ARIMA, SARIMAX and VAR models
138 | - `"ML"` will build a machine learning model using Random Forests provided explanatory vars are given
139 | - `"best"` will try to build all models and pick the best one
140 | - **verbose (default=0)**: Indicates the verbosity of printing. Type is integer.
141 |
142 | WARNING: "best" might take some time for large data sets. We recommend that you
143 | choose a small sample from your data set before attempting to run entire data.
144 |
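As a combined example, here is a minimal sketch that puts several of the parameters above together (it assumes daily data with a weekly seasonal cycle; adjust the values to your own data):

```py
from auto_ts import auto_timeseries

# Sketch: build SARIMAX and ML models for daily data with weekly seasonality
model = auto_timeseries(
    score_type='rmse',
    time_interval='D',        # daily frequency
    non_seasonal_pdq=None,    # search up to the defaults max_p=3, max_d=1, max_q=3
    seasonality=True,
    seasonal_period=7,        # weekly seasonal cycle
    model_type=['SARIMAX', 'ML'],
    verbose=1,
)
```
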
145 | ### The next step after defining the model object is to fit it with some real data:
146 |
147 | ```py
148 | model.fit(
149 | traindata=train_data,
150 | ts_column=ts_column,
151 | target=target,
152 | cv=5,
153 | sep=","
154 | )
155 | ```
156 |
157 | Here is how the parameters are defined (a worked example follows this list):
158 | - **traindata (required)**: It can be either a dataframe or a file. If it is a file, you must give the file name along with its path. It also accepts a pandas dataframe in case you already have one loaded in your notebook.
159 | - **ts_column (required)**: name of the datetime column in your dataset (it could be a name of column or index number in the columns index).
160 | - **target (required)**: name of the column you are trying to predict. Target could also be the only column in your dataset.
161 | - **cv (default=5)**: You can enter any integer for the number of folds you want in your cross validation data set.
162 | - **sep (default=",")**: sep is the separator used in your traindata file. If your separator is ",", "\t", or ";", make sure you enter it here. If traindata is a dataframe, this is ignored.
163 |
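For instance, here is a sketch of fitting on one of the bundled example datasets. It assumes the data is monthly and that the datetime and target columns are named 'Time Period' and 'Sales'; these names are assumptions, so replace them with the actual column names in your file:

```py
import pandas as pd
from auto_ts import auto_timeseries

# Load the bundled sample data (any CSV or dataframe of your own works the same way)
train_data = pd.read_csv('example_datasets/Sales_and_Marketing.csv')

model = auto_timeseries(score_type='rmse', time_interval='M', model_type='best')
model.fit(
    traindata=train_data,      # a pandas dataframe (a file path also works)
    ts_column='Time Period',   # assumed name of the datetime column
    target='Sales',            # assumed name of the target column
    cv=5,
)
```
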
164 | ### The next step after training the model object is to make some predictions with test data:
165 |
166 | ```py
167 | predictions = model.predict(
168 | testdata = ..., # can be either a dataframe or an integer standing for the forecast_period,
169 | model = 'best' # or any other string that stands for the trained model
170 | )
171 | ```
172 |
173 | Here is how the parameters are defined. You can either send testdata in the form of a dataframe or send in an integer for the number of periods you want to forecast; you only need to supply one of the two (a sketch follows this list).
174 | - **testdata (required)**: It can be either a dataframe containing test data or an integer standing for the forecast_period you want.
175 | - **model (optional, default = 'best')**: The name of the model you want to use among the many different models you have trained. Remember that the default is the best model. But you can choose any model that you want to forecast with. Type is String.
176 |
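Here is a short sketch of both options; the 8-period horizon is arbitrary, and the test dataframe is assumed to carry the same exogenous columns used during fit:

```py
# Option 1: forecast a fixed number of future periods with the best trained model
future_forecast = model.predict(testdata=8, model='best')
print(future_forecast)

# Option 2: predict over a held-out test dataframe instead
# test_forecast = model.predict(testdata=test_data, model='best')
```
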
177 | ## Requirements
178 | dask, scikit-learn, prophet, statsmodels, pmdarima, XGBoost
179 |
180 | ## License
181 | Apache License 2.0
182 |
183 | ## Tips
184 |
185 | - We recommend that you choose a small sample from your data set before attempting to run the entire data set. Specify the evaluation metric so that Auto_Timeseries can select the best model. Currently, models within “stats” are compared using AIC and BIC. However, models across different types are compared using RMSE. The results of models are shown using RMSE and Normalized RMSE (the ratio of RMSE to the standard deviation of actuals).
186 | - You must clean the data and not have any missing values. Make sure the target variable is numeric; otherwise, it won’t run. If there is more than one target variable in your data set, just specify one for now. If you know the time interval of your data, you can specify it; otherwise, auto-ts will try to infer the time interval on its own.
187 | - If you give Auto_Timeseries a different time interval than what the data has, it will automatically resample the data to the given time interval and use the mean of the target for the resampled period.
188 | - Notice that except for filename and ts_column input arguments, which are required, all other arguments are optional.
189 | - Note that optionally you can give a separator for the data in your file. Default is comma (",").
190 | - “time_interval” options are any codes that you can find in this page below.
191 | [Pandas date-range frequency aliases](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases)
192 | - Optionally, you can give seasonal_period as any integer that measures the seasonality in the data. If not given, seasonal_period is assumed automatically as follows:
193 | - Months = 12,
194 | - Days = 30,
195 | - Weeks = 52,
196 | - Qtr = 4,
197 | - Year = 1,
198 | - Hours = 24,
199 | - Minutes = 60 and
200 | - Seconds = 60.
201 | - If you want to give your own non-seasonal order, please input it as non_seasonal_pdq and for seasonal order, use seasonal_PDQ as the input. Use tuples. For example, `seasonal_PDQ = (2,1,2)` and `non_seasonal_pdq = (0,0,3)`. It will accept only tuples. The default is None and Auto_Timeseries will automatically search for the best p,d,q (for Non Seasonal) and P, D, Q (for Seasonal) orders by searching for all parameters from 0 to 12 for each value of p,d,q and 0-3 for each P, Q and 0-1 for D.
202 |
203 | ## DISCLAIMER
204 |
205 | This is not an Officially supported Google project.
206 |
207 | ## Copyright
208 |
209 | © Google
210 |
--------------------------------------------------------------------------------
/auto_ts/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/auto_ts/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows thumbnail cache files
2 | Thumbs.db
3 | ehthumbs.db
4 | ehthumbs_vista.db
5 |
6 | # Folder config file
7 | Desktop.ini
8 |
9 | # Recycle Bin used on file shares
10 | $RECYCLE.BIN/
11 |
12 | # Windows Installer files
13 | *.cab
14 | *.msi
15 | *.msm
16 | *.msp
17 |
18 | # Windows shortcuts
19 | *.lnk
20 |
21 | # =========================
22 | # Operating System Files
23 | # =========================
24 |
--------------------------------------------------------------------------------
/auto_ts/__version__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Specifies the version of the Auto_TS package."""
3 |
4 | __title__ = "Auto_TS"
5 | __author__ = "Ram Seshadri"
6 | __description__ = "Build time series models for any data set, any size. Now using dask."
7 | __url__ = "https://github.com/Auto_ViML/Auto_TS.git"
8 | __version__ = "0.0.92"
9 | __license__ = "Apache License 2.0"
10 | __copyright__ = "2020-22 Google"
11 |
--------------------------------------------------------------------------------
/auto_ts/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .ar_based import BuildArima, BuildSarimax, BuildAutoSarimax, BuildVAR
2 | from .build_base import BuildBase
3 | from .build_ml import BuildML
4 | from .build_prophet import BuildProphet
5 | from .build_pyflux import build_pyflux_model
6 |
--------------------------------------------------------------------------------
/auto_ts/models/ar_based/__init__.py:
--------------------------------------------------------------------------------
1 | from .build_arima import BuildArima
2 | from .build_autoarimax import BuildAutoSarimax
3 | from .build_sarimax import BuildSarimax
4 | from .build_var import BuildVAR
5 |
--------------------------------------------------------------------------------
/auto_ts/models/ar_based/build_arima.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import itertools
3 | import operator
4 | import warnings
5 | from typing import Optional
6 |
7 | import matplotlib.pyplot as plt # type: ignore
8 | import numpy as np # type: ignore
9 | import pandas as pd # type: ignore
10 | import seaborn as sns # type: ignore
11 | from pandas.core.generic import NDFrame # type:ignore
12 |
13 | sns.set(style="white", color_codes=True)
14 |
15 | # imported ARIMA from statsmodels pkg
16 | from statsmodels.tsa.arima_model import ARIMA # type: ignore
17 |
18 | # helper functions
19 | from ...utils import print_static_rmse, print_dynamic_rmse
20 | from ...models.ar_based.param_finder import find_lowest_pq
21 |
22 |
23 | class BuildArima():
24 | def __init__(self, metric='aic', p_max=3, d_max=1, q_max=3, forecast_period=2, method='mle', verbose=0):
25 | """
26 | Automatically build an ARIMA Model
27 | """
28 | self.metric = metric
29 | self.p_max = p_max
30 | self.d_max = d_max
31 | self.q_max = q_max
32 | self.forecast_period = forecast_period
33 | self.method = method
34 | self.verbose = verbose
35 | self.model = None
36 |
37 | def fit(self, ts_df):
38 | """
39 |         Build a Time Series Model using ARIMA from statsmodels.
40 | 
41 |         This builds a Non-Seasonal ARIMA model given a univariate time series dataframe with time
42 |         as the index. ts_df can be a dataframe with only one column or a single array. Do not send
43 |         multiple columns; include only the variable that is the time series. Do NOT include
44 |         non-stationary data. Make sure your time series is "stationary"; if not, this
45 |         will give spurious results. Since it automatically builds a Non-Seasonal model,
46 |         you need not give it a Seasonal True/False flag.
47 |         "metric": You can give it any of the following metrics as criteria: AIC, BIC, Deviance,
48 |         Log-likelihood. Optionally, you can give it a fit method as one of the following:
49 |         {'css-mle', 'mle', 'css'}
50 | """
51 |
52 | solver = 'lbfgs' # default
53 |
54 | p_min = 0
55 | d_min = 0
56 | q_min = 0
57 | # Initialize a DataFrame to store the results
58 | iteration = 0
59 | results_dict = {}
60 |
61 | ################################################################################
62 | ####### YOU MUST Absolutely set this parameter correctly as "levels". If not,
63 | #### YOU WILL GET DIFFERENCED PREDICTIONS WHICH ARE FIENDISHLY DIFFICULT TO UNDO.
64 | #### If you set this to levels, then you can do any order of differencing and
65 |         #### ARIMA will give you predictions in the same level as original values.
66 | ################################################################################
67 | pred_type = 'levels'
68 | #########################################################################
69 | ts_train = ts_df[:-self.forecast_period]
70 | ts_test = ts_df[-self.forecast_period:]
71 | if self.verbose == 1:
72 | print('Data Set split into train %s and test %s for Cross Validation Purposes'
73 | % (ts_train.shape, ts_test.shape))
74 | #########################################################################
75 | if ts_train.dtype == 'int64':
76 | ts_train = ts_train.astype(float)
77 | for d_val in range(d_min, self.d_max+1):
78 | print('\nDifferencing = %d' % d_val)
79 | results_bic = pd.DataFrame(
80 | index=['AR{}'.format(i) for i in range(p_min, self.p_max+1)],
81 | columns=['MA{}'.format(i) for i in range(q_min, self.q_max+1)]
82 | )
83 | for p_val, q_val in itertools.product(range(p_min, self.p_max+1), range(q_min, self.q_max+1)):
84 | if p_val == 0 and d_val == 0 and q_val == 0:
85 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = np.nan
86 | continue
87 | else:
88 | try:
89 | model = ARIMA(ts_train, order=(p_val, d_val, q_val))
90 | results = model.fit(transparams=False, method=self.method, solver=solver, disp=False)
91 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = eval('results.' + self.metric)
92 | if iteration % 10 == 0:
93 | print(' Iteration %d completed...' % iteration)
94 | iteration += 1
95 | if iteration >= 100:
96 | print(' Ending Iterations at %d' % iteration)
97 | break
98 | except:
99 | iteration += 1
100 | continue
101 | results_bic = results_bic[results_bic.columns].astype(float)
102 | interim_d = copy.deepcopy(d_val)
103 | interim_p, interim_q, interim_bic = find_lowest_pq(results_bic)
104 | if self.verbose == 1:
105 | _, ax = plt.subplots(figsize=(20, 10))
106 | ax = sns.heatmap(results_bic,
107 | mask=results_bic.isnull(),
108 | ax=ax,
109 | annot=True,
110 | fmt='.0f')
111 | ax.set_title(self.metric)
112 | results_dict[str(interim_p) + ' ' + str(interim_d) + ' ' + str(interim_q)] = interim_bic
113 | best_bic = min(results_dict.items(), key=operator.itemgetter(1))[1]
114 | best_pdq = min(results_dict.items(), key=operator.itemgetter(1))[0]
115 | best_p = int(best_pdq.split(' ')[0])
116 | best_d = int(best_pdq.split(' ')[1])
117 | best_q = int(best_pdq.split(' ')[2])
118 | print('\nBest model is: Non Seasonal ARIMA(%d,%d,%d), %s = %0.3f' % (best_p, best_d, best_q, self.metric, best_bic))
119 | bestmodel = ARIMA(ts_train, order=(best_p, best_d, best_q))
120 | print('#### Fitting best model for full data set now. Will take time... ######')
121 | try:
122 | self.model = bestmodel.fit(transparams=True, method=self.method, solver=solver, disp=False)
123 | except:
124 | self.model = bestmodel.fit(transparams=False, method=self.method, solver=solver, disp=False)
125 | ### this is needed for static forecasts ####################
126 | y_truth = ts_train[:]
127 | y_forecasted = self.model.predict(typ='levels')
128 | concatenated = pd.concat([y_truth, y_forecasted], axis=1, keys=['original', 'predicted'])
129 | if best_d == 0:
130 | #### Do this for ARIMA only ######
131 | ### If there is no differencing DO NOT use predict_type since it will give an error = do not use "linear".
132 | print('Static Forecasts:')
133 | print_static_rmse(concatenated['original'].values, concatenated['predicted'].values, best_d)
134 | start_date = ts_df.index[-self.forecast_period]
135 | end_date = ts_df.index[-1]
136 | pred_dynamic = self.model.predict(start=start_date, end=end_date, dynamic=True)
137 | if self.verbose == 1:
138 | ax = concatenated[['original', 'predicted']][best_d:].plot()
139 | pred_dynamic.plot(label='Dynamic Forecast', ax=ax, figsize=(15, 5))
140 | print('Dynamic %d-period Forecasts:' % (self.forecast_period,))
141 | plt.legend()
142 | plt.show(block=False)
143 | else:
144 | #### Do this for ARIMA only ######
145 | #### If there is differencing, you must use "levels" as the predict type to get original levels as actuals
146 | pred_type = 'levels'
147 | print('Static Forecasts:')
148 | print_static_rmse(y_truth[best_d:], y_forecasted)
149 | ########### Dynamic One Step Ahead Forecast ###########################
150 | ### Dynamic Forecasts are a better representation of true predictive power
151 | ## since they only use information from the time series up to a certain point,
152 | ## and after that, forecasts are generated using values from previous forecasted
153 | ## time points.
154 | #################################################################################
155 |
156 | # TODO: Check if this can be changed to use predict function directly.
157 | start_date = ts_df.index[-self.forecast_period]
158 | end_date = ts_df.index[-1]
159 | pred_dynamic = self.model.predict(typ=pred_type, start=start_date, end=end_date, dynamic=True)
160 | try:
161 | pred_dynamic[pd.to_datetime((pred_dynamic.index-best_d).values[0])] = \
162 | y_truth[pd.to_datetime((pred_dynamic.index-best_d).values[0])]
163 | except:
164 | print('Dynamic predictions erroring but continuing...')
165 | pred_dynamic.sort_index(inplace=True)
166 | print('\nDynamic %d-period Forecasts:' % self.forecast_period)
167 | if self.verbose == 1:
168 | ax = concatenated.plot()
169 | pred_dynamic.plot(label='Dynamic Forecast', ax=ax, figsize=(15, 5))
170 | ax.set_xlabel('Date')
171 | ax.set_ylabel('Values')
172 | plt.legend()
173 | plt.show(block=False)
174 | if self.verbose == 1:
175 | try:
176 | self.model.plot_diagnostics(figsize=(16, 12))
177 | except:
178 | pass
179 | print(self.model.summary())
180 |
181 | res_frame = self.predict(simple=False)
182 |
183 | if self.verbose == 1:
184 | print('Model Forecast(s):\n', res_frame)
185 | rmse, norm_rmse = print_dynamic_rmse(ts_test, pred_dynamic, ts_train)
186 | return self.model, res_frame, rmse, norm_rmse
187 |
188 | def predict(
189 | self,
190 | testdata: Optional[pd.DataFrame]=None,
191 | forecast_period: Optional[int] = None,
192 | simple: bool = True) -> NDFrame:
193 | """
194 | Return the predictions
195 | # TODO: Check if the series can be converted to a dataframe for all models.
196 |         :rtype can be a pandas Series (simple=True), a pandas DataFrame (simple=False), or None
197 | """
198 |
199 | # TODO: Add processing of 'simple' argument and return type
200 |
201 | if testdata is not None:
202 | warnings.warn(
203 |                 "You have passed exogenous variables to make predictions for an ARIMA model. " +
204 |                 "ARIMA models are univariate models, hence these exogenous variables will be ignored for these predictions."
205 | )
206 |
207 | # TODO: Predictions coming from ARIMA include extra information compared to SARIMAX and VAR.
208 | # Need to make it consistent
209 | # Extract the dynamic predicted and true values of our time series
210 | if forecast_period is None:
211 | # use the forecast period used during training
212 | forecast_period = self.forecast_period
213 |
214 | y_forecasted = self.model.forecast(forecast_period)
215 |
216 |
217 | # TODO: Check if the datetime index can be obtained as in the case of SARIMAX.
218 | # Currently it is just a text index, e.g. Forecast_1, ...
219 | if simple:
220 | res_frame = pd.DataFrame([
221 | y_forecasted[0], # Mean Forecast
222 | ],
223 | index=['mean'],
224 | columns=['Forecast_' + str(x) for x in range(1, forecast_period+1)]
225 | ).T
226 | res_frame = res_frame.squeeze() # Convert to a pandas series object
227 | else:
228 | res_frame = pd.DataFrame([
229 | y_forecasted[0], # Mean Forecast
230 | y_forecasted[1], # Std Error
231 | y_forecasted[2], # Lower and Upper CI
232 | ],
233 | index=['mean','mean_se','mean_ci'],
234 | columns=['Forecast_' + str(x) for x in range(1, forecast_period+1)]
235 | ).T
236 |
237 | res_frame['mean_ci_lower'] = res_frame['mean_ci'].map(lambda x: x[0])
238 | res_frame['mean_ci_upper'] = res_frame['mean_ci'].map(lambda x: x[1])
239 | res_frame.drop('mean_ci', axis=1, inplace=True)
240 |
241 | return res_frame
242 |
--------------------------------------------------------------------------------
/auto_ts/models/ar_based/build_arima_base.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional
3 |
4 | warnings.filterwarnings(action='ignore')
5 | from abc import abstractmethod
6 | import copy
7 |
8 | import numpy as np # type: ignore
9 | import pandas as pd # type: ignore
10 | from pandas.core.generic import NDFrame # type:ignore
11 | import dask
12 |
13 | import matplotlib.pyplot as plt # type: ignore
14 |
15 | #from tscv import GapWalkForward # type: ignore
16 | from sklearn.model_selection import TimeSeriesSplit
17 |
18 | # imported SARIMAX from statsmodels pkg
19 | from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore
20 |
21 | from ..build_base import BuildBase
22 |
23 | # helper functions
24 | from ...utils import colorful, print_static_rmse, print_ts_model_stats
25 |
26 |
27 | class BuildArimaBase(BuildBase):
28 | def __init__(self, scoring, seasonality=False, seasonal_period=None, p_max=12,
29 | d_max=2, q_max=12, forecast_period=5, verbose=0):
30 | """
31 | Base class for building any ARIMA model
32 | Definitely applicable to SARIMAX and auto_arima with seasonality
33 | Check later if same can be reused for ARIMA (most likely yes)
34 | """
35 | super().__init__(
36 | scoring=scoring,
37 | forecast_period=forecast_period,
38 | verbose=verbose
39 | )
40 |
41 | self.seasonality = seasonality
42 | self.seasonal_period = seasonal_period
43 | self.p_max = p_max
44 | self.d_max = d_max
45 | self.q_max = q_max
46 |
47 | self.best_p = None
48 | self.best_d = None
49 | self.best_q = None
50 | self.best_P = None
51 | self.best_D = None
52 | self.best_Q = None
53 |
54 |
55 | def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int]=None):
56 | """
57 | Build a Time Series Model using SARIMAX from statsmodels.
58 | """
59 |
60 | self.original_target_col = target_col
61 | self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]]
62 |
63 | if len(self.original_preds) == 0:
64 | self.univariate = True
65 | else:
66 | self.univariate = False
67 |
68 |
69 | ##########################################
70 | #### Find best pdq and PDQ parameters ####
71 | ##########################################
72 |
73 | # NOTE: We use the entire dataset to compute the pdq and PDQ parameters.
74 | # Then we use the selected "best" parameters to check how well it
75 | # generalizes across the various folds (which may even be 1)
76 |
77 | # ## Added temporarily
78 | # ts_train = ts_df.iloc[:-self.forecast_period]
79 | # self.find_best_parameters(data = ts_train)
80 |
81 |         if self.seasonal_period is None or self.seasonal_period <= 1:
82 | self.seasonal_period = 2 ### Sarimax cannot have seasonal period 1 or below.
83 |
84 | if self.verbose >= 1:
85 | print(f"\n\nBest Parameters:")
86 | print(f"p: {self.best_p}, d: {self.best_d}, q: {self.best_q}")
87 | print(f"P: {self.best_P}, D: {self.best_D}, Q: {self.best_Q}")
88 | print(f"Seasonality: {self.seasonality}\nSeasonal Period: {self.seasonal_period}")
89 |
90 | #######################################
91 | #### Cross Validation across Folds ####
92 | #######################################
93 |
94 | rmse_folds = []
95 | norm_rmse_folds = []
96 | forecast_df_folds = []
97 |
98 | ### Creating a new way to skip cross validation when trying to run auto-ts multiple times. ###
99 | if cv == 0:
100 | cv_in = 0
101 | else:
102 | cv_in = copy.deepcopy(cv)
103 | NFOLDS = self.get_num_folds_from_cv(cv)
104 |
105 | #########################################################################
106 | if type(ts_df) == dask.dataframe.core.DataFrame:
107 | num_obs = ts_df.shape[0].compute()
108 | else:
109 | num_obs = ts_df.shape[0]
110 |
111 | if self.forecast_period <= 5:
112 | #### Set a minimum of 5 for the number of rows in test!
113 | self.forecast_period = 5
114 | ### In case the number of forecast_period is too high, just reduce it so it can fit into num_obs
115 | if NFOLDS*self.forecast_period > num_obs:
116 | self.forecast_period = int(num_obs/(NFOLDS+1))
117 | print('Lowering forecast period to %d to enable cross_validation' %self.forecast_period)
118 | #########################################################################
119 | extra_concatenated = pd.DataFrame()
120 | concatenated = pd.DataFrame()
121 | norm_rmse_folds2 = []
122 |
123 | max_trainsize = len(ts_df) - self.forecast_period
124 | try:
125 |             cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period) ### test_size requires scikit-learn >= 0.24
126 | except:
127 | cv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size = max_trainsize)
128 |
129 | if type(ts_df) == dask.dataframe.core.DataFrame:
130 |             ts_df = ts_df.head(len(ts_df)) ### this converts the dask dataframe into a pandas dataframe
131 |
132 | if cv_in == 0:
133 | print('Skipping cross validation steps since cross_validation = %s' %cv_in)
134 | else:
135 | for fold_number, (train_index, test_index) in enumerate(cv.split(ts_df)):
136 | dftx = ts_df.head(len(train_index)+len(test_index))
137 | ts_train = dftx.head(len(train_index)) ## now train will be the first segment of dftx
138 | ts_test = dftx.tail(len(test_index)) ### now test will be right after train in dftx
139 |
140 |
141 | if self.verbose >= 1:
142 | print(f"\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape[0]} Test Shape: {ts_test.shape[0]}")
143 |
144 | ### this is needed for static forecasts ####################
145 |                 # TODO: Check if this needs to be fixed to pick using self.original_target_col
146 | y_truth = ts_train[:] # TODO: Note that this is only univariate analysis
147 |
148 | if len(self.original_preds) == 0:
149 | exog = None
150 | elif len(self.original_preds) == 1:
151 | exog = ts_test[self.original_preds[0]].values.reshape(-1, 1)
152 | else:
153 | exog = ts_test[self.original_preds].values
154 |
155 | auto_arima_model = self.find_best_parameters(data = ts_train)
156 | self.model = auto_arima_model
157 | y_forecasted = self.model.predict(ts_test.shape[0],exog)
158 |
159 | if fold_number == 0:
160 | concatenated = pd.DataFrame(np.c_[ts_test[self.original_target_col].values,
161 | y_forecasted], columns=['original', 'predicted'],index=ts_test.index)
162 | extra_concatenated = copy.deepcopy(concatenated)
163 | else:
164 | concatenated = pd.DataFrame(np.c_[ts_test[self.original_target_col].values,
165 | y_forecasted], columns=['original', 'predicted'],index=ts_test.index)
166 |                     extra_concatenated = pd.concat([extra_concatenated, concatenated])
167 |
168 | ### for SARIMAX and Auto_ARIMA, you don't have to restore differences since it predicts like actuals.###
169 | y_true = concatenated['original']
170 | y_pred = concatenated['predicted']
171 |
172 | if self.verbose >= 1:
173 | print('Static Forecasts:')
174 | # Since you are differencing the data, some original data points will not be available
175 | # Hence taking from first available value.
176 | print_static_rmse(y_true.values, y_pred.values, verbose=self.verbose)
177 | #quick_ts_plot(y_true, y_pred)
178 |
179 | # Extract the dynamic predicted and true values of our time series
180 | forecast_df = copy.deepcopy(y_forecasted)
181 | forecast_df_folds.append(forecast_df)
182 |
183 |
184 | rmse, norm_rmse = print_static_rmse(y_true.values, y_pred.values, verbose=0) ## don't print this time
185 | rmse_folds.append(rmse)
186 | norm_rmse_folds.append(norm_rmse)
187 |
188 | # TODO: Convert rmse_folds, rmse_norm_folds, forecasts_folds into base class attributes
189 |                 # TODO: Add getters and setters for these class attributes.
190 | # This will ensure consistency across various model build types.
191 |
192 |
193 | # This is taking the std of entire dataset and using that to normalize
194 | # vs. other approach that was using std of individual folds to standardize.
195 |         # Technically this is not correct, but in order to do an Apples:Apples comparison with ML
196 |         # (sklearn) based cross_val_score, we need to do this since we don't get individual folds
197 | # back for cross_val_score. If at a later point in time, we can get this, then,
198 | # we can revert back to dividing by individual fold std values.
199 | norm_rmse_folds2 = rmse_folds/ts_df[self.original_target_col].values.std() # Same as what was there in print_dynamic_rmse()
200 |
201 | print(f"\nSARIMAX RMSE (all folds): {np.mean(rmse_folds):.4f}")
202 | print(f"SARIMAX Norm RMSE (all folds): {(np.mean(norm_rmse_folds2)*100):.0f}%\n")
203 | try:
204 | print_ts_model_stats(extra_concatenated['original'],extra_concatenated['predicted'], "auto_SARIMAX")
205 | except:
206 | print('Unable to print model stats. Continuing...')
207 |
208 | ###############################################
209 | #### Refit the model on the entire dataset ####
210 | ###############################################
211 | auto_arima_model = self.find_best_parameters(data = ts_df)
212 | self.model = auto_arima_model
213 | self.refit(ts_df=ts_df)
214 |
215 | print(self.model.summary())
216 |
217 | # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds
218 | return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2
219 |
220 | def refit(self, ts_df: pd.DataFrame) -> object:
221 | """
222 | Refits an already trained model using a new dataset
223 | Useful when fitting to the full data after testing with cross validation
224 | :param ts_df The time series data to be used for fitting the model
225 | :type ts_df pd.DataFrame
226 | :rtype object
227 | """
228 |
229 | bestmodel = self.get_best_model(ts_df)
230 |
231 | print(colorful.BOLD + 'Refitting data with previously found best parameters' + colorful.END)
232 | try:
233 | self.model = bestmodel.fit(disp=False)
234 | print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring)))
235 | except Exception as e:
236 | print(e)
237 |
238 | return self
239 |
240 | @abstractmethod
241 | def find_best_parameters(self, data: pd.DataFrame):
242 | """
243 | Given a dataset, finds the best parameters using the settings in the class
244 | Need to set the following parameters in the child class
245 | self.best_p, self.best_d, self.best_q
246 | self.best_P, self.best_D, self.best_Q
247 | """
248 |
249 |
250 |
251 | def get_best_model(self, data: pd.DataFrame):
252 | """
253 | Returns the 'unfit' SARIMAX model with the given dataset and the
254 | selected best parameters. This can be used to fit or refit the model.
255 | """
256 |
257 | # In order to get forecasts to be in the same value ranges of the orig_endogs, you
258 | # must set the simple_differencing = False and the start_params to be the same as ARIMA.
259 |         # That is the only way to ensure that the output of this model is comparable to other ARIMA models
260 |
261 | if not self.seasonality:
262 | if self.univariate:
263 | bestmodel = SARIMAX(
264 | endog=data[self.original_target_col],
265 | # exog=data[self.original_preds], ###if it is univariate, no preds needed
266 | order=(self.best_p, self.best_d, self.best_q),
267 | enforce_stationarity=False,
268 | enforce_invertibility=False,
269 | trend='ct',
270 | start_params=[0, 0, 0, 1],
271 | simple_differencing=False)
272 | else:
273 | bestmodel = SARIMAX(
274 | endog=data[self.original_target_col],
275 | exog=data[self.original_preds], ## if it is multivariate, preds are needed
276 | order=(self.best_p, self.best_d, self.best_q),
277 | enforce_stationarity=False,
278 | enforce_invertibility=False,
279 | trend='ct',
280 | start_params=[0, 0, 0, 1],
281 | simple_differencing=False)
282 | else:
283 | if self.univariate:
284 | bestmodel = SARIMAX(
285 | endog=data[self.original_target_col],
286 | # exog=data[self.original_preds], ### if univariate, no preds are needed
287 | order=(self.best_p, self.best_d, self.best_q),
288 | seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period),
289 | enforce_stationarity=False,
290 | enforce_invertibility=False,
291 | trend='ct',
292 | start_params=[0, 0, 0, 1],
293 | simple_differencing=False
294 | )
295 | else:
296 | bestmodel = SARIMAX(
297 | endog=data[self.original_target_col],
298 | exog=data[self.original_preds], ### if multivariate, preds are needed
299 | order=(self.best_p, self.best_d, self.best_q),
300 | seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period),
301 | enforce_stationarity=False,
302 | enforce_invertibility=False,
303 | trend='ct',
304 | start_params=[0, 0, 0, 1],
305 | simple_differencing=False
306 | )
307 |
308 | return bestmodel
309 |
310 | def predict(
311 | self,
312 | testdata: Optional[pd.DataFrame]=None,
313 | forecast_period: Optional[int] = None,
314 | simple: bool = True) -> NDFrame:
315 | """
316 | Return the predictions
317 | """
318 | # Extract the dynamic predicted and true values of our time series
319 | if self.univariate:
320 | if isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series):
321 | # use the forecast period used during training
322 | forecast_period = testdata.shape[0]
323 | self.forecast_period = testdata.shape[0]
324 | else:
325 | if testdata is None:
326 | raise ValueError("SARIMAX needs testdata to make predictions, but this was not provided. Please provide to proceed.")
327 | forecast_period = self.forecast_period
328 | elif isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series):
329 | if forecast_period != testdata.shape[0]:
330 | warnings.warn("Forecast Period is not equal to the number of observations in testdata. The forecast period will be assumed to be the number of observations in testdata.")
331 | forecast_period = testdata.shape[0]
332 | self.forecast_period = forecast_period
333 | try:
334 | testdata = testdata[self.original_preds]
335 | except Exception as e:
336 | print(e)
337 | print("Model was trained with train dataframe. Please make sure you are passing a test data frame.")
338 | return
339 | elif isinstance(testdata, int):
340 | if forecast_period != testdata:
341 | print("Forecast Period is not equal to the number of observations in testdata. The forecast period will be assumed to be the number of observations in testdata.")
342 |
343 | forecast_period = testdata
344 | self.forecast_period = forecast_period
345 |
346 | if self.univariate:
347 | res = self.model.get_forecast(self.forecast_period)
348 | else:
349 | if isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series):
350 | res = self.model.get_forecast(self.forecast_period, exog=testdata)
351 | else:
352 | try:
353 | res = self.model.get_forecast(self.forecast_period)
354 | except Exception as e:
355 | print(e)
356 | print("Model was trained with train dataframe. Please make sure you are passing a test data frame.")
357 | return
358 |
359 | res_frame = res.summary_frame()
360 | res_frame.rename(columns = {'mean':'yhat'}, inplace=True)
361 |
362 | if simple:
363 | res_frame = res_frame['yhat']
364 | res_frame = res_frame.squeeze() # Convert to a pandas series object
365 | else:
366 | # Pass as is
367 | pass
368 |
369 | return res_frame
370 |
--------------------------------------------------------------------------------
/auto_ts/models/ar_based/build_autoarimax.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np # type: ignore
4 | import pandas as pd # type: ignore
5 | # TODO: Resolve which one we want to use
6 | # from pmdarima.arima.auto import auto_arima # type: ignore
7 | from pmdarima.arima import auto_arima # type: ignore
8 |
9 | from .build_arima_base import BuildArimaBase
10 | # helper functions
11 | from ...utils import colorful
12 |
13 |
14 | class BuildAutoSarimax(BuildArimaBase):
15 |
16 | def find_best_parameters(self, data: pd.DataFrame):
17 | """
18 | Given a dataset, finds the best parameters using the settings in the class
19 | """
20 |
21 | if self.verbose >= 1:
22 | print(colorful.BOLD + '\n Finding the best parameters using AutoArima:' + colorful.END)
23 | if len(self.original_preds) == 0:
24 | exog = None
25 | elif len(self.original_preds) == 1:
26 | exog = data[self.original_preds[0]].values.reshape(-1, 1)
27 | else:
28 | exog = data[self.original_preds].values
29 |
30 | ### for large datasets, speed is of the essence. Hence reduce max size of PDQ
31 | if self.seasonal_period <= 1:
32 | m_min = 2
33 | else:
34 | m_min = self.seasonal_period
35 | if data.shape[0] > 1000:
36 | print(' Using smaller parameters for larger dataset with greater than 1000 samples')
37 | out_of_sample_size = int(0.01*data.shape[0])
38 | arima_model = auto_arima(
39 | y = data[self.original_target_col],
40 | exogenous=exog, ## these variables must be given in predictions as well
41 | start_p = 0, start_q = 0, start_P = 0, start_Q = 0,
42 | max_p = 2, max_q = 2, max_P = 2, max_Q = 2,
43 | D = 1, max_D = 1,
44 | out_of_sample_size=out_of_sample_size, # use a small amount
45 | information_criterion=self.scoring, # AIC
46 | scoring='mse', # only supports 'mse' or 'mae'
47 | m=m_min, seasonal=self.seasonality,
48 | stepwise = True, random_state=42, n_fits = 10, n_jobs=-1,
49 | error_action = 'ignore')
50 | else:
51 | arima_model = auto_arima(
52 | y = data[self.original_target_col],
53 | exogenous=exog, ## these variables must be given in predictions as well
54 | out_of_sample_size=0, # use whole dataset to compute metrics
55 | information_criterion=self.scoring, # AIC
56 | scoring='mse', # only supports 'mse' or 'mae'
57 | # TODO: Check if we can go higher on max p and q (till seasonality)
58 | start_p=0, d=None, start_q=0, max_p=self.p_max, max_d=self.d_max, max_q=self.q_max, # AR Parameters
59 | start_P=0, D=None, start_Q=0, max_P=self.p_max, max_D=self.d_max, max_Q=self.q_max, # Seasonal Parameters (1)
60 | m=m_min, seasonal=self.seasonality, # Seasonal Parameters (2)
61 |                 stepwise = True, random_state=42, n_fits = 50, n_jobs=-1, # Hyperparameter Search
62 |                 error_action='warn', trace = True, suppress_warnings=True
63 | )
64 |
65 | self.best_p, self.best_d, self.best_q = arima_model.order # example (0, 1, 1)
66 | self.best_P, self.best_D, self.best_Q, _ = arima_model.seasonal_order # example (2, 1, 1, 12)
67 |
68 | metric_value = math.nan
69 |
70 | if self.scoring.lower() == 'aic':
71 | metric_value = arima_model.aic()
72 | elif self.scoring.lower() == 'aicc':
73 | metric_value = arima_model.aicc()
74 | elif self.scoring.lower() == 'bic':
75 | metric_value = arima_model.bic()
76 | else:
77 | print("Error: Metric must be 'aic', 'aicc', or 'bic'. Continuing with 'bic' as default")
78 | metric_value = arima_model.bic()
79 | self.scoring = 'bic'
80 |
81 | if self.verbose >= 1:
82 | print(
83 | '\nBest model is a Seasonal SARIMAX(%d,%d,%d)*(%d,%d,%d,%d), %s = %0.3f' % (
84 | self.best_p, self.best_d, self.best_q,
85 | self.best_P, self.best_D, self.best_Q,
86 | m_min, self.scoring, metric_value)
87 | )
88 | return arima_model
89 |
--------------------------------------------------------------------------------
/auto_ts/models/ar_based/build_sarimax.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt # type: ignore
2 | import numpy as np # type: ignore
3 | import pandas as pd # type: ignore
4 | from pandas.core.generic import NDFrame # type:ignore
5 | # imported SARIMAX from statsmodels pkg
6 | from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore
7 |
8 | from .build_arima_base import BuildArimaBase
9 | from ...models.ar_based.param_finder import find_best_pdq_or_PDQ
10 | # helper functions
11 | from ...utils import colorful
12 |
13 |
14 | # from tscv import GapWalkForward # type: ignore
15 |
16 |
17 | # class BuildSarimax(BuildBase):
18 | class BuildSarimax(BuildArimaBase):
19 | # def __init__(self, scoring, seasonality=False, seasonal_period=None, p_max=12, d_max=2, q_max=12, forecast_period=2, verbose=0):
20 | # """
21 | # Automatically build a SARIMAX Model
22 | # """
23 | # super().__init__(
24 | # scoring=scoring,
25 | # forecast_period=forecast_period,
26 | # verbose=verbose
27 | # )
28 |
29 | # self.seasonality = seasonality
30 | # self.seasonal_period = seasonal_period
31 | # self.p_max = p_max
32 | # self.d_max = d_max
33 | # self.q_max = q_max
34 |
35 | # self.best_p = None
36 | # self.best_d = None
37 | # self.best_q = None
38 | # self.best_P = None
39 | # self.best_D = None
40 | # self.best_Q = None
41 |
42 |
43 | # def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int]=None):
44 | # """
45 | # Build a Time Series Model using SARIMAX from statsmodels.
46 | # """
47 |
48 | # self.original_target_col = target_col
49 | # self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]]
50 |
51 | # if len(self.original_preds) == 0:
52 | # self.univariate = True
53 | # else:
54 | # self.univariate = False
55 |
56 |
57 | # ##########################################
58 | # #### Find best pdq and PDQ parameters ####
59 | # ##########################################
60 |
61 | # # NOTE: We use the entire dataset to compute the pdq and PDQ parameters.
62 | # # Then we use the selected "best" parameters to check how well it
63 | # # generalizes across the various folds (which may even be 1)
64 |
65 | # # ## Added temporarily
66 | # # ts_train = ts_df.iloc[:-self.forecast_period]
67 | # # self.find_best_parameters(data = ts_train)
68 | # self.find_best_parameters(data = ts_df)
69 |
70 | # if self.verbose >= 1:
71 | # print(f"\n\nBest Parameters:")
72 | # print(f"p: {self.best_p}, d: {self.best_d}, q: {self.best_q}")
73 | # print(f"P: {self.best_P}, D: {self.best_D}, Q: {self.best_Q}")
74 | # print(f"Seasonality: {self.seasonality} Seasonal Period: {self.seasonal_period}")
75 |
76 |
77 | # #######################################
78 | # #### Cross Validation across Folds ####
79 | # #######################################
80 |
81 | # rmse_folds = []
82 | # norm_rmse_folds = []
83 | # forecast_df_folds = []
84 |
85 | # NFOLDS = self.get_num_folds_from_cv(cv)
86 | # cv = GapWalkForward(n_splits=NFOLDS, gap_size=0, test_size=self.forecast_period)
87 | # for fold_number, (train, test) in enumerate(cv.split(ts_df)):
88 | # ts_train = ts_df.iloc[train]
89 | # ts_test = ts_df.iloc[test]
90 |
91 | # if self.verbose >= 1:
92 | # print(f"\n\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape} Test Shape: {ts_test.shape}")
93 |
94 |
95 | # #########################################
96 | # #### Define the model with fold data ####
97 | # #########################################
98 |
99 | # bestmodel = self.get_best_model(ts_train)
100 |
101 | # ######################################
102 | # #### Fit the model with fold data ####
103 | # ######################################
104 |
105 | # if self.verbose >= 1:
106 | # print(colorful.BOLD + 'Fitting best SARIMAX model' + colorful.END)
107 |
108 | # try:
109 | # self.model = bestmodel.fit(disp=False)
110 | # if self.verbose >= 1:
111 | # print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring)))
112 | # except Exception as e:
113 | # print(e)
114 | # print('Error: Getting Singular Matrix. Please try using other PDQ parameters or turn off Seasonality')
115 | # return bestmodel, None, np.inf, np.inf
116 |
117 | # if self.verbose >= 1:
118 | # try:
119 | # self.model.plot_diagnostics(figsize=(16, 12))
120 | # except:
121 | # print('Error: SARIMAX plot diagnostic. Continuing...')
122 |
123 | # ### this is needed for static forecasts ####################
124 | # # TODO: Check if this needs to be fixed to pick usimg self.original_target_col
125 | # y_truth = ts_train[:] # TODO: Note that this is only univariate analysis
126 |
127 | # if self.univariate:
128 | # y_forecasted = self.model.predict(dynamic=False)
129 | # else:
130 | # y_forecasted = self.model.predict(dynamic=False, exog=ts_test[self.original_preds])
131 |
132 | # concatenated = pd.concat([y_truth, y_forecasted], axis=1, keys=['original', 'predicted'])
133 |
134 | # ### for SARIMAX, you don't have to restore differences since it predicts like actuals.###
135 | # if self.verbose >= 1:
136 | # print('Static Forecasts:')
137 | # # Since you are differencing the data, some original data points will not be available
138 | # # Hence taking from first available value.
139 | # print_static_rmse(
140 | # concatenated['original'].values[self.best_d:],
141 | # concatenated['predicted'].values[self.best_d:],
142 | # verbose=self.verbose
143 | # )
144 |
145 | # ########### Dynamic One Step Ahead Forecast ###########################
146 | # ### Dynamic Forecats are a better representation of true predictive power
147 | # ## since they only use information from the time series up to a certain point,
148 | # ## and after that, forecasts are generated using values from previous forecasted
149 | # ## time points.
150 | # #################################################################################
151 | # # Now do dynamic forecast plotting for the last X steps of the data set ######
152 |
153 | # if self.verbose >= 1:
154 | # ax = concatenated[['original', 'predicted']][self.best_d:].plot(figsize=(16, 12))
155 | # startdate = ts_df.index[-self.forecast_period-1]
156 | # pred_dynamic = self.model.get_prediction(start=startdate, dynamic=True, full_results=True)
157 | # pred_dynamic_ci = pred_dynamic.conf_int()
158 | # pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax)
159 | # try:
160 | # ax.fill_between(pred_dynamic_ci.index, pred_dynamic_ci.iloc[:, 0],
161 | # pred_dynamic_ci.iloc[:, 1], color='k', alpha=.25)
162 | # ax.fill_betweenx(ax.get_ylim(), startdate, ts_train.index[-1], alpha=.1, zorder=-1)
163 | # except:
164 | # pass
165 | # ax.set_xlabel('Date')
166 | # ax.set_ylabel('Levels')
167 | # plt.legend()
168 | # plt.show(block=False)
169 |
170 | # # Extract the dynamic predicted and true values of our time series
171 | # forecast_df = self.predict(testdata=ts_test[self.original_preds], simple=False)
172 | # forecast_df_folds.append(forecast_df)
173 |
174 | # # Extract Metrics
175 | # if self.verbose >= 1:
176 | # print('Dynamic %d-Period Forecast:' % (self.forecast_period))
177 |
178 | # rmse, norm_rmse = print_dynamic_rmse(ts_test[self.original_target_col], forecast_df['mean'].values, ts_train[self.original_target_col], toprint=self.verbose)
179 | # rmse_folds.append(rmse)
180 | # norm_rmse_folds.append(norm_rmse)
181 |
182 | # # TODO: Convert rmse_folds, rmse_norm_folds, forecasts_folds into base class attributes
183 | # # TODO: Add gettes and seters for these class attributes.
184 | # # This will ensure consistency across various model build types.
185 |
186 |
187 | # # This is taking the std of entire dataset and using that to normalize
188 | # # vs. other approach that was using std of individual folds to stansardize.
189 | # # Technically this is not correct, but in order to do Apples:Aples compatison with ML
190 | # # (sklearn) based cross_val_score, we need to do this since we dont get indicidual folds
191 | # # back for cross_val_score. If at a later point in time, we can get this, then,
192 | # # we can revert back to dividing by individual fold std values.
193 | # norm_rmse_folds2 = rmse_folds/ts_df[self.original_target_col].values.std() # Same as what was there in print_dynamic_rmse()
194 |
195 | # # print(f"SARIMAX Norm RMSE (Original): {norm_rmse_folds}")
196 | # # print(f"SARIMAX Norm RMSE (New): {norm_rmse_folds2}")
197 |
198 | # ###############################################
199 | # #### Refit the model on the entire dataset ####
200 | # ###############################################
201 | # self.refit(ts_df=ts_df)
202 |
203 | # if self.verbose >= 1:
204 | # print(self.model.summary())
205 |
206 | # # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds
207 | # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2
208 |
209 | # def refit(self, ts_df: pd.DataFrame) -> object:
210 | # """
211 | # Refits an already trained model using a new dataset
212 | # Useful when fitting to the full data after testing with cross validation
213 | # :param ts_df The time series data to be used for fitting the model
214 | # :type ts_df pd.DataFrame
215 | # :rtype object
216 | # """
217 |
218 | # bestmodel = self.get_best_model(ts_df)
219 |
220 | # print(colorful.BOLD + 'Refitting data with previously found best parameters' + colorful.END)
221 | # try:
222 | # self.model = bestmodel.fit(disp=False)
223 | # print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring)))
224 | # except Exception as e:
225 | # print(e)
226 |
227 | # return self
228 |
229 |
230 | def find_best_parameters(self, data: pd.DataFrame):
231 | """
232 | Given a dataset, finds the best parameters using the settings in the class
233 | """
234 |
235 | if not self.seasonality:
236 | if self.verbose >= 1:
237 | print('Building a Non Seasonal Model...')
238 | print('\nFinding best Non Seasonal Parameters:')
239 | # TODO: Check if we need to also pass the exogenous variables here and
240 | # change the functionality of find_best_pdq_or_PDQ to incorporate these
241 |             # exogenous variables.
242 | self.best_p, self.best_d, self.best_q, best_bic, _ = find_best_pdq_or_PDQ(
243 | ts_df=data[self.original_target_col],
244 | scoring=self.scoring,
245 | p_max=self.p_max, d_max=self.d_max, q_max=self.q_max,
246 | non_seasonal_pdq=None,
247 | seasonal_period=None,
248 | seasonality=False,
249 | verbose=self.verbose
250 | )
251 |
252 | if self.verbose >= 1:
253 | print('\nBest model is: Non Seasonal SARIMAX(%d,%d,%d), %s = %0.3f' % (
254 | self.best_p, self.best_d, self.best_q, self.scoring, best_bic))
255 | else:
256 | if self.verbose >= 1:
257 | print(colorful.BOLD + 'Building a Seasonal Model...'+colorful.END)
258 | print(colorful.BOLD + '\n Finding best Non-Seasonal pdq Parameters:' + colorful.END)
259 | # TODO: Check if we need to also pass the exogenous variables here and
260 | # change the functionality of find_best_pdq_or_PDQ to incorporate these
261 |             # exogenous variables.
262 | self.best_p, self.best_d, self.best_q, _, _ = find_best_pdq_or_PDQ(
263 | ts_df=data[self.original_target_col],
264 | scoring=self.scoring,
265 | p_max=self.p_max, d_max=self.d_max, q_max=self.q_max,
266 | non_seasonal_pdq=None, # we need to figure this out ...
267 | seasonal_period=None,
268 | seasonality=False, # setting seasonality = False for p, d, q
269 | verbose=self.verbose
270 | )
271 |
272 | if self.verbose >= 1:
273 | print(colorful.BOLD + '\n Finding best Seasonal PDQ Model Parameters:' + colorful.END)
274 | # TODO: Check if we need to also pass the exogenous variables here and
275 | # change the functionality of find_best_pdq_or_PDQ to incorporate these
276 |             # exogenous variables.
277 | self.best_P, self.best_D, self.best_Q, best_bic, self.seasonality = find_best_pdq_or_PDQ(
278 | ts_df=data[self.original_target_col],
279 | scoring=self.scoring,
280 | p_max=self.p_max, d_max=self.d_max, q_max=self.q_max,
281 | non_seasonal_pdq=(self.best_p, self.best_d, self.best_q), # found previously ...
282 | seasonal_period=self.seasonal_period, # passing seasonal period
283 | seasonality=True, # setting seasonality = True for P, D, Q
284 | verbose=self.verbose
285 | )
286 |
287 | if self.seasonality:
288 | if self.verbose >= 1:
289 | print('\nBest model is a Seasonal SARIMAX(%d,%d,%d)*(%d,%d,%d,%d), %s = %0.3f' % (
290 | self.best_p, self.best_d, self.best_q,
291 | self.best_P, self.best_D, self.best_Q,
292 | self.seasonal_period, self.scoring, best_bic))
293 | else:
294 | if self.verbose >= 1:
295 | print('\nEven though seasonality has been set to True, the best model is a Non Seasonal SARIMAX(%d,%d,%d)' % (
296 | self.best_p, self.best_d, self.best_q))
297 |
298 |
299 |
300 |
301 |
302 | # def get_best_model(self, data: pd.DataFrame):
303 | # """
304 | # Returns the 'unfit' SARIMAX model with the given dataset and the
305 | # selected best parameters. This can be used to fit or refit the model.
306 | # """
307 |
308 | # # In order to get forecasts to be in the same value ranges of the orig_endogs, you
309 | # # must set the simple_differencing = False and the start_params to be the same as ARIMA.
310 | # # That is the only way to ensure that the output of this model iscomparable to other ARIMA models
311 |
312 | # if not self.seasonality:
313 | # if self.univariate:
314 | # bestmodel = SARIMAX(
315 | # endog=data[self.original_target_col],
316 | # # exog=data[self.original_preds],
317 | # order=(self.best_p, self.best_d, self.best_q),
318 | # enforce_stationarity=False,
319 | # enforce_invertibility=False,
320 | # trend='ct',
321 | # start_params=[0, 0, 0, 1],
322 | # simple_differencing=False)
323 | # else:
324 | # bestmodel = SARIMAX(
325 | # endog=data[self.original_target_col],
326 | # exog=data[self.original_preds],
327 | # order=(self.best_p, self.best_d, self.best_q),
328 | # enforce_stationarity=False,
329 | # enforce_invertibility=False,
330 | # trend='ct',
331 | # start_params=[0, 0, 0, 1],
332 | # simple_differencing=False)
333 | # else:
334 | # if self.univariate:
335 | # bestmodel = SARIMAX(
336 | # endog=data[self.original_target_col],
337 | # # exog=data[self.original_preds],
338 | # order=(self.best_p, self.best_d, self.best_q),
339 | # seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period),
340 | # enforce_stationarity=False,
341 | # enforce_invertibility=False,
342 | # trend='ct',
343 | # start_params=[0, 0, 0, 1],
344 | # simple_differencing=False
345 | # )
346 | # else:
347 | # bestmodel = SARIMAX(
348 | # endog=data[self.original_target_col],
349 | # exog=data[self.original_preds],
350 | # order=(self.best_p, self.best_d, self.best_q),
351 | # seasonal_order=(self.best_P, self.best_D, self.best_Q, self.seasonal_period),
352 | # enforce_stationarity=False,
353 | # enforce_invertibility=False,
354 | # trend='ct',
355 | # start_params=[0, 0, 0, 1],
356 | # simple_differencing=False
357 | # )
358 |
359 | # return bestmodel
360 |
361 | # def predict(
362 | # self,
363 | # testdata: Optional[pd.DataFrame]=None,
364 | # forecast_period: Optional[int] = None,
365 | # simple: bool = True) -> NDFrame:
366 | # """
367 | # Return the predictions
368 | # """
369 | # # Extract the dynamic predicted and true values of our time series
370 |
371 | # if self.univariate:
372 | # if forecast_period is None:
373 | # # use the forecast period used during training
374 | # forecast_period = self.forecast_period
375 | # else:
376 | # if testdata is None:
377 | # raise ValueError("SARIMAX needs testdata to make predictions, but this was not provided. Please provide to proceed.")
378 |
379 | # if forecast_period != testdata.shape[0]:
380 | # warnings.warn("Forecast Period is not equal to the number of observations in testdata. The forecast period will be assumed to be the number of observations in testdata.")
381 |
382 | # forecast_period = testdata.shape[0]
383 |
384 | # try:
385 | # testdata = testdata[self.original_preds]
386 | # except Exception as e:
387 | # print(e)
388 | # raise ValueError("Some exogenous columns that were used during training are missing in testdata. Please make sure you are passing the correct exogenous columns.")
389 |
390 | # if self.univariate:
391 | # res = self.model.get_forecast(forecast_period)
392 | # else:
393 | # res = self.model.get_forecast(forecast_period, exog=testdata)
394 |
395 | # res_frame = res.summary_frame()
396 |
397 | # if simple:
398 | # res_frame = res_frame['mean']
399 | # res_frame = res_frame.squeeze() # Convert to a pandas series object
400 | # else:
401 | # # Pass as is
402 | # pass
403 |
404 | # return res_frame
405 |
--------------------------------------------------------------------------------
/auto_ts/models/ar_based/build_var.py:
--------------------------------------------------------------------------------
1 | """Module to build a VAR model
2 | """
3 | import copy
4 | import itertools
5 | import operator
6 | import warnings
7 | from typing import Optional
8 |
9 | import dask
10 | import matplotlib.pyplot as plt # type: ignore
11 | import numpy as np # type: ignore
12 | import pandas as pd # type: ignore
13 | import seaborn as sns # type: ignore
14 | from pandas.core.generic import NDFrame # type:ignore
15 |
16 | sns.set(style="white", color_codes=True)
17 |
18 | from statsmodels.tsa.statespace.varmax import VARMAX # type: ignore
19 |
20 | #from tscv import GapWalkForward # type: ignore
21 | from sklearn.model_selection import TimeSeriesSplit
22 |
23 | # helper functions
24 | from ...utils import print_dynamic_rmse
25 | from ...models.ar_based.param_finder import find_lowest_pq
26 | from ..build_base import BuildBase
27 |
28 |
29 | class BuildVAR(BuildBase):
30 | """Class to build a VAR model
31 | """
32 | def __init__(self, scoring, forecast_period=2, p_max=3, q_max=3, verbose=0):
33 | """
34 | Automatically build a VAR Model
35 |
36 |         Since it automatically builds a VAR model, you need to give it a criterion (scoring) to optimize
37 | on. You can give it any of the following metrics as scoring options:
38 | AIC, BIC, Deviance, Log-likelihood.
39 | You can give the highest order values for p and q. Default is set to 3 for both.
40 | """
41 | super().__init__(
42 | scoring=scoring,
43 | forecast_period=forecast_period,
44 | verbose=verbose
45 | )
46 | self.p_max = p_max
47 | self.q_max = q_max
48 | self.best_p = None
49 | self.best_d = None
50 | self.best_q = None
51 |
52 | # def fit(self, ts_df):
53 | def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int] = None) -> object:
54 | """
55 | This builds a VAR model given a multivariate time series data frame with time as the Index.
56 |
57 | :param ts_df The time series data to be used for fitting the model. Note that the input can be
58 | a data frame with one column or multiple cols or a multivariate array. However, the first column
59 | must be the target variable. You must include only Time Series data in it. DO NOT include
60 | "Non-Stationary" or "Trendy" data. Make sure your Time Series is "Stationary" before you send
61 | it in!! If not, this will give spurious results.
62 | :type ts_df pd.DataFrame
63 |
64 | :param target_col The column name of the target time series that needs to be modeled.
65 | All other columns will be considered as exogenous variables (if applicable to method)
66 | :type target_col str
67 |
68 | :param cv: Number of folds to use for cross validation.
69 | Number of observations in the Validation set for each fold = forecast period
70 | If None, a single fold is used
71 | :type cv Optional[int]
72 |
73 | :rtype object
74 | """
75 | self.original_target_col = target_col
76 | self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]]
77 |
78 | ts_df = ts_df[[self.original_target_col] + self.original_preds]
79 |
80 | #######################################
81 | #### Cross Validation across Folds ####
82 | #######################################
83 |
84 | rmse_folds = []
85 | norm_rmse_folds = []
86 | forecast_df_folds = []
87 | norm_rmse_folds2 = []
88 |
89 | ### Creating a new way to skip cross validation when trying to run auto-ts multiple times. ###
90 | if not cv:
91 | cv_in = 0
92 | else:
93 | cv_in = copy.deepcopy(cv)
94 | NFOLDS = self.get_num_folds_from_cv(cv)
95 | #cv = GapWalkForward(n_splits=NFOLDS, gap_size=0, test_size=self.forecast_period)
96 | #cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period) ### sklearn version 0.0.24
97 | max_trainsize = len(ts_df) - self.forecast_period
98 | try:
99 |             cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period) ### test_size requires scikit-learn >= 0.24
100 | except:
101 | cv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size = max_trainsize)
102 |
103 | if type(ts_df) == dask.dataframe.core.DataFrame:
104 |             ts_df = ts_df.head(len(ts_df)) ### this converts the dask dataframe into a pandas dataframe
105 |
106 | if cv_in == 0:
107 | print('Skipping cross validation steps since cross_validation = %s' %cv_in)
108 | self.find_best_parameters(data = ts_df)
109 | y_train = ts_df.iloc[:, [0, self.best_d]]
110 | bestmodel = self.get_best_model(y_train)
111 | self.model = bestmodel.fit(disp=False)
112 | else:
113 | for fold_number, (train_index, test_index) in enumerate(cv.split(ts_df)):
114 | dftx = ts_df.head(len(train_index)+len(test_index))
115 | ts_train = dftx.head(len(train_index)) ## now train will be the first segment of dftx
116 | ts_test = dftx.tail(len(test_index)) ### now test will be right after train in dftx
117 |
118 | print(f"\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape[0]} Test Shape: {ts_test.shape[0]}")
119 | self.find_best_parameters(data = ts_train)
120 |
121 | #########################################
122 | #### Define the model with fold data ####
123 | #########################################
124 | y_train = ts_train.iloc[:, [0, self.best_d]]
125 | bestmodel = self.get_best_model(y_train)
126 |
127 | ######################################
128 | #### Fit the model with fold data ####
129 | ######################################
130 |
131 | if self.verbose >= 1:
132 | print(f'Fitting best VAR model on Fold: {fold_number+1}')
133 | try:
134 | self.model = bestmodel.fit(disp=False)
135 | except Exception as e:
136 | print(e)
137 | print(f'Error: VAR Fit on Fold: {fold_number+1} unsuccessful.')
138 | return bestmodel, None, np.inf, np.inf
139 |
140 | forecast_df = self.predict(ts_test.shape[0],simple=False)
141 | forecast_df_folds.append(forecast_df['yhat'].values)
142 |
143 | rmse, norm_rmse = print_dynamic_rmse(ts_test.iloc[:, 0].values, forecast_df['yhat'].values,
144 | ts_train.iloc[:, 0].values)
145 | rmse_folds.append(rmse)
146 | norm_rmse_folds.append(norm_rmse)
147 |
148 | norm_rmse_folds2 = rmse_folds/ts_df[self.original_target_col].values.std() # Same as what was there in print_dynamic_rmse()
149 | self.model.plot_diagnostics(figsize=(16, 12))
150 | axis = self.model.impulse_responses(12, orthogonalized=True).plot(figsize=(12, 4))
151 | axis.set(xlabel='Time Steps', title='VAR model Impulse Response Functions')
152 |
153 | ###############################################
154 | #### Refit the model on the entire dataset ####
155 | ###############################################
156 | y_train = ts_df.iloc[:, [0, self.best_d]]
157 | self.refit(ts_df=y_train)
158 |
159 | # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds
160 | return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2
161 |
162 | def predict(
163 | self,
164 | testdata: Optional[pd.DataFrame] = None,
165 | forecast_period: Optional[int] = None,
166 | simple: bool = True
167 | ) -> NDFrame:
168 | """
169 | Return the predictions
170 | """
171 |
172 | if testdata is not None:
173 | warnings.warn(
174 | "You have passed exogenous variables to make predictions for a VAR model. " +
175 | "VAR model will predict all exogenous variables automatically, " +
176 | "hence your passed values will not be used."
177 | )
178 | if isinstance(testdata, pd.DataFrame) or isinstance(testdata, pd.Series):
179 | if len(testdata) != self.forecast_period:
180 | self.forecast_period = testdata.shape[0]
181 | elif isinstance(testdata, int):
182 | self.forecast_period = testdata
183 |
184 | forecast_period = self.forecast_period
185 |
186 | # Extract the dynamic predicted and true values of our time series
187 | if forecast_period is None:
188 | # use the forecast period used during training
189 | forecast_period = self.forecast_period
190 |
191 | # y_forecasted = self.model.forecast(forecast_period)
192 |
193 | res = self.model.get_forecast(forecast_period)
194 | res_frame = res.summary_frame()
195 |
196 | res_frame.rename(columns={'mean':'yhat'},inplace=True)
197 |
198 | if simple:
199 | res_frame = res_frame['yhat']
200 | res_frame = res_frame.squeeze() # Convert to a pandas series object
201 | else:
202 | # Pass as is
203 | pass
204 |
205 | return res_frame
206 |
207 |
208 | def find_best_parameters(self, data: pd.DataFrame):
209 | """
210 | Given a dataset, finds the best parameters using the settings in the class
211 | """
212 | #### dmax here means the column number of the data frame: it serves as a placeholder for columns
213 | dmax = data.shape[1]
214 | ###############################################################################################
215 | cols = data.columns.tolist()
216 | # TODO: #14 Make sure that we have a way to not rely on column order to determine the target
217 | # It is assumed that the first column of the dataframe is the target variable ####
218 | ### make sure that is the case before doing this program ####################
219 | i = 1
220 | results_dict = {}
221 |
222 | for d_val in range(1, dmax):
223 | # Takes the target column and one other endogenous column at a time
224 | # and fits a VARMAX model on that pair. Then selects the best
225 | # additional column at the end.
226 | y_train = data.iloc[:, [0, d_val]]
227 | print('\nAdditional Variable in VAR model = %s' % cols[d_val])
228 | info_criteria = pd.DataFrame(
229 | index=['AR{}'.format(i) for i in range(0, self.p_max+1)],
230 | columns=['MA{}'.format(i) for i in range(0, self.q_max+1)]
231 | )
232 | for p_val, q_val in itertools.product(range(0, self.p_max+1), range(0, self.q_max+1)):
233 | if p_val == 0 and q_val == 0:
234 | info_criteria.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = np.nan
235 | print(' Iteration %d completed' % i)
236 | i += 1
237 | else:
238 | try:
239 | model = VARMAX(y_train, order=(p_val, q_val), trend='c')
240 | model = model.fit(maxiter=1000, disp=False)
241 | info_criteria.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = eval('model.' + self.scoring)
242 | print(' Iteration %d completed' % i)
243 | i += 1
244 | except Exception:
245 | i += 1
246 | print(' Iteration %d completed' % i)
247 | info_criteria = info_criteria[info_criteria.columns].astype(float)
248 | interim_d = copy.deepcopy(d_val)
249 | interim_p, interim_q, interim_bic = find_lowest_pq(info_criteria)
250 | if self.verbose == 1:
251 | _, axis = plt.subplots(figsize=(20, 10))
252 | axis = sns.heatmap(
253 | info_criteria,
254 | mask=info_criteria.isnull(),
255 | ax=axis,
256 | annot=True,
257 | fmt='.0f'
258 | )
259 | axis.set_title(self.scoring)
260 | results_dict[str(interim_p) + ' ' + str(interim_d) + ' ' + str(interim_q)] = interim_bic
261 | best_bic = min(results_dict.items(), key=operator.itemgetter(1))[1]
262 | best_pdq = min(results_dict.items(), key=operator.itemgetter(1))[0]
263 | self.best_p = int(best_pdq.split(' ')[0])
264 | self.best_d = int(best_pdq.split(' ')[1])
265 | self.best_q = int(best_pdq.split(' ')[2])
266 |
267 | print('Best variable selected for VAR: %s' % data.columns.tolist()[self.best_d])
268 |
269 | def refit(self, ts_df: pd.DataFrame) -> object:
270 | """
271 | Refits an already trained model using a new dataset
272 | Useful when fitting to the full data after testing with cross validation
273 | :param ts_df The time series data to be used for fitting the model
274 | :type ts_df pd.DataFrame
275 | :rtype object
276 | """
277 | bestmodel = self.get_best_model(ts_df)
278 | print('Refitting data with previously found best parameters')
279 | try:
280 | self.model = bestmodel.fit(disp=False)
281 | print(' Best %s metric = %0.1f' % (self.scoring, eval('self.model.' + self.scoring)))
282 | except Exception as exception:
283 | print(exception)
284 |
285 | return self
286 |
287 |
288 | def get_best_model(self, data: pd.DataFrame):
289 | """
290 | Returns the 'unfit' VARMAX model built with the given dataset and the
291 | previously selected best parameters. This can be used to fit or refit the model.
292 | """
293 | bestmodel = VARMAX(data, order=(self.best_p, self.best_q), trend='c')
294 | return bestmodel
295 |
--------------------------------------------------------------------------------
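
A note on the predict() path above: statsmodels' get_forecast() returns a results object whose summary_frame() carries the point forecast and confidence bounds, and the 'mean' column is renamed to 'yhat'. A minimal sketch of that flow on synthetic data (column names, orders and the frequency here are illustrative only, not the package's defaults):

import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.varmax import VARMAX

# Synthetic two-variable system; 'sales' plays the role of the target (first) column.
rng = np.random.default_rng(0)
idx = pd.date_range('2018-01-01', periods=60, freq='MS')
df = pd.DataFrame({
    'sales': 100 + rng.normal(0, 5, 60).cumsum(),
    'marketing': 50 + rng.normal(0, 2, 60).cumsum(),
}, index=idx)

model = VARMAX(df, order=(1, 0), trend='c').fit(maxiter=200, disp=False)
res_frame = model.get_forecast(8).summary_frame()    # columns: mean, mean_se, mean_ci_lower, mean_ci_upper
res_frame = res_frame.rename(columns={'mean': 'yhat'})
print(res_frame['yhat'])                             # the simple=True path returns just this series
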
/auto_ts/models/ar_based/param_finder.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import itertools
3 | import operator
4 |
5 | import matplotlib.pyplot as plt # type: ignore
6 | import numpy as np # type: ignore
7 | import pandas as pd # type: ignore
8 | import seaborn as sns # type: ignore
9 |
10 | # This gives an error when running from a python script.
11 | # Maybe, this should be set in the jupyter notebook directly.
12 | # get_ipython().magic('matplotlib inline')
13 | sns.set(style="white", color_codes=True)
14 | # imported SARIMAX from statsmodels pkg for find_best_pdq_or_PDQ
15 | from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore
16 |
17 |
18 | def find_lowest_pq(df):
19 | """
20 | This is an auto-ARIMA function that iterates through parameters pdq and finds the best
21 | based on aan eval metric sent in as input.
22 |
23 | This finds the row and column numbers of the lowest or highest value in a dataframe. All it needs is numeric values.
24 | It will return the row and column together as a string, you will have to split it into two.
25 | It will also return the lowest value in the dataframe by default but you can change it to "max".
26 | """
27 | dicti = {}
28 | for ma in list(df):
29 | try:
30 | dicti[ma + ' ' + df[ma].idxmin()] = df[ma].min()
31 | except:
32 | pass
33 | lowest_bic = min(dicti.items(), key=operator.itemgetter(1))[1]
34 | lowest_pq = min(dicti.items(), key=operator.itemgetter(1))[0]
35 | ma_q = int(lowest_pq.split(' ')[0][2:])
36 | ar_p = int(lowest_pq.split(' ')[1][2:])
37 | print(' Best AR order p = %d, MA order q = %d, Interim metric = %0.3f' % (ar_p, ma_q, lowest_bic))
38 | return ar_p, ma_q, lowest_bic
39 |
40 |
41 | def find_best_pdq_or_PDQ(ts_df, scoring, p_max, d_max, q_max, non_seasonal_pdq,
42 | seasonal_period, seasonality=False, verbose=0):
43 | p_min = 0
44 | d_min = 0
45 | q_min = 0
46 | if seasonality:
47 | ns_p = non_seasonal_pdq[0]
48 | ns_d = non_seasonal_pdq[1]
49 | ns_q = non_seasonal_pdq[2]
50 | # Initialize a DataFrame to store the results
51 | iteration = 0
52 | results_dict = {}
53 | seasonality_dict = {}
54 | for d_val in range(d_min, d_max+1):
55 | print(f"\nDifferencing = {d_val} with Seasonality = {seasonality}")
56 | results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max+1)],
57 | columns=['MA{}'.format(i) for i in range(q_min, q_max+1)])
58 | for p_val, q_val in itertools.product(range(p_min,p_max+1), range(q_min, q_max+1)):
59 | if p_val == 0 and d_val == 0 and q_val == 0:
60 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = np.nan
61 | continue
62 | try:
63 | if seasonality:
64 | # In order to get forecasts to be in the same value ranges of the
65 | # orig_endogs, you must set the simple_differencing = False and
66 | # the start_params to be the same as ARIMA.
67 | # That is the only way to ensure that the output of this
68 | # model is comparable to other ARIMA models
69 |
70 | model = SARIMAX(
71 | ts_df,
72 | order=(ns_p, ns_d, ns_q),
73 | seasonal_order=(p_val, d_val, q_val, seasonal_period),
74 | enforce_stationarity=False,
75 | enforce_invertibility=False,
76 | trend='ct',
77 | start_params=[0, 0, 0, 1],
78 | simple_differencing=False
79 | )
80 | else:
81 | model = SARIMAX(
82 | ts_df,
83 | order=(p_val, d_val, q_val),
84 | enforce_stationarity=False,
85 | enforce_invertibility=False,
86 | trend='ct',
87 | start_params=[0, 0, 0, 1],
88 | simple_differencing=False
89 | )
90 |
91 | results = model.fit(disp=False)
92 |
93 | results_bic.loc['AR{}'.format(p_val), 'MA{}'.format(q_val)] = getattr(results, scoring)
94 | iteration += 1
95 | if iteration % 10 == 0:
96 | print(' Iteration %d completed...' % iteration)
97 | if iteration >= 100:
98 | print(' Ending Iterations at %d' % iteration)
99 | break
100 | except:
101 | iteration += 1
102 | continue
103 | results_bic = results_bic[results_bic.columns].astype(float)
104 |
105 | # # TODO: Print if needed
106 | # print("Inside find_best_pdq_or_PDQ --> results_bic")
107 | # print(results_bic)
108 |
109 | interim_d = d_val
110 | if results_bic.isnull().all().all():
111 | print(' D = %d results in an empty ARMA set. Setting Seasonality to False since model might overfit' %d_val)
112 | #### Set Seasonality to False if this empty condition happens repeatedly ####
113 | seasonality_dict[d_val] = False
114 | # TODO: This should not be set to False for all future d values, but without this ARIMA is giving large errors (overfitting)
115 | seasonality = False
116 | continue
117 | else:
118 | seasonality_dict[d_val] = True
119 | # Re-enable seasonality for subsequent d values since this d produced a non-empty seasonal grid
120 | seasonality = True
121 | interim_p, interim_q, interim_bic = find_lowest_pq(results_bic)
122 | if verbose == 1:
123 | _, ax = plt.subplots(figsize=(20, 10))
124 | ax = sns.heatmap(results_bic, mask=results_bic.isnull(), ax=ax, annot=True, fmt='.0f')
125 | ax.set_title(scoring)
126 | results_dict[str(interim_p)+' '+str(interim_d)+' '+str(interim_q)] = interim_bic
127 | try:
128 | best_bic = min(results_dict.items(), key=operator.itemgetter(1))[1]
129 | best_pdq = min(results_dict.items(), key=operator.itemgetter(1))[0]
130 | best_p = int(best_pdq.split(' ')[0])
131 | best_d = int(best_pdq.split(' ')[1])
132 | best_q = int(best_pdq.split(' ')[2])
133 | except:
134 | best_p = copy.deepcopy(p_val)
135 | best_q = copy.deepcopy(q_val)
136 | best_d = copy.deepcopy(d_val)
137 | best_bic = 0
138 |
139 | # # TODO: Print if needed
140 | # print(f"Seasonal Dictionary: {seasonality_dict}")
141 |
142 | # return best_p, best_d, best_q, best_bic, seasonality
143 | return best_p, best_d, best_q, best_bic, seasonality_dict.get(best_d)
144 |
--------------------------------------------------------------------------------
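
find_lowest_pq() expects the kind of grid that find_best_pdq_or_PDQ() builds: rows labelled 'AR<p>', columns labelled 'MA<q>', and cells holding an information-criterion value. A small hand-built example (the import path follows the repo layout shown above; the metric values are made up):

import numpy as np
import pandas as pd
from auto_ts.models.ar_based.param_finder import find_lowest_pq

ic_grid = pd.DataFrame(
    [[np.nan, 210.3, 214.8],     # AR0 row; the (0, 0) cell is left NaN, as in the search loops
     [205.1, 202.7, 209.9],      # AR1 row
     [207.4, 206.2, 211.0]],     # AR2 row
    index=['AR0', 'AR1', 'AR2'],
    columns=['MA0', 'MA1', 'MA2'],
)
p, q, best_ic = find_lowest_pq(ic_grid)   # expected: p = 1, q = 1, best_ic = 202.7
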
/auto_ts/models/build_base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Optional, List
3 |
4 | import pandas as pd # type: ignore
5 | from pandas.core.generic import NDFrame # type:ignore
6 |
7 |
8 | class BuildBase(ABC):
9 | """
10 | Base Class for Building a model
11 | """
12 |
13 | def __init__(self, scoring: str, forecast_period: int, verbose: int,
14 | **kwargs
15 | ):
16 | self.scoring = scoring
17 | self.forecast_period = forecast_period
18 | self.verbose = verbose
19 | self.kwargs = kwargs
20 | self.model = None
21 | self.original_target_col: str = ""
22 | self.original_preds: List[str] = []
23 | self.strf_time_format: str = ""
24 | self.num_boost_rounds: int = 250
25 |
26 |
27 | @abstractmethod
28 | def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int] = None) -> object:
29 | """
30 | Fits the model to the data
31 |
32 | :param ts_df The time series data to be used for fitting the model
33 | :type ts_df pd.DataFrame
34 |
35 | :param target_col The column name of the target time series that needs to be modeled.
36 | All other columns will be considered as exogenous variables (if applicable to method)
37 | :type target_col str
38 |
39 | :param cv: Number of folds to use for cross validation.
40 | Number of observations in the Validation set for each fold = forecast period
41 | If None, a single fold is used
42 | :type cv Optional[int]
43 |
44 | :rtype object
45 | """
46 |
47 |
48 | @abstractmethod
49 | def refit(self, ts_df: pd.DataFrame) -> object:
50 | """
51 | Refits an already trained model using a new dataset
52 | Useful when fitting to the full data after testing with cross validation
53 | :param ts_df The time series data to be used for fitting the model
54 | :type ts_df pd.DataFrame
55 | :rtype object
56 | """
57 |
58 | @abstractmethod
59 | def predict(
60 | self,
61 | testdata: Optional[pd.DataFrame]=None,
62 | forecast_period: Optional[int] = None,
63 | simple: bool = True) -> NDFrame:
64 | """
65 | Return the predictions
66 | :param testdata The test dataframe containing the exogenous variables to be used for prediction.
67 | :type testdata Optional[pd.DataFrame]
68 | :param forecast_period The number of periods to make a prediction for.
69 | :type forecast_period Optional[int]
70 | :param simple If True, this method just returns the predictions.
71 | If False, it will return the standard error, lower and upper confidence interval (if available)
72 | :type simple bool
73 | :rtype NDFrame
74 | """
75 |
76 | def check_model_built(self):
77 | if self.model is None:
78 | raise AttributeError(
79 | "You are trying to perform an operation that requires the model to have been fit."+
80 | "However the model has not been fit yet. Please fit the model once before you try this operation."
81 | )
82 |
83 | def get_num_folds_from_cv(self, cv):
84 | if cv is None:
85 | NFOLDS = 2
86 | elif cv == 0:
87 | NFOLDS = 2
88 | else:
89 | NFOLDS = cv
90 |
91 | return NFOLDS
92 |
--------------------------------------------------------------------------------
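
Concrete builders (BuildVAR, BuildArima, and so on) fill in the three abstract methods above. A toy subclass, purely illustrative and not part of the package, that satisfies the contract by forecasting the last observed value of the target:

from typing import Optional

import pandas as pd

from auto_ts.models.build_base import BuildBase   # per the repo layout shown above


class BuildNaive(BuildBase):
    """Hypothetical example: forecasts the last observed value of the target column."""

    def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int] = None) -> object:
        self.original_target_col = target_col
        self.model = ts_df[target_col]        # the "model" is just the training history
        return self

    def refit(self, ts_df: pd.DataFrame) -> object:
        return self.fit(ts_df, self.original_target_col)

    def predict(self, testdata=None, forecast_period: Optional[int] = None, simple: bool = True):
        self.check_model_built()              # raises if fit() has not been called
        n = forecast_period if forecast_period is not None else self.forecast_period
        return pd.Series([self.model.iloc[-1]] * n, name='yhat')


naive = BuildNaive(scoring='rmse', forecast_period=4, verbose=0)
history = pd.DataFrame({'y': [10.0, 11.0, 12.0]})
print(naive.fit(history, 'y').predict())      # four repeats of 12.0
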
/auto_ts/models/build_pyflux.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import operator
3 |
4 | import numpy as np # type: ignore
5 | import pandas as pd # type: ignore
6 |
7 | # helper functions
8 | from ..utils import print_static_rmse, print_dynamic_rmse
9 |
10 |
11 | #########################################################
12 | def build_pyflux_model(df, target, ar=3, ma=3,integ=1, forecast_period=2,
13 | fitmethod='MLE', nsims=100, score_type='rmse', verbose=0):
14 | """
15 | Build a quick pyflux model with default parameters for AR, MA and I terms in ARIMA.
16 | You can build a rolling forecast using the rolling_forecast parameter.
17 | PyFlux is a complicated library with very sparse documentation, so much of this
18 | was worked out by digging deep into its API.
19 | """
20 | try:
21 | # imported pyflux pkg
22 | import pyflux as pf # type: ignore
23 | except:
24 | print('Pyflux is not installed - hence not running PyFlux model')
25 | return 'error','error','error','error'
26 | ts_df = df[:]
27 | ##############################################################################
28 | ts_train = ts_df[:-forecast_period]
29 | ts_test = ts_df[-forecast_period:]
30 | if verbose == 1:
31 | print('Data Set split into train %s and test %s for Cross Validation Purposes'
32 | % (ts_train.shape, ts_test.shape))
33 | #####################################################################################################
34 | if integ > 1:
35 | print(' Setting "integration"=1 since differenced predictions > 1 are difficult to interpret')
36 | integ = 1
37 | if fitmethod == 'M-H':
38 | print(' Assuming number of simulations = %d' % nsims)
39 | ####################################################################################################
40 | ###### define p,d,q parameters here ####################
41 | p = range(0, ar+1)
42 | q = range(0, ma+1)
43 | d = range(0, integ+1) ### don't do more than 1 differencing in PyFlux models since it's hard to undo
44 | #### Generate all different combinations of p,d,q triplets ######
45 | pdq = list(itertools.product(p, d, q))
46 | eval_metrics = {}
47 | print('Cycling through various (p,d,q) parameters')
48 | for param in pdq:
49 | if verbose == 1:
50 | print('.', end="")
51 | model = pf.ARIMA(data=ts_train, ar=param[0], integ=param[1], ma=param[2], target=target)
52 | try:
53 | if fitmethod == 'MLE':
54 | x = model.fit()
55 | elif fitmethod == 'M-H':
56 | x = model.fit('M-H', nsims=nsims)
57 | except:
58 | x = model.fit('MLE')
59 | mu, actuals = model._model(model.latent_variables.get_z_values())
60 | predicted = model.link(mu)
61 | rmse, norm_rmse = print_static_rmse(actuals,predicted)
62 | if score_type == 'rmse':
63 | eval_metrics[param] = rmse
64 | else:
65 | eval_metrics[param] = norm_rmse
66 | bestpdq = min(eval_metrics.items(), key=operator.itemgetter(1))[0]
67 | print('\nBest Params Selected (based on %s): %s' % (score_type, bestpdq))
68 | bestmodel = pf.ARIMA(data=ts_train, ar=bestpdq[0], integ=bestpdq[1], ma=bestpdq[2], target=target)
69 | x = bestmodel.fit()
70 | if verbose == 1:
71 | bestmodel.plot_fit(figsize=(15, 5))
72 | #model.plot_predict_is(h=forecast_period,fit_once=False,fit_method=fitmethod)
73 | if verbose == 1:
74 | x.summary()
75 | n = int(0.5*len(df))
76 | bestmodel.plot_predict(h=forecast_period, past_values=n, intervals=True, figsize=(15, 5))
77 | forecast_df = bestmodel.predict(forecast_period, intervals=True)
78 | mu, actuals = bestmodel._model(bestmodel.latent_variables.get_z_values())
79 | predicted = bestmodel.link(mu)
80 | print('Dynamic %d-period Forecasts:' % forecast_period)
81 | if bestpdq[1] == 1:
82 | mod_target = 'Differenced ' + target
83 | res = restore_differenced_predictions(ts_test[target].values, forecast_df[mod_target],
84 | ts_train[target][-1:])
85 | rmse, norm_rmse = print_dynamic_rmse(ts_test[target].values, res, ts_train[target])
86 | else:
87 | rmse, norm_rmse = print_dynamic_rmse(ts_test[target].values,forecast_df[target].values, ts_train[target])
88 | return bestmodel, forecast_df, rmse, norm_rmse
89 |
90 |
91 | def restore_differenced_predictions(actuals, predicted, start_value, func=None, periods=1, diff_yes=True):
92 | try:
93 | restored = pd.Series(index=start_value.index, dtype=float)
94 | restored.loc[start_value.iloc[:periods].index] = start_value.values[:periods]
95 | rest = restored.reindex(predicted.index)
96 | restored = pd.Series(np.r_[restored, rest], index=np.r_[start_value.index, rest.index])
97 | restored.loc[predicted.index] = predicted.values
98 | restored = restored[(periods-1):].cumsum()
99 | if func:
100 | restored = eval('np.' + func + '(restored)')
101 | return restored[periods:]
102 | except:
103 | restored = start_value.values+predicted
104 | if func:
105 | restored = eval('np.' + func + '(restored)')
106 | return restored
107 |
--------------------------------------------------------------------------------
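
When the selected PyFlux order has integ=1, the forecasts come back on the differenced scale, and restore_differenced_predictions() converts them back to levels. The core of that inversion is a cumulative sum anchored at the last observed training value; a minimal sketch with made-up numbers:

import pandas as pd

last_level = 120.0                                  # last value of the training series
diff_forecast = pd.Series([2.0, -1.0, 3.0, 0.5])    # forecasts of the differenced series
restored = last_level + diff_forecast.cumsum()      # back on the original scale
print(restored.tolist())                            # [122.0, 121.0, 124.0, 124.5]
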
/auto_ts/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/auto_ts/py.typed
--------------------------------------------------------------------------------
/auto_ts/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/auto_ts/test/__init__.py
--------------------------------------------------------------------------------
/auto_ts/test/test_auto_sarimax.py:
--------------------------------------------------------------------------------
1 | """
2 | Unit Tests for BuildAutoSarimax
3 |
4 | ----------------------
5 | Total Combinations: 8
6 | ----------------------
7 | Seasonality: Seasonal, Non-Seasonal (2)
8 | Univariate, Multivariate (2)
9 | CV: Yes, No (2)
10 | """
11 |
12 | import unittest
13 |
14 | import numpy as np # type: ignore
15 | import pandas as pd # type: ignore
16 | from pandas.testing import assert_frame_equal # type: ignore
17 | from pandas.testing import assert_series_equal # type: ignore
18 | from statsmodels.tsa.statespace.sarimax import SARIMAXResultsWrapper # type: ignore
19 |
20 |
21 | class TestAutoSarimax(unittest.TestCase):
22 |
23 | def setUp(self):
24 | # Pre Release
25 | import sys
26 | import os
27 | sys.path.append(os.environ['DEV_AUTOTS'])
28 | import pandas as pd # type: ignore
29 |
30 | datapath = 'example_datasets/'
31 | filename1 = 'Sales_and_Marketing.csv'
32 | dft = pd.read_csv(datapath+filename1,index_col=None)
33 |
34 | self.ts_column = 'Time Period'
35 | self.sep = ','
36 | self.target = 'Sales'
37 | self.preds = [x for x in list(dft) if x not in [self.ts_column, self.target]] # Exogenous variable names
38 |
39 | self.train_multivar = dft[:40]
40 | self.test_multivar = dft[40:]
41 |
42 | self.train_univar = dft[:40][[self.ts_column, self.target]]
43 | self.test_univar = dft[40:][[self.ts_column, self.target]]
44 |
45 | self.forecast_period = 8
46 |
47 | self.expected_pred_col_names = np.array(['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper'])
48 |
49 | ########################
50 | #### Golden Results ####
51 | ########################
52 |
53 | # TODO: Add to each individual test
54 | ## For each of the 8 combinations, we need the following
55 | # Internal Validation results (for each fold)
56 | # Internal Validation RMSE (overall and for each fold)
57 |
58 | # External Test results (various combinations of prediction windows - same as forecast period OR not same)
59 | # External Test RMSE
60 |
61 |
62 | def test_seasonal_univar_noCV(self):
63 | """
64 | Test 1: Seasonal Univariate Without CV
65 | """
66 | pass
67 |
68 | def test_seasonal_univar_CV(self):
69 | """
70 | Test 2: Seasonal Univariate With CV
71 | """
72 | pass
73 |
74 | def test_seasonal_multivar_noCV(self):
75 | """
76 | Test 3: Seasonal Multivariate Without CV
77 | """
78 | pass
79 |
80 | def test_seasonal_multivar_CV(self):
81 | """
82 | Test 4: Seasonal Multivariate With CV
83 | """
84 | pass
85 |
86 | def test_nonseasonal_univar_noCV(self):
87 | """
88 | Test 5: Non Seasonal Univariate Without CV
89 | """
90 | pass
91 |
92 | def test_nonseasonal_univar_CV(self):
93 | """
94 | Test 6: Non Seasonal Univariate With CV
95 | """
96 | pass
97 |
98 | def test_nonseasonal_multivar_noCV(self):
99 | """
100 | Test 7: Non Seasonal Multivariate Without CV
101 | """
102 | pass
103 |
104 | def test_nonseasonal_multivar_CV(self):
105 | """
106 | Test 8: Non Seasonal Multivariate With CV
107 | """
108 | pass
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
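
The eight test bodies above are placeholders. When filled in, they would presumably mirror the VAR tests in test_var.py below: build an auto_timeseries object, fit on the training split, and compare predictions against golden values. A hedged sketch of how one combination might be exercised outside the test harness; model_type='SARIMAX' and the seasonality keywords are assumptions carried over from the VAR tests, not values verified against the SARIMAX code path:

import pandas as pd
from auto_ts import auto_timeseries as ATS   # imported the same way test_var.py does

dft = pd.read_csv('example_datasets/Sales_and_Marketing.csv', index_col=None)
train, test = dft[:40], dft[40:]

automl_model = ATS(
    score_type='rmse', forecast_period=8, time_interval='Month',
    non_seasonal_pdq=None, seasonality=True, seasonal_period=12,
    model_type='SARIMAX',      # assumption: the model selector accepts this name
    verbose=0)
automl_model.fit(traindata=train, ts_column='Time Period', target='Sales', cv=None, sep=',')
test_predictions = automl_model.predict(testdata=test.drop(columns=['Sales']), forecast_period=8)
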
/auto_ts/test/test_var.py:
--------------------------------------------------------------------------------
1 | """
2 | Unit Tests for VAR Models
3 |
4 | ----------------------
5 | Total Combinations: 4
6 | ----------------------
7 | Seasonality: NA
8 | Univariate, Multivariate: Simple Independent Test for Univariate (1)
9 | CV: Yes, No (2)
10 | """
11 |
12 | import math
13 | import os
14 | import sys
15 | import unittest
16 |
17 | import numpy as np # type: ignore
18 | import pandas as pd # type: ignore
19 | from pandas.testing import assert_frame_equal # type: ignore
20 | from pandas.testing import assert_series_equal # type: ignore
21 | from statsmodels.tsa.statespace.sarimax import SARIMAXResultsWrapper # type: ignore
22 |
23 | sys.path.append(os.environ['DEV_AUTOTS'])
24 | from auto_ts import auto_timeseries as ATS
25 |
26 | class TestVAR(unittest.TestCase):
27 |
28 | def setUp(self):
29 | # Pre Release
30 |
31 |
32 | datapath = 'example_datasets/'
33 | filename1 = 'Sales_and_Marketing.csv'
34 | dft = pd.read_csv(datapath + filename1, index_col = None)
35 |
36 | self.ts_column = 'Time Period'
37 | self.sep = ','
38 | self.target = 'Sales'
39 | self.preds = [x for x in list(dft) if x not in [self.ts_column, self.target]] # Exogenous variable names
40 |
41 | self.train_multivar = dft[:40]
42 | self.test_multivar = dft[40:]
43 |
44 | self.train_univar = dft[:40][[self.ts_column, self.target]]
45 | self.test_univar = dft[40:][[self.ts_column, self.target]]
46 |
47 | self.forecast_period = 8
48 |
49 | self.expected_pred_col_names = np.array(['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper'])
50 |
51 | ########################
52 | #### Golden Results ####
53 | ########################
54 |
55 | # TODO: Add to each individual test
56 | ## For each of the combinations above, we need the following
57 | # Internal Validation results (for each fold)
58 | # Internal Validation RMSE (overall and for each fold)
59 |
60 | # External Test results (various combinations of prediction windows - same as forecast period OR not same)
61 | # External Test RMSE
62 |
63 |
64 | ############################
65 | #### VAR Golden Results ####
66 | ############################
67 |
68 | #### UNIVARIATE ####
69 | self.forecast_gold_var_univar = None
70 | self.rmse_gold_var_univar = math.inf
71 | self.forecast_gold_var_univar_series = None
72 | self.forecast_gold_var_univar_series_10 = None
73 |
74 | #### MULTIVARIATE ####
75 |
76 | # Internal (to AutoML) validation set results
77 | self.forecast_gold_var_multivar_internal_val_cv_fold1 = np.array([
78 | 510.302336, 531.109224, 536.878513, 534.311164,
79 | 529.305887, 525.199071, 523.015255, 522.445215
80 | ])
81 |
82 | self.forecast_gold_var_multivar_internal_val_cv_fold2 = np.array([
83 | 741.377909, 676.233419, 615.538721, 571.797729,
84 | 546.952783, 537.342231, 537.474487, 542.307393
85 | ])
86 |
87 | self.rmse_gold_var_multivar_cv_fold1 = 155.21757611
88 | self.rmse_gold_var_multivar_cv_fold2 = 112.4770318 # Without CV gets this result
89 |
90 | ## External Test Set results
91 | results = [
92 | 675.899931, 622.204059, 578.38291, 553.067517,
93 | 543.612945, 543.696406, 547.604403, 551.762352
94 | ]
95 | index = pd.to_datetime([
96 | '2014-05-01', '2014-06-01', '2014-07-01', '2014-08-01',
97 | '2014-09-01', '2014-10-01', '2014-11-01', '2014-12-01'
98 | ])
99 | self.forecast_gold_var_multivar = np.array(results)
100 |
101 | self.forecast_gold_var_multivar_series = pd.Series(data=results, index=index)
102 | self.forecast_gold_var_multivar_series.name = 'mean'
103 |
104 | results = results + [554.643756, 556.055009]
105 | index = pd.to_datetime([
106 | '2014-05-01', '2014-06-01', '2014-07-01', '2014-08-01',
107 | '2014-09-01', '2014-10-01', '2014-11-01', '2014-12-01',
108 | '2015-01-01', '2015-02-01'
109 | ])
110 | self.forecast_gold_var_multivar_series_10 = pd.Series(data=results, index=index)
111 | self.forecast_gold_var_multivar_series_10.name = 'mean'
112 |
113 |
114 | def test_noCV(self):
115 | """
116 | Test 1: VAR without CV
117 | """
118 | print("\n\n" + "*"*50)
119 | print("Performing Unit Test: 'test_noCV'")
120 | print("*"*50 + "\n\n")
121 |
122 | automl_model = ATS(
123 | score_type='rmse', forecast_period=self.forecast_period, time_interval='Month',
124 | non_seasonal_pdq=None, seasonality=False, seasonal_period=12,
125 | model_type='VAR',
126 | verbose=0)
127 | automl_model.fit(
128 | traindata=self.train_multivar,
129 | ts_column=self.ts_column,
130 | target=self.target,
131 | cv=None,
132 | sep=self.sep)
133 |
134 | ml_dict = automl_model.get_ml_dict()
135 |
136 | ######################
137 | ## External Results ##
138 | ######################
139 |
140 | # Simple forecast with forecast window = the one used in training
141 | # Using named model
142 | test_predictions = automl_model.predict(
143 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR
144 | forecast_period=self.forecast_period,
145 | model="VAR"
146 | )
147 | assert_series_equal(
148 | test_predictions['mean'].round(6),
149 | self.forecast_gold_var_multivar_series
150 | )
151 |
152 | # Simple forecast with forecast window != the one used in training
153 | # Using named model
154 | test_predictions = automl_model.predict(
155 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR
156 | forecast_period=10,
157 | model="VAR"
158 | )
159 | assert_series_equal(test_predictions['mean'].round(6), self.forecast_gold_var_multivar_series_10)
160 |
161 | # Complex forecasts (returns confidence intervals, etc.)
162 | test_predictions = automl_model.predict(
163 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR
164 | forecast_period=self.forecast_period,
165 | model="VAR",
166 | simple=False
167 | )
168 | self.assertIsNone(
169 | np.testing.assert_array_equal(
170 | test_predictions.columns.values, self.expected_pred_col_names
171 | )
172 | )
173 |
174 | ###################
175 | ## ML Dictionary ##
176 | ###################
177 | self.assertIsNone(
178 | np.testing.assert_array_equal(
179 | np.round(ml_dict.get('VAR').get('forecast')[0]['mean'].values.astype(np.double), 6),
180 | self.forecast_gold_var_multivar_internal_val_cv_fold2
181 | ),
182 | "(Multivar Test) VAR Forecast does not match up with expected values."
183 | )
184 |
185 | self.assertEqual(
186 | round(ml_dict.get('VAR').get('rmse')[0], 8), self.rmse_gold_var_multivar_cv_fold2,
187 | "(Multivar Test) VAR RMSE does not match up with expected values.")
188 |
189 | def test_CV(self):
190 | """
191 | Test 2: VAR with CV
192 | """
193 | print("\n\n" + "*"*50)
194 | print("Performing Unit Test: 'test_CV'")
195 | print("*"*50 + "\n\n")
196 |
197 | automl_model = ATS(
198 | score_type='rmse', forecast_period=self.forecast_period, time_interval='Month',
199 | non_seasonal_pdq=None, seasonality=False, seasonal_period=12,
200 | model_type='VAR',
201 | verbose=0)
202 | automl_model.fit(
203 | traindata=self.train_multivar,
204 | ts_column=self.ts_column,
205 | target=self.target,
206 | cv=2,
207 | sep=self.sep)
208 |
209 | ml_dict = automl_model.get_ml_dict()
210 |
211 | ######################
212 | ## External Results ##
213 | ######################
214 |
215 | # Simple forecast with forecast window = the one used in training
216 | # Using named model
217 | test_predictions = automl_model.predict(
218 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR
219 | forecast_period=self.forecast_period,
220 | model="VAR"
221 | )
222 | assert_series_equal(
223 | test_predictions['mean'].round(6),
224 | self.forecast_gold_var_multivar_series
225 | )
226 |
227 | # Simple forecast with forecast window != the one used in training
228 | # Using named model
229 | test_predictions = automl_model.predict(
230 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR
231 | forecast_period=10,
232 | model="VAR"
233 | )
234 | assert_series_equal(test_predictions['mean'].round(6), self.forecast_gold_var_multivar_series_10)
235 |
236 | # Complex forecasts (returns confidence intervals, etc.)
237 | test_predictions = automl_model.predict(
238 | testdata=self.test_multivar[[self.ts_column] + self.preds], # Not needed for VAR
239 | forecast_period=self.forecast_period,
240 | model="VAR",
241 | simple=False
242 | )
243 | self.assertIsNone(
244 | np.testing.assert_array_equal(
245 | test_predictions.columns.values, self.expected_pred_col_names
246 | )
247 | )
248 |
249 | ###################
250 | ## ML Dictionary ##
251 | ###################
252 | self.assertIsNone(
253 | np.testing.assert_array_equal(
254 | np.round(ml_dict.get('VAR').get('forecast')[0]['mean'].values.astype(np.double), 6),
255 | self.forecast_gold_var_multivar_internal_val_cv_fold1,
256 |
257 | ),
258 | "(Multivar Test) VAR Forecast does not match up with expected values."
259 | )
260 | self.assertIsNone(
261 | np.testing.assert_array_equal(
262 | np.round(ml_dict.get('VAR').get('forecast')[1]['mean'].values.astype(np.double), 6),
263 | self.forecast_gold_var_multivar_internal_val_cv_fold2
264 | ),
265 | "(Multivar Test) VAR Forecast does not match up with expected values."
266 | )
267 |
268 | self.assertEqual(
269 | round(ml_dict.get('VAR').get('rmse')[0], 8), self.rmse_gold_var_multivar_cv_fold1,
270 | "(Multivar Test) VAR RMSE does not match up with expected values.")
271 | self.assertEqual(
272 | round(ml_dict.get('VAR').get('rmse')[1], 8), self.rmse_gold_var_multivar_cv_fold2,
273 | "(Multivar Test) VAR RMSE does not match up with expected values.")
274 |
275 |
276 | def test_univar(self):
277 | """
278 | Test 3: Univariate VAR
279 | """
280 | print("\n\n" + "*"*50)
281 | print("Performing Unit Test: 'test_univar'")
282 | print("*"*50 + "\n\n")
283 |
284 | automl_model = ATS(
285 | score_type='rmse', forecast_period=self.forecast_period, time_interval='Month',
286 | non_seasonal_pdq=None, seasonality=False, seasonal_period=12,
287 | model_type='VAR',
288 | verbose=0)
289 | automl_model.fit(
290 | traindata=self.train_univar,
291 | ts_column=self.ts_column,
292 | target=self.target,
293 | cv=None
294 | )
295 | ml_dict = automl_model.get_ml_dict()
296 |
297 | self.assertIsNone(automl_model.get_model_build('VAR'), "Expected Univar VAR model to be None but did not get None.")
298 |
299 | # Simple forecast with forecast window = one used in training
300 | # Using named model
301 | test_predictions = automl_model.predict(
302 | forecast_period=self.forecast_period,
303 | model="VAR"
304 | )
305 | self.assertIsNone(test_predictions)
306 |
307 | # Simple forecast with forecast window != one used in training
308 | # Using named model
309 | test_predictions = automl_model.predict(
310 | forecast_period=10,
311 | model="VAR"
312 | )
313 | self.assertIsNone(test_predictions)
314 |
315 | # Complex forecasts (returns confidence intervals, etc.)
316 | test_predictions = automl_model.predict(
317 | forecast_period=self.forecast_period,
318 | model="VAR",
319 | simple=False
320 | )
321 | self.assertIsNone(test_predictions)
322 |
323 | ###################
324 | ## ML Dictionary ##
325 | ###################
326 | self.assertEqual(
327 | ml_dict.get('VAR').get('forecast'), self.forecast_gold_var_univar,
328 | "(Univar Test) VAR Forecast does not match up with expected values."
329 | )
330 |
331 | self.assertEqual(
332 | round(ml_dict.get('VAR').get('rmse'), 8), self.rmse_gold_var_univar,
333 | "(Univar Test) VAR RMSE does not match up with expected values.")
--------------------------------------------------------------------------------
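
These tests read example_datasets/ relative to the working directory and pull the package path from the DEV_AUTOTS environment variable, so they are easiest to run from the repository root. A small sketch of a direct runner (the path below is a placeholder for your local checkout):

import os
import unittest

os.environ['DEV_AUTOTS'] = '/path/to/Auto_TS'   # placeholder; must be set before the test modules are imported
suite = unittest.defaultTestLoader.discover('auto_ts/test', pattern='test_var.py')
unittest.TextTestRunner(verbosity=2).run(suite)
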
/auto_ts/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .colors import colorful
2 | from .eda import time_series_plot, top_correlation_to_name, test_stationarity
3 | from .etl import load_ts_data, convert_timeseries_dataframe_to_supervised, \
4 | time_series_split, find_max_min_value_in_a_dataframe, left_subtract, \
5 | change_to_datetime_index, change_to_datetime_index_test, reduce_mem_usage, load_test_data
6 | from .metrics import print_static_rmse, print_dynamic_rmse, print_normalized_rmse, \
7 | print_ts_model_stats
8 | from .my_encoders import My_LabelEncoder, My_LabelEncoder_Pipe
9 | from .val import cross_validation_time_series, rolling_validation_time_series, \
10 | ts_model_validation, quick_ts_plot
11 |
--------------------------------------------------------------------------------
/auto_ts/utils/colors.py:
--------------------------------------------------------------------------------
1 | class colorful:
2 | PURPLE = '\033[95m'
3 | CYAN = '\033[96m'
4 | DARKCYAN = '\033[36m'
5 | BLUE = '\033[94m'
6 | GREEN = '\033[92m'
7 | YELLOW = '\033[93m'
8 | RED = '\033[91m'
9 | BOLD = '\033[1m'
10 | UNDERLINE = '\033[4m'
11 | END = '\033[0m'
12 |
--------------------------------------------------------------------------------
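
These are plain ANSI escape sequences; eda.py, for instance, wraps its stationarity verdicts in BOLD. Usage is simple string concatenation, closing with END so later output is unaffected:

from auto_ts.utils.colors import colorful

print(colorful.BOLD + colorful.GREEN + 'this series is stationary' + colorful.END)
print(colorful.RED + 'this series is NOT stationary' + colorful.END)
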
/auto_ts/utils/eda.py:
--------------------------------------------------------------------------------
1 | import numpy as np # type: ignore
2 | import matplotlib.dates as mdates  # type: ignore
3 | import pandas as pd # type: ignore
4 | import seaborn as sns # type: ignore
5 | from itertools import cycle
6 | import matplotlib.pyplot as plt
7 |
8 | # This gives an error when running from a python script.
9 | # Maybe, this should be set in the jupyter notebook directly.
10 | # get_ipython().magic('matplotlib inline')
11 | sns.set(style="white", color_codes=True)
12 | # TSA from Statsmodels
13 | import statsmodels.tsa.api as smt # type: ignore
14 | import dask
15 | from .colors import colorful
16 | def time_series_plot(y, lags=31, title='Original Time Series', chart_type='line',
17 | chart_freq='years'):
18 | """
19 | Plot a Time Series along with how it will look after differencing and what its
20 | AR/MA lags will be by viewing the ACF and PACF, along with its histogram.
21 | You just need to provide the time series (y) as a Series. Index is assumed
22 | to be Pandas datetime. It assumes that you want to see default lags of 31.
23 | But you can modify it to suit.
24 | """
25 |
26 | y = copy.deepcopy(y)
27 | if chart_freq in ['MS', 'M', 'SM', 'BM', 'CBM', 'SMS', 'BMS']:
28 | chart_time = 'months'
29 | elif chart_freq in ['D', 'B', 'C']:
30 | chart_time = 'days'
31 | elif chart_freq in ['W']:
32 | chart_time = 'weeks'
33 | elif chart_freq in ['Q', 'BQ', 'QS', 'BQS']:
34 | chart_time = 'quarters'
35 | elif chart_freq in ['A', 'Y', 'BA', 'BY', 'AS', 'YS', 'BAS', 'YAS']:
36 | chart_time = 'years'
37 | elif chart_freq in ['BH', 'H', 'h']:
38 | chart_time = 'hours'
39 | elif chart_freq in ['T', 'min']:
40 | chart_time = 'minutes'
41 | elif chart_freq in ['S', 'L', 'ms', 'U', 'us', 'N']:
42 | chart_time = 'seconds'
43 | else:
44 | print('chart frequency not known. Continuing...')
45 | return
46 | colors = cycle('byrcmgkbyrcmgkbyrcmgkbyrcmgkbyr')
47 | fig = plt.figure(figsize=(20, 20))
48 | grid = plt.GridSpec(3, 2, wspace=0.5, hspace=0.5)
49 | fig.subplots_adjust(hspace=1)
50 | ########## Use the gridspec function ##############
51 | ts_ax = plt.subplot(grid[0, 0:])
52 | diff_ax = plt.subplot(grid[1, 0])
53 | hist_ax = plt.subplot(grid[1, 1])
54 | acf_ax = plt.subplot(grid[2, 0])
55 | pacf_ax = plt.subplot(grid[2, 1])
56 | ### Draw multiple kinds of graphs here to each subplot axis ###
57 |
58 | if type(y) == dask.dataframe.core.DataFrame or type(y) == dask.dataframe.core.Series:
59 | y = y.head(len(y)) ## this converts it into a pandas Series
60 | if chart_type == 'line':
61 | y.plot(ax=ts_ax, color=next(colors))
62 | else:
63 | if chart_time == 'years':
64 | majors = mdates.YearLocator() # every year
65 | minors = mdates.MonthLocator() # every month
66 | majorsFmt = mdates.DateFormatter('%Y')
67 | elif chart_time == 'months':
68 | majors = mdates.YearLocator() # every year
69 | minors = mdates.MonthLocator() # every month
70 | majorsFmt = mdates.DateFormatter('\n\n\n%b\n%Y')
71 | elif chart_time == 'weeks':
72 | majors = mdates.MonthLocator()
73 | minors = mdates.WeekdayLocator(byweekday=(1), interval=1)
74 | majorsFmt = mdates.DateFormatter('\n\n\n%b\n%Y')
75 | elif chart_time == 'days':
76 | majors = mdates.DayLocator(bymonthday=None, interval=1, tz=None)
77 | minors = mdates.HourLocator(byhour=None, interval=1, tz=None)
78 | majorsFmt = mdates.DateFormatter('\n\n\n%d\n%b')
79 | else:
80 | majors = mdates.YearLocator() # every year
81 | minors = mdates.MonthLocator() # every month
82 | majorsFmt = mdates.DateFormatter('\n\n\n%b\n%Y')
83 | try:
84 | #### this works in most cases but in some cases, it gives an error
85 | ts_ax.bar(y.index, height=y, width=20, color=list((y>0).astype(int).map({1:'g',0:'r'}).values))
86 | except:
87 | #### In some cases where y is a dataframe, this might work.
88 | yindex = y.index
89 | yvalues = y.values.ravel()
90 | ts_ax.bar(yindex, height=yvalues, width=20, color=list(using_where((yvalues>0).astype(int)).ravel()))
91 | ts_ax.xaxis.set_major_locator(majors)
92 | ts_ax.xaxis.set_major_formatter(majorsFmt)
93 | ts_ax.xaxis.set_minor_locator(minors)
94 | ts_ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
95 | ts_ax.grid(True)
96 | #### Now draw the ACF and PACF charts
97 | ts_ax.set_title(title)
98 | y.diff(1).plot(ax=diff_ax, color=next(colors))
99 | diff_ax.set_title('After Differencing = 1')
100 | y.plot(ax=hist_ax, kind='hist', bins=25, color=next(colors))
101 | hist_ax.set_title('Histogram for Original Series')
102 | try:
103 | if len(y) < lags:
104 | lags = int(len(y) - 1)
105 | smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
106 | acf_ax.set_title('ACF for Original Series')
107 | except:
108 | acf_ax.set_title('Data Error: Could not draw ACF for Original Series')
109 | try:
110 | ### the number of lags cannot be greater than 50% of len of y. So limit it.
111 | if lags >= len(y)*0.5:
112 | lags = int(len(y)*0.5 - 1)
113 | smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
114 | pacf_ax.set_title('PACF for Original Series')
115 | except:
116 | pacf_ax.set_title('Data Error: Could not draw PACF for Original Series')
117 | [ax.set_xlim(0) for ax in [acf_ax, pacf_ax]]
118 | plt.show(block=False)
119 |
120 | def using_where(x):
121 | return np.where(x == 1, 'g', 'r')
122 | #################################################################################
123 |
124 | def top_correlation_to_name(stocks, column_name, searchstring, top=5):
125 | """
126 | ####################################################################################
127 | This function draws a correlation chart of the top "x" rows of a data frame that are highly
128 | correlated to a selected row in the dataframe. You can think of the rows of the input
129 | dataframe as containing stock prices or fund flows or product sales and the columns should
130 | contain time series data of prices or flows or sales over multiple time periods.
131 | Now this program will allow you to select the top 5 or 10 rows that are highly correlated
132 | to a given row selected by the column: column_name and using a search string "searchstring".
133 | The program will search for the search string in that column column_name and return a list
134 | of 5 or 10 rows that are the most correlated to that selected row. If you give "top" as
135 | a float ratio then it will use the ratio as the cut off point in the correlation
136 | coefficient to select rows.
137 | ####################################################################################
138 | """
139 | top += 1  # asking for the top X names in addition to the one already selected
140 | incl = [x for x in list(stocks) if x not in column_name]
141 | stocks.dropna(inplace=True)  # drop all NA rows first since they will mess up the correlations
142 | if stocks.empty:
143 | print('After dropping NaNs, the data frame has become empty.')
144 | return
145 | ### Now find the highest correlated rows to the selected row ###
146 | try:
147 | index_val = search_string(stocks, column_name,searchstring).index[0]
148 | except:
149 | print('Not able to find the search string in the column.')
150 | return
151 | ### Bring that selected Row to the top of the Data Frame
152 | df = stocks[:]
153 | # Assign a sort key so the selected row can be moved to the top
154 | df["new"] = range(1, len(df)+1)
155 | df.loc[index_val,"new"] = 0
156 | stocks = df.sort_values("new").drop("new",axis=1)
157 | stocks.reset_index(inplace=True,drop=True)
158 | ##### Now calculate the correlation coefficients of other rows with the Top row
159 | try:
160 | cordf = pd.DataFrame(stocks[incl].T.corr().sort_values(0, ascending=False))
161 | except:
162 | print('Cannot calculate Correlations since Dataframe contains string values or objects.')
163 | return
164 | try:
165 | cordf = stocks[column_name].join(cordf)
166 | except:
167 | cordf = pd.concat((stocks[column_name], cordf), axis=1)
168 | #### Visualizing the top 5 or 10 or whatever cut-off they have given for Corr Coeff
169 | if top >= 1:
170 | top10index = cordf.sort_values(0, ascending=False).iloc[:top, :3].index
171 | top10names = cordf.sort_values(0, ascending=False).iloc[:top, :3][column_name]
172 | top10values = cordf.sort_values(0, ascending=False)[0].values[:top]
173 | else:
174 | top10index = cordf.sort_values(0, ascending=False)[
175 | cordf.sort_values(0, ascending=False)[0].values >= top].index
176 | top10names = cordf.sort_values(0, ascending=False)[cordf.sort_values(
177 | 0, ascending=False)[0].values >= top][column_name]
178 | top10values = cordf.sort_values(0, ascending=False)[cordf.sort_values(
179 | 0, ascending=False)[0].values >= top][0]
180 | print(top10names, top10values)
181 | #### Now plot the top rows that are highly correlated based on condition above
182 | stocksloc = stocks.iloc[top10index]
183 | #### Visualizing using Matplotlib ###
184 | stocksloc = stocksloc.T
185 | stocksloc = stocksloc.reset_index(drop=True)
186 | stocksloc.columns = stocksloc.iloc[0].values.tolist()
187 | stocksloc.drop(0).plot(subplots=True, figsize=(15, 10), legend=False,
188 | title="Top %s Correlations to %s" % (top, searchstring))
189 | [ax.legend(loc=1) for ax in plt.gcf().axes]
190 | plt.tight_layout()
191 | plt.show(block=False)
192 |
193 | ################################################################################
194 | def pretty_print_table(dfo):
195 | from io import StringIO
196 | import prettytable
197 | output = StringIO()
198 | dfo.to_csv(output)
199 | output.seek(0)
200 | pt = prettytable.from_csv(output)
201 | print(pt)
202 |
203 | import copy
204 | def test_stationarity(time_df, maxlag=31, regression='c', autolag='BIC',
205 | window=None, plot=False, verbose=False, var_only=False):
206 | """
207 | Check unit root stationarity of a time series array or an entire dataframe.
208 | Note that you must send in a dataframe as df.values.ravel() - otherwise ERROR.
209 | Null hypothesis: the series is non-stationary.
210 | If p >= alpha, the series is non-stationary.
211 | If p < alpha, reject the null hypothesis (i.e. the series is stationary).
212 | Original source: http://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
213 | Function: http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.stattools.adfuller.html
214 | window argument is only required for plotting rolling functions. Default=4.
215 | """
216 | time_df = copy.deepcopy(time_df)
217 | if len(time_df) <= int(2.5*maxlag):
218 | maxlag = 5
219 | print('setting maxlag to a low number = %s' %maxlag)
220 | # set defaults (from function page)
221 | if type(time_df) == pd.DataFrame:
222 | #print('modifying time series dataframe into an array to test')
223 | timeseries = time_df.values.ravel()
224 | else:
225 | timeseries = copy.deepcopy(time_df)
226 | if regression is None:
227 | regression = 'c'
228 | if verbose:
229 | print('\nRunning Augmented Dickey-Fuller test with parameters:')
230 | print(' maxlag: {}'.format(maxlag),'regression: {}'.format(regression),'autolag: {}'.format(autolag))
231 | alpha = 0.05
232 | if plot:
233 | try:
234 | if window is None:
235 | window = 4
236 | # Determining rolling statistics
237 | rolmean = timeseries.rolling(window=window, center=False).mean()
238 | rolstd = timeseries.rolling(window=window, center=False).std()
239 | # Plot rolling statistics:
240 | orig = plt.plot(timeseries, color='blue', label='Original')
241 | mean = plt.plot(rolmean, color='red', label='Rolling Mean ({})'.format(window))
242 | std = plt.plot(rolstd, color='black', label='Rolling Std ({})'.format(window))
243 | plt.legend(loc='best')
244 | plt.title('Rolling Mean & Standard Deviation')
245 | plt.show(block=False)
246 | except:
247 | print('Data must have date-time as index to plot!')
248 | return
249 | # Perform Augmented Dickey-Fuller test:
250 | if var_only:
251 | ### In VAR models, check all_vars for stationarity
252 | ### if it is 1, then all vars are stationary. If not difference it once and try again!
253 | ### Use Statsmodels for tests ###########
254 | diff_limit = 0
255 | for i in range(3):
256 | stationary_test = check_each_var_for_stationarity(time_df, autolag, verbose)
257 | if stationary_test:
258 | if i == 0:
259 | print('Data is already stationary')
260 | diff_limit = 0
261 | break
262 | elif i == 1:
263 | print('Data is stationary after one differencing')
264 | diff_limit = 1
265 | break
266 | elif i == 2:
267 | diff_limit = 2
268 | print('Data is stationary after two differencing')
269 | break
270 | else:
271 | if i == 2:
272 | print('Alert! Data is not stationary even after two differencing. Continuing...')
273 | diff_limit = 0
274 | break
275 | else:
276 | time_df = time_df.diff(1).dropna()
277 | continue
278 | return diff_limit
279 | else:
280 | ### In non-VAR models you need to test only the target variable for stationarity ##
281 | dftest = smt.adfuller(timeseries, maxlag=maxlag, regression=regression, autolag=autolag)
282 | dfoutput = pd.Series(dftest[0:4], index=['Test Statistic',
283 | 'p-value',
284 | '#Lags Used',
285 | 'Number of Observations Used',
286 | ],name='Dickey-Fuller Augmented Test')
287 | for key, value in dftest[4].items():
288 | dfoutput['Critical Value (%s)' % key] = value
289 | if verbose:
290 | print('Results of Augmented Dickey-Fuller Test:')
291 | pretty_print_table(dfoutput)
292 | if dftest[1] >= alpha:
293 | print(' this series is non-stationary. Trying test again after differencing...')
294 | timeseries = pd.Series(timeseries).diff(1).dropna().values
295 | dftest = smt.adfuller(timeseries, maxlag=maxlag, regression=regression, autolag=autolag)
296 | dfoutput = pd.Series(dftest[0:4], index=['Test Statistic',
297 | 'p-value',
298 | '#Lags Used',
299 | 'Number of Observations Used',
300 | ],name='Dickey-Fuller Augmented Test')
301 | for key, value in dftest[4].items():
302 | dfoutput['Critical Value (%s)' % key] = value
303 | if verbose:
304 | print('After differencing=1, results of Augmented Dickey-Fuller Test:')
305 | pretty_print_table(dfoutput)
306 | if dftest[1] >= alpha:
307 | print(colorful.BOLD +'this series is NOT stationary' + colorful.END)
308 | return False
309 | else:
310 | print(colorful.BOLD +'this series is stationary' + colorful.END)
311 | return True
312 | else:
313 | print(colorful.BOLD +'this series is stationary' + colorful.END)
314 | return True
315 | ################################################################################
316 | def adjust(val, length= 6):
317 | return str(val).ljust(length)
318 | def check_each_var_for_stationarity(time_df, autolag, verbose=0):
319 | alpha = 0.05
320 | all_vars = 1
321 | copy_cols = time_df.columns.tolist()
322 | for each_var in copy_cols:
323 | timeseries = time_df[each_var].values
324 | dftest = smt.adfuller(timeseries, autolag=autolag)
325 | if verbose >= 2:
326 | ############################ Print Summary #####################
327 | output = {'test_statistic':round(dftest[0], 4), 'pvalue':round(dftest[1], 4), 'n_lags':round(dftest[2], 4), 'n_obs':dftest[3]}
328 | p_value = output['pvalue']
329 | print(f' Augmented Dickey-Fuller Test on "{each_var}"', "\n ", '-'*47)
330 | print(f' Null Hypothesis: Data has unit root. Non-Stationary.')
331 | print(f' Significance Level = {alpha}')
332 | print(f' Test Statistic = {output["test_statistic"]}')
333 | print(f' No. Lags Chosen = {output["n_lags"]}')
334 |
335 | for key,val in dftest[4].items():
336 | print(f' Critical value {adjust(key)} = {round(val, 3)}')
337 |
338 | if p_value <= alpha:
339 | print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.")
340 | print(f" => Series is Stationary.")
341 | else:
342 | print(f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis.")
343 | print(f" => Series is Non-Stationary.")
344 | ####################################################################
345 | if dftest[1] < alpha:
346 | all_vars = 1*all_vars
347 | else:
348 | all_vars = 0*all_vars
349 | return all_vars
350 | ##################################################################################
--------------------------------------------------------------------------------
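
test_stationarity() ultimately reduces to the Augmented Dickey-Fuller decision rule described in its docstring: a p-value below alpha (0.05) rejects the unit-root null and the series is treated as stationary. A compact sketch of that rule using statsmodels directly on synthetic series:

import numpy as np
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(42)
white_noise = rng.normal(size=200)      # stationary by construction
random_walk = white_noise.cumsum()      # has a unit root

for name, series in [('white noise', white_noise), ('random walk', random_walk)]:
    pvalue = adfuller(series, autolag='BIC')[1]
    verdict = 'stationary' if pvalue < 0.05 else 'non-stationary'
    print(f'{name}: p-value = {pvalue:.4f} -> {verdict}')
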
/auto_ts/utils/logging.py:
--------------------------------------------------------------------------------
1 |
2 | """Utilities that affect logging
3 | """
4 |
5 | import os
6 |
7 | # https://github.com/facebook/prophet/issues/223#issuecomment-326455744
8 | class SuppressStdoutStderr():
9 | '''
10 | A context manager for doing a "deep suppression" of stdout and stderr in
11 | Python, i.e. will suppress all print, even if the print originates in a
12 | compiled C/Fortran sub-function.
13 | This will not suppress raised exceptions, since exceptions are printed
14 | to stderr just before a script exits, and after the context manager has
15 | exited (at least, I think that is why it lets exceptions through).
16 |
17 | '''
18 | def __init__(self):
19 | # Open a pair of null files
20 | self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
21 | # Save the actual stdout (1) and stderr (2) file descriptors.
22 | self.save_fds = [os.dup(1), os.dup(2)]
23 |
24 | def __enter__(self):
25 | # Assign the null pointers to stdout and stderr.
26 | os.dup2(self.null_fds[0], 1)
27 | os.dup2(self.null_fds[1], 2)
28 |
29 | def __exit__(self, *_):
30 | # Re-assign the real stdout/stderr back to (1) and (2)
31 | os.dup2(self.save_fds[0], 1)
32 | os.dup2(self.save_fds[1], 2)
33 | # Close the null files
34 | for fd in self.null_fds + self.save_fds:
35 | os.close(fd)
36 |
--------------------------------------------------------------------------------
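
A minimal usage sketch of the context manager above: anything written to stdout or stderr inside the with-block, including output from compiled extensions such as Prophet's Stan backend, is discarded, while exceptions still propagate:

from auto_ts.utils.logging import SuppressStdoutStderr

with SuppressStdoutStderr():
    print('this line is swallowed')     # redirected to os.devnull
print('back to normal output')          # file descriptors are restored on exit
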
/auto_ts/utils/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np # type: ignore
2 | import pandas as pd # type: ignore
3 | import matplotlib.pyplot as plt # type: ignore
4 | from sklearn.metrics import mean_absolute_error, mean_squared_error # type: ignore
9 |
10 |
11 | def print_static_rmse(actual: np.array, predicted: np.array, start_from: int=0, verbose: int=0):
12 | """
13 | this calculates the ratio of the rmse error to the standard deviation of the actuals.
14 | This ratio should be below 1 for a model to be considered useful.
15 | The comparison starts from the row indicated in the "start_from" variable.
16 | """
17 | rmse = np.sqrt(mean_squared_error(actual[start_from:], predicted[start_from:]))
18 | std_dev = actual[start_from:].std()
19 | if verbose >= 1:
20 | print(' RMSE = %0.2f' %rmse)
21 | print(' Std Deviation of Actuals = %0.2f' %(std_dev))
22 | print(' Normalized RMSE (as pct of std dev) = %0.1f%%' %(rmse*100/std_dev))
23 | return rmse, rmse/std_dev
24 |
25 |
26 | def print_dynamic_rmse(actuals: np.array, predicted: np.array, original: np.array, toprint: bool = True):
27 | """
28 | This utility calculates rmse between actuals and predicted. However, it does one more.
29 | Since in dynamic forecast, we need the longer original, it calculates Normalized RMSE
30 | using the original array's std deviation. That way, the forecast of 2 values does not
31 | result in a larger Normalized RMSE since the std deviation of 2 values will be v small.
32 | """
33 | rmse = np.sqrt(np.mean((actuals - predicted)**2))
34 | norm_rmse = rmse/original.std()
35 | if toprint:
36 | print(' RMSE = {:,.2f}'.format(rmse))
37 | print(' Std Deviation of Originals = {:,.2f}'.format(original.std()))
38 | print(' Normalized RMSE (as pct of std dev) = %0.0f%%' %(100*norm_rmse))
39 | return rmse, norm_rmse
40 |
41 |
42 | def print_normalized_rmse(actuals: np.array, predicted: np.array, start_from: int=0):
43 | """
44 | This utility calculates rmse between actuals and predicted. However, it does one more.
45 | If the original is given, it calculates Normalized RMSE using the original array's std deviation.
46 | """
47 | actuals = actuals[start_from:]
48 | predicted = predicted[start_from:]
49 | rmse = np.sqrt(mean_squared_error(actuals, predicted))
50 | norm_rmse = rmse/actuals.std()
51 | print('RMSE = {:,.2f}'.format(rmse))
52 | print('Std Deviation of Actuals = {:,.2f}'.format(actuals.std()))
53 | print('Normalized RMSE = %0.0f%%' %(100*norm_rmse))
54 | return rmse, norm_rmse
55 |
56 |
57 | def print_rmse(y: np.array, y_hat: np.array):
58 | """
59 | Calculating Root Mean Square Error https://en.wikipedia.org/wiki/Root-mean-square_deviation
60 | """
61 | mse = np.mean((y - y_hat)**2)
62 | return np.sqrt(mse)
63 |
64 |
65 | def print_mape(y: np.array, y_hat: np.array):
66 | """
67 | Calculating Mean Absolute Percent Error https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
68 | """
69 | try:
70 | perc_err = (100*(y - y_hat))/y
71 | return np.mean(abs(perc_err))
72 | except:
73 | return np.nan
74 |
75 |
76 | def print_ts_model_stats(actuals: np.array, predicted: np.array, title="Model"):
77 | """
78 | This program prints and returns MAE, RMSE, MAPE.
79 | The RMSE is also expressed as a percentage of the standard deviation of the actuals
80 | and as a percentage of the min-max range of the actuals.
81 | Returns the RMSE and the RMSE as a percentage of the std dev of the actuals.
82 | """
83 | try:
84 | number_as_percentage = actuals.std()
85 | if (predicted.index == actuals.index).all():
86 | dfplot = pd.DataFrame(actuals).join(pd.DataFrame(predicted))
87 | else:
88 | dfplot = pd.DataFrame([actuals.values, predicted.values]).T
89 | dfplot.columns = ['Actual','Forecast']
90 | dfplot = dfplot.sort_index()
91 | plt.figure(figsize=(15,8))
92 | plt.plot(dfplot)
93 | plt.legend(['original','predicted'])
94 | plt.title('%s: Actual vs Forecast in expanding (training) Window Cross Validation' %title, fontsize=20)
95 | except:
96 | pass
97 | print('\n-------------------------------------------')
98 | print('Model Cross Validation Results:')
99 | print('-------------------------------------------')
100 | mae = mean_absolute_error(actuals, predicted)
101 | mse = mean_squared_error(actuals, predicted)
102 | print(' MAE (Mean Absolute Error) = %0.2f' % mae)
103 | rmse = np.sqrt(mean_squared_error(actuals,predicted))
104 | print(' MSE (Mean Squared Error) = %0.2f' % mse)
105 | mape = print_mape(actuals, predicted)
106 | print(" MAPE (Mean Absolute Percent Error) = %0.0f%%" %(mape))
107 | print(" RMSE (Root Mean Squared Error) = %0.04f" %(rmse))
108 | # Normalized RMSE: print('RMSE = {:,.0f}'.format(rmse))
109 | print(' Normalized RMSE (MinMax) = %0.0f%%' %(100*rmse/abs(actuals.max()-actuals.min())))
110 | rmse_asp = (np.sqrt(mean_squared_error(actuals,predicted))/number_as_percentage)*100
111 | print(' Normalized RMSE (as Std Dev of Actuals)= %0.0f%%' %rmse_asp)
112 | return rmse, rmse_asp
113 |
--------------------------------------------------------------------------------
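
A small worked example of the normalization print_dynamic_rmse() performs: the RMSE of the hold-out forecast is divided by the standard deviation of the full original series, so that a very short forecast window does not inflate the normalized score (the numbers below are made up):

import numpy as np

original = np.array([10., 12., 11., 13., 15., 14., 16., 18.])    # full training history
actuals = np.array([16., 18.])                                    # hold-out actuals
predicted = np.array([15., 19.])                                  # forecasts for the hold-out

rmse = np.sqrt(np.mean((actuals - predicted) ** 2))               # 1.0
norm_rmse = rmse / original.std()                                 # roughly 0.40
print(rmse, norm_rmse)
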
/auto_ts/utils/my_encoders.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.preprocessing import OneHotEncoder
6 | from sklearn.base import BaseEstimator #gives fit_transform method for free
7 | import pdb
8 | from sklearn.base import TransformerMixin
9 | from collections import defaultdict
10 | ####################################################################################################
11 | class My_LabelEncoder(BaseEstimator, TransformerMixin):
12 | """
13 | ################################################################################################
14 | ###### The My_LabelEncoder class was developed by Ram Seshadri for AutoViML #########
15 | ###### The My_LabelEncoder class works just like sklearn's Label Encoder but better! #######
16 | ##### It label encodes any cat var in your dataset. It also handles NaN's in your dataset! ####
17 | ## The beauty of this function is that it takes care of NaN's and unknown (future) values.#####
18 | ##################### This is the BEST working version - don't mess with it!! ##################
19 | ################################################################################################
20 | Usage:
21 | le = My_LabelEncoder()
22 | le.fit_transform(train[column]) ## this will give your transformed values as an array
23 | le.transform(test[column]) ### this will give your transformed values as an array
24 |
25 | Usage in Column Transformers and Pipelines:
26 |           No. It cannot be used in pipelines, since it would need to produce a two-column (2-D) output for the next stage in the pipeline.
27 |           See the companion class My_LabelEncoder_Pipe() to see how it can be used in Pipelines.
28 | """
29 | def __init__(self):
30 | self.transformer = defaultdict(str)
31 | self.inverse_transformer = defaultdict(str)
32 | self.max_val = 0
33 |
34 | def fit(self,testx, y=None):
35 | if isinstance(testx, pd.Series):
36 | pass
37 | elif isinstance(testx, np.ndarray):
38 | testx = pd.Series(testx)
39 | else:
40 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ###
41 | ### But if it is a one-dimensional dataframe, convert it into a Series
42 | if testx.shape[1] == 1:
43 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0])
44 | else:
45 |                 #### Since it is multi-dimensional, just return the data as is
46 | return self
47 | ins = np.unique(testx.factorize()[1]).tolist()
48 | outs = np.unique(testx.factorize()[0]).tolist()
49 | #ins = testx.value_counts(dropna=False).index
50 | if -1 in outs:
51 | # it already has nan if -1 is in outs. No need to add it.
52 | if not np.nan in ins:
53 | ins.insert(0,np.nan)
54 | self.transformer = dict(zip(ins,outs))
55 | self.inverse_transformer = dict(zip(outs,ins))
56 | return self
57 |
58 | def transform(self, testx, y=None):
59 | if isinstance(testx, pd.Series):
60 | pass
61 | elif isinstance(testx, np.ndarray):
62 | testx = pd.Series(testx)
63 | else:
64 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ###
65 | ### But if it is a one-dimensional dataframe, convert it into a Series
66 | if testx.shape[1] == 1:
67 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0])
68 | else:
69 |                 #### Since it is multi-dimensional, just return the data as is
70 | return testx, y
71 | ### now convert the input to transformer dictionary values
72 | new_ins = np.unique(testx.factorize()[1]).tolist()
73 | missing = [x for x in new_ins if x not in self.transformer.keys()]
74 | if len(missing) > 0:
75 | for each_missing in missing:
76 | self.transformer[each_missing] = int(self.max_val + 1)
77 | self.inverse_transformer[int(self.max_val+1)] = each_missing
78 | self.max_val = int(self.max_val+1)
79 | else:
80 | self.max_val = np.max(list(self.transformer.values()))
81 |         outs = testx.map(self.transformer).values  ## cast to int only when returning below
82 | ### To handle category dtype you must do the next step #####
83 | testk = testx.map(self.transformer) ## this must be still a pd.Series
84 | if testx.dtype not in [np.int16, np.int32, np.int64, float, bool, object]:
85 | if testx.isnull().sum().sum() > 0:
86 | fillval = self.transformer[np.nan]
87 | testk = testk.cat.add_categories([fillval])
88 | testk = testk.fillna(fillval)
89 | testk = testk.astype(int)
90 | return testk, y
91 | else:
92 | testk = testk.astype(int)
93 | return testk, y
94 | else:
95 |             return outs.astype(int)
96 |
97 | def inverse_transform(self, testx, y=None):
98 | ### now convert the input to transformer dictionary values
99 | if isinstance(testx, pd.Series):
100 | outs = testx.map(self.inverse_transformer).values
101 | elif isinstance(testx, np.ndarray):
102 | outs = pd.Series(testx).map(self.inverse_transformer).values
103 | else:
104 | outs = testx[:]
105 | return outs
106 | #################################################################################
107 | class My_LabelEncoder_Pipe(BaseEstimator, TransformerMixin):
108 | """
109 | ################################################################################################
110 | ###### The My_LabelEncoder_Pipe class was developed by Ram Seshadri for Auto_TS #####
111 | ###### The My_LabelEncoder_Pipe class works just like sklearn's Label Encoder but better! #####
112 | ##### It label encodes any cat var in your dataset. But it can also be used in Pipelines! #####
113 | ## The beauty of this function is that it takes care of NaN's and unknown (future) values.#####
114 | ##### Since it produces an unused second column it can be used in sklearn's Pipelines. #####
115 | ##### But for that you need to add a drop_second_col() function to this My_LabelEncoder_Pipe ##
116 | ##### and then feed the whole pipeline to a Column_Transformer function. It is very easy. #####
117 | ##################### This is the BEST working version - don't mess with it!! ##################
118 | ################################################################################################
119 | Usage in pipelines:
120 | le = My_LabelEncoder_Pipe()
121 | le.fit_transform(train[column]) ## this will give you two columns - beware!
122 | le.transform(test[column]) ### this will give you two columns - beware!
123 |
124 | Usage in Column Transformers:
125 | def drop_second_col(Xt):
126 | ### This deletes the 2nd column. Hence col number=1 and axis=1 ###
127 | return np.delete(Xt, 1, 1)
128 |
129 | drop_second_col_func = FunctionTransformer(drop_second_col)
130 |
131 | le_one = make_pipeline(le, drop_second_col_func)
132 |
133 | ct = make_column_transformer(
134 | (le_one, catvars[0]),
135 | (le_one, catvars[1]),
136 | (imp, numvars),
137 | remainder=remainder)
138 |
139 | """
140 | def __init__(self):
141 | self.transformer = defaultdict(str)
142 | self.inverse_transformer = defaultdict(str)
143 | self.max_val = 0
144 |
145 | def fit(self,testx, y=None):
146 | if isinstance(testx, pd.Series):
147 | pass
148 | elif isinstance(testx, np.ndarray):
149 | testx = pd.Series(testx)
150 | else:
151 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ###
152 | ### But if it is a one-dimensional dataframe, convert it into a Series
153 | if testx.shape[1] == 1:
154 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0])
155 | else:
156 |                 #### Since it is multi-dimensional, just return the data as is
157 | return self
158 | ins = np.unique(testx.factorize()[1]).tolist()
159 | outs = np.unique(testx.factorize()[0]).tolist()
160 | #ins = testx.value_counts(dropna=False).index
161 | if -1 in outs:
162 | # it already has nan if -1 is in outs. No need to add it.
163 | if not np.nan in ins:
164 | ins.insert(0,np.nan)
165 | self.transformer = dict(zip(ins,outs))
166 | self.inverse_transformer = dict(zip(outs,ins))
167 | return self
168 |
169 | def transform(self, testx, y=None):
170 | if isinstance(testx, pd.Series):
171 | pass
172 | elif isinstance(testx, np.ndarray):
173 | testx = pd.Series(testx)
174 | else:
175 | #### There is no way to transform dataframes since you will get a nested renamer error if you try ###
176 | ### But if it is a one-dimensional dataframe, convert it into a Series
177 | if testx.shape[1] == 1:
178 | testx = pd.Series(testx.values.ravel(),name=testx.columns[0])
179 | else:
180 |                 #### Since it is multi-dimensional, just return the data as is
181 | return testx, y
182 | ### now convert the input to transformer dictionary values
183 | new_ins = np.unique(testx.factorize()[1]).tolist()
184 | missing = [x for x in new_ins if x not in self.transformer.keys()]
185 | if len(missing) > 0:
186 | for each_missing in missing:
187 | self.transformer[each_missing] = int(self.max_val + 1)
188 | self.inverse_transformer[int(self.max_val+1)] = each_missing
189 | self.max_val = int(self.max_val+1)
190 | else:
191 | self.max_val = np.max(list(self.transformer.values()))
192 | outs = testx.map(self.transformer).values
193 | testk = testx.map(self.transformer)
194 | if testx.dtype not in [np.int16, np.int32, np.int64, float, bool, object]:
195 | if testx.isnull().sum().sum() > 0:
196 | fillval = self.transformer[np.nan]
197 | testk = testk.cat.add_categories([fillval])
198 | testk = testk.fillna(fillval)
199 | testk = testk.astype(int)
200 | return testk, y
201 | else:
202 | testk = testk.astype(int)
203 | return testk, y
204 | else:
205 | return np.c_[outs,np.zeros(shape=outs.shape)].astype(int)
206 |
207 | def inverse_transform(self, testx, y=None):
208 | ### now convert the input to transformer dictionary values
209 | if isinstance(testx, pd.Series):
210 | outs = testx.map(self.inverse_transformer).values
211 | elif isinstance(testx, np.ndarray):
212 | outs = pd.Series(testx).map(self.inverse_transformer).values
213 | else:
214 | outs = testx[:]
215 | return outs
216 | #################################################################################
217 |
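218 | #################################################################################
219 | # A minimal usage sketch (not part of the library API): the column values below are
220 | # illustrative only. It shows how NaN's and previously unseen categories get codes.
221 | if __name__ == "__main__":
222 |     train_col = pd.Series(['red', 'blue', np.nan, 'green', 'blue'])
223 |     test_col = pd.Series(['blue', 'purple', np.nan])   ## 'purple' was never seen during fit
224 |     le = My_LabelEncoder()
225 |     print(le.fit_transform(train_col))   ## e.g. [ 2  0 -1  1  0] - NaN maps to -1
226 |     print(le.transform(test_col))        ## the unseen 'purple' gets a brand new code
227 |     ## The Pipe variant adds a dummy second column so its output is 2-D:
228 |     print(My_LabelEncoder_Pipe().fit_transform(train_col).shape)   ## (5, 2)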
--------------------------------------------------------------------------------
/auto_ts/utils/val.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt # type: ignore
2 | import numpy as np # type: ignore
3 | import pandas as pd # type: ignore
4 | import seaborn as sns # type: ignore
5 |
6 | # This gives an error when running from a python script.
7 | # Maybe, this should be set in the jupyter notebook directly.
8 | # get_ipython().magic('matplotlib inline')
9 | sns.set(style="white", color_codes=True)
10 |
11 | from sklearn.model_selection import TimeSeriesSplit # type: ignore
12 | from sklearn.model_selection import GridSearchCV # type: ignore
13 | from .metrics import print_rmse  # type: ignore
14 | #########################################################
15 | def cross_validation_time_series(model, df, preds, target,n_times=10,verbose=0):
16 | """
17 | This splits a time series data frame "n" times as specified in the input (default=10)
18 | Initially it will start with a certain number of rows in train but it will gradually
19 | increase train size in steps (which it will calculate automatically) while the
20 | number of test rows will remain the same (though their content will vary).
21 |     This utility is based on sklearn's TimeSeriesSplit().
22 | """
23 | if n_times > 10:
24 | print('More than 10 splits is not recommended. Setting n_times to 10')
25 | n_times = 10
26 | splits = TimeSeriesSplit(n_splits=n_times)
27 | index = 0
28 | X = df[preds].values
29 | y = df[target].values
30 | non_df = {}
31 | rmse_list = []
32 | for train_index, test_index in splits.split(X):
33 | X_train = X[train_index]
34 | y_train = y[train_index]
35 | X_test = X[test_index]
36 | y_test = y[test_index]
37 | if verbose == 1:
38 | print('Iteration %d: Total Observations = %d' %(index,len(X_train)+len(X_test)))
39 | print(' Training Index %d Observations: %s' %(len(train_index),train_index))
40 | print(' Testing Index %d Observations: %s' %(len(test_index),test_index))
41 | model.fit(X_train, y_train)
42 |         # print_rmse comes from .metrics (imported at the top of this file)
43 | rmse = print_rmse(y_test, model.predict(X_test))
44 | rmse_list.append(rmse)
45 | norm_rmse = rmse/y_test.std()
46 |         print('    Split %d: Normalized RMSE = %0.2f' %(index, norm_rmse))
47 | non_df[index] = norm_rmse
48 | index += 1
49 | non_df = pd.Series(non_df)
50 | non_df.plot()
51 | ave_norm_rmse = np.mean(rmse_list)/y.std()
52 |     print('Normalized RMSE over entire data after %d splits = %0.2f' %(index,ave_norm_rmse))
53 | return ave_norm_rmse
54 | ##########################################################
55 | def rolling_validation_time_series(model, df, preds, target,train_size=0,
56 | test_size=0, verbose=0):
57 | """
58 | This utility uses a Walk Forward or Rolling Period time series cross validation method.
59 | Initially it will start with a minimum number of observations to train the model.
60 | It then gradually increases the train size in steps (which it will calculate automatically)
61 |     while keeping the number of test rows the same (though their content will vary).
62 | Once the train+test series exceeds the number of rows in data set, it stops.
63 | It does not use SKLearn's Time Series Split. You need to provide the initial sizes
64 | of train and test and it will take care of the rest.
65 | """
66 | df = df[:]
67 | index = 0
68 | X = df[preds].values
69 | y = df[target].values
70 | non_df = {}
71 | # rmse_list = [] # # TODO: Unused (check)
72 |     if train_size == 0:
73 |         train_size = int(np.ceil(len(y)/2))  ## the np.int alias was removed in newer NumPy
74 |     if test_size == 0:
75 |         test_size = int(np.ceil(len(y)/4))
76 | # step_size = np.int(np.ceil(test_size/10)) # TODO: Unused (check)
77 | n_records = len(X)
78 | ### This contains the start point of test size for each K-Fold in time series
79 | test_list = np.floor(np.linspace(train_size,n_records-1,5)).tolist()
80 | for i in range(4):
81 |         train_size = int(test_list[i])
82 |         test_size = int(test_list[i+1] - test_list[i])
83 | X_train, X_test = X[:train_size],X[train_size:train_size+test_size]
84 | y_train, y_test = y[:train_size],y[train_size:train_size+test_size]
85 | model.fit(X_train, y_train)
86 | if i == 0:
87 | ### Since both start and end points are included, you have to subtract 1 from index in this
88 | df.loc[:train_size-1,'predictions'] = y[:train_size]
89 | df.loc[train_size:train_size+test_size-1,'predictions'] = model.predict(X_test)
90 | elif i == 3:
91 |             test_size = int(len(X) - train_size)
92 | X_train, X_test = X[:train_size],X[train_size:train_size+test_size]
93 | y_train, y_test = y[:train_size],y[train_size:train_size+test_size]
94 | df.loc[train_size:train_size+test_size,'predictions'] = model.predict(X_test)
95 | else:
96 | df.loc[train_size:train_size+test_size-1,'predictions'] = model.predict(X_test)
97 | if len(y_train) + len(y_test) >= df.shape[0]:
98 | if verbose:
99 | print('Iteration %d: Observations:%d' %(index+1,len(X_train)+len(X_test)))
100 | print(' Train Size=%d, Test Size=%d' %(len(y_train),len(y_test)))
101 |                 # print_rmse comes from .metrics (imported at the top of this file)
102 | rmse = print_rmse(y_test, model.predict(X_test))
103 | norm_rmse = rmse/y_test.std()
104 | non_df[i] = rmse
105 | if verbose:
106 | print('Normalized RMSE = %0.2f' %norm_rmse)
107 | non_df = pd.Series(non_df)
108 | weighted_ave_rmse = np.average(non_df.values,weights=non_df.index,axis=0)
109 | print('\nWeighted Average of RMSE (%d iterations) = %0.2f\n Normalized Wtd Aver. RMSE (using std dev) = %0.2f'
110 | %(index+1, weighted_ave_rmse,weighted_ave_rmse/y[:].std()))
111 | #############################
112 | if verbose == 1 or verbose == 2:
113 | fig, ax1 = plt.subplots(nrows=1,ncols=1,figsize=(12,8))
114 | ax1.plot(df[target],label='In-Sample Data', linestyle='-')
115 | ax1.plot(df['predictions'],'g',alpha=0.6,label='Rolling Forecast')
116 | ax1.set_xlabel('Time')
117 | ax1.set_ylabel('Values')
118 | ax1.legend(loc='best')
119 | return weighted_ave_rmse, weighted_ave_rmse/y[:].std(), df
120 | else:
121 | if verbose:
122 | print('Iteration %d: Observations:%d' %(index+1,len(X_train)+len(X_test)))
123 | print(' Train Size=%d, Test Size=%d' %(len(y_train),len(y_test)))
124 |             # print_rmse comes from .metrics (imported at the top of this file)
125 | rmse = print_rmse(y_test, model.predict(X_test))
126 | norm_rmse = rmse/y_test.std()
127 | non_df[i] = rmse
128 | if verbose:
129 | print('Normalized RMSE = %0.2f' %norm_rmse)
130 | index += 1
131 |
132 |
133 | ###################################################
134 | # Re-run the above statistical tests, and more. To be used when selecting viable models.
135 | def ts_model_validation(model_results):
136 | """
137 |     Once you have built a time series model, this utility attempts to validate it.
138 |     It only works on SARIMAX results objects from statsmodels; don't try it on other models.
139 | The input is model_results which is the variable assigned to the model.fit() method.
140 | """
141 | het_method='breakvar'
142 | norm_method='jarquebera'
143 | sercor_method='ljungbox'
144 | ########################
145 | (het_stat, het_p) = model_results.test_heteroskedasticity(het_method)[0]
146 | norm_stat, norm_p, skew, kurtosis = model_results.test_normality(norm_method)[0]
147 | sercor_stat, sercor_p = model_results.test_serial_correlation(method=sercor_method)[0]
148 | sercor_stat = sercor_stat[-1] # last number for the largest lag
149 | sercor_p = sercor_p[-1] # last number for the largest lag
150 |
151 | # Run Durbin-Watson test on the standardized residuals.
152 | # The statistic is approximately equal to 2*(1-r), where r is the sample autocorrelation of the residuals.
153 | # Thus, for r == 0, indicating no serial correlation, the test statistic equals 2.
154 | # This statistic will always be between 0 and 4. The closer to 0 the statistic,
155 | # the more evidence for positive serial correlation. The closer to 4,
156 | # the more evidence for negative serial correlation.
157 | # Essentially, below 1 or above 3 is bad.
158 |
159 |     from statsmodels.stats.stattools import durbin_watson  # type: ignore
160 |     dw = durbin_watson(model_results.filter_results.standardized_forecasts_error[0, model_results.loglikelihood_burn:])
161 |
162 | # check whether roots are outside the unit circle (we want them to be);
163 | # will be True when AR is not used (i.e., AR order = 0)
164 | arroots_outside_unit_circle = np.all(np.abs(model_results.arroots) > 1)
165 | # will be True when MA is not used (i.e., MA order = 0)
166 | maroots_outside_unit_circle = np.all(np.abs(model_results.maroots) > 1)
167 |
168 | print('Test heteroskedasticity of residuals ({}): stat={:.3f}, p={:.3f}'.format(het_method, het_stat, het_p));
169 | print('\nTest normality of residuals ({}): stat={:.3f}, p={:.3f}'.format(norm_method, norm_stat, norm_p));
170 | print('\nTest serial correlation of residuals ({}): stat={:.3f}, p={:.3f}'.format(sercor_method, sercor_stat, sercor_p));
171 | print('\nDurbin-Watson test on residuals: d={:.2f}\n\t(NB: 2 means no serial correlation, 0=pos, 4=neg)'.format(dw))
172 | print('\nTest for all AR roots outside unit circle (>1): {}'.format(arroots_outside_unit_circle))
173 | print('\nTest for all MA roots outside unit circle (>1): {}'.format(maroots_outside_unit_circle))
174 | ############################################################################################################
175 | def quick_ts_plot(y_true, y_pred, modelname='Prophet'):
176 | fig,ax = plt.subplots(figsize=(15,7))
177 | labels = ['actual','forecast']
178 | y_true.plot(ax=ax,)
179 | y_pred.plot(ax=ax,)
180 | ax.legend(labels)
181 | plt.title('%s: Actual vs Forecast in expanding (training) window Cross Validation' %modelname, fontsize=20);
182 | ##############################################################################################
183 |
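184 | #################################################################################
185 | # A minimal usage sketch (not part of the library API): a hypothetical toy dataframe
186 | # with one predictor, used only to show how cross_validation_time_series is called.
187 | if __name__ == "__main__":
188 |     from sklearn.linear_model import LinearRegression  # type: ignore
189 |     rng = np.random.default_rng(0)
190 |     demo_df = pd.DataFrame({'x1': np.arange(100.0)})
191 |     demo_df['y'] = 2.0 * demo_df['x1'] + rng.normal(0.0, 5.0, size=100)
192 |     ## Expanding-window CV with 5 splits; returns the average normalized RMSE
193 |     ave_nrmse = cross_validation_time_series(LinearRegression(), demo_df, ['x1'], 'y', n_times=5)
194 |     print('Average normalized RMSE = %0.2f' % ave_nrmse)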
--------------------------------------------------------------------------------
/cloud_run.txt:
--------------------------------------------------------------------------------
1 | # This workflow will deploy source code on Cloud Run when a commit is pushed to the "master" branch
2 | #
3 | # Overview:
4 | #
5 | # 1. Authenticate to Google Cloud
6 | # 2. Deploy it to Cloud Run
7 | #
8 | # To configure this workflow:
9 | #
10 | # 1. Ensure the required Google Cloud APIs are enabled:
11 | #
12 | # Cloud Run run.googleapis.com
13 | # Cloud Build cloudbuild.googleapis.com
14 | # Artifact Registry artifactregistry.googleapis.com
15 | #
16 | # 2. Create and configure Workload Identity Federation for GitHub (https://github.com/google-github-actions/auth#setting-up-workload-identity-federation)
17 | #
18 | # 3. Ensure the required IAM permissions are granted
19 | #
20 | # Cloud Run
21 | # roles/run.admin
22 | # roles/iam.serviceAccountUser (to act as the Cloud Run runtime service account)
23 | #
24 | # Cloud Build
25 | # roles/cloudbuild.builds.editor
26 | #
27 | # Cloud Storage
28 | # roles/storage.objectAdmin
29 | #
30 | # Artifact Registry
31 | # roles/artifactregistry.admin (project or repository level)
32 | #
33 | # NOTE: You should always follow the principle of least privilege when assigning IAM roles
34 | #
35 | # 4. Create GitHub secrets for WIF_PROVIDER and WIF_SERVICE_ACCOUNT
36 | #
37 | # 5. Change the values for the SERVICE and REGION environment variables (below).
38 | #
39 | # For more support on how to run this workflow, please visit https://github.com/marketplace/actions/deploy-to-cloud-run
40 | #
41 | # Further reading:
42 | # Cloud Run runtime service account - https://cloud.google.com/run/docs/securing/service-identity
43 | # Cloud Run IAM permissions - https://cloud.google.com/run/docs/deploying-source-code#permissions_required_to_deploy
44 | # Cloud Run builds from source - https://cloud.google.com/run/docs/deploying-source-code
45 | # Principle of least privilege - https://cloud.google.com/blog/products/identity-security/dont-get-pwned-practicing-the-principle-of-least-privilege
--------------------------------------------------------------------------------
/example_datasets/Sales_and_Marketing.csv:
--------------------------------------------------------------------------------
1 | Time Period,Sales,Marketing Expense
2 | 2011-01-01,397,486.64
3 | 2011-02-01,400,501.8
4 | 2011-03-01,498,437.09
5 | 2011-04-01,536,565.16
6 | 2011-05-01,596,744.15
7 | 2011-06-01,591,548.74
8 | 2011-07-01,651,650.21
9 | 2011-08-01,654,777.51
10 | 2011-09-01,509,547.11
11 | 2011-10-01,437,382.81
12 | 2011-11-01,406,551.56
13 | 2011-12-01,470,401.69
14 | 2012-01-01,428,370.97
15 | 2012-02-01,423,318.39
16 | 2012-03-01,507,477.39
17 | 2012-04-01,536,418.66
18 | 2012-05-01,610,429.68
19 | 2012-06-01,609,713.24
20 | 2012-07-01,687,658.22
21 | 2012-08-01,707,800.52
22 | 2012-09-01,509,640.45
23 | 2012-10-01,452,606.49
24 | 2012-11-01,412,426.88
25 | 2012-12-01,472,513.48
26 | 2013-01-01,454,300.29
27 | 2013-02-01,455,330.84
28 | 2013-03-01,568,444.04
29 | 2013-04-01,610,628.82
30 | 2013-05-01,706,620.36
31 | 2013-06-01,661,682.6
32 | 2013-07-01,767,684.64
33 | 2013-08-01,783,748.47
34 | 2013-09-01,583,668.46
35 | 2013-10-01,513,499.31
36 | 2013-11-01,481,401.92
37 | 2013-12-01,567,605.06
38 | 2014-01-01,525,429.73
39 | 2014-02-01,520,602.86
40 | 2014-03-01,587,596.15
41 | 2014-04-01,710,619.39
42 | 2014-05-01,793,758.31
43 | 2014-06-01,749,980.16
44 | 2014-07-01,871,905.1
45 | 2014-08-01,848,784.62
46 | 2014-09-01,640,718.98
47 | 2014-10-01,581,570.3
48 | 2014-11-01,519,527.6
49 | 2014-12-01,605,559.75
50 |
--------------------------------------------------------------------------------
/example_datasets/ts_2.csv:
--------------------------------------------------------------------------------
1 | DATE,UMCSENT
2 | 1978-01-01,83.7
3 | 1978-02-01,84.3
4 | 1978-03-01,78.8
5 | 1978-04-01,81.6
6 | 1978-05-01,82.9
7 | 1978-06-01,80.0
8 | 1978-07-01,82.4
9 | 1978-08-01,78.4
10 | 1978-09-01,80.4
11 | 1978-10-01,79.3
12 | 1978-11-01,75.0
13 | 1978-12-01,66.1
14 | 1979-01-01,72.1
15 | 1979-02-01,73.9
16 | 1979-03-01,68.4
17 | 1979-04-01,66.0
18 | 1979-05-01,68.1
19 | 1979-06-01,65.8
20 | 1979-07-01,60.4
21 | 1979-08-01,64.5
22 | 1979-09-01,66.7
23 | 1979-10-01,62.1
24 | 1979-11-01,63.3
25 | 1979-12-01,61.0
26 | 1980-01-01,67.0
27 | 1980-02-01,66.9
28 | 1980-03-01,56.5
29 | 1980-04-01,52.7
30 | 1980-05-01,51.7
31 | 1980-06-01,58.7
32 | 1980-07-01,62.3
33 | 1980-08-01,67.3
34 | 1980-09-01,73.7
35 | 1980-10-01,75.0
36 | 1980-11-01,76.7
37 | 1980-12-01,64.5
38 | 1981-01-01,71.4
39 | 1981-02-01,66.9
40 | 1981-03-01,66.5
41 | 1981-04-01,72.4
42 | 1981-05-01,76.3
43 | 1981-06-01,73.1
44 | 1981-07-01,74.1
45 | 1981-08-01,77.2
46 | 1981-09-01,73.1
47 | 1981-10-01,70.3
48 | 1981-11-01,62.5
49 | 1981-12-01,64.3
50 | 1982-01-01,71.0
51 | 1982-02-01,66.5
52 | 1982-03-01,62.0
53 | 1982-04-01,65.5
54 | 1982-05-01,67.5
55 | 1982-06-01,65.7
56 | 1982-07-01,65.4
57 | 1982-08-01,65.4
58 | 1982-09-01,69.3
59 | 1982-10-01,73.4
60 | 1982-11-01,72.1
61 | 1982-12-01,71.9
62 | 1983-01-01,70.4
63 | 1983-02-01,74.6
64 | 1983-03-01,80.8
65 | 1983-04-01,89.1
66 | 1983-05-01,93.3
67 | 1983-06-01,92.2
68 | 1983-07-01,92.8
69 | 1983-08-01,90.9
70 | 1983-09-01,89.9
71 | 1983-10-01,89.3
72 | 1983-11-01,91.1
73 | 1983-12-01,94.2
74 | 1984-01-01,100.1
75 | 1984-02-01,97.4
76 | 1984-03-01,101.0
77 | 1984-04-01,96.1
78 | 1984-05-01,98.1
79 | 1984-06-01,95.5
80 | 1984-07-01,96.6
81 | 1984-08-01,99.1
82 | 1984-09-01,100.9
83 | 1984-10-01,96.3
84 | 1984-11-01,95.7
85 | 1984-12-01,92.9
86 | 1985-01-01,96.0
87 | 1985-02-01,93.7
88 | 1985-03-01,93.7
89 | 1985-04-01,94.6
90 | 1985-05-01,91.8
91 | 1985-06-01,96.5
92 | 1985-07-01,94.0
93 | 1985-08-01,92.4
94 | 1985-09-01,92.1
95 | 1985-10-01,88.4
96 | 1985-11-01,90.9
97 | 1985-12-01,93.9
98 | 1986-01-01,95.6
99 | 1986-02-01,95.9
100 | 1986-03-01,95.1
101 | 1986-04-01,96.2
102 | 1986-05-01,94.8
103 | 1986-06-01,99.3
104 | 1986-07-01,97.7
105 | 1986-08-01,94.9
106 | 1986-09-01,91.9
107 | 1986-10-01,95.6
108 | 1986-11-01,91.4
109 | 1986-12-01,89.1
110 | 1987-01-01,90.4
111 | 1987-02-01,90.2
112 | 1987-03-01,90.8
113 | 1987-04-01,92.8
114 | 1987-05-01,91.1
115 | 1987-06-01,91.5
116 | 1987-07-01,93.7
117 | 1987-08-01,94.4
118 | 1987-09-01,93.6
119 | 1987-10-01,89.3
120 | 1987-11-01,83.1
121 | 1987-12-01,86.8
122 | 1988-01-01,90.8
123 | 1988-02-01,91.6
124 | 1988-03-01,94.6
125 | 1988-04-01,91.2
126 | 1988-05-01,94.8
127 | 1988-06-01,94.7
128 | 1988-07-01,93.4
129 | 1988-08-01,97.4
130 | 1988-09-01,97.3
131 | 1988-10-01,94.1
132 | 1988-11-01,93.0
133 | 1988-12-01,91.9
134 | 1989-01-01,97.9
135 | 1989-02-01,95.4
136 | 1989-03-01,94.3
137 | 1989-04-01,91.5
138 | 1989-05-01,90.7
139 | 1989-06-01,90.6
140 | 1989-07-01,92.0
141 | 1989-08-01,89.6
142 | 1989-09-01,95.8
143 | 1989-10-01,93.9
144 | 1989-11-01,90.9
145 | 1989-12-01,90.5
146 | 1990-01-01,93.0
147 | 1990-02-01,89.5
148 | 1990-03-01,91.3
149 | 1990-04-01,93.9
150 | 1990-05-01,90.6
151 | 1990-06-01,88.3
152 | 1990-07-01,88.2
153 | 1990-08-01,76.4
154 | 1990-09-01,72.8
155 | 1990-10-01,63.9
156 | 1990-11-01,66.0
157 | 1990-12-01,65.5
158 | 1991-01-01,66.8
159 | 1991-02-01,70.4
160 | 1991-03-01,87.7
161 | 1991-04-01,81.8
162 | 1991-05-01,78.3
163 | 1991-06-01,82.1
164 | 1991-07-01,82.9
165 | 1991-08-01,82.0
166 | 1991-09-01,83.0
167 | 1991-10-01,78.3
168 | 1991-11-01,69.1
169 | 1991-12-01,68.2
170 | 1992-01-01,67.5
171 | 1992-02-01,68.8
172 | 1992-03-01,76.0
173 | 1992-04-01,77.2
174 | 1992-05-01,79.2
175 | 1992-06-01,80.4
176 | 1992-07-01,76.6
177 | 1992-08-01,76.1
178 | 1992-09-01,75.6
179 | 1992-10-01,73.3
180 | 1992-11-01,85.3
181 | 1992-12-01,91.0
182 | 1993-01-01,89.3
183 | 1993-02-01,86.6
184 | 1993-03-01,85.9
185 | 1993-04-01,85.6
186 | 1993-05-01,80.3
187 | 1993-06-01,81.5
188 | 1993-07-01,77.0
189 | 1993-08-01,77.3
190 | 1993-09-01,77.9
191 | 1993-10-01,82.7
192 | 1993-11-01,81.2
193 | 1993-12-01,88.2
194 | 1994-01-01,94.3
195 | 1994-02-01,93.2
196 | 1994-03-01,91.5
197 | 1994-04-01,92.6
198 | 1994-05-01,92.8
199 | 1994-06-01,91.2
200 | 1994-07-01,89.0
201 | 1994-08-01,91.7
202 | 1994-09-01,91.5
203 | 1994-10-01,92.7
204 | 1994-11-01,91.6
205 | 1994-12-01,95.1
206 | 1995-01-01,97.6
207 | 1995-02-01,95.1
208 | 1995-03-01,90.3
209 | 1995-04-01,92.5
210 | 1995-05-01,89.8
211 | 1995-06-01,92.7
212 | 1995-07-01,94.4
213 | 1995-08-01,96.2
214 | 1995-09-01,88.9
215 | 1995-10-01,90.2
216 | 1995-11-01,88.2
217 | 1995-12-01,91.0
218 | 1996-01-01,89.3
219 | 1996-02-01,88.5
220 | 1996-03-01,93.7
221 | 1996-04-01,92.7
222 | 1996-05-01,89.4
223 | 1996-06-01,92.4
224 | 1996-07-01,94.7
225 | 1996-08-01,95.3
226 | 1996-09-01,94.7
227 | 1996-10-01,96.5
228 | 1996-11-01,99.2
229 | 1996-12-01,96.9
230 | 1997-01-01,97.4
231 | 1997-02-01,99.7
232 | 1997-03-01,100.0
233 | 1997-04-01,101.4
234 | 1997-05-01,103.2
235 | 1997-06-01,104.5
236 | 1997-07-01,107.1
237 | 1997-08-01,104.4
238 | 1997-09-01,106.0
239 | 1997-10-01,105.6
240 | 1997-11-01,107.2
241 | 1997-12-01,102.1
242 | 1998-01-01,106.6
243 | 1998-02-01,110.4
244 | 1998-03-01,106.5
245 | 1998-04-01,108.7
246 | 1998-05-01,106.5
247 | 1998-06-01,105.6
248 | 1998-07-01,105.2
249 | 1998-08-01,104.4
250 | 1998-09-01,100.9
251 | 1998-10-01,97.4
252 | 1998-11-01,102.7
253 | 1998-12-01,100.5
254 | 1999-01-01,103.9
255 | 1999-02-01,108.1
256 | 1999-03-01,105.7
257 | 1999-04-01,104.6
258 | 1999-05-01,106.8
259 | 1999-06-01,107.3
260 | 1999-07-01,106.0
261 | 1999-08-01,104.5
262 | 1999-09-01,107.2
263 | 1999-10-01,103.2
264 | 1999-11-01,107.2
265 | 1999-12-01,105.4
266 | 2000-01-01,112.0
267 | 2000-02-01,111.3
268 | 2000-03-01,107.1
269 | 2000-04-01,109.2
270 | 2000-05-01,110.7
271 | 2000-06-01,106.4
272 | 2000-07-01,108.3
273 | 2000-08-01,107.3
274 | 2000-09-01,106.8
275 | 2000-10-01,105.8
276 | 2000-11-01,107.6
277 | 2000-12-01,98.4
278 | 2001-01-01,94.7
279 | 2001-02-01,90.6
280 | 2001-03-01,91.5
281 | 2001-04-01,88.4
282 | 2001-05-01,92.0
283 | 2001-06-01,92.6
284 | 2001-07-01,92.4
285 | 2001-08-01,91.5
286 | 2001-09-01,81.8
287 | 2001-10-01,82.7
288 | 2001-11-01,83.9
289 | 2001-12-01,88.8
290 | 2002-01-01,93.0
291 | 2002-02-01,90.7
292 | 2002-03-01,95.7
293 | 2002-04-01,93.0
294 | 2002-05-01,96.9
295 | 2002-06-01,92.4
296 | 2002-07-01,88.1
297 | 2002-08-01,87.6
298 | 2002-09-01,86.1
299 | 2002-10-01,80.6
300 | 2002-11-01,84.2
301 | 2002-12-01,86.7
302 | 2003-01-01,82.4
303 | 2003-02-01,79.9
304 | 2003-03-01,77.6
305 | 2003-04-01,86.0
306 | 2003-05-01,92.1
307 | 2003-06-01,89.7
308 | 2003-07-01,90.9
309 | 2003-08-01,89.3
310 | 2003-09-01,87.7
311 | 2003-10-01,89.6
312 | 2003-11-01,93.7
313 | 2003-12-01,92.6
314 | 2004-01-01,103.8
315 | 2004-02-01,94.4
316 | 2004-03-01,95.8
317 | 2004-04-01,94.2
318 | 2004-05-01,90.2
319 | 2004-06-01,95.6
320 | 2004-07-01,96.7
321 | 2004-08-01,95.9
322 | 2004-09-01,94.2
323 | 2004-10-01,91.7
324 | 2004-11-01,92.8
325 | 2004-12-01,97.1
326 | 2005-01-01,95.5
327 | 2005-02-01,94.1
328 | 2005-03-01,92.6
329 | 2005-04-01,87.7
330 | 2005-05-01,86.9
331 | 2005-06-01,96.0
332 | 2005-07-01,96.5
333 | 2005-08-01,89.1
334 | 2005-09-01,76.9
335 | 2005-10-01,74.2
336 | 2005-11-01,81.6
337 | 2005-12-01,91.5
338 | 2006-01-01,91.2
339 | 2006-02-01,86.7
340 | 2006-03-01,88.9
341 | 2006-04-01,87.4
342 | 2006-05-01,79.1
343 | 2006-06-01,84.9
344 | 2006-07-01,84.7
345 | 2006-08-01,82.0
346 | 2006-09-01,85.4
347 | 2006-10-01,93.6
348 | 2006-11-01,92.1
349 | 2006-12-01,91.7
350 | 2007-01-01,96.9
351 | 2007-02-01,91.3
352 | 2007-03-01,88.4
353 | 2007-04-01,87.1
354 | 2007-05-01,88.3
355 | 2007-06-01,85.3
356 | 2007-07-01,90.4
357 | 2007-08-01,83.4
358 | 2007-09-01,83.4
359 | 2007-10-01,80.9
360 | 2007-11-01,76.1
361 | 2007-12-01,75.5
362 | 2008-01-01,78.4
363 | 2008-02-01,70.8
364 | 2008-03-01,69.5
365 | 2008-04-01,62.6
366 | 2008-05-01,59.8
367 | 2008-06-01,56.4
368 | 2008-07-01,61.2
369 | 2008-08-01,63.0
370 | 2008-09-01,70.3
371 | 2008-10-01,57.6
372 | 2008-11-01,55.3
373 | 2008-12-01,60.1
374 | 2009-01-01,61.2
375 | 2009-02-01,56.3
376 | 2009-03-01,57.3
377 | 2009-04-01,65.1
378 | 2009-05-01,68.7
379 | 2009-06-01,70.8
380 | 2009-07-01,66.0
381 | 2009-08-01,65.7
382 | 2009-09-01,73.5
383 | 2009-10-01,70.6
384 | 2009-11-01,67.4
385 | 2009-12-01,72.5
386 | 2010-01-01,74.4
387 | 2010-02-01,73.6
388 | 2010-03-01,73.6
389 | 2010-04-01,72.2
390 | 2010-05-01,73.6
391 | 2010-06-01,76.0
392 | 2010-07-01,67.8
393 | 2010-08-01,68.9
394 | 2010-09-01,68.2
395 | 2010-10-01,67.7
396 | 2010-11-01,71.6
397 | 2010-12-01,74.5
398 | 2011-01-01,74.2
399 | 2011-02-01,77.5
400 | 2011-03-01,67.5
401 | 2011-04-01,69.8
402 | 2011-05-01,74.3
403 | 2011-06-01,71.5
404 | 2011-07-01,63.7
405 | 2011-08-01,55.8
406 | 2011-09-01,59.5
407 | 2011-10-01,60.8
408 | 2011-11-01,63.7
409 | 2011-12-01,69.9
410 | 2012-01-01,75.0
411 | 2012-02-01,75.3
412 | 2012-03-01,76.2
413 | 2012-04-01,76.4
414 | 2012-05-01,79.3
415 | 2012-06-01,73.2
416 | 2012-07-01,72.3
417 | 2012-08-01,74.3
418 | 2012-09-01,78.3
419 | 2012-10-01,82.6
420 | 2012-11-01,82.7
421 | 2012-12-01,72.9
422 | 2013-01-01,73.8
423 | 2013-02-01,77.6
424 | 2013-03-01,78.6
425 | 2013-04-01,76.4
426 | 2013-05-01,84.5
427 | 2013-06-01,84.1
428 | 2013-07-01,85.1
429 | 2013-08-01,82.1
430 | 2013-09-01,77.5
431 | 2013-10-01,73.2
432 | 2013-11-01,75.1
433 | 2013-12-01,82.5
434 | 2014-01-01,81.2
435 | 2014-02-01,81.6
436 | 2014-03-01,80.0
437 | 2014-04-01,84.1
438 | 2014-05-01,81.9
439 | 2014-06-01,82.5
440 | 2014-07-01,81.8
441 | 2014-08-01,82.5
442 | 2014-09-01,84.6
443 | 2014-10-01,86.9
444 | 2014-11-01,88.8
445 | 2014-12-01,93.6
446 | 2015-01-01,98.1
447 | 2015-02-01,95.4
448 | 2015-03-01,93.0
449 | 2015-04-01,95.9
450 | 2015-05-01,90.7
451 | 2015-06-01,96.1
452 | 2015-07-01,93.1
453 | 2015-08-01,91.9
454 | 2015-09-01,87.2
455 | 2015-10-01,90.0
456 | 2015-11-01,91.3
457 | 2015-12-01,92.6
458 | 2016-01-01,92
459 | 2016-02-01,91.7
460 | 2016-03-01,91
461 | 2016-04-01,89
462 | 2016-05-01,94.7
463 | 2016-06-01,93.5
464 | 2016-07-01,90
465 | 2016-08-01,89.8
466 | 2016-09-01,91.2
467 | 2016-10-01,87.2
468 | 2016-11-01,93.8
469 | 2016-12-01,98.2
470 | 2017-01-01,98.5
471 | 2017-02-01,96.3
472 | 2017-03-01,96.9
473 | 2017-04-01,97
474 | 2017-05-01,97.1
475 | 2017-06-01,95
476 | 2017-07-01,93.4
477 | 2017-08-01,96.8
478 | 2017-09-01,95.1
479 | 2017-10-01,100.7
480 | 2017-11-01,98.5
481 | 2017-12-01,95.9
482 | 2018-01-01,95.7
483 | 2018-02-01,99.7
484 | 2018-03-01,101.4
485 | 2018-04-01,98.8
486 | 2018-05-01,98
487 | 2018-06-01,98.2
488 |
--------------------------------------------------------------------------------
/images/add_fb_prophet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/images/add_fb_prophet.png
--------------------------------------------------------------------------------
/images/install_auto_ts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/images/install_auto_ts.png
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/Auto_TS/3d4193b5bfbee1d4834224e9451a33e036894d5d/images/logo.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Library dependencies for the python code. You need to install these with
2 | # `pip install -U -r requirements.txt` before you can run this.
3 | 
4 | # Optional conda setup (fill in your own environment and kernel names):
5 | # conda create -n <env_name> python=3.6   # or 3.7
6 | # source activate <env_name>
7 | # pip install -U -r requirements.txt
8 | # python -m ipykernel install --user --name <env_name> --display-name "<display_name>"
9 |
10 | # Base libraries
11 | numpy
12 | pandas
13 | xlrd
14 | scipy
15 | prettytable
16 | xgboost>=2.0.0 # with GPU support
17 | GPUtil
18 | dask>=2022.2.0
19 | distributed>=2022.2.0
20 | GPUtil>=1.4.0
21 | pyyaml>=5.4.1
22 |
23 | # Viz libs
24 | matplotlib
25 | seaborn
26 |
27 | # Stats libraries
28 | scikit-learn>=0.24.0
29 | statsmodels
30 |
31 | # Auto-Arima
32 | pmdarima
33 |
34 | # Facebook Prophet
35 | prophet
36 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 | long_description = fh.read()
5 |
6 | setuptools.setup(
7 | name="auto_ts",
8 | version="0.0.92",
9 | author="Ram Seshadri",
10 | # author_email="author@example.com",
11 | description="Automatically Build Multiple Time Series models fast - now with Facebook Prophet!",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | license='Apache License 2.0',
15 | url="https://github.com/AutoViML/Auto_TS",
16 |     packages=setuptools.find_packages(exclude=("auto_ts.test",)),
17 | install_requires=[
18 | "ipython",
19 | "jupyter",
20 | "pmdarima",
21 | "numpy",
22 | "xlrd",
23 | "pandas",
24 | "matplotlib",
25 | "seaborn",
26 | "prophet",
27 | "scikit-learn>=0.24.0",
28 | "statsmodels",
29 | "xgboost>=2.0",
30 | "prettytable",
31 | "dask>=2022.1.0",
32 | "pyyaml>=5.4.1",
33 | "GPUtil>=1.4.0",
34 | "distributed>=2022.2.0",
35 | ],
36 | classifiers=[
37 | "Programming Language :: Python :: 3",
38 | "Operating System :: OS Independent",
39 | ],
40 | )
41 |
--------------------------------------------------------------------------------
/updates.md:
--------------------------------------------------------------------------------
1 | Latest updates about the Auto_TS library:
2 |
3 | January 2024 Update:
4 | We have now added `XGBoost with GPU` support to Auto_TS. Auto_TS will automatically detect whether there is a GPU in your Kaggle kernel or on your local machine and will run XGBoost with GPU support. We hope this speeds up your computations!
5 |
6 |
November 2023 Update:
7 | We have now added `Google Cloud Run` support to Auto_TS. You can simply follow the instructions on this page to deploy Auto_TS models on Google Cloud Run. Many thanks to abdulrahman305 for providing the Pull Request that added this functionality to Auto_TS.
8 |
9 |
March 2023 Update:
10 | We have now upgraded `FB Prophet` to the latest version which is simply called `prophet`.
11 |
12 |
Aug 2022 Update:
13 | You can now pass FB Prophet arguments directly into Auto_TimeSeries using the kwargs argument. See the example below; a rough sketch is also given at the bottom of this page:
14 |
15 | 
16 |
17 |
Jan 2022 Update:
18 | New since version 0.0.35: You can now load your file into a Dask dataframe automatically. Just provide the name of your file; if it is too large to fit into a pandas dataframe, Auto_TS will automatically detect this and load it into a Dask dataframe instead (see the rough sketch at the bottom of this page).
19 |
20 |
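21 | Below is a rough, hypothetical sketch of the two items above. It assumes the `auto_timeseries` entry point shown in the README and notebooks, and that extra keyword arguments are forwarded to FB Prophet as described in the Aug 2022 note (the screenshot above shows the exact syntax). It uses the `Sales_and_Marketing.csv` example dataset shipped with this repo:
22 | 
23 | ```python
24 | from auto_ts import auto_timeseries
25 | 
26 | # Hypothetical sketch only - argument names follow the README/notebook examples.
27 | model = auto_timeseries(
28 |     score_type='rmse', time_interval='Month', model_type=['Prophet'],
29 |     seasonality_mode='multiplicative',  # extra kwargs are passed through to FB Prophet
30 | )
31 | # Passing a file name instead of a dataframe lets Auto_TS decide automatically
32 | # whether to load it into a pandas or a Dask dataframe (Jan 2022 update above).
33 | model.fit(traindata='example_datasets/Sales_and_Marketing.csv',
34 |           ts_column='Time Period', target='Sales', cv=5)
35 | ```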
--------------------------------------------------------------------------------