├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── DATABASE.md ├── DATA_FORMAT.md ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docker ├── .dockerignore ├── Dockerfile ├── README.md └── greenguard-deployment.yml ├── docs ├── Makefile ├── advanced_usage │ ├── concepts.md │ ├── csv.md │ └── docker.md ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── images │ ├── Draco-200.png │ ├── Draco.ico │ ├── Draco.png │ ├── dai-logo.png │ └── favicon.ico ├── index.rst ├── make.bat └── readme.rst ├── draco ├── __init__.py ├── benchmark.py ├── db.py ├── demo.py ├── loaders │ ├── __init__.py │ └── csv.py ├── metrics.py ├── pipeline.py ├── pipelines │ ├── double_lstm │ │ ├── double_lstm.json │ │ ├── double_lstm_prob.json │ │ ├── double_lstm_prob_with_unstack.json │ │ └── double_lstm_with_unstack.json │ ├── dummy │ │ └── dummy.json │ ├── lstm │ │ ├── lstm.json │ │ ├── lstm_prob.json │ │ ├── lstm_prob_with_unstack.json │ │ └── lstm_with_unstack.json │ └── lstm_regressor │ │ ├── lstm_regressor.json │ │ └── lstm_regressor_with_unstack.json ├── primitives │ ├── mlblocks.MLPipeline.json │ ├── numpy.take.json │ └── xgboost.XGBClassifier:probabilities.json ├── results.py ├── targets.py └── utils.py ├── setup.cfg ├── setup.py ├── tests ├── test_benchmark.py ├── test_metrics.py └── test_pipeline.py ├── tox.ini └── tutorials ├── 01_Draco_Machine_Learning.ipynb ├── 02_Extract_Readings.ipynb ├── 03_Benchmarking.ipynb ├── 04_Draco_Regression_Pipeline.ipynb ├── Convert NASA CMAPSS to Draco Format.ipynb └── pipelines ├── double_lstm_with_unstack.ipynb ├── lstm_regressor_with_unstack.ipynb └── lstm_with_unstack.ipynb /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.py] 14 | max_line_length = 99 15 | 16 | [*.bat] 17 | indent_style = tab 18 | end_of_line = crlf 19 | 20 | [LICENSE] 21 | insert_final_newline = false 22 | 23 | [Makefile] 24 | indent_style = tab 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * Draco version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 
15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate Docs 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Python 15 | uses: actions/setup-python@v1 16 | with: 17 | python-version: '3.7' 18 | 19 | - name: Build 20 | run: | 21 | sudo apt install pandoc 22 | python -m pip install --upgrade pip 23 | pip install -e .[dev] 24 | make docs 25 | - name: Deploy 26 | uses: peaceiris/actions-gh-pages@v3 27 | with: 28 | github_token: ${{secrets.GITHUB_TOKEN}} 29 | publish_dir: docs/_build/html 30 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | branches: [ '*' ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | docs: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | python-version: [3.8] 15 | os: [ubuntu-latest] 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install package 23 | run: python -m pip install .[dev] 24 | - name: make docs 25 | run: make docs 26 | 27 | lint: 28 | runs-on: ${{ matrix.os }} 29 | strategy: 30 | matrix: 31 | python-version: [3.6, 3.7, 3.8] 32 | os: [ubuntu-20.04] 33 | steps: 34 | - uses: actions/checkout@v1 35 | - name: Set up Python ${{ matrix.python-version }} 36 | uses: actions/setup-python@v2 37 | with: 38 | python-version: ${{ matrix.python-version }} 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install .[dev] 43 | - name: make lint 44 | run: make lint 45 | 46 | readme: 47 | runs-on: ${{ matrix.os }} 48 | strategy: 49 | matrix: 50 | python-version: [3.6, 3.7, 3.8] 51 | os: [ubuntu-20.04] 52 | steps: 53 | - uses: actions/checkout@v1 54 | - name: Set up Python ${{ matrix.python-version }} 55 | uses: actions/setup-python@v2 56 | with: 57 | python-version: ${{ matrix.python-version }} 58 | - name: Install libgomp1 59 | run: | 60 | sudo apt-get install libgomp1 61 | - name: Install dependencies 62 | run: | 63 | python -m pip install --upgrade pip 64 | pip install rundoc . 
65 | - name: make readme 66 | run: make test-readme 67 | 68 | unit: 69 | runs-on: ${{ matrix.os }} 70 | strategy: 71 | matrix: 72 | python-version: [3.6, 3.7, 3.8] 73 | os: [ubuntu-20.04, macos-latest] 74 | steps: 75 | - uses: actions/checkout@v1 76 | - name: Set up Python ${{ matrix.python-version }} 77 | uses: actions/setup-python@v2 78 | with: 79 | python-version: ${{ matrix.python-version }} 80 | - name: Install dependencies 81 | run: | 82 | python -m pip install --upgrade pip 83 | pip install .[test] 84 | - name: make unit 85 | run: make test-unit 86 | 87 | minimum: 88 | runs-on: ${{ matrix.os }} 89 | strategy: 90 | matrix: 91 | python-version: [3.6, 3.7, 3.8] 92 | os: [ubuntu-20.04] 93 | steps: 94 | - uses: actions/checkout@v1 95 | - name: Set up Python ${{ matrix.python-version }} 96 | uses: actions/setup-python@v2 97 | with: 98 | python-version: ${{ matrix.python-version }} 99 | - name: Install dependencies 100 | run: | 101 | python -m pip install --upgrade pip 102 | pip install .[test] 103 | - name: make minimum 104 | run: make test-minimum 105 | 106 | tutorials: 107 | runs-on: ${{ matrix.os }} 108 | strategy: 109 | matrix: 110 | python-version: [3.6, 3.7, 3.8] 111 | os: [ubuntu-20.04] 112 | steps: 113 | - uses: actions/checkout@v1 114 | - name: Set up Python ${{ matrix.python-version }} 115 | uses: actions/setup-python@v2 116 | with: 117 | python-version: ${{ matrix.python-version }} 118 | - name: Install dependencies 119 | run: | 120 | python -m pip install --upgrade pip 121 | pip install jupyter . 122 | - name: make tutorials 123 | run: make test-tutorials 124 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | docs/api/ 68 | docs/tutorials/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Vim 107 | .*.swp 108 | 109 | draco/demo/ 110 | notebooks/ 111 | notebooks-private/ 112 | scripts/ 113 | dask-worker-space/ 114 | tutorials/*.pkl 115 | 116 | *.pkl 117 | *.DS_Store 118 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | dist: bionic 3 | language: python 4 | python: 5 | - 3.7 6 | - 3.6 7 | 8 | # Command to install dependencies 9 | install: 10 | - sudo apt-get install pandoc 11 | - pip install -U tox-travis codecov 12 | 13 | after_success: codecov 14 | 15 | # Command to run tests 16 | script: tox 17 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Credits 2 | ======= 3 | 4 | * Carles Sala 5 | * Kalyan Veeramachaneni 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every little bit 8 | helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at the `GitHub Issues page`_. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 30 | wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | Draco could always use more documentation, whether as part of the 42 | official Draco docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 
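When contributing docstrings, follow the `Google docstrings style`_ that is also
referenced in the development guidelines below. As a purely illustrative sketch
(the function, its arguments and its behavior are made up for this example), a
documented helper could look like this::

    def resample_readings(readings, rule='1h'):
        """Resample a readings table to a fixed frequency.

        Args:
            readings (pandas.DataFrame):
                Table with ``turbine_id``, ``signal_id``, ``timestamp``
                and ``value`` columns.
            rule (str):
                pandas offset alias to resample by. Defaults to ``'1h'``.

        Returns:
            pandas.DataFrame:
                Mean ``value`` per turbine, signal and resampled timestamp.
        """
        return (
            readings
            .set_index('timestamp')
            .groupby(['turbine_id', 'signal_id'])['value']
            .resample(rule)
            .mean()
            .reset_index()
        )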
44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at the `GitHub Issues page`_. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `Draco` for local development. 61 | 62 | 1. Fork the `Draco` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/Draco.git 66 | 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, 68 | this is how you set up your fork for local development:: 69 | 70 | $ mkvirtualenv Draco 71 | $ cd Draco/ 72 | $ make install-develop 73 | 74 | 4. Create a branch for local development:: 75 | 76 | $ git checkout -b name-of-your-bugfix-or-feature 77 | 78 | Try to use the naming scheme of prefixing your branch with ``gh-X`` where X is 79 | the associated issue, such as ``gh-3-fix-foo-bug``. And if you are not 80 | developing on your own fork, further prefix the branch with your GitHub 81 | username, like ``githubusername/gh-3-fix-foo-bug``. 82 | 83 | Now you can make your changes locally. 84 | 85 | 5. While hacking your changes, make sure to cover all your developments with the required 86 | unit tests, and that none of the old tests fail as a consequence of your changes. 87 | For this, make sure to run the tests suite and check the code coverage:: 88 | 89 | $ make lint # Check code styling 90 | $ make test # Run the tests 91 | $ make coverage # Get the coverage report 92 | 93 | 6. When you're done making changes, check that your changes pass all the styling checks and 94 | tests, including other Python supported versions, using:: 95 | 96 | $ make test-all 97 | 98 | 7. Make also sure to include the necessary documentation in the code as docstrings following 99 | the `Google docstrings style`_. 100 | If you want to view how your documentation will look like when it is published, you can 101 | generate and view the docs with this command:: 102 | 103 | $ make view-docs 104 | 105 | 8. Commit your changes and push your branch to GitHub:: 106 | 107 | $ git add . 108 | $ git commit -m "Your detailed description of your changes." 109 | $ git push origin name-of-your-bugfix-or-feature 110 | 111 | 9. Submit a pull request through the GitHub website. 112 | 113 | Pull Request Guidelines 114 | ----------------------- 115 | 116 | Before you submit a pull request, check that it meets these guidelines: 117 | 118 | 1. It resolves an open GitHub Issue and contains its reference in the title or 119 | the comment. If there is no associated issue, feel free to create one. 120 | 2. Whenever possible, it resolves only **one** issue. If your PR resolves more than 121 | one issue, try to split it in more than one pull request. 122 | 3. The pull request should include unit tests that cover all the changed code 123 | 4. If the pull request adds functionality, the docs should be updated. Put 124 | your new functionality into a function with a docstring, and add the 125 | feature to the documentation in an appropriate place. 126 | 5. The pull request should work for all the supported Python versions. Check the `Travis Build 127 | Status page`_ and make sure that all the checks pass. 
128 | 129 | Unit Testing Guidelines 130 | ----------------------- 131 | 132 | All the Unit Tests should comply with the following requirements: 133 | 134 | 1. Unit Tests should be based only in unittest and pytest modules. 135 | 136 | 2. The tests that cover a module called ``draco/path/to/a_module.py`` 137 | should be implemented in a separated module called 138 | ``tests/draco/path/to/test_a_module.py``. 139 | Note that the module name has the ``test_`` prefix and is located in a path similar 140 | to the one of the tested module, just inside the ``tests`` folder. 141 | 142 | 3. Each method of the tested module should have at least one associated test method, and 143 | each test method should cover only **one** use case or scenario. 144 | 145 | 4. Test case methods should start with the ``test_`` prefix and have descriptive names 146 | that indicate which scenario they cover. 147 | Names such as ``test_some_methed_input_none``, ``test_some_method_value_error`` or 148 | ``test_some_method_timeout`` are right, but names like ``test_some_method_1``, 149 | ``some_method`` or ``test_error`` are not. 150 | 151 | 5. Each test should validate only what the code of the method being tested does, and not 152 | cover the behavior of any third party package or tool being used, which is assumed to 153 | work properly as far as it is being passed the right values. 154 | 155 | 6. Any third party tool that may have any kind of random behavior, such as some Machine 156 | Learning models, databases or Web APIs, will be mocked using the ``mock`` library, and 157 | the only thing that will be tested is that our code passes the right values to them. 158 | 159 | 7. Unit tests should not use anything from outside the test and the code being tested. This 160 | includes not reading or writing to any file system or database, which will be properly 161 | mocked. 162 | 163 | Tips 164 | ---- 165 | 166 | To run a subset of tests:: 167 | 168 | $ python -m pytest tests.test_draco 169 | $ python -m pytest -k 'foo' 170 | 171 | Release Workflow 172 | ---------------- 173 | 174 | The process of releasing a new version involves several steps combining both ``git`` and 175 | ``bumpversion`` which, briefly: 176 | 177 | 1. Merge what is in ``master`` branch into ``stable`` branch. 178 | 2. Update the version in ``setup.cfg``, ``draco/__init__.py`` and 179 | ``HISTORY.md`` files. 180 | 3. Create a new git tag pointing at the corresponding commit in ``stable`` branch. 181 | 4. Merge the new commit from ``stable`` into ``master``. 182 | 5. Update the version in ``setup.cfg`` and ``draco/__init__.py`` 183 | to open the next development iteration. 184 | 185 | .. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new 186 | entry that explains the changes that will be included in the new version. 187 | Normally this is just a list of the Pull Requests that have been merged to master 188 | since the last release. 189 | 190 | Once this is done, run of the following commands: 191 | 192 | 1. If you are releasing a patch version:: 193 | 194 | make release 195 | 196 | 2. If you are releasing a minor version:: 197 | 198 | make release-minor 199 | 200 | 3. 
If you are releasing a major version:: 201 | 202 | make release-major 203 | 204 | Release Candidates 205 | ~~~~~~~~~~~~~~~~~~ 206 | 207 | Sometimes it is necessary or convenient to upload a release candidate to PyPi as a pre-release, 208 | in order to make some of the new features available for testing on other projects before they 209 | are included in an actual full-blown release. 210 | 211 | In order to perform such an action, you can execute:: 212 | 213 | make release-candidate 214 | 215 | This will perform the following actions: 216 | 217 | 1. Build and upload the current version to PyPi as a pre-release, with the format ``X.Y.Z.devN`` 218 | 219 | 2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)`` 220 | 221 | After this is done, the new pre-release can be installed by including the ``dev`` section in the 222 | dependency specification, either in ``setup.py``:: 223 | 224 | install_requires = [ 225 | ... 226 | 'draco>=X.Y.Z.dev', 227 | ... 228 | ] 229 | 230 | or in command line:: 231 | 232 | pip install 'draco>=X.Y.Z.dev' 233 | 234 | 235 | .. _GitHub issues page: https://github.com/sintel-dev/Draco/issues 236 | .. _Travis Build Status page: https://travis-ci.org/sintel-dev/Draco/pull_requests 237 | .. _Google docstrings style: https://google.github.io/styleguide/pyguide.html?showone=Comments#Comments 238 | -------------------------------------------------------------------------------- /DATABASE.md: -------------------------------------------------------------------------------- 1 | # Database Schema 2 | 3 | The **Draco Database** contains the following collections and relationships 4 | 5 | * Farm 6 | * Trubine 7 | * Farm 8 | * Signal 9 | * Sensor 10 | * Turbine 11 | * Signal 12 | * Reading 13 | * Sensor 14 | * PipelineTemplate 15 | * Pipeline 16 | * PipelineTemplate 17 | * MLTask 18 | * Turbine - multiple 19 | * Target 20 | * MLTask 21 | * Experiment 22 | * MLTask 23 | * PipelineTemplate 24 | * Signal - multiple 25 | * ExperimenRun 26 | * Experiment 27 | * PipelineRun 28 | * Pipeline 29 | * ExperimentRun 30 | 31 | ## Farm 32 | 33 | A **Farm** represents a physical Wind Turbines Farm. This collection groups together multiple 34 | Turbines with shared properties, such as location. 35 | 36 | ### Fields 37 | 38 | * `_id (ObjectID)`: Unique Identifier of this Object 39 | * `name (String)`: Name or code given to this Object 40 | * `insert_time (DateTime)`: Time when this Object was inserted 41 | * `created_by (String)`: Identifier of the user that created this Object 42 | 43 | ## Turbine 44 | 45 | A **Turbine** represents a physical Turbine. A Turbine is part of a **Farm**, and has some 46 | particular properties, such as the Turbine manufacturer. 47 | 48 | ### Fields 49 | 50 | * `_id (ObjectID)`: Unique Identifier of this Object 51 | * `farm_id (ObjectID)`: Unique Identifier of the Farm to which this Turbine belongs 52 | * `name (String)`: Name or code given to this Object 53 | * `manufacturer (String)`: Name or code of the manufacturer - EXAMPLE 54 | * `model (String)`: Name or code of the model - EXAMPLE 55 | * `insert_time (DateTime)`: Time when this Object was inserted 56 | * `created_by (String)`: Identifier of the user that created this Object 57 | 58 | ## Signal 59 | 60 | The **Signal** collection contains the details about each Signal type. 61 | This includes shared properties of the signal, such as the sensor type or the measurement units. 
62 | 63 | ### Fields 64 | 65 | * `_id (ObjectID)`: Unique Identifier of this Object 66 | * `name (String)`: Name or code given to this Object 67 | * `type (String)`: Type of sensor - EXAMPLE 68 | * `created_by (String)`: Identifier of the user that created this Object 69 | * `insert_time (DateTime)`: Time when this Object was inserted 70 | 71 | ## Sensor 72 | 73 | A **Sensor** represents a physical sensor that is installed in a Turbine. 74 | The Sensor collection specifies the turbine and the signal type, as well as properties 75 | about the Sensor such as the Sensor manufacturer and model and its age. 76 | 77 | ### Fields 78 | 79 | * `_id (ObjectID)`: Unique Identifier of this Object 80 | * `turbine_id (ObjectID)`: Unique Identifier of the Turbine where this Sensor is installed 81 | * `signal_id (ObjectID)`: Unique Identifier of the Signal type of this Sensor 82 | * `name (String)`: Name or code given to this Object 83 | * `manufacturer (String)`: Name or code of the manufacturer - EXAMPLE 84 | * `model (String)`: Name or code of the model - EXAMPLE 85 | * `installation_date (DateTime)`: Time when this Sensor was installed - EXAMPLE 86 | * `insert_time (DateTime)`: Time when this Object was inserted 87 | * `created_by (String)`: Identifier of the user that created this Object 88 | 89 | ## Reading 90 | 91 | The **Readings** collection contains all the data readings from a Sensor. 92 | 93 | ### Fields 94 | 95 | * `_id (ObjectID)`: Unique Identifier of this Object 96 | * `sensor_id (ObjectID)`: Unique Identifier of the Sensor to which this Reading belongs. 97 | * `timestamp (DateTime)`: Time where this reading took place 98 | * `value (float)`: Value of the reading 99 | 100 | ## PipelineTemplate 101 | 102 | The **PipelineTemplate** collection contains all the pipeline templates from which the 103 | pipelines that later on will be used to run an experiments are generated. 104 | The template includes all the default hyperparameter values, as well as the tunable 105 | hyperparameter ranges. 106 | 107 | ### Fields 108 | 109 | * `_id (ObjectID)`: Unique Identifier of this PipelineTemplate object 110 | * `name (String)`: Name or code given to this Object 111 | * `template (SubDocument)`: JSON representation of this pipeline template 112 | * `insert_time (DateTime)`: Time when this Object was inserted 113 | * `created_by (String)`: Identifier of the user that created this Object 114 | 115 | ## Pipeline 116 | 117 | The **Pipeline** collection contains all the pipelines registered in the system, including 118 | their details, such as the list of primitives and all the configured hyperparameter values. 119 | 120 | ### Fields 121 | 122 | * `_id (ObjectID)`: Unique Identifier of this object 123 | * `name (String)`: Name or code given to this Object 124 | * `pipeline_template_id (ObjectID)`: Unique Identifier of the PipelineTemplate used to generate this pipeline 125 | * `pipeline (SubDocument)`: JSON representation of this pipeline object 126 | * `insert_time (DateTime)`: Time when this Object was inserted 127 | * `created_by (String)`: Identifier of the user that created this Object 128 | 129 | ## MLTask 130 | 131 | An **MLTask** is a specific Machine Learning Problem consisting on a prediction that 132 | is to be made using a Pipeline. 
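For illustration only, a hypothetical MLTask document could look like the sketch below. The identifiers and values are made up for this example, and the available fields are the ones listed in the next section:

```python3
from datetime import datetime

# Hypothetical document; ObjectIDs are shown here as placeholder strings.
mltask = {
    '_id': '5f8a...e001',
    'name': 'turbine_failure_1d',
    'description': 'Predict turbine failures one day in advance',
    'type': 'classification',
    'turbine_set': ['5f8a...t001', '5f8a...t002'],
    'insert_time': datetime(2020, 1, 1),
    'created_by': 'draco',
}
```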
133 | 134 | ### Fields 135 | 136 | * `_id (ObjectID)`: Unique Identifier of this object 137 | * `name (String)`: Name or code given to this Object 138 | * `description (String)`: Short text description of this task 139 | * `type (String)`: Type of Machine Learning Task 140 | * `turbine_set (List[ObjectID])`: List of IDs of the Turbines to which this MLTask is applied 141 | * `insert_time (DateTime)`: Time when this Object was inserted 142 | * `created_by (String)`: Identifier of the user that created this Object 143 | 144 | ## Target 145 | 146 | The **Target** collection contains the **MLTask** targets with their cutoff times. 147 | 148 | ### Fields 149 | 150 | * `_id (ObjectID)`: Unique Identifier of this Object 151 | * `mltask_id (ObjectID)`: Unique Identifier of the MLTask to which this target belongs 152 | * `turbine_id (ObjectID)`: Unique Identifier of the Turbine associated with this target 153 | * `cutoff_time (DateTime)`: Time associated with this Target 154 | 155 | ## Experiment 156 | 157 | An **Experiment** represents the process of trying and tuning a PipelineTemplate in order 158 | to solve a MLTask. 159 | 160 | ### Fields 161 | 162 | * `_id (ObjectID)`: Unique Identifier of this Object 163 | * `name (String)`: Name or code given to this Object 164 | * `mltask_id (ObjectID)`: Unique Identifier of the MLTask to which this Experiment belongs 165 | * `pipeline_template_id (ObjectID)`: Unique Identifier of the PipelineTemplate used in this Experiment 166 | * `sensor_set (List[ObjectID])`: List of IDs of the Sensors used for this Experiment 167 | * `cv_folds (integer)`: Number of folds used for Cross Validation 168 | * `stratified (bool)`: Whether the Cross Validation was stratified or not 169 | * `random_state (integer)`: Random State used for the Cross Validation shuffling 170 | * `metric (string)`: Name of the metric used 171 | * `insert_time (DateTime)`: Time when this Object was inserted 172 | * `created_by (String)`: Identifier of the user that created this Object 173 | 174 | ## ExperimentRun 175 | 176 | An **ExperimentRun** represents a single execution of an Experiment. 177 | 178 | ### Fields 179 | 180 | * `_id (ObjectID)`: Unique Identifier of this Object 181 | * `experiment_id (ObjectID - Foreign Key)`: Unique Identifier of the Experiment 182 | * `start_time (DateTime)`: When the execution started 183 | * `end_time (DateTime)`: When the execution ended 184 | * `software_versions (List of Strings)`: version of each python dependency installed in the 185 | *virtualenv* when the execution took place 186 | * `budget_type (String)`: Type of budget used (time or number of iterations) 187 | * `budget_amount (Integer)`: Budget amount 188 | * `status (String)`: Whether the ExperimentRun is still running, finished successfully or failed 189 | * `insert_time (DateTime)`: Time when this Object was inserted 190 | * `created_by (String)`: Identifier of the user that created this Object 191 | 192 | ## PipelineRun 193 | 194 | A **PipelineRun** represents a single execution of a Pipeline instance over a MLTask. 195 | 196 | It contains information about whether the execution was successful or not, when it started 197 | and ended and the cross validation score obtained. 
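As a sketch of how this collection could be queried, the snippet below retrieves the successful run with the highest score. Note that the database name, the collection name and the `SUCCESS` status value are assumptions made for this example rather than part of the documented schema, and the snippet presumes a MongoDB backend reachable through `pymongo`:

```python3
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['draco']  # assumed database name

# Best scoring run among the ones that finished successfully (assumed status value).
best_run = db['pipeline_runs'].find_one(
    {'status': 'SUCCESS'},
    sort=[('score', -1)],
)
```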
198 | 199 | ### Fields 200 | 201 | * `_id (ObjectID)`: Unique Identifier of this Object 202 | * `experimentrun_id (ObjectID)`: Unique Identifier of the ExperimentRun to which this PipelineRun belongs 203 | * `pipeline_id (ObjectID)`: Unique Identifier of the Pipeline 204 | * `start_time (DateTime)`: When the execution started 205 | * `end_time (DateTime)`: When the execution ended 206 | * `score (float)`: Cross Validation score obtained 207 | * `status (String)`: Whether the Signalrun is still running, finished successfully or failed 208 | * `insert_time (DateTime)`: Time when this Object was inserted 209 | -------------------------------------------------------------------------------- /DATA_FORMAT.md: -------------------------------------------------------------------------------- 1 | # Draco Data Format 2 | 3 | ## Input 4 | 5 | The minimum input expected by the **Draco** system consists of the following two elements, 6 | which need to be passed as `pandas.DataFrame` objects: 7 | 8 | ### Target Times 9 | 10 | A table containing the specification of the problem that we are solving, which has three 11 | columns: 12 | 13 | * `turbine_id`: Unique identifier of the turbine which this label corresponds to. 14 | * `cutoff_time`: Time associated with this target 15 | * `target`: The value that we want to predict. This can either be a numerical value or a 16 | categorical label. This column can also be skipped when preparing data that will be used 17 | only to make predictions and not to fit any pipeline. 18 | 19 | | | turbine_id | cutoff_time | target | 20 | |----|--------------|---------------------|----------| 21 | | 0 | T1 | 2001-01-02 00:00:00 | 0 | 22 | | 1 | T1 | 2001-01-03 00:00:00 | 1 | 23 | | 2 | T2 | 2001-01-04 00:00:00 | 0 | 24 | 25 | ### Readings 26 | 27 | A table containing the signal data from the different sensors, with the following columns: 28 | 29 | * `turbine_id`: Unique identifier of the turbine which this reading comes from. 30 | * `signal_id`: Unique identifier of the signal which this reading comes from. 31 | * `timestamp (datetime)`: Time where the reading took place, as a datetime. 32 | * `value (float)`: Numeric value of this reading. 33 | 34 | | | turbine_id | signal_id | timestamp | value | 35 | |----|--------------|-------------|---------------------|---------| 36 | | 0 | T1 | S1 | 2001-01-01 00:00:00 | 1 | 37 | | 1 | T1 | S1 | 2001-01-01 12:00:00 | 2 | 38 | | 2 | T1 | S1 | 2001-01-02 00:00:00 | 3 | 39 | | 3 | T1 | S1 | 2001-01-02 12:00:00 | 4 | 40 | | 4 | T1 | S1 | 2001-01-03 00:00:00 | 5 | 41 | | 5 | T1 | S1 | 2001-01-03 12:00:00 | 6 | 42 | | 6 | T1 | S2 | 2001-01-01 00:00:00 | 7 | 43 | | 7 | T1 | S2 | 2001-01-01 12:00:00 | 8 | 44 | | 8 | T1 | S2 | 2001-01-02 00:00:00 | 9 | 45 | | 9 | T1 | S2 | 2001-01-02 12:00:00 | 10 | 46 | | 10 | T1 | S2 | 2001-01-03 00:00:00 | 11 | 47 | | 11 | T1 | S2 | 2001-01-03 12:00:00 | 12 | 48 | 49 | ### Turbines 50 | 51 | Optionally, a third table can be added containing metadata about the turbines. 52 | The only requirement for this table is to have a `turbine_id` field, and it can have 53 | an arbitraty number of additional fields. 54 | 55 | | | turbine_id | manufacturer | ... | ... | ... | 56 | |----|--------------|----------------|-------|-------|-------| 57 | | 0 | T1 | Siemens | ... | ... | ... | 58 | | 1 | T2 | Siemens | ... | ... | ... 
| 59 | 60 | 61 | ## CSV Format 62 | 63 | As explained in a previous section, the input expected by the **Draco** system consists of 64 | two tables which need to be passed as `pandas.DataFrame` objects: 65 | 66 | * The `target_times` table, which containing the specification of the problem that we are solving 67 | in the form of training examples with a `turbine_id`, a `cutoff_time` and a `target` value. 68 | * The `readings` table, which contains the signal readings from the different sensors, with 69 | `turbine_id`, `signal_id`, `timestamp` and `value` fields. 70 | 71 | However, in most scenarios the size of the available will far exceed the memory limitations 72 | of the system on which **Draco** is being run, so loading all the data in a single 73 | `pandas.DataFrame` will not be possible. 74 | 75 | In order to solve this situation, **Draco** provides a [CSVLoader]( 76 | https://sintel-dev.github.io/Draco/api/draco.loaders.csv.html#draco.loaders.csv.CSVLoader) 77 | class which can be used to load data from what we call the **Raw Data Format**. 78 | 79 | ### Raw Data Format 80 | 81 | The **Raw Data Format** consists on a collection of CSV files stored in a single folder with the 82 | following structure: 83 | 84 | #### Folder Structure 85 | 86 | * All the data from all the turbines is inside a single folder, which here we will call `readings`. 87 | * Inside the `readings` folder, one folder exists for each turbine, named exactly like the turbine: 88 | * `readings/T001` 89 | * `readings/T002` 90 | * ... 91 | * Inside each turbine folder one CSV file exists for each month, named `%Y-%m.csv`. 92 | * `readings/T001/2010-01.csv` 93 | * `readings/T001/2010-02.csv` 94 | * `readings/T001/2010-03.csv` 95 | * ... 96 | 97 | #### CSV Contents 98 | 99 | * Each CSV file contains three columns: 100 | * `signal_id`: name or id of the signal. 101 | * ``timestamp``: timestamp of the reading formatted as ``%m/%d/%y %H:%M:%S``. 102 | * `value`: value of the reading. 103 | 104 | This is an example of what a CSV contents look like: 105 | 106 | | | signal_id | timestamp | value | 107 | |----|-------------|-------------------|---------| 108 | | 0 | S1 | 01/01/01 00:00:00 | 1 | 109 | | 1 | S1 | 01/01/01 12:00:00 | 2 | 110 | | 2 | S1 | 01/02/01 00:00:00 | 3 | 111 | | 3 | S1 | 01/02/01 12:00:00 | 4 | 112 | | 4 | S1 | 01/03/01 00:00:00 | 5 | 113 | | 5 | S1 | 01/03/01 12:00:00 | 6 | 114 | | 6 | S2 | 01/01/01 00:00:00 | 7 | 115 | | 7 | S2 | 01/01/01 12:00:00 | 8 | 116 | | 8 | S2 | 01/02/01 00:00:00 | 9 | 117 | | 9 | S2 | 01/02/01 12:00:00 | 10 | 118 | | 10 | S2 | 01/03/01 00:00:00 | 11 | 119 | | 11 | S2 | 01/03/01 12:00:00 | 12 | 120 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | # History 2 | 3 | ## 0.3.0 - 2022-07-31 4 | 5 | This release switches from ``MLPrimitives`` to ``ml-stars``. 6 | Moreover, we remove all pipelines using deep feature synthesis. 7 | 8 | * Update demo bucket - [Issue #76](https://github.com/sintel-dev/Draco/issues/76) by @sarahmish 9 | * Remove ``dfs`` based pipelines - [Issue #73](https://github.com/sintel-dev/Draco/issues/73) by @sarahmish 10 | * Move from ``MLPrimitives`` to ``ml-stars`` - [Issue #72](https://github.com/sintel-dev/Draco/issues/72) by @sarahmish 11 | 12 | 13 | ## 0.2.0 - 2022-04-12 14 | 15 | This release features a reorganization and renaming of ``Draco`` pipelines. In addtion, 16 | we update some of the dependencies for general housekeeping. 
17 | 18 | * Update Draco dependencies - [Issue #66](https://github.com/signals-dev/Draco/issues/66) by @sarahmish 19 | * Reorganize pipelines - [Issue #63](https://github.com/signals-dev/Draco/issues/63) by @sarahmish 20 | 21 | 22 | ## 0.1.0 - 2022-01-01 23 | 24 | * First release on ``draco-ml`` PyPI 25 | 26 | 27 | ## Previous GreenGuard development 28 | 29 | ### 0.3.0 - 2021-01-22 30 | 31 | This release increases the supported version of python to `3.8` and also includes changes 32 | in the installation requirements, where ``pandas`` and ``scikit-optimize`` packages have 33 | been updated to support higher versions. This changes come together with the newer versions 34 | of ``MLBlocks`` and ``MLPrimitives``. 35 | 36 | #### Internal Improvements 37 | 38 | * Fix ``run_benchmark`` generating properly the ``init_hyperparameters`` for the pipelines. 39 | * New ``FPR`` metric. 40 | * New ``roc_auc_score`` metric. 41 | * Multiple benchmarking metrics allowed. 42 | * Multiple ``tpr`` or ``threshold`` values allowed for the benchmark. 43 | 44 | ### 0.2.6 - 2020-10-23 45 | 46 | * Fix ``mkdir`` when exporting to ``csv`` file the benchmark results. 47 | * Intermediate steps for the pipelines with demo notebooks for each pipeline. 48 | 49 | #### Resolved Issues 50 | 51 | * Issue #50: Expose partial outputs and executions in the ``GreenGuardPipeline``. 52 | 53 | ### 0.2.5 - 2020-10-09 54 | 55 | With this release we include: 56 | 57 | * `run_benchmark`: A function within the module `benchmark` that allows the user to evaluate 58 | templates against problems with different window size and resample rules. 59 | * `summarize_results`: A function that given a `csv` file generates a `xlsx` file with a summary 60 | tab and a detailed tab with the results from `run_benchmark`. 61 | 62 | ### 0.2.4 - 2020-09-25 63 | 64 | * Fix dependency errors 65 | 66 | ### 0.2.3 - 2020-08-10 67 | 68 | * Added benchmarking module. 69 | 70 | ### 0.2.2 - 2020-07-10 71 | 72 | #### Internal Improvements 73 | 74 | * Added github actions. 75 | 76 | #### Resolved Issues 77 | 78 | * Issue #27: Cache Splits pre-processed data on disk 79 | 80 | ### 0.2.1 - 2020-06-16 81 | 82 | With this release we give the possibility to the user to specify more than one template when 83 | creating a GreenGuardPipeline. When the `tune` method of this is called, an instance of BTBSession 84 | is returned and it is in charge of selecting the templates and tuning their hyperparameters until 85 | achieving the best pipeline. 86 | 87 | #### Internal Improvements 88 | 89 | * Resample by filename inside the `CSVLoader` to avoid oversampling of data that will not be used. 90 | * Select targets now allows them to be equal. 91 | * Fixed the csv filename format. 92 | * Upgraded to BTB. 
93 | 94 | #### Bug Fixes 95 | 96 | * Issue #33: Wrong default datetime format 97 | 98 | #### Resolved Issues 99 | 100 | * Issue #35: Select targets is too strict 101 | * Issue #36: resample by filename inside csvloader 102 | * Issue #39: Upgrade BTB 103 | * Issue #41: Fix CSV filename format 104 | 105 | ### 0.2.0 - 2020-02-14 106 | 107 | First stable release: 108 | 109 | * efficient data loading and preprocessing 110 | * initial collection of dfs and lstm based pipelines 111 | * optimized pipeline tuning 112 | * documentation and tutorials 113 | 114 | ### 0.1.0 115 | 116 | * First release on PyPI 117 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, MIT Data To AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.md 4 | include LICENSE 5 | include README.md 6 | 7 | recursive-include draco *.json 8 | 9 | recursive-include tests * 10 | recursive-exclude * __pycache__ 11 | recursive-exclude * *.py[co] 12 | 13 | recursive-include docs *.md *.rst conf.py Makefile make.bat *.jpg *.png *.gif 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | define BROWSER_PYSCRIPT 4 | import os, webbrowser, sys 5 | 6 | try: 7 | from urllib import pathname2url 8 | except: 9 | from urllib.request import pathname2url 10 | 11 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 12 | endef 13 | export BROWSER_PYSCRIPT 14 | 15 | define PRINT_HELP_PYSCRIPT 16 | import re, sys 17 | 18 | for line in sys.stdin: 19 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 20 | if match: 21 | target, help = match.groups() 22 | print("%-20s %s" % (target, help)) 23 | endef 24 | export PRINT_HELP_PYSCRIPT 25 | 26 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 27 | 28 | .PHONY: help 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | 33 | # CLEAN TARGETS 34 | 35 | .PHONY: clean-build 36 | clean-build: ## remove build artifacts 37 | rm -fr build/ 38 | rm -fr dist/ 39 | rm -fr .eggs/ 40 | find . -name '*.egg-info' -exec rm -fr {} + 41 | find . -name '*.egg' -exec rm -f {} + 42 | 43 | .PHONY: clean-pyc 44 | clean-pyc: ## remove Python file artifacts 45 | find . -name '*.pyc' -exec rm -f {} + 46 | find . -name '*.pyo' -exec rm -f {} + 47 | find . -name '*~' -exec rm -f {} + 48 | find . -name '__pycache__' -exec rm -fr {} + 49 | 50 | .PHONY: clean-docs 51 | clean-docs: ## remove previously built docs 52 | rm -rf docs/api/ docs/api_reference/api/ docs/tutorials docs/build docs/_build 53 | 54 | .PHONY: clean-coverage 55 | clean-coverage: ## remove coverage artifacts 56 | rm -f .coverage 57 | rm -f .coverage.* 58 | rm -fr htmlcov/ 59 | 60 | .PHONY: clean-test 61 | clean-test: ## remove test artifacts 62 | rm -fr .tox/ 63 | rm -fr .pytest_cache 64 | 65 | .PHONY: clean 66 | clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all build, test, coverage, docs and Python artifacts 67 | 68 | 69 | # INSTALL TARGETS 70 | 71 | .PHONY: install 72 | install: clean-build clean-pyc ## install the package to the active Python's site-packages 73 | pip install . 
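# Note: end users normally only need `make install`; contributors are expected to
# run `make install-develop` (see CONTRIBUTING.rst), which installs the package in
# editable mode together with the development dependencies.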
74 | 75 | .PHONY: install-test 76 | install-test: clean-build clean-pyc ## install the package and test dependencies 77 | pip install .[test] 78 | 79 | .PHONY: install-develop 80 | install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development 81 | pip install -e .[dev] 82 | 83 | MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') 84 | 85 | .PHONY: install-minimum 86 | install-minimum: ## install the minimum supported versions of the package dependencies 87 | echo pip install $(MINIMUM) 88 | 89 | 90 | # LINT TARGETS 91 | 92 | .PHONY: lint-draco 93 | lint-btb: ## check style with flake8 and isort 94 | flake8 draco 95 | isort -c --recursive draco 96 | 97 | .PHONY: lint-tests 98 | lint-tests: ## check style with flake8 and isort 99 | flake8 --ignore=D,SFS2 tests 100 | isort -c --recursive tests 101 | 102 | .PHONY: check-dependencies 103 | check-dependencies: ## test if there are any broken dependencies 104 | pip check 105 | 106 | .PHONY: lint 107 | lint: check-dependencies lint-draco lint-tests ## Run all code style and static testing validations 108 | 109 | .PHONY: fix-lint 110 | fix-lint: ## fix lint issues using autoflake, autopep8, and isort 111 | find draco -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables 112 | autopep8 --in-place --recursive --aggressive draco 113 | isort --apply --atomic --recursive draco tests 114 | 115 | # TEST TARGETS 116 | 117 | .PHONY: test-unit 118 | test-unit: ## run tests quickly with the default Python 119 | python -m pytest --cov=draco 120 | 121 | .PHONY: test-readme 122 | test-readme: ## run the readme snippets 123 | rm -rf tests/readme_test && mkdir tests/readme_test 124 | cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md 125 | rm -rf tests/readme_test 126 | 127 | .PHONY: test-tutorials 128 | test-tutorials: ## run the tutorial notebooks 129 | find tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ 130 | jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --to=html --stdout {} > /dev/null \; 131 | 132 | .PHONY: test 133 | test: test-unit test-readme test-tutorials ## test everything that needs test dependencies 134 | 135 | .PHONY: test-minimum 136 | test-minimum: install-minimum check-dependencies test-unit ## run tests using the minimum supported dependencies 137 | 138 | .PHONY: test-all 139 | test-all: ## run tests on every Python version with tox 140 | tox -r 141 | 142 | .PHONY: coverage 143 | coverage: ## check code coverage quickly with the default Python 144 | coverage run --source draco -m pytest 145 | coverage report -m 146 | coverage html 147 | $(BROWSER) htmlcov/index.html 148 | 149 | # DOCS TARGETS 150 | 151 | .PHONY: docs 152 | docs: clean-docs ## generate Sphinx HTML documentation, including API docs 153 | $(MAKE) -C docs html 154 | 155 | .PHONY: view-docs 156 | view-docs: ## view the docs in a browser 157 | $(BROWSER) docs/_build/html/index.html 158 | 159 | .PHONY: serve-docs 160 | serve-docs: view-docs ## compile the docs watching for changes 161 | watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs 162 | 163 | 164 | # RELEASE TARGETS 165 | 166 | .PHONY: dist 167 | dist: clean ## builds source and wheel package 168 | python setup.py sdist 169 | python setup.py bdist_wheel 170 | ls -l dist 171 | 172 | .PHONY: publish-confirm 173 | publish-confirm: 174 | @echo "WARNING: 
This will irreversibly upload a new version to PyPI!" 175 | @echo -n "Please type 'confirm' to proceed: " \ 176 | && read answer \ 177 | && [ "$${answer}" = "confirm" ] 178 | 179 | .PHONY: publish-test 180 | publish-test: dist publish-confirm ## package and upload a release on TestPyPI 181 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* 182 | 183 | .PHONY: publish 184 | publish: dist publish-confirm ## package and upload a release 185 | twine upload dist/* 186 | 187 | .PHONY: bumpversion-release 188 | bumpversion-release: ## Merge master to stable and bumpversion release 189 | git checkout stable || git checkout -b stable 190 | git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" 191 | bumpversion release 192 | git push --tags origin stable 193 | 194 | .PHONY: bumpversion-release-test 195 | bumpversion-release-test: ## Merge master to stable and bumpversion release 196 | git checkout stable || git checkout -b stable 197 | git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" 198 | bumpversion release --no-tag 199 | @echo git push --tags origin stable 200 | 201 | .PHONY: bumpversion-patch 202 | bumpversion-patch: ## Merge stable to master and bumpversion patch 203 | git checkout master 204 | git merge stable 205 | bumpversion --no-tag patch 206 | git push 207 | 208 | .PHONY: bumpversion-candidate 209 | bumpversion-candidate: ## Bump the version to the next candidate 210 | bumpversion candidate --no-tag 211 | 212 | .PHONY: bumpversion-minor 213 | bumpversion-minor: ## Bump the version the next minor skipping the release 214 | bumpversion --no-tag minor 215 | 216 | .PHONY: bumpversion-major 217 | bumpversion-major: ## Bump the version the next major skipping the release 218 | bumpversion --no-tag major 219 | 220 | .PHONY: bumpversion-revert 221 | bumpversion-revert: ## Undo a previous bumpversion-release 222 | git checkout master 223 | git branch -D stable 224 | 225 | CLEAN_DIR := $(shell git status --short | grep -v ??) 
226 | CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) 227 | CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*") 228 | CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) 229 | 230 | .PHONY: check-clean 231 | check-clean: ## Check if the directory has uncommitted changes 232 | ifneq ($(CLEAN_DIR),) 233 | $(error There are uncommitted changes) 234 | endif 235 | 236 | .PHONY: check-master 237 | check-master: ## Check if we are in master branch 238 | ifneq ($(CURRENT_BRANCH),master) 239 | $(error Please make the release from master branch\n) 240 | endif 241 | 242 | .PHONY: check-candidate 243 | check-candidate: ## Check if a release candidate has been made 244 | ifeq ($(CURRENT_VERSION),dev0) 245 | $(error Please make a release candidate and test it before atempting a release) 246 | endif 247 | 248 | .PHONY: check-history 249 | check-history: ## Check if HISTORY.md has been modified 250 | ifeq ($(CHANGELOG_LINES),0) 251 | $(error Please insert the release notes in HISTORY.md before releasing) 252 | endif 253 | 254 | .PHONY: check-release 255 | check-release: check-candidate check-clean check-master check-history ## Check if the release can be made 256 | @echo "A new release can be made" 257 | 258 | .PHONY: release 259 | release: check-release bumpversion-release publish bumpversion-patch 260 | 261 | .PHONY: release-test 262 | release-test: check-release bumpversion-release-test publish-test bumpversion-revert 263 | 264 | .PHONY: release-candidate 265 | release-candidate: check-master publish bumpversion-candidate 266 | 267 | .PHONY: release-candidate-test 268 | release-candidate-test: check-clean check-master publish-test 269 | 270 | .PHONY: release-minor 271 | release-minor: check-release bumpversion-minor release 272 | 273 | .PHONY: release-major 274 | release-major: check-release bumpversion-major release 275 | 276 | 277 | # DOCKER TARGETS 278 | 279 | .PHONY: docker-build 280 | docker-build: 281 | docker build -f docker/Dockerfile -t draco . 282 | 283 | .PHONY: docker-login 284 | docker-login: 285 | docker login 286 | 287 | .PHONY: docker-push 288 | docker-push: docker-login docker-build 289 | @$(eval VERSION := $(shell python -c 'import draco; print(draco.__version__)')) 290 | docker tag draco signalsdev/draco:$(VERSION) 291 | docker push signalsdev/draco:$(VERSION) 292 | docker tag draco signalsdev/draco 293 | docker push signalsdev/draco 294 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | DAI logo: An open source project from Data to AI Lab at MIT.
7 | Draco logo
11 | AutoML for Time Series.
13 | 14 | 15 | [![PyPI Shield](https://img.shields.io/pypi/v/draco-ml.svg)](https://pypi.python.org/pypi/draco-ml) 16 | [![Tests](https://github.com/sintel-dev/Draco/workflows/Run%20Tests/badge.svg)](https://github.com/sintel-dev/Draco/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) 17 | [![Downloads](https://pepy.tech/badge/draco-ml)](https://pepy.tech/project/draco-ml) 18 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sintel-dev/Draco/master?filepath=tutorials) 19 | 22 | 23 | # Draco 24 | 25 | - License: [MIT](https://github.com/sintel-dev/Draco/blob/master/LICENSE) 26 | - Documentation: https://sintel-dev.github.io/Draco 27 | - Homepage: https://github.com/sintel-dev/Draco 28 | 29 | ## Overview 30 | 31 | The Draco project is a collection of end-to-end solutions for machine learning problems 32 | commonly found in time series monitoring systems. Most tasks utilize sensor data 33 | emanating from monitoring systems. We utilize the foundational innovations developed for 34 | automation of machine Learning at Data to AI Lab at MIT. 35 | 36 | The salient aspects of this customized project are: 37 | 38 | * A set of ready to use, well tested pipelines for different machine learning tasks. These are 39 | vetted through testing across multiple publicly available datasets for the same task. 40 | * An easy interface to specify the task, pipeline, and generate results and summarize them. 41 | * A production ready, deployable pipeline. 42 | * An easy interface to ``tune`` pipelines using Bayesian Tuning and Bandits library. 43 | * A community oriented infrastructure to incorporate new pipelines. 44 | * A robust continuous integration and testing infrastructure. 45 | * A ``learning database`` recording all past outcomes --> tasks, pipelines, outcomes. 46 | 47 | ## Resources 48 | 49 | * [Data Format](DATA_FORMAT.md). 50 | * [Draco folder structure](DATA_FORMAT.md#folder-structure). 51 | 52 | # Install 53 | 54 | ## Requirements 55 | 56 | **Draco** has been developed and runs on Python 3.6, 3.7 and 3.8. 57 | 58 | Also, although it is not strictly required, the usage of a [virtualenv]( 59 | https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering 60 | with other software installed in the system where you are trying to run **Draco**. 61 | 62 | ## Download and Install 63 | 64 | **Draco** can be installed locally using [pip](https://pip.pypa.io/en/stable/) with 65 | the following command: 66 | 67 | ```bash 68 | pip install draco-ml 69 | ``` 70 | 71 | This will pull and install the latest stable release from [PyPi](https://pypi.org/). 72 | 73 | If you want to install from source or contribute to the project please read the 74 | [Contributing Guide](https://sintel-dev.github.io/Draco/contributing.html#get-started). 75 | 76 | # Data Format 77 | 78 | The minimum input expected by the **Draco** system consists of the following two elements, 79 | which need to be passed as `pandas.DataFrame` objects: 80 | 81 | ## Target Times 82 | 83 | A table containing the specification of the problem that we are solving, which has three 84 | columns: 85 | 86 | * `turbine_id`: Unique identifier of the turbine which this label corresponds to. 87 | * `cutoff_time`: Time associated with this target 88 | * `target`: The value that we want to predict. This can either be a numerical value or a 89 | categorical label. This column can also be skipped when preparing data that will be used 90 | only to make predictions and not to fit any pipeline. 
91 | 92 | | | turbine_id | cutoff_time | target | 93 | |----|--------------|---------------------|----------| 94 | | 0 | T1 | 2001-01-02 00:00:00 | 0 | 95 | | 1 | T1 | 2001-01-03 00:00:00 | 1 | 96 | | 2 | T2 | 2001-01-04 00:00:00 | 0 | 97 | 98 | ## Readings 99 | 100 | A table containing the signal data from the different sensors, with the following columns: 101 | 102 | * `turbine_id`: Unique identifier of the turbine which this reading comes from. 103 | * `signal_id`: Unique identifier of the signal which this reading comes from. 104 | * `timestamp (datetime)`: Time where the reading took place, as a datetime. 105 | * `value (float)`: Numeric value of this reading. 106 | 107 | | | turbine_id | signal_id | timestamp | value | 108 | |----|--------------|-------------|---------------------|---------| 109 | | 0 | T1 | S1 | 2001-01-01 00:00:00 | 1 | 110 | | 1 | T1 | S1 | 2001-01-01 12:00:00 | 2 | 111 | | 2 | T1 | S1 | 2001-01-02 00:00:00 | 3 | 112 | | 3 | T1 | S1 | 2001-01-02 12:00:00 | 4 | 113 | | 4 | T1 | S1 | 2001-01-03 00:00:00 | 5 | 114 | | 5 | T1 | S1 | 2001-01-03 12:00:00 | 6 | 115 | | 6 | T1 | S2 | 2001-01-01 00:00:00 | 7 | 116 | | 7 | T1 | S2 | 2001-01-01 12:00:00 | 8 | 117 | | 8 | T1 | S2 | 2001-01-02 00:00:00 | 9 | 118 | | 9 | T1 | S2 | 2001-01-02 12:00:00 | 10 | 119 | | 10 | T1 | S2 | 2001-01-03 00:00:00 | 11 | 120 | | 11 | T1 | S2 | 2001-01-03 12:00:00 | 12 | 121 | 122 | ## Turbines 123 | 124 | Optionally, a third table can be added containing metadata about the turbines. 125 | The only requirement for this table is to have a `turbine_id` field, and it can have 126 | an arbitraty number of additional fields. 127 | 128 | | | turbine_id | manufacturer | ... | ... | ... | 129 | |----|--------------|----------------|-------|-------|-------| 130 | | 0 | T1 | Siemens | ... | ... | ... | 131 | | 1 | T2 | Siemens | ... | ... | ... | 132 | 133 | ## CSV Format 134 | 135 | A part from the in-memory data format explained above, which is limited by the memory 136 | allocation capabilities of the system where it is run, **Draco** is also prepared to 137 | load and work with data stored as a collection of CSV files, drastically increasing the amount 138 | of data which it can work with. Further details about this format can be found in the 139 | [project documentation site](DATA_FORMAT.md#csv-format). 140 | 141 | # Quickstart 142 | 143 | In this example we will load some demo data and classify it using a **Draco Pipeline**. 144 | 145 | ## 1. Load and split the demo data 146 | 147 | The first step is to load the demo data. 148 | 149 | For this, we will import and call the `draco.demo.load_demo` function without any arguments: 150 | 151 | ```python3 152 | from draco.demo import load_demo 153 | 154 | target_times, readings = load_demo() 155 | ``` 156 | 157 | The returned objects are: 158 | 159 | * ``target_times``: A ``pandas.DataFrame`` with the ``target_times`` table data: 160 | 161 | ``` 162 | turbine_id cutoff_time target 163 | 0 T001 2013-01-12 0 164 | 1 T001 2013-01-13 0 165 | 2 T001 2013-01-14 0 166 | 3 T001 2013-01-15 1 167 | 4 T001 2013-01-16 0 168 | ``` 169 | 170 | * ``readings``: A ``pandas.DataFrame`` containing the time series data in the format explained above. 
171 | 172 | ``` 173 | turbine_id signal_id timestamp value 174 | 0 T001 S01 2013-01-10 323.0 175 | 1 T001 S02 2013-01-10 320.0 176 | 2 T001 S03 2013-01-10 284.0 177 | 3 T001 S04 2013-01-10 348.0 178 | 4 T001 S05 2013-01-10 273.0 179 | ``` 180 | 181 | Once we have loaded the `target_times`, and before proceeding to training any Machine Learning 182 | Pipeline, we will split them into two partitions for training and testing. 183 | 184 | In this case, we will split them using the [train_test_split function from scikit-learn]( 185 | https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html), 186 | but it can be done with any other suitable tool. 187 | 188 | ```python3 189 | from sklearn.model_selection import train_test_split 190 | 191 | train, test = train_test_split(target_times, test_size=0.25, random_state=0) 192 | ``` 193 | 194 | Notice how we are only splitting the `target_times` data and not the `readings`. 195 | This is because the pipelines will later on take care of selecting the parts of the 196 | `readings` table needed for the training based on the information found inside 197 | the `train` and `test` inputs. 198 | 199 | Additionally, if we want to calculate a goodness-of-fit score later on, we can separate the 200 | testing target values from the `test` table by popping them from it: 201 | 202 | ```python3 203 | test_targets = test.pop('target') 204 | ``` 205 | 206 | ## 2. Exploring the available Pipelines 207 | 208 | Once we have the data ready, we need to find a suitable pipeline. 209 | 210 | The list of available Draco Pipelines can be obtained using the `draco.get_pipelines` 211 | function. 212 | 213 | ```python3 214 | from draco import get_pipelines 215 | 216 | pipelines = get_pipelines() 217 | ``` 218 | 219 | The returned `pipelines` variable will be a `list` containing the names of all the pipelines 220 | available in the Draco system: 221 | 222 | ``` 223 | ['lstm', 224 | 'lstm_with_unstack', 225 | 'double_lstm', 226 | 'double_lstm_with_unstack'] 227 | ``` 228 | 229 | For the rest of this tutorial, we will select and use the pipeline 230 | `lstm_with_unstack` as our template. 231 | 232 | ```python3 233 | pipeline_name = 'lstm_with_unstack' 234 | ``` 235 | 236 | ## 3. Fitting the Pipeline 237 | 238 | Once we have loaded the data and selected the pipeline that we will use, we have to 239 | fit it. 240 | 241 | For this, we will create an instance of a `DracoPipeline` object, passing the name 242 | of the pipeline that we want to use: 243 | 244 | ```python3 245 | from draco.pipeline import DracoPipeline 246 | 247 | pipeline = DracoPipeline(pipeline_name) 248 | ``` 249 | 250 | And then we can directly fit it to our data by calling its `fit` method and passing in the 251 | training `target_times` and the complete `readings` table: 252 | 253 | ```python3 254 | pipeline.fit(train, readings) 255 | ``` 256 | 257 | ## 4. Make predictions 258 | 259 | After fitting the pipeline, we are ready to make predictions on new data by calling the 260 | `pipeline.predict` method, passing the testing `target_times` and, again, the complete 261 | `readings` table. 262 | 263 | ```python3 264 | predictions = pipeline.predict(test, readings) 265 | ``` 266 | 267 | ## 5. Evaluate the goodness-of-fit 268 | 269 | Finally, after making predictions we can evaluate how good the predictions were 270 | using any suitable metric. 271 | 272 | ```python3 273 | from sklearn.metrics import f1_score 274 | 275 | f1_score(test_targets, predictions) 276 | ``` 277 | 278 | ## What's next? 
279 | 280 | For more details about **Draco** and all its possibilities and features, please check the 281 | [project documentation site](https://sintel-dev.github.io/Draco/). 282 | Also, do not forget to have a look at the [tutorials]( 283 | https://github.com/sintel-dev/Draco/tree/master/tutorials)! 284 | -------------------------------------------------------------------------------- /docker/.dockerignore: -------------------------------------------------------------------------------- 1 | ../notebooks-private/ 2 | ../.tox/ 3 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | ARG UID=1000 4 | EXPOSE 8888 5 | 6 | RUN mkdir /app 7 | COPY setup.py /app 8 | COPY greenguard /app/greenguard 9 | COPY tutorials /app/tutorials 10 | RUN pip install -e /app jupyter 11 | 12 | WORKDIR /app 13 | CMD pip install -e /app && /usr/local/bin/jupyter notebook --ip 0.0.0.0 --NotebookApp.token='' --allow-root 14 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Run GreenGuard using Docker 2 | 3 | GreenGuard is prepared to be run using [Docker](https://docker.com/). 4 | 5 | These are the commands needed to start a Docker container locally that runs a [Jupyter Notebook]( 6 | https://jupyter.org/) already configured to run GreenGuard. 7 | 8 | ```bash 9 | docker run -ti -p8888:8888 signalsdev/greenguard:latest 10 | ``` 11 | 12 | This will start a Jupyter Notebook instance on your computer already configured to use GreenGuard. 13 | You can access it by pointing your browser at http://127.0.0.1:8888 14 | 15 | Further details about the usage of this image can be found [here]( 16 | https://hub.docker.com/repository/docker/signalsdev/greenguard). 17 | 18 | ## Run GreenGuard on Kubernetes 19 | 20 | GreenGuard can also be started using [Kubernetes](https://kubernetes.io/). 21 | 22 | Here are the minimum steps required to create a POD in a local Kubernetes cluster: 23 | 24 | 1. Create a yaml file with these contents: 25 | 26 | For this example, we are assuming that the yaml file is named `greenguard-pod.yml`. 27 | 28 | ```yml 29 | apiVersion: v1 30 | kind: Pod 31 | metadata: 32 | name: greenguard 33 | spec: 34 | containers: 35 | - name: greenguard 36 | image: signalsdev/greenguard:latest 37 | ports: 38 | - containerPort: 8888 39 | ``` 40 | 41 | 2. Create a POD: 42 | 43 | After creating the yaml file, you can create a POD in your Kubernetes cluster using the `kubectl` 44 | command: 45 | 46 | ```bash 47 | kubectl apply -f greenguard-pod.yml 48 | ``` 49 | 50 | 3. Forward the port 8888 51 | 52 | After the POD is started, you still need to forward a local port to it in order to access the 53 | Jupyter instance. 54 | 55 | ```bash 56 | kubectl port-forward greenguard 8888 57 | ``` 58 | 59 | 4. Point your browser at http://localhost:8888 60 | 61 | > **NOTE**: If GreenGuard is run in a production environment, we recommend using a service and 62 | a deployment instead of just a simple POD. You can find a template of this setup [here]( 63 | greenguard-deployment.yml) 64 | 65 | ## Building the Docker image from scratch 66 | 67 | If you want to build the Docker image from scratch instead of using the dockerhub image 68 | you will need to: 69 | 70 | 1. 
Clone the repository 71 | 72 | ```bash 73 | git clone git@github.com:signals-dev/GreenGuard.git 74 | cd GreenGuard 75 | ``` 76 | 77 | 2. Build the docker image using the GreenGuard make command. 78 | 79 | ```bash 80 | make docker-build 81 | ``` 82 | 83 | ## What's next? 84 | 85 | For more details about **GreenGuard** and all its possibilities and features, please check the 86 | [project documentation site](https://signals-dev.github.io/GreenGuard/)! 87 | -------------------------------------------------------------------------------- /docker/greenguard-deployment.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: greenguard 5 | spec: 6 | ports: 7 | - name: jupyter 8 | port: 8888 9 | nodePort: 30088 10 | selector: 11 | app: greenguard 12 | type: NodePort 13 | --- 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | metadata: 17 | name: greenguard 18 | spec: 19 | selector: 20 | matchLabels: 21 | app: greenguard 22 | strategy: 23 | type: Recreate 24 | template: 25 | metadata: 26 | labels: 27 | app: greenguard 28 | spec: 29 | containers: 30 | - image: signalsdev/greenguard:latest 31 | name: greenguard 32 | ports: 33 | - containerPort: 8888 34 | name: jupyter 35 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = draco 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/advanced_usage/concepts.md: -------------------------------------------------------------------------------- 1 | # Concepts 2 | 3 | Here we briefly explain some of the concepts and terminology used within the Draco 4 | project and documentation. 5 | 6 | ## Primitive 7 | 8 | We call the smallest computational blocks used in a Machine Learning process 9 | **primitives**, which: 10 | 11 | * Can be either classes or functions. 12 | * Have some initialization arguments, which MLBlocks calls `init_params`. 13 | * Have some tunable hyperparameters, which have types and a list or range of valid values. 14 | 15 | ## Template 16 | 17 | Primitives can be combined to form what we call **Templates**, which: 18 | 19 | * Have a list of primitives. 20 | * Have some initialization arguments, which correspond to the initialization arguments 21 | of their primitives. 22 | * Have some tunable hyperparameters, which correspond to the tunable hyperparameters 23 | of their primitives. 24 | 25 | ## Pipeline 26 | 27 | Templates can be used to build **Pipelines** by taking and fixing a set of valid 28 | hyperparameters for a Template. Hence, Pipelines: 29 | 30 | * Have a list of primitives, which corresponds to the list of primitives of their template. 
31 | * Have some initialization arguments, which correspond to the initialization arguments 32 | of their template. 33 | * Have some hyperparameter values, which fall within the ranges of valid tunable 34 | hyperparameters of their template. 35 | 36 | A pipeline can be fitted and evaluated directly using [MLBlocks]( 37 | https://MLBazaar.github.io/MLBlocks), or using the **DracoPipeline**. 38 | 39 | ## Tuning 40 | 41 | We call tuning the process of, given a dataset and a collection of templates, finding the pipeline 42 | derived from the templates that gets the best possible score on the dataset. 43 | 44 | This process usually involves fitting and evaluating multiple pipelines with different 45 | hyperparameter configurations on the same data while using optimization algorithms to deduce 46 | which hyperparameters are more likely to get the best results in the next iterations. 47 | 48 | We call each one of these evaluations a **tuning iteration**. 49 | 50 | The process of selecting and tuning the templates is handled by a [BTBSession]( 51 | https://MLBazaar.github.io/BTB/tutorials/03_Session.html), which is responsible for 52 | discarding the templates that do not work on the given data and for keeping 53 | track of the template and hyperparameters that obtain the best performance. 54 | 55 | ## DracoPipeline 56 | 57 | This class is the one in charge of loading the **MLBlocks Pipelines** configured in the 58 | system and using them to learn from the data and make predictions. 59 | 60 | This class is also responsible for creating the BTBSession that will handle the 61 | selection and tuning of the templates. 62 | -------------------------------------------------------------------------------- /docs/advanced_usage/csv.md: -------------------------------------------------------------------------------- 1 | # CSV Format 2 | 3 | As explained in a previous section, the input expected by the **Draco** system consists of 4 | two tables which need to be passed as `pandas.DataFrame` objects: 5 | 6 | * The `target_times` table, which contains the specification of the problem that we are solving 7 | in the form of training examples with a `turbine_id`, a `cutoff_time` and a `target` value. 8 | * The `readings` table, which contains the signal readings from the different sensors, with 9 | `turbine_id`, `signal_id`, `timestamp` and `value` fields. 10 | 11 | However, in most scenarios the size of the available data will far exceed the memory limitations 12 | of the system on which **Draco** is being run, so loading all the data in a single 13 | `pandas.DataFrame` will not be possible. 14 | 15 | In order to solve this situation, **Draco** provides a [CSVLoader]( 16 | https://sintel-dev.github.io/Draco/api/draco.loaders.csv.html#draco.loaders.csv.CSVLoader) 17 | class which can be used to load data from what we call the **Raw Data Format**. 18 | 19 | ## Raw Data Format 20 | 21 | The **Raw Data Format** consists of a collection of CSV files stored in a single folder with the 22 | following structure: 23 | 24 | * All the data from all the turbines is inside a single folder, which here we will call `readings`. 25 | * Inside the `readings` folder, one folder exists for each turbine, named exactly like the turbine: 26 | * `readings/T001` 27 | * `readings/T002` 28 | * ... 29 | * Inside each turbine folder one CSV file exists for each month, named `%Y-%m.csv`. 30 | * `readings/T001/2010-01.csv` 31 | * `readings/T001/2010-02.csv` 32 | * `readings/T001/2010-03.csv` 33 | * ... 
34 | * Each CSV file contains three columns: 35 | * `signal_id`: name or id of the signal. 36 | * ``timestamp``: timestamp of the reading formatted as ``%m/%d/%y %H:%M:%S``. 37 | * `value`: value of the reading. 38 | 39 | This is an example of what a CSV contents look like: 40 | 41 | | | signal_id | timestamp | value | 42 | |----|-------------|-------------------|---------| 43 | | 0 | S1 | 01/01/01 00:00:00 | 1 | 44 | | 1 | S1 | 01/01/01 12:00:00 | 2 | 45 | | 2 | S1 | 01/02/01 00:00:00 | 3 | 46 | | 3 | S1 | 01/02/01 12:00:00 | 4 | 47 | | 4 | S1 | 01/03/01 00:00:00 | 5 | 48 | | 5 | S1 | 01/03/01 12:00:00 | 6 | 49 | | 6 | S2 | 01/01/01 00:00:00 | 7 | 50 | | 7 | S2 | 01/01/01 12:00:00 | 8 | 51 | | 8 | S2 | 01/02/01 00:00:00 | 9 | 52 | | 9 | S2 | 01/02/01 12:00:00 | 10 | 53 | | 10 | S2 | 01/03/01 00:00:00 | 11 | 54 | | 11 | S2 | 01/03/01 12:00:00 | 12 | 55 | -------------------------------------------------------------------------------- /docs/advanced_usage/docker.md: -------------------------------------------------------------------------------- 1 | # Docker Usage 2 | 3 | **Draco** comes configured and ready to be distributed and run as a docker image which starts 4 | a jupyter notebook already configured to use draco, with all the required dependencies already 5 | installed. 6 | 7 | ## Requirements 8 | 9 | The only requirement in order to run the Draco Docker image is to have Docker installed and 10 | that the user has enough permissions to run it. 11 | 12 | Installation instructions for any possible system compatible can be found [here](https://docs.docker.com/install/) 13 | 14 | Additionally, the system that builds the Draco Docker image will also need to have a working 15 | internet connection that allows downloading the base image and the additional python depenedencies. 16 | 17 | ## Building the Draco Docker Image 18 | 19 | After having cloned the **Draco** repository, all you have to do in order to build the Draco Docker 20 | Image is running this command: 21 | 22 | ```bash 23 | make docker-jupyter-build 24 | ``` 25 | 26 | After a few minutes, the new image, called `draco-jupyter`, will have been built into the system 27 | and will be ready to be used or distributed. 28 | 29 | ## Distributing the Draco Docker Image 30 | 31 | Once the `draco-jupyter` image is built, it can be distributed in several ways. 32 | 33 | ### Distributing using a Docker registry 34 | 35 | The simplest way to distribute the recently created image is [using a registry](https://docs.docker.com/registry/). 36 | 37 | In order to do so, we will need to have write access to a public or private registry (remember to 38 | [login](https://docs.docker.com/engine/reference/commandline/login/)!) and execute these commands: 39 | 40 | ```bash 41 | docker tag draco-jupyter:latest your-registry-name:some-tag 42 | docker push your-registry-name:some-tag 43 | ``` 44 | 45 | Afterwards, in the receiving machine: 46 | 47 | ```bash 48 | docker pull your-registry-name:some-tag 49 | docker tag your-registry-name:some-tag draco-jupyter:latest 50 | ``` 51 | 52 | ### Distributing as a file 53 | 54 | If the distribution of the image has to be done offline for any reason, it can be achieved 55 | using the following command. 
56 | 57 | In the system that already has the image: 58 | 59 | ```bash 60 | docker save --output draco-jupyter.tar draco-jupyter 61 | ``` 62 | 63 | Then copy over the file `draco-jupyter.tar` to the new system and there, run: 64 | 65 | ```bash 66 | docker load --input draco-jupyter.tar 67 | ``` 68 | 69 | After these commands, the `draco-jupyter` image should be available and ready to be used in the 70 | new system. 71 | 72 | 73 | ## Running the draco-jupyter image 74 | 75 | Once the `draco-jupyter` image has been built, pulled or loaded, it is ready to be run. 76 | 77 | This can be done in two ways: 78 | 79 | ### Running draco-jupyter with the code 80 | 81 | If the Draco source code is available in the system, running the image is as simple as running 82 | this command from within the root of the project: 83 | 84 | ```bash 85 | make docker-jupyter-run 86 | ``` 87 | 88 | This will start a jupyter notebook using the docker image, which you can access by pointing your 89 | browser at http://127.0.0.1:8888 90 | 91 | In this case, the local version of the project will also mounted within the Docker container, 92 | which means that any changes that you do in your local code will immediately be available 93 | within your notebooks, and that any notebook that you create within jupyter will also show 94 | up in your `notebooks` folder! 95 | 96 | ### Running draco-jupyter without the draco code 97 | 98 | If the Draco source code is not available in the system and only the Docker Image is, you can 99 | still run the image by using this command: 100 | 101 | ```bash 102 | docker run -ti -p8888:8888 draco-jupyter 103 | ``` 104 | 105 | In this case, the code changes and the notebooks that you create within jupyter will stay 106 | inside the container and you will only be able to access and download them through the 107 | jupyter interface. 108 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Draco documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | 21 | import sphinx_rtd_theme # For read the docs theme 22 | 23 | import draco 24 | 25 | # -- General configuration --------------------------------------------- 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
33 | extensions = [ 34 | 'm2r', 35 | 'nbsphinx', 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.githubpages', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.napoleon', 40 | 'autodocsumm', 41 | ] 42 | 43 | autodoc_default_options = { 44 | 'autosummary': True, 45 | } 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ['_templates'] 49 | 50 | # The suffix(es) of source filenames. 51 | # You can specify multiple suffix as a list of string: 52 | source_suffix = ['.rst', '.md'] 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # Jupyter Notebooks 58 | nbsphinx_execute = 'never' 59 | 60 | # General information about the project. 61 | project = 'Draco' 62 | slug = 'draco' 63 | title = project + ' Documentation', 64 | copyright = '2018, MIT Data To AI Lab' 65 | author = 'MIT Data To AI Lab' 66 | description = 'AutoML for Time Series' 67 | user = 'sintel-dev' 68 | 69 | # The version info for the project you're documenting, acts as replacement 70 | # for |version| and |release|, also used in various other places throughout 71 | # the built documents. 72 | # 73 | # The short X.Y version. 74 | version = draco.__version__ 75 | # The full version, including alpha/beta/rc tags. 76 | release = draco.__version__ 77 | 78 | # The language for content autogenerated by Sphinx. Refer to documentation 79 | # for a list of supported languages. 80 | # 81 | # This is also used if you do content translation via gettext catalogs. 82 | # Usually you set "language" from the command line for these cases. 83 | language = None 84 | 85 | # List of patterns, relative to source directory, that match files and 86 | # directories to ignore when looking for source files. 87 | # This patterns also effect to html_static_path and html_extra_path 88 | exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 89 | 90 | # The name of the Pygments (syntax highlighting) style to use. 91 | pygments_style = 'sphinx' 92 | 93 | # If true, `todo` and `todoList` produce output, else they produce nothing. 94 | todo_include_todos = False 95 | 96 | # -- Options for HTML output ------------------------------------------- 97 | 98 | # The theme to use for HTML and HTML Help pages. See the documentation for 99 | # a list of builtin themes. 100 | # 101 | html_theme = 'sphinx_rtd_theme' 102 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 103 | 104 | # Readthedocs additions 105 | html_context = { 106 | 'display_github': True, 107 | 'github_user': user, 108 | 'github_repo': project, 109 | 'github_version': 'master', 110 | 'conf_py_path': '/docs/', 111 | } 112 | 113 | # Theme options are theme-specific and customize the look and feel of a 114 | # theme further. For a list of options available for each theme, see the 115 | # documentation. 116 | html_theme_options = { 117 | 'collapse_navigation': False, 118 | 'display_version': False, 119 | } 120 | 121 | # Add any paths that contain custom static files (such as style sheets) here, 122 | # relative to this directory. They are copied after the builtin static files, 123 | # so a file named "default.css" will overwrite the builtin "default.css". 124 | # html_static_path = ['_static'] 125 | 126 | # The name of an image file (relative to this directory) to use as a favicon of 127 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 128 | # pixels large. 
129 | # html_favicon = 'images/favicon.ico' 130 | html_favicon = 'images/Draco.ico' 131 | 132 | # If given, this must be the name of an image file (path relative to the 133 | # configuration directory) that is the logo of the docs. It is placed at 134 | # the top of the sidebar; its width should therefore not exceed 200 pixels. 135 | # html_logo = 'images/dai-logo.png' 136 | html_logo = 'images/Draco-200.png' 137 | 138 | # -- Options for HTMLHelp output --------------------------------------- 139 | 140 | # Output file base name for HTML help builder. 141 | htmlhelp_basename = slug + 'doc' 142 | 143 | 144 | # -- Options for LaTeX output ------------------------------------------ 145 | 146 | latex_elements = { 147 | # The paper size ('letterpaper' or 'a4paper'). 148 | # 149 | # 'papersize': 'letterpaper', 150 | 151 | # The font size ('10pt', '11pt' or '12pt'). 152 | # 153 | # 'pointsize': '10pt', 154 | 155 | # Additional stuff for the LaTeX preamble. 156 | # 157 | # 'preamble': '', 158 | 159 | # Latex figure (float) alignment 160 | # 161 | # 'figure_align': 'htbp', 162 | } 163 | 164 | # Grouping the document tree into LaTeX files. List of tuples 165 | # (source start file, target name, title, author, documentclass 166 | # [howto, manual, or own class]). 167 | latex_documents = [( 168 | master_doc, 169 | slug + '.tex', 170 | title, 171 | author, 172 | 'manual' 173 | )] 174 | 175 | 176 | # -- Options for manual page output ------------------------------------ 177 | 178 | # One entry per manual page. List of tuples 179 | # (source start file, name, description, authors, manual section). 180 | man_pages = [( 181 | master_doc, 182 | slug, 183 | title, 184 | [author], 185 | 1 186 | )] 187 | 188 | 189 | # -- Options for Texinfo output ---------------------------------------- 190 | 191 | # Grouping the document tree into Texinfo files. List of tuples 192 | # (source start file, target name, title, author, 193 | # dir menu entry, description, category) 194 | texinfo_documents = [( 195 | master_doc, 196 | slug, 197 | title, 198 | author, 199 | slug, 200 | description, 201 | 'Miscellaneous' 202 | )] 203 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. 
mdinclude:: ../HISTORY.md 2 | -------------------------------------------------------------------------------- /docs/images/Draco-200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/Draco-200.png -------------------------------------------------------------------------------- /docs/images/Draco.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/Draco.ico -------------------------------------------------------------------------------- /docs/images/Draco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/Draco.png -------------------------------------------------------------------------------- /docs/images/dai-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/dai-logo.png -------------------------------------------------------------------------------- /docs/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/favicon.ico -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: readme.rst 2 | 3 | .. toctree:: 4 | :hidden: 5 | :maxdepth: 2 6 | 7 | Overview 8 | 9 | .. toctree:: 10 | :caption: Tutorials 11 | :hidden: 12 | 13 | tutorials/01_Draco_Quickstart 14 | tutorials/02_Extract_Readings 15 | 16 | .. toctree:: 17 | :caption: Advanced Usage 18 | :hidden: 19 | 20 | advanced_usage/concepts 21 | advanced_usage/csv 22 | advanced_usage/docker 23 | 24 | .. toctree:: 25 | :caption: Resources 26 | :hidden: 27 | 28 | API Reference 29 | contributing 30 | authors 31 | history 32 | 33 | Indices and tables 34 | ================== 35 | * :ref:`genindex` 36 | * :ref:`modindex` 37 | * :ref:`search` 38 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=draco 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../README.md 2 | -------------------------------------------------------------------------------- /draco/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Top-level package for Draco.""" 4 | 5 | __author__ = """MIT Data To AI Lab""" 6 | __email__ = 'dailabmit@gmail.com' 7 | __version__ = '0.3.1.dev0' 8 | 9 | import os 10 | 11 | from draco.pipeline import DracoPipeline, get_pipelines 12 | 13 | _BASE_PATH = os.path.abspath(os.path.dirname(__file__)) 14 | MLBLOCKS_PRIMITIVES = os.path.join(_BASE_PATH, 'primitives') 15 | MLBLOCKS_PIPELINES = tuple( 16 | dirname 17 | for dirname, _, _ in os.walk(os.path.join(_BASE_PATH, 'pipelines')) 18 | ) 19 | 20 | __all__ = ( 21 | 'DracoPipeline', 22 | 'get_pipelines', 23 | ) 24 | -------------------------------------------------------------------------------- /draco/db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import getpass 4 | import json 5 | import logging 6 | from datetime import datetime 7 | 8 | from pymongo import MongoClient 9 | 10 | from draco.utils import remove_dots, restore_dots 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | class MongoDB(object): 16 | 17 | def __init__(self, database=None, config=None, **kwargs): 18 | if config: 19 | with open(config, 'r') as f: 20 | config = json.load(f) 21 | else: 22 | config = kwargs 23 | 24 | host = config.get('host', 'localhost') 25 | port = config.get('port', 27017) 26 | user = config.get('user') 27 | password = config.get('password') 28 | database = database or config.get('database', 'test') 29 | auth_database = config.get('auth_database', 'admin') 30 | 31 | if user and not password: 32 | password = getpass.getpass(prompt='Please insert database password: ') 33 | 34 | client = MongoClient( 35 | host=host, 36 | port=port, 37 | username=user, 38 | password=password, 39 | authSource=auth_database 40 | ) 41 | 42 | LOGGER.info("Setting up a MongoClient %s", client) 43 | 44 | self._db = client[database] 45 | 46 | def load_template(self, template_name): 47 | match = { 48 | 'name': template_name 49 | } 50 | 51 | cursor = self._db.templates.find(match) 52 | templates = list(cursor.sort('insert_ts', -1).limit(1)) 53 | 54 | if templates: 55 | return restore_dots(templates[0]) 56 | 57 | def insert_template(self, template): 58 | if 'name' not in template: 59 | raise ValueError("Templates need to have a name key") 60 | 61 | template['insert_ts'] = datetime.utcnow() 62 | template = remove_dots(template) 63 | 64 | self._db.templates.insert_one(template) 65 | 66 | def insert_pipeline(self, candidate, score, dataset, table, column): 67 | 68 | pipeline = candidate.to_dict() 69 | 70 | pipeline['score'] = score 71 | pipeline['dataset'] = dataset 72 | pipeline['table'] = table 73 | pipeline['column'] = column 74 | pipeline['insert_ts'] = datetime.utcnow() 75 | 76 | pipeline = remove_dots(pipeline) 77 | 78 | self._db.pipelines.insert_one(pipeline) 79 | 
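# --- Illustrative usage sketch (not part of the original db.py) --------------
# A minimal example of how the MongoDB helper defined above could be used.
# The connection details, the database name and the 'lstm' template contents
# are assumptions made up for this sketch; use whatever matches your own
# MongoDB setup and templates.
#
#     from draco.db import MongoDB
#
#     db = MongoDB(database='draco', host='localhost', port=27017)
#     db.insert_template({'name': 'lstm', 'primitives': ['keras.Sequential.LSTMTimeSeriesClassifier']})
#     template = db.load_template('lstm')  # latest stored template with that name, or None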
-------------------------------------------------------------------------------- /draco/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import os 5 | 6 | import pandas as pd 7 | 8 | LOGGER = logging.getLogger(__name__) 9 | 10 | S3_URL = 'https://sintel-draco.s3.amazonaws.com/' 11 | DEMO_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'demo') 12 | 13 | _FILES = { 14 | 'DEFAULT': [ 15 | ('target_times', 'cutoff_time'), 16 | ('readings', 'timestamp') 17 | ], 18 | 'RUL': [ 19 | ('rul_train_target_times', 'cutoff_time'), 20 | ('rul_test_target_times', 'cutoff_time'), 21 | ('rul_readings', 'timestamp') 22 | ] 23 | } 24 | 25 | def _load_or_download(filename, dates): 26 | filename += '.csv.gz' 27 | file_path = os.path.join(DEMO_PATH, filename) 28 | if os.path.exists(file_path): 29 | return pd.read_csv(file_path, compression='gzip', parse_dates=[dates]) 30 | 31 | os.makedirs(DEMO_PATH, exist_ok=True) 32 | url = S3_URL + filename 33 | 34 | LOGGER.info('Downloading %s from %s', filename, url) 35 | data = pd.read_csv(url, compression='gzip', parse_dates=[dates]) 36 | data.to_csv(file_path, index=False, compression='gzip') 37 | 38 | return data 39 | 40 | 41 | def load_demo(name='default', load_readings=True): 42 | """Load the demo included in the Draco project. 43 | 44 | The first time that this function is executed, the data will be downloaded 45 | and cached inside the `draco/demo` folder. 46 | Subsequent calls will load the cached data instead of downloading it again. 47 | 48 | Args: 49 | name (str): 50 | Name of the dataset to load. If "RUL", load NASA's CMAPSS dataset 51 | https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan. 52 | If "default" then load default demo. 53 | load_readings (bool): 54 | Whether to load the ``readings`` table or not. 55 | 56 | Returns: 57 | tuple[pandas.DataFrame]: 58 | target_times and readings tables 59 | """ 60 | files = _FILES[name.upper()] 61 | 62 | if not load_readings: 63 | files = files[:-1] 64 | 65 | output = list() 66 | for filename, dates in files: 67 | output.append(_load_or_download(filename, dates)) 68 | 69 | return tuple(output) 70 | 71 | 72 | def generate_raw_readings(output_path='demo'): 73 | """Generate raw readings based on the demo data. 74 | 75 | Args: 76 | path (str): 77 | Path where the readings will be generated. 
78 | """ 79 | target_times, readings = load_demo() 80 | 81 | for turbine_id in target_times.turbine_id.unique(): 82 | turbine_path = os.path.join(output_path, turbine_id) 83 | os.makedirs(turbine_path, exist_ok=True) 84 | data = readings[readings.turbine_id == turbine_id] 85 | for month in range(1, 13): 86 | month_data = data[data.timestamp.dt.month == month].copy() 87 | month_data['timestamp'] = month_data['timestamp'].dt.strftime('%m/%d/%y %H:%M:%S') 88 | month_path = os.path.join(turbine_path, '2013-{:02d}.csv'.format(month)) 89 | LOGGER.info('Generating file %s', month_path) 90 | month_data[['signal_id', 'timestamp', 'value']].to_csv(month_path, index=False) 91 | 92 | return target_times 93 | -------------------------------------------------------------------------------- /draco/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from draco.loaders.csv import CSVLoader 2 | 3 | __all__ = ( 4 | 'CSVLoader', 5 | ) 6 | -------------------------------------------------------------------------------- /draco/loaders/csv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import dask 5 | import pandas as pd 6 | 7 | from draco.targets import drop_duplicates, select_valid_targets 8 | 9 | LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class CSVLoader: 13 | """Load the required readings from CSV files. 14 | 15 | The CSVLoader class is responsible for analyzing the target_times table 16 | and then load the required readings from CSV files. 17 | 18 | Also, optionally, it can perform a resampling aggregation while loading 19 | the data, reducing the amount of memory requirements. 20 | 21 | The CSVLoader class uses Dask to parallelize all the IO and resampling 22 | computation and reduce loading times. 23 | 24 | Args: 25 | readings_path (str): 26 | Path to the readings folder, where a folder exist for each turbine. 27 | rule (str): 28 | Resampling rule, as expected by ``DataFrame.resmple``. The rule is a 29 | string representation of a TimeDelta, which includes a number and a 30 | unit. For example: ``3d``, ``1w``, ``6h``. 31 | If ``None``, resampling is disabled. 32 | aggregation (str): 33 | Name of the aggregation to perform during the resampling. 34 | unstack (bool): 35 | Whether to unstack the resampled data, generating one column per signal. 36 | Only used when resampling. Defaults to ``False``. 
37 | """ 38 | 39 | DEFAULT_DATETIME_FMT = '%m/%d/%y %H:%M:%S' 40 | DEFAULT_FILENAME_FMT = '%Y-%m.csv' 41 | 42 | def __init__(self, readings_path='.', rule=None, aggregation='mean', unstack=False, 43 | datetime_fmt=DEFAULT_DATETIME_FMT, filename_fmt=DEFAULT_FILENAME_FMT): 44 | self._readings_path = readings_path 45 | self._rule = rule 46 | self._aggregation = aggregation 47 | self._unstack = unstack 48 | self._datetime_fmt = datetime_fmt 49 | self._filename_fmt = filename_fmt 50 | 51 | @dask.delayed 52 | def __filter_by_signal(self, readings, signals): 53 | if signals is not None: 54 | LOGGER.debug('Filtering by signal') 55 | readings = readings[readings.signal_id.isin(signals)] 56 | 57 | try: 58 | readings['value'] = readings['value'].astype(float) 59 | except ValueError: 60 | signals = readings[readings['value'].str.isnumeric()].signal_id.unique() 61 | raise ValueError('Signals contain non-numerical values: {}'.format(signals)) 62 | 63 | LOGGER.debug('Selected %s readings by signal', len(readings)) 64 | 65 | return readings.copy() 66 | 67 | @dask.delayed 68 | def __filter_by_timestamp(self, readings, timestamps): 69 | LOGGER.debug('Parsing timestamps') 70 | readings_ts = pd.to_datetime(readings['timestamp'], format=self._datetime_fmt) 71 | readings['timestamp'] = readings_ts 72 | 73 | LOGGER.debug('Filtering by timestamp') 74 | 75 | related = [False] * len(readings) 76 | for row in timestamps.itertuples(): 77 | lower = row.start <= readings_ts 78 | upper = readings_ts <= row.stop 79 | related |= lower & upper 80 | 81 | readings = readings[related] 82 | 83 | LOGGER.debug('Selected %s readings by timestamp', len(readings)) 84 | 85 | return readings.copy() 86 | 87 | @dask.delayed 88 | def __load_readings_file(self, turbine_file, timestamps, signals): 89 | LOGGER.debug('Loading file %s', turbine_file) 90 | data = pd.read_csv(turbine_file, low_memory=False) 91 | data.columns = data.columns.str.lower() 92 | data = data.rename(columns={'signal': 'signal_id'}) 93 | 94 | if 'unnamed: 0' in data.columns: 95 | # Someone forgot to drop the index before 96 | # storing the DataFrame as a CSV 97 | del data['unnamed: 0'] 98 | 99 | LOGGER.debug('Loaded %s readings from file %s', len(data), turbine_file) 100 | 101 | return data 102 | 103 | @dask.delayed 104 | def __consolidate(self, readings, turbine_id): 105 | readings = pd.concat(readings, ignore_index=True) 106 | readings.insert(0, 'turbine_id', turbine_id) 107 | 108 | LOGGER.info('Loaded %s readings from turbine %s', len(readings), turbine_id) 109 | 110 | return readings 111 | 112 | def _get_filenames(self, turbine_path, timestamps): 113 | min_csv = timestamps.start.dt.strftime(self._filename_fmt) 114 | max_csv = timestamps.stop.dt.strftime(self._filename_fmt) 115 | 116 | for filename in sorted(os.listdir(turbine_path)): 117 | if ((min_csv <= filename) & (filename <= max_csv)).any(): 118 | yield os.path.join(turbine_path, filename) 119 | 120 | @staticmethod 121 | def _join_names(names): 122 | """Join the names of a multi-level index with an underscore.""" 123 | 124 | levels = (str(name) for name in names if name != '') 125 | return '_'.join(levels) 126 | 127 | @dask.delayed 128 | def __resample(self, readings): 129 | LOGGER.info('Resampling: %s - %s', self._rule, self._aggregation) 130 | grouped = readings.groupby('signal_id') 131 | dfr = grouped.resample(rule=self._rule, on='timestamp') 132 | agg = dfr.agg(self._aggregation) 133 | 134 | LOGGER.info('%s readings reduced to %s', len(readings), len(agg)) 135 | 136 | if self._unstack: 137 | agg = 
agg.unstack(level='signal_id').reset_index() 138 | agg.columns = agg.columns.map(self._join_names) 139 | return agg 140 | else: 141 | return agg.reset_index() 142 | 143 | def _load_turbine(self, turbine_id, timestamps, signals=None): 144 | if 'turbine_id' in timestamps: 145 | timestamps = timestamps[timestamps.turbine_id == turbine_id] 146 | 147 | turbine_path = os.path.join(self._readings_path, turbine_id) 148 | filenames = self._get_filenames(turbine_path, timestamps) 149 | 150 | readings = list() 151 | for filename in filenames: 152 | file_readings = self.__load_readings_file(filename, timestamps, signals) 153 | file_readings = self.__filter_by_signal(file_readings, signals) 154 | file_readings = self.__filter_by_timestamp(file_readings, timestamps) 155 | 156 | if self._rule: 157 | file_readings = self.__resample(file_readings) 158 | 159 | readings.append(file_readings) 160 | 161 | if readings: 162 | readings = self.__consolidate(readings, turbine_id) 163 | 164 | return readings 165 | 166 | @staticmethod 167 | def _get_timestamps(target_times, window_size): 168 | cutoff_times = target_times.cutoff_time 169 | min_times = cutoff_times - window_size 170 | 171 | return pd.DataFrame({ 172 | 'turbine_id': target_times.turbine_id, 173 | 'start': min_times, 174 | 'stop': cutoff_times, 175 | }) 176 | 177 | def load(self, target_times, window_size, signals=None, debug=False, select_valid=True): 178 | """Load the readings needed for the given target_times and window_size. 179 | 180 | Optionally filter the signals that are loaded and discard the rest. 181 | 182 | Args: 183 | target_times (str or pandas.DataFrame): 184 | target_times ``DataFrame`` or path to the corresponding CSV file. 185 | The table must have three volumns, ``turbine_id``, ``target`` and 186 | ``cutoff_time``. 187 | window_size (str): 188 | Amount of data to load before each cutoff time, specified as a string 189 | representation of a TimeDelta, which includes a number and a 190 | unit. For example: ``3d``, ``1w``, ``6h``. 191 | signals (list or pandas.DataFrame): 192 | List of signal names or table that has a ``signal_id`` column to 193 | use as the signal names list. 194 | debug (bool): 195 | Force single thread execution for easy debugging. Defaults to ``False``. 196 | 197 | Returns: 198 | pandas.DataFrame: 199 | Table of readings for the target times, including the columns ``turbine_id``, 200 | ``signal_id``, ``timestamp`` and ``value``. 
201 | """ 202 | if isinstance(target_times, str): 203 | target_times = pd.read_csv(target_times) 204 | target_times['cutoff_time'] = pd.to_datetime(target_times['cutoff_time']) 205 | 206 | target_times = drop_duplicates(target_times) 207 | 208 | if isinstance(signals, pd.DataFrame): 209 | signals = signals.signal_id 210 | 211 | window_size = pd.to_timedelta(window_size) 212 | timestamps = self._get_timestamps(target_times, window_size) 213 | 214 | readings = list() 215 | for turbine_id in timestamps.turbine_id.unique(): 216 | readings.append(self._load_turbine(turbine_id, timestamps, signals)) 217 | 218 | dask_scheduler = 'single-threaded' if debug else None 219 | computed = dask.compute(*readings, scheduler=dask_scheduler) 220 | 221 | found_readings = [c for c in computed if len(c)] 222 | if not found_readings: 223 | msg = 'No readings found for the given target times in {}'.format(self._readings_path) 224 | raise ValueError(msg) 225 | 226 | readings = pd.concat(found_readings, ignore_index=True, sort=False) 227 | 228 | LOGGER.info('Loaded %s turbine readings', len(readings)) 229 | 230 | if select_valid: 231 | target_times = select_valid_targets(target_times, readings, window_size, self._rule) 232 | return target_times, readings 233 | 234 | return readings 235 | -------------------------------------------------------------------------------- /draco/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | import numpy as np 5 | from sklearn.metrics import (accuracy_score, f1_score, mean_absolute_error, 6 | mean_squared_error, roc_curve, roc_auc_score, r2_score) 7 | 8 | LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | def f1_macro(exp, obs): 12 | return f1_score(exp, obs, average='macro') 13 | 14 | 15 | def threshold_score(ground_truth, probabilities, tpr): 16 | roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1) 17 | try: 18 | index = np.where(roc_tpr >= tpr)[0][0] 19 | except: 20 | LOGGER.warn('Could not find a threshold that satisfies the requested True Positive Rate') 21 | index = -1 22 | 23 | return roc_threshold[index] 24 | 25 | 26 | def tpr_score(ground_truth, probabilities, threshold): 27 | roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1) 28 | try: 29 | index = np.where(roc_threshold >= threshold)[0][0] 30 | except: 31 | LOGGER.warn('Could not find a tpr that satisfies the requested threshold') 32 | index = -1 33 | 34 | return roc_tpr[index] 35 | 36 | 37 | def fpr_score(ground_truth, probabilities, tpr=None, threshold=None): 38 | """Compute the False Positive Rate associated with the given True Positive Rate. 39 | 40 | This metric computes the False Positive Rate that needs to be assumed in order 41 | to achieve the desired True Positive Rate. 42 | The metric is computed by finding the minimum necessary threshold to ensure 43 | that the TPR is satisfied and then computing the associated FPR. The final output 44 | is 1 minus the found FPR to produce a maximization score between 0 and 1. 45 | 46 | Args: 47 | ground_truth (numpy.ndarray): 48 | ``numpy.ndarray`` of the known values for the given predictions. 49 | probabilities (numpy.ndarray): 50 | ``numpy.ndarray`` with the generated predictions in probability. 51 | tpr (float): 52 | ``float`` value representing the percentage of True Positive Rate 53 | to be satisfied. 54 | 55 | Returns: 56 | float: 57 | Value between 0 and 1, where bigger is better. 
58 | """ 59 | roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1) 60 | try: 61 | if tpr: 62 | index = np.where(roc_tpr >= tpr)[0][0] 63 | elif threshold: 64 | index = np.where(roc_threshold >= threshold)[0][0] 65 | 66 | except: 67 | LOGGER.warn('Could not find a threshold that satisfies the requested True Positive Rate') 68 | index = -1 69 | 70 | return 1 - roc_fpr[index] 71 | 72 | 73 | METRICS = { 74 | 'accuracy': (accuracy_score, False), 75 | 'f1': (f1_score, False), 76 | 'f1_macro': (f1_macro, False), 77 | 'r2': (r2_score, False), 78 | 'mse': (mean_squared_error, True), 79 | 'mae': (mean_absolute_error, True), 80 | 'fpr': (fpr_score, False), 81 | 'roc_auc_score': (roc_auc_score, False) 82 | } 83 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier" 12 | ], 13 | "init_params": { 14 | "pandas.DataFrame.pop#1": { 15 | "item": "turbine_id" 16 | }, 17 | "pandas.DataFrame.pop#2": { 18 | "item": "timestamp" 19 | }, 20 | "sklearn.preprocessing.MinMaxScaler#1": { 21 | "feature_range": [ 22 | -1, 23 | 1 24 | ] 25 | }, 26 | "pandas.DataFrame#1": { 27 | "index": null, 28 | "columns": null 29 | }, 30 | "pandas.DataFrame.set#1": { 31 | "key": "turbine_id" 32 | }, 33 | "pandas.DataFrame.set#2": { 34 | "key": "timestamp" 35 | }, 36 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 37 | "window_size": 24, 38 | "cutoff_time": "cutoff_time", 39 | "time_index": "timestamp" 40 | }, 41 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 42 | "epochs": 35, 43 | "verbose": false 44 | } 45 | }, 46 | "input_names": { 47 | "pandas.DataFrame.pop#1": { 48 | "X": "readings" 49 | }, 50 | "pandas.DataFrame.pop#2": { 51 | "X": "readings" 52 | }, 53 | "sklearn.impute.SimpleImputer#1": { 54 | "X": "readings" 55 | }, 56 | "sklearn.preprocessing.MinMaxScaler#1": { 57 | "X": "readings" 58 | }, 59 | "pandas.DataFrame#1": { 60 | "X": "readings" 61 | }, 62 | "pandas.DataFrame.set#1": { 63 | "X": "readings", 64 | "value": "turbine_id" 65 | }, 66 | "pandas.DataFrame.set#2": { 67 | "X": "readings", 68 | "value": "timestamp" 69 | }, 70 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 71 | "timeseries": "readings" 72 | } 73 | }, 74 | "output_names": { 75 | "pandas.DataFrame.pop#1": { 76 | "item": "turbine_id" 77 | }, 78 | "pandas.DataFrame.pop#2": { 79 | "item": "timestamp" 80 | }, 81 | "sklearn.impute.SimpleImputer#1": { 82 | "X": "readings" 83 | }, 84 | "sklearn.preprocessing.MinMaxScaler#1": { 85 | "X": "readings" 86 | }, 87 | "pandas.DataFrame#1": { 88 | "X": "readings" 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm_prob.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | 
"pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier", 12 | "numpy.take" 13 | ], 14 | "init_params": { 15 | "pandas.DataFrame.pop#1": { 16 | "item": "turbine_id" 17 | }, 18 | "pandas.DataFrame.pop#2": { 19 | "item": "timestamp" 20 | }, 21 | "sklearn.preprocessing.MinMaxScaler#1": { 22 | "feature_range": [ 23 | -1, 24 | 1 25 | ] 26 | }, 27 | "pandas.DataFrame#1": { 28 | "index": null, 29 | "columns": null 30 | }, 31 | "pandas.DataFrame.set#1": { 32 | "key": "turbine_id" 33 | }, 34 | "pandas.DataFrame.set#2": { 35 | "key": "timestamp" 36 | }, 37 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 38 | "window_size": 24, 39 | "cutoff_time": "cutoff_time", 40 | "time_index": "timestamp" 41 | }, 42 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 43 | "epochs": 35, 44 | "verbose": false, 45 | "classification": false, 46 | "loss": "keras.losses.binary_crossentropy" 47 | }, 48 | "numpy.take#1": { 49 | "indices": 1, 50 | "axis": 1 51 | } 52 | }, 53 | "input_names": { 54 | "pandas.DataFrame.pop#1": { 55 | "X": "readings" 56 | }, 57 | "pandas.DataFrame.pop#2": { 58 | "X": "readings" 59 | }, 60 | "sklearn.impute.SimpleImputer#1": { 61 | "X": "readings" 62 | }, 63 | "sklearn.preprocessing.MinMaxScaler#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.set#1": { 70 | "X": "readings", 71 | "value": "turbine_id" 72 | }, 73 | "pandas.DataFrame.set#2": { 74 | "X": "readings", 75 | "value": "timestamp" 76 | }, 77 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 78 | "timeseries": "readings" 79 | } 80 | }, 81 | "output_names": { 82 | "pandas.DataFrame.pop#1": { 83 | "item": "turbine_id" 84 | }, 85 | "pandas.DataFrame.pop#2": { 86 | "item": "timestamp" 87 | }, 88 | "sklearn.impute.SimpleImputer#1": { 89 | "X": "readings" 90 | }, 91 | "sklearn.preprocessing.MinMaxScaler#1": { 92 | "X": "readings" 93 | }, 94 | "pandas.DataFrame#1": { 95 | "X": "readings" 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier", 14 | "numpy.take" 15 | ], 16 | "init_params": { 17 | "pandas.DataFrame.resample#1": { 18 | "rule": "3600s", 19 | "on": "timestamp", 20 | "groupby": [ 21 | "turbine_id", 22 | "signal_id" 23 | ], 24 | "aggregation": "mean", 25 | "reset_index": false 26 | }, 27 | "pandas.DataFrame.unstack#1": { 28 | "level": "signal_id", 29 | "reset_index": true 30 | }, 31 | "pandas.DataFrame.pop#1": { 32 | "item": "turbine_id" 33 | }, 34 | "pandas.DataFrame.pop#2": { 35 | "item": "timestamp" 36 | }, 37 | "sklearn.preprocessing.MinMaxScaler#1": { 38 | "feature_range": [ 39 | -1, 40 | 1 41 | ] 42 | }, 43 | "pandas.DataFrame#1": { 44 | "index": null, 45 | "columns": null 46 | }, 47 | "pandas.DataFrame.set#1": { 48 | "key": "turbine_id" 49 | }, 50 | "pandas.DataFrame.set#2": { 51 | "key": 
"timestamp" 52 | }, 53 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 54 | "window_size": 24, 55 | "cutoff_time": "cutoff_time", 56 | "time_index": "timestamp" 57 | }, 58 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 59 | "epochs": 35, 60 | "verbose": false, 61 | "classification": false, 62 | "loss": "keras.losses.binary_crossentropy" 63 | }, 64 | "numpy.take#1": { 65 | "indices": 1, 66 | "axis": 1 67 | } 68 | }, 69 | "input_names": { 70 | "pandas.DataFrame.resample#1": { 71 | "X": "readings" 72 | }, 73 | "pandas.DataFrame.unstack#1": { 74 | "X": "readings" 75 | }, 76 | "pandas.DataFrame.pop#1": { 77 | "X": "readings" 78 | }, 79 | "pandas.DataFrame.pop#2": { 80 | "X": "readings" 81 | }, 82 | "sklearn.impute.SimpleImputer#1": { 83 | "X": "readings" 84 | }, 85 | "sklearn.preprocessing.MinMaxScaler#1": { 86 | "X": "readings" 87 | }, 88 | "pandas.DataFrame#1": { 89 | "X": "readings" 90 | }, 91 | "pandas.DataFrame.set#1": { 92 | "X": "readings", 93 | "value": "turbine_id" 94 | }, 95 | "pandas.DataFrame.set#2": { 96 | "X": "readings", 97 | "value": "timestamp" 98 | }, 99 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 100 | "timeseries": "readings" 101 | } 102 | }, 103 | "output_names": { 104 | "pandas.DataFrame.resample#1": { 105 | "X": "readings" 106 | }, 107 | "pandas.DataFrame.unstack#1": { 108 | "X": "readings" 109 | }, 110 | "pandas.DataFrame.pop#1": { 111 | "item": "turbine_id" 112 | }, 113 | "pandas.DataFrame.pop#2": { 114 | "item": "timestamp" 115 | }, 116 | "sklearn.impute.SimpleImputer#1": { 117 | "X": "readings" 118 | }, 119 | "sklearn.preprocessing.MinMaxScaler#1": { 120 | "X": "readings" 121 | }, 122 | "pandas.DataFrame#1": { 123 | "X": "readings" 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier" 14 | ], 15 | "init_params": { 16 | "pandas.DataFrame.resample#1": { 17 | "rule": "3600s", 18 | "on": "timestamp", 19 | "groupby": [ 20 | "turbine_id", 21 | "signal_id" 22 | ], 23 | "aggregation": "mean", 24 | "reset_index": false 25 | }, 26 | "pandas.DataFrame.unstack#1": { 27 | "level": "signal_id", 28 | "reset_index": true 29 | }, 30 | "pandas.DataFrame.pop#1": { 31 | "item": "turbine_id" 32 | }, 33 | "pandas.DataFrame.pop#2": { 34 | "item": "timestamp" 35 | }, 36 | "sklearn.preprocessing.MinMaxScaler#1": { 37 | "feature_range": [ 38 | -1, 39 | 1 40 | ] 41 | }, 42 | "pandas.DataFrame#1": { 43 | "index": null, 44 | "columns": null 45 | }, 46 | "pandas.DataFrame.set#1": { 47 | "key": "turbine_id" 48 | }, 49 | "pandas.DataFrame.set#2": { 50 | "key": "timestamp" 51 | }, 52 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 53 | "window_size": 24, 54 | "cutoff_time": "cutoff_time", 55 | "time_index": "timestamp" 56 | }, 57 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 58 | "epochs": 35, 59 | "verbose": false 60 | } 61 | }, 62 | "input_names": { 63 | 
"pandas.DataFrame.resample#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame.unstack#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.pop#1": { 70 | "X": "readings" 71 | }, 72 | "pandas.DataFrame.pop#2": { 73 | "X": "readings" 74 | }, 75 | "sklearn.impute.SimpleImputer#1": { 76 | "X": "readings" 77 | }, 78 | "sklearn.preprocessing.MinMaxScaler#1": { 79 | "X": "readings" 80 | }, 81 | "pandas.DataFrame#1": { 82 | "X": "readings" 83 | }, 84 | "pandas.DataFrame.set#1": { 85 | "X": "readings", 86 | "value": "turbine_id" 87 | }, 88 | "pandas.DataFrame.set#2": { 89 | "X": "readings", 90 | "value": "timestamp" 91 | }, 92 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 93 | "timeseries": "readings" 94 | } 95 | }, 96 | "output_names": { 97 | "pandas.DataFrame.resample#1": { 98 | "X": "readings" 99 | }, 100 | "pandas.DataFrame.unstack#1": { 101 | "X": "readings" 102 | }, 103 | "pandas.DataFrame.pop#1": { 104 | "item": "turbine_id" 105 | }, 106 | "pandas.DataFrame.pop#2": { 107 | "item": "timestamp" 108 | }, 109 | "sklearn.impute.SimpleImputer#1": { 110 | "X": "readings" 111 | }, 112 | "sklearn.preprocessing.MinMaxScaler#1": { 113 | "X": "readings" 114 | }, 115 | "pandas.DataFrame#1": { 116 | "X": "readings" 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /draco/pipelines/dummy/dummy.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "sklearn.impute.SimpleImputer", 4 | "sklearn.preprocessing.MinMaxScaler", 5 | "sklearn.linear_model.LogisticRegression" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.LSTMTimeSeriesClassifier" 12 | ], 13 | "init_params": { 14 | "pandas.DataFrame.pop#1": { 15 | "item": "turbine_id" 16 | }, 17 | "pandas.DataFrame.pop#2": { 18 | "item": "timestamp" 19 | }, 20 | "sklearn.preprocessing.MinMaxScaler#1": { 21 | "feature_range": [ 22 | -1, 23 | 1 24 | ] 25 | }, 26 | "pandas.DataFrame#1": { 27 | "index": null, 28 | "columns": null 29 | }, 30 | "pandas.DataFrame.set#1": { 31 | "key": "turbine_id" 32 | }, 33 | "pandas.DataFrame.set#2": { 34 | "key": "timestamp" 35 | }, 36 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 37 | "window_size": 24, 38 | "cutoff_time": "cutoff_time", 39 | "time_index": "timestamp" 40 | }, 41 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 42 | "epochs": 35, 43 | "verbose": false 44 | } 45 | }, 46 | "input_names": { 47 | "pandas.DataFrame.pop#1": { 48 | "X": "readings" 49 | }, 50 | "pandas.DataFrame.pop#2": { 51 | "X": "readings" 52 | }, 53 | "sklearn.impute.SimpleImputer#1": { 54 | "X": "readings" 55 | }, 56 | "sklearn.preprocessing.MinMaxScaler#1": { 57 | "X": "readings" 58 | }, 59 | "pandas.DataFrame#1": { 60 | "X": "readings" 61 | }, 62 | "pandas.DataFrame.set#1": { 63 | "X": "readings", 64 | "value": "turbine_id" 65 | }, 66 | "pandas.DataFrame.set#2": { 67 | "X": "readings", 68 | "value": "timestamp" 69 | }, 70 | 
"mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 71 | "timeseries": "readings" 72 | } 73 | }, 74 | "output_names": { 75 | "pandas.DataFrame.pop#1": { 76 | "item": "turbine_id" 77 | }, 78 | "pandas.DataFrame.pop#2": { 79 | "item": "timestamp" 80 | }, 81 | "sklearn.impute.SimpleImputer#1": { 82 | "X": "readings" 83 | }, 84 | "sklearn.preprocessing.MinMaxScaler#1": { 85 | "X": "readings" 86 | }, 87 | "pandas.DataFrame#1": { 88 | "X": "readings" 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm_prob.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.LSTMTimeSeriesClassifier", 12 | "numpy.take" 13 | ], 14 | "init_params": { 15 | "pandas.DataFrame.pop#1": { 16 | "item": "turbine_id" 17 | }, 18 | "pandas.DataFrame.pop#2": { 19 | "item": "timestamp" 20 | }, 21 | "sklearn.preprocessing.MinMaxScaler#1": { 22 | "feature_range": [ 23 | -1, 24 | 1 25 | ] 26 | }, 27 | "pandas.DataFrame#1": { 28 | "index": null, 29 | "columns": null 30 | }, 31 | "pandas.DataFrame.set#1": { 32 | "key": "turbine_id" 33 | }, 34 | "pandas.DataFrame.set#2": { 35 | "key": "timestamp" 36 | }, 37 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 38 | "window_size": 24, 39 | "cutoff_time": "cutoff_time", 40 | "time_index": "timestamp" 41 | }, 42 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 43 | "epochs": 35, 44 | "verbose": false, 45 | "classification": false, 46 | "loss": "keras.losses.binary_crossentropy" 47 | }, 48 | "numpy.take#1": { 49 | "indices": 1, 50 | "axis": 1 51 | } 52 | }, 53 | "input_names": { 54 | "pandas.DataFrame.pop#1": { 55 | "X": "readings" 56 | }, 57 | "pandas.DataFrame.pop#2": { 58 | "X": "readings" 59 | }, 60 | "sklearn.impute.SimpleImputer#1": { 61 | "X": "readings" 62 | }, 63 | "sklearn.preprocessing.MinMaxScaler#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.set#1": { 70 | "X": "readings", 71 | "value": "turbine_id" 72 | }, 73 | "pandas.DataFrame.set#2": { 74 | "X": "readings", 75 | "value": "timestamp" 76 | }, 77 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 78 | "timeseries": "readings" 79 | } 80 | }, 81 | "output_names": { 82 | "pandas.DataFrame.pop#1": { 83 | "item": "turbine_id" 84 | }, 85 | "pandas.DataFrame.pop#2": { 86 | "item": "timestamp" 87 | }, 88 | "sklearn.impute.SimpleImputer#1": { 89 | "X": "readings" 90 | }, 91 | "sklearn.preprocessing.MinMaxScaler#1": { 92 | "X": "readings" 93 | }, 94 | "pandas.DataFrame#1": { 95 | "X": "readings" 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm_prob_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | 
"mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.LSTMTimeSeriesClassifier", 14 | "numpy.take" 15 | ], 16 | "init_params": { 17 | "pandas.DataFrame.resample#1": { 18 | "rule": "3600s", 19 | "on": "timestamp", 20 | "groupby": [ 21 | "turbine_id", 22 | "signal_id" 23 | ], 24 | "aggregation": "mean", 25 | "reset_index": false 26 | }, 27 | "pandas.DataFrame.unstack#1": { 28 | "level": "signal_id", 29 | "reset_index": true 30 | }, 31 | "pandas.DataFrame.pop#1": { 32 | "item": "turbine_id" 33 | }, 34 | "pandas.DataFrame.pop#2": { 35 | "item": "timestamp" 36 | }, 37 | "sklearn.preprocessing.MinMaxScaler#1": { 38 | "feature_range": [ 39 | -1, 40 | 1 41 | ] 42 | }, 43 | "pandas.DataFrame#1": { 44 | "index": null, 45 | "columns": null 46 | }, 47 | "pandas.DataFrame.set#1": { 48 | "key": "turbine_id" 49 | }, 50 | "pandas.DataFrame.set#2": { 51 | "key": "timestamp" 52 | }, 53 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 54 | "window_size": 24, 55 | "cutoff_time": "cutoff_time", 56 | "time_index": "timestamp" 57 | }, 58 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 59 | "epochs": 35, 60 | "verbose": false, 61 | "classification": false, 62 | "loss": "keras.losses.binary_crossentropy" 63 | }, 64 | "numpy.take#1": { 65 | "indices": 1, 66 | "axis": 1 67 | } 68 | }, 69 | "input_names": { 70 | "pandas.DataFrame.resample#1": { 71 | "X": "readings" 72 | }, 73 | "pandas.DataFrame.unstack#1": { 74 | "X": "readings" 75 | }, 76 | "pandas.DataFrame.pop#1": { 77 | "X": "readings" 78 | }, 79 | "pandas.DataFrame.pop#2": { 80 | "X": "readings" 81 | }, 82 | "sklearn.impute.SimpleImputer#1": { 83 | "X": "readings" 84 | }, 85 | "sklearn.preprocessing.MinMaxScaler#1": { 86 | "X": "readings" 87 | }, 88 | "pandas.DataFrame#1": { 89 | "X": "readings" 90 | }, 91 | "pandas.DataFrame.set#1": { 92 | "X": "readings", 93 | "value": "turbine_id" 94 | }, 95 | "pandas.DataFrame.set#2": { 96 | "X": "readings", 97 | "value": "timestamp" 98 | }, 99 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 100 | "timeseries": "readings" 101 | } 102 | }, 103 | "output_names": { 104 | "pandas.DataFrame.resample#1": { 105 | "X": "readings" 106 | }, 107 | "pandas.DataFrame.unstack#1": { 108 | "X": "readings" 109 | }, 110 | "pandas.DataFrame.pop#1": { 111 | "item": "turbine_id" 112 | }, 113 | "pandas.DataFrame.pop#2": { 114 | "item": "timestamp" 115 | }, 116 | "sklearn.impute.SimpleImputer#1": { 117 | "X": "readings" 118 | }, 119 | "sklearn.preprocessing.MinMaxScaler#1": { 120 | "X": "readings" 121 | }, 122 | "pandas.DataFrame#1": { 123 | "X": "readings" 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.LSTMTimeSeriesClassifier" 14 | ], 15 | "init_params": { 16 | "pandas.DataFrame.resample#1": { 17 | "rule": "3600s", 18 | "on": "timestamp", 19 | "groupby": [ 20 | "turbine_id", 21 | "signal_id" 22 | ], 23 | "aggregation": "mean", 24 | "reset_index": false 25 | }, 26 | 
"pandas.DataFrame.unstack#1": { 27 | "level": "signal_id", 28 | "reset_index": true 29 | }, 30 | "pandas.DataFrame.pop#1": { 31 | "item": "turbine_id" 32 | }, 33 | "pandas.DataFrame.pop#2": { 34 | "item": "timestamp" 35 | }, 36 | "sklearn.preprocessing.MinMaxScaler#1": { 37 | "feature_range": [ 38 | -1, 39 | 1 40 | ] 41 | }, 42 | "pandas.DataFrame#1": { 43 | "index": null, 44 | "columns": null 45 | }, 46 | "pandas.DataFrame.set#1": { 47 | "key": "turbine_id" 48 | }, 49 | "pandas.DataFrame.set#2": { 50 | "key": "timestamp" 51 | }, 52 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 53 | "window_size": 24, 54 | "cutoff_time": "cutoff_time", 55 | "time_index": "timestamp" 56 | }, 57 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 58 | "epochs": 35, 59 | "verbose": false 60 | } 61 | }, 62 | "input_names": { 63 | "pandas.DataFrame.resample#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame.unstack#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.pop#1": { 70 | "X": "readings" 71 | }, 72 | "pandas.DataFrame.pop#2": { 73 | "X": "readings" 74 | }, 75 | "sklearn.impute.SimpleImputer#1": { 76 | "X": "readings" 77 | }, 78 | "sklearn.preprocessing.MinMaxScaler#1": { 79 | "X": "readings" 80 | }, 81 | "pandas.DataFrame#1": { 82 | "X": "readings" 83 | }, 84 | "pandas.DataFrame.set#1": { 85 | "X": "readings", 86 | "value": "turbine_id" 87 | }, 88 | "pandas.DataFrame.set#2": { 89 | "X": "readings", 90 | "value": "timestamp" 91 | }, 92 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 93 | "timeseries": "readings" 94 | } 95 | }, 96 | "output_names": { 97 | "pandas.DataFrame.resample#1": { 98 | "X": "readings" 99 | }, 100 | "pandas.DataFrame.unstack#1": { 101 | "X": "readings" 102 | }, 103 | "pandas.DataFrame.pop#1": { 104 | "item": "turbine_id" 105 | }, 106 | "pandas.DataFrame.pop#2": { 107 | "item": "timestamp" 108 | }, 109 | "sklearn.impute.SimpleImputer#1": { 110 | "X": "readings" 111 | }, 112 | "sklearn.preprocessing.MinMaxScaler#1": { 113 | "X": "readings" 114 | }, 115 | "pandas.DataFrame#1": { 116 | "X": "readings" 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /draco/pipelines/lstm_regressor/lstm_regressor.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.LSTMTimeSeriesRegressor" 12 | ], 13 | "init_params": { 14 | "pandas.DataFrame.pop#1": { 15 | "item": "turbine_id" 16 | }, 17 | "pandas.DataFrame.pop#2": { 18 | "item": "timestamp" 19 | }, 20 | "sklearn.preprocessing.MinMaxScaler#1": { 21 | "feature_range": [ 22 | -1, 23 | 1 24 | ] 25 | }, 26 | "pandas.DataFrame#1": { 27 | "index": null, 28 | "columns": null 29 | }, 30 | "pandas.DataFrame.set#1": { 31 | "key": "turbine_id" 32 | }, 33 | "pandas.DataFrame.set#2": { 34 | "key": "timestamp" 35 | }, 36 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 37 | "window_size": 24, 38 | "cutoff_time": "cutoff_time", 39 | "time_index": "timestamp" 40 | }, 41 | "keras.Sequential.LSTMTimeSeriesRegressor#1": { 42 | "epochs": 35, 43 | "verbose": false 44 | } 45 | }, 46 | "input_names": { 47 | "pandas.DataFrame.pop#1": { 48 | "X": "readings" 49 | }, 50 
| "pandas.DataFrame.pop#2": { 51 | "X": "readings" 52 | }, 53 | "sklearn.impute.SimpleImputer#1": { 54 | "X": "readings" 55 | }, 56 | "sklearn.preprocessing.MinMaxScaler#1": { 57 | "X": "readings" 58 | }, 59 | "pandas.DataFrame#1": { 60 | "X": "readings" 61 | }, 62 | "pandas.DataFrame.set#1": { 63 | "X": "readings", 64 | "value": "turbine_id" 65 | }, 66 | "pandas.DataFrame.set#2": { 67 | "X": "readings", 68 | "value": "timestamp" 69 | }, 70 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 71 | "timeseries": "readings" 72 | } 73 | }, 74 | "output_names": { 75 | "pandas.DataFrame.pop#1": { 76 | "item": "turbine_id" 77 | }, 78 | "pandas.DataFrame.pop#2": { 79 | "item": "timestamp" 80 | }, 81 | "sklearn.impute.SimpleImputer#1": { 82 | "X": "readings" 83 | }, 84 | "sklearn.preprocessing.MinMaxScaler#1": { 85 | "X": "readings" 86 | }, 87 | "pandas.DataFrame#1": { 88 | "X": "readings" 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.LSTMTimeSeriesRegressor" 14 | ], 15 | "init_params": { 16 | "pandas.DataFrame.resample#1": { 17 | "rule": "600s", 18 | "on": "timestamp", 19 | "groupby": [ 20 | "turbine_id", 21 | "signal_id" 22 | ], 23 | "aggregation": "mean", 24 | "reset_index": false 25 | }, 26 | "pandas.DataFrame.unstack#1": { 27 | "level": "signal_id", 28 | "reset_index": true 29 | }, 30 | "pandas.DataFrame.pop#1": { 31 | "item": "turbine_id" 32 | }, 33 | "pandas.DataFrame.pop#2": { 34 | "item": "timestamp" 35 | }, 36 | "sklearn.preprocessing.MinMaxScaler#1": { 37 | "feature_range": [ 38 | -1, 39 | 1 40 | ] 41 | }, 42 | "pandas.DataFrame#1": { 43 | "index": null, 44 | "columns": null 45 | }, 46 | "pandas.DataFrame.set#1": { 47 | "key": "turbine_id" 48 | }, 49 | "pandas.DataFrame.set#2": { 50 | "key": "timestamp" 51 | }, 52 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 53 | "window_size": 24, 54 | "cutoff_time": "cutoff_time", 55 | "time_index": "timestamp" 56 | }, 57 | "keras.Sequential.LSTMTimeSeriesRegressor#1": { 58 | "epochs": 35, 59 | "verbose": true 60 | } 61 | }, 62 | "input_names": { 63 | "pandas.DataFrame.resample#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame.unstack#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.pop#1": { 70 | "X": "readings" 71 | }, 72 | "pandas.DataFrame.pop#2": { 73 | "X": "readings" 74 | }, 75 | "sklearn.impute.SimpleImputer#1": { 76 | "X": "readings" 77 | }, 78 | "sklearn.preprocessing.MinMaxScaler#1": { 79 | "X": "readings" 80 | }, 81 | "pandas.DataFrame#1": { 82 | "X": "readings" 83 | }, 84 | "pandas.DataFrame.set#1": { 85 | "X": "readings", 86 | "value": "turbine_id" 87 | }, 88 | "pandas.DataFrame.set#2": { 89 | "X": "readings", 90 | "value": "timestamp" 91 | }, 92 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 93 | "timeseries": "readings" 94 | } 95 | }, 96 | "output_names": { 97 | "pandas.DataFrame.resample#1": { 98 | "X": "readings" 99 | }, 100 | 
"pandas.DataFrame.unstack#1": { 101 | "X": "readings" 102 | }, 103 | "pandas.DataFrame.pop#1": { 104 | "item": "turbine_id" 105 | }, 106 | "pandas.DataFrame.pop#2": { 107 | "item": "timestamp" 108 | }, 109 | "sklearn.impute.SimpleImputer#1": { 110 | "X": "readings" 111 | }, 112 | "sklearn.preprocessing.MinMaxScaler#1": { 113 | "X": "readings" 114 | }, 115 | "pandas.DataFrame#1": { 116 | "X": "readings" 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /draco/primitives/mlblocks.MLPipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mlblocks.MLPipeline", 3 | "primitive": "mlblocks.MLPipeline", 4 | "fit": { 5 | "method": "fit", 6 | "args": "get_fit_args" 7 | }, 8 | "produce": { 9 | "method": "predict", 10 | "args": "get_predict_args", 11 | "output": "get_outputs" 12 | }, 13 | "hyperparameters": { 14 | "fixed": { 15 | "pipeline": { 16 | "type": "str", 17 | "default": null 18 | }, 19 | "primitives": { 20 | "type": "list", 21 | "default": [] 22 | }, 23 | "init_params": { 24 | "type": "dict", 25 | "default": {} 26 | }, 27 | "input_names": { 28 | "type": "dict", 29 | "default": {} 30 | }, 31 | "output_names": { 32 | "type": "dict", 33 | "default": {} 34 | } 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /draco/primitives/numpy.take.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "numpy.take", 3 | "contributors": [ 4 | "Plamen Valentinov Kolev " 5 | ], 6 | "documentation": "https://docs.scipy.org/doc/numpy/reference/", 7 | "description": "Take elements from an array along an axis.", 8 | "classifiers": { 9 | "type": "postprocessor" 10 | }, 11 | "modalities": [], 12 | "primitive": "numpy.take", 13 | "produce": { 14 | "args": [ 15 | { 16 | "name": "y", 17 | "keyword": "a", 18 | "type": "ndarray" 19 | } 20 | ], 21 | "output": [ 22 | { 23 | "name": "y", 24 | "type": "ndarray" 25 | } 26 | ] 27 | }, 28 | "hyperparameters": { 29 | "fixed": { 30 | "indices": { 31 | "type": "int", 32 | "default": 0 33 | }, 34 | "axis": { 35 | "type": "int", 36 | "default": null 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /draco/primitives/xgboost.XGBClassifier:probabilities.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "xgboost.XGBClassifier", 3 | "contributors": [ 4 | "Carles Sala " 5 | ], 6 | "documentation": "https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier", 7 | "description": "Implementation of the scikit-learn API for XGBoost classification.", 8 | "classifiers": { 9 | "type": "estimator", 10 | "subtype": "classifier" 11 | }, 12 | "modalities": [], 13 | "primitive": "xgboost.XGBClassifier", 14 | "fit": { 15 | "method": "fit", 16 | "args": [ 17 | { 18 | "name": "X", 19 | "type": "ndarray" 20 | }, 21 | { 22 | "name": "y", 23 | "type": "array" 24 | } 25 | ] 26 | }, 27 | "produce": { 28 | "method": "predict_proba", 29 | "args": [ 30 | { 31 | "name": "X", 32 | "keyword": "data", 33 | "type": "ndarray" 34 | } 35 | ], 36 | "output": [ 37 | { 38 | "name": "y", 39 | "type": "array" 40 | } 41 | ] 42 | }, 43 | "hyperparameters": { 44 | "fixed": { 45 | "n_jobs": { 46 | "type": "int", 47 | "default": -1 48 | } 49 | }, 50 | "tunable": { 51 | "n_estimators": { 52 | "type": "int", 53 | "default": 100, 54 | "range": [ 55 | 10, 56 | 1000 57 | 
] 58 | }, 59 | "max_depth": { 60 | "type": "int", 61 | "default": 3, 62 | "range": [ 63 | 3, 64 | 10 65 | ] 66 | }, 67 | "learning_rate": { 68 | "type": "float", 69 | "default": 0.1, 70 | "range": [ 71 | 0, 72 | 1 73 | ] 74 | }, 75 | "gamma": { 76 | "type": "float", 77 | "default": 0, 78 | "range": [ 79 | 0, 80 | 1 81 | ] 82 | }, 83 | "min_child_weight": { 84 | "type": "int", 85 | "default": 1, 86 | "range": [ 87 | 1, 88 | 10 89 | ] 90 | } 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /draco/results.py: -------------------------------------------------------------------------------- 1 | import os 2 | from random import random 3 | 4 | import pandas as pd 5 | 6 | 7 | def load_results(files): 8 | problems_results = dict() 9 | for filename in files: 10 | problem = os.path.basename(filename).replace('.csv', '') 11 | problems_results[problem] = pd.read_csv(filename).round(6) 12 | 13 | return problems_results 14 | 15 | 16 | def get_wins_by_problems(results): 17 | df = results.groupby('problem_name')['template', 'window_size', 'resample_rule', 'fpr_threshold=0.5'] 18 | df = df.apply(max) 19 | df = df.rename(columns={'fpr_threshold=0.5': 'score'}) 20 | 21 | return df 22 | 23 | 24 | def get_exclusive_wins(scores, column, pivot_columns=['window_size', 'resample_rule']): 25 | summary = {} 26 | for problem in scores.problem_name.unique(): 27 | df = scores[scores['problem_name'] == problem] 28 | df['wr'] = df.apply( 29 | lambda row: '{}_{}_{}'.format(row[pivot_columns[0]], row[pivot_columns[1]], random()), axis=1) 30 | df = df.pivot(index='wr', columns=column, values='fpr_threshold=0.5') 31 | 32 | is_winner = df.T.rank(method='min', ascending=False) == 1 33 | num_winners = is_winner.sum() 34 | is_exclusive = num_winners == 1 35 | is_exclusive_winner = is_winner & is_exclusive 36 | summary[problem] = is_exclusive_winner.sum(axis=1) 37 | 38 | summary_df = pd.DataFrame(summary) 39 | summary_df.index.name = 'template' 40 | columns = summary_df.columns.sort_values(ascending=False) 41 | return summary_df[columns] 42 | 43 | 44 | def add_sheet(dfs, name, writer, cell_fmt, index_fmt, header_fmt): 45 | startrow = 0 46 | widths = [0] 47 | if not isinstance(dfs, dict): 48 | dfs = {None: dfs} 49 | 50 | for df_name, df in dfs.items(): 51 | df = df.reset_index() 52 | startrow += bool(df_name) 53 | df.to_excel(writer, sheet_name=name, startrow=startrow + 1, index=False, header=False) 54 | 55 | worksheet = writer.sheets[name] 56 | 57 | if df_name: 58 | worksheet.write(startrow - 1, 0, df_name, index_fmt) 59 | widths[0] = max(widths[0], len(df_name)) 60 | 61 | for idx, column in enumerate(df.columns): 62 | worksheet.write(startrow, idx, column, header_fmt) 63 | width = max(len(column), *df[column].astype(str).str.len()) + 1 64 | if len(widths) > idx: 65 | widths[idx] = max(widths[idx], width) 66 | else: 67 | widths.append(width) 68 | 69 | startrow += len(df) + 2 70 | 71 | for idx, width in enumerate(widths): 72 | fmt = cell_fmt if idx else index_fmt 73 | worksheet.set_column(idx, idx, width + 1, fmt) 74 | 75 | 76 | def write_results(results, output): 77 | writer = pd.ExcelWriter(output, engine='xlsxwriter') 78 | cell_fmt = writer.book.add_format({ 79 | "font_name": "Arial", 80 | "font_size": "10" 81 | }) 82 | index_fmt = writer.book.add_format({ 83 | "font_name": "Arial", 84 | "font_size": "10", 85 | "bold": True, 86 | }) 87 | header_fmt = writer.book.add_format({ 88 | "font_name": "Arial", 89 | "font_size": "10", 90 | "bold": True, 91 | "bottom": 1 92 | }) 93 
| 94 | if isinstance(results, dict): 95 | results = pd.concat(list(results.values()), ignore_index=True) 96 | 97 | window = get_exclusive_wins(results, 'window_size', ['window_size', 'fpr_threshold=0.5']) 98 | 99 | resample_pivots = ['resample_rule', ['problem_name', 'fpr_threshold=0.5']] 100 | resample = get_exclusive_wins(results, 'resample_rule', resample_pivots) 101 | 102 | summary = { 103 | 'Best pipeline by Problem': get_wins_by_problems(results), 104 | 'Rankings - Number of wins': get_exclusive_wins(results, 'template'), 105 | 'Resample Rule': resample, 106 | 'Window Size': window 107 | } 108 | add_sheet(summary, 'Summary', writer, cell_fmt, index_fmt, header_fmt) 109 | 110 | for problem in results['problem_name'].unique(): 111 | add_sheet( 112 | results[results['problem_name'] == problem], 113 | problem, 114 | writer, 115 | cell_fmt, 116 | index_fmt, 117 | header_fmt 118 | ) 119 | 120 | writer.save() 121 | -------------------------------------------------------------------------------- /draco/targets.py: -------------------------------------------------------------------------------- 1 | """Targets module. 2 | 3 | This module contains functions to work with target_times. 4 | """ 5 | 6 | import logging 7 | import warnings 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from tqdm.auto import trange 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | def make_targets(target_times, window_size, target, new_targets=None): 17 | target_times = target_times.sort_values('cutoff_time', ascending=True) 18 | cutoff_times = target_times.cutoff_time 19 | window_size = pd.to_timedelta(window_size) 20 | original_size = len(target_times) 21 | current_size = original_size 22 | new_targets = new_targets or current_size 23 | 24 | for index in trange(len(cutoff_times) - 1): 25 | timestamp = cutoff_times.iloc[index] 26 | next_time = cutoff_times.iloc[index + 1] 27 | 28 | if timestamp + (window_size * 2) >= next_time: 29 | continue 30 | 31 | span_start = timestamp + window_size 32 | span_end = next_time - window_size 33 | span_length = (span_end - span_start).total_seconds() 34 | 35 | delay = pd.to_timedelta(np.random.randint(span_length), unit='s') 36 | cutoff_time = span_start + delay 37 | 38 | target_times = target_times.append(pd.Series({ 39 | 'turbine_id': target_times.iloc[index].turbine_id, 40 | 'cutoff_time': cutoff_time, 41 | 'target': target 42 | }), ignore_index=True) 43 | 44 | current_size = len(target_times) 45 | if current_size == original_size + new_targets: 46 | return target_times.sort_values('cutoff_time', ascending=True) 47 | 48 | if current_size == original_size: 49 | warnings.warn('There is no space left between cutoff times to add more targets.') 50 | return target_times 51 | 52 | new_targets = new_targets - (current_size - original_size) 53 | return make_targets(target_times, window_size, target, new_targets) 54 | 55 | 56 | def _to_timedelta(specification): 57 | if isinstance(specification, int): 58 | specification = '{}s'.format(specification) 59 | 60 | return pd.to_timedelta(specification) 61 | 62 | 63 | def make_target_times(failure_dates, step, start=None, end=None, forecast_window=0, 64 | prediction_window=0, before=0, after=0, offset=0, max_true=None, 65 | max_false=None, shuffle=True): 66 | 67 | step = _to_timedelta(step) 68 | start = start or failure_dates.timestamp.min() 69 | start = start or failure_dates.min() 70 | 71 | forecast_window = _to_timedelta(forecast_window) 72 | prediction_window = _to_timedelta(prediction_window) 73 | before = _to_timedelta(before) 74 | 
after = _to_timedelta(after) 75 | offset = _to_timedelta(offset) 76 | 77 | target_times = pd.DataFrame() 78 | turbines = failure_dates.turbine_id.unique() 79 | failures = failure_dates.set_index(['turbine_id', 'date']) 80 | 81 | for turbine in turbines: 82 | turbine_failures = failures.loc[turbine] 83 | 84 | min_failure_date = turbine_failures.index.min() - before 85 | last_failure_date = turbine_failures.index.max() + after 86 | turbine_targets = list() 87 | while min_failure_date < last_failure_date: 88 | max_failure_date = min_failure_date + prediction_window 89 | day_failures = turbine_failures.loc[min_failure_date:max_failure_date] 90 | 91 | min_failure_date = min_failure_date + offset 92 | 93 | turbine_targets.append({ 94 | 'turbine_id': turbine, 95 | 'target': int(bool(len(day_failures))), 96 | 'cutoff_time': min_failure_date - forecast_window 97 | }) 98 | 99 | turbine_targets = pd.DataFrame(turbine_targets) 100 | failed = turbine_targets[turbine_targets.target == 1] 101 | target_times = target_times.append(failed) 102 | 103 | non_failed = turbine_targets[turbine_targets.target == 0] 104 | non_failed = non_failed.sample(min(max_false, len(non_failed))) 105 | 106 | target_times = target_times.append(non_failed) 107 | 108 | if shuffle: 109 | target_times = target_times.sample(len(target_times)) 110 | 111 | return target_times 112 | 113 | 114 | def _valid_targets(timestamps): 115 | def apply_function(row): 116 | cutoff = row.cutoff_time 117 | try: 118 | times = timestamps.loc[row.turbine_id] 119 | except KeyError: 120 | return False 121 | 122 | return times['min'] <= cutoff <= times['max'] 123 | 124 | return apply_function 125 | 126 | 127 | def select_valid_targets(target_times, readings, window_size, rule=None): 128 | """Filter out target_times without enough data for this window_size. 129 | 130 | The target_times table is scanned and checked against the readings table 131 | considering the window_size. All the target times entries that do not 132 | have enough data are dropped. 133 | 134 | Args: 135 | target_times (pandas.DataFrame): 136 | Target times table, with at least turbine_id and cutoff_time fields. 137 | readings (pandas.DataFrame): 138 | Readings table, with at least turbine_id, signal_id, and timestamp fields. 139 | window_size (str or pandas.TimeDelta): 140 | TimeDelta specification that indicates the length of the training window. 141 | rule (str or pandas.TimeDelta): 142 | Resampling rule specification. If given, add that to the max timestamp 143 | to ensure the period is completely covered. 144 | 145 | Returns: 146 | pandas.DataFrame: 147 | New target_times table without the invalid targets. 148 | """ 149 | 150 | timestamps = readings.groupby('turbine_id').timestamp.agg(['min', 'max']) 151 | timestamps['min'] += pd.to_timedelta(window_size) 152 | 153 | if rule is not None: 154 | timestamps['max'] += pd.to_timedelta(rule) 155 | 156 | valid = target_times.apply(_valid_targets(timestamps), axis=1) 157 | valid_targets = target_times[valid] 158 | 159 | length = len(valid_targets) 160 | LOGGER.info('Dropped %s targets without enough data. 
Final target_times size: %s', 161 | len(target_times) - length, length) 162 | 163 | return valid_targets 164 | 165 | 166 | def drop_duplicates(target_times): 167 | length = len(target_times) 168 | filtered = target_times.drop_duplicates() 169 | new_length = len(filtered) 170 | if length != new_length: 171 | LOGGER.warn('Dropped %s duplicate targets!', length - new_length) 172 | 173 | filtered = filtered.drop_duplicates(subset=['turbine_id', 'cutoff_time'], keep=False) 174 | final_length = len(filtered) 175 | if new_length != final_length: 176 | LOGGER.warn('Dropped %s incoherent targets!', new_length - final_length) 177 | 178 | return filtered.copy() 179 | -------------------------------------------------------------------------------- /draco/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from mlblocks import MLPipeline 6 | 7 | 8 | def clone_pipeline(pipeline): 9 | return MLPipeline.from_dict(pipeline.to_dict()) 10 | 11 | 12 | def walk(document, transform): 13 | if not isinstance(document, dict): 14 | return document 15 | 16 | new_doc = dict() 17 | for key, value in document.items(): 18 | if isinstance(value, dict): 19 | value = walk(value, transform) 20 | elif isinstance(value, list): 21 | value = [walk(v, transform) for v in value] 22 | 23 | new_key, new_value = transform(key, value) 24 | new_doc[new_key] = new_value 25 | 26 | return new_doc 27 | 28 | 29 | def remove_dots(document): 30 | return walk(document, lambda key, value: (key.replace('.', '-'), value)) 31 | 32 | 33 | def restore_dots(document): 34 | return walk(document, lambda key, value: (key.replace('-', '.'), value)) 35 | 36 | 37 | def logging_setup(verbosity=1, logfile=None, logger_name=None): 38 | logger = logging.getLogger(logger_name) 39 | log_level = (3 - verbosity) * 10 40 | fmt = '%(asctime)s - %(process)d - %(levelname)s - %(module)s - %(message)s' 41 | formatter = logging.Formatter(fmt) 42 | logger.setLevel(log_level) 43 | logger.propagate = False 44 | 45 | if logfile: 46 | file_handler = logging.FileHandler(logfile) 47 | file_handler.setLevel(logging.DEBUG) 48 | file_handler.setFormatter(formatter) 49 | logger.addHandler(file_handler) 50 | 51 | else: 52 | console_handler = logging.StreamHandler() 53 | console_handler.setLevel(log_level) 54 | console_handler.setFormatter(formatter) 55 | logger.addHandler(console_handler) 56 | 57 | 58 | def as_list(param): 59 | """Make sure that param is a ``list``.""" 60 | if isinstance(param, (list, tuple)): 61 | return param 62 | 63 | return [param] 64 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.3.1.dev0 3 | commit = True 4 | tag = True 5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? 
6 | serialize = 7 | {major}.{minor}.{patch}.{release}{candidate} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = release 12 | first_value = dev 13 | values = 14 | dev 15 | release 16 | 17 | [bumpversion:part:candidate] 18 | 19 | [bumpversion:file:setup.py] 20 | search = version='{current_version}' 21 | replace = version='{new_version}' 22 | 23 | [bumpversion:file:draco/__init__.py] 24 | search = __version__ = '{current_version}' 25 | replace = __version__ = '{new_version}' 26 | 27 | [bdist_wheel] 28 | universal = 1 29 | 30 | [flake8] 31 | max-line-length = 99 32 | exclude = docs, .tox, .git, __pycache__, .ipynb_checkpoints 33 | ignore = # keep empty to prevent default ignores 34 | 35 | [isort] 36 | include_trailing_comment = True 37 | line_length = 99 38 | lines_between_types = 0 39 | multi_line_output = 4 40 | not_skip = __init__.py 41 | use_parentheses = True 42 | 43 | [aliases] 44 | test = pytest 45 | 46 | [tool:pytest] 47 | collect_ignore = ['setup.py'] 48 | 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | 6 | try: 7 | with open('README.md', encoding='utf-8') as readme_file: 8 | readme = readme_file.read() 9 | except IOError: 10 | readme = '' 11 | 12 | try: 13 | with open('HISTORY.md', encoding='utf-8') as history_file: 14 | history = history_file.read() 15 | except IOError: 16 | history = '' 17 | 18 | install_requires = [ 19 | 'baytune>=0.4.0,<0.5', 20 | 'ml-stars>=0.1.0', 21 | 'mlblocks>=0.4.0,<0.5', 22 | 'pymongo>=3.7.2,<4', 23 | 'scikit-learn>=0.21,<1.2', 24 | 'tqdm<4.50.0,>=4.36.1', 25 | 'scipy>=1.0.1,<2', 26 | 'numpy>=1.16.0,<1.19', 27 | 'pandas>=1,<2', 28 | 'tensorflow>=2,<2.3', 29 | 'partd>=1.1.0,<2', 30 | 'fsspec>=0.8.5,<0.9', 31 | 'dask>=2.6.0,<3', 32 | 'tabulate>=0.8.3,<0.9', 33 | 'xlsxwriter>=1.3.6,<1.4', 34 | # fix conflicts 35 | 'protobuf<4', 36 | 'importlib-metadata<5', 37 | ] 38 | 39 | setup_requires = [ 40 | 'pytest-runner>=2.11.1', 41 | ] 42 | 43 | tests_require = [ 44 | 'pytest>=3.4.2', 45 | 'pytest-cov>=2.6.0', 46 | 'jupyter>=1.0.0,<2', 47 | 'rundoc>=0.4.3,<0.5', 48 | ] 49 | 50 | development_requires = [ 51 | # general 52 | 'bumpversion>=0.5.3,<0.6', 53 | 'pip>=9.0.1', 54 | 'watchdog>=0.8.3,<0.11', 55 | 56 | # docs 57 | 'm2r>=0.2.0,<0.3', 58 | 'nbsphinx>=0.5.0,<0.7', 59 | 'Sphinx>=1.7.1,<3', 60 | 'sphinx_rtd_theme>=0.2.4,<0.5', 61 | 'docutils>=0.14,<0.18', 62 | 'autodocsumm>=0.1.10', 63 | 'markupsafe<2.1.0', 64 | 'Jinja2>=2,<3', 65 | 66 | # style check 67 | 'flake8>=3.7.7,<4', 68 | 'isort>=4.3.4,<5', 69 | 70 | # fix style issues 71 | 'autoflake>=1.1,<2', 72 | 'autopep8>=1.4.3,<2', 73 | 'importlib-metadata<5', 74 | 75 | # distribute on PyPI 76 | 'twine>=1.10.0,<4', 77 | 'wheel>=0.30.0', 78 | 79 | # Advanced testing 80 | 'coverage>=4.5.1,<6', 81 | 'tox>=2.9.1,<4', 82 | ] 83 | 84 | setup( 85 | author='MIT Data To AI Lab', 86 | author_email='dailabmit@gmail.com', 87 | classifiers=[ 88 | 'Development Status :: 2 - Pre-Alpha', 89 | 'Intended Audience :: Developers', 90 | 'License :: OSI Approved :: MIT License', 91 | 'Natural Language :: English', 92 | 'Programming Language :: Python :: 3', 93 | 'Programming Language :: Python :: 3.6', 94 | 'Programming Language :: Python :: 3.7', 95 | 'Programming Language :: Python :: 3.8', 96 | ], 97 | description='AutoML for Time Series.', 98 | entry_points={ 99 
| 'mlblocks': [ 100 | 'pipelines=draco:MLBLOCKS_PIPELINES', 101 | 'primitives=draco:MLBLOCKS_PRIMITIVES' 102 | ], 103 | }, 104 | extras_require={ 105 | 'test': tests_require, 106 | 'dev': development_requires + tests_require, 107 | }, 108 | include_package_data=True, 109 | install_requires=install_requires, 110 | keywords='wind machine learning draco', 111 | license='MIT license', 112 | long_description=readme + '\n\n' + history, 113 | long_description_content_type='text/markdown', 114 | name='draco-ml', 115 | packages=find_packages(include=['draco', 'draco.*']), 116 | python_requires='>=3.6,<3.9', 117 | setup_requires=setup_requires, 118 | test_suite='tests', 119 | tests_require=tests_require, 120 | url='https://github.com/sintel-dev/Draco', 121 | version='0.3.1.dev0', 122 | zip_safe=False, 123 | ) 124 | -------------------------------------------------------------------------------- /tests/test_benchmark.py: -------------------------------------------------------------------------------- 1 | """Tests for `draco.benchmark` module.""" 2 | import numpy as np 3 | 4 | from draco.benchmark import evaluate_templates 5 | from draco.demo import load_demo 6 | 7 | 8 | def test_predict(): 9 | # setup 10 | templates = [ 11 | 'lstm_with_unstack' 12 | ] 13 | 14 | window_size_rule = [ 15 | ('1d', '1h') 16 | ] 17 | 18 | target_times, readings = load_demo() 19 | target_times = target_times.head(40) 20 | readings = readings.head(100) 21 | 22 | # run 23 | scores_df = evaluate_templates( 24 | target_times=target_times, 25 | readings=readings, 26 | templates=templates, 27 | window_size_rule=window_size_rule, 28 | tuning_iterations=1, 29 | cv_splits=2 30 | ) 31 | 32 | # assert 33 | expected_columns = [ 34 | 'problem_name', 35 | 'window_size', 36 | 'resample_rule', 37 | 'template', 38 | 'default_test', 39 | 'default_cv', 40 | 'tuned_cv', 41 | 'tuned_test', 42 | 'tuning_metric', 43 | 'tuning_metric_kwargs', 44 | 'fit_predict_time', 45 | 'default_cv_time', 46 | 'average_cv_time', 47 | 'total_time', 48 | 'status', 49 | 'accuracy_threshold/0.5', 50 | 'f1_threshold/0.5', 51 | 'fpr_threshold/0.5', 52 | 'tpr_threshold/0.5', 53 | ] 54 | 55 | expected_dtypes = [ 56 | np.dtype('O'), 57 | np.dtype('O'), 58 | np.dtype('O'), 59 | np.dtype('O'), 60 | np.dtype('float64'), 61 | np.dtype('float64'), 62 | np.dtype('float64'), 63 | np.dtype('float64'), 64 | np.dtype('O'), 65 | np.dtype('O'), 66 | np.dtype('\n", 184 | "\n", 197 | "\n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " 
\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
problem_namewindow_sizeresample_ruletemplatedefault_testdefault_cvtuned_cvtuned_testtuning_metrictuning_metric_kwargsfit_predict_timedefault_cv_timeaverage_cv_timetotal_timestatusaccuracy_threshold/0.5f1_threshold/0.5fpr_threshold/0.5tpr_threshold/0.5
0None1d1hlstm_prob_with_unstack0.4945050.5899050.5899050.322650roc_auc_score{'threshold': 0.5}0 days 00:00:03.8731570 days 00:00:14.3695360 days 00:00:08.1784220 days 00:00:47.144655OK0.2808990.2558141.00.0
1None2d2hlstm_prob_with_unstack0.4465810.5430560.5615700.707875roc_auc_score{'threshold': 0.5}0 days 00:00:03.4604670 days 00:00:12.1219050 days 00:00:08.2759190 days 00:00:44.449291OK0.7303370.5862071.00.0
2None1d1hdouble_lstm_prob_with_unstack0.8131870.3079930.5926960.417582roc_auc_score{'threshold': 0.5}0 days 00:00:05.4609850 days 00:00:18.1036600 days 00:00:14.0118770 days 00:01:11.192546OK0.3033710.3673471.00.0
3None2d2hdouble_lstm_prob_with_unstack0.2457260.6639190.6639190.293346roc_auc_score{'threshold': 0.5}0 days 00:00:05.5688350 days 00:00:17.9483610 days 00:00:14.0038160 days 00:01:11.051792OK0.3033710.1842111.00.0
\n", 313 | "" 314 | ], 315 | "text/plain": [ 316 | " problem_name window_size resample_rule template \\\n", 317 | "0 None 1d 1h lstm_prob_with_unstack \n", 318 | "1 None 2d 2h lstm_prob_with_unstack \n", 319 | "2 None 1d 1h double_lstm_prob_with_unstack \n", 320 | "3 None 2d 2h double_lstm_prob_with_unstack \n", 321 | "\n", 322 | " default_test default_cv tuned_cv tuned_test tuning_metric \\\n", 323 | "0 0.494505 0.589905 0.589905 0.322650 roc_auc_score \n", 324 | "1 0.446581 0.543056 0.561570 0.707875 roc_auc_score \n", 325 | "2 0.813187 0.307993 0.592696 0.417582 roc_auc_score \n", 326 | "3 0.245726 0.663919 0.663919 0.293346 roc_auc_score \n", 327 | "\n", 328 | " tuning_metric_kwargs fit_predict_time default_cv_time \\\n", 329 | "0 {'threshold': 0.5} 0 days 00:00:03.873157 0 days 00:00:14.369536 \n", 330 | "1 {'threshold': 0.5} 0 days 00:00:03.460467 0 days 00:00:12.121905 \n", 331 | "2 {'threshold': 0.5} 0 days 00:00:05.460985 0 days 00:00:18.103660 \n", 332 | "3 {'threshold': 0.5} 0 days 00:00:05.568835 0 days 00:00:17.948361 \n", 333 | "\n", 334 | " average_cv_time total_time status \\\n", 335 | "0 0 days 00:00:08.178422 0 days 00:00:47.144655 OK \n", 336 | "1 0 days 00:00:08.275919 0 days 00:00:44.449291 OK \n", 337 | "2 0 days 00:00:14.011877 0 days 00:01:11.192546 OK \n", 338 | "3 0 days 00:00:14.003816 0 days 00:01:11.051792 OK \n", 339 | "\n", 340 | " accuracy_threshold/0.5 f1_threshold/0.5 fpr_threshold/0.5 \\\n", 341 | "0 0.280899 0.255814 1.0 \n", 342 | "1 0.730337 0.586207 1.0 \n", 343 | "2 0.303371 0.367347 1.0 \n", 344 | "3 0.303371 0.184211 1.0 \n", 345 | "\n", 346 | " tpr_threshold/0.5 \n", 347 | "0 0.0 \n", 348 | "1 0.0 \n", 349 | "2 0.0 \n", 350 | "3 0.0 " 351 | ] 352 | }, 353 | "execution_count": 4, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "results" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3 (ipykernel)", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | "file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.8.16" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /tutorials/04_Draco_Regression_Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Draco Regression Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this tutorial we will show you how to use Draco Regression pipelines to solve a Machine Learning problem\n", 15 | "defined via a Target Times table.\n", 16 | "\n", 17 | "During the next steps we will:\n", 18 | "\n", 19 | "- Load demo Remaining Useful Life (dataset) with training and testing target times and readings\n", 20 | "- Find available pipelines and load one of them\n", 21 | "- Build and fit a Machine Learning pipeline\n", 22 | "- Make predictions using the fitted pipeline\n", 23 | "- Evaluate how good the predictions are" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 0. 
Setup the logging\n", 31 | "\n", 32 | "This step sets up logging in our environment to increase our visibility over\n", 33 | "the steps that Draco performs." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import logging;\n", 43 | "\n", 44 | "logging.basicConfig(level=logging.INFO)\n", 45 | "logging.getLogger().setLevel(level=logging.INFO)\n", 46 | "\n", 47 | "import warnings\n", 48 | "warnings.simplefilter(\"ignore\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 1. Load the Data\n", 56 | "\n", 57 | "The first step is to load the data that we are going to use.\n", 58 | "\n", 59 | "In order to use the demo data included in Draco, the `draco.demo.load_demo` function can be used." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from draco.demo import load_demo\n", 69 | "\n", 70 | "train_target_times, test_target_times, readings = load_demo(name='rul')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This will download some demo data from [Draco S3 demo Bucket](\n", 78 | "https://d3-ai-draco.s3.amazonaws.com/index.html) and load it as\n", 79 | "the necessary `target_times` and `readings` tables.\n", 80 | "\n", 81 | "The exact format of these tables is described in the Draco README and docs:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
turbine_idcutoff_timetarget
012013-01-12 04:20:00166
112013-01-12 04:30:00165
212013-01-12 04:40:00164
312013-01-12 04:50:00163
412013-01-12 05:00:00162
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " turbine_id cutoff_time target\n", 152 | "0 1 2013-01-12 04:20:00 166\n", 153 | "1 1 2013-01-12 04:30:00 165\n", 154 | "2 1 2013-01-12 04:40:00 164\n", 155 | "3 1 2013-01-12 04:50:00 163\n", 156 | "4 1 2013-01-12 05:00:00 162" 157 | ] 158 | }, 159 | "execution_count": 3, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "train_target_times.head()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "(18131, 3)" 177 | ] 178 | }, 179 | "execution_count": 4, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "train_target_times.shape" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 5, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "turbine_id int64\n", 197 | "cutoff_time datetime64[ns]\n", 198 | "target int64\n", 199 | "dtype: object" 200 | ] 201 | }, 202 | "execution_count": 5, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "train_target_times.dtypes" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/html": [ 219 | "
\n", 220 | "\n", 233 | "\n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | "
turbine_idcutoff_timetarget
012013-01-13 13:10:00112.0
122013-01-14 08:00:0098.0
232013-01-14 02:50:0069.0
342013-01-14 01:10:0082.0
452013-01-14 13:10:0091.0
\n", 275 | "
" 276 | ], 277 | "text/plain": [ 278 | " turbine_id cutoff_time target\n", 279 | "0 1 2013-01-13 13:10:00 112.0\n", 280 | "1 2 2013-01-14 08:00:00 98.0\n", 281 | "2 3 2013-01-14 02:50:00 69.0\n", 282 | "3 4 2013-01-14 01:10:00 82.0\n", 283 | "4 5 2013-01-14 13:10:00 91.0" 284 | ] 285 | }, 286 | "execution_count": 6, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "test_target_times.head()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 7, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "(100, 3)" 304 | ] 305 | }, 306 | "execution_count": 7, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "test_target_times.shape" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 8, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "turbine_id int64\n", 324 | "cutoff_time datetime64[ns]\n", 325 | "target float64\n", 326 | "dtype: object" 327 | ] 328 | }, 329 | "execution_count": 8, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "test_target_times.dtypes" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 9, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/html": [ 346 | "
\n", 347 | "\n", 360 | "\n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | "
turbine_idtimestampsignal_idvalue
012013-01-12 00:10:00operational setting 1-0.0007
112013-01-12 00:20:00operational setting 10.0019
212013-01-12 00:30:00operational setting 1-0.0043
312013-01-12 00:40:00operational setting 10.0007
412013-01-12 00:50:00operational setting 1-0.0019
\n", 408 | "
" 409 | ], 410 | "text/plain": [ 411 | " turbine_id timestamp signal_id value\n", 412 | "0 1 2013-01-12 00:10:00 operational setting 1 -0.0007\n", 413 | "1 1 2013-01-12 00:20:00 operational setting 1 0.0019\n", 414 | "2 1 2013-01-12 00:30:00 operational setting 1 -0.0043\n", 415 | "3 1 2013-01-12 00:40:00 operational setting 1 0.0007\n", 416 | "4 1 2013-01-12 00:50:00 operational setting 1 -0.0019" 417 | ] 418 | }, 419 | "execution_count": 9, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "readings.head()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 10, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "(809448, 4)" 437 | ] 438 | }, 439 | "execution_count": 10, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "readings.shape" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 11, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "turbine_id int64\n", 457 | "timestamp datetime64[ns]\n", 458 | "signal_id object\n", 459 | "value float64\n", 460 | "dtype: object" 461 | ] 462 | }, 463 | "execution_count": 11, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "readings.dtypes" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "### Load your own Dataset" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "Alternatively, if you want to load your own dataset, all you have to do is load the\n", 484 | "`target_times` and `readings` tables as `pandas.DataFrame` objects.\n", 485 | "\n", 486 | "Make sure to parse the corresponding datetime fields!\n", 487 | "\n", 488 | "```python\n", 489 | "import pandas as pd\n", 490 | "\n", 491 | "target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n", 492 | "readings = pd.read_csv('path/to/your/readings.csv', parse_dates=['timestamp'])\n", 493 | "```" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "## 2. Finding the available Pipelines\n", 501 | "\n", 502 | "The next step will be to select a collection of templates from the ones\n", 503 | "available in Draco.\n", 504 | "\n", 505 | "For this, we can use the `draco.get_pipelines` function, which will\n", 506 | "return us the list of all the available MLBlocks pipelines found in the\n", 507 | "Draco system." 
508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 12, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "['dummy',\n", 519 | " 'lstm_regressor_with_unstack',\n", 520 | " 'lstm_regressor',\n", 521 | " 'double_lstm_prob_with_unstack',\n", 522 | " 'double_lstm_prob',\n", 523 | " 'double_lstm',\n", 524 | " 'double_lstm_with_unstack',\n", 525 | " 'lstm_prob_with_unstack',\n", 526 | " 'lstm_with_unstack',\n", 527 | " 'lstm_prob',\n", 528 | " 'lstm']" 529 | ] 530 | }, 531 | "execution_count": 12, 532 | "metadata": {}, 533 | "output_type": "execute_result" 534 | } 535 | ], 536 | "source": [ 537 | "from draco import get_pipelines\n", 538 | "\n", 539 | "get_pipelines()" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "Optionally, we can pass a string to select the pipelines that contain it:" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 13, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "data": { 556 | "text/plain": [ 557 | "['lstm_regressor_with_unstack', 'lstm_regressor']" 558 | ] 559 | }, 560 | "execution_count": 13, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "get_pipelines('regressor')" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "We will use the regression pipeline `lstm_regressor_with_unstack`\n", 574 | "\n", 575 | "The `lstm_regressor_with_unstack` pipeline contains the following steps:\n", 576 | "\n", 577 | "- Resample the data using a 10 minute average aggregation\n", 578 | "- Unstack the data by signal, so each signal is in a different column\n", 579 | "- Impute missing values in the readings table\n", 580 | "- Normalize (scale) the data between [-1, 1].\n", 581 | "- Create window sequences using target times.\n", 582 | "- Apply an LSTM Regressor" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 14, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "pipeline_name = 'lstm_regressor_with_unstack'" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "## 3. Fitting a Draco Pipeline\n", 599 | "\n", 600 | "Once we have loaded the data, we create a **DracoPipeline** instance by passing `pipeline_name` which is the name of a pipeline, the path to a template json file, or a list that can combine both of them." 
601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 15, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "from draco.pipeline import DracoPipeline\n", 610 | "\n", 611 | "init_params = {\n", 612 | " \"keras.Sequential.LSTMTimeSeriesRegressor#1\": {\n", 613 | " \"epochs\": 10\n", 614 | " }\n", 615 | "}\n", 616 | "\n", 617 | "pipeline = DracoPipeline(pipeline_name, init_params=init_params)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "To train a pipeline we use the `fit` method passing the `target_times` and the `readings` table:" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 16, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "name": "stderr", 634 | "output_type": "stream", 635 | "text": [ 636 | "2023-04-07 16:46:35.571262: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", 637 | "2023-04-07 16:46:35.594871: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7ff23c392800 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", 638 | "2023-04-07 16:46:35.594885: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" 639 | ] 640 | }, 641 | { 642 | "name": "stdout", 643 | "output_type": "stream", 644 | "text": [ 645 | "Epoch 1/10\n", 646 | "227/227 [==============================] - 6s 28ms/step - loss: 9064.8613 - mse: 9064.8613 - val_loss: 11566.7559 - val_mse: 11566.7559\n", 647 | "Epoch 2/10\n", 648 | "227/227 [==============================] - 6s 27ms/step - loss: 6775.8911 - mse: 6775.8911 - val_loss: 9392.9561 - val_mse: 9392.9561\n", 649 | "Epoch 3/10\n", 650 | "227/227 [==============================] - 6s 27ms/step - loss: 5391.6719 - mse: 5391.6719 - val_loss: 7923.1221 - val_mse: 7923.1221\n", 651 | "Epoch 4/10\n", 652 | "227/227 [==============================] - 6s 28ms/step - loss: 4524.3457 - mse: 4524.3457 - val_loss: 6955.8647 - val_mse: 6955.8647\n", 653 | "Epoch 5/10\n", 654 | "227/227 [==============================] - 7s 33ms/step - loss: 4040.5396 - mse: 4040.5396 - val_loss: 6356.0605 - val_mse: 6356.0605\n", 655 | "Epoch 6/10\n", 656 | "227/227 [==============================] - 6s 28ms/step - loss: 3802.5298 - mse: 3802.5298 - val_loss: 5998.2061 - val_mse: 5998.2061\n", 657 | "Epoch 7/10\n", 658 | "227/227 [==============================] - 7s 30ms/step - loss: 3683.9429 - mse: 3683.9429 - val_loss: 5790.9092 - val_mse: 5790.9092\n", 659 | "Epoch 8/10\n", 660 | "227/227 [==============================] - 7s 33ms/step - loss: 3636.9177 - mse: 3636.9177 - val_loss: 5674.6558 - val_mse: 5674.6558\n", 661 | "Epoch 9/10\n", 662 | "227/227 [==============================] - 7s 30ms/step - loss: 3609.4973 - mse: 3609.4973 - val_loss: 5619.3926 - val_mse: 5619.3926\n", 663 | "Epoch 10/10\n", 664 | "227/227 [==============================] - 7s 29ms/step - loss: 3617.7119 - mse: 3617.7119 - val_loss: 5587.2671 - val_mse: 5587.2671\n" 665 | ] 666 | } 667 | ], 668 | "source": [ 669 | "pipeline.fit(train_target_times, readings)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": {}, 675 | "source": [ 676 | "## 4. 
Use the fitted pipeline\n", 677 | "\n", 678 | "After fitting the pipeline, we are ready to make predictions on new data:" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 17, 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "name": "stdout", 688 | "output_type": "stream", 689 | "text": [ 690 | "2/2 [==============================] - 0s 3ms/step\n" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "predictions = pipeline.predict(test_target_times, readings)" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": {}, 701 | "source": [ 702 | "And evaluate its prediction performance:" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 18, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "data": { 712 | "text/plain": [ 713 | "-0.1533211964451806" 714 | ] 715 | }, 716 | "execution_count": 18, 717 | "metadata": {}, 718 | "output_type": "execute_result" 719 | } 720 | ], 721 | "source": [ 722 | "from sklearn.metrics import r2_score\n", 723 | "\n", 724 | "r2_score(test_target_times['target'], predictions)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "## 5. Save and load the pipeline\n", 732 | "\n", 733 | "Since the tuning and fitting process takes time to execute and requires a lot of data, you\n", 734 | "will probably want to save a fitted instance and load it later to analyze new signals\n", 735 | "instead of fitting pipelines over and over again.\n", 736 | "\n", 737 | "This can be done by using the `save` and `load` methods from the `DracoPipeline`.\n", 738 | "\n", 739 | "In order to save an instance, call its `save` method passing it the path and filename\n", 740 | "where the model should be saved." 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 19, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "path = 'my_pipeline.pkl'\n", 750 | "\n", 751 | "pipeline.save(path)" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "Once the pipeline is saved, it can be loaded back as a new `DracoPipeline` by using the\n", 759 | "`DracoPipeline.load` method:" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": 20, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [ 768 | "new_pipeline = DracoPipeline.load(path)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": {}, 774 | "source": [ 775 | "Once loaded, it can be directly used to make predictions on new data." 
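The reloaded pipeline produces the same kind of predictions as the original instance, so it can be scored with any standard regression metric, not only the R² used in section 4. A minimal illustrative sketch using scikit-learn only (it reuses `test_target_times` and a `predictions` array as produced by either pipeline; no Draco-specific API is involved):

```python
# Illustrative only: additional regression metrics for the predictions,
# computed against the same test targets as the r2_score cell above.
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(test_target_times['target'], predictions)
rmse = np.sqrt(mean_squared_error(test_target_times['target'], predictions))
print(f'MAE: {mae:.2f}  RMSE: {rmse:.2f}')
```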
776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 21, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "2/2 [==============================] - 0s 5ms/step\n" 788 | ] 789 | }, 790 | { 791 | "data": { 792 | "text/plain": [ 793 | "array([[91.7917 ],\n", 794 | " [91.791695],\n", 795 | " [91.79166 ],\n", 796 | " [91.79167 ],\n", 797 | " [91.79167 ]], dtype=float32)" 798 | ] 799 | }, 800 | "execution_count": 21, 801 | "metadata": {}, 802 | "output_type": "execute_result" 803 | } 804 | ], 805 | "source": [ 806 | "predictions = new_pipeline.predict(test_target_times, readings)\n", 807 | "predictions[0:5]" 808 | ] 809 | } 810 | ], 811 | "metadata": { 812 | "kernelspec": { 813 | "display_name": "Python 3 (ipykernel)", 814 | "language": "python", 815 | "name": "python3" 816 | }, 817 | "language_info": { 818 | "codemirror_mode": { 819 | "name": "ipython", 820 | "version": 3 821 | }, 822 | "file_extension": ".py", 823 | "mimetype": "text/x-python", 824 | "name": "python", 825 | "nbconvert_exporter": "python", 826 | "pygments_lexer": "ipython3", 827 | "version": "3.8.16" 828 | } 829 | }, 830 | "nbformat": 4, 831 | "nbformat_minor": 2 832 | } 833 | -------------------------------------------------------------------------------- /tutorials/Convert NASA CMAPSS to Draco Format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2f3d8acf", 6 | "metadata": {}, 7 | "source": [ 8 | "# Convert CMAPSS to Draco Format\n", 9 | "\n", 10 | "In this notebook we download [CMAPSS](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan) data and reformat it as Draco pipelines expect." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "f39b805c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import datetime\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "\n", 24 | "import matplotlib.pyplot as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "626a2da0", 30 | "metadata": {}, 31 | "source": [ 32 | "## 1. Download Data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "ff641cff", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import io\n", 43 | "import os\n", 44 | "import urllib\n", 45 | "import zipfile\n", 46 | "\n", 47 | "DATA_URL = 'https://d3-ai-greenguard.s3.amazonaws.com/CMAPSSData.zip'\n", 48 | "\n", 49 | "response = urllib.request.urlopen(DATA_URL)\n", 50 | "bytes_io = io.BytesIO(response.read())\n", 51 | "\n", 52 | "with zipfile.ZipFile(bytes_io) as zf:\n", 53 | " zf.extractall('CMAPSSData')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "9c435699", 59 | "metadata": {}, 60 | "source": [ 61 | "## 2. 
Read Data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "id": "1bb002ac", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# columns\n", 72 | "\n", 73 | "index = ['unit number', 'time, in cycles']\n", 74 | "setting = ['operational setting {}'.format(i + 1) for i in range(0, 3)]\n", 75 | "sensor = ['sensor measurement {}'.format(i + 1) for i in range(0, 21)]\n", 76 | "\n", 77 | "all_columns = index + setting + sensor" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "74478b0f", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "train = pd.read_csv('CMAPSSData/train_FD001.txt', sep=' ', header=None)\n", 88 | "train = train.dropna(axis=1)\n", 89 | "train.columns = all_columns\n", 90 | "\n", 91 | "test = pd.read_csv('CMAPSSData/test_FD001.txt', sep=' ', header=None)\n", 92 | "test = test.dropna(axis=1)\n", 93 | "test.columns = all_columns\n", 94 | "\n", 95 | "y_test = pd.read_csv('CMAPSSData/RUL_FD001.txt', sep=' ', header=None)\n", 96 | "y_test = y_test.dropna(axis=1)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "dd480185", 102 | "metadata": {}, 103 | "source": [ 104 | "## 3. Create columns\n", 105 | "\n", 106 | "### 3.a create `RUL` column\n", 107 | "How do we create **Remaining Useful Life (RUL)** column for the training dataset? We can assume that the last entry in the training dataset is the maximum life expectancy for that unit. Then each cycle we have will decrease by that number." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "id": "eb0270ba", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "def get_max(x):\n", 118 | " return cycles_max[x]\n", 119 | "\n", 120 | "cycles_max = train.groupby(\"unit number\")[\"time, in cycles\"].max().to_dict()\n", 121 | "cycles_max = train['unit number'].apply(get_max)\n", 122 | "\n", 123 | "train['RUL'] = cycles_max - train[\"time, in cycles\"]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "57fbd3b9", 129 | "metadata": {}, 130 | "source": [ 131 | "### 3.b create `cutoff_time` column\n", 132 | "`cutoff_time` is a datetime column with relation to the `cycle` number. We pick a start date and start incrementing from there." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "id": "3e320356", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "def get_timestamp(x):\n", 143 | " return start + datetime.timedelta(minutes=x * 10)\n", 144 | "\n", 145 | "start = datetime.datetime(2013, 1, 12)\n", 146 | "train['timestamp'] = train['time, in cycles'].apply(get_timestamp)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "id": "11f78b71", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def get_timestamp_test(x):\n", 157 | " return last[x['unit number']] + datetime.timedelta(minutes=x['time, in cycles'] * 10)\n", 158 | "\n", 159 | "last = train.groupby('unit number').last()['timestamp'].to_dict()\n", 160 | "test['timestamp'] = test.apply(get_timestamp_test, axis=1)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "95bec88f", 166 | "metadata": {}, 167 | "source": [ 168 | "### 4. Format Data\n", 169 | "\n", 170 | "make `label_times` have three columns, namely: `['turbine_id', 'cutoff_time', 'target']`." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 8, 176 | "id": "1ce4320e", 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/html": [ 182 | "
\n", 183 | "\n", 196 | "\n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | "
turbine_idcutoff_timetarget
2512013-01-12 04:20:00166
2612013-01-12 04:30:00165
2712013-01-12 04:40:00164
2812013-01-12 04:50:00163
2912013-01-12 05:00:00162
\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | " turbine_id cutoff_time target\n", 242 | "25 1 2013-01-12 04:20:00 166\n", 243 | "26 1 2013-01-12 04:30:00 165\n", 244 | "27 1 2013-01-12 04:40:00 164\n", 245 | "28 1 2013-01-12 04:50:00 163\n", 246 | "29 1 2013-01-12 05:00:00 162" 247 | ] 248 | }, 249 | "execution_count": 8, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "train_label_times = train[['unit number', 'timestamp', 'RUL']].copy()\n", 256 | "train_label_times.columns = ['turbine_id', 'cutoff_time', 'target']\n", 257 | "\n", 258 | "# drop first 24 occurances\n", 259 | "train_label_times = train_label_times[train_label_times.groupby('turbine_id').cumcount('turbine_id') > 24]\n", 260 | "train_label_times.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 9, 266 | "id": "f320e753", 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | "
turbine_idcutoff_timetarget
012013-01-13 13:10:00112.0
122013-01-14 08:00:0098.0
232013-01-14 02:50:0069.0
342013-01-14 01:10:0082.0
452013-01-14 13:10:0091.0
\n", 328 | "
" 329 | ], 330 | "text/plain": [ 331 | " turbine_id cutoff_time target\n", 332 | "0 1 2013-01-13 13:10:00 112.0\n", 333 | "1 2 2013-01-14 08:00:00 98.0\n", 334 | "2 3 2013-01-14 02:50:00 69.0\n", 335 | "3 4 2013-01-14 01:10:00 82.0\n", 336 | "4 5 2013-01-14 13:10:00 91.0" 337 | ] 338 | }, 339 | "execution_count": 9, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "test_label_times = test[['unit number', 'timestamp']].groupby('unit number').last().reset_index()\n", 346 | "test_label_times.columns = ['turbine_id', 'cutoff_time']\n", 347 | "test_label_times['target'] = np.array(y_test).astype('float32')\n", 348 | "test_label_times.head()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 10, 354 | "id": "50be8dc4", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "reading_columns = ['unit number', 'timestamp'] + setting + sensor\n", 359 | "readings = pd.concat([train, test])[reading_columns]\n", 360 | "readings = readings.melt(id_vars=['unit number', 'timestamp'])\n", 361 | "readings.columns = ['turbine_id', 'timestamp', 'signal_id', 'value']" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "id": "01a77e60", 367 | "metadata": {}, 368 | "source": [ 369 | "## 5. Save Data" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 11, 375 | "id": "5f622ff7", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "readings.to_csv('rul_readings.csv.gz', compression='gzip', index=False)\n", 380 | "train_label_times.to_csv('rul_train_target_times.csv.gz', compression='gzip', index=False)\n", 381 | "test_label_times.to_csv('rul_test_target_times.csv.gz', compression='gzip', index=False)" 382 | ] 383 | } 384 | ], 385 | "metadata": { 386 | "kernelspec": { 387 | "display_name": "Python 3 (ipykernel)", 388 | "language": "python", 389 | "name": "python3" 390 | }, 391 | "language_info": { 392 | "codemirror_mode": { 393 | "name": "ipython", 394 | "version": 3 395 | }, 396 | "file_extension": ".py", 397 | "mimetype": "text/x-python", 398 | "name": "python", 399 | "nbconvert_exporter": "python", 400 | "pygments_lexer": "ipython3", 401 | "version": "3.7.11" 402 | } 403 | }, 404 | "nbformat": 4, 405 | "nbformat_minor": 5 406 | } 407 | --------------------------------------------------------------------------------