├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── DATABASE.md ├── DATA_FORMAT.md ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docker ├── .dockerignore ├── Dockerfile ├── README.md └── greenguard-deployment.yml ├── docs ├── Makefile ├── advanced_usage │ ├── concepts.md │ ├── csv.md │ └── docker.md ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── images │ ├── Draco-200.png │ ├── Draco.ico │ ├── Draco.png │ ├── dai-logo.png │ └── favicon.ico ├── index.rst ├── make.bat └── readme.rst ├── draco ├── __init__.py ├── benchmark.py ├── db.py ├── demo.py ├── loaders │ ├── __init__.py │ └── csv.py ├── metrics.py ├── pipeline.py ├── pipelines │ ├── double_lstm │ │ ├── double_lstm.json │ │ ├── double_lstm_prob.json │ │ ├── double_lstm_prob_with_unstack.json │ │ └── double_lstm_with_unstack.json │ ├── dummy │ │ └── dummy.json │ ├── lstm │ │ ├── lstm.json │ │ ├── lstm_prob.json │ │ ├── lstm_prob_with_unstack.json │ │ └── lstm_with_unstack.json │ └── lstm_regressor │ │ ├── lstm_regressor.json │ │ └── lstm_regressor_with_unstack.json ├── primitives │ ├── mlblocks.MLPipeline.json │ ├── numpy.take.json │ └── xgboost.XGBClassifier:probabilities.json ├── results.py ├── targets.py └── utils.py ├── setup.cfg ├── setup.py ├── tests ├── test_benchmark.py ├── test_metrics.py └── test_pipeline.py ├── tox.ini └── tutorials ├── 01_Draco_Machine_Learning.ipynb ├── 02_Extract_Readings.ipynb ├── 03_Benchmarking.ipynb ├── 04_Draco_Regression_Pipeline.ipynb ├── Convert NASA CMAPSS to Draco Format.ipynb └── pipelines ├── double_lstm_with_unstack.ipynb ├── lstm_regressor_with_unstack.ipynb └── lstm_with_unstack.ipynb /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.py] 14 | max_line_length = 99 15 | 16 | [*.bat] 17 | indent_style = tab 18 | end_of_line = crlf 19 | 20 | [LICENSE] 21 | insert_final_newline = false 22 | 23 | [Makefile] 24 | indent_style = tab 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * Draco version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 
15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate Docs 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Python 15 | uses: actions/setup-python@v1 16 | with: 17 | python-version: '3.7' 18 | 19 | - name: Build 20 | run: | 21 | sudo apt install pandoc 22 | python -m pip install --upgrade pip 23 | pip install -e .[dev] 24 | make docs 25 | - name: Deploy 26 | uses: peaceiris/actions-gh-pages@v3 27 | with: 28 | github_token: ${{secrets.GITHUB_TOKEN}} 29 | publish_dir: docs/_build/html 30 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | branches: [ '*' ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | docs: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | python-version: [3.8] 15 | os: [ubuntu-latest] 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install package 23 | run: python -m pip install .[dev] 24 | - name: make docs 25 | run: make docs 26 | 27 | lint: 28 | runs-on: ${{ matrix.os }} 29 | strategy: 30 | matrix: 31 | python-version: [3.6, 3.7, 3.8] 32 | os: [ubuntu-20.04] 33 | steps: 34 | - uses: actions/checkout@v1 35 | - name: Set up Python ${{ matrix.python-version }} 36 | uses: actions/setup-python@v2 37 | with: 38 | python-version: ${{ matrix.python-version }} 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install .[dev] 43 | - name: make lint 44 | run: make lint 45 | 46 | readme: 47 | runs-on: ${{ matrix.os }} 48 | strategy: 49 | matrix: 50 | python-version: [3.6, 3.7, 3.8] 51 | os: [ubuntu-20.04] 52 | steps: 53 | - uses: actions/checkout@v1 54 | - name: Set up Python ${{ matrix.python-version }} 55 | uses: actions/setup-python@v2 56 | with: 57 | python-version: ${{ matrix.python-version }} 58 | - name: Install libgomp1 59 | run: | 60 | sudo apt-get install libgomp1 61 | - name: Install dependencies 62 | run: | 63 | python -m pip install --upgrade pip 64 | pip install rundoc . 
65 | - name: make readme 66 | run: make test-readme 67 | 68 | unit: 69 | runs-on: ${{ matrix.os }} 70 | strategy: 71 | matrix: 72 | python-version: [3.6, 3.7, 3.8] 73 | os: [ubuntu-20.04, macos-latest] 74 | steps: 75 | - uses: actions/checkout@v1 76 | - name: Set up Python ${{ matrix.python-version }} 77 | uses: actions/setup-python@v2 78 | with: 79 | python-version: ${{ matrix.python-version }} 80 | - name: Install dependencies 81 | run: | 82 | python -m pip install --upgrade pip 83 | pip install .[test] 84 | - name: make unit 85 | run: make test-unit 86 | 87 | minimum: 88 | runs-on: ${{ matrix.os }} 89 | strategy: 90 | matrix: 91 | python-version: [3.6, 3.7, 3.8] 92 | os: [ubuntu-20.04] 93 | steps: 94 | - uses: actions/checkout@v1 95 | - name: Set up Python ${{ matrix.python-version }} 96 | uses: actions/setup-python@v2 97 | with: 98 | python-version: ${{ matrix.python-version }} 99 | - name: Install dependencies 100 | run: | 101 | python -m pip install --upgrade pip 102 | pip install .[test] 103 | - name: make minimum 104 | run: make test-minimum 105 | 106 | tutorials: 107 | runs-on: ${{ matrix.os }} 108 | strategy: 109 | matrix: 110 | python-version: [3.6, 3.7, 3.8] 111 | os: [ubuntu-20.04] 112 | steps: 113 | - uses: actions/checkout@v1 114 | - name: Set up Python ${{ matrix.python-version }} 115 | uses: actions/setup-python@v2 116 | with: 117 | python-version: ${{ matrix.python-version }} 118 | - name: Install dependencies 119 | run: | 120 | python -m pip install --upgrade pip 121 | pip install jupyter . 122 | - name: make tutorials 123 | run: make test-tutorials 124 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | docs/api/ 68 | docs/tutorials/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Vim 107 | .*.swp 108 | 109 | draco/demo/ 110 | notebooks/ 111 | notebooks-private/ 112 | scripts/ 113 | dask-worker-space/ 114 | tutorials/*.pkl 115 | 116 | *.pkl 117 | *.DS_Store 118 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | dist: bionic 3 | language: python 4 | python: 5 | - 3.7 6 | - 3.6 7 | 8 | # Command to install dependencies 9 | install: 10 | - sudo apt-get install pandoc 11 | - pip install -U tox-travis codecov 12 | 13 | after_success: codecov 14 | 15 | # Command to run tests 16 | script: tox 17 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Credits 2 | ======= 3 | 4 | * Carles Sala 5 | * Kalyan Veeramachaneni 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every little bit 8 | helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at the `GitHub Issues page`_. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 30 | wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | Draco could always use more documentation, whether as part of the 42 | official Draco docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 
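When contributing docstrings, follow the `Google docstrings style`_ that is also
referenced in the development guidelines below. As a purely illustrative sketch
(the function, its arguments and its behavior are made up for this example), a
documented helper could look like this::

    def resample_readings(readings, rule='1h'):
        """Resample a readings table to a fixed frequency.

        Args:
            readings (pandas.DataFrame):
                Table with ``turbine_id``, ``signal_id``, ``timestamp``
                and ``value`` columns.
            rule (str):
                pandas offset alias to resample by. Defaults to ``'1h'``.

        Returns:
            pandas.DataFrame:
                Mean ``value`` per turbine, signal and resampled timestamp.
        """
        return (
            readings
            .set_index('timestamp')
            .groupby(['turbine_id', 'signal_id'])['value']
            .resample(rule)
            .mean()
            .reset_index()
        )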
44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at the `GitHub Issues page`_. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `Draco` for local development. 61 | 62 | 1. Fork the `Draco` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/Draco.git 66 | 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, 68 | this is how you set up your fork for local development:: 69 | 70 | $ mkvirtualenv Draco 71 | $ cd Draco/ 72 | $ make install-develop 73 | 74 | 4. Create a branch for local development:: 75 | 76 | $ git checkout -b name-of-your-bugfix-or-feature 77 | 78 | Try to use the naming scheme of prefixing your branch with ``gh-X`` where X is 79 | the associated issue, such as ``gh-3-fix-foo-bug``. And if you are not 80 | developing on your own fork, further prefix the branch with your GitHub 81 | username, like ``githubusername/gh-3-fix-foo-bug``. 82 | 83 | Now you can make your changes locally. 84 | 85 | 5. While hacking your changes, make sure to cover all your developments with the required 86 | unit tests, and that none of the old tests fail as a consequence of your changes. 87 | For this, make sure to run the tests suite and check the code coverage:: 88 | 89 | $ make lint # Check code styling 90 | $ make test # Run the tests 91 | $ make coverage # Get the coverage report 92 | 93 | 6. When you're done making changes, check that your changes pass all the styling checks and 94 | tests, including other Python supported versions, using:: 95 | 96 | $ make test-all 97 | 98 | 7. Make also sure to include the necessary documentation in the code as docstrings following 99 | the `Google docstrings style`_. 100 | If you want to view how your documentation will look like when it is published, you can 101 | generate and view the docs with this command:: 102 | 103 | $ make view-docs 104 | 105 | 8. Commit your changes and push your branch to GitHub:: 106 | 107 | $ git add . 108 | $ git commit -m "Your detailed description of your changes." 109 | $ git push origin name-of-your-bugfix-or-feature 110 | 111 | 9. Submit a pull request through the GitHub website. 112 | 113 | Pull Request Guidelines 114 | ----------------------- 115 | 116 | Before you submit a pull request, check that it meets these guidelines: 117 | 118 | 1. It resolves an open GitHub Issue and contains its reference in the title or 119 | the comment. If there is no associated issue, feel free to create one. 120 | 2. Whenever possible, it resolves only **one** issue. If your PR resolves more than 121 | one issue, try to split it in more than one pull request. 122 | 3. The pull request should include unit tests that cover all the changed code 123 | 4. If the pull request adds functionality, the docs should be updated. Put 124 | your new functionality into a function with a docstring, and add the 125 | feature to the documentation in an appropriate place. 126 | 5. The pull request should work for all the supported Python versions. Check the `Travis Build 127 | Status page`_ and make sure that all the checks pass. 
128 | 129 | Unit Testing Guidelines 130 | ----------------------- 131 | 132 | All the Unit Tests should comply with the following requirements: 133 | 134 | 1. Unit Tests should be based only in unittest and pytest modules. 135 | 136 | 2. The tests that cover a module called ``draco/path/to/a_module.py`` 137 | should be implemented in a separated module called 138 | ``tests/draco/path/to/test_a_module.py``. 139 | Note that the module name has the ``test_`` prefix and is located in a path similar 140 | to the one of the tested module, just inside the ``tests`` folder. 141 | 142 | 3. Each method of the tested module should have at least one associated test method, and 143 | each test method should cover only **one** use case or scenario. 144 | 145 | 4. Test case methods should start with the ``test_`` prefix and have descriptive names 146 | that indicate which scenario they cover. 147 | Names such as ``test_some_methed_input_none``, ``test_some_method_value_error`` or 148 | ``test_some_method_timeout`` are right, but names like ``test_some_method_1``, 149 | ``some_method`` or ``test_error`` are not. 150 | 151 | 5. Each test should validate only what the code of the method being tested does, and not 152 | cover the behavior of any third party package or tool being used, which is assumed to 153 | work properly as far as it is being passed the right values. 154 | 155 | 6. Any third party tool that may have any kind of random behavior, such as some Machine 156 | Learning models, databases or Web APIs, will be mocked using the ``mock`` library, and 157 | the only thing that will be tested is that our code passes the right values to them. 158 | 159 | 7. Unit tests should not use anything from outside the test and the code being tested. This 160 | includes not reading or writing to any file system or database, which will be properly 161 | mocked. 162 | 163 | Tips 164 | ---- 165 | 166 | To run a subset of tests:: 167 | 168 | $ python -m pytest tests.test_draco 169 | $ python -m pytest -k 'foo' 170 | 171 | Release Workflow 172 | ---------------- 173 | 174 | The process of releasing a new version involves several steps combining both ``git`` and 175 | ``bumpversion`` which, briefly: 176 | 177 | 1. Merge what is in ``master`` branch into ``stable`` branch. 178 | 2. Update the version in ``setup.cfg``, ``draco/__init__.py`` and 179 | ``HISTORY.md`` files. 180 | 3. Create a new git tag pointing at the corresponding commit in ``stable`` branch. 181 | 4. Merge the new commit from ``stable`` into ``master``. 182 | 5. Update the version in ``setup.cfg`` and ``draco/__init__.py`` 183 | to open the next development iteration. 184 | 185 | .. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new 186 | entry that explains the changes that will be included in the new version. 187 | Normally this is just a list of the Pull Requests that have been merged to master 188 | since the last release. 189 | 190 | Once this is done, run of the following commands: 191 | 192 | 1. If you are releasing a patch version:: 193 | 194 | make release 195 | 196 | 2. If you are releasing a minor version:: 197 | 198 | make release-minor 199 | 200 | 3. 
If you are releasing a major version:: 201 | 202 | make release-major 203 | 204 | Release Candidates 205 | ~~~~~~~~~~~~~~~~~~ 206 | 207 | Sometimes it is necessary or convenient to upload a release candidate to PyPi as a pre-release, 208 | in order to make some of the new features available for testing on other projects before they 209 | are included in an actual full-blown release. 210 | 211 | In order to perform such an action, you can execute:: 212 | 213 | make release-candidate 214 | 215 | This will perform the following actions: 216 | 217 | 1. Build and upload the current version to PyPi as a pre-release, with the format ``X.Y.Z.devN`` 218 | 219 | 2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)`` 220 | 221 | After this is done, the new pre-release can be installed by including the ``dev`` section in the 222 | dependency specification, either in ``setup.py``:: 223 | 224 | install_requires = [ 225 | ... 226 | 'draco>=X.Y.Z.dev', 227 | ... 228 | ] 229 | 230 | or in command line:: 231 | 232 | pip install 'draco>=X.Y.Z.dev' 233 | 234 | 235 | .. _GitHub issues page: https://github.com/sintel-dev/Draco/issues 236 | .. _Travis Build Status page: https://travis-ci.org/sintel-dev/Draco/pull_requests 237 | .. _Google docstrings style: https://google.github.io/styleguide/pyguide.html?showone=Comments#Comments 238 | -------------------------------------------------------------------------------- /DATABASE.md: -------------------------------------------------------------------------------- 1 | # Database Schema 2 | 3 | The **Draco Database** contains the following collections and relationships 4 | 5 | * Farm 6 | * Trubine 7 | * Farm 8 | * Signal 9 | * Sensor 10 | * Turbine 11 | * Signal 12 | * Reading 13 | * Sensor 14 | * PipelineTemplate 15 | * Pipeline 16 | * PipelineTemplate 17 | * MLTask 18 | * Turbine - multiple 19 | * Target 20 | * MLTask 21 | * Experiment 22 | * MLTask 23 | * PipelineTemplate 24 | * Signal - multiple 25 | * ExperimenRun 26 | * Experiment 27 | * PipelineRun 28 | * Pipeline 29 | * ExperimentRun 30 | 31 | ## Farm 32 | 33 | A **Farm** represents a physical Wind Turbines Farm. This collection groups together multiple 34 | Turbines with shared properties, such as location. 35 | 36 | ### Fields 37 | 38 | * `_id (ObjectID)`: Unique Identifier of this Object 39 | * `name (String)`: Name or code given to this Object 40 | * `insert_time (DateTime)`: Time when this Object was inserted 41 | * `created_by (String)`: Identifier of the user that created this Object 42 | 43 | ## Turbine 44 | 45 | A **Turbine** represents a physical Turbine. A Turbine is part of a **Farm**, and has some 46 | particular properties, such as the Turbine manufacturer. 47 | 48 | ### Fields 49 | 50 | * `_id (ObjectID)`: Unique Identifier of this Object 51 | * `farm_id (ObjectID)`: Unique Identifier of the Farm to which this Turbine belongs 52 | * `name (String)`: Name or code given to this Object 53 | * `manufacturer (String)`: Name or code of the manufacturer - EXAMPLE 54 | * `model (String)`: Name or code of the model - EXAMPLE 55 | * `insert_time (DateTime)`: Time when this Object was inserted 56 | * `created_by (String)`: Identifier of the user that created this Object 57 | 58 | ## Signal 59 | 60 | The **Signal** collection contains the details about each Signal type. 61 | This includes shared properties of the signal, such as the sensor type or the measurement units. 
62 | 63 | ### Fields 64 | 65 | * `_id (ObjectID)`: Unique Identifier of this Object 66 | * `name (String)`: Name or code given to this Object 67 | * `type (String)`: Type of sensor - EXAMPLE 68 | * `created_by (String)`: Identifier of the user that created this Object 69 | * `insert_time (DateTime)`: Time when this Object was inserted 70 | 71 | ## Sensor 72 | 73 | A **Sensor** represents a physical sensor that is installed in a Turbine. 74 | The Sensor collection specifies the turbine and the signal type, as well as properties 75 | about the Sensor such as the Sensor manufacturer and model and its age. 76 | 77 | ### Fields 78 | 79 | * `_id (ObjectID)`: Unique Identifier of this Object 80 | * `turbine_id (ObjectID)`: Unique Identifier of the Turbine where this Sensor is installed 81 | * `signal_id (ObjectID)`: Unique Identifier of the Signal type of this Sensor 82 | * `name (String)`: Name or code given to this Object 83 | * `manufacturer (String)`: Name or code of the manufacturer - EXAMPLE 84 | * `model (String)`: Name or code of the model - EXAMPLE 85 | * `installation_date (DateTime)`: Time when this Sensor was installed - EXAMPLE 86 | * `insert_time (DateTime)`: Time when this Object was inserted 87 | * `created_by (String)`: Identifier of the user that created this Object 88 | 89 | ## Reading 90 | 91 | The **Readings** collection contains all the data readings from a Sensor. 92 | 93 | ### Fields 94 | 95 | * `_id (ObjectID)`: Unique Identifier of this Object 96 | * `sensor_id (ObjectID)`: Unique Identifier of the Sensor to which this Reading belongs. 97 | * `timestamp (DateTime)`: Time where this reading took place 98 | * `value (float)`: Value of the reading 99 | 100 | ## PipelineTemplate 101 | 102 | The **PipelineTemplate** collection contains all the pipeline templates from which the 103 | pipelines that later on will be used to run an experiments are generated. 104 | The template includes all the default hyperparameter values, as well as the tunable 105 | hyperparameter ranges. 106 | 107 | ### Fields 108 | 109 | * `_id (ObjectID)`: Unique Identifier of this PipelineTemplate object 110 | * `name (String)`: Name or code given to this Object 111 | * `template (SubDocument)`: JSON representation of this pipeline template 112 | * `insert_time (DateTime)`: Time when this Object was inserted 113 | * `created_by (String)`: Identifier of the user that created this Object 114 | 115 | ## Pipeline 116 | 117 | The **Pipeline** collection contains all the pipelines registered in the system, including 118 | their details, such as the list of primitives and all the configured hyperparameter values. 119 | 120 | ### Fields 121 | 122 | * `_id (ObjectID)`: Unique Identifier of this object 123 | * `name (String)`: Name or code given to this Object 124 | * `pipeline_template_id (ObjectID)`: Unique Identifier of the PipelineTemplate used to generate this pipeline 125 | * `pipeline (SubDocument)`: JSON representation of this pipeline object 126 | * `insert_time (DateTime)`: Time when this Object was inserted 127 | * `created_by (String)`: Identifier of the user that created this Object 128 | 129 | ## MLTask 130 | 131 | An **MLTask** is a specific Machine Learning Problem consisting on a prediction that 132 | is to be made using a Pipeline. 
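For illustration only, a hypothetical MLTask document could look like the sketch below. The identifiers and values are made up for this example, and the available fields are the ones listed in the next section:

```python3
from datetime import datetime

# Hypothetical document; ObjectIDs are shown here as placeholder strings.
mltask = {
    '_id': '5f8a...e001',
    'name': 'turbine_failure_1d',
    'description': 'Predict turbine failures one day in advance',
    'type': 'classification',
    'turbine_set': ['5f8a...t001', '5f8a...t002'],
    'insert_time': datetime(2020, 1, 1),
    'created_by': 'draco',
}
```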
133 | 134 | ### Fields 135 | 136 | * `_id (ObjectID)`: Unique Identifier of this object 137 | * `name (String)`: Name or code given to this Object 138 | * `description (String)`: Short text description of this task 139 | * `type (String)`: Type of Machine Learning Task 140 | * `turbine_set (List[ObjectID])`: List of IDs of the Turbines to which this MLTask is applied 141 | * `insert_time (DateTime)`: Time when this Object was inserted 142 | * `created_by (String)`: Identifier of the user that created this Object 143 | 144 | ## Target 145 | 146 | The **Target** collection contains the **MLTask** targets with their cutoff times. 147 | 148 | ### Fields 149 | 150 | * `_id (ObjectID)`: Unique Identifier of this Object 151 | * `mltask_id (ObjectID)`: Unique Identifier of the MLTask to which this target belongs 152 | * `turbine_id (ObjectID)`: Unique Identifier of the Turbine associated with this target 153 | * `cutoff_time (DateTime)`: Time associated with this Target 154 | 155 | ## Experiment 156 | 157 | An **Experiment** represents the process of trying and tuning a PipelineTemplate in order 158 | to solve a MLTask. 159 | 160 | ### Fields 161 | 162 | * `_id (ObjectID)`: Unique Identifier of this Object 163 | * `name (String)`: Name or code given to this Object 164 | * `mltask_id (ObjectID)`: Unique Identifier of the MLTask to which this Experiment belongs 165 | * `pipeline_template_id (ObjectID)`: Unique Identifier of the PipelineTemplate used in this Experiment 166 | * `sensor_set (List[ObjectID])`: List of IDs of the Sensors used for this Experiment 167 | * `cv_folds (integer)`: Number of folds used for Cross Validation 168 | * `stratified (bool)`: Whether the Cross Validation was stratified or not 169 | * `random_state (integer)`: Random State used for the Cross Validation shuffling 170 | * `metric (string)`: Name of the metric used 171 | * `insert_time (DateTime)`: Time when this Object was inserted 172 | * `created_by (String)`: Identifier of the user that created this Object 173 | 174 | ## ExperimentRun 175 | 176 | An **ExperimentRun** represents a single execution of an Experiment. 177 | 178 | ### Fields 179 | 180 | * `_id (ObjectID)`: Unique Identifier of this Object 181 | * `experiment_id (ObjectID - Foreign Key)`: Unique Identifier of the Experiment 182 | * `start_time (DateTime)`: When the execution started 183 | * `end_time (DateTime)`: When the execution ended 184 | * `software_versions (List of Strings)`: version of each python dependency installed in the 185 | *virtualenv* when the execution took place 186 | * `budget_type (String)`: Type of budget used (time or number of iterations) 187 | * `budget_amount (Integer)`: Budget amount 188 | * `status (String)`: Whether the ExperimentRun is still running, finished successfully or failed 189 | * `insert_time (DateTime)`: Time when this Object was inserted 190 | * `created_by (String)`: Identifier of the user that created this Object 191 | 192 | ## PipelineRun 193 | 194 | A **PipelineRun** represents a single execution of a Pipeline instance over a MLTask. 195 | 196 | It contains information about whether the execution was successful or not, when it started 197 | and ended and the cross validation score obtained. 
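As a sketch of how this collection could be queried, the snippet below retrieves the successful run with the highest score. Note that the database name, the collection name and the `SUCCESS` status value are assumptions made for this example rather than part of the documented schema, and the snippet presumes a MongoDB backend reachable through `pymongo`:

```python3
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['draco']  # assumed database name

# Best scoring run among the ones that finished successfully (assumed status value).
best_run = db['pipeline_runs'].find_one(
    {'status': 'SUCCESS'},
    sort=[('score', -1)],
)
```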
198 | 199 | ### Fields 200 | 201 | * `_id (ObjectID)`: Unique Identifier of this Object 202 | * `experimentrun_id (ObjectID)`: Unique Identifier of the ExperimentRun to which this PipelineRun belongs 203 | * `pipeline_id (ObjectID)`: Unique Identifier of the Pipeline 204 | * `start_time (DateTime)`: When the execution started 205 | * `end_time (DateTime)`: When the execution ended 206 | * `score (float)`: Cross Validation score obtained 207 | * `status (String)`: Whether the Signalrun is still running, finished successfully or failed 208 | * `insert_time (DateTime)`: Time when this Object was inserted 209 | -------------------------------------------------------------------------------- /DATA_FORMAT.md: -------------------------------------------------------------------------------- 1 | # Draco Data Format 2 | 3 | ## Input 4 | 5 | The minimum input expected by the **Draco** system consists of the following two elements, 6 | which need to be passed as `pandas.DataFrame` objects: 7 | 8 | ### Target Times 9 | 10 | A table containing the specification of the problem that we are solving, which has three 11 | columns: 12 | 13 | * `turbine_id`: Unique identifier of the turbine which this label corresponds to. 14 | * `cutoff_time`: Time associated with this target 15 | * `target`: The value that we want to predict. This can either be a numerical value or a 16 | categorical label. This column can also be skipped when preparing data that will be used 17 | only to make predictions and not to fit any pipeline. 18 | 19 | | | turbine_id | cutoff_time | target | 20 | |----|--------------|---------------------|----------| 21 | | 0 | T1 | 2001-01-02 00:00:00 | 0 | 22 | | 1 | T1 | 2001-01-03 00:00:00 | 1 | 23 | | 2 | T2 | 2001-01-04 00:00:00 | 0 | 24 | 25 | ### Readings 26 | 27 | A table containing the signal data from the different sensors, with the following columns: 28 | 29 | * `turbine_id`: Unique identifier of the turbine which this reading comes from. 30 | * `signal_id`: Unique identifier of the signal which this reading comes from. 31 | * `timestamp (datetime)`: Time where the reading took place, as a datetime. 32 | * `value (float)`: Numeric value of this reading. 33 | 34 | | | turbine_id | signal_id | timestamp | value | 35 | |----|--------------|-------------|---------------------|---------| 36 | | 0 | T1 | S1 | 2001-01-01 00:00:00 | 1 | 37 | | 1 | T1 | S1 | 2001-01-01 12:00:00 | 2 | 38 | | 2 | T1 | S1 | 2001-01-02 00:00:00 | 3 | 39 | | 3 | T1 | S1 | 2001-01-02 12:00:00 | 4 | 40 | | 4 | T1 | S1 | 2001-01-03 00:00:00 | 5 | 41 | | 5 | T1 | S1 | 2001-01-03 12:00:00 | 6 | 42 | | 6 | T1 | S2 | 2001-01-01 00:00:00 | 7 | 43 | | 7 | T1 | S2 | 2001-01-01 12:00:00 | 8 | 44 | | 8 | T1 | S2 | 2001-01-02 00:00:00 | 9 | 45 | | 9 | T1 | S2 | 2001-01-02 12:00:00 | 10 | 46 | | 10 | T1 | S2 | 2001-01-03 00:00:00 | 11 | 47 | | 11 | T1 | S2 | 2001-01-03 12:00:00 | 12 | 48 | 49 | ### Turbines 50 | 51 | Optionally, a third table can be added containing metadata about the turbines. 52 | The only requirement for this table is to have a `turbine_id` field, and it can have 53 | an arbitraty number of additional fields. 54 | 55 | | | turbine_id | manufacturer | ... | ... | ... | 56 | |----|--------------|----------------|-------|-------|-------| 57 | | 0 | T1 | Siemens | ... | ... | ... | 58 | | 1 | T2 | Siemens | ... | ... | ... 
| 59 | 60 | 61 | ## CSV Format 62 | 63 | As explained in a previous section, the input expected by the **Draco** system consists of 64 | two tables which need to be passed as `pandas.DataFrame` objects: 65 | 66 | * The `target_times` table, which containing the specification of the problem that we are solving 67 | in the form of training examples with a `turbine_id`, a `cutoff_time` and a `target` value. 68 | * The `readings` table, which contains the signal readings from the different sensors, with 69 | `turbine_id`, `signal_id`, `timestamp` and `value` fields. 70 | 71 | However, in most scenarios the size of the available will far exceed the memory limitations 72 | of the system on which **Draco** is being run, so loading all the data in a single 73 | `pandas.DataFrame` will not be possible. 74 | 75 | In order to solve this situation, **Draco** provides a [CSVLoader]( 76 | https://sintel-dev.github.io/Draco/api/draco.loaders.csv.html#draco.loaders.csv.CSVLoader) 77 | class which can be used to load data from what we call the **Raw Data Format**. 78 | 79 | ### Raw Data Format 80 | 81 | The **Raw Data Format** consists on a collection of CSV files stored in a single folder with the 82 | following structure: 83 | 84 | #### Folder Structure 85 | 86 | * All the data from all the turbines is inside a single folder, which here we will call `readings`. 87 | * Inside the `readings` folder, one folder exists for each turbine, named exactly like the turbine: 88 | * `readings/T001` 89 | * `readings/T002` 90 | * ... 91 | * Inside each turbine folder one CSV file exists for each month, named `%Y-%m.csv`. 92 | * `readings/T001/2010-01.csv` 93 | * `readings/T001/2010-02.csv` 94 | * `readings/T001/2010-03.csv` 95 | * ... 96 | 97 | #### CSV Contents 98 | 99 | * Each CSV file contains three columns: 100 | * `signal_id`: name or id of the signal. 101 | * ``timestamp``: timestamp of the reading formatted as ``%m/%d/%y %H:%M:%S``. 102 | * `value`: value of the reading. 103 | 104 | This is an example of what a CSV contents look like: 105 | 106 | | | signal_id | timestamp | value | 107 | |----|-------------|-------------------|---------| 108 | | 0 | S1 | 01/01/01 00:00:00 | 1 | 109 | | 1 | S1 | 01/01/01 12:00:00 | 2 | 110 | | 2 | S1 | 01/02/01 00:00:00 | 3 | 111 | | 3 | S1 | 01/02/01 12:00:00 | 4 | 112 | | 4 | S1 | 01/03/01 00:00:00 | 5 | 113 | | 5 | S1 | 01/03/01 12:00:00 | 6 | 114 | | 6 | S2 | 01/01/01 00:00:00 | 7 | 115 | | 7 | S2 | 01/01/01 12:00:00 | 8 | 116 | | 8 | S2 | 01/02/01 00:00:00 | 9 | 117 | | 9 | S2 | 01/02/01 12:00:00 | 10 | 118 | | 10 | S2 | 01/03/01 00:00:00 | 11 | 119 | | 11 | S2 | 01/03/01 12:00:00 | 12 | 120 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | # History 2 | 3 | ## 0.3.0 - 2022-07-31 4 | 5 | This release switches from ``MLPrimitives`` to ``ml-stars``. 6 | Moreover, we remove all pipelines using deep feature synthesis. 7 | 8 | * Update demo bucket - [Issue #76](https://github.com/sintel-dev/Draco/issues/76) by @sarahmish 9 | * Remove ``dfs`` based pipelines - [Issue #73](https://github.com/sintel-dev/Draco/issues/73) by @sarahmish 10 | * Move from ``MLPrimitives`` to ``ml-stars`` - [Issue #72](https://github.com/sintel-dev/Draco/issues/72) by @sarahmish 11 | 12 | 13 | ## 0.2.0 - 2022-04-12 14 | 15 | This release features a reorganization and renaming of ``Draco`` pipelines. In addtion, 16 | we update some of the dependencies for general housekeeping. 
17 | 18 | * Update Draco dependencies - [Issue #66](https://github.com/signals-dev/Draco/issues/66) by @sarahmish 19 | * Reorganize pipelines - [Issue #63](https://github.com/signals-dev/Draco/issues/63) by @sarahmish 20 | 21 | 22 | ## 0.1.0 - 2022-01-01 23 | 24 | * First release on ``draco-ml`` PyPI 25 | 26 | 27 | ## Previous GreenGuard development 28 | 29 | ### 0.3.0 - 2021-01-22 30 | 31 | This release increases the supported version of python to `3.8` and also includes changes 32 | in the installation requirements, where ``pandas`` and ``scikit-optimize`` packages have 33 | been updated to support higher versions. This changes come together with the newer versions 34 | of ``MLBlocks`` and ``MLPrimitives``. 35 | 36 | #### Internal Improvements 37 | 38 | * Fix ``run_benchmark`` generating properly the ``init_hyperparameters`` for the pipelines. 39 | * New ``FPR`` metric. 40 | * New ``roc_auc_score`` metric. 41 | * Multiple benchmarking metrics allowed. 42 | * Multiple ``tpr`` or ``threshold`` values allowed for the benchmark. 43 | 44 | ### 0.2.6 - 2020-10-23 45 | 46 | * Fix ``mkdir`` when exporting to ``csv`` file the benchmark results. 47 | * Intermediate steps for the pipelines with demo notebooks for each pipeline. 48 | 49 | #### Resolved Issues 50 | 51 | * Issue #50: Expose partial outputs and executions in the ``GreenGuardPipeline``. 52 | 53 | ### 0.2.5 - 2020-10-09 54 | 55 | With this release we include: 56 | 57 | * `run_benchmark`: A function within the module `benchmark` that allows the user to evaluate 58 | templates against problems with different window size and resample rules. 59 | * `summarize_results`: A function that given a `csv` file generates a `xlsx` file with a summary 60 | tab and a detailed tab with the results from `run_benchmark`. 61 | 62 | ### 0.2.4 - 2020-09-25 63 | 64 | * Fix dependency errors 65 | 66 | ### 0.2.3 - 2020-08-10 67 | 68 | * Added benchmarking module. 69 | 70 | ### 0.2.2 - 2020-07-10 71 | 72 | #### Internal Improvements 73 | 74 | * Added github actions. 75 | 76 | #### Resolved Issues 77 | 78 | * Issue #27: Cache Splits pre-processed data on disk 79 | 80 | ### 0.2.1 - 2020-06-16 81 | 82 | With this release we give the possibility to the user to specify more than one template when 83 | creating a GreenGuardPipeline. When the `tune` method of this is called, an instance of BTBSession 84 | is returned and it is in charge of selecting the templates and tuning their hyperparameters until 85 | achieving the best pipeline. 86 | 87 | #### Internal Improvements 88 | 89 | * Resample by filename inside the `CSVLoader` to avoid oversampling of data that will not be used. 90 | * Select targets now allows them to be equal. 91 | * Fixed the csv filename format. 92 | * Upgraded to BTB. 
93 | 94 | #### Bug Fixes 95 | 96 | * Issue #33: Wrong default datetime format 97 | 98 | #### Resolved Issues 99 | 100 | * Issue #35: Select targets is too strict 101 | * Issue #36: resample by filename inside csvloader 102 | * Issue #39: Upgrade BTB 103 | * Issue #41: Fix CSV filename format 104 | 105 | ### 0.2.0 - 2020-02-14 106 | 107 | First stable release: 108 | 109 | * efficient data loading and preprocessing 110 | * initial collection of dfs and lstm based pipelines 111 | * optimized pipeline tuning 112 | * documentation and tutorials 113 | 114 | ### 0.1.0 115 | 116 | * First release on PyPI 117 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, MIT Data To AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.md 4 | include LICENSE 5 | include README.md 6 | 7 | recursive-include draco *.json 8 | 9 | recursive-include tests * 10 | recursive-exclude * __pycache__ 11 | recursive-exclude * *.py[co] 12 | 13 | recursive-include docs *.md *.rst conf.py Makefile make.bat *.jpg *.png *.gif 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | define BROWSER_PYSCRIPT 4 | import os, webbrowser, sys 5 | 6 | try: 7 | from urllib import pathname2url 8 | except: 9 | from urllib.request import pathname2url 10 | 11 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 12 | endef 13 | export BROWSER_PYSCRIPT 14 | 15 | define PRINT_HELP_PYSCRIPT 16 | import re, sys 17 | 18 | for line in sys.stdin: 19 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 20 | if match: 21 | target, help = match.groups() 22 | print("%-20s %s" % (target, help)) 23 | endef 24 | export PRINT_HELP_PYSCRIPT 25 | 26 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 27 | 28 | .PHONY: help 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | 33 | # CLEAN TARGETS 34 | 35 | .PHONY: clean-build 36 | clean-build: ## remove build artifacts 37 | rm -fr build/ 38 | rm -fr dist/ 39 | rm -fr .eggs/ 40 | find . -name '*.egg-info' -exec rm -fr {} + 41 | find . -name '*.egg' -exec rm -f {} + 42 | 43 | .PHONY: clean-pyc 44 | clean-pyc: ## remove Python file artifacts 45 | find . -name '*.pyc' -exec rm -f {} + 46 | find . -name '*.pyo' -exec rm -f {} + 47 | find . -name '*~' -exec rm -f {} + 48 | find . -name '__pycache__' -exec rm -fr {} + 49 | 50 | .PHONY: clean-docs 51 | clean-docs: ## remove previously built docs 52 | rm -rf docs/api/ docs/api_reference/api/ docs/tutorials docs/build docs/_build 53 | 54 | .PHONY: clean-coverage 55 | clean-coverage: ## remove coverage artifacts 56 | rm -f .coverage 57 | rm -f .coverage.* 58 | rm -fr htmlcov/ 59 | 60 | .PHONY: clean-test 61 | clean-test: ## remove test artifacts 62 | rm -fr .tox/ 63 | rm -fr .pytest_cache 64 | 65 | .PHONY: clean 66 | clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all build, test, coverage, docs and Python artifacts 67 | 68 | 69 | # INSTALL TARGETS 70 | 71 | .PHONY: install 72 | install: clean-build clean-pyc ## install the package to the active Python's site-packages 73 | pip install . 
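# Note: end users normally only need `make install`; contributors are expected to
# run `make install-develop` (see CONTRIBUTING.rst), which installs the package in
# editable mode together with the development dependencies.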
74 | 75 | .PHONY: install-test 76 | install-test: clean-build clean-pyc ## install the package and test dependencies 77 | pip install .[test] 78 | 79 | .PHONY: install-develop 80 | install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development 81 | pip install -e .[dev] 82 | 83 | MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') 84 | 85 | .PHONY: install-minimum 86 | install-minimum: ## install the minimum supported versions of the package dependencies 87 | echo pip install $(MINIMUM) 88 | 89 | 90 | # LINT TARGETS 91 | 92 | .PHONY: lint-draco 93 | lint-btb: ## check style with flake8 and isort 94 | flake8 draco 95 | isort -c --recursive draco 96 | 97 | .PHONY: lint-tests 98 | lint-tests: ## check style with flake8 and isort 99 | flake8 --ignore=D,SFS2 tests 100 | isort -c --recursive tests 101 | 102 | .PHONY: check-dependencies 103 | check-dependencies: ## test if there are any broken dependencies 104 | pip check 105 | 106 | .PHONY: lint 107 | lint: check-dependencies lint-draco lint-tests ## Run all code style and static testing validations 108 | 109 | .PHONY: fix-lint 110 | fix-lint: ## fix lint issues using autoflake, autopep8, and isort 111 | find draco -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables 112 | autopep8 --in-place --recursive --aggressive draco 113 | isort --apply --atomic --recursive draco tests 114 | 115 | # TEST TARGETS 116 | 117 | .PHONY: test-unit 118 | test-unit: ## run tests quickly with the default Python 119 | python -m pytest --cov=draco 120 | 121 | .PHONY: test-readme 122 | test-readme: ## run the readme snippets 123 | rm -rf tests/readme_test && mkdir tests/readme_test 124 | cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md 125 | rm -rf tests/readme_test 126 | 127 | .PHONY: test-tutorials 128 | test-tutorials: ## run the tutorial notebooks 129 | find tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ 130 | jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --to=html --stdout {} > /dev/null \; 131 | 132 | .PHONY: test 133 | test: test-unit test-readme test-tutorials ## test everything that needs test dependencies 134 | 135 | .PHONY: test-minimum 136 | test-minimum: install-minimum check-dependencies test-unit ## run tests using the minimum supported dependencies 137 | 138 | .PHONY: test-all 139 | test-all: ## run tests on every Python version with tox 140 | tox -r 141 | 142 | .PHONY: coverage 143 | coverage: ## check code coverage quickly with the default Python 144 | coverage run --source draco -m pytest 145 | coverage report -m 146 | coverage html 147 | $(BROWSER) htmlcov/index.html 148 | 149 | # DOCS TARGETS 150 | 151 | .PHONY: docs 152 | docs: clean-docs ## generate Sphinx HTML documentation, including API docs 153 | $(MAKE) -C docs html 154 | 155 | .PHONY: view-docs 156 | view-docs: ## view the docs in a browser 157 | $(BROWSER) docs/_build/html/index.html 158 | 159 | .PHONY: serve-docs 160 | serve-docs: view-docs ## compile the docs watching for changes 161 | watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs 162 | 163 | 164 | # RELEASE TARGETS 165 | 166 | .PHONY: dist 167 | dist: clean ## builds source and wheel package 168 | python setup.py sdist 169 | python setup.py bdist_wheel 170 | ls -l dist 171 | 172 | .PHONY: publish-confirm 173 | publish-confirm: 174 | @echo "WARNING: 
This will irreversibly upload a new version to PyPI!" 175 | @echo -n "Please type 'confirm' to proceed: " \ 176 | && read answer \ 177 | && [ "$${answer}" = "confirm" ] 178 | 179 | .PHONY: publish-test 180 | publish-test: dist publish-confirm ## package and upload a release on TestPyPI 181 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* 182 | 183 | .PHONY: publish 184 | publish: dist publish-confirm ## package and upload a release 185 | twine upload dist/* 186 | 187 | .PHONY: bumpversion-release 188 | bumpversion-release: ## Merge master to stable and bumpversion release 189 | git checkout stable || git checkout -b stable 190 | git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" 191 | bumpversion release 192 | git push --tags origin stable 193 | 194 | .PHONY: bumpversion-release-test 195 | bumpversion-release-test: ## Merge master to stable and bumpversion release 196 | git checkout stable || git checkout -b stable 197 | git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" 198 | bumpversion release --no-tag 199 | @echo git push --tags origin stable 200 | 201 | .PHONY: bumpversion-patch 202 | bumpversion-patch: ## Merge stable to master and bumpversion patch 203 | git checkout master 204 | git merge stable 205 | bumpversion --no-tag patch 206 | git push 207 | 208 | .PHONY: bumpversion-candidate 209 | bumpversion-candidate: ## Bump the version to the next candidate 210 | bumpversion candidate --no-tag 211 | 212 | .PHONY: bumpversion-minor 213 | bumpversion-minor: ## Bump the version the next minor skipping the release 214 | bumpversion --no-tag minor 215 | 216 | .PHONY: bumpversion-major 217 | bumpversion-major: ## Bump the version the next major skipping the release 218 | bumpversion --no-tag major 219 | 220 | .PHONY: bumpversion-revert 221 | bumpversion-revert: ## Undo a previous bumpversion-release 222 | git checkout master 223 | git branch -D stable 224 | 225 | CLEAN_DIR := $(shell git status --short | grep -v ??) 
226 | CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) 227 | CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*") 228 | CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) 229 | 230 | .PHONY: check-clean 231 | check-clean: ## Check if the directory has uncommitted changes 232 | ifneq ($(CLEAN_DIR),) 233 | $(error There are uncommitted changes) 234 | endif 235 | 236 | .PHONY: check-master 237 | check-master: ## Check if we are in master branch 238 | ifneq ($(CURRENT_BRANCH),master) 239 | $(error Please make the release from master branch\n) 240 | endif 241 | 242 | .PHONY: check-candidate 243 | check-candidate: ## Check if a release candidate has been made 244 | ifeq ($(CURRENT_VERSION),dev0) 245 | $(error Please make a release candidate and test it before atempting a release) 246 | endif 247 | 248 | .PHONY: check-history 249 | check-history: ## Check if HISTORY.md has been modified 250 | ifeq ($(CHANGELOG_LINES),0) 251 | $(error Please insert the release notes in HISTORY.md before releasing) 252 | endif 253 | 254 | .PHONY: check-release 255 | check-release: check-candidate check-clean check-master check-history ## Check if the release can be made 256 | @echo "A new release can be made" 257 | 258 | .PHONY: release 259 | release: check-release bumpversion-release publish bumpversion-patch 260 | 261 | .PHONY: release-test 262 | release-test: check-release bumpversion-release-test publish-test bumpversion-revert 263 | 264 | .PHONY: release-candidate 265 | release-candidate: check-master publish bumpversion-candidate 266 | 267 | .PHONY: release-candidate-test 268 | release-candidate-test: check-clean check-master publish-test 269 | 270 | .PHONY: release-minor 271 | release-minor: check-release bumpversion-minor release 272 | 273 | .PHONY: release-major 274 | release-major: check-release bumpversion-major release 275 | 276 | 277 | # DOCKER TARGETS 278 | 279 | .PHONY: docker-build 280 | docker-build: 281 | docker build -f docker/Dockerfile -t draco . 282 | 283 | .PHONY: docker-login 284 | docker-login: 285 | docker login 286 | 287 | .PHONY: docker-push 288 | docker-push: docker-login docker-build 289 | @$(eval VERSION := $(shell python -c 'import draco; print(draco.__version__)')) 290 | docker tag draco signalsdev/draco:$(VERSION) 291 | docker push signalsdev/draco:$(VERSION) 292 | docker tag draco signalsdev/draco 293 | docker push signalsdev/draco 294 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | DAI logo: An open source project from Data to AI Lab at MIT.
7 | Draco logo
11 | AutoML for Time Series.
13 | 14 | 15 | [![PyPI Shield](https://img.shields.io/pypi/v/draco-ml.svg)](https://pypi.python.org/pypi/draco-ml) 16 | [![Tests](https://github.com/sintel-dev/Draco/workflows/Run%20Tests/badge.svg)](https://github.com/sintel-dev/Draco/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) 17 | [![Downloads](https://pepy.tech/badge/draco-ml)](https://pepy.tech/project/draco-ml) 18 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sintel-dev/Draco/master?filepath=tutorials) 19 | 22 | 23 | # Draco 24 | 25 | - License: [MIT](https://github.com/sintel-dev/Draco/blob/master/LICENSE) 26 | - Documentation: https://sintel-dev.github.io/Draco 27 | - Homepage: https://github.com/sintel-dev/Draco 28 | 29 | ## Overview 30 | 31 | The Draco project is a collection of end-to-end solutions for machine learning problems 32 | commonly found in time series monitoring systems. Most tasks utilize sensor data 33 | emanating from monitoring systems. We utilize the foundational innovations developed for 34 | automation of machine Learning at Data to AI Lab at MIT. 35 | 36 | The salient aspects of this customized project are: 37 | 38 | * A set of ready to use, well tested pipelines for different machine learning tasks. These are 39 | vetted through testing across multiple publicly available datasets for the same task. 40 | * An easy interface to specify the task, pipeline, and generate results and summarize them. 41 | * A production ready, deployable pipeline. 42 | * An easy interface to ``tune`` pipelines using Bayesian Tuning and Bandits library. 43 | * A community oriented infrastructure to incorporate new pipelines. 44 | * A robust continuous integration and testing infrastructure. 45 | * A ``learning database`` recording all past outcomes --> tasks, pipelines, outcomes. 46 | 47 | ## Resources 48 | 49 | * [Data Format](DATA_FORMAT.md). 50 | * [Draco folder structure](DATA_FORMAT.md#folder-structure). 51 | 52 | # Install 53 | 54 | ## Requirements 55 | 56 | **Draco** has been developed and runs on Python 3.6, 3.7 and 3.8. 57 | 58 | Also, although it is not strictly required, the usage of a [virtualenv]( 59 | https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering 60 | with other software installed in the system where you are trying to run **Draco**. 61 | 62 | ## Download and Install 63 | 64 | **Draco** can be installed locally using [pip](https://pip.pypa.io/en/stable/) with 65 | the following command: 66 | 67 | ```bash 68 | pip install draco-ml 69 | ``` 70 | 71 | This will pull and install the latest stable release from [PyPi](https://pypi.org/). 72 | 73 | If you want to install from source or contribute to the project please read the 74 | [Contributing Guide](https://sintel-dev.github.io/Draco/contributing.html#get-started). 75 | 76 | # Data Format 77 | 78 | The minimum input expected by the **Draco** system consists of the following two elements, 79 | which need to be passed as `pandas.DataFrame` objects: 80 | 81 | ## Target Times 82 | 83 | A table containing the specification of the problem that we are solving, which has three 84 | columns: 85 | 86 | * `turbine_id`: Unique identifier of the turbine which this label corresponds to. 87 | * `cutoff_time`: Time associated with this target 88 | * `target`: The value that we want to predict. This can either be a numerical value or a 89 | categorical label. This column can also be skipped when preparing data that will be used 90 | only to make predictions and not to fit any pipeline. 
91 | 92 | | | turbine_id | cutoff_time | target | 93 | |----|--------------|---------------------|----------| 94 | | 0 | T1 | 2001-01-02 00:00:00 | 0 | 95 | | 1 | T1 | 2001-01-03 00:00:00 | 1 | 96 | | 2 | T2 | 2001-01-04 00:00:00 | 0 | 97 | 98 | ## Readings 99 | 100 | A table containing the signal data from the different sensors, with the following columns: 101 | 102 | * `turbine_id`: Unique identifier of the turbine which this reading comes from. 103 | * `signal_id`: Unique identifier of the signal which this reading comes from. 104 | * `timestamp (datetime)`: Time where the reading took place, as a datetime. 105 | * `value (float)`: Numeric value of this reading. 106 | 107 | | | turbine_id | signal_id | timestamp | value | 108 | |----|--------------|-------------|---------------------|---------| 109 | | 0 | T1 | S1 | 2001-01-01 00:00:00 | 1 | 110 | | 1 | T1 | S1 | 2001-01-01 12:00:00 | 2 | 111 | | 2 | T1 | S1 | 2001-01-02 00:00:00 | 3 | 112 | | 3 | T1 | S1 | 2001-01-02 12:00:00 | 4 | 113 | | 4 | T1 | S1 | 2001-01-03 00:00:00 | 5 | 114 | | 5 | T1 | S1 | 2001-01-03 12:00:00 | 6 | 115 | | 6 | T1 | S2 | 2001-01-01 00:00:00 | 7 | 116 | | 7 | T1 | S2 | 2001-01-01 12:00:00 | 8 | 117 | | 8 | T1 | S2 | 2001-01-02 00:00:00 | 9 | 118 | | 9 | T1 | S2 | 2001-01-02 12:00:00 | 10 | 119 | | 10 | T1 | S2 | 2001-01-03 00:00:00 | 11 | 120 | | 11 | T1 | S2 | 2001-01-03 12:00:00 | 12 | 121 | 122 | ## Turbines 123 | 124 | Optionally, a third table can be added containing metadata about the turbines. 125 | The only requirement for this table is to have a `turbine_id` field, and it can have 126 | an arbitraty number of additional fields. 127 | 128 | | | turbine_id | manufacturer | ... | ... | ... | 129 | |----|--------------|----------------|-------|-------|-------| 130 | | 0 | T1 | Siemens | ... | ... | ... | 131 | | 1 | T2 | Siemens | ... | ... | ... | 132 | 133 | ## CSV Format 134 | 135 | A part from the in-memory data format explained above, which is limited by the memory 136 | allocation capabilities of the system where it is run, **Draco** is also prepared to 137 | load and work with data stored as a collection of CSV files, drastically increasing the amount 138 | of data which it can work with. Further details about this format can be found in the 139 | [project documentation site](DATA_FORMAT.md#csv-format). 140 | 141 | # Quickstart 142 | 143 | In this example we will load some demo data and classify it using a **Draco Pipeline**. 144 | 145 | ## 1. Load and split the demo data 146 | 147 | The first step is to load the demo data. 148 | 149 | For this, we will import and call the `draco.demo.load_demo` function without any arguments: 150 | 151 | ```python3 152 | from draco.demo import load_demo 153 | 154 | target_times, readings = load_demo() 155 | ``` 156 | 157 | The returned objects are: 158 | 159 | * ``target_times``: A ``pandas.DataFrame`` with the ``target_times`` table data: 160 | 161 | ``` 162 | turbine_id cutoff_time target 163 | 0 T001 2013-01-12 0 164 | 1 T001 2013-01-13 0 165 | 2 T001 2013-01-14 0 166 | 3 T001 2013-01-15 1 167 | 4 T001 2013-01-16 0 168 | ``` 169 | 170 | * ``readings``: A ``pandas.DataFrame`` containing the time series data in the format explained above. 
171 | 172 | ``` 173 | turbine_id signal_id timestamp value 174 | 0 T001 S01 2013-01-10 323.0 175 | 1 T001 S02 2013-01-10 320.0 176 | 2 T001 S03 2013-01-10 284.0 177 | 3 T001 S04 2013-01-10 348.0 178 | 4 T001 S05 2013-01-10 273.0 179 | ``` 180 | 181 | Once we have loaded the `target_times`, and before proceeding to training any Machine Learning 182 | Pipeline, we will split them into two partitions for training and testing. 183 | 184 | In this case, we will split them using the [train_test_split function from scikit-learn]( 185 | https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html), 186 | but it can be done with any other suitable tool. 187 | 188 | ```python3 189 | from sklearn.model_selection import train_test_split 190 | 191 | train, test = train_test_split(target_times, test_size=0.25, random_state=0) 192 | ``` 193 | 194 | Notice how we are only splitting the `target_times` data and not the `readings`. 195 | This is because the pipelines will later on take care of selecting the parts of the 196 | `readings` table needed for the training based on the information found inside 197 | the `train` and `test` inputs. 198 | 199 | Additionally, if we want to calculate a goodness-of-fit score later on, we can separate the 200 | testing target values from the `test` table by popping them from it: 201 | 202 | ```python3 203 | test_targets = test.pop('target') 204 | ``` 205 | 206 | ## 2. Exploring the available Pipelines 207 | 208 | Once we have the data ready, we need to find a suitable pipeline. 209 | 210 | The list of available Draco Pipelines can be obtained using the `draco.get_pipelines` 211 | function. 212 | 213 | ```python3 214 | from draco import get_pipelines 215 | 216 | pipelines = get_pipelines() 217 | ``` 218 | 219 | The returned `pipelines` variable will be a `list` containing the names of all the pipelines 220 | available in the Draco system: 221 | 222 | ``` 223 | ['lstm', 224 | 'lstm_with_unstack', 225 | 'double_lstm', 226 | 'double_lstm_with_unstack'] 227 | ``` 228 | 229 | For the rest of this tutorial, we will select and use the pipeline 230 | `lstm_with_unstack` as our template. 231 | 232 | ```python3 233 | pipeline_name = 'lstm_with_unstack' 234 | ``` 235 | 236 | ## 3. Fitting the Pipeline 237 | 238 | Once we have loaded the data and selected the pipeline that we will use, we have to 239 | fit it. 240 | 241 | For this, we will create an instance of a `DracoPipeline` object, passing the name 242 | of the pipeline that we want to use: 243 | 244 | ```python3 245 | from draco.pipeline import DracoPipeline 246 | 247 | pipeline = DracoPipeline(pipeline_name) 248 | ``` 249 | 250 | And then we can directly fit it to our data by calling its `fit` method and passing in the 251 | training `target_times` and the complete `readings` table: 252 | 253 | ```python3 254 | pipeline.fit(train, readings) 255 | ``` 256 | 257 | ## 4. Make predictions 258 | 259 | After fitting the pipeline, we are ready to make predictions on new data by calling the 260 | `pipeline.predict` method, passing the testing `target_times` and, again, the complete 261 | `readings` table. 262 | 263 | ```python3 264 | predictions = pipeline.predict(test, readings) 265 | ``` 266 | 267 | ## 5. Evaluate the goodness-of-fit 268 | 269 | Finally, after making predictions we can evaluate how good the predictions were 270 | using any suitable metric. 271 | 272 | ```python3 273 | from sklearn.metrics import f1_score 274 | 275 | f1_score(test_targets, predictions) 276 | ``` 277 | 278 | ## What's next? 
279 | 280 | For more details about **Draco** and all its possibilities and features, please check the 281 | [project documentation site](https://sintel-dev.github.io/Draco/). 282 | Also, do not forget to have a look at the [tutorials]( 283 | https://github.com/sintel-dev/Draco/tree/master/tutorials)! 284 | -------------------------------------------------------------------------------- /docker/.dockerignore: -------------------------------------------------------------------------------- 1 | ../notebooks-private/ 2 | ../.tox/ 3 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | ARG UID=1000 4 | EXPOSE 8888 5 | 6 | RUN mkdir /app 7 | COPY setup.py /app 8 | COPY greenguard /app/greenguard 9 | COPY tutorials /app/tutorials 10 | RUN pip install -e /app jupyter 11 | 12 | WORKDIR /app 13 | CMD pip install -e /app && /usr/local/bin/jupyter notebook --ip 0.0.0.0 --NotebookApp.token='' --allow-root 14 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Run GreenGuard using Docker 2 | 3 | GreenGuard is prepared to be run using [Docker](https://docker.com/). 4 | 5 | These are the commands needed to start a Docker container locally that runs a [Jupyter Notebook]( 6 | https://jupyter.org/) already configured to run GreenGuard. 7 | 8 | ```bash 9 | docker run -ti -p8888:8888 signalsdev/greenguard:latest 10 | ``` 11 | 12 | This will start a Jupyter Notebook instance on your computer already configured to use GreenGuard. 13 | You can access it by pointing your browser at http://127.0.0.1:8888 14 | 15 | Further details about the usage of this image can be found [here]( 16 | https://hub.docker.com/repository/docker/signalsdev/greenguard). 17 | 18 | ## Run GreenGuard on Kubernetes 19 | 20 | GreenGuard can also be started using [Kubernetes](https://kubernetes.io/). 21 | 22 | Here are the minimum steps required to create a POD in a local Kubernetes cluster: 23 | 24 | 1. Create a yaml file with these contents: 25 | 26 | For this example, we are assuming that the yaml file is named `greenguard-pod.yml`. 27 | 28 | ```yml 29 | apiVersion: v1 30 | kind: Pod 31 | metadata: 32 | name: greenguard 33 | spec: 34 | containers: 35 | - name: greenguard 36 | image: signalsdev/greenguard:latest 37 | ports: 38 | - containerPort: 8888 39 | ``` 40 | 41 | 2. Create a POD: 42 | 43 | After creating the yaml file, you can create a POD in your Kubernetes cluster using the `kubectl` 44 | command: 45 | 46 | ```bash 47 | kubectl apply -f greenguard-pod.yml 48 | ``` 49 | 50 | 3. Forward the port 8888 51 | 52 | After the POD is started, you still need to forward a local port to it in order to access the 53 | Jupyter instance. 54 | 55 | ```bash 56 | kubectl port-forward greenguard 8888 57 | ``` 58 | 59 | 4. Point your browser at http://localhost:8888 60 | 61 | > **NOTE**: If GreenGuard is run in a production environment, we recommend using a service and 62 | a deployment instead of just a simple POD. You can find a template of this setup [here]( 63 | greenguard-deployment.yml) 64 | 65 | ## Building the Docker image from scratch 66 | 67 | If you want to build the Docker image from scratch instead of using the dockerhub image 68 | you will need to: 69 | 70 | 1. 
Clone the repository 71 | 72 | ```bash 73 | git clone git@github.com:signals-dev/GreenGuard.git 74 | cd GreenGuard 75 | ``` 76 | 77 | 2. Build the docker image using the GreenGuard make command. 78 | 79 | ```bash 80 | make docker-build 81 | ``` 82 | 83 | ## What's next? 84 | 85 | For more details about **GreenGuard** and all its possibilities and features, please check the 86 | [project documentation site](https://signals-dev.github.io/GreenGuard/)! 87 | -------------------------------------------------------------------------------- /docker/greenguard-deployment.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: greenguard 5 | spec: 6 | ports: 7 | - name: jupyter 8 | port: 8888 9 | nodePort: 30088 10 | selector: 11 | app: greenguard 12 | type: NodePort 13 | --- 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | metadata: 17 | name: greenguard 18 | spec: 19 | selector: 20 | matchLabels: 21 | app: greenguard 22 | strategy: 23 | type: Recreate 24 | template: 25 | metadata: 26 | labels: 27 | app: greenguard 28 | spec: 29 | containers: 30 | - image: signalsdev/greenguard:latest 31 | name: greenguard 32 | ports: 33 | - containerPort: 8888 34 | name: jupyter 35 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = draco 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/advanced_usage/concepts.md: -------------------------------------------------------------------------------- 1 | # Concepts 2 | 3 | Here we briefly explain some of the concepts and terminology used within the Draco 4 | project and documentation. 5 | 6 | ## Primitive 7 | 8 | We call the smallest computational blocks used in a Machine Learning process 9 | **primitives**, which: 10 | 11 | * Can be either classes or functions. 12 | * Have some initialization arguments, which MLBlocks calls `init_params`. 13 | * Have some tunable hyperparameters, which have types and a list or range of valid values. 14 | 15 | ## Template 16 | 17 | Primitives can be combined to form what we call **Templates**, which: 18 | 19 | * Have a list of primitives. 20 | * Have some initialization arguments, which correspond to the initialization arguments 21 | of their primitives. 22 | * Have some tunable hyperparameters, which correspond to the tunable hyperparameters 23 | of their primitives. 24 | 25 | ## Pipeline 26 | 27 | Templates can be used to build **Pipelines** by taking and fixing a set of valid 28 | hyperparameters for a Template. Hence, Pipelines: 29 | 30 | * Have a list of primitives, which corresponds to the list of primitives of their template. 
31 | * Have some initialization arguments, which correspond to the initialization arguments 32 | of their template. 33 | * Have some hyperparameter values, which fall within the ranges of valid tunable 34 | hyperparameters of their template. 35 | 36 | A pipeline can be fitted and evaluated directly using [MLBlocks]( 37 | https://MLBazaar.github.io/MLBlocks), or using the **DracoPipeline**. 38 | 39 | ## Tuning 40 | 41 | We call tuning the process of, given a dataset and a collection of templates, finding the pipeline 42 | derived from the templates that gets the best possible score on the dataset. 43 | 44 | This process usually involves fitting and evaluating multiple pipelines with different 45 | hyperparameter configurations on the same data while using optimization algorithms to deduce 46 | which hyperparameters are more likely to get the best results in the next iterations. 47 | 48 | We call each one of these evaluations a **tuning iteration**. 49 | 50 | The process of selecting and tuning the templates is handled by a [BTBSession]( 51 | https://MLBazaar.github.io/BTB/tutorials/03_Session.html), which is responsible for 52 | discarding the templates that do not work on the given data and for keeping 53 | track of the template and hyperparameters that obtain the best performance. 54 | 55 | ## DracoPipeline 56 | 57 | This class is the one in charge of loading the **MLBlocks Pipelines** configured in the 58 | system and using them to learn from the data and make predictions. 59 | 60 | This class is also responsible for creating the BTBSession that will handle the 61 | selection and tuning of the templates. 62 | -------------------------------------------------------------------------------- /docs/advanced_usage/csv.md: -------------------------------------------------------------------------------- 1 | # CSV Format 2 | 3 | As explained in a previous section, the input expected by the **Draco** system consists of 4 | two tables which need to be passed as `pandas.DataFrame` objects: 5 | 6 | * The `target_times` table, which contains the specification of the problem that we are solving 7 | in the form of training examples with a `turbine_id`, a `cutoff_time` and a `target` value. 8 | * The `readings` table, which contains the signal readings from the different sensors, with 9 | `turbine_id`, `signal_id`, `timestamp` and `value` fields. 10 | 11 | However, in most scenarios the size of the available data will far exceed the memory limitations 12 | of the system on which **Draco** is being run, so loading all the data in a single 13 | `pandas.DataFrame` will not be possible. 14 | 15 | In order to solve this situation, **Draco** provides a [CSVLoader]( 16 | https://sintel-dev.github.io/Draco/api/draco.loaders.csv.html#draco.loaders.csv.CSVLoader) 17 | class which can be used to load data from what we call the **Raw Data Format**. 18 | 19 | ## Raw Data Format 20 | 21 | The **Raw Data Format** consists of a collection of CSV files stored in a single folder with the 22 | following structure: 23 | 24 | * All the data from all the turbines is inside a single folder, which here we will call `readings`. 25 | * Inside the `readings` folder, one folder exists for each turbine, named exactly like the turbine: 26 | * `readings/T001` 27 | * `readings/T002` 28 | * ... 29 | * Inside each turbine folder one CSV file exists for each month, named `%Y-%m.csv`. 30 | * `readings/T001/2010-01.csv` 31 | * `readings/T001/2010-02.csv` 32 | * `readings/T001/2010-03.csv` 33 | * ... 
34 | * Each CSV file contains three columns: 35 | * `signal_id`: name or id of the signal. 36 | * ``timestamp``: timestamp of the reading formatted as ``%m/%d/%y %H:%M:%S``. 37 | * `value`: value of the reading. 38 | 39 | This is an example of what a CSV contents look like: 40 | 41 | | | signal_id | timestamp | value | 42 | |----|-------------|-------------------|---------| 43 | | 0 | S1 | 01/01/01 00:00:00 | 1 | 44 | | 1 | S1 | 01/01/01 12:00:00 | 2 | 45 | | 2 | S1 | 01/02/01 00:00:00 | 3 | 46 | | 3 | S1 | 01/02/01 12:00:00 | 4 | 47 | | 4 | S1 | 01/03/01 00:00:00 | 5 | 48 | | 5 | S1 | 01/03/01 12:00:00 | 6 | 49 | | 6 | S2 | 01/01/01 00:00:00 | 7 | 50 | | 7 | S2 | 01/01/01 12:00:00 | 8 | 51 | | 8 | S2 | 01/02/01 00:00:00 | 9 | 52 | | 9 | S2 | 01/02/01 12:00:00 | 10 | 53 | | 10 | S2 | 01/03/01 00:00:00 | 11 | 54 | | 11 | S2 | 01/03/01 12:00:00 | 12 | 55 | -------------------------------------------------------------------------------- /docs/advanced_usage/docker.md: -------------------------------------------------------------------------------- 1 | # Docker Usage 2 | 3 | **Draco** comes configured and ready to be distributed and run as a docker image which starts 4 | a jupyter notebook already configured to use draco, with all the required dependencies already 5 | installed. 6 | 7 | ## Requirements 8 | 9 | The only requirement in order to run the Draco Docker image is to have Docker installed and 10 | that the user has enough permissions to run it. 11 | 12 | Installation instructions for any possible system compatible can be found [here](https://docs.docker.com/install/) 13 | 14 | Additionally, the system that builds the Draco Docker image will also need to have a working 15 | internet connection that allows downloading the base image and the additional python depenedencies. 16 | 17 | ## Building the Draco Docker Image 18 | 19 | After having cloned the **Draco** repository, all you have to do in order to build the Draco Docker 20 | Image is running this command: 21 | 22 | ```bash 23 | make docker-jupyter-build 24 | ``` 25 | 26 | After a few minutes, the new image, called `draco-jupyter`, will have been built into the system 27 | and will be ready to be used or distributed. 28 | 29 | ## Distributing the Draco Docker Image 30 | 31 | Once the `draco-jupyter` image is built, it can be distributed in several ways. 32 | 33 | ### Distributing using a Docker registry 34 | 35 | The simplest way to distribute the recently created image is [using a registry](https://docs.docker.com/registry/). 36 | 37 | In order to do so, we will need to have write access to a public or private registry (remember to 38 | [login](https://docs.docker.com/engine/reference/commandline/login/)!) and execute these commands: 39 | 40 | ```bash 41 | docker tag draco-jupyter:latest your-registry-name:some-tag 42 | docker push your-registry-name:some-tag 43 | ``` 44 | 45 | Afterwards, in the receiving machine: 46 | 47 | ```bash 48 | docker pull your-registry-name:some-tag 49 | docker tag your-registry-name:some-tag draco-jupyter:latest 50 | ``` 51 | 52 | ### Distributing as a file 53 | 54 | If the distribution of the image has to be done offline for any reason, it can be achieved 55 | using the following command. 
56 | 57 | In the system that already has the image: 58 | 59 | ```bash 60 | docker save --output draco-jupyter.tar draco-jupyter 61 | ``` 62 | 63 | Then copy over the file `draco-jupyter.tar` to the new system and there, run: 64 | 65 | ```bash 66 | docker load --input draco-jupyter.tar 67 | ``` 68 | 69 | After these commands, the `draco-jupyter` image should be available and ready to be used in the 70 | new system. 71 | 72 | 73 | ## Running the draco-jupyter image 74 | 75 | Once the `draco-jupyter` image has been built, pulled or loaded, it is ready to be run. 76 | 77 | This can be done in two ways: 78 | 79 | ### Running draco-jupyter with the code 80 | 81 | If the Draco source code is available in the system, running the image is as simple as running 82 | this command from within the root of the project: 83 | 84 | ```bash 85 | make docker-jupyter-run 86 | ``` 87 | 88 | This will start a jupyter notebook using the docker image, which you can access by pointing your 89 | browser at http://127.0.0.1:8888 90 | 91 | In this case, the local version of the project will also mounted within the Docker container, 92 | which means that any changes that you do in your local code will immediately be available 93 | within your notebooks, and that any notebook that you create within jupyter will also show 94 | up in your `notebooks` folder! 95 | 96 | ### Running draco-jupyter without the draco code 97 | 98 | If the Draco source code is not available in the system and only the Docker Image is, you can 99 | still run the image by using this command: 100 | 101 | ```bash 102 | docker run -ti -p8888:8888 draco-jupyter 103 | ``` 104 | 105 | In this case, the code changes and the notebooks that you create within jupyter will stay 106 | inside the container and you will only be able to access and download them through the 107 | jupyter interface. 108 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Draco documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | 21 | import sphinx_rtd_theme # For read the docs theme 22 | 23 | import draco 24 | 25 | # -- General configuration --------------------------------------------- 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
33 | extensions = [ 34 | 'm2r', 35 | 'nbsphinx', 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.githubpages', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.napoleon', 40 | 'autodocsumm', 41 | ] 42 | 43 | autodoc_default_options = { 44 | 'autosummary': True, 45 | } 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ['_templates'] 49 | 50 | # The suffix(es) of source filenames. 51 | # You can specify multiple suffix as a list of string: 52 | source_suffix = ['.rst', '.md'] 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # Jupyter Notebooks 58 | nbsphinx_execute = 'never' 59 | 60 | # General information about the project. 61 | project = 'Draco' 62 | slug = 'draco' 63 | title = project + ' Documentation', 64 | copyright = '2018, MIT Data To AI Lab' 65 | author = 'MIT Data To AI Lab' 66 | description = 'AutoML for Time Series' 67 | user = 'sintel-dev' 68 | 69 | # The version info for the project you're documenting, acts as replacement 70 | # for |version| and |release|, also used in various other places throughout 71 | # the built documents. 72 | # 73 | # The short X.Y version. 74 | version = draco.__version__ 75 | # The full version, including alpha/beta/rc tags. 76 | release = draco.__version__ 77 | 78 | # The language for content autogenerated by Sphinx. Refer to documentation 79 | # for a list of supported languages. 80 | # 81 | # This is also used if you do content translation via gettext catalogs. 82 | # Usually you set "language" from the command line for these cases. 83 | language = None 84 | 85 | # List of patterns, relative to source directory, that match files and 86 | # directories to ignore when looking for source files. 87 | # This patterns also effect to html_static_path and html_extra_path 88 | exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 89 | 90 | # The name of the Pygments (syntax highlighting) style to use. 91 | pygments_style = 'sphinx' 92 | 93 | # If true, `todo` and `todoList` produce output, else they produce nothing. 94 | todo_include_todos = False 95 | 96 | # -- Options for HTML output ------------------------------------------- 97 | 98 | # The theme to use for HTML and HTML Help pages. See the documentation for 99 | # a list of builtin themes. 100 | # 101 | html_theme = 'sphinx_rtd_theme' 102 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 103 | 104 | # Readthedocs additions 105 | html_context = { 106 | 'display_github': True, 107 | 'github_user': user, 108 | 'github_repo': project, 109 | 'github_version': 'master', 110 | 'conf_py_path': '/docs/', 111 | } 112 | 113 | # Theme options are theme-specific and customize the look and feel of a 114 | # theme further. For a list of options available for each theme, see the 115 | # documentation. 116 | html_theme_options = { 117 | 'collapse_navigation': False, 118 | 'display_version': False, 119 | } 120 | 121 | # Add any paths that contain custom static files (such as style sheets) here, 122 | # relative to this directory. They are copied after the builtin static files, 123 | # so a file named "default.css" will overwrite the builtin "default.css". 124 | # html_static_path = ['_static'] 125 | 126 | # The name of an image file (relative to this directory) to use as a favicon of 127 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 128 | # pixels large. 
129 | # html_favicon = 'images/favicon.ico' 130 | html_favicon = 'images/Draco.ico' 131 | 132 | # If given, this must be the name of an image file (path relative to the 133 | # configuration directory) that is the logo of the docs. It is placed at 134 | # the top of the sidebar; its width should therefore not exceed 200 pixels. 135 | # html_logo = 'images/dai-logo.png' 136 | html_logo = 'images/Draco-200.png' 137 | 138 | # -- Options for HTMLHelp output --------------------------------------- 139 | 140 | # Output file base name for HTML help builder. 141 | htmlhelp_basename = slug + 'doc' 142 | 143 | 144 | # -- Options for LaTeX output ------------------------------------------ 145 | 146 | latex_elements = { 147 | # The paper size ('letterpaper' or 'a4paper'). 148 | # 149 | # 'papersize': 'letterpaper', 150 | 151 | # The font size ('10pt', '11pt' or '12pt'). 152 | # 153 | # 'pointsize': '10pt', 154 | 155 | # Additional stuff for the LaTeX preamble. 156 | # 157 | # 'preamble': '', 158 | 159 | # Latex figure (float) alignment 160 | # 161 | # 'figure_align': 'htbp', 162 | } 163 | 164 | # Grouping the document tree into LaTeX files. List of tuples 165 | # (source start file, target name, title, author, documentclass 166 | # [howto, manual, or own class]). 167 | latex_documents = [( 168 | master_doc, 169 | slug + '.tex', 170 | title, 171 | author, 172 | 'manual' 173 | )] 174 | 175 | 176 | # -- Options for manual page output ------------------------------------ 177 | 178 | # One entry per manual page. List of tuples 179 | # (source start file, name, description, authors, manual section). 180 | man_pages = [( 181 | master_doc, 182 | slug, 183 | title, 184 | [author], 185 | 1 186 | )] 187 | 188 | 189 | # -- Options for Texinfo output ---------------------------------------- 190 | 191 | # Grouping the document tree into Texinfo files. List of tuples 192 | # (source start file, target name, title, author, 193 | # dir menu entry, description, category) 194 | texinfo_documents = [( 195 | master_doc, 196 | slug, 197 | title, 198 | author, 199 | slug, 200 | description, 201 | 'Miscellaneous' 202 | )] 203 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. 
mdinclude:: ../HISTORY.md 2 | -------------------------------------------------------------------------------- /docs/images/Draco-200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/Draco-200.png -------------------------------------------------------------------------------- /docs/images/Draco.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/Draco.ico -------------------------------------------------------------------------------- /docs/images/Draco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/Draco.png -------------------------------------------------------------------------------- /docs/images/dai-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/dai-logo.png -------------------------------------------------------------------------------- /docs/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sintel-dev/Draco/113e14fddb3b31570537aaf011b0e95255511855/docs/images/favicon.ico -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: readme.rst 2 | 3 | .. toctree:: 4 | :hidden: 5 | :maxdepth: 2 6 | 7 | Overview 8 | 9 | .. toctree:: 10 | :caption: Tutorials 11 | :hidden: 12 | 13 | tutorials/01_Draco_Quickstart 14 | tutorials/02_Extract_Readings 15 | 16 | .. toctree:: 17 | :caption: Advanced Usage 18 | :hidden: 19 | 20 | advanced_usage/concepts 21 | advanced_usage/csv 22 | advanced_usage/docker 23 | 24 | .. toctree:: 25 | :caption: Resources 26 | :hidden: 27 | 28 | API Reference 29 | contributing 30 | authors 31 | history 32 | 33 | Indices and tables 34 | ================== 35 | * :ref:`genindex` 36 | * :ref:`modindex` 37 | * :ref:`search` 38 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=draco 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../README.md 2 | -------------------------------------------------------------------------------- /draco/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Top-level package for Draco.""" 4 | 5 | __author__ = """MIT Data To AI Lab""" 6 | __email__ = 'dailabmit@gmail.com' 7 | __version__ = '0.3.1.dev0' 8 | 9 | import os 10 | 11 | from draco.pipeline import DracoPipeline, get_pipelines 12 | 13 | _BASE_PATH = os.path.abspath(os.path.dirname(__file__)) 14 | MLBLOCKS_PRIMITIVES = os.path.join(_BASE_PATH, 'primitives') 15 | MLBLOCKS_PIPELINES = tuple( 16 | dirname 17 | for dirname, _, _ in os.walk(os.path.join(_BASE_PATH, 'pipelines')) 18 | ) 19 | 20 | __all__ = ( 21 | 'DracoPipeline', 22 | 'get_pipelines', 23 | ) 24 | -------------------------------------------------------------------------------- /draco/db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import getpass 4 | import json 5 | import logging 6 | from datetime import datetime 7 | 8 | from pymongo import MongoClient 9 | 10 | from draco.utils import remove_dots, restore_dots 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | class MongoDB(object): 16 | 17 | def __init__(self, database=None, config=None, **kwargs): 18 | if config: 19 | with open(config, 'r') as f: 20 | config = json.load(f) 21 | else: 22 | config = kwargs 23 | 24 | host = config.get('host', 'localhost') 25 | port = config.get('port', 27017) 26 | user = config.get('user') 27 | password = config.get('password') 28 | database = database or config.get('database', 'test') 29 | auth_database = config.get('auth_database', 'admin') 30 | 31 | if user and not password: 32 | password = getpass.getpass(prompt='Please insert database password: ') 33 | 34 | client = MongoClient( 35 | host=host, 36 | port=port, 37 | username=user, 38 | password=password, 39 | authSource=auth_database 40 | ) 41 | 42 | LOGGER.info("Setting up a MongoClient %s", client) 43 | 44 | self._db = client[database] 45 | 46 | def load_template(self, template_name): 47 | match = { 48 | 'name': template_name 49 | } 50 | 51 | cursor = self._db.templates.find(match) 52 | templates = list(cursor.sort('insert_ts', -1).limit(1)) 53 | 54 | if templates: 55 | return restore_dots(templates[0]) 56 | 57 | def insert_template(self, template): 58 | if 'name' not in template: 59 | raise ValueError("Templates need to have a name key") 60 | 61 | template['insert_ts'] = datetime.utcnow() 62 | template = remove_dots(template) 63 | 64 | self._db.templates.insert_one(template) 65 | 66 | def insert_pipeline(self, candidate, score, dataset, table, column): 67 | 68 | pipeline = candidate.to_dict() 69 | 70 | pipeline['score'] = score 71 | pipeline['dataset'] = dataset 72 | pipeline['table'] = table 73 | pipeline['column'] = column 74 | pipeline['insert_ts'] = datetime.utcnow() 75 | 76 | pipeline = remove_dots(pipeline) 77 | 78 | self._db.pipelines.insert_one(pipeline) 79 | 
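# --- Illustrative usage sketch (not part of the original db.py) --------------
# A minimal example of how the MongoDB helper defined above could be used.
# The connection details, the database name and the 'lstm' template contents
# are assumptions made up for this sketch; use whatever matches your own
# MongoDB setup and templates.
#
#     from draco.db import MongoDB
#
#     db = MongoDB(database='draco', host='localhost', port=27017)
#     db.insert_template({'name': 'lstm', 'primitives': ['keras.Sequential.LSTMTimeSeriesClassifier']})
#     template = db.load_template('lstm')  # latest stored template with that name, or None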
-------------------------------------------------------------------------------- /draco/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import os 5 | 6 | import pandas as pd 7 | 8 | LOGGER = logging.getLogger(__name__) 9 | 10 | S3_URL = 'https://sintel-draco.s3.amazonaws.com/' 11 | DEMO_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'demo') 12 | 13 | _FILES = { 14 | 'DEFAULT': [ 15 | ('target_times', 'cutoff_time'), 16 | ('readings', 'timestamp') 17 | ], 18 | 'RUL': [ 19 | ('rul_train_target_times', 'cutoff_time'), 20 | ('rul_test_target_times', 'cutoff_time'), 21 | ('rul_readings', 'timestamp') 22 | ] 23 | } 24 | 25 | def _load_or_download(filename, dates): 26 | filename += '.csv.gz' 27 | file_path = os.path.join(DEMO_PATH, filename) 28 | if os.path.exists(file_path): 29 | return pd.read_csv(file_path, compression='gzip', parse_dates=[dates]) 30 | 31 | os.makedirs(DEMO_PATH, exist_ok=True) 32 | url = S3_URL + filename 33 | 34 | LOGGER.info('Downloading %s from %s', filename, url) 35 | data = pd.read_csv(url, compression='gzip', parse_dates=[dates]) 36 | data.to_csv(file_path, index=False, compression='gzip') 37 | 38 | return data 39 | 40 | 41 | def load_demo(name='default', load_readings=True): 42 | """Load the demo included in the Draco project. 43 | 44 | The first time that this function is executed, the data will be downloaded 45 | and cached inside the `draco/demo` folder. 46 | Subsequent calls will load the cached data instead of downloading it again. 47 | 48 | Args: 49 | name (str): 50 | Name of the dataset to load. If "RUL", load NASA's CMAPSS dataset 51 | https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan. 52 | If "default" then load default demo. 53 | load_readings (bool): 54 | Whether to load the ``readings`` table or not. 55 | 56 | Returns: 57 | tuple[pandas.DataFrame]: 58 | target_times and readings tables 59 | """ 60 | files = _FILES[name.upper()] 61 | 62 | if not load_readings: 63 | files = files[:-1] 64 | 65 | output = list() 66 | for filename, dates in files: 67 | output.append(_load_or_download(filename, dates)) 68 | 69 | return tuple(output) 70 | 71 | 72 | def generate_raw_readings(output_path='demo'): 73 | """Generate raw readings based on the demo data. 74 | 75 | Args: 76 | path (str): 77 | Path where the readings will be generated. 
78 | """ 79 | target_times, readings = load_demo() 80 | 81 | for turbine_id in target_times.turbine_id.unique(): 82 | turbine_path = os.path.join(output_path, turbine_id) 83 | os.makedirs(turbine_path, exist_ok=True) 84 | data = readings[readings.turbine_id == turbine_id] 85 | for month in range(1, 13): 86 | month_data = data[data.timestamp.dt.month == month].copy() 87 | month_data['timestamp'] = month_data['timestamp'].dt.strftime('%m/%d/%y %H:%M:%S') 88 | month_path = os.path.join(turbine_path, '2013-{:02d}.csv'.format(month)) 89 | LOGGER.info('Generating file %s', month_path) 90 | month_data[['signal_id', 'timestamp', 'value']].to_csv(month_path, index=False) 91 | 92 | return target_times 93 | -------------------------------------------------------------------------------- /draco/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from draco.loaders.csv import CSVLoader 2 | 3 | __all__ = ( 4 | 'CSVLoader', 5 | ) 6 | -------------------------------------------------------------------------------- /draco/loaders/csv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import dask 5 | import pandas as pd 6 | 7 | from draco.targets import drop_duplicates, select_valid_targets 8 | 9 | LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class CSVLoader: 13 | """Load the required readings from CSV files. 14 | 15 | The CSVLoader class is responsible for analyzing the target_times table 16 | and then load the required readings from CSV files. 17 | 18 | Also, optionally, it can perform a resampling aggregation while loading 19 | the data, reducing the amount of memory requirements. 20 | 21 | The CSVLoader class uses Dask to parallelize all the IO and resampling 22 | computation and reduce loading times. 23 | 24 | Args: 25 | readings_path (str): 26 | Path to the readings folder, where a folder exist for each turbine. 27 | rule (str): 28 | Resampling rule, as expected by ``DataFrame.resmple``. The rule is a 29 | string representation of a TimeDelta, which includes a number and a 30 | unit. For example: ``3d``, ``1w``, ``6h``. 31 | If ``None``, resampling is disabled. 32 | aggregation (str): 33 | Name of the aggregation to perform during the resampling. 34 | unstack (bool): 35 | Whether to unstack the resampled data, generating one column per signal. 36 | Only used when resampling. Defaults to ``False``. 
37 | """ 38 | 39 | DEFAULT_DATETIME_FMT = '%m/%d/%y %H:%M:%S' 40 | DEFAULT_FILENAME_FMT = '%Y-%m.csv' 41 | 42 | def __init__(self, readings_path='.', rule=None, aggregation='mean', unstack=False, 43 | datetime_fmt=DEFAULT_DATETIME_FMT, filename_fmt=DEFAULT_FILENAME_FMT): 44 | self._readings_path = readings_path 45 | self._rule = rule 46 | self._aggregation = aggregation 47 | self._unstack = unstack 48 | self._datetime_fmt = datetime_fmt 49 | self._filename_fmt = filename_fmt 50 | 51 | @dask.delayed 52 | def __filter_by_signal(self, readings, signals): 53 | if signals is not None: 54 | LOGGER.debug('Filtering by signal') 55 | readings = readings[readings.signal_id.isin(signals)] 56 | 57 | try: 58 | readings['value'] = readings['value'].astype(float) 59 | except ValueError: 60 | signals = readings[readings['value'].str.isnumeric()].signal_id.unique() 61 | raise ValueError('Signals contain non-numerical values: {}'.format(signals)) 62 | 63 | LOGGER.debug('Selected %s readings by signal', len(readings)) 64 | 65 | return readings.copy() 66 | 67 | @dask.delayed 68 | def __filter_by_timestamp(self, readings, timestamps): 69 | LOGGER.debug('Parsing timestamps') 70 | readings_ts = pd.to_datetime(readings['timestamp'], format=self._datetime_fmt) 71 | readings['timestamp'] = readings_ts 72 | 73 | LOGGER.debug('Filtering by timestamp') 74 | 75 | related = [False] * len(readings) 76 | for row in timestamps.itertuples(): 77 | lower = row.start <= readings_ts 78 | upper = readings_ts <= row.stop 79 | related |= lower & upper 80 | 81 | readings = readings[related] 82 | 83 | LOGGER.debug('Selected %s readings by timestamp', len(readings)) 84 | 85 | return readings.copy() 86 | 87 | @dask.delayed 88 | def __load_readings_file(self, turbine_file, timestamps, signals): 89 | LOGGER.debug('Loading file %s', turbine_file) 90 | data = pd.read_csv(turbine_file, low_memory=False) 91 | data.columns = data.columns.str.lower() 92 | data = data.rename(columns={'signal': 'signal_id'}) 93 | 94 | if 'unnamed: 0' in data.columns: 95 | # Someone forgot to drop the index before 96 | # storing the DataFrame as a CSV 97 | del data['unnamed: 0'] 98 | 99 | LOGGER.debug('Loaded %s readings from file %s', len(data), turbine_file) 100 | 101 | return data 102 | 103 | @dask.delayed 104 | def __consolidate(self, readings, turbine_id): 105 | readings = pd.concat(readings, ignore_index=True) 106 | readings.insert(0, 'turbine_id', turbine_id) 107 | 108 | LOGGER.info('Loaded %s readings from turbine %s', len(readings), turbine_id) 109 | 110 | return readings 111 | 112 | def _get_filenames(self, turbine_path, timestamps): 113 | min_csv = timestamps.start.dt.strftime(self._filename_fmt) 114 | max_csv = timestamps.stop.dt.strftime(self._filename_fmt) 115 | 116 | for filename in sorted(os.listdir(turbine_path)): 117 | if ((min_csv <= filename) & (filename <= max_csv)).any(): 118 | yield os.path.join(turbine_path, filename) 119 | 120 | @staticmethod 121 | def _join_names(names): 122 | """Join the names of a multi-level index with an underscore.""" 123 | 124 | levels = (str(name) for name in names if name != '') 125 | return '_'.join(levels) 126 | 127 | @dask.delayed 128 | def __resample(self, readings): 129 | LOGGER.info('Resampling: %s - %s', self._rule, self._aggregation) 130 | grouped = readings.groupby('signal_id') 131 | dfr = grouped.resample(rule=self._rule, on='timestamp') 132 | agg = dfr.agg(self._aggregation) 133 | 134 | LOGGER.info('%s readings reduced to %s', len(readings), len(agg)) 135 | 136 | if self._unstack: 137 | agg = 
agg.unstack(level='signal_id').reset_index() 138 | agg.columns = agg.columns.map(self._join_names) 139 | return agg 140 | else: 141 | return agg.reset_index() 142 | 143 | def _load_turbine(self, turbine_id, timestamps, signals=None): 144 | if 'turbine_id' in timestamps: 145 | timestamps = timestamps[timestamps.turbine_id == turbine_id] 146 | 147 | turbine_path = os.path.join(self._readings_path, turbine_id) 148 | filenames = self._get_filenames(turbine_path, timestamps) 149 | 150 | readings = list() 151 | for filename in filenames: 152 | file_readings = self.__load_readings_file(filename, timestamps, signals) 153 | file_readings = self.__filter_by_signal(file_readings, signals) 154 | file_readings = self.__filter_by_timestamp(file_readings, timestamps) 155 | 156 | if self._rule: 157 | file_readings = self.__resample(file_readings) 158 | 159 | readings.append(file_readings) 160 | 161 | if readings: 162 | readings = self.__consolidate(readings, turbine_id) 163 | 164 | return readings 165 | 166 | @staticmethod 167 | def _get_timestamps(target_times, window_size): 168 | cutoff_times = target_times.cutoff_time 169 | min_times = cutoff_times - window_size 170 | 171 | return pd.DataFrame({ 172 | 'turbine_id': target_times.turbine_id, 173 | 'start': min_times, 174 | 'stop': cutoff_times, 175 | }) 176 | 177 | def load(self, target_times, window_size, signals=None, debug=False, select_valid=True): 178 | """Load the readings needed for the given target_times and window_size. 179 | 180 | Optionally filter the signals that are loaded and discard the rest. 181 | 182 | Args: 183 | target_times (str or pandas.DataFrame): 184 | target_times ``DataFrame`` or path to the corresponding CSV file. 185 | The table must have three volumns, ``turbine_id``, ``target`` and 186 | ``cutoff_time``. 187 | window_size (str): 188 | Amount of data to load before each cutoff time, specified as a string 189 | representation of a TimeDelta, which includes a number and a 190 | unit. For example: ``3d``, ``1w``, ``6h``. 191 | signals (list or pandas.DataFrame): 192 | List of signal names or table that has a ``signal_id`` column to 193 | use as the signal names list. 194 | debug (bool): 195 | Force single thread execution for easy debugging. Defaults to ``False``. 196 | 197 | Returns: 198 | pandas.DataFrame: 199 | Table of readings for the target times, including the columns ``turbine_id``, 200 | ``signal_id``, ``timestamp`` and ``value``. 
201 | """ 202 | if isinstance(target_times, str): 203 | target_times = pd.read_csv(target_times) 204 | target_times['cutoff_time'] = pd.to_datetime(target_times['cutoff_time']) 205 | 206 | target_times = drop_duplicates(target_times) 207 | 208 | if isinstance(signals, pd.DataFrame): 209 | signals = signals.signal_id 210 | 211 | window_size = pd.to_timedelta(window_size) 212 | timestamps = self._get_timestamps(target_times, window_size) 213 | 214 | readings = list() 215 | for turbine_id in timestamps.turbine_id.unique(): 216 | readings.append(self._load_turbine(turbine_id, timestamps, signals)) 217 | 218 | dask_scheduler = 'single-threaded' if debug else None 219 | computed = dask.compute(*readings, scheduler=dask_scheduler) 220 | 221 | found_readings = [c for c in computed if len(c)] 222 | if not found_readings: 223 | msg = 'No readings found for the given target times in {}'.format(self._readings_path) 224 | raise ValueError(msg) 225 | 226 | readings = pd.concat(found_readings, ignore_index=True, sort=False) 227 | 228 | LOGGER.info('Loaded %s turbine readings', len(readings)) 229 | 230 | if select_valid: 231 | target_times = select_valid_targets(target_times, readings, window_size, self._rule) 232 | return target_times, readings 233 | 234 | return readings 235 | -------------------------------------------------------------------------------- /draco/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | import numpy as np 5 | from sklearn.metrics import (accuracy_score, f1_score, mean_absolute_error, 6 | mean_squared_error, roc_curve, roc_auc_score, r2_score) 7 | 8 | LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | def f1_macro(exp, obs): 12 | return f1_score(exp, obs, average='macro') 13 | 14 | 15 | def threshold_score(ground_truth, probabilities, tpr): 16 | roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1) 17 | try: 18 | index = np.where(roc_tpr >= tpr)[0][0] 19 | except: 20 | LOGGER.warn('Could not find a threshold that satisfies the requested True Positive Rate') 21 | index = -1 22 | 23 | return roc_threshold[index] 24 | 25 | 26 | def tpr_score(ground_truth, probabilities, threshold): 27 | roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1) 28 | try: 29 | index = np.where(roc_threshold >= threshold)[0][0] 30 | except: 31 | LOGGER.warn('Could not find a tpr that satisfies the requested threshold') 32 | index = -1 33 | 34 | return roc_tpr[index] 35 | 36 | 37 | def fpr_score(ground_truth, probabilities, tpr=None, threshold=None): 38 | """Compute the False Positive Rate associated with the given True Positive Rate. 39 | 40 | This metric computes the False Positive Rate that needs to be assumed in order 41 | to achieve the desired True Positive Rate. 42 | The metric is computed by finding the minimum necessary threshold to ensure 43 | that the TPR is satisfied and then computing the associated FPR. The final output 44 | is 1 minus the found FPR to produce a maximization score between 0 and 1. 45 | 46 | Args: 47 | ground_truth (numpy.ndarray): 48 | ``numpy.ndarray`` of the known values for the given predictions. 49 | probabilities (numpy.ndarray): 50 | ``numpy.ndarray`` with the generated predictions in probability. 51 | tpr (float): 52 | ``float`` value representing the percentage of True Positive Rate 53 | to be satisfied. 54 | 55 | Returns: 56 | float: 57 | Value between 0 and 1, where bigger is better. 
58 | """ 59 | roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1) 60 | try: 61 | if tpr: 62 | index = np.where(roc_tpr >= tpr)[0][0] 63 | elif threshold: 64 | index = np.where(roc_threshold >= threshold)[0][0] 65 | 66 | except: 67 | LOGGER.warn('Could not find a threshold that satisfies the requested True Positive Rate') 68 | index = -1 69 | 70 | return 1 - roc_fpr[index] 71 | 72 | 73 | METRICS = { 74 | 'accuracy': (accuracy_score, False), 75 | 'f1': (f1_score, False), 76 | 'f1_macro': (f1_macro, False), 77 | 'r2': (r2_score, False), 78 | 'mse': (mean_squared_error, True), 79 | 'mae': (mean_absolute_error, True), 80 | 'fpr': (fpr_score, False), 81 | 'roc_auc_score': (roc_auc_score, False) 82 | } 83 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier" 12 | ], 13 | "init_params": { 14 | "pandas.DataFrame.pop#1": { 15 | "item": "turbine_id" 16 | }, 17 | "pandas.DataFrame.pop#2": { 18 | "item": "timestamp" 19 | }, 20 | "sklearn.preprocessing.MinMaxScaler#1": { 21 | "feature_range": [ 22 | -1, 23 | 1 24 | ] 25 | }, 26 | "pandas.DataFrame#1": { 27 | "index": null, 28 | "columns": null 29 | }, 30 | "pandas.DataFrame.set#1": { 31 | "key": "turbine_id" 32 | }, 33 | "pandas.DataFrame.set#2": { 34 | "key": "timestamp" 35 | }, 36 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 37 | "window_size": 24, 38 | "cutoff_time": "cutoff_time", 39 | "time_index": "timestamp" 40 | }, 41 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 42 | "epochs": 35, 43 | "verbose": false 44 | } 45 | }, 46 | "input_names": { 47 | "pandas.DataFrame.pop#1": { 48 | "X": "readings" 49 | }, 50 | "pandas.DataFrame.pop#2": { 51 | "X": "readings" 52 | }, 53 | "sklearn.impute.SimpleImputer#1": { 54 | "X": "readings" 55 | }, 56 | "sklearn.preprocessing.MinMaxScaler#1": { 57 | "X": "readings" 58 | }, 59 | "pandas.DataFrame#1": { 60 | "X": "readings" 61 | }, 62 | "pandas.DataFrame.set#1": { 63 | "X": "readings", 64 | "value": "turbine_id" 65 | }, 66 | "pandas.DataFrame.set#2": { 67 | "X": "readings", 68 | "value": "timestamp" 69 | }, 70 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 71 | "timeseries": "readings" 72 | } 73 | }, 74 | "output_names": { 75 | "pandas.DataFrame.pop#1": { 76 | "item": "turbine_id" 77 | }, 78 | "pandas.DataFrame.pop#2": { 79 | "item": "timestamp" 80 | }, 81 | "sklearn.impute.SimpleImputer#1": { 82 | "X": "readings" 83 | }, 84 | "sklearn.preprocessing.MinMaxScaler#1": { 85 | "X": "readings" 86 | }, 87 | "pandas.DataFrame#1": { 88 | "X": "readings" 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm_prob.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | 
"pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier", 12 | "numpy.take" 13 | ], 14 | "init_params": { 15 | "pandas.DataFrame.pop#1": { 16 | "item": "turbine_id" 17 | }, 18 | "pandas.DataFrame.pop#2": { 19 | "item": "timestamp" 20 | }, 21 | "sklearn.preprocessing.MinMaxScaler#1": { 22 | "feature_range": [ 23 | -1, 24 | 1 25 | ] 26 | }, 27 | "pandas.DataFrame#1": { 28 | "index": null, 29 | "columns": null 30 | }, 31 | "pandas.DataFrame.set#1": { 32 | "key": "turbine_id" 33 | }, 34 | "pandas.DataFrame.set#2": { 35 | "key": "timestamp" 36 | }, 37 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 38 | "window_size": 24, 39 | "cutoff_time": "cutoff_time", 40 | "time_index": "timestamp" 41 | }, 42 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 43 | "epochs": 35, 44 | "verbose": false, 45 | "classification": false, 46 | "loss": "keras.losses.binary_crossentropy" 47 | }, 48 | "numpy.take#1": { 49 | "indices": 1, 50 | "axis": 1 51 | } 52 | }, 53 | "input_names": { 54 | "pandas.DataFrame.pop#1": { 55 | "X": "readings" 56 | }, 57 | "pandas.DataFrame.pop#2": { 58 | "X": "readings" 59 | }, 60 | "sklearn.impute.SimpleImputer#1": { 61 | "X": "readings" 62 | }, 63 | "sklearn.preprocessing.MinMaxScaler#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.set#1": { 70 | "X": "readings", 71 | "value": "turbine_id" 72 | }, 73 | "pandas.DataFrame.set#2": { 74 | "X": "readings", 75 | "value": "timestamp" 76 | }, 77 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 78 | "timeseries": "readings" 79 | } 80 | }, 81 | "output_names": { 82 | "pandas.DataFrame.pop#1": { 83 | "item": "turbine_id" 84 | }, 85 | "pandas.DataFrame.pop#2": { 86 | "item": "timestamp" 87 | }, 88 | "sklearn.impute.SimpleImputer#1": { 89 | "X": "readings" 90 | }, 91 | "sklearn.preprocessing.MinMaxScaler#1": { 92 | "X": "readings" 93 | }, 94 | "pandas.DataFrame#1": { 95 | "X": "readings" 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier", 14 | "numpy.take" 15 | ], 16 | "init_params": { 17 | "pandas.DataFrame.resample#1": { 18 | "rule": "3600s", 19 | "on": "timestamp", 20 | "groupby": [ 21 | "turbine_id", 22 | "signal_id" 23 | ], 24 | "aggregation": "mean", 25 | "reset_index": false 26 | }, 27 | "pandas.DataFrame.unstack#1": { 28 | "level": "signal_id", 29 | "reset_index": true 30 | }, 31 | "pandas.DataFrame.pop#1": { 32 | "item": "turbine_id" 33 | }, 34 | "pandas.DataFrame.pop#2": { 35 | "item": "timestamp" 36 | }, 37 | "sklearn.preprocessing.MinMaxScaler#1": { 38 | "feature_range": [ 39 | -1, 40 | 1 41 | ] 42 | }, 43 | "pandas.DataFrame#1": { 44 | "index": null, 45 | "columns": null 46 | }, 47 | "pandas.DataFrame.set#1": { 48 | "key": "turbine_id" 49 | }, 50 | "pandas.DataFrame.set#2": { 51 | "key": 
"timestamp" 52 | }, 53 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 54 | "window_size": 24, 55 | "cutoff_time": "cutoff_time", 56 | "time_index": "timestamp" 57 | }, 58 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 59 | "epochs": 35, 60 | "verbose": false, 61 | "classification": false, 62 | "loss": "keras.losses.binary_crossentropy" 63 | }, 64 | "numpy.take#1": { 65 | "indices": 1, 66 | "axis": 1 67 | } 68 | }, 69 | "input_names": { 70 | "pandas.DataFrame.resample#1": { 71 | "X": "readings" 72 | }, 73 | "pandas.DataFrame.unstack#1": { 74 | "X": "readings" 75 | }, 76 | "pandas.DataFrame.pop#1": { 77 | "X": "readings" 78 | }, 79 | "pandas.DataFrame.pop#2": { 80 | "X": "readings" 81 | }, 82 | "sklearn.impute.SimpleImputer#1": { 83 | "X": "readings" 84 | }, 85 | "sklearn.preprocessing.MinMaxScaler#1": { 86 | "X": "readings" 87 | }, 88 | "pandas.DataFrame#1": { 89 | "X": "readings" 90 | }, 91 | "pandas.DataFrame.set#1": { 92 | "X": "readings", 93 | "value": "turbine_id" 94 | }, 95 | "pandas.DataFrame.set#2": { 96 | "X": "readings", 97 | "value": "timestamp" 98 | }, 99 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 100 | "timeseries": "readings" 101 | } 102 | }, 103 | "output_names": { 104 | "pandas.DataFrame.resample#1": { 105 | "X": "readings" 106 | }, 107 | "pandas.DataFrame.unstack#1": { 108 | "X": "readings" 109 | }, 110 | "pandas.DataFrame.pop#1": { 111 | "item": "turbine_id" 112 | }, 113 | "pandas.DataFrame.pop#2": { 114 | "item": "timestamp" 115 | }, 116 | "sklearn.impute.SimpleImputer#1": { 117 | "X": "readings" 118 | }, 119 | "sklearn.preprocessing.MinMaxScaler#1": { 120 | "X": "readings" 121 | }, 122 | "pandas.DataFrame#1": { 123 | "X": "readings" 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /draco/pipelines/double_lstm/double_lstm_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier" 14 | ], 15 | "init_params": { 16 | "pandas.DataFrame.resample#1": { 17 | "rule": "3600s", 18 | "on": "timestamp", 19 | "groupby": [ 20 | "turbine_id", 21 | "signal_id" 22 | ], 23 | "aggregation": "mean", 24 | "reset_index": false 25 | }, 26 | "pandas.DataFrame.unstack#1": { 27 | "level": "signal_id", 28 | "reset_index": true 29 | }, 30 | "pandas.DataFrame.pop#1": { 31 | "item": "turbine_id" 32 | }, 33 | "pandas.DataFrame.pop#2": { 34 | "item": "timestamp" 35 | }, 36 | "sklearn.preprocessing.MinMaxScaler#1": { 37 | "feature_range": [ 38 | -1, 39 | 1 40 | ] 41 | }, 42 | "pandas.DataFrame#1": { 43 | "index": null, 44 | "columns": null 45 | }, 46 | "pandas.DataFrame.set#1": { 47 | "key": "turbine_id" 48 | }, 49 | "pandas.DataFrame.set#2": { 50 | "key": "timestamp" 51 | }, 52 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 53 | "window_size": 24, 54 | "cutoff_time": "cutoff_time", 55 | "time_index": "timestamp" 56 | }, 57 | "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { 58 | "epochs": 35, 59 | "verbose": false 60 | } 61 | }, 62 | "input_names": { 63 | 
"pandas.DataFrame.resample#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame.unstack#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.pop#1": { 70 | "X": "readings" 71 | }, 72 | "pandas.DataFrame.pop#2": { 73 | "X": "readings" 74 | }, 75 | "sklearn.impute.SimpleImputer#1": { 76 | "X": "readings" 77 | }, 78 | "sklearn.preprocessing.MinMaxScaler#1": { 79 | "X": "readings" 80 | }, 81 | "pandas.DataFrame#1": { 82 | "X": "readings" 83 | }, 84 | "pandas.DataFrame.set#1": { 85 | "X": "readings", 86 | "value": "turbine_id" 87 | }, 88 | "pandas.DataFrame.set#2": { 89 | "X": "readings", 90 | "value": "timestamp" 91 | }, 92 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 93 | "timeseries": "readings" 94 | } 95 | }, 96 | "output_names": { 97 | "pandas.DataFrame.resample#1": { 98 | "X": "readings" 99 | }, 100 | "pandas.DataFrame.unstack#1": { 101 | "X": "readings" 102 | }, 103 | "pandas.DataFrame.pop#1": { 104 | "item": "turbine_id" 105 | }, 106 | "pandas.DataFrame.pop#2": { 107 | "item": "timestamp" 108 | }, 109 | "sklearn.impute.SimpleImputer#1": { 110 | "X": "readings" 111 | }, 112 | "sklearn.preprocessing.MinMaxScaler#1": { 113 | "X": "readings" 114 | }, 115 | "pandas.DataFrame#1": { 116 | "X": "readings" 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /draco/pipelines/dummy/dummy.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "sklearn.impute.SimpleImputer", 4 | "sklearn.preprocessing.MinMaxScaler", 5 | "sklearn.linear_model.LogisticRegression" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.LSTMTimeSeriesClassifier" 12 | ], 13 | "init_params": { 14 | "pandas.DataFrame.pop#1": { 15 | "item": "turbine_id" 16 | }, 17 | "pandas.DataFrame.pop#2": { 18 | "item": "timestamp" 19 | }, 20 | "sklearn.preprocessing.MinMaxScaler#1": { 21 | "feature_range": [ 22 | -1, 23 | 1 24 | ] 25 | }, 26 | "pandas.DataFrame#1": { 27 | "index": null, 28 | "columns": null 29 | }, 30 | "pandas.DataFrame.set#1": { 31 | "key": "turbine_id" 32 | }, 33 | "pandas.DataFrame.set#2": { 34 | "key": "timestamp" 35 | }, 36 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 37 | "window_size": 24, 38 | "cutoff_time": "cutoff_time", 39 | "time_index": "timestamp" 40 | }, 41 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 42 | "epochs": 35, 43 | "verbose": false 44 | } 45 | }, 46 | "input_names": { 47 | "pandas.DataFrame.pop#1": { 48 | "X": "readings" 49 | }, 50 | "pandas.DataFrame.pop#2": { 51 | "X": "readings" 52 | }, 53 | "sklearn.impute.SimpleImputer#1": { 54 | "X": "readings" 55 | }, 56 | "sklearn.preprocessing.MinMaxScaler#1": { 57 | "X": "readings" 58 | }, 59 | "pandas.DataFrame#1": { 60 | "X": "readings" 61 | }, 62 | "pandas.DataFrame.set#1": { 63 | "X": "readings", 64 | "value": "turbine_id" 65 | }, 66 | "pandas.DataFrame.set#2": { 67 | "X": "readings", 68 | "value": "timestamp" 69 | }, 70 | 
"mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 71 | "timeseries": "readings" 72 | } 73 | }, 74 | "output_names": { 75 | "pandas.DataFrame.pop#1": { 76 | "item": "turbine_id" 77 | }, 78 | "pandas.DataFrame.pop#2": { 79 | "item": "timestamp" 80 | }, 81 | "sklearn.impute.SimpleImputer#1": { 82 | "X": "readings" 83 | }, 84 | "sklearn.preprocessing.MinMaxScaler#1": { 85 | "X": "readings" 86 | }, 87 | "pandas.DataFrame#1": { 88 | "X": "readings" 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm_prob.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.LSTMTimeSeriesClassifier", 12 | "numpy.take" 13 | ], 14 | "init_params": { 15 | "pandas.DataFrame.pop#1": { 16 | "item": "turbine_id" 17 | }, 18 | "pandas.DataFrame.pop#2": { 19 | "item": "timestamp" 20 | }, 21 | "sklearn.preprocessing.MinMaxScaler#1": { 22 | "feature_range": [ 23 | -1, 24 | 1 25 | ] 26 | }, 27 | "pandas.DataFrame#1": { 28 | "index": null, 29 | "columns": null 30 | }, 31 | "pandas.DataFrame.set#1": { 32 | "key": "turbine_id" 33 | }, 34 | "pandas.DataFrame.set#2": { 35 | "key": "timestamp" 36 | }, 37 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 38 | "window_size": 24, 39 | "cutoff_time": "cutoff_time", 40 | "time_index": "timestamp" 41 | }, 42 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 43 | "epochs": 35, 44 | "verbose": false, 45 | "classification": false, 46 | "loss": "keras.losses.binary_crossentropy" 47 | }, 48 | "numpy.take#1": { 49 | "indices": 1, 50 | "axis": 1 51 | } 52 | }, 53 | "input_names": { 54 | "pandas.DataFrame.pop#1": { 55 | "X": "readings" 56 | }, 57 | "pandas.DataFrame.pop#2": { 58 | "X": "readings" 59 | }, 60 | "sklearn.impute.SimpleImputer#1": { 61 | "X": "readings" 62 | }, 63 | "sklearn.preprocessing.MinMaxScaler#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.set#1": { 70 | "X": "readings", 71 | "value": "turbine_id" 72 | }, 73 | "pandas.DataFrame.set#2": { 74 | "X": "readings", 75 | "value": "timestamp" 76 | }, 77 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 78 | "timeseries": "readings" 79 | } 80 | }, 81 | "output_names": { 82 | "pandas.DataFrame.pop#1": { 83 | "item": "turbine_id" 84 | }, 85 | "pandas.DataFrame.pop#2": { 86 | "item": "timestamp" 87 | }, 88 | "sklearn.impute.SimpleImputer#1": { 89 | "X": "readings" 90 | }, 91 | "sklearn.preprocessing.MinMaxScaler#1": { 92 | "X": "readings" 93 | }, 94 | "pandas.DataFrame#1": { 95 | "X": "readings" 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm_prob_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | 
"mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.LSTMTimeSeriesClassifier", 14 | "numpy.take" 15 | ], 16 | "init_params": { 17 | "pandas.DataFrame.resample#1": { 18 | "rule": "3600s", 19 | "on": "timestamp", 20 | "groupby": [ 21 | "turbine_id", 22 | "signal_id" 23 | ], 24 | "aggregation": "mean", 25 | "reset_index": false 26 | }, 27 | "pandas.DataFrame.unstack#1": { 28 | "level": "signal_id", 29 | "reset_index": true 30 | }, 31 | "pandas.DataFrame.pop#1": { 32 | "item": "turbine_id" 33 | }, 34 | "pandas.DataFrame.pop#2": { 35 | "item": "timestamp" 36 | }, 37 | "sklearn.preprocessing.MinMaxScaler#1": { 38 | "feature_range": [ 39 | -1, 40 | 1 41 | ] 42 | }, 43 | "pandas.DataFrame#1": { 44 | "index": null, 45 | "columns": null 46 | }, 47 | "pandas.DataFrame.set#1": { 48 | "key": "turbine_id" 49 | }, 50 | "pandas.DataFrame.set#2": { 51 | "key": "timestamp" 52 | }, 53 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 54 | "window_size": 24, 55 | "cutoff_time": "cutoff_time", 56 | "time_index": "timestamp" 57 | }, 58 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 59 | "epochs": 35, 60 | "verbose": false, 61 | "classification": false, 62 | "loss": "keras.losses.binary_crossentropy" 63 | }, 64 | "numpy.take#1": { 65 | "indices": 1, 66 | "axis": 1 67 | } 68 | }, 69 | "input_names": { 70 | "pandas.DataFrame.resample#1": { 71 | "X": "readings" 72 | }, 73 | "pandas.DataFrame.unstack#1": { 74 | "X": "readings" 75 | }, 76 | "pandas.DataFrame.pop#1": { 77 | "X": "readings" 78 | }, 79 | "pandas.DataFrame.pop#2": { 80 | "X": "readings" 81 | }, 82 | "sklearn.impute.SimpleImputer#1": { 83 | "X": "readings" 84 | }, 85 | "sklearn.preprocessing.MinMaxScaler#1": { 86 | "X": "readings" 87 | }, 88 | "pandas.DataFrame#1": { 89 | "X": "readings" 90 | }, 91 | "pandas.DataFrame.set#1": { 92 | "X": "readings", 93 | "value": "turbine_id" 94 | }, 95 | "pandas.DataFrame.set#2": { 96 | "X": "readings", 97 | "value": "timestamp" 98 | }, 99 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 100 | "timeseries": "readings" 101 | } 102 | }, 103 | "output_names": { 104 | "pandas.DataFrame.resample#1": { 105 | "X": "readings" 106 | }, 107 | "pandas.DataFrame.unstack#1": { 108 | "X": "readings" 109 | }, 110 | "pandas.DataFrame.pop#1": { 111 | "item": "turbine_id" 112 | }, 113 | "pandas.DataFrame.pop#2": { 114 | "item": "timestamp" 115 | }, 116 | "sklearn.impute.SimpleImputer#1": { 117 | "X": "readings" 118 | }, 119 | "sklearn.preprocessing.MinMaxScaler#1": { 120 | "X": "readings" 121 | }, 122 | "pandas.DataFrame#1": { 123 | "X": "readings" 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /draco/pipelines/lstm/lstm_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.LSTMTimeSeriesClassifier" 14 | ], 15 | "init_params": { 16 | "pandas.DataFrame.resample#1": { 17 | "rule": "3600s", 18 | "on": "timestamp", 19 | "groupby": [ 20 | "turbine_id", 21 | "signal_id" 22 | ], 23 | "aggregation": "mean", 24 | "reset_index": false 25 | }, 26 | 
"pandas.DataFrame.unstack#1": { 27 | "level": "signal_id", 28 | "reset_index": true 29 | }, 30 | "pandas.DataFrame.pop#1": { 31 | "item": "turbine_id" 32 | }, 33 | "pandas.DataFrame.pop#2": { 34 | "item": "timestamp" 35 | }, 36 | "sklearn.preprocessing.MinMaxScaler#1": { 37 | "feature_range": [ 38 | -1, 39 | 1 40 | ] 41 | }, 42 | "pandas.DataFrame#1": { 43 | "index": null, 44 | "columns": null 45 | }, 46 | "pandas.DataFrame.set#1": { 47 | "key": "turbine_id" 48 | }, 49 | "pandas.DataFrame.set#2": { 50 | "key": "timestamp" 51 | }, 52 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 53 | "window_size": 24, 54 | "cutoff_time": "cutoff_time", 55 | "time_index": "timestamp" 56 | }, 57 | "keras.Sequential.LSTMTimeSeriesClassifier#1": { 58 | "epochs": 35, 59 | "verbose": false 60 | } 61 | }, 62 | "input_names": { 63 | "pandas.DataFrame.resample#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame.unstack#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.pop#1": { 70 | "X": "readings" 71 | }, 72 | "pandas.DataFrame.pop#2": { 73 | "X": "readings" 74 | }, 75 | "sklearn.impute.SimpleImputer#1": { 76 | "X": "readings" 77 | }, 78 | "sklearn.preprocessing.MinMaxScaler#1": { 79 | "X": "readings" 80 | }, 81 | "pandas.DataFrame#1": { 82 | "X": "readings" 83 | }, 84 | "pandas.DataFrame.set#1": { 85 | "X": "readings", 86 | "value": "turbine_id" 87 | }, 88 | "pandas.DataFrame.set#2": { 89 | "X": "readings", 90 | "value": "timestamp" 91 | }, 92 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 93 | "timeseries": "readings" 94 | } 95 | }, 96 | "output_names": { 97 | "pandas.DataFrame.resample#1": { 98 | "X": "readings" 99 | }, 100 | "pandas.DataFrame.unstack#1": { 101 | "X": "readings" 102 | }, 103 | "pandas.DataFrame.pop#1": { 104 | "item": "turbine_id" 105 | }, 106 | "pandas.DataFrame.pop#2": { 107 | "item": "timestamp" 108 | }, 109 | "sklearn.impute.SimpleImputer#1": { 110 | "X": "readings" 111 | }, 112 | "sklearn.preprocessing.MinMaxScaler#1": { 113 | "X": "readings" 114 | }, 115 | "pandas.DataFrame#1": { 116 | "X": "readings" 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /draco/pipelines/lstm_regressor/lstm_regressor.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.pop", 4 | "pandas.DataFrame.pop", 5 | "sklearn.impute.SimpleImputer", 6 | "sklearn.preprocessing.MinMaxScaler", 7 | "pandas.DataFrame", 8 | "pandas.DataFrame.set", 9 | "pandas.DataFrame.set", 10 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 11 | "keras.Sequential.LSTMTimeSeriesRegressor" 12 | ], 13 | "init_params": { 14 | "pandas.DataFrame.pop#1": { 15 | "item": "turbine_id" 16 | }, 17 | "pandas.DataFrame.pop#2": { 18 | "item": "timestamp" 19 | }, 20 | "sklearn.preprocessing.MinMaxScaler#1": { 21 | "feature_range": [ 22 | -1, 23 | 1 24 | ] 25 | }, 26 | "pandas.DataFrame#1": { 27 | "index": null, 28 | "columns": null 29 | }, 30 | "pandas.DataFrame.set#1": { 31 | "key": "turbine_id" 32 | }, 33 | "pandas.DataFrame.set#2": { 34 | "key": "timestamp" 35 | }, 36 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 37 | "window_size": 24, 38 | "cutoff_time": "cutoff_time", 39 | "time_index": "timestamp" 40 | }, 41 | "keras.Sequential.LSTMTimeSeriesRegressor#1": { 42 | "epochs": 35, 43 | "verbose": false 44 | } 45 | }, 46 | "input_names": { 47 | "pandas.DataFrame.pop#1": { 48 | "X": "readings" 49 | }, 50 
| "pandas.DataFrame.pop#2": { 51 | "X": "readings" 52 | }, 53 | "sklearn.impute.SimpleImputer#1": { 54 | "X": "readings" 55 | }, 56 | "sklearn.preprocessing.MinMaxScaler#1": { 57 | "X": "readings" 58 | }, 59 | "pandas.DataFrame#1": { 60 | "X": "readings" 61 | }, 62 | "pandas.DataFrame.set#1": { 63 | "X": "readings", 64 | "value": "turbine_id" 65 | }, 66 | "pandas.DataFrame.set#2": { 67 | "X": "readings", 68 | "value": "timestamp" 69 | }, 70 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 71 | "timeseries": "readings" 72 | } 73 | }, 74 | "output_names": { 75 | "pandas.DataFrame.pop#1": { 76 | "item": "turbine_id" 77 | }, 78 | "pandas.DataFrame.pop#2": { 79 | "item": "timestamp" 80 | }, 81 | "sklearn.impute.SimpleImputer#1": { 82 | "X": "readings" 83 | }, 84 | "sklearn.preprocessing.MinMaxScaler#1": { 85 | "X": "readings" 86 | }, 87 | "pandas.DataFrame#1": { 88 | "X": "readings" 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json: -------------------------------------------------------------------------------- 1 | { 2 | "primitives": [ 3 | "pandas.DataFrame.resample", 4 | "pandas.DataFrame.unstack", 5 | "pandas.DataFrame.pop", 6 | "pandas.DataFrame.pop", 7 | "sklearn.impute.SimpleImputer", 8 | "sklearn.preprocessing.MinMaxScaler", 9 | "pandas.DataFrame", 10 | "pandas.DataFrame.set", 11 | "pandas.DataFrame.set", 12 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", 13 | "keras.Sequential.LSTMTimeSeriesRegressor" 14 | ], 15 | "init_params": { 16 | "pandas.DataFrame.resample#1": { 17 | "rule": "600s", 18 | "on": "timestamp", 19 | "groupby": [ 20 | "turbine_id", 21 | "signal_id" 22 | ], 23 | "aggregation": "mean", 24 | "reset_index": false 25 | }, 26 | "pandas.DataFrame.unstack#1": { 27 | "level": "signal_id", 28 | "reset_index": true 29 | }, 30 | "pandas.DataFrame.pop#1": { 31 | "item": "turbine_id" 32 | }, 33 | "pandas.DataFrame.pop#2": { 34 | "item": "timestamp" 35 | }, 36 | "sklearn.preprocessing.MinMaxScaler#1": { 37 | "feature_range": [ 38 | -1, 39 | 1 40 | ] 41 | }, 42 | "pandas.DataFrame#1": { 43 | "index": null, 44 | "columns": null 45 | }, 46 | "pandas.DataFrame.set#1": { 47 | "key": "turbine_id" 48 | }, 49 | "pandas.DataFrame.set#2": { 50 | "key": "timestamp" 51 | }, 52 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 53 | "window_size": 24, 54 | "cutoff_time": "cutoff_time", 55 | "time_index": "timestamp" 56 | }, 57 | "keras.Sequential.LSTMTimeSeriesRegressor#1": { 58 | "epochs": 35, 59 | "verbose": true 60 | } 61 | }, 62 | "input_names": { 63 | "pandas.DataFrame.resample#1": { 64 | "X": "readings" 65 | }, 66 | "pandas.DataFrame.unstack#1": { 67 | "X": "readings" 68 | }, 69 | "pandas.DataFrame.pop#1": { 70 | "X": "readings" 71 | }, 72 | "pandas.DataFrame.pop#2": { 73 | "X": "readings" 74 | }, 75 | "sklearn.impute.SimpleImputer#1": { 76 | "X": "readings" 77 | }, 78 | "sklearn.preprocessing.MinMaxScaler#1": { 79 | "X": "readings" 80 | }, 81 | "pandas.DataFrame#1": { 82 | "X": "readings" 83 | }, 84 | "pandas.DataFrame.set#1": { 85 | "X": "readings", 86 | "value": "turbine_id" 87 | }, 88 | "pandas.DataFrame.set#2": { 89 | "X": "readings", 90 | "value": "timestamp" 91 | }, 92 | "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { 93 | "timeseries": "readings" 94 | } 95 | }, 96 | "output_names": { 97 | "pandas.DataFrame.resample#1": { 98 | "X": "readings" 99 | }, 100 | 
"pandas.DataFrame.unstack#1": { 101 | "X": "readings" 102 | }, 103 | "pandas.DataFrame.pop#1": { 104 | "item": "turbine_id" 105 | }, 106 | "pandas.DataFrame.pop#2": { 107 | "item": "timestamp" 108 | }, 109 | "sklearn.impute.SimpleImputer#1": { 110 | "X": "readings" 111 | }, 112 | "sklearn.preprocessing.MinMaxScaler#1": { 113 | "X": "readings" 114 | }, 115 | "pandas.DataFrame#1": { 116 | "X": "readings" 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /draco/primitives/mlblocks.MLPipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mlblocks.MLPipeline", 3 | "primitive": "mlblocks.MLPipeline", 4 | "fit": { 5 | "method": "fit", 6 | "args": "get_fit_args" 7 | }, 8 | "produce": { 9 | "method": "predict", 10 | "args": "get_predict_args", 11 | "output": "get_outputs" 12 | }, 13 | "hyperparameters": { 14 | "fixed": { 15 | "pipeline": { 16 | "type": "str", 17 | "default": null 18 | }, 19 | "primitives": { 20 | "type": "list", 21 | "default": [] 22 | }, 23 | "init_params": { 24 | "type": "dict", 25 | "default": {} 26 | }, 27 | "input_names": { 28 | "type": "dict", 29 | "default": {} 30 | }, 31 | "output_names": { 32 | "type": "dict", 33 | "default": {} 34 | } 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /draco/primitives/numpy.take.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "numpy.take", 3 | "contributors": [ 4 | "Plamen Valentinov Kolev " 5 | ], 6 | "documentation": "https://docs.scipy.org/doc/numpy/reference/", 7 | "description": "Take elements from an array along an axis.", 8 | "classifiers": { 9 | "type": "postprocessor" 10 | }, 11 | "modalities": [], 12 | "primitive": "numpy.take", 13 | "produce": { 14 | "args": [ 15 | { 16 | "name": "y", 17 | "keyword": "a", 18 | "type": "ndarray" 19 | } 20 | ], 21 | "output": [ 22 | { 23 | "name": "y", 24 | "type": "ndarray" 25 | } 26 | ] 27 | }, 28 | "hyperparameters": { 29 | "fixed": { 30 | "indices": { 31 | "type": "int", 32 | "default": 0 33 | }, 34 | "axis": { 35 | "type": "int", 36 | "default": null 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /draco/primitives/xgboost.XGBClassifier:probabilities.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "xgboost.XGBClassifier", 3 | "contributors": [ 4 | "Carles Sala " 5 | ], 6 | "documentation": "https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier", 7 | "description": "Implementation of the scikit-learn API for XGBoost classification.", 8 | "classifiers": { 9 | "type": "estimator", 10 | "subtype": "classifier" 11 | }, 12 | "modalities": [], 13 | "primitive": "xgboost.XGBClassifier", 14 | "fit": { 15 | "method": "fit", 16 | "args": [ 17 | { 18 | "name": "X", 19 | "type": "ndarray" 20 | }, 21 | { 22 | "name": "y", 23 | "type": "array" 24 | } 25 | ] 26 | }, 27 | "produce": { 28 | "method": "predict_proba", 29 | "args": [ 30 | { 31 | "name": "X", 32 | "keyword": "data", 33 | "type": "ndarray" 34 | } 35 | ], 36 | "output": [ 37 | { 38 | "name": "y", 39 | "type": "array" 40 | } 41 | ] 42 | }, 43 | "hyperparameters": { 44 | "fixed": { 45 | "n_jobs": { 46 | "type": "int", 47 | "default": -1 48 | } 49 | }, 50 | "tunable": { 51 | "n_estimators": { 52 | "type": "int", 53 | "default": 100, 54 | "range": [ 55 | 10, 56 | 1000 57 | 
] 58 | }, 59 | "max_depth": { 60 | "type": "int", 61 | "default": 3, 62 | "range": [ 63 | 3, 64 | 10 65 | ] 66 | }, 67 | "learning_rate": { 68 | "type": "float", 69 | "default": 0.1, 70 | "range": [ 71 | 0, 72 | 1 73 | ] 74 | }, 75 | "gamma": { 76 | "type": "float", 77 | "default": 0, 78 | "range": [ 79 | 0, 80 | 1 81 | ] 82 | }, 83 | "min_child_weight": { 84 | "type": "int", 85 | "default": 1, 86 | "range": [ 87 | 1, 88 | 10 89 | ] 90 | } 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /draco/results.py: -------------------------------------------------------------------------------- 1 | import os 2 | from random import random 3 | 4 | import pandas as pd 5 | 6 | 7 | def load_results(files): 8 | problems_results = dict() 9 | for filename in files: 10 | problem = os.path.basename(filename).replace('.csv', '') 11 | problems_results[problem] = pd.read_csv(filename).round(6) 12 | 13 | return problems_results 14 | 15 | 16 | def get_wins_by_problems(results): 17 | df = results.groupby('problem_name')['template', 'window_size', 'resample_rule', 'fpr_threshold=0.5'] 18 | df = df.apply(max) 19 | df = df.rename(columns={'fpr_threshold=0.5': 'score'}) 20 | 21 | return df 22 | 23 | 24 | def get_exclusive_wins(scores, column, pivot_columns=['window_size', 'resample_rule']): 25 | summary = {} 26 | for problem in scores.problem_name.unique(): 27 | df = scores[scores['problem_name'] == problem] 28 | df['wr'] = df.apply( 29 | lambda row: '{}_{}_{}'.format(row[pivot_columns[0]], row[pivot_columns[1]], random()), axis=1) 30 | df = df.pivot(index='wr', columns=column, values='fpr_threshold=0.5') 31 | 32 | is_winner = df.T.rank(method='min', ascending=False) == 1 33 | num_winners = is_winner.sum() 34 | is_exclusive = num_winners == 1 35 | is_exclusive_winner = is_winner & is_exclusive 36 | summary[problem] = is_exclusive_winner.sum(axis=1) 37 | 38 | summary_df = pd.DataFrame(summary) 39 | summary_df.index.name = 'template' 40 | columns = summary_df.columns.sort_values(ascending=False) 41 | return summary_df[columns] 42 | 43 | 44 | def add_sheet(dfs, name, writer, cell_fmt, index_fmt, header_fmt): 45 | startrow = 0 46 | widths = [0] 47 | if not isinstance(dfs, dict): 48 | dfs = {None: dfs} 49 | 50 | for df_name, df in dfs.items(): 51 | df = df.reset_index() 52 | startrow += bool(df_name) 53 | df.to_excel(writer, sheet_name=name, startrow=startrow + 1, index=False, header=False) 54 | 55 | worksheet = writer.sheets[name] 56 | 57 | if df_name: 58 | worksheet.write(startrow - 1, 0, df_name, index_fmt) 59 | widths[0] = max(widths[0], len(df_name)) 60 | 61 | for idx, column in enumerate(df.columns): 62 | worksheet.write(startrow, idx, column, header_fmt) 63 | width = max(len(column), *df[column].astype(str).str.len()) + 1 64 | if len(widths) > idx: 65 | widths[idx] = max(widths[idx], width) 66 | else: 67 | widths.append(width) 68 | 69 | startrow += len(df) + 2 70 | 71 | for idx, width in enumerate(widths): 72 | fmt = cell_fmt if idx else index_fmt 73 | worksheet.set_column(idx, idx, width + 1, fmt) 74 | 75 | 76 | def write_results(results, output): 77 | writer = pd.ExcelWriter(output, engine='xlsxwriter') 78 | cell_fmt = writer.book.add_format({ 79 | "font_name": "Arial", 80 | "font_size": "10" 81 | }) 82 | index_fmt = writer.book.add_format({ 83 | "font_name": "Arial", 84 | "font_size": "10", 85 | "bold": True, 86 | }) 87 | header_fmt = writer.book.add_format({ 88 | "font_name": "Arial", 89 | "font_size": "10", 90 | "bold": True, 91 | "bottom": 1 92 | }) 93 
| 94 | if isinstance(results, dict): 95 | results = pd.concat(list(results.values()), ignore_index=True) 96 | 97 | window = get_exclusive_wins(results, 'window_size', ['window_size', 'fpr_threshold=0.5']) 98 | 99 | resample_pivots = ['resample_rule', ['problem_name', 'fpr_threshold=0.5']] 100 | resample = get_exclusive_wins(results, 'resample_rule', resample_pivots) 101 | 102 | summary = { 103 | 'Best pipeline by Problem': get_wins_by_problems(results), 104 | 'Rankings - Number of wins': get_exclusive_wins(results, 'template'), 105 | 'Resample Rule': resample, 106 | 'Window Size': window 107 | } 108 | add_sheet(summary, 'Summary', writer, cell_fmt, index_fmt, header_fmt) 109 | 110 | for problem in results['problem_name'].unique(): 111 | add_sheet( 112 | results[results['problem_name'] == problem], 113 | problem, 114 | writer, 115 | cell_fmt, 116 | index_fmt, 117 | header_fmt 118 | ) 119 | 120 | writer.save() 121 | -------------------------------------------------------------------------------- /draco/targets.py: -------------------------------------------------------------------------------- 1 | """Targets module. 2 | 3 | This module contains functions to work with target_times. 4 | """ 5 | 6 | import logging 7 | import warnings 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from tqdm.auto import trange 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | def make_targets(target_times, window_size, target, new_targets=None): 17 | target_times = target_times.sort_values('cutoff_time', ascending=True) 18 | cutoff_times = target_times.cutoff_time 19 | window_size = pd.to_timedelta(window_size) 20 | original_size = len(target_times) 21 | current_size = original_size 22 | new_targets = new_targets or current_size 23 | 24 | for index in trange(len(cutoff_times) - 1): 25 | timestamp = cutoff_times.iloc[index] 26 | next_time = cutoff_times.iloc[index + 1] 27 | 28 | if timestamp + (window_size * 2) >= next_time: 29 | continue 30 | 31 | span_start = timestamp + window_size 32 | span_end = next_time - window_size 33 | span_length = (span_end - span_start).total_seconds() 34 | 35 | delay = pd.to_timedelta(np.random.randint(span_length), unit='s') 36 | cutoff_time = span_start + delay 37 | 38 | target_times = target_times.append(pd.Series({ 39 | 'turbine_id': target_times.iloc[index].turbine_id, 40 | 'cutoff_time': cutoff_time, 41 | 'target': target 42 | }), ignore_index=True) 43 | 44 | current_size = len(target_times) 45 | if current_size == original_size + new_targets: 46 | return target_times.sort_values('cutoff_time', ascending=True) 47 | 48 | if current_size == original_size: 49 | warnings.warn('There is no space left between cutoff times to add more targets.') 50 | return target_times 51 | 52 | new_targets = new_targets - (current_size - original_size) 53 | return make_targets(target_times, window_size, target, new_targets) 54 | 55 | 56 | def _to_timedelta(specification): 57 | if isinstance(specification, int): 58 | specification = '{}s'.format(specification) 59 | 60 | return pd.to_timedelta(specification) 61 | 62 | 63 | def make_target_times(failure_dates, step, start=None, end=None, forecast_window=0, 64 | prediction_window=0, before=0, after=0, offset=0, max_true=None, 65 | max_false=None, shuffle=True): 66 | 67 | step = _to_timedelta(step) 68 | start = start or failure_dates.timestamp.min() 69 | start = start or failure_dates.min() 70 | 71 | forecast_window = _to_timedelta(forecast_window) 72 | prediction_window = _to_timedelta(prediction_window) 73 | before = _to_timedelta(before) 74 | 
after = _to_timedelta(after) 75 | offset = _to_timedelta(offset) 76 | 77 | target_times = pd.DataFrame() 78 | turbines = failure_dates.turbine_id.unique() 79 | failures = failure_dates.set_index(['turbine_id', 'date']) 80 | 81 | for turbine in turbines: 82 | turbine_failures = failures.loc[turbine] 83 | 84 | min_failure_date = turbine_failures.index.min() - before 85 | last_failure_date = turbine_failures.index.max() + after 86 | turbine_targets = list() 87 | while min_failure_date < last_failure_date: 88 | max_failure_date = min_failure_date + prediction_window 89 | day_failures = turbine_failures.loc[min_failure_date:max_failure_date] 90 | 91 | min_failure_date = min_failure_date + offset 92 | 93 | turbine_targets.append({ 94 | 'turbine_id': turbine, 95 | 'target': int(bool(len(day_failures))), 96 | 'cutoff_time': min_failure_date - forecast_window 97 | }) 98 | 99 | turbine_targets = pd.DataFrame(turbine_targets) 100 | failed = turbine_targets[turbine_targets.target == 1] 101 | target_times = target_times.append(failed) 102 | 103 | non_failed = turbine_targets[turbine_targets.target == 0] 104 | non_failed = non_failed.sample(min(max_false, len(non_failed))) 105 | 106 | target_times = target_times.append(non_failed) 107 | 108 | if shuffle: 109 | target_times = target_times.sample(len(target_times)) 110 | 111 | return target_times 112 | 113 | 114 | def _valid_targets(timestamps): 115 | def apply_function(row): 116 | cutoff = row.cutoff_time 117 | try: 118 | times = timestamps.loc[row.turbine_id] 119 | except KeyError: 120 | return False 121 | 122 | return times['min'] <= cutoff <= times['max'] 123 | 124 | return apply_function 125 | 126 | 127 | def select_valid_targets(target_times, readings, window_size, rule=None): 128 | """Filter out target_times without enough data for this window_size. 129 | 130 | The target_times table is scanned and checked against the readings table 131 | considering the window_size. All the target times entries that do not 132 | have enough data are dropped. 133 | 134 | Args: 135 | target_times (pandas.DataFrame): 136 | Target times table, with at least turbine_id and cutoff_time fields. 137 | readings (pandas.DataFrame): 138 | Readings table, with at least turbine_id, signal_id, and timestamp fields. 139 | window_size (str or pandas.TimeDelta): 140 | TimeDelta specification that indicates the length of the training window. 141 | rule (str or pandas.TimeDelta): 142 | Resampling rule specification. If given, add that to the max timestamp 143 | to ensure the period is completely covered. 144 | 145 | Returns: 146 | pandas.DataFrame: 147 | New target_times table without the invalid targets. 148 | """ 149 | 150 | timestamps = readings.groupby('turbine_id').timestamp.agg(['min', 'max']) 151 | timestamps['min'] += pd.to_timedelta(window_size) 152 | 153 | if rule is not None: 154 | timestamps['max'] += pd.to_timedelta(rule) 155 | 156 | valid = target_times.apply(_valid_targets(timestamps), axis=1) 157 | valid_targets = target_times[valid] 158 | 159 | length = len(valid_targets) 160 | LOGGER.info('Dropped %s targets without enough data. 
Final target_times size: %s', 161 | len(target_times) - length, length) 162 | 163 | return valid_targets 164 | 165 | 166 | def drop_duplicates(target_times): 167 | length = len(target_times) 168 | filtered = target_times.drop_duplicates() 169 | new_length = len(filtered) 170 | if length != new_length: 171 | LOGGER.warn('Dropped %s duplicate targets!', length - new_length) 172 | 173 | filtered = filtered.drop_duplicates(subset=['turbine_id', 'cutoff_time'], keep=False) 174 | final_length = len(filtered) 175 | if new_length != final_length: 176 | LOGGER.warn('Dropped %s incoherent targets!', new_length - final_length) 177 | 178 | return filtered.copy() 179 | -------------------------------------------------------------------------------- /draco/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from mlblocks import MLPipeline 6 | 7 | 8 | def clone_pipeline(pipeline): 9 | return MLPipeline.from_dict(pipeline.to_dict()) 10 | 11 | 12 | def walk(document, transform): 13 | if not isinstance(document, dict): 14 | return document 15 | 16 | new_doc = dict() 17 | for key, value in document.items(): 18 | if isinstance(value, dict): 19 | value = walk(value, transform) 20 | elif isinstance(value, list): 21 | value = [walk(v, transform) for v in value] 22 | 23 | new_key, new_value = transform(key, value) 24 | new_doc[new_key] = new_value 25 | 26 | return new_doc 27 | 28 | 29 | def remove_dots(document): 30 | return walk(document, lambda key, value: (key.replace('.', '-'), value)) 31 | 32 | 33 | def restore_dots(document): 34 | return walk(document, lambda key, value: (key.replace('-', '.'), value)) 35 | 36 | 37 | def logging_setup(verbosity=1, logfile=None, logger_name=None): 38 | logger = logging.getLogger(logger_name) 39 | log_level = (3 - verbosity) * 10 40 | fmt = '%(asctime)s - %(process)d - %(levelname)s - %(module)s - %(message)s' 41 | formatter = logging.Formatter(fmt) 42 | logger.setLevel(log_level) 43 | logger.propagate = False 44 | 45 | if logfile: 46 | file_handler = logging.FileHandler(logfile) 47 | file_handler.setLevel(logging.DEBUG) 48 | file_handler.setFormatter(formatter) 49 | logger.addHandler(file_handler) 50 | 51 | else: 52 | console_handler = logging.StreamHandler() 53 | console_handler.setLevel(log_level) 54 | console_handler.setFormatter(formatter) 55 | logger.addHandler(console_handler) 56 | 57 | 58 | def as_list(param): 59 | """Make sure that param is a ``list``.""" 60 | if isinstance(param, (list, tuple)): 61 | return param 62 | 63 | return [param] 64 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.3.1.dev0 3 | commit = True 4 | tag = True 5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? 
6 | serialize = 7 | {major}.{minor}.{patch}.{release}{candidate} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = release 12 | first_value = dev 13 | values = 14 | dev 15 | release 16 | 17 | [bumpversion:part:candidate] 18 | 19 | [bumpversion:file:setup.py] 20 | search = version='{current_version}' 21 | replace = version='{new_version}' 22 | 23 | [bumpversion:file:draco/__init__.py] 24 | search = __version__ = '{current_version}' 25 | replace = __version__ = '{new_version}' 26 | 27 | [bdist_wheel] 28 | universal = 1 29 | 30 | [flake8] 31 | max-line-length = 99 32 | exclude = docs, .tox, .git, __pycache__, .ipynb_checkpoints 33 | ignore = # keep empty to prevent default ignores 34 | 35 | [isort] 36 | include_trailing_comment = True 37 | line_length = 99 38 | lines_between_types = 0 39 | multi_line_output = 4 40 | not_skip = __init__.py 41 | use_parentheses = True 42 | 43 | [aliases] 44 | test = pytest 45 | 46 | [tool:pytest] 47 | collect_ignore = ['setup.py'] 48 | 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | 6 | try: 7 | with open('README.md', encoding='utf-8') as readme_file: 8 | readme = readme_file.read() 9 | except IOError: 10 | readme = '' 11 | 12 | try: 13 | with open('HISTORY.md', encoding='utf-8') as history_file: 14 | history = history_file.read() 15 | except IOError: 16 | history = '' 17 | 18 | install_requires = [ 19 | 'baytune>=0.4.0,<0.5', 20 | 'ml-stars>=0.1.0', 21 | 'mlblocks>=0.4.0,<0.5', 22 | 'pymongo>=3.7.2,<4', 23 | 'scikit-learn>=0.21,<1.2', 24 | 'tqdm<4.50.0,>=4.36.1', 25 | 'scipy>=1.0.1,<2', 26 | 'numpy>=1.16.0,<1.19', 27 | 'pandas>=1,<2', 28 | 'tensorflow>=2,<2.3', 29 | 'partd>=1.1.0,<2', 30 | 'fsspec>=0.8.5,<0.9', 31 | 'dask>=2.6.0,<3', 32 | 'tabulate>=0.8.3,<0.9', 33 | 'xlsxwriter>=1.3.6,<1.4', 34 | # fix conflicts 35 | 'protobuf<4', 36 | 'importlib-metadata<5', 37 | ] 38 | 39 | setup_requires = [ 40 | 'pytest-runner>=2.11.1', 41 | ] 42 | 43 | tests_require = [ 44 | 'pytest>=3.4.2', 45 | 'pytest-cov>=2.6.0', 46 | 'jupyter>=1.0.0,<2', 47 | 'rundoc>=0.4.3,<0.5', 48 | ] 49 | 50 | development_requires = [ 51 | # general 52 | 'bumpversion>=0.5.3,<0.6', 53 | 'pip>=9.0.1', 54 | 'watchdog>=0.8.3,<0.11', 55 | 56 | # docs 57 | 'm2r>=0.2.0,<0.3', 58 | 'nbsphinx>=0.5.0,<0.7', 59 | 'Sphinx>=1.7.1,<3', 60 | 'sphinx_rtd_theme>=0.2.4,<0.5', 61 | 'docutils>=0.14,<0.18', 62 | 'autodocsumm>=0.1.10', 63 | 'markupsafe<2.1.0', 64 | 'Jinja2>=2,<3', 65 | 66 | # style check 67 | 'flake8>=3.7.7,<4', 68 | 'isort>=4.3.4,<5', 69 | 70 | # fix style issues 71 | 'autoflake>=1.1,<2', 72 | 'autopep8>=1.4.3,<2', 73 | 'importlib-metadata<5', 74 | 75 | # distribute on PyPI 76 | 'twine>=1.10.0,<4', 77 | 'wheel>=0.30.0', 78 | 79 | # Advanced testing 80 | 'coverage>=4.5.1,<6', 81 | 'tox>=2.9.1,<4', 82 | ] 83 | 84 | setup( 85 | author='MIT Data To AI Lab', 86 | author_email='dailabmit@gmail.com', 87 | classifiers=[ 88 | 'Development Status :: 2 - Pre-Alpha', 89 | 'Intended Audience :: Developers', 90 | 'License :: OSI Approved :: MIT License', 91 | 'Natural Language :: English', 92 | 'Programming Language :: Python :: 3', 93 | 'Programming Language :: Python :: 3.6', 94 | 'Programming Language :: Python :: 3.7', 95 | 'Programming Language :: Python :: 3.8', 96 | ], 97 | description='AutoML for Time Series.', 98 | entry_points={ 99 
| 'mlblocks': [ 100 | 'pipelines=draco:MLBLOCKS_PIPELINES', 101 | 'primitives=draco:MLBLOCKS_PRIMITIVES' 102 | ], 103 | }, 104 | extras_require={ 105 | 'test': tests_require, 106 | 'dev': development_requires + tests_require, 107 | }, 108 | include_package_data=True, 109 | install_requires=install_requires, 110 | keywords='wind machine learning draco', 111 | license='MIT license', 112 | long_description=readme + '\n\n' + history, 113 | long_description_content_type='text/markdown', 114 | name='draco-ml', 115 | packages=find_packages(include=['draco', 'draco.*']), 116 | python_requires='>=3.6,<3.9', 117 | setup_requires=setup_requires, 118 | test_suite='tests', 119 | tests_require=tests_require, 120 | url='https://github.com/sintel-dev/Draco', 121 | version='0.3.1.dev0', 122 | zip_safe=False, 123 | ) 124 | -------------------------------------------------------------------------------- /tests/test_benchmark.py: -------------------------------------------------------------------------------- 1 | """Tests for `draco.benchmark` module.""" 2 | import numpy as np 3 | 4 | from draco.benchmark import evaluate_templates 5 | from draco.demo import load_demo 6 | 7 | 8 | def test_predict(): 9 | # setup 10 | templates = [ 11 | 'lstm_with_unstack' 12 | ] 13 | 14 | window_size_rule = [ 15 | ('1d', '1h') 16 | ] 17 | 18 | target_times, readings = load_demo() 19 | target_times = target_times.head(40) 20 | readings = readings.head(100) 21 | 22 | # run 23 | scores_df = evaluate_templates( 24 | target_times=target_times, 25 | readings=readings, 26 | templates=templates, 27 | window_size_rule=window_size_rule, 28 | tuning_iterations=1, 29 | cv_splits=2 30 | ) 31 | 32 | # assert 33 | expected_columns = [ 34 | 'problem_name', 35 | 'window_size', 36 | 'resample_rule', 37 | 'template', 38 | 'default_test', 39 | 'default_cv', 40 | 'tuned_cv', 41 | 'tuned_test', 42 | 'tuning_metric', 43 | 'tuning_metric_kwargs', 44 | 'fit_predict_time', 45 | 'default_cv_time', 46 | 'average_cv_time', 47 | 'total_time', 48 | 'status', 49 | 'accuracy_threshold/0.5', 50 | 'f1_threshold/0.5', 51 | 'fpr_threshold/0.5', 52 | 'tpr_threshold/0.5', 53 | ] 54 | 55 | expected_dtypes = [ 56 | np.dtype('O'), 57 | np.dtype('O'), 58 | np.dtype('O'), 59 | np.dtype('O'), 60 | np.dtype('float64'), 61 | np.dtype('float64'), 62 | np.dtype('float64'), 63 | np.dtype('float64'), 64 | np.dtype('O'), 65 | np.dtype('O'), 66 | np.dtype('\n", 184 | "\n", 197 | "\n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " 
\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
problem_namewindow_sizeresample_ruletemplatedefault_testdefault_cvtuned_cvtuned_testtuning_metrictuning_metric_kwargsfit_predict_timedefault_cv_timeaverage_cv_timetotal_timestatusaccuracy_threshold/0.5f1_threshold/0.5fpr_threshold/0.5tpr_threshold/0.5
0None1d1hlstm_prob_with_unstack0.4945050.5899050.5899050.322650roc_auc_score{'threshold': 0.5}0 days 00:00:03.8731570 days 00:00:14.3695360 days 00:00:08.1784220 days 00:00:47.144655OK0.2808990.2558141.00.0
1None2d2hlstm_prob_with_unstack0.4465810.5430560.5615700.707875roc_auc_score{'threshold': 0.5}0 days 00:00:03.4604670 days 00:00:12.1219050 days 00:00:08.2759190 days 00:00:44.449291OK0.7303370.5862071.00.0
2None1d1hdouble_lstm_prob_with_unstack0.8131870.3079930.5926960.417582roc_auc_score{'threshold': 0.5}0 days 00:00:05.4609850 days 00:00:18.1036600 days 00:00:14.0118770 days 00:01:11.192546OK0.3033710.3673471.00.0
3None2d2hdouble_lstm_prob_with_unstack0.2457260.6639190.6639190.293346roc_auc_score{'threshold': 0.5}0 days 00:00:05.5688350 days 00:00:17.9483610 days 00:00:14.0038160 days 00:01:11.051792OK0.3033710.1842111.00.0
\n", 313 | "" 314 | ], 315 | "text/plain": [ 316 | " problem_name window_size resample_rule template \\\n", 317 | "0 None 1d 1h lstm_prob_with_unstack \n", 318 | "1 None 2d 2h lstm_prob_with_unstack \n", 319 | "2 None 1d 1h double_lstm_prob_with_unstack \n", 320 | "3 None 2d 2h double_lstm_prob_with_unstack \n", 321 | "\n", 322 | " default_test default_cv tuned_cv tuned_test tuning_metric \\\n", 323 | "0 0.494505 0.589905 0.589905 0.322650 roc_auc_score \n", 324 | "1 0.446581 0.543056 0.561570 0.707875 roc_auc_score \n", 325 | "2 0.813187 0.307993 0.592696 0.417582 roc_auc_score \n", 326 | "3 0.245726 0.663919 0.663919 0.293346 roc_auc_score \n", 327 | "\n", 328 | " tuning_metric_kwargs fit_predict_time default_cv_time \\\n", 329 | "0 {'threshold': 0.5} 0 days 00:00:03.873157 0 days 00:00:14.369536 \n", 330 | "1 {'threshold': 0.5} 0 days 00:00:03.460467 0 days 00:00:12.121905 \n", 331 | "2 {'threshold': 0.5} 0 days 00:00:05.460985 0 days 00:00:18.103660 \n", 332 | "3 {'threshold': 0.5} 0 days 00:00:05.568835 0 days 00:00:17.948361 \n", 333 | "\n", 334 | " average_cv_time total_time status \\\n", 335 | "0 0 days 00:00:08.178422 0 days 00:00:47.144655 OK \n", 336 | "1 0 days 00:00:08.275919 0 days 00:00:44.449291 OK \n", 337 | "2 0 days 00:00:14.011877 0 days 00:01:11.192546 OK \n", 338 | "3 0 days 00:00:14.003816 0 days 00:01:11.051792 OK \n", 339 | "\n", 340 | " accuracy_threshold/0.5 f1_threshold/0.5 fpr_threshold/0.5 \\\n", 341 | "0 0.280899 0.255814 1.0 \n", 342 | "1 0.730337 0.586207 1.0 \n", 343 | "2 0.303371 0.367347 1.0 \n", 344 | "3 0.303371 0.184211 1.0 \n", 345 | "\n", 346 | " tpr_threshold/0.5 \n", 347 | "0 0.0 \n", 348 | "1 0.0 \n", 349 | "2 0.0 \n", 350 | "3 0.0 " 351 | ] 352 | }, 353 | "execution_count": 4, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "results" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3 (ipykernel)", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | "file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.8.16" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /tutorials/04_Draco_Regression_Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Draco Regression Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this tutorial we will show you how to use Draco Regression pipelines to solve a Machine Learning problem\n", 15 | "defined via a Target Times table.\n", 16 | "\n", 17 | "During the next steps we will:\n", 18 | "\n", 19 | "- Load demo Remaining Useful Life (dataset) with training and testing target times and readings\n", 20 | "- Find available pipelines and load one of them\n", 21 | "- Build and fit a Machine Learning pipeline\n", 22 | "- Make predictions using the fitted pipeline\n", 23 | "- Evaluate how good the predictions are" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 0. 
Setup the logging\n", 31 | "\n", 32 | "This step sets up logging in our environment to increase our visibility over\n", 33 | "the steps that Draco performs." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import logging;\n", 43 | "\n", 44 | "logging.basicConfig(level=logging.INFO)\n", 45 | "logging.getLogger().setLevel(level=logging.INFO)\n", 46 | "\n", 47 | "import warnings\n", 48 | "warnings.simplefilter(\"ignore\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 1. Load the Data\n", 56 | "\n", 57 | "The first step is to load the data that we are going to use.\n", 58 | "\n", 59 | "In order to use the demo data included in Draco, the `draco.demo.load_demo` function can be used." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from draco.demo import load_demo\n", 69 | "\n", 70 | "train_target_times, test_target_times, readings = load_demo(name='rul')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This will download some demo data from [Draco S3 demo Bucket](\n", 78 | "https://d3-ai-draco.s3.amazonaws.com/index.html) and load it as\n", 79 | "the necessary `target_times` and `readings` tables.\n", 80 | "\n", 81 | "The exact format of these tables is described in the Draco README and docs:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
turbine_idcutoff_timetarget
012013-01-12 04:20:00166
112013-01-12 04:30:00165
212013-01-12 04:40:00164
312013-01-12 04:50:00163
412013-01-12 05:00:00162
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " turbine_id cutoff_time target\n", 152 | "0 1 2013-01-12 04:20:00 166\n", 153 | "1 1 2013-01-12 04:30:00 165\n", 154 | "2 1 2013-01-12 04:40:00 164\n", 155 | "3 1 2013-01-12 04:50:00 163\n", 156 | "4 1 2013-01-12 05:00:00 162" 157 | ] 158 | }, 159 | "execution_count": 3, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "train_target_times.head()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "(18131, 3)" 177 | ] 178 | }, 179 | "execution_count": 4, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "train_target_times.shape" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 5, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "turbine_id int64\n", 197 | "cutoff_time datetime64[ns]\n", 198 | "target int64\n", 199 | "dtype: object" 200 | ] 201 | }, 202 | "execution_count": 5, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "train_target_times.dtypes" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/html": [ 219 | "
\n", 220 | "\n", 233 | "\n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | "
turbine_idcutoff_timetarget
012013-01-13 13:10:00112.0
122013-01-14 08:00:0098.0
232013-01-14 02:50:0069.0
342013-01-14 01:10:0082.0
452013-01-14 13:10:0091.0
\n", 275 | "
" 276 | ], 277 | "text/plain": [ 278 | " turbine_id cutoff_time target\n", 279 | "0 1 2013-01-13 13:10:00 112.0\n", 280 | "1 2 2013-01-14 08:00:00 98.0\n", 281 | "2 3 2013-01-14 02:50:00 69.0\n", 282 | "3 4 2013-01-14 01:10:00 82.0\n", 283 | "4 5 2013-01-14 13:10:00 91.0" 284 | ] 285 | }, 286 | "execution_count": 6, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "test_target_times.head()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 7, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "(100, 3)" 304 | ] 305 | }, 306 | "execution_count": 7, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "test_target_times.shape" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 8, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "turbine_id int64\n", 324 | "cutoff_time datetime64[ns]\n", 325 | "target float64\n", 326 | "dtype: object" 327 | ] 328 | }, 329 | "execution_count": 8, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "test_target_times.dtypes" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 9, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/html": [ 346 | "
\n", 347 | "\n", 360 | "\n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | "
turbine_idtimestampsignal_idvalue
012013-01-12 00:10:00operational setting 1-0.0007
112013-01-12 00:20:00operational setting 10.0019
212013-01-12 00:30:00operational setting 1-0.0043
312013-01-12 00:40:00operational setting 10.0007
412013-01-12 00:50:00operational setting 1-0.0019
\n", 408 | "
" 409 | ], 410 | "text/plain": [ 411 | " turbine_id timestamp signal_id value\n", 412 | "0 1 2013-01-12 00:10:00 operational setting 1 -0.0007\n", 413 | "1 1 2013-01-12 00:20:00 operational setting 1 0.0019\n", 414 | "2 1 2013-01-12 00:30:00 operational setting 1 -0.0043\n", 415 | "3 1 2013-01-12 00:40:00 operational setting 1 0.0007\n", 416 | "4 1 2013-01-12 00:50:00 operational setting 1 -0.0019" 417 | ] 418 | }, 419 | "execution_count": 9, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "readings.head()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 10, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "(809448, 4)" 437 | ] 438 | }, 439 | "execution_count": 10, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "readings.shape" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 11, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "turbine_id int64\n", 457 | "timestamp datetime64[ns]\n", 458 | "signal_id object\n", 459 | "value float64\n", 460 | "dtype: object" 461 | ] 462 | }, 463 | "execution_count": 11, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "readings.dtypes" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "### Load your own Dataset" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "Alternatively, if you want to load your own dataset, all you have to do is load the\n", 484 | "`target_times` and `readings` tables as `pandas.DataFrame` objects.\n", 485 | "\n", 486 | "Make sure to parse the corresponding datetime fields!\n", 487 | "\n", 488 | "```python\n", 489 | "import pandas as pd\n", 490 | "\n", 491 | "target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n", 492 | "readings = pd.read_csv('path/to/your/readings.csv', parse_dates=['timestamp'])\n", 493 | "```" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "## 2. Finding the available Pipelines\n", 501 | "\n", 502 | "The next step will be to select a collection of templates from the ones\n", 503 | "available in Draco.\n", 504 | "\n", 505 | "For this, we can use the `draco.get_pipelines` function, which will\n", 506 | "return us the list of all the available MLBlocks pipelines found in the\n", 507 | "Draco system." 
508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 12, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "['dummy',\n", 519 | " 'lstm_regressor_with_unstack',\n", 520 | " 'lstm_regressor',\n", 521 | " 'double_lstm_prob_with_unstack',\n", 522 | " 'double_lstm_prob',\n", 523 | " 'double_lstm',\n", 524 | " 'double_lstm_with_unstack',\n", 525 | " 'lstm_prob_with_unstack',\n", 526 | " 'lstm_with_unstack',\n", 527 | " 'lstm_prob',\n", 528 | " 'lstm']" 529 | ] 530 | }, 531 | "execution_count": 12, 532 | "metadata": {}, 533 | "output_type": "execute_result" 534 | } 535 | ], 536 | "source": [ 537 | "from draco import get_pipelines\n", 538 | "\n", 539 | "get_pipelines()" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "Optionally, we can pass a string to select the pipelines that contain it:" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 13, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "data": { 556 | "text/plain": [ 557 | "['lstm_regressor_with_unstack', 'lstm_regressor']" 558 | ] 559 | }, 560 | "execution_count": 13, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "get_pipelines('regressor')" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "We will use the regression pipeline `lstm_regressor_with_unstack`\n", 574 | "\n", 575 | "The `lstm_regressor_with_unstack` pipeline contains the following steps:\n", 576 | "\n", 577 | "- Resample the data using a 10 minute average aggregation\n", 578 | "- Unstack the data by signal, so each signal is in a different column\n", 579 | "- Impute missing values in the readings table\n", 580 | "- Normalize (scale) the data between [-1, 1].\n", 581 | "- Create window sequences using target times.\n", 582 | "- Apply an LSTM Regressor" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 14, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "pipeline_name = 'lstm_regressor_with_unstack'" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "## 3. Fitting a Draco Pipeline\n", 599 | "\n", 600 | "Once we have loaded the data, we create a **DracoPipeline** instance by passing `pipeline_name` which is the name of a pipeline, the path to a template json file, or a list that can combine both of them." 
601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 15, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "from draco.pipeline import DracoPipeline\n", 610 | "\n", 611 | "init_params = {\n", 612 | " \"keras.Sequential.LSTMTimeSeriesRegressor#1\": {\n", 613 | " \"epochs\": 10\n", 614 | " }\n", 615 | "}\n", 616 | "\n", 617 | "pipeline = DracoPipeline(pipeline_name, init_params=init_params)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "To train a pipeline we use the `fit` method passing the `target_times` and the `readings` table:" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 16, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "name": "stderr", 634 | "output_type": "stream", 635 | "text": [ 636 | "2023-04-07 16:46:35.571262: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", 637 | "2023-04-07 16:46:35.594871: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7ff23c392800 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", 638 | "2023-04-07 16:46:35.594885: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" 639 | ] 640 | }, 641 | { 642 | "name": "stdout", 643 | "output_type": "stream", 644 | "text": [ 645 | "Epoch 1/10\n", 646 | "227/227 [==============================] - 6s 28ms/step - loss: 9064.8613 - mse: 9064.8613 - val_loss: 11566.7559 - val_mse: 11566.7559\n", 647 | "Epoch 2/10\n", 648 | "227/227 [==============================] - 6s 27ms/step - loss: 6775.8911 - mse: 6775.8911 - val_loss: 9392.9561 - val_mse: 9392.9561\n", 649 | "Epoch 3/10\n", 650 | "227/227 [==============================] - 6s 27ms/step - loss: 5391.6719 - mse: 5391.6719 - val_loss: 7923.1221 - val_mse: 7923.1221\n", 651 | "Epoch 4/10\n", 652 | "227/227 [==============================] - 6s 28ms/step - loss: 4524.3457 - mse: 4524.3457 - val_loss: 6955.8647 - val_mse: 6955.8647\n", 653 | "Epoch 5/10\n", 654 | "227/227 [==============================] - 7s 33ms/step - loss: 4040.5396 - mse: 4040.5396 - val_loss: 6356.0605 - val_mse: 6356.0605\n", 655 | "Epoch 6/10\n", 656 | "227/227 [==============================] - 6s 28ms/step - loss: 3802.5298 - mse: 3802.5298 - val_loss: 5998.2061 - val_mse: 5998.2061\n", 657 | "Epoch 7/10\n", 658 | "227/227 [==============================] - 7s 30ms/step - loss: 3683.9429 - mse: 3683.9429 - val_loss: 5790.9092 - val_mse: 5790.9092\n", 659 | "Epoch 8/10\n", 660 | "227/227 [==============================] - 7s 33ms/step - loss: 3636.9177 - mse: 3636.9177 - val_loss: 5674.6558 - val_mse: 5674.6558\n", 661 | "Epoch 9/10\n", 662 | "227/227 [==============================] - 7s 30ms/step - loss: 3609.4973 - mse: 3609.4973 - val_loss: 5619.3926 - val_mse: 5619.3926\n", 663 | "Epoch 10/10\n", 664 | "227/227 [==============================] - 7s 29ms/step - loss: 3617.7119 - mse: 3617.7119 - val_loss: 5587.2671 - val_mse: 5587.2671\n" 665 | ] 666 | } 667 | ], 668 | "source": [ 669 | "pipeline.fit(train_target_times, readings)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": {}, 675 | "source": [ 676 | "## 4. 
Use the fitted pipeline\n", 677 | "\n", 678 | "After fitting the pipeline, we are ready to make predictions on new data:" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 17, 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "name": "stdout", 688 | "output_type": "stream", 689 | "text": [ 690 | "2/2 [==============================] - 0s 3ms/step\n" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "predictions = pipeline.predict(test_target_times, readings)" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": {}, 701 | "source": [ 702 | "And evaluate its prediction performance:" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 18, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "data": { 712 | "text/plain": [ 713 | "-0.1533211964451806" 714 | ] 715 | }, 716 | "execution_count": 18, 717 | "metadata": {}, 718 | "output_type": "execute_result" 719 | } 720 | ], 721 | "source": [ 722 | "from sklearn.metrics import r2_score\n", 723 | "\n", 724 | "r2_score(test_target_times['target'], predictions)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "## 5. Save and load the pipeline\n", 732 | "\n", 733 | "Since the tuning and fitting process takes time to execute and requires a lot of data, you\n", 734 | "will probably want to save a fitted instance and load it later to analyze new signals\n", 735 | "instead of fitting pipelines over and over again.\n", 736 | "\n", 737 | "This can be done by using the `save` and `load` methods from the `DracoPipeline`.\n", 738 | "\n", 739 | "In order to save an instance, call its `save` method passing it the path and filename\n", 740 | "where the model should be saved." 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 19, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "path = 'my_pipeline.pkl'\n", 750 | "\n", 751 | "pipeline.save(path)" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "Once the pipeline is saved, it can be loaded back as a new `DracoPipeline` by using the\n", 759 | "`DracoPipeline.load` method:" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": 20, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [ 768 | "new_pipeline = DracoPipeline.load(path)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": {}, 774 | "source": [ 775 | "Once loaded, it can be directly used to make predictions on new data." 
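The reloaded pipeline produces the same kind of predictions as the original instance, so it can be scored with any standard regression metric, not only the R² used in section 4. A minimal illustrative sketch using scikit-learn only (it reuses `test_target_times` and a `predictions` array as produced by either pipeline; no Draco-specific API is involved):

```python
# Illustrative only: additional regression metrics for the predictions,
# computed against the same test targets as the r2_score cell above.
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(test_target_times['target'], predictions)
rmse = np.sqrt(mean_squared_error(test_target_times['target'], predictions))
print(f'MAE: {mae:.2f}  RMSE: {rmse:.2f}')
```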
776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 21, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "2/2 [==============================] - 0s 5ms/step\n" 788 | ] 789 | }, 790 | { 791 | "data": { 792 | "text/plain": [ 793 | "array([[91.7917 ],\n", 794 | " [91.791695],\n", 795 | " [91.79166 ],\n", 796 | " [91.79167 ],\n", 797 | " [91.79167 ]], dtype=float32)" 798 | ] 799 | }, 800 | "execution_count": 21, 801 | "metadata": {}, 802 | "output_type": "execute_result" 803 | } 804 | ], 805 | "source": [ 806 | "predictions = new_pipeline.predict(test_target_times, readings)\n", 807 | "predictions[0:5]" 808 | ] 809 | } 810 | ], 811 | "metadata": { 812 | "kernelspec": { 813 | "display_name": "Python 3 (ipykernel)", 814 | "language": "python", 815 | "name": "python3" 816 | }, 817 | "language_info": { 818 | "codemirror_mode": { 819 | "name": "ipython", 820 | "version": 3 821 | }, 822 | "file_extension": ".py", 823 | "mimetype": "text/x-python", 824 | "name": "python", 825 | "nbconvert_exporter": "python", 826 | "pygments_lexer": "ipython3", 827 | "version": "3.8.16" 828 | } 829 | }, 830 | "nbformat": 4, 831 | "nbformat_minor": 2 832 | } 833 | -------------------------------------------------------------------------------- /tutorials/Convert NASA CMAPSS to Draco Format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2f3d8acf", 6 | "metadata": {}, 7 | "source": [ 8 | "# Convert CMAPSS to Draco Format\n", 9 | "\n", 10 | "In this notebook we download [CMAPSS](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan) data and reformat it as Draco pipelines expect." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "f39b805c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import datetime\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "\n", 24 | "import matplotlib.pyplot as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "626a2da0", 30 | "metadata": {}, 31 | "source": [ 32 | "## 1. Download Data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "ff641cff", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import io\n", 43 | "import os\n", 44 | "import urllib\n", 45 | "import zipfile\n", 46 | "\n", 47 | "DATA_URL = 'https://d3-ai-greenguard.s3.amazonaws.com/CMAPSSData.zip'\n", 48 | "\n", 49 | "response = urllib.request.urlopen(DATA_URL)\n", 50 | "bytes_io = io.BytesIO(response.read())\n", 51 | "\n", 52 | "with zipfile.ZipFile(bytes_io) as zf:\n", 53 | " zf.extractall('CMAPSSData')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "9c435699", 59 | "metadata": {}, 60 | "source": [ 61 | "## 2. 
Read Data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "id": "1bb002ac", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# columns\n", 72 | "\n", 73 | "index = ['unit number', 'time, in cycles']\n", 74 | "setting = ['operational setting {}'.format(i + 1) for i in range(0, 3)]\n", 75 | "sensor = ['sensor measurement {}'.format(i + 1) for i in range(0, 21)]\n", 76 | "\n", 77 | "all_columns = index + setting + sensor" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "74478b0f", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "train = pd.read_csv('CMAPSSData/train_FD001.txt', sep=' ', header=None)\n", 88 | "train = train.dropna(axis=1)\n", 89 | "train.columns = all_columns\n", 90 | "\n", 91 | "test = pd.read_csv('CMAPSSData/test_FD001.txt', sep=' ', header=None)\n", 92 | "test = test.dropna(axis=1)\n", 93 | "test.columns = all_columns\n", 94 | "\n", 95 | "y_test = pd.read_csv('CMAPSSData/RUL_FD001.txt', sep=' ', header=None)\n", 96 | "y_test = y_test.dropna(axis=1)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "dd480185", 102 | "metadata": {}, 103 | "source": [ 104 | "## 3. Create columns\n", 105 | "\n", 106 | "### 3.a create `RUL` column\n", 107 | "How do we create **Remaining Useful Life (RUL)** column for the training dataset? We can assume that the last entry in the training dataset is the maximum life expectancy for that unit. Then each cycle we have will decrease by that number." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "id": "eb0270ba", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "def get_max(x):\n", 118 | " return cycles_max[x]\n", 119 | "\n", 120 | "cycles_max = train.groupby(\"unit number\")[\"time, in cycles\"].max().to_dict()\n", 121 | "cycles_max = train['unit number'].apply(get_max)\n", 122 | "\n", 123 | "train['RUL'] = cycles_max - train[\"time, in cycles\"]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "57fbd3b9", 129 | "metadata": {}, 130 | "source": [ 131 | "### 3.b create `cutoff_time` column\n", 132 | "`cutoff_time` is a datetime column with relation to the `cycle` number. We pick a start date and start incrementing from there." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "id": "3e320356", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "def get_timestamp(x):\n", 143 | " return start + datetime.timedelta(minutes=x * 10)\n", 144 | "\n", 145 | "start = datetime.datetime(2013, 1, 12)\n", 146 | "train['timestamp'] = train['time, in cycles'].apply(get_timestamp)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "id": "11f78b71", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def get_timestamp_test(x):\n", 157 | " return last[x['unit number']] + datetime.timedelta(minutes=x['time, in cycles'] * 10)\n", 158 | "\n", 159 | "last = train.groupby('unit number').last()['timestamp'].to_dict()\n", 160 | "test['timestamp'] = test.apply(get_timestamp_test, axis=1)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "95bec88f", 166 | "metadata": {}, 167 | "source": [ 168 | "### 4. Format Data\n", 169 | "\n", 170 | "make `label_times` have three columns, namely: `['turbine_id', 'cutoff_time', 'target']`." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 8, 176 | "id": "1ce4320e", 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/html": [ 182 | "
\n", 183 | "\n", 196 | "\n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | "
turbine_idcutoff_timetarget
2512013-01-12 04:20:00166
2612013-01-12 04:30:00165
2712013-01-12 04:40:00164
2812013-01-12 04:50:00163
2912013-01-12 05:00:00162
\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | " turbine_id cutoff_time target\n", 242 | "25 1 2013-01-12 04:20:00 166\n", 243 | "26 1 2013-01-12 04:30:00 165\n", 244 | "27 1 2013-01-12 04:40:00 164\n", 245 | "28 1 2013-01-12 04:50:00 163\n", 246 | "29 1 2013-01-12 05:00:00 162" 247 | ] 248 | }, 249 | "execution_count": 8, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "train_label_times = train[['unit number', 'timestamp', 'RUL']].copy()\n", 256 | "train_label_times.columns = ['turbine_id', 'cutoff_time', 'target']\n", 257 | "\n", 258 | "# drop first 24 occurances\n", 259 | "train_label_times = train_label_times[train_label_times.groupby('turbine_id').cumcount('turbine_id') > 24]\n", 260 | "train_label_times.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 9, 266 | "id": "f320e753", 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | "
turbine_idcutoff_timetarget
012013-01-13 13:10:00112.0
122013-01-14 08:00:0098.0
232013-01-14 02:50:0069.0
342013-01-14 01:10:0082.0
452013-01-14 13:10:0091.0
\n", 328 | "
" 329 | ], 330 | "text/plain": [ 331 | " turbine_id cutoff_time target\n", 332 | "0 1 2013-01-13 13:10:00 112.0\n", 333 | "1 2 2013-01-14 08:00:00 98.0\n", 334 | "2 3 2013-01-14 02:50:00 69.0\n", 335 | "3 4 2013-01-14 01:10:00 82.0\n", 336 | "4 5 2013-01-14 13:10:00 91.0" 337 | ] 338 | }, 339 | "execution_count": 9, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "test_label_times = test[['unit number', 'timestamp']].groupby('unit number').last().reset_index()\n", 346 | "test_label_times.columns = ['turbine_id', 'cutoff_time']\n", 347 | "test_label_times['target'] = np.array(y_test).astype('float32')\n", 348 | "test_label_times.head()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 10, 354 | "id": "50be8dc4", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "reading_columns = ['unit number', 'timestamp'] + setting + sensor\n", 359 | "readings = pd.concat([train, test])[reading_columns]\n", 360 | "readings = readings.melt(id_vars=['unit number', 'timestamp'])\n", 361 | "readings.columns = ['turbine_id', 'timestamp', 'signal_id', 'value']" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "id": "01a77e60", 367 | "metadata": {}, 368 | "source": [ 369 | "## 5. Save Data" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 11, 375 | "id": "5f622ff7", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "readings.to_csv('rul_readings.csv.gz', compression='gzip', index=False)\n", 380 | "train_label_times.to_csv('rul_train_target_times.csv.gz', compression='gzip', index=False)\n", 381 | "test_label_times.to_csv('rul_test_target_times.csv.gz', compression='gzip', index=False)" 382 | ] 383 | } 384 | ], 385 | "metadata": { 386 | "kernelspec": { 387 | "display_name": "Python 3 (ipykernel)", 388 | "language": "python", 389 | "name": "python3" 390 | }, 391 | "language_info": { 392 | "codemirror_mode": { 393 | "name": "ipython", 394 | "version": 3 395 | }, 396 | "file_extension": ".py", 397 | "mimetype": "text/x-python", 398 | "name": "python", 399 | "nbconvert_exporter": "python", 400 | "pygments_lexer": "ipython3", 401 | "version": "3.7.11" 402 | } 403 | }, 404 | "nbformat": 4, 405 | "nbformat_minor": 5 406 | } 407 | --------------------------------------------------------------------------------