├── tests
    ├── __init__.py
    ├── run.py
    ├── project_based_test.py
    ├── data
    │   ├── test_1_vald.csv
    │   └── test_1.csv
    ├── project_client_test.py
    ├── result_client_test.py
    ├── dataset_client_test.py
    ├── experiment_client_test.py
    └── mljar_test.py
├── mljar
    ├── model
    │   ├── __init__.py
    │   ├── base.py
    │   ├── prediction.py
    │   ├── dataset.py
    │   ├── project.py
    │   ├── result.py
    │   └── experiment.py
    ├── client
    │   ├── __init__.py
    │   ├── result.py
    │   ├── prediction.py
    │   ├── predictjob.py
    │   ├── prediction_download.py
    │   ├── dataupload.py
    │   ├── project.py
    │   ├── base.py
    │   ├── experiment.py
    │   └── dataset.py
    ├── log.py
    ├── __init__.py
    ├── exceptions.py
    ├── utils.py
    └── mljar.py
├── requirements.txt
├── .travis.yml
├── setup.py
├── .gitignore
├── README.md
├── README.rst
└── LICENSE


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/mljar/model/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/mljar/client/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.2.1
2 | marshmallow>=2.12.1
3 | numpy==1.14.2
4 | pandas==0.22.0
5 | future
6 | 


--------------------------------------------------------------------------------
/mljar/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logger = logging.getLogger('mljar')
4 | logging.basicConfig(format='%(message)s', level=logging.ERROR)
5 | 


--------------------------------------------------------------------------------
/mljar/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | __version__ = '0.0.8'
3 | API_VERSION = 'v1'
4 | MLJAR_ENDPOINT = 'https://mljar.com/api'
5 | 
6 | from .mljar import Mljar
7 | 


--------------------------------------------------------------------------------
/mljar/model/base.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | class BaseModel(object):
 4 | 
 5 |     def to_dict(self):
 6 |         return self.schema.dump(self).data
 7 | 
 8 |     @classmethod
 9 |     def from_dict(cls, dct):
10 |         return cls.schema.load(dct).data
11 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "2.7"
 4 |   - "3.6"
 5 | install:
 6 |   - pip install -U -r requirements.txt
 7 |   - pip install coveralls
 8 | script:
 9 |   coverage run --source='mljar' -m tests.run -v
10 | after_success:
11 |   coveralls
12 | 


--------------------------------------------------------------------------------
/tests/run.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | MLJAR unit tests.
 3 | '''
 4 | import os
 5 | import unittest
 6 | 
 7 | from .project_client_test import ProjectClientTest
 8 | from .dataset_client_test import DatasetClientTest
 9 | from .experiment_client_test import ExperimentClientTest
10 | from .result_client_test import ResultClientTest
11 | from .mljar_test import MljarTest
12 | 
13 | if __name__ == '__main__':
14 |     unittest.main()
15 | 


--------------------------------------------------------------------------------
/tests/project_based_test.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | ProjectBasedTest tests.
 3 | '''
 4 | import os
 5 | import unittest
 6 | import pandas as pd
 7 | import sys
 8 | 
 9 | from mljar.client.project import ProjectClient
10 | 
11 | def get_postfix():
12 |     return '-v-'+str(sys.version_info.major)
13 | 
14 | class ProjectBasedTest(unittest.TestCase):
15 | 
16 |     @staticmethod
17 |     def clean_projects():
18 |         project_client = ProjectClient()
19 |         projects = project_client.get_projects()
20 |         for proj in projects:
21 |             if proj.title.startswith('Test') and proj.title.endswith(get_postfix()):
22 |                 project_client.delete_project(proj.hid)
23 | 
24 |     @classmethod
25 |     def setUpClass(cls):
26 |         ProjectBasedTest.clean_projects()
27 | 
28 |     @classmethod
29 |     def tearDownClass(cls):
30 |         ProjectBasedTest.clean_projects()
31 | 


--------------------------------------------------------------------------------
/mljar/client/result.py:
--------------------------------------------------------------------------------
 1 | from .base import MljarHttpClient
 2 | from ..model.result import Result
 3 | from ..exceptions import NotFoundException
 4 | 
 5 | class ResultClient(MljarHttpClient):
 6 |     '''
 7 |     Client to interact with MLJAR results (models).
 8 |     '''
 9 |     def __init__(self, project_hid):
10 |         self.url = "/results/"
11 |         self.project_hid = project_hid
12 |         super(ResultClient, self).__init__()
13 | 
14 |     def get_results(self, experiment_hid = None):
15 |         '''
16 |         List all models.
17 |         '''
18 |         data = {'project_id': self.project_hid}
19 |         if experiment_hid is not None:
20 |             data['experiment_id'] = experiment_hid
21 |         response = self.request("POST", self.url, data = data)
22 |         results_dict = response.json()
23 |         return [Result.from_dict(r) for r in results_dict]
24 | 


--------------------------------------------------------------------------------
/mljar/client/prediction.py:
--------------------------------------------------------------------------------
 1 | from .base import MljarHttpClient
 2 | from ..model.prediction import Prediction
 3 | from ..exceptions import NotFoundException
 4 | 
 5 | class PredictionClient(MljarHttpClient):
 6 |     '''
 7 |     Client to interact with MLJAR results (models).
 8 |     '''
 9 |     def __init__(self, project_hid):
10 |         self.url = "/predictions"
11 |         self.project_hid = project_hid
12 |         super(PredictionClient, self).__init__()
13 | 
14 |     def get_prediction(self, dataset_hid, result_hid):
15 |         '''
16 |         Get prediction.
17 |         '''
18 |         response = self.request("GET", self.url + '?project_id=' + self.project_hid + '&dataset_id='+dataset_hid+'&result_id='+result_hid)
19 |         predictions_dict = response.json()
20 |         if len(predictions_dict) == 1:
21 |             return Prediction.from_dict(predictions_dict[0])
22 |         return None
23 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from codecs import open
 3 | from os import path
 4 | 
 5 | here = path.abspath(path.dirname(__file__))
 6 | 
 7 | # Get the long description from the README file
 8 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
 9 |     long_description = f.read()
10 | 
11 | setup(
12 |     name='mljar',
13 |     version='0.1.0',
14 |     description='Python wrapper over MLJAR API',
15 |     long_description=long_description,
16 |     url='https://github.com/mljar/mljar-api-python',
17 |     author='Piotr Plonski',
18 |     author_email='contact@mljar.com',
19 |     license='Apache-2.0',
20 |     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
21 |     install_requires=['requests', 'marshmallow'],
22 |     classifiers=[
23 |         'Programming Language :: Python',
24 |         'Programming Language :: Python :: 2.7',
25 |         'Programming Language :: Python :: 3.6'
26 |     ]
27 | )
28 | 


--------------------------------------------------------------------------------
/mljar/client/predictjob.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from .base import MljarHttpClient
 3 | from ..exceptions import FileUploadException
 4 | 
 5 | from ..log import logger
 6 | 
 7 | class PredictJobClient(MljarHttpClient):
 8 |     '''
 9 |     Client to submit predict job in MLJAR.
10 |     '''
11 |     def __init__(self):
12 |         self.url = "/predict/"
13 |         super(PredictJobClient, self).__init__()
14 | 
15 | 
16 |     def submit(self, project_hid, dataset_hid, result_hid):
17 |         data =  {
18 |                     'predict_params' : json.dumps({'project_id': project_hid,
19 |                                                     'project_hardware': 'cloud',
20 |                                                     'algorithms_ids': [result_hid],
21 |                                                     'dataset_id': dataset_hid,
22 |                                                     'cv_models':1})
23 |                 }
24 |         response = self.request("POST", self.url, data = data, parse_json = False)
25 |         return response.status_code == 200
26 | 


--------------------------------------------------------------------------------
/mljar/exceptions.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class MljarException(Exception):
 3 |     """Base exception class for this module"""
 4 |     pass
 5 | 
 6 | class TokenException(MljarException):
 7 |     pass
 8 | 
 9 | class DataReadException(MljarException):
10 |     pass
11 | 
12 | class JSONReadException(MljarException):
13 |     pass
14 | 
15 | class NotFoundException(MljarException):
16 |     pass
17 | 
18 | class AuthenticationException(MljarException):
19 |     pass
20 | 
21 | class BadRequestException(MljarException):
22 |     pass
23 | 
24 | class BadValueException(MljarException):
25 |     pass
26 | 
27 | class UnknownProjectTask(MljarException):
28 |     pass
29 | 
30 | class IncorrectInputDataException(MljarException):
31 |     pass
32 | 
33 | class FileUploadException(MljarException):
34 |     pass
35 | 
36 | class CreateProjectException(MljarException):
37 |     pass
38 | 
39 | class CreateDatasetException(MljarException):
40 |     pass
41 | 
42 | class CreateExperimentException(MljarException):
43 |     pass
44 | 
45 | class UndefinedExperimentException(MljarException):
46 |     pass
47 | 
48 | class DatasetUnknownException(MljarException):
49 |     pass
50 | 
51 | class PredictionDownloadException(MljarException):
52 |     pass    
53 | 


--------------------------------------------------------------------------------
/mljar/client/prediction_download.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import uuid
 3 | import tempfile
 4 | import pandas as pd
 5 | from .base import MljarHttpClient
 6 | from ..exceptions import PredictionDownloadException
 7 | 
 8 | from ..log import logger
 9 | 
10 | class PredictionDownloadClient(MljarHttpClient):
11 |     '''
12 |     Client to get predictions from MLJAR.
13 |     '''
14 |     def __init__(self):
15 |         self.url = "/download/prediction/"
16 |         super(PredictionDownloadClient, self).__init__()
17 | 
18 |     def download(self, prediction_hid):
19 |         response = self.request("POST", self.url, data = {"prediction_id": prediction_hid}, parse_json=False)
20 |         pred = None
21 |         try:
22 |             tmp_file = os.path.join(tempfile.gettempdir(), 'mljar_prediction_' + str(uuid.uuid4()) + '.csv')
23 |             with open(tmp_file, 'wb') as f:
24 |                 for chunk in response.iter_content(chunk_size=1024):
25 |                     if chunk: # filter out keep-alive new chunks
26 |                         f.write(chunk)
27 |             pred = pd.read_csv(tmp_file)
28 |             os.remove(tmp_file)
29 |         except Exception as e:
30 |             raise PredictionDownloadException(str(e))
31 |         return pred
32 | 


--------------------------------------------------------------------------------
/mljar/client/dataupload.py:
--------------------------------------------------------------------------------
 1 | from .base import MljarHttpClient
 2 | from ..model.dataset import Dataset
 3 | from ..exceptions import FileUploadException
 4 | 
 5 | from ..log import logger
 6 | 
 7 | class DataUploadClient(MljarHttpClient):
 8 |     '''
 9 |     Client to upload data into MLJAR.
10 |     '''
11 |     def __init__(self):
12 |         self.url = "/s3policy/"
13 |         super(DataUploadClient, self).__init__()
14 | 
15 |     def _get_signed_url(self, project_hid, file_path):
16 |         data = {'project_hid':project_hid, 'fname': file_path.split('/')[-1]}
17 |         response = self.request("POST", self.url, data = data)
18 |         return response.json()
19 | 
20 |     def upload_file(self, project_hid, file_path):
21 |         logger.info('File upload started')
22 |         url_data = self._get_signed_url(project_hid, file_path)
23 |         signed_url = url_data['signed_url']
24 |         dst_path   = url_data['destination_path']
25 |         with open(file_path, 'rb') as fin:
26 |             response = self.request("PUT", signed_url, data=fin.read(),
27 |                                             with_header=False, url_outside_mljar=True,
28 |                                             parse_json=False)
29 |             if response.status_code != 200:
30 |                 raise FileUploadException('There was a problem with data upload into MLJAR')
31 |         return dst_path
32 | 


--------------------------------------------------------------------------------
/mljar/model/prediction.py:
--------------------------------------------------------------------------------
 1 | from marshmallow import Schema, fields, post_load
 2 | 
 3 | from .base import BaseModel
 4 | 
 5 | class PredictionSchema(Schema):
 6 |     hid = fields.Str()
 7 |     scope = fields.Str()
 8 |     created_by = fields.Number()
 9 |     created_at = fields.DateTime()
10 |     parent_alg_hid = fields.Str()
11 |     prediction_on_dataset_title = fields.Str()
12 |     alg_name = fields.Str()
13 |     alg_on_dataset_title = fields.Str()
14 |     alg_metric = fields.Str()
15 | 
16 |     @post_load
17 |     def make_prediction_instance(self, data):
18 |         return Prediction(**data)
19 | 
20 | class Prediction(BaseModel):
21 |     schema = PredictionSchema(strict=True)
22 | 
23 |     def __init__(self, hid, scope, created_by, created_at, parent_alg_hid,
24 |                     prediction_on_dataset_title, alg_name, alg_on_dataset_title,
25 |                     alg_metric):
26 |         self.hid = hid
27 |         self.scope = scope
28 |         self.created_by = created_by
29 |         self.created_at = created_at
30 |         self.parent_alg_hid = parent_alg_hid
31 |         self.prediction_on_dataset_title = prediction_on_dataset_title
32 |         self.alg_name = alg_name
33 |         self.alg_on_dataset_title = alg_on_dataset_title
34 |         self.alg_metric = alg_metric
35 | 
36 |     def __str__(self):
37 |         desc = 'Prediction id: {}\n'.format(self.hid)
38 |         return desc
39 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/tests/data/test_1_vald.csv:
--------------------------------------------------------------------------------
 1 | sepal length,sepal width,petal length,petal width,class
 2 | 5.1,3.5,1.4,0.2,0
 3 | 4.9,3.0,1.4,0.2,0
 4 | 4.7,3.2,1.3,0.2,0
 5 | 4.6,3.1,1.5,0.2,0
 6 | 5.0,3.6,1.4,0.2,0
 7 | 5.4,3.9,1.7,0.4,0
 8 | 4.6,3.4,1.4,0.3,0
 9 | 5.0,3.4,1.5,0.2,0
10 | 4.4,2.9,1.4,0.2,0
11 | 4.9,3.1,1.5,0.1,0
12 | 5.4,3.7,1.5,0.2,0
13 | 4.8,3.4,1.6,0.2,0
14 | 4.8,3.0,1.4,0.1,0
15 | 4.3,3.0,1.1,0.1,0
16 | 5.8,4.0,1.2,0.2,0
17 | 5.7,4.4,1.5,0.4,0
18 | 5.4,3.9,1.3,0.4,0
19 | 5.1,3.5,1.4,0.3,0
20 | 5.7,3.8,1.7,0.3,0
21 | 5.1,3.8,1.5,0.3,0
22 | 5.4,3.4,1.7,0.2,0
23 | 5.1,3.7,1.5,0.4,0
24 | 4.6,3.6,1.0,0.2,0
25 | 5.1,3.3,1.7,0.5,0
26 | 4.8,3.4,1.9,0.2,0
27 | 5.0,3.0,1.6,0.2,0
28 | 5.0,3.4,1.6,0.4,0
29 | 5.2,3.5,1.5,0.2,0
30 | 5.2,3.4,1.4,0.2,0
31 | 4.7,3.2,1.6,0.2,0
32 | 4.8,3.1,1.6,0.2,0
33 | 5.4,3.4,1.5,0.4,0
34 | 5.2,4.1,1.5,0.1,0
35 | 5.5,4.2,1.4,0.2,0
36 | 4.9,3.1,1.5,0.1,0
37 | 5.0,3.2,1.2,0.2,0
38 | 5.5,3.5,1.3,0.2,0
39 | 4.9,3.1,1.5,0.1,0
40 | 4.4,3.0,1.3,0.2,0
41 | 5.1,3.4,1.5,0.2,0
42 | 5.0,3.5,1.3,0.3,0
43 | 4.5,2.3,1.3,0.3,0
44 | 4.4,3.2,1.3,0.2,0
45 | 5.0,3.5,1.6,0.6,0
46 | 5.1,3.8,1.9,0.4,0
47 | 5.7,2.8,4.5,1.3,1
48 | 6.3,3.3,4.7,1.6,1
49 | 4.9,2.4,3.3,1.0,1
50 | 6.6,2.9,4.6,1.3,1
51 | 5.2,2.7,3.9,1.4,1
52 | 5.0,2.0,3.5,1.0,1
53 | 5.9,3.0,4.2,1.5,1
54 | 6.0,2.2,4.0,1.0,1
55 | 6.1,2.9,4.7,1.4,1
56 | 5.6,2.9,3.6,1.3,1
57 | 6.7,3.1,4.4,1.4,1
58 | 5.6,3.0,4.5,1.5,1
59 | 5.8,2.7,4.1,1.0,1
60 | 6.2,2.2,4.5,1.5,1
61 | 5.6,2.5,3.9,1.1,1
62 | 5.9,3.2,4.8,1.8,1
63 | 6.1,2.8,4.0,1.3,1
64 | 6.3,2.5,4.9,1.5,1
65 | 6.1,2.8,4.7,1.2,1
66 | 6.4,2.9,4.3,1.3,1
67 | 6.6,3.0,4.4,1.4,1
68 | 6.8,2.8,4.8,1.4,1
69 | 6.7,3.0,5.0,1.7,1
70 | 6.0,2.9,4.5,1.5,1
71 | 5.7,2.6,3.5,1.0,1
72 | 5.5,2.4,3.8,1.1,1
73 | 5.5,2.4,3.7,1.0,1
74 | 5.8,2.7,3.9,1.2,1
75 | 6.0,2.7,5.1,1.6,1
76 | 5.4,3.0,4.5,1.5,1
77 | 6.0,3.4,4.5,1.6,1
78 | 6.7,3.1,4.7,1.5,1
79 | 6.3,2.3,4.4,1.3,1
80 | 5.6,3.0,4.1,1.3,1
81 | 5.5,2.5,4.0,1.3,1
82 | 5.5,2.6,4.4,1.2,1
83 | 6.1,3.0,4.6,1.4,1
84 | 5.8,2.6,4.0,1.2,1
85 | 5.0,2.3,3.3,1.0,1
86 | 5.6,2.7,4.2,1.3,1
87 | 5.7,3.0,4.2,1.2,1
88 | 5.7,2.9,4.2,1.3,1
89 | 6.2,2.9,4.3,1.3,1
90 | 5.1,2.5,3.0,1.1,1
91 | 5.7,2.8,4.1,1.3,1
92 | 


--------------------------------------------------------------------------------
/mljar/model/dataset.py:
--------------------------------------------------------------------------------
 1 | from marshmallow import Schema, fields, post_load
 2 | 
 3 | from .base import BaseModel
 4 | 
 5 | class DatasetSchema(Schema):
 6 |     hid = fields.Str()
 7 |     title = fields.Str()
 8 |     scope = fields.Str()
 9 |     created_at = fields.DateTime(allow_none=True)
10 |     created_by = fields.Number(allow_none=True)
11 |     parent_project = fields.Number(allow_none=True)
12 |     data_type = fields.Str()
13 |     dataset_hash = fields.Str()
14 |     file_name = fields.Str()
15 |     file_path = fields.Str()
16 |     file_size = fields.Str()
17 |     meta = fields.List(fields.Dict(), allow_none=True)
18 |     prediction_only = fields.Number()
19 |     accepted = fields.Number()
20 |     checked = fields.Number()
21 |     derived = fields.Number()
22 |     valid = fields.Number()
23 |     text_msg = fields.Str(allow_none=True)
24 |     column_usage_min = fields.Dict(allow_none=True)
25 | 
26 |     @post_load
27 |     def make_project_instance(self, data):
28 |         return Dataset(**data)
29 | 
30 | class Dataset(BaseModel):
31 |     schema = DatasetSchema(strict=True)
32 | 
33 |     def __init__(self, hid, title, scope, data_type,
34 |                     file_name, file_path, file_size, meta, prediction_only,
35 |                     accepted, checked, derived, valid, text_msg, dataset_hash,
36 |                     column_usage_min, created_at = None, created_by = None, parent_project = None):
37 |         self.hid = hid
38 |         self.title = title
39 |         self.scope = scope
40 |         self.created_at = created_at
41 |         self.created_by = created_by
42 |         self.parent_project = parent_project
43 |         self.data_type = data_type
44 |         self.dataset_hash = dataset_hash
45 |         self.file_name = file_name
46 |         self.file_path = file_path
47 |         self.file_size = file_size
48 |         self.meta = meta
49 |         self.prediction_only = prediction_only
50 |         self.accepted = accepted
51 |         self.checked = checked
52 |         self.derived = derived
53 |         self.valid = valid
54 |         self.text_msg = text_msg
55 |         self.column_usage_min = column_usage_min
56 | 
57 |     def __str__(self):
58 |         desc = 'Dataset id: {} title: {} file: {}\n'.format(self.hid, self.title, self.file_name)
59 |         desc += 'File size: {} accepted column usage: {}\n'.format(self.file_size, self.accepted)
60 |         return desc
61 | 


--------------------------------------------------------------------------------
/tests/data/test_1.csv:
--------------------------------------------------------------------------------
  1 | sepal length,sepal width,petal length,petal width,class
  2 | 5.1,3.5,1.4,0.2,0
  3 | 4.9,3.0,1.4,0.2,0
  4 | 4.7,3.2,1.3,0.2,0
  5 | 4.6,3.1,1.5,0.2,0
  6 | 5.0,3.6,1.4,0.2,0
  7 | 5.4,3.9,1.7,0.4,0
  8 | 4.6,3.4,1.4,0.3,0
  9 | 5.0,3.4,1.5,0.2,0
 10 | 4.4,2.9,1.4,0.2,0
 11 | 4.9,3.1,1.5,0.1,0
 12 | 5.4,3.7,1.5,0.2,0
 13 | 4.8,3.4,1.6,0.2,0
 14 | 4.8,3.0,1.4,0.1,0
 15 | 4.3,3.0,1.1,0.1,0
 16 | 5.8,4.0,1.2,0.2,0
 17 | 5.7,4.4,1.5,0.4,0
 18 | 5.4,3.9,1.3,0.4,0
 19 | 5.1,3.5,1.4,0.3,0
 20 | 5.7,3.8,1.7,0.3,0
 21 | 5.1,3.8,1.5,0.3,0
 22 | 5.4,3.4,1.7,0.2,0
 23 | 5.1,3.7,1.5,0.4,0
 24 | 4.6,3.6,1.0,0.2,0
 25 | 5.1,3.3,1.7,0.5,0
 26 | 4.8,3.4,1.9,0.2,0
 27 | 5.0,3.0,1.6,0.2,0
 28 | 5.0,3.4,1.6,0.4,0
 29 | 5.2,3.5,1.5,0.2,0
 30 | 5.2,3.4,1.4,0.2,0
 31 | 4.7,3.2,1.6,0.2,0
 32 | 4.8,3.1,1.6,0.2,0
 33 | 5.4,3.4,1.5,0.4,0
 34 | 5.2,4.1,1.5,0.1,0
 35 | 5.5,4.2,1.4,0.2,0
 36 | 4.9,3.1,1.5,0.1,0
 37 | 5.0,3.2,1.2,0.2,0
 38 | 5.5,3.5,1.3,0.2,0
 39 | 4.9,3.1,1.5,0.1,0
 40 | 4.4,3.0,1.3,0.2,0
 41 | 5.1,3.4,1.5,0.2,0
 42 | 5.0,3.5,1.3,0.3,0
 43 | 4.5,2.3,1.3,0.3,0
 44 | 4.4,3.2,1.3,0.2,0
 45 | 5.0,3.5,1.6,0.6,0
 46 | 5.1,3.8,1.9,0.4,0
 47 | 4.8,3.0,1.4,0.3,0
 48 | 5.1,3.8,1.6,0.2,0
 49 | 4.6,3.2,1.4,0.2,0
 50 | 5.3,3.7,1.5,0.2,0
 51 | 5.0,3.3,1.4,0.2,0
 52 | 7.0,3.2,4.7,1.4,1
 53 | 6.4,3.2,4.5,1.5,1
 54 | 6.9,3.1,4.9,1.5,1
 55 | 5.5,2.3,4.0,1.3,1
 56 | 6.5,2.8,4.6,1.5,1
 57 | 5.7,2.8,4.5,1.3,1
 58 | 6.3,3.3,4.7,1.6,1
 59 | 4.9,2.4,3.3,1.0,1
 60 | 6.6,2.9,4.6,1.3,1
 61 | 5.2,2.7,3.9,1.4,1
 62 | 5.0,2.0,3.5,1.0,1
 63 | 5.9,3.0,4.2,1.5,1
 64 | 6.0,2.2,4.0,1.0,1
 65 | 6.1,2.9,4.7,1.4,1
 66 | 5.6,2.9,3.6,1.3,1
 67 | 6.7,3.1,4.4,1.4,1
 68 | 5.6,3.0,4.5,1.5,1
 69 | 5.8,2.7,4.1,1.0,1
 70 | 6.2,2.2,4.5,1.5,1
 71 | 5.6,2.5,3.9,1.1,1
 72 | 5.9,3.2,4.8,1.8,1
 73 | 6.1,2.8,4.0,1.3,1
 74 | 6.3,2.5,4.9,1.5,1
 75 | 6.1,2.8,4.7,1.2,1
 76 | 6.4,2.9,4.3,1.3,1
 77 | 6.6,3.0,4.4,1.4,1
 78 | 6.8,2.8,4.8,1.4,1
 79 | 6.7,3.0,5.0,1.7,1
 80 | 6.0,2.9,4.5,1.5,1
 81 | 5.7,2.6,3.5,1.0,1
 82 | 5.5,2.4,3.8,1.1,1
 83 | 5.5,2.4,3.7,1.0,1
 84 | 5.8,2.7,3.9,1.2,1
 85 | 6.0,2.7,5.1,1.6,1
 86 | 5.4,3.0,4.5,1.5,1
 87 | 6.0,3.4,4.5,1.6,1
 88 | 6.7,3.1,4.7,1.5,1
 89 | 6.3,2.3,4.4,1.3,1
 90 | 5.6,3.0,4.1,1.3,1
 91 | 5.5,2.5,4.0,1.3,1
 92 | 5.5,2.6,4.4,1.2,1
 93 | 6.1,3.0,4.6,1.4,1
 94 | 5.8,2.6,4.0,1.2,1
 95 | 5.0,2.3,3.3,1.0,1
 96 | 5.6,2.7,4.2,1.3,1
 97 | 5.7,3.0,4.2,1.2,1
 98 | 5.7,2.9,4.2,1.3,1
 99 | 6.2,2.9,4.3,1.3,1
100 | 5.1,2.5,3.0,1.1,1
101 | 5.7,2.8,4.1,1.3,1
102 | 


--------------------------------------------------------------------------------
/mljar/model/project.py:
--------------------------------------------------------------------------------
 1 | from marshmallow import Schema, fields, post_load
 2 | 
 3 | from .base import BaseModel
 4 | 
 5 | from ..exceptions import UnknownProjectTask
 6 | 
 7 | class ProjectSchema(Schema):
 8 |     hid = fields.Str()
 9 |     title = fields.Str()
10 |     description = fields.Str(allow_none=True)
11 |     task = fields.Str()
12 |     hardware = fields.Str()
13 |     scope = fields.Str()
14 |     info = fields.Dict(allow_none=True)
15 |     created_at = fields.DateTime()
16 |     created_by = fields.Number()
17 |     experiments_cnt = fields.Number()
18 |     models_cnt = fields.Number()
19 |     datasets = fields.List(fields.Dict(), allow_none=True)
20 |     topalg = fields.List(fields.Dict(), allow_none=True)
21 |     total_timelog = fields.Number(allow_none=True)
22 |     compute_now = fields.Number()
23 |     insights = fields.List(fields.Dict(), allow_none=True)
24 | 
25 |     @post_load
26 |     def make_project_instance(self, data):
27 |         return Project(**data)
28 | 
29 | class Project(BaseModel):
30 |     schema = ProjectSchema(strict=True)
31 | 
32 |     def __init__(self, hid, title, description, task, hardware, scope, created_at, created_by,
33 |                     models_cnt, compute_now, experiments_cnt = None, datasets = None, topalg = None,
34 |                     insights = None, total_timelog = 0, info = None):
35 |         self.hid = hid
36 |         self.title = title
37 |         self.description = description
38 |         self.task = task
39 |         self.info = info
40 |         self.created_at = created_at
41 |         self.created_by = created_by
42 |         self.experiments_cnt = experiments_cnt
43 |         self.models_cnt = models_cnt
44 |         self.hardware = hardware
45 |         self.scope = scope
46 |         self.datasets = datasets
47 |         self.topalg = topalg
48 |         self.total_timelog = total_timelog
49 |         self.compute_now = compute_now
50 |         self.insights = insights
51 | 
52 |     def __str__(self):
53 |         desc = 'Project id: {} title: {} task: {}\n'.format(self.hid, self.title, self.task)
54 |         desc += 'Hardware: {} data sources count: {} models count: {}\n'.format(self.hardware, len(self.datasets), self.models_cnt)
55 |         return desc
56 | 
57 |     def _task_to_full_name(self, task_short):
58 |         tasks = {'bin_class': "Binary classification",
59 |                     'reg': "Regression",
60 |                     'img_class': "Images classification"}
61 |         if task_short not in tasks:
62 |             raise UnknownProjectTask('Unknown task %s' % task_short)
63 |         return tasks[task_short]
64 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Build Status](https://travis-ci.org/mljar/mljar-api-python.svg?branch=master)](https://travis-ci.org/mljar/mljar-api-python)
 2 | [![PyPI version](https://badge.fury.io/py/mljar.svg)](https://badge.fury.io/py/mljar)
 3 | [![Coverage Status](https://coveralls.io/repos/github/mljar/mljar-api-python/badge.svg?branch=master)](https://coveralls.io/github/mljar/mljar-api-python?branch=master)
 4 | [![PyPI pyversions](https://img.shields.io/pypi/pyversions/mljar.svg)](https://pypi.python.org/pypi/mljar/)
 5 | 
 6 | # mljar-api-python
 7 | 
 8 | A simple python wrapper over mljar API. It allows MLJAR users to create Machine Learning models with few lines of code:
 9 | 
10 | ```python
11 | from mljar import Mljar
12 | 
13 | model = Mljar(project='My awesome project', experiment='First experiment')
14 | model.fit(X,y)
15 | 
16 | model.predict(X)
17 | ```
18 | 
19 | That's all folks! Yeah, I know, this makes Machine Learning super easy! You can use this code for following Machine Learning tasks:
20 |  * Binary classification (your target has only two unique values)
21 |  * Regression (your target value is continuous)
22 |  * More is coming soon!
23 | 
24 | ## How to install
25 | 
26 | You can install mljar with **pip**:
27 | 
28 |     pip install -U mljar
29 | 
30 | or from source code:
31 | 
32 |     python setup.py install
33 | 
34 | ## How to use it
35 | 
36 |  1. Create an account at mljar.com and login.
37 |  2. Please go to your users settings (top, right corner).
38 |  3. Get your token, for example 'exampleexampleexample'.
39 |  4. Set environment variable `MLJAR_TOKEN` with your token value:
40 | ```
41 | export MLJAR_TOKEN=exampleexampleexample
42 | ```
43 |  5. That's all, you are ready to use MLJAR in your python code!
44 | 
45 | ## What's going on?
46 | 
47 |  * This wrapper allows you to search through different Machine Learning algorithms and tune each of the algorithm.
48 |  * By searching and tuning ML algorithm to your data you will get very accurate model.
49 |  * By calling method `fit` from `Mljar class` you create new project and start experiment with models training.
50 |  All your results will be accessible from your mljar.com account - this makes Machine Learning super easy and
51 |  keeps all your models and results in beautiful order. So, you will never miss anything.
52 |  * All computations are done in MLJAR Cloud, they are executed in parallel. So after calling `fit` method you can switch
53 |  your computer off and MLJAR will do the job for you!
54 |  * I think this is really amazing! What do you think? Please let us know at `contact@mljar.com`.
55 | 
56 | ## Examples
57 | 
58 | The examples are [here!](https://github.com/mljar/mljar-examples).
59 | 
60 | ## Testing
61 | 
62 | To run tests with command:
63 | 
64 | ```
65 | python -m tests.run
66 | ```
67 | 


--------------------------------------------------------------------------------
/mljar/client/project.py:
--------------------------------------------------------------------------------
 1 | from .base import MljarHttpClient
 2 | from ..model.project import Project
 3 | from ..exceptions import NotFoundException, CreateProjectException
 4 | from ..log import logger
 5 | 
 6 | class ProjectClient(MljarHttpClient):
 7 |     '''
 8 |     Client to interact with MLJAR projects.
 9 |     '''
10 |     def __init__(self):
11 |         self.verbose = True
12 |         self.url = "/projects"
13 |         super(ProjectClient, self).__init__()
14 | 
15 |     def get_projects(self):
16 |         '''
17 |         List all user projects.
18 |         '''
19 |         response = self.request("GET", self.url)
20 |         projects_dict = response.json()
21 |         return [Project.from_dict(proj) for proj in projects_dict]
22 | 
23 |     def get_project(self, hid):
24 |         '''
25 |         Print out project details and return details in json.
26 |         '''
27 |         try:
28 |             response = self.request("GET", '/'.join([self.url, hid]))
29 |             return Project.from_dict(response.json())
30 |         except NotFoundException:
31 |             return None
32 | 
33 | 
34 |     def create_project(self, title, task, description = ''):
35 |         '''
36 |         Creates new project
37 |         '''
38 |         data= {'hardware': 'cloud',
39 |                 'scope': 'private',
40 |                 'task': task,
41 |                 'compute_now': 0,
42 |                 'description': description,
43 |                 'title':title}
44 |         response = self.request("POST", self.url, data = data)
45 |         if response.status_code != 201:
46 |             raise CreateProjectException()
47 |         return Project.from_dict(response.json())
48 | 
49 |     def delete_project(self, hid):
50 |         '''
51 |         Deletes project
52 |         '''
53 |         logger.info('Remove project: %s' % hid)
54 |         response = self.request("DELETE", '/'.join([self.url, hid]))
55 |         return response.status_code == 204 or response.status_code == 200
56 | 
57 |     def create_project_if_not_exists(self, title, task, description = ''):
58 |         '''
59 |         Checks if project with specified title and task exists, if not it adds new project.
60 |         '''
61 |         projects = self.get_projects()
62 |         self.my_project = [p for p in projects if p.title == title and p.task == task]
63 |         # if project with such title does not exist, create one
64 |         if len(self.my_project) == 0:
65 |             self.my_project = self.create_project(title = title,
66 |                                                     description = description,
67 |                                                     task = task)
68 |         else:
69 |             self.my_project = self.my_project[0]
70 | 
71 |         return self.my_project
72 | 


--------------------------------------------------------------------------------
/tests/project_client_test.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | ProjectClient tests.
 3 | '''
 4 | import os
 5 | import unittest
 6 | 
 7 | from mljar.client.project import ProjectClient
 8 | from .project_based_test import get_postfix
 9 | 
10 | class ProjectClientTest(unittest.TestCase):
11 | 
12 |     def test_create_and_delete(self):
13 |         '''
14 |         Get list of projects, add new project, again get lists of projects and
15 |         compare if new list length is greater than old one.
16 |         '''
17 |         proj_title = 'Test project-01'+get_postfix()
18 |         proj_task = 'bin_class'
19 |         pc = ProjectClient()
20 |         projects_before = pc.get_projects()
21 |         new_project = pc.create_project(title = proj_title, task = proj_task)
22 |         self.assertEqual(new_project.title, proj_title)
23 |         projects_after = pc.get_projects()
24 |         self.assertEqual(len(projects_before) + 1, len(projects_after))
25 |         pc.delete_project(new_project.hid)
26 |         projects_after = pc.get_projects()
27 |         self.assertEqual(len(projects_before), len(projects_after))
28 | 
29 | 
30 |     def test_project_get(self):
31 |         '''
32 |         Test project get method.
33 |         '''
34 |         proj_title = 'Test project-02'+get_postfix()
35 |         proj_task = 'bin_class'
36 |         pc = ProjectClient()
37 |         projects_before = pc.get_projects()
38 |         new_project = pc.create_project(title = proj_title, task = proj_task)
39 |         project = pc.get_project(hid = new_project.hid)
40 |         self.assertEqual(new_project.hid, project.hid)
41 |         self.assertEqual(new_project.title, project.title)
42 |         self.assertEqual(new_project.task, project.task)
43 |         self.assertEqual(new_project.scope, project.scope)
44 |         self.assertEqual(new_project.hardware, project.hardware)
45 |         # test __str__ method
46 |         self.assertTrue('id' in str(new_project))
47 |         self.assertTrue('title' in str(new_project))
48 |         self.assertTrue('task' in str(new_project))
49 | 
50 |         pc.delete_project(new_project.hid)
51 |         project = pc.get_project(hid = new_project.hid)
52 |         self.assertEqual(project, None)
53 | 
54 |     def test_project_get_unknown_hid(self):
55 |         '''
56 |         Test invalid hid value in project get method.
57 |         '''
58 |         pc = ProjectClient()
59 |         project = pc.get_project(hid = 'invalid_hid_value')
60 |         self.assertEqual(project, None)
61 | 
62 |     def test_create_if_not_exists(self):
63 |         proj_title = 'Test project-02'
64 |         proj_task = 'bin_class'
65 |         pc = ProjectClient()
66 |         project = pc.create_project_if_not_exists(title = proj_title, task = proj_task)
67 |         self.assertNotEqual(project, None)
68 |         pc.delete_project(project.hid)
69 |         project = pc.get_project(hid =project.hid)
70 |         self.assertEqual(project, None)
71 | 


--------------------------------------------------------------------------------
/mljar/model/result.py:
--------------------------------------------------------------------------------
 1 | from marshmallow import Schema, fields, post_load
 2 | 
 3 | from .base import BaseModel
 4 | 
 5 | class ResultSchema(Schema):
 6 |     hid = fields.Str()
 7 |     experiment = fields.Str()
 8 |     dataset = fields.Str()
 9 |     validation_scheme = fields.Str()
10 |     model_type = fields.Str()
11 |     metric_type = fields.Str()
12 |     metric_value = fields.Number(allow_none=True)
13 |     run_time = fields.Number(allow_none=True)
14 |     iters = fields.Number(allow_none=True)
15 |     status = fields.Str()
16 |     status_detail = fields.Str(allow_none=True)
17 |     status_modify_at = fields.DateTime()
18 |     importance = fields.Dict(allow_none=True)
19 |     train_prediction_path = fields.Str(allow_none=True)
20 |     params = fields.Dict(allow_none=True)
21 |     train_details = fields.Dict(allow_none=True)
22 |     models_saved = fields.Str(allow_none=True)
23 |     metric_additional = fields.Dict(allow_none=True)
24 | 
25 |     @post_load
26 |     def make_result_instance(self, data):
27 |         return Result(**data)
28 | 
29 | class Result(BaseModel):
30 |     schema = ResultSchema(strict=True)
31 | 
32 |     def __init__(self, hid, experiment, dataset, validation_scheme, model_type, metric_type,
33 |                     params, status, status_detail=None, status_modify_at=None, metric_value=None,
34 |                     importance=None, train_prediction_path=None, run_time=None, iters=None, train_details=None,
35 |                     metric_additional=None, models_saved=None):
36 |         self.hid = hid
37 |         self.experiment = experiment
38 |         self.dataset = dataset
39 |         self.validation_scheme = validation_scheme
40 |         self.model_type = model_type
41 |         self.metric_type = metric_type
42 |         self.metric_value = metric_value
43 |         self.run_time = run_time
44 |         self.iters = iters
45 |         self.status = status
46 |         self.status_detail = status_detail
47 |         self.status_modify_at = status_modify_at
48 |         self.importance = importance
49 |         self.train_prediction_path = train_prediction_path
50 |         self.params = params
51 |         self.train_details = train_details
52 |         self.models_saved = models_saved
53 |         self.metric_additional = metric_additional
54 | 
55 |     def __str__(self):
56 |         desc = 'Result id: {} model: {} status: {}\n'.format(self.hid, self.model_type, self.status)
57 |         desc += 'Performance: {} on {} with {}\n'.format(str(self.metric_value), self.metric_type, self.validation_scheme)
58 |         return desc
59 | 
60 |     '''
61 |     def _get_full_model_name(self, model_type):
62 |         model_name = ''
63 |         if model_type in MLJAR_BIN_CLASS:
64 |             model_name = MLJAR_BIN_CLASS[model_type]
65 |         if model_type in MLJAR_REGRESSION:
66 |             model_name = MLJAR_REGRESSION[model_type]
67 |         if model_name == '':
68 |             model_name = model_type
69 |         return model_name
70 |     '''
71 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | |Build Status| |PyPI version| |Coverage Status| |PyPI pyversions|
 2 | 
 3 | mljar-api-python
 4 | ================
 5 | 
 6 | A simple python wrapper over mljar API. It allows MLJAR users to create
 7 | Machine Learning models with few lines of code:
 8 | 
 9 | .. code:: python
10 | 
11 |     from mljar import Mljar
12 | 
13 |     model = Mljar(project='My awesome project', experiment='First experiment')
14 |     model.fit(X,y)
15 | 
16 |     model.predict(X)
17 | 
18 | That's all folks! Yeah, I know, this makes Machine Learning super easy!
19 | You can use this code for following Machine Learning tasks: \* Binary
20 | classification (your target has only two unique values) \* Regression
21 | (your target value is continuous) \* More is coming soon!
22 | 
23 | How to install
24 | --------------
25 | 
26 | You can install mljar with **pip**:
27 | 
28 | ::
29 | 
30 |     pip install -U mljar
31 | 
32 | or from source code:
33 | 
34 | ::
35 | 
36 |     python setup.py install
37 | 
38 | How to use it
39 | -------------
40 | 
41 | 1. Create an account at mljar.com and login.
42 | 2. Please go to your users settings (top, right corner).
43 | 3. Get your token, for example 'exampleexampleexample'.
44 | 4. Set environment variable ``MLJAR_TOKEN`` with your token value:
45 | 
46 |    ::
47 | 
48 |        export MLJAR_TOKEN=exampleexampleexample
49 | 
50 | 5. That's all, you are ready to use MLJAR in your python code!
51 | 
52 | What's going on?
53 | ----------------
54 | 
55 | -  This wrapper allows you to search through different Machine Learning
56 |    algorithms and tune each of the algorithm.
57 | -  By searching and tuning ML algorithm to your data you will get very
58 |    accurate model.
59 | -  By calling method ``fit`` from ``Mljar class`` you create new project
60 |    and start experiment with models training. All your results will be
61 |    accessible from your mljar.com account - this makes Machine Learning
62 |    super easy and keeps all your models and results in beautiful order.
63 |    So, you will never miss anything.
64 | -  All computations are done in MLJAR Cloud, they are executed in
65 |    parallel. So after calling ``fit`` method you can switch your
66 |    computer off and MLJAR will do the job for you!
67 | -  I think this is really amazing! What do you think? Please let us know
68 |    at ``contact@mljar.com``.
69 | 
70 | Examples
71 | --------
72 | 
73 | The examples are `here! <https://github.com/mljar/mljar-examples>`__.
74 | 
75 | Testing
76 | -------
77 | 
78 | To run tests with command:
79 | 
80 | ::
81 | 
82 |     python -m tests.run
83 | 
84 | .. |Build Status| image:: https://travis-ci.org/mljar/mljar-api-python.svg?branch=master
85 |    :target: https://travis-ci.org/mljar/mljar-api-python
86 | .. |PyPI version| image:: https://badge.fury.io/py/mljar.svg
87 |    :target: https://badge.fury.io/py/mljar
88 | .. |Coverage Status| image:: https://coveralls.io/repos/github/mljar/mljar-api-python/badge.svg?branch=master
89 |    :target: https://coveralls.io/github/mljar/mljar-api-python?branch=master
90 | 


--------------------------------------------------------------------------------
/mljar/model/experiment.py:
--------------------------------------------------------------------------------
 1 | from marshmallow import Schema, fields, post_load
 2 | 
 3 | from .base import BaseModel
 4 | 
 5 | class ExperimentSchema(Schema):
 6 |     hid = fields.Str()
 7 |     title = fields.Str()
 8 |     created_at = fields.DateTime(allow_none=True)
 9 |     created_by = fields.Number(allow_none=True)
10 |     parent_project = fields.Str(allow_none=True)
11 |     models_cnt = fields.Number()
12 |     task = fields.Str()
13 |     description = fields.Str(allow_none=True)
14 |     metric = fields.Str()
15 |     validation_scheme = fields.Str()
16 |     total_timelog = fields.Str(allow_none=True)
17 |     bestalg = fields.List(fields.Dict(), allow_none=True)
18 |     details = fields.Dict(allow_none=True)
19 |     params = fields.Dict(allow_none=True)
20 |     compute_now = fields.Number()
21 |     computation_started_at = fields.DateTime(allow_none=True)
22 | 
23 |     @post_load
24 |     def make_experiment_instance(self, data):
25 |         return Experiment(**data)
26 | 
27 | class Experiment(BaseModel):
28 |     schema = ExperimentSchema(strict=True)
29 | 
30 |     def __init__(self, hid, title, models_cnt, task, description, metric,
31 |                     validation_scheme, details,
32 |                     params, compute_now, computation_started_at,
33 |                     bestalg = None, total_timelog = None,
34 |                     created_at = None, created_by = None, parent_project = None):
35 |         self.hid = hid
36 |         self.title = title
37 |         self.description = description
38 |         self.created_at = created_at
39 |         self.created_by = created_by
40 |         self.parent_project = parent_project
41 |         self.models_cnt = models_cnt
42 |         self.task = task
43 |         self.metric = metric
44 |         self.validation_scheme = validation_scheme
45 |         self.total_timelog = total_timelog
46 |         self.bestalg = bestalg
47 |         self.details = details
48 |         self.params = params
49 |         self.compute_now = compute_now
50 |         self.computation_started_at = computation_started_at
51 | 
52 |     def __str__(self):
53 |         desc = 'Experiment id: {} title: {} metric: {} validation: {}\n'.format(self.hid, self.title, self.metric, self.validation_scheme)
54 |         desc += 'Algorithms: {} single algorithm train time: {}\n'.format(str(self.params.get('algs', None)), str(self.params.get('single_limit', None)))
55 |         return desc
56 | 
57 |     def equal(self, expt):
58 |         # sort algorithms names before comparison
59 |         algs   = sorted(self.params.get('algs', []))
60 |         algs_2 = sorted(expt.params.get('algs', []))
61 |         return  self.params['train_dataset'].get('hid', None) == expt.params['train_dataset'].get('hid', None) and \
62 |                 self.metric == str(expt.metric) and \
63 |                 self.validation_scheme == str(expt.validation_scheme) and \
64 |                 '-'.join(algs) == '-'.join(algs_2) and \
65 |                 int(self.params.get('single_limit', 0)) == int(expt.params.get('single_limit', 0)) and \
66 |                 self.params.get('preproc', None) == expt.params.get('preproc', None)
67 | 


--------------------------------------------------------------------------------
/mljar/client/base.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json, requests
 3 | 
 4 | from .. import API_VERSION, MLJAR_ENDPOINT
 5 | from ..exceptions import MljarException, TokenException, DataReadException, BadRequestException
 6 | from ..exceptions import JSONReadException, NotFoundException, AuthenticationException
 7 | 
 8 | 
 9 | from ..log import logger
10 | 
11 | class MljarHttpClient(object):
12 |     '''
13 |         Mljar Client for HTTP Requests.
14 |     '''
15 | 
16 |     def __init__(self):
17 |         self.TOKEN = os.environ.get('MLJAR_TOKEN', None)
18 |         if not self.TOKEN:
19 |             raise TokenException('Please define environment variable MLJAR_TOKEN. \
20 |                                 You can get you MLJAR token by login to mljar.com account. \
21 |                                 It is available in your settings.')
22 | 
23 |         self.base_url = '/'.join([MLJAR_ENDPOINT, API_VERSION])
24 | 
25 |     def request(self, method, url, data=None, with_header=True, url_outside_mljar=False, parse_json=True):
26 |         """
27 |         Execute the request using requests library.
28 |         """
29 |         if url_outside_mljar:
30 |             request_url = url
31 |         else:
32 |             request_url = self.base_url + url
33 |         logger.debug("Starting request to url: {} with data: {}".format(request_url, data))
34 | 
35 |         headers = {'Authorization': 'Token '+self.TOKEN }
36 |         if with_header:
37 |             response = requests.request(method, request_url, headers=headers, data=data)
38 |         else:
39 |             response = requests.request(method, request_url, data=data)
40 | 
41 |         if parse_json:
42 |             try:
43 |                 if response.status_code != 204:
44 |                     logger.debug("Response content: {}, headers: {}".format(response.json(), response.headers))
45 |             except Exception as e:
46 |                 logger.error("Request failed: {} {}".format(response.content, str(e)))
47 |         self._check_response_status(response)
48 |         return response
49 | 
50 |     def _check_response_status(self, response):
51 |         """
52 |         Check if response is successful else raise Exception.
53 |         """
54 |         if not (200 <= response.status_code < 300):
55 |             try:
56 |                 message = response.json()["errors"]
57 |             except Exception as e:
58 |                 message = None
59 |             logger.debug("Error received : status_code: {}, message: {}".format(response.status_code,
60 |                                                                                       message or response.content))
61 | 
62 |             if response.status_code == 401:
63 |                 raise AuthenticationException()
64 |             elif response.status_code == 404:
65 |                 raise NotFoundException()
66 |             elif response.status_code == 400:
67 |                 raise BadRequestException(response.content)
68 |             elif response.status_code == 500:
69 |                 raise MljarException('server error: ' +str(response.content))
70 |             else:
71 |                 response.raise_for_status()
72 | 


--------------------------------------------------------------------------------
/mljar/utils.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | import pandas as pd
 3 | import numpy as np
 4 | import hashlib
 5 | import sys
 6 | '''
 7 | MLJAR Constants
 8 | '''
 9 | 
10 | MLJAR_TASKS = {
11 |             'bin_class' : 'Binary Classification',
12 |             'regression': 'Regression'
13 |             }
14 | 
15 | MLJAR_METRICS = {
16 |             'auc'    : 'Area Under Curve',
17 |             'logloss': 'Logarithmic Loss',
18 |             'rmse'   : 'Root Mean Square Error',
19 |             'mse'    : 'Mean Square Error',
20 |             'mae'    : 'Mean Absolute Error'
21 |             }
22 | 
23 | MLJAR_DEFAULT_FOLDS = 5
24 | MLJAR_DEFAULT_SHUFFLE = True
25 | MLJAR_DEFAULT_STRATIFY = True
26 | MLJAR_DEFAULT_TRAIN_SPLIT = None
27 | 
28 | 
29 | MLJAR_BIN_CLASS = {
30 |             "xgb"   :"Extreme Gradient Boosting",
31 |             "lgb"   :"LightGBM",
32 |             "rfc"   :"Random Forest",
33 |             "rgfc"  :"Regularized Greedy Forest",
34 |             "etc"   :"Extra Trees",
35 |             "knnc"  :"k-Nearest Neighbor",
36 |             "logreg":"Logistic Regression",
37 |             "mlp"   :"Neural Network"
38 |             }
39 | 
40 | MLJAR_REGRESSION = {
41 |             "xgbr" :"Extreme Gradient Boosting",
42 |             "lgbr" :"LightGBM",
43 |             "rfr"  :"Random Forest",
44 |             "rgfr" :"Regularized Greedy Forest",
45 |             "etr"  :"Extra Trees"
46 |             }
47 | 
48 | MLJAR_TUNING_MODES = {
49 |             'Normal': {'random_start_cnt': 5, 'hill_climbing_cnt': 1},
50 |             'Sport': {'random_start_cnt': 10, 'hill_climbing_cnt': 2},
51 |             'Insane': {'random_start_cnt': 15, 'hill_climbing_cnt': 3}
52 |             }
53 | 
54 | '''
55 | MLJAR Defaults
56 | '''
57 | MLJAR_DEFAULT_METRICS = {
58 |             'bin_class' : 'logloss',
59 |             'regression': 'rmse'
60 |             }
61 | 
62 | MLJAR_DEFAULT_ALGORITHMS = {
63 |             'bin_class': ['xgb', 'lgb', 'mlp'],
64 |             'regression': ['xgbr', 'lgbr']
65 |             }
66 | 
67 | MLJAR_DEFAULT_ENSEMBLE        = True
68 | MLJAR_DEFAULT_TUNING_MODE     = 'Normal'
69 | MLJAR_DEFAULT_TIME_CONSTRAINT = '5' # minutes
70 | 
71 | MLJAR_OPT_MAXIMIZE = ['auc']
72 | 
73 | '''
74 | Function to compute datasets hash, to not upload several times the same dataset.
75 | '''
76 | def make_hash(item):
77 |     if isinstance(item, pd.DataFrame) or isinstance(item, pd.Series):
78 |         if sys.version_info.major == 2:
79 |             values = [str(x).replace(' ', '').encode('utf-8') for x in item.values]
80 |         else:
81 |             values = [str(x).replace(' ', '') for x in item.values]
82 |         item = values
83 |     elif isinstance(item, np.ndarray):
84 |         item = item.copy(order='C')
85 |         return hashlib.sha1(item).hexdigest()
86 |     try:
87 |         i = str(item).encode('utf-8')
88 |         h = hashlib.md5(i).hexdigest()
89 |         return h
90 |     except TypeError:
91 |         try:
92 |             # this might act funny if a thing is convertible to tuple but the tuple
93 |             # is not a proper representation for the item (like for a frame :-()
94 |             return hash(tuple(item))
95 |         except TypeError as e:
96 |             print("Unhashable type: %s" % (item))
97 |             raise e
98 | 


--------------------------------------------------------------------------------
/tests/result_client_test.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | ResultClient tests.
  3 | '''
  4 | import os
  5 | import unittest
  6 | import pandas as pd
  7 | import time
  8 | 
  9 | from mljar.client.project import ProjectClient
 10 | from mljar.client.dataset import DatasetClient
 11 | from mljar.client.experiment import ExperimentClient
 12 | from mljar.client.result import ResultClient
 13 | from mljar.exceptions import BadRequestException
 14 | 
 15 | from .project_based_test import ProjectBasedTest, get_postfix
 16 | 
 17 | class ResultClientTest(ProjectBasedTest):
 18 | 
 19 |     def setUp(self):
 20 |         proj_title = 'Test project-01'+get_postfix()
 21 |         proj_task = 'bin_class'
 22 |         self.expt_title = 'Test experiment-01'
 23 |         self.validation_kfolds = 5
 24 |         self.validation_shuffle = True
 25 |         self.validation_stratify = True
 26 |         self.validation_train_split = None
 27 |         self.algorithms = ['xgb']
 28 |         self.metric = 'logloss'
 29 |         self.tuning_mode = 'Normal'
 30 |         self.time_constraint = 1
 31 |         self.create_enseble = False
 32 |         # setup project
 33 |         self.project_client = ProjectClient()
 34 |         self.project = self.project_client.create_project(title = proj_title, task = proj_task)
 35 |         # load data
 36 |         df = pd.read_csv('tests/data/test_1.csv')
 37 |         cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
 38 |         target = 'class'
 39 |         # add dataset
 40 |         self.dataset = DatasetClient(self.project.hid).add_dataset_if_not_exists(df[cols], df[target])
 41 | 
 42 | 
 43 |     def tearDown(self):
 44 |         # clean
 45 |         self.project_client.delete_project(self.project.hid)
 46 | 
 47 |     def test_get_results_for_wrong_project(self):
 48 |         with self.assertRaises(BadRequestException) as context:
 49 |             # init result client
 50 |             rc = ResultClient('wrong-hid')
 51 |             self.assertTrue(rc is not None)
 52 |             # get results - should raise exception
 53 |             rc.get_results()
 54 | 
 55 | 
 56 |     def test_get_results_for_project(self):
 57 |         # init result client
 58 |         rc = ResultClient(self.project.hid)
 59 |         self.assertNotEqual(rc, None)
 60 |         # get results - should be empty
 61 |         results = rc.get_results()
 62 |         self.assertEqual(results, [])
 63 |         # add experiment
 64 |         ec = ExperimentClient(self.project.hid)
 65 |         # create new experiment
 66 |         self.experiment = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
 67 |                                             self.validation_kfolds, self.validation_shuffle,
 68 |                                             self.validation_stratify, self.validation_train_split,
 69 |                                             self.algorithms, self.metric,
 70 |                                             self.tuning_mode, self.time_constraint, self.create_enseble)
 71 |         # wait some time till models are initialized
 72 |         time.sleep(60)
 73 |         # get results - should be some models there
 74 |         results = rc.get_results()
 75 |         self.assertNotEqual(len(results), 0)
 76 | 
 77 | 
 78 |     def test_get_results_for_experiment(self):
 79 |         # init result client
 80 |         rc = ResultClient(self.project.hid)
 81 |         self.assertNotEqual(rc, None)
 82 |         # get results - should be empty
 83 |         results = rc.get_results()
 84 |         self.assertEqual(results, [])
 85 |         # get results for wrong experiment hid
 86 |         results = rc.get_results('wrong-hid')
 87 |         self.assertEqual(results, [])
 88 |         # add experiment
 89 |         ec = ExperimentClient(self.project.hid)
 90 |         # create new experiment
 91 |         self.experiment = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
 92 |                                             self.validation_kfolds, self.validation_shuffle,
 93 |                                             self.validation_stratify, self.validation_train_split,
 94 |                                             self.algorithms, self.metric,
 95 |                                             self.tuning_mode, self.time_constraint, self.create_enseble)
 96 |         # wait some time till models are initialized
 97 |         time.sleep(60)
 98 |         # get results for experiment - should be some models there
 99 |         results = rc.get_results(self.experiment.hid)
100 |         self.assertNotEqual(len(results), 0)
101 | 
102 |         # get results for project
103 |         project_results = rc.get_results()
104 |         self.assertNotEqual(results, [])
105 |         # get results for wrong experiment hid
106 |         # all results from project should be returned
107 |         results_2 = rc.get_results('wrong-hid')
108 |         self.assertEqual(len(project_results), len(results_2))
109 | 
110 |         for r in project_results:
111 |             # test __str__ method
112 |             self.assertTrue('id' in str(r))
113 |             self.assertTrue('model' in str(r))
114 |             self.assertTrue('status' in str(r))
115 | 


--------------------------------------------------------------------------------
/tests/dataset_client_test.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | DatasetClient tests.
  3 | '''
  4 | import os
  5 | import unittest
  6 | import pandas as pd
  7 | import numpy as np
  8 | 
  9 | from mljar.client.project import ProjectClient
 10 | from mljar.client.dataset import DatasetClient
 11 | 
 12 | from .project_based_test import ProjectBasedTest, get_postfix
 13 | 
 14 | class DatasetClientTest(ProjectBasedTest):
 15 | 
 16 |     def setUp(self):
 17 |         proj_title = 'Test project-01'+get_postfix()
 18 |         proj_task = 'bin_class'
 19 |         # setup project
 20 |         self.project_client = ProjectClient()
 21 |         self.project = self.project_client.create_project(title = proj_title, task = proj_task)
 22 |         # load data
 23 |         df = pd.read_csv('tests/data/test_1.csv')
 24 |         cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
 25 |         target = 'class'
 26 |         self.X = df.loc[:,cols]
 27 |         self.y = df[target]
 28 | 
 29 |     def tearDown(self):
 30 |         # clean
 31 |         self.project_client.delete_project(self.project.hid)
 32 | 
 33 | 
 34 |     def test_get_datasests(self):
 35 | 
 36 |         #Get empty list of datasets in project.
 37 | 
 38 |         # get datasets
 39 |         datasets = DatasetClient(self.project.hid).get_datasets()
 40 |         self.assertEqual(datasets, [])
 41 | 
 42 |     def test_prepare_data(self):
 43 |         #Test _prepare_data method on numpy array data
 44 |         dc = DatasetClient(self.project.hid)
 45 |         samples = 100
 46 |         columns = 10
 47 |         X = np.random.rand(samples, columns)
 48 |         y = np.random.choice([0,1], samples, replace = True)
 49 |         data, data_hash = dc._prepare_data(X, y)
 50 |         self.assertTrue(data is not None)
 51 |         self.assertTrue(data_hash is not None)
 52 |         self.assertTrue(isinstance(data_hash, str))
 53 |         self.assertEqual(11, len(data.columns))
 54 |         self.assertTrue('target' in data.columns)
 55 |         self.assertTrue('attribute_1' in data.columns)
 56 |         self.assertTrue('attribute_10' in data.columns)
 57 | 
 58 |     def test_get_dataset_for_wrong_hid(self):
 59 |         #Get dataset for wrong hid should return None
 60 |         dc = DatasetClient(self.project.hid)
 61 |         dataset = dc.get_dataset('some-wrong-hid')
 62 |         self.assertTrue(dataset is None)
 63 | 
 64 |     def test_add_dataset_for_training(self):
 65 |         # setup dataset client
 66 |         dc = DatasetClient(self.project.hid)
 67 |         self.assertNotEqual(dc, None)
 68 |         # get datasets, there should be none
 69 |         datasets = dc.get_datasets()
 70 |         self.assertEqual(len(datasets), 0)
 71 |         # add dataset
 72 |         my_dataset = dc.add_dataset_if_not_exists(self.X, self.y)
 73 |         self.assertNotEqual(my_dataset, None)
 74 |         # get datasets
 75 |         datasets = dc.get_datasets()
 76 |         self.assertEqual(len(datasets), 1)
 77 |         my_dataset_2 = dc.get_dataset(my_dataset.hid)
 78 |         self.assertEqual(my_dataset.hid, my_dataset_2.hid)
 79 |         self.assertEqual(my_dataset.title, my_dataset_2.title)
 80 |         # test __str__ method
 81 |         self.assertTrue('id' in str(my_dataset_2))
 82 |         self.assertTrue('title' in str(my_dataset_2))
 83 |         self.assertTrue('file' in str(my_dataset_2))
 84 | 
 85 |     def test_add_dataset_for_prediction(self):
 86 |         # setup dataset client
 87 |         dc = DatasetClient(self.project.hid)
 88 |         self.assertNotEqual(dc, None)
 89 |         # get datasets, there should be none
 90 |         datasets = dc.get_datasets()
 91 |         self.assertEqual(len(datasets), 0)
 92 |         # add dataset
 93 |         my_dataset = dc.add_dataset_if_not_exists(self.X, None)
 94 |         self.assertNotEqual(my_dataset, None)
 95 |         # get datasets
 96 |         datasets = dc.get_datasets()
 97 |         self.assertEqual(len(datasets), 1)
 98 |         my_dataset_2 = dc.get_dataset(my_dataset.hid)
 99 |         self.assertEqual(my_dataset.hid, my_dataset_2.hid)
100 |         self.assertEqual(my_dataset.title, my_dataset_2.title)
101 | 
102 |     def test_add_existing_dataset(self):
103 |         # setup dataset client
104 |         dc = DatasetClient(self.project.hid)
105 |         self.assertNotEqual(dc, None)
106 |         # get initial number of datasets
107 |         init_datasets_cnt = len(dc.get_datasets())
108 |         # add dataset
109 |         dc.add_dataset_if_not_exists(self.X, self.y)
110 |         # get datasets
111 |         datasets = dc.get_datasets()
112 |         self.assertEqual(len(datasets), init_datasets_cnt+1)
113 |         # add the same dataset
114 |         # it shouldn't be added
115 |         dc.add_dataset_if_not_exists(self.X, self.y)
116 |         # number of all datasets in project should be 1
117 |         datasets = dc.get_datasets()
118 |         self.assertEqual(len(datasets), init_datasets_cnt+1)
119 | 
120 |     def test_prepare_data_two_sources(self):
121 |         dc = DatasetClient(self.project.hid)
122 |         data_1, data_hash_1 = dc._prepare_data(self.X, self.y)
123 |         data_2, data_hash_2 = dc._prepare_data(self.X, None)
124 |         self.assertNotEqual(data_hash_1, data_hash_2)
125 | 
126 |     def test_prepare_data_two_sources_numpy(self):
127 |         dc = DatasetClient(self.project.hid)
128 |         data_1, data_hash_1 = dc._prepare_data(np.array(self.X), np.array(self.y))
129 |         data_2, data_hash_2 = dc._prepare_data(np.array(self.X), None)
130 |         self.assertNotEqual(data_hash_1, data_hash_2)
131 | 
132 |     def test_create_and_delete(self):
133 |         # setup dataset client
134 |         dc = DatasetClient(self.project.hid)
135 |         self.assertNotEqual(dc, None)
136 |         # get initial number of datasets
137 |         init_datasets_cnt = len(dc.get_datasets())
138 |         # add dataset
139 |         my_dataset_1 = dc.add_dataset_if_not_exists(self.X, self.y)
140 |         my_dataset_2 = dc.add_dataset_if_not_exists(self.X, y = None)
141 |         # get datasets
142 |         datasets = dc.get_datasets()
143 |         self.assertEqual(len(datasets), init_datasets_cnt+2)
144 |         # delete added dataset
145 |         dc.delete_dataset(my_dataset_1.hid)
146 |         # check number of datasets
147 |         datasets = dc.get_datasets()
148 |         self.assertEqual(len(datasets), init_datasets_cnt+1)
149 | 
150 | 
151 | if __name__ == "__main__":
152 |     unittest.main()
153 | 


--------------------------------------------------------------------------------
/mljar/client/experiment.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function 
  2 | import json
  3 | import warnings
  4 | from .base import MljarHttpClient
  5 | from ..model.experiment import Experiment
  6 | from ..exceptions import NotFoundException, MljarException, CreateExperimentException
  7 | from ..exceptions import UndefinedExperimentException
  8 | 
  9 | from .dataupload import DataUploadClient
 10 | from ..log import logger
 11 | 
 12 | from ..utils import make_hash
 13 | from ..utils import MLJAR_METRICS, MLJAR_TUNING_MODES, MLJAR_DEFAULT_ALGORITHMS, MLJAR_DEFAULT_METRICS
 14 | 
 15 | class ExperimentClient(MljarHttpClient):
 16 |     '''
 17 |     Client to interact with MLJAR experiments
 18 |     '''
 19 |     def __init__(self, project_hid):
 20 |         self.project_hid = project_hid
 21 |         self.url = "/experiments"
 22 |         super(ExperimentClient, self).__init__()
 23 | 
 24 |     def get_experiments(self):
 25 |         '''
 26 |         Gets all experiments in the project
 27 |         '''
 28 |         logger.info('Get experiments, project id {}'.format(self.project_hid))
 29 |         response = self.request("GET", self.url+'?project_id='+self.project_hid)
 30 |         experiments_dict = response.json()
 31 |         return [Experiment.from_dict(expt) for expt in experiments_dict]
 32 | 
 33 |     def get_experiment(self, experiment_hid):
 34 |         '''
 35 |         Get details of experiment.
 36 |         '''
 37 |         logger.info('Get experiment, experiment id {}'.format(experiment_hid))
 38 |         try:
 39 |             response = self.request("GET", self.url+'/'+experiment_hid)
 40 |             return Experiment.from_dict(response.json())
 41 |         except NotFoundException:
 42 |             return None
 43 | 
 44 |     def create_experiment(self, data):
 45 |         response = self.request("POST", self.url, data = data)
 46 |         if response.status_code != 201:
 47 |             raise CreateExperimentException()
 48 |         return Experiment.from_dict(response.json())
 49 | 
 50 |     def add_experiment_if_not_exists(self, train_dataset, vald_dataset, experiment_title, project_task, \
 51 |                                         validation_kfolds, validation_shuffle, \
 52 |                                         validation_stratify, validation_train_split, \
 53 |                                         algorithms, metric, \
 54 |                                         tuning_mode, time_constraint, create_ensemble):
 55 |         logger.info('Add experiment if not exists')
 56 |         # parameters validation
 57 |         # validation with dataset
 58 |         if vald_dataset is not None:
 59 |             validation = "With dataset"
 60 |         else:
 61 |             # do train/validation split
 62 |             if validation_train_split is not None:
 63 |                 percents = int(validation_train_split * 100.0)
 64 |                 validation = "Split {}/{}".format(percents, 100-percents)
 65 |             else:
 66 |                 validation = "{}-fold CV".format(validation_kfolds)
 67 | 
 68 |             # shuffle and stratify
 69 |             if validation_shuffle:
 70 |                 validation += ", Shuffle"
 71 |             if validation_stratify and project_task == 'bin_class':
 72 |                 validation += ", Stratify"
 73 |             if validation_stratify and project_task != 'bin_class':
 74 |                 warnings.warn('Cannot use stratify in validation for your project task. Omitting this option in validation.')
 75 | 
 76 |         if metric is None or metric == '' or metric not in MLJAR_METRICS:
 77 |             metric = MLJAR_DEFAULT_METRICS[project_task]
 78 |         if tuning_mode is None or tuning_mode == '' or tuning_mode not in MLJAR_TUNING_MODES:
 79 |             tuning_mode = MLJAR_DEFAULT_TUNING_MODE
 80 |         if algorithms is None or algorithms == [] or algorithms == '':
 81 |             algorithms = MLJAR_DEFAULT_ALGORITHMS[project_task]
 82 |         # set default preprocessing if needed
 83 |         logger.info('Set default preprocessing')
 84 |         dataset_preproc = {}
 85 |         if len(train_dataset.column_usage_min['cols_to_fill_na']) > 0:
 86 |             dataset_preproc['na_fill'] = 'na_fill_median'
 87 |         if len(train_dataset.column_usage_min['cols_to_convert_categorical']) > 0:
 88 |             dataset_preproc['convert_categorical'] = 'categorical_to_int'
 89 |         # create stub for new experiment
 90 |         logger.info('Create new experiment stub')
 91 |         expt_params = {
 92 |                 "train_dataset": {"id": train_dataset.hid, 'title': train_dataset.title},
 93 |                 "algs":algorithms,
 94 |                 "preproc": dataset_preproc,
 95 |                 "single_limit":time_constraint,
 96 |                 "ensemble":create_ensemble,
 97 |                 "random_start_cnt": MLJAR_TUNING_MODES[tuning_mode]['random_start_cnt'],
 98 |                 "hill_climbing_cnt": MLJAR_TUNING_MODES[tuning_mode]['hill_climbing_cnt']
 99 |                 }
100 |         if vald_dataset is not None:
101 |             expt_params['vald_dataset'] = {"id": vald_dataset.hid, 'title': vald_dataset.title}
102 | 
103 |         new_expt = Experiment(hid='', title=experiment_title, models_cnt=0, task=project_task,
104 |                                 description='', metric=metric, validation_scheme=validation,
105 |                                 total_timelog=0, bestalg=[], details={},
106 |                                 params=expt_params,
107 |                                 compute_now=0, computation_started_at=None, created_at=None,
108 |                                 created_by=None, parent_project=self.project_hid)
109 | 
110 |         # get existing experiments
111 |         experiments = self.get_experiments()
112 |         # check if there are experiments with selected title
113 |         experiments = [e for e in experiments if e.title == new_expt.title]
114 |         # if there are experiments with selected title
115 |         if len(experiments) > 0:
116 |             # check if experiment with the same title has different parameters
117 |             for expt in experiments:
118 |                 if not expt.equal(new_expt):
119 |                     print('The experiment with specified title already exists, but it has different parameters than you specified.')
120 |                     print('Existing experiment')
121 |                     print(str(expt))
122 |                     print('New experiment')
123 |                     print(str(new_expt))
124 |                     print('Please rename your new experiment with new parameters setup.')
125 |                     return None
126 |             # there is only one experiment with selected title and has the same parameters
127 |             # this is our experiment :)
128 |             if len(experiments) == 1:
129 |                 return experiments[0]
130 |             else:
131 |                 # there more than 1 experiment, something goes wrong ...
132 |                 raise UndefinedExperimentException()
133 |         else:
134 |             # there is no experiment with such title, let's go and create it!
135 |             logger.info('Create new experiment: %s' % new_expt.title)
136 |             # create data for experiment construction by hand
137 |             params = json.dumps(new_expt.params)
138 |             data = {
139 |                 'title': new_expt.title,
140 |                 'description': '',
141 |                 'metric': new_expt.metric,
142 |                 'validation_scheme': new_expt.validation_scheme,
143 |                 'task': new_expt.task,
144 |                 'compute_now': 1,
145 |                 'parent_project': self.project_hid,
146 |                 'params': params
147 | 
148 |             }
149 |             return self.create_experiment(data)
150 | 
151 | 
152 | 
153 |         return None
154 | 


--------------------------------------------------------------------------------
/tests/experiment_client_test.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | ExperimentClient tests.
  3 | '''
  4 | import os
  5 | import unittest
  6 | import pandas as pd
  7 | import time
  8 | 
  9 | from mljar.client.project import ProjectClient
 10 | from mljar.client.dataset import DatasetClient
 11 | from mljar.client.experiment import ExperimentClient
 12 | 
 13 | from .project_based_test import ProjectBasedTest, get_postfix
 14 | 
 15 | class ExperimentClientTest(ProjectBasedTest):
 16 | 
 17 |     def setUp(self):
 18 |         proj_title = 'Test project-01'+get_postfix()
 19 |         proj_task = 'bin_class'
 20 |         self.expt_title = 'Test experiment-01'
 21 |         self.validation_kfolds = 5
 22 |         self.validation_shuffle = True
 23 |         self.validation_stratify = True
 24 |         self.validation_train_split = None
 25 |         self.algorithms = ['xgb']
 26 |         self.metric = 'logloss'
 27 |         self.tuning_mode = 'Normal'
 28 |         self.time_constraint = 1
 29 |         self.create_enseble = False
 30 |         # setup project
 31 |         self.project_client = ProjectClient()
 32 |         self.project = self.project_client.create_project(title = proj_title, task = proj_task)
 33 |         # add training data
 34 |         df = pd.read_csv('tests/data/test_1.csv')
 35 |         cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
 36 |         target = 'class'
 37 |         dc = DatasetClient(self.project.hid)
 38 |         self.dataset = dc.add_dataset_if_not_exists(df[cols], df[target])
 39 | 
 40 | 
 41 |     def tearDown(self):
 42 |         # wait before clean, to have time to initialize models
 43 |         time.sleep(60)
 44 |         # clean
 45 |         self.project_client.delete_project(self.project.hid)
 46 | 
 47 |     def test_create_with_kfold_cv(self):
 48 | 
 49 |         #Create experiment test with k-fold CV.
 50 | 
 51 |         # add experiment
 52 |         ec = ExperimentClient(self.project.hid)
 53 |         self.assertNotEqual(ec, None)
 54 |         # there should be none experiments
 55 |         experiments = ec.get_experiments()
 56 |         self.assertEqual(experiments, [])
 57 |         # create new experiment
 58 |         experiment = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
 59 |                                             self.validation_kfolds, self.validation_shuffle,
 60 |                                             self.validation_stratify, self.validation_train_split,
 61 |                                             self.algorithms, self.metric,
 62 |                                             self.tuning_mode, self.time_constraint, self.create_enseble)
 63 |         self.assertNotEqual(experiment, None)
 64 |         self.assertEqual(experiment.title, self.expt_title)
 65 |         self.assertEqual(experiment.validation_scheme, "5-fold CV, Shuffle, Stratify")
 66 |         self.assertEqual(experiment.metric, self.metric)
 67 |         # get all experiments, should be only one
 68 |         experiments = ec.get_experiments()
 69 |         self.assertEqual(len(experiments), 1)
 70 |         # get experiment by hid, there should be the same
 71 |         experiment_2 = ec.get_experiment(experiment.hid)
 72 |         self.assertEqual(experiment_2.hid, experiment.hid)
 73 |         self.assertEqual(experiment_2.title, experiment.title)
 74 |         self.assertEqual(experiment_2.metric, experiment.metric)
 75 |         self.assertEqual(experiment_2.validation_scheme, experiment.validation_scheme)
 76 |         self.assertTrue(experiment.equal(experiment_2))
 77 |         # test __str__ method
 78 |         self.assertTrue('id' in str(experiment_2))
 79 |         self.assertTrue('title' in str(experiment_2))
 80 |         self.assertTrue('metric' in str(experiment_2))
 81 |         self.assertTrue('validation' in str(experiment_2))
 82 | 
 83 |     def test_create_with_train_split(self):
 84 | 
 85 |         #Create experiment with validation by train split.
 86 | 
 87 |         # add experiment
 88 |         ec = ExperimentClient(self.project.hid)
 89 |         self.assertNotEqual(ec, None)
 90 |         # there should be none experiments
 91 |         experiments = ec.get_experiments()
 92 |         self.assertEqual(experiments, [])
 93 |         # create new experiment
 94 |         experiment = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
 95 |                                             self.validation_kfolds, self.validation_shuffle,
 96 |                                             self.validation_stratify, 0.72,
 97 |                                             self.algorithms, self.metric,
 98 |                                             self.tuning_mode, self.time_constraint, self.create_enseble)
 99 |         self.assertNotEqual(experiment, None)
100 |         self.assertEqual(experiment.title, self.expt_title)
101 |         self.assertEqual(experiment.validation_scheme, "Split 72/28, Shuffle, Stratify")
102 | 
103 | 
104 |     def test_create_with_validation_dataset(self):
105 | 
106 |         #Create experiment with validation with dataset.
107 | 
108 |         # add vald dataset
109 |         cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
110 |         target = 'class'
111 |         df = pd.read_csv('tests/data/test_1_vald.csv')
112 |         dc = DatasetClient(self.project.hid)
113 |         vald_dataset = dc.add_dataset_if_not_exists(df[cols], df[target])
114 |         # add experiment
115 |         ec = ExperimentClient(self.project.hid)
116 |         self.assertNotEqual(ec, None)
117 |         # there should be none experiments
118 |         experiments = ec.get_experiments()
119 |         self.assertEqual(experiments, [])
120 |         # create new experiment
121 |         experiment = ec.add_experiment_if_not_exists(self.dataset, vald_dataset, self.expt_title, self.project.task,
122 |                                             self.validation_kfolds, self.validation_shuffle,
123 |                                             self.validation_stratify, 0.72,
124 |                                             self.algorithms, self.metric,
125 |                                             self.tuning_mode, self.time_constraint, self.create_enseble)
126 |         self.assertNotEqual(experiment, None)
127 |         self.assertEqual(experiment.title, self.expt_title)
128 |         self.assertEqual(experiment.validation_scheme, "With dataset")
129 | 
130 | 
131 |     def test_create_if_exists(self):
132 | 
133 |         #Create experiment after experiment is already in project.
134 | 
135 |         # add experiment
136 |         ec = ExperimentClient(self.project.hid)
137 |         self.assertNotEqual(ec, None)
138 |         # there should be none experiments
139 |         experiments = ec.get_experiments()
140 |         self.assertEqual(experiments, [])
141 |         # create new experiment
142 |         experiment = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
143 |                                             self.validation_kfolds, self.validation_shuffle,
144 |                                             self.validation_stratify, self.validation_train_split,
145 |                                             self.algorithms, self.metric,
146 |                                             self.tuning_mode, self.time_constraint, self.create_enseble)
147 |         self.assertNotEqual(experiment, None)
148 |         # get all experiments, should be only one
149 |         experiments = ec.get_experiments()
150 |         self.assertEqual(len(experiments), 1)
151 |         # try to create the same experiment
152 |         experiment_2 = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
153 |                                             self.validation_kfolds, self.validation_shuffle,
154 |                                             self.validation_stratify, self.validation_train_split,
155 |                                             self.algorithms, self.metric,
156 |                                             self.tuning_mode, self.time_constraint, self.create_enseble)
157 |         self.assertNotEqual(experiment, None)
158 |         # get all experiments, should be only one
159 |         experiments = ec.get_experiments()
160 |         self.assertEqual(len(experiments), 1)
161 |         # both should be the same
162 |         self.assertEqual(experiment_2.hid, experiment.hid)
163 |         self.assertEqual(experiment_2.title, experiment.title)
164 |         self.assertEqual(experiment_2.metric, experiment.metric)
165 |         self.assertEqual(experiment_2.validation_scheme, experiment.validation_scheme)
166 |         self.assertTrue(experiment.equal(experiment_2))
167 | 
168 | 
169 | if __name__ == "__main__":
170 |     unittest.main()
171 | 


--------------------------------------------------------------------------------
/mljar/client/dataset.py:
--------------------------------------------------------------------------------
  1 | from builtins import range
  2 | import numpy as np
  3 | import pandas as pd
  4 | import uuid
  5 | import os
  6 | import sys
  7 | import time
  8 | import copy
  9 | import tempfile
 10 | from zipfile import ZipFile, ZIP_DEFLATED
 11 | from os.path import basename
 12 | from .base import MljarHttpClient
 13 | from ..model.dataset import Dataset
 14 | from ..exceptions import NotFoundException, MljarException, CreateDatasetException, DatasetUnknownException
 15 | 
 16 | from .dataupload import DataUploadClient
 17 | from ..log import logger
 18 | 
 19 | from ..utils import make_hash
 20 | 
 21 | class DatasetClient(MljarHttpClient):
 22 |     '''
 23 |     Client to interact with MLJAR datasets
 24 |     '''
 25 |     def __init__(self, project_hid):
 26 |         self.project_hid = project_hid
 27 |         self.url = "/datasets"
 28 |         super(DatasetClient, self).__init__()
 29 | 
 30 |     def get_datasets(self):
 31 |         '''
 32 |         Gets all datasets in the project
 33 |         '''
 34 |         logger.info('Get datasets, project id {}'.format(self.project_hid))
 35 |         response = self.request("GET", self.url+'?project_id='+self.project_hid)
 36 |         datasets_dict = response.json()
 37 |         return [Dataset.from_dict(ds) for ds in datasets_dict]
 38 | 
 39 |     def get_dataset(self, dataset_hid):
 40 |         '''
 41 |         Gets dataset for specified hid
 42 |         '''
 43 |         logger.info('Get dataset, dataset id {}'.format(dataset_hid))
 44 |         try:
 45 |             response = self.request("GET", self.url+'/'+dataset_hid)
 46 |             return Dataset.from_dict(response.json())
 47 |         except NotFoundException:
 48 |             logger.error('Dataset not found')
 49 |             return None
 50 | 
 51 |     def delete_dataset(self, dataset_hid):
 52 |         '''
 53 |         Deletes dataset
 54 |         '''
 55 |         response = self.request("DELETE", '/'.join([self.url, dataset_hid]))
 56 |         return response.status_code == 204 or response.status_code == 200
 57 | 
 58 |     def _prepare_data(self, X, y):
 59 |         '''
 60 |         Concatenates matrices and computes hash
 61 |         '''
 62 |         logger.info('Prepare dataset and compute hash')
 63 |         data = None
 64 |         if isinstance(X, np.ndarray):
 65 |             cols = {}
 66 |             col_names = []
 67 |             X_cpy = copy.deepcopy(X)
 68 |             for i in range(X_cpy.shape[1]):
 69 |                 c = 'attribute_'+str(i+1)
 70 |                 cols[c] = X_cpy[:,i]
 71 |                 col_names += [c]
 72 |             if y is not None:
 73 |                 cols['target'] = copy.deepcopy(y)
 74 |                 col_names.append('target')
 75 |             data = pd.DataFrame(cols, columns=col_names)
 76 |         if isinstance(X, pd.DataFrame):
 77 |             if y is not None:
 78 |                 data = copy.deepcopy(X)
 79 |                 data['target'] = copy.deepcopy(y)
 80 |                 # todo: add search for target like attributes and rename
 81 |                 # "target", "class", "loss"
 82 |             else:
 83 |                 data = copy.deepcopy(X)
 84 | 
 85 |         dataset_hash = str(make_hash(data))
 86 |         return data, dataset_hash
 87 | 
 88 |     def _wait_till_all_datasets_are_valid(self):
 89 |         '''
 90 |         Waits till all datasets is valid. If all valid it returns True,
 91 |         if wait time is exceeded and there is any dataset not valid then
 92 |         it returns False.
 93 |         '''
 94 |         logger.info('Wait till all datasets are valid')
 95 |         total_checks = 120
 96 |         for i in range(total_checks):
 97 |             datasets = self.get_datasets()
 98 |             if datasets is not None:
 99 |                 logger.info('There are %s datasets' % len(datasets))
100 |                 if len(datasets) == 0:
101 |                     logger.info('No datasets to wait for')
102 |                     return
103 |                 not_validated = [ds for ds in datasets if ds.valid == 0]
104 |                 if len(not_validated) == 0:
105 |                     logger.info('All datasets are valid')
106 |                     return
107 |             else:
108 |                 logger.info('None datasets list')
109 |             time.sleep(5)
110 |         raise MljarException('There are some problems with reading one of your dataset. \
111 |                             Please login to mljar.com and check your project for more details.')
112 | 
113 | 
114 | 
115 |     def add_dataset_if_not_exists(self, X, y, title_prefix = 'dataset-', dataset_title = None):
116 |         '''
117 |         Checks if dataset already exists, if not it add dataset to project.
118 |         '''
119 |         logger.info('Add dataset if not exists')
120 |         # before start adding any new dataset
121 |         # wait till all dataset are validated
122 |         # it does not return an object, it just waits
123 |         self._wait_till_all_datasets_are_valid()
124 |         logger.info('All datasets are valid till now')
125 |         # check if dataset already exists
126 |         data, dataset_hash = self._prepare_data(X, y)
127 |         datasets = self.get_datasets()
128 |         dataset_details = [d for d in datasets if d.dataset_hash == dataset_hash]
129 |         # dataset with specified hash does not exist
130 |         if len(dataset_details) == 0:
131 |             # add new dataset
132 |             dataset_details = self.add_new_dataset(data, y, title_prefix, dataset_title)
133 |         else:
134 |             dataset_details = dataset_details[0]
135 | 
136 |         if dataset_details is None:
137 |             raise MljarException('There was a problem during new dataset addition')
138 |         # wait till dataset is validated ...
139 |         self._wait_till_all_datasets_are_valid()
140 |         if not self._accept_dataset_column_usage(dataset_details.hid):
141 |             raise MljarException('There was a problem with accept column usage for your dataset.')
142 |         # get dataset with updated statistics
143 |         my_dataset = self.get_dataset(dataset_details.hid)
144 |         if my_dataset is None:
145 |             raise DatasetUnknownException('Can not find dataset: %s' % self.dataset_title)
146 |         if my_dataset.valid != 1:
147 |             raise MljarException('Sorry, your dataset can not be read by MLJAR. \
148 |                                     Please report this to us - we will fix it.')
149 | 
150 |         if my_dataset.column_usage_min is None:
151 |             raise MljarException('Something bad happend! There is no attributes \
152 |                                     usage defined for your dataset')
153 | 
154 |         return my_dataset
155 | 
156 |     def _accept_dataset_column_usage(self, dataset_hid):
157 |         logger.info('Accept column usage')
158 |         response = self.request("POST", '/accept_column_usage/',data = {'dataset_id': dataset_hid})
159 |         return response.status_code == 200
160 | 
161 | 
162 |     def add_new_dataset(self, data, y, title_prefix = 'dataset-', dataset_title = None):
163 |         logger.info('Add new dataset')
164 |         if dataset_title is None:
165 |             title = title_prefix + str(uuid.uuid4())[:4] # set some random name
166 |         else:
167 |             title = dataset_title
168 | 
169 |         file_path = os.path.join(tempfile.gettempdir(), 'dataset-'+ str(uuid.uuid4())[:8]+'.csv')
170 | 
171 |         logger.info('Compress data before export')
172 |         prediction_only = y is None
173 |         # save to local storage
174 |         data.to_csv(file_path, index=False)
175 |         # compress
176 |         file_path_zip = file_path + '.zip'
177 |         with ZipFile(file_path_zip, 'w', ZIP_DEFLATED) as myzip:
178 |             myzip.write(file_path, basename(file_path))
179 | 
180 |         # upload data to MLJAR storage
181 |         dst_path = DataUploadClient().upload_file(self.project_hid, file_path_zip)
182 |         # create a dataset instance in DB
183 |         data = {
184 |             'title': title,
185 |             'file_path': dst_path,
186 |             'file_name': basename(file_path_zip),
187 |             'file_size': round(os.path.getsize(file_path_zip) / 1024.0/ 1024.0, 2),
188 |             'derived': 0,
189 |             'valid': 0,
190 |             'parent_project': self.project_hid,
191 |             'meta': '',
192 |             'data_type': 'tabular',
193 |             'scope': 'private',
194 |             'prediction_only': 1 if prediction_only else 0
195 |         }
196 |         logger.info('Add information about dataset into MLJAR')
197 |         response = self.request("POST", self.url, data = data)
198 |         if response.status_code != 201:
199 |             raise CreateDatasetException()
200 |         logger.info('Clean tmp files')
201 |         # clean data file
202 |         os.remove(file_path)
203 |         os.remove(file_path_zip)
204 | 
205 |         return Dataset.from_dict(response.json())
206 | 


--------------------------------------------------------------------------------
/tests/mljar_test.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Mljar tests.
  3 | '''
  4 | import os
  5 | import pandas as pd
  6 | import numpy as np
  7 | import unittest
  8 | import time
  9 | 
 10 | from mljar.client.project import ProjectClient
 11 | from mljar.client.dataset import DatasetClient
 12 | from .project_based_test import ProjectBasedTest, get_postfix
 13 | from mljar.exceptions import BadValueException, IncorrectInputDataException
 14 | from mljar.utils import MLJAR_DEFAULT_TUNING_MODE
 15 | from mljar import Mljar
 16 | 
 17 | class MljarTest(ProjectBasedTest):
 18 | 
 19 |     def setUp(self):
 20 |         self.proj_title = 'Test project-01'+get_postfix()
 21 |         self.proj_task = 'bin_class'
 22 |         self.expt_title = 'Test expt 1'
 23 |         # load data
 24 |         df = pd.read_csv('tests/data/test_1.csv')
 25 |         cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
 26 |         target = 'class'
 27 |         self.X = df[cols]
 28 |         self.y = df[target]
 29 | 
 30 |     def tearDown(self):
 31 |         # clean
 32 |         ProjectBasedTest.clean_projects()
 33 | 
 34 | 
 35 |     def mse(self, predictions, targets):
 36 |         predictions = np.array(predictions)
 37 |         targets = np.array(targets)
 38 |         targets = targets.reshape((targets.shape[0],1))
 39 |         return ((predictions - targets) ** 2).mean()
 40 | 
 41 | 
 42 |     def test_compute_prediction(self):
 43 |         model = Mljar(project = self.proj_title, experiment = self.expt_title,
 44 |                         algorithms = ['rfc'], metric = 'logloss',
 45 |                         validation_kfolds = 3, tuning_mode = 'Normal',
 46 |                         single_algorithm_time_limit = 1)
 47 |         self.assertTrue(model is not None)
 48 |         # fit models and wait till all models are trained
 49 |         model.fit(X = self.X, y = self.y, dataset_title = 'My dataset')
 50 | 
 51 |         # get project id
 52 |         project_id = model.project.hid
 53 |         # get model id
 54 |         model_id = model.selected_algorithm.hid
 55 | 
 56 |         dc = DatasetClient(project_id)
 57 |         init_datasets_cnt = len(dc.get_datasets())
 58 |         # compute predictions
 59 |         pred = Mljar.compute_prediction(self.X, model_id, project_id)
 60 |         # compute score
 61 |         score = self.mse(pred, self.y)
 62 |         self.assertTrue(score < 0.9)
 63 |         # check if dataset was removed
 64 |         self.assertEqual(init_datasets_cnt, len(dc.get_datasets()))
 65 |         # run predictions again, but keep dataset
 66 |         pred = Mljar.compute_prediction(self.X, model_id, project_id, keep_dataset = True)
 67 |         self.assertEqual(init_datasets_cnt+1, len(dc.get_datasets())) # should be one more
 68 | 
 69 | 
 70 |     def test_basic_usage(self):
 71 | 
 72 |         #Test the most common usage.
 73 | 
 74 |         model = Mljar(project = self.proj_title, experiment = self.expt_title,
 75 |                         algorithms = ['xgb'], metric = 'logloss',
 76 |                         validation_kfolds = 3, tuning_mode = 'Normal',
 77 |                         single_algorithm_time_limit = 1)
 78 |         self.assertTrue(model is not None)
 79 |         # fit models and wait till all models are trained
 80 |         model.fit(X = self.X, y = self.y)
 81 |         # run prediction
 82 |         pred = model.predict(self.X)
 83 |         # get MSE
 84 |         score = self.mse(pred, self.y)
 85 |         self.assertTrue(score < 0.9)
 86 | 
 87 |     def test_usage_with_train_split(self):
 88 | 
 89 |         #Test usage with train split.
 90 | 
 91 |         model = Mljar(project = self.proj_title, experiment = self.expt_title,
 92 |                     validation_train_split = 0.8, algorithms = ['xgb'], tuning_mode='Normal',
 93 |                     single_algorithm_time_limit=1)
 94 |         self.assertTrue(model is not None)
 95 |         # fit models and wait till all models are trained
 96 |         model.fit(X = self.X, y = self.y, wait_till_all_done = False)
 97 |         # wait some time
 98 |         time.sleep(80)
 99 |         # run prediction
100 |         pred = model.predict(self.X)
101 |         # get MSE
102 |         score = self.mse(pred, self.y)
103 |         self.assertTrue(score < 0.9)
104 |         # check default validation
105 |         self.assertEqual(model.selected_algorithm.validation_scheme, "Split 80/20, Shuffle, Stratify")
106 | 
107 | 
108 |     def test_usage_with_validation_dataset(self):
109 | 
110 |         #Test usage with validation dataset.
111 | 
112 |         model = Mljar(project = self.proj_title, experiment = self.expt_title,
113 |                             algorithms = ['xgb'], tuning_mode='Normal',
114 |                             single_algorithm_time_limit = 1)
115 |         self.assertTrue(model is not None)
116 |         # load validation data
117 |         df = pd.read_csv('tests/data/test_1_vald.csv')
118 |         cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
119 |         target = 'class'
120 |         X_vald = df[cols]
121 |         y_vald = df[target]
122 |         # fit models and wait till all models are trained
123 |         model.fit(X = self.X, y = self.y, validation_data=(X_vald, y_vald), wait_till_all_done = False)
124 |         # wait some time
125 |         time.sleep(80)
126 |         # run prediction
127 |         pred = model.predict(self.X)
128 |         # get MSE
129 |         score = self.mse(pred, self.y)
130 |         self.assertTrue(score < 0.9)
131 |         # check default validation
132 |         self.assertEqual(model.selected_algorithm.validation_scheme, "With dataset")
133 | 
134 | 
135 | 
136 | 
137 |     def test_empty_project_title(self):
138 |         with self.assertRaises(BadValueException) as context:
139 |             model = Mljar(project = '', experiment = '')
140 | 
141 |     def test_wrong_tuning_mode(self):
142 |         with self.assertRaises(BadValueException) as context:
143 |             model = Mljar(project = self.proj_title, experiment = self.expt_title,
144 |                             tuning_mode = 'Crazy')
145 | 
146 |     def test_default_tuning_mode(self):
147 |         model = Mljar(project = self.proj_title, experiment = self.expt_title)
148 |         self.assertEqual(model.tuning_mode, MLJAR_DEFAULT_TUNING_MODE)
149 | 
150 |     def test_wrong_input_dim(self):
151 |         with self.assertRaises(IncorrectInputDataException) as context:
152 |             model = Mljar(project = self.proj_title, experiment = self.expt_title)
153 |             samples = 100
154 |             columns = 10
155 |             X = np.random.rand(samples, columns)
156 |             y = np.random.choice([0,1], samples+1, replace = True)
157 |             model.fit(X, y)
158 | 
159 |     def test_predict_without_fit(self):
160 |         # Call predict without calling first fit method should return None
161 |         model = Mljar(project = self.proj_title, experiment = self.expt_title)
162 |         pred = model.predict(self.X)
163 |         self.assertTrue(pred is None)
164 | 
165 |     def test_non_wait_fit(self):
166 | 
167 |         #Test the non wait fit.
168 | 
169 |         model = Mljar(project = self.proj_title, experiment = self.expt_title,
170 |                         algorithms = ['xgb'], metric='logloss',
171 |                         validation_kfolds=3, tuning_mode='Normal',
172 |                         single_algorithm_time_limit = 1)
173 |         self.assertTrue(model is not None)
174 |         # fit models, just start computation and do not wait
175 |         start_time = time.time()
176 |         model.fit(X = self.X, y = self.y, wait_till_all_done = False)
177 |         end_time = time.time()
178 |         # time to initialize models should not be greater than 5 minutes
179 |         self.assertTrue(end_time - start_time < 5*60)
180 |         # run prediction
181 |         # good model is not guaranteed
182 |         # but there should be at least one
183 |         max_trys = 50
184 |         pred = None
185 |         while True:
186 |             pred = model.predict(self.X)
187 |             if pred is None:
188 |                 # there is no model ready, please wait
189 |                 time.sleep(10)
190 |             else:
191 |                 break
192 |             max_trys -= 1
193 |             if max_trys <= 0:
194 |                 break
195 | 
196 |         self.assertTrue(pred is not None)
197 |         # get MSE
198 |         score = self.mse(pred, self.y)
199 |         self.assertTrue(score < 0.99)
200 | 
201 |     def test_retrive_models(self):
202 | 
203 |         #Test scenario, when user create project, fit models, and try to once
204 |         #again run project. In this case, there will be no additional computations,
205 |         #all models will be simply retrived from existing project.
206 | 
207 |         model = Mljar(project = self.proj_title, experiment = self.expt_title,
208 |                         algorithms = ['xgb'], metric = 'logloss',
209 |                         validation_kfolds = 3, tuning_mode = 'Normal',
210 |                         single_algorithm_time_limit = 1)
211 |         self.assertTrue(model is not None)
212 |         # fit models and wait till all models are trained
213 |         model.fit(X = self.X, y = self.y)
214 |         # run prediction
215 |         pred = model.predict(self.X)
216 |         # get MSE
217 |         score = self.mse(pred, self.y)
218 |         self.assertTrue(score < 0.1)
219 | 
220 |         # re-use already trained models
221 |         # call fit but models are already trained
222 |         # should be retrived - this should not be longer than 3 minutes
223 |         start_time = time.time()
224 |         model.fit(X = self.X, y = self.y)
225 |         end_time = time.time()
226 |         self.assertTrue(end_time - start_time < 3*60)
227 |         # check prediction
228 |         pred = model.predict(self.X)
229 |         # get MSE
230 |         score_2 = self.mse(pred, self.y)
231 |         self.assertTrue(score_2 < 0.1)
232 |         # scores should be the same
233 |         self.assertTrue(np.abs(score-score_2) < 1e-3)
234 | 
235 |         # re-use project
236 |         start_time = time.time()
237 |         model_2 = Mljar(project = self.proj_title, experiment = self.expt_title,
238 |                         algorithms = ['xgb'], metric = 'logloss',
239 |                         validation_kfolds = 3, tuning_mode = 'Normal',
240 |                         single_algorithm_time_limit = 1)
241 |         self.assertTrue(model_2 is not None)
242 |         # re-use trained models
243 |         model_2.fit(X = self.X, y = self.y)
244 |         end_time = time.time()
245 |         # it should not take longer than 5 minutes
246 |         self.assertTrue(end_time - start_time < 5*60)
247 |         # run prediction
248 |         pred = model_2.predict(self.X)
249 |         # get MSE
250 |         score_3 = self.mse(pred, self.y)
251 |         self.assertTrue(score_3 < 0.1)
252 |         # scores should be the same
253 |         self.assertTrue(np.abs(score-score_3) < 1e-3)
254 |     
255 | 
256 | if __name__ == "__main__":
257 |     unittest.main()
258 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/mljar/mljar.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import os
  3 | import uuid
  4 | import sys
  5 | import json, requests
  6 | import time
  7 | import numpy as np
  8 | 
  9 | from .utils import *
 10 | from .exceptions import IncorrectInputDataException, UndefinedExperimentException
 11 | from .exceptions import MljarException, BadValueException
 12 | 
 13 | from .client.project import ProjectClient
 14 | from .client.dataset import DatasetClient
 15 | from .client.experiment import ExperimentClient
 16 | from .client.result import ResultClient
 17 | from .client.prediction import PredictionClient
 18 | from .client.predictjob import PredictJobClient
 19 | from .client.prediction_download import PredictionDownloadClient
 20 | 
 21 | from .log import logger
 22 | 
 23 | class Mljar(object):
 24 |     '''
 25 |     This is a wrapper over MLJAR API - it does all the stuff.
 26 |     '''
 27 | 
 28 |     def __init__(self, project,
 29 |                         experiment,
 30 |                         metric = '',
 31 |                         algorithms = [],
 32 |                         validation_kfolds = MLJAR_DEFAULT_FOLDS,
 33 |                         validation_shuffle = MLJAR_DEFAULT_SHUFFLE,
 34 |                         validation_stratify = MLJAR_DEFAULT_STRATIFY,
 35 |                         validation_train_split = MLJAR_DEFAULT_TRAIN_SPLIT,
 36 |                         tuning_mode = MLJAR_DEFAULT_TUNING_MODE,
 37 |                         create_ensemble  = MLJAR_DEFAULT_ENSEMBLE,
 38 |                         single_algorithm_time_limit = MLJAR_DEFAULT_TIME_CONSTRAINT):
 39 |         '''
 40 |         Set up MLJAR project and experiment.
 41 |         Args:
 42 |             tuning_mode: This parameter controls number of models that will be checked
 43 |                             for each selected algorithm. There available modes: Normal, Sport, Insane.
 44 |             algorithms: The list of algorithms that will be checked. The list depends on project task which will be guessed based on target column values.
 45 |                         For binary classification task available algorithm are:
 46 |                          - xgb which is for Xgboost
 47 |                          - lgb which is for LightGBM
 48 |                          - mlp which is for Neural Network
 49 |                          - rfc which is for Random Forest
 50 |                          - etc which is for Extra Trees
 51 |                          - rgfc which is for Regularized Greedy Forest
 52 |                          - knnc which is for k-Nearest Neighbors
 53 |                          - logreg which is for Logistic Regression
 54 |                         For regression task there are available algorithms:
 55 |                          - xgbr which is for Xgboost
 56 |                          - lgbr which is for LightGBM
 57 |                          - rgfr which is for Regularized Greedy Forest
 58 |                          - rfr which is for Random Forest
 59 |                          - etr which is for Extra Trees
 60 |                         You can specify the list of algorithms that you want to use, if left blank all algorithms will be used.
 61 |             metric: The metric that will be used for model search and tuning. It depends on project's task.
 62 |                     For binary classification there are metrics:
 63 |                      - auc which is for Area Under ROC Curve
 64 |                      - logloss which is for Logarithmic Loss
 65 |                     For regression tasks:
 66 |                      - rmse which is Root Mean Square Error
 67 |                      - mse which is for Mean Square Error
 68 |                      - mase which is for Mean Absolute Error
 69 |             validation_kfolds: The number of folds to be used in validation,
 70 |                             it is omitted if validation_train_split is not None
 71 |                             or there is validation dataset provided.
 72 |                             It can be number from 2 to 15.
 73 |             validation_shuffle: The boolean which specify if shuffle samples before training.
 74 |                             It is used in k-fold CV and in validation split. Default is set True.
 75 |                             It is ignored when validating with separate dataset.
 76 |             validation_stratify: The boolean which decides whether samples will be
 77 |                             divided into folds with the same class distribution.
 78 |                             In regression tasks this flag is ignored. Default is set to True.
 79 |             validation_train_split: The ratio how to split training dataset into train and validation.
 80 |                             This ratio specify what ratio from input data should be used in training.
 81 |                             It should be from (0.05,0.95) range. If it is not None, then
 82 |                             validation_kfolds variable is ignored.
 83 |             single_algorithm_time_limit: The time in minutes that will be spend for training single algorithm.
 84 |                         Default value is 5 minutes.
 85 |         '''
 86 |         super(Mljar, self).__init__()
 87 |         if project == '' or experiment == '':
 88 |             raise BadValueException('The project or experiment title is undefined')
 89 | 
 90 |         self.project_title    = project
 91 |         self.experiment_title = experiment
 92 |         self.create_ensemble   = create_ensemble
 93 |         self.selected_algorithm = None
 94 |         self.dataset_title      = None
 95 |         self.verbose            = True
 96 | 
 97 |         if tuning_mode is None:
 98 |             tuning_mode = MLJAR_DEFAULT_TUNING_MODE
 99 |         if tuning_mode not in ['Normal', 'Sport', 'Insane']:
100 |             raise BadValueException('There is a wrong tuning mode selected. \
101 |                                         There are available modes: Normal, Sport, Insane.')
102 |         self.tuning_mode = tuning_mode
103 |         # below params are validated later
104 |         self.algorithms = algorithms
105 |         self.metric = metric
106 |         self.single_algorithm_time_limit = single_algorithm_time_limit
107 |         self.wait_till_all_done = True
108 |         self.selected_algorithm = None
109 |         self.project = None
110 |         self.experiment = None
111 | 
112 |         self.validation_kfolds = validation_kfolds
113 |         self.validation_shuffle = validation_shuffle
114 |         self.validation_stratify = validation_stratify
115 |         self.validation_train_split = validation_train_split
116 | 
117 |         if self.validation_kfolds is not None:
118 |             if self.validation_kfolds < 2 or self.validation_kfolds > 15:
119 |                 raise MljarException('Wrong validation_kfolds parameter value, it should be in [2, 15] range.')
120 | 
121 |         if self.validation_train_split is not None:
122 |             if self.validation_train_split < 0.05 or self.validation_train_split > 0.95:
123 |                 raise MljarException('Wrong validation_train_split parameter value, it should be in (0.05, 0.95) range.')
124 | 
125 | 
126 |     def fit(self, X, y, validation_data = None, wait_till_all_done = True, dataset_title = None):
127 |         '''
128 |         Fit models with MLJAR engine.
129 |         Args:
130 |             X: The numpy or pandas matrix with training data.
131 |             y: The numpy or pandas vector with target values.
132 |             validation_data: Tuple (X,y) with validation data.If set to None, then
133 |                                 the k-fold CV or train split validation will be used.
134 |             wait_till_all_done: The flag which decides if fit function will wait
135 |                                 till experiment is done.
136 |             dataset_title: The title of your dataset. It is optional. If missing the
137 |                             random title will be generated.
138 |         '''
139 |         self.wait_till_all_done = wait_till_all_done
140 |         # check input data dimensions
141 |         if len(y.shape) > 1 and y.shape[1] > 1:
142 |             raise IncorrectInputDataException('Sorry, multiple outputs are not supported in MLJAR')
143 |         if y.shape[0] != X.shape[0]:
144 |             raise IncorrectInputDataException('Sorry, there is a missmatch between X and y matrices shapes')
145 | 
146 |         try:
147 |             self._start_experiment(X, y, validation_data, dataset_title)
148 |         except Exception as e:
149 |             print('Ups, {0}'.format(str(e)))
150 | 
151 | 
152 |     def _start_experiment(self, X, y, validation_data = None, dataset_title = None):
153 | 
154 |         # define project task
155 |         self.project_task = 'bin_class' if len(np.unique(y)) == 2 else 'reg'
156 |         #
157 |         # check if project with such title exists
158 |         #
159 |         logger.info('MLJAR: add project')
160 |         self.project = ProjectClient().create_project_if_not_exists(self.project_title, self.project_task)
161 |         #
162 |         # add a dataset to project
163 |         #
164 |         logger.info('MLJAR: add training dataset')
165 |         self.dataset = DatasetClient(self.project.hid).add_dataset_if_not_exists(X, y, title_prefix = 'Training-', dataset_title = dataset_title)
166 | 
167 |         self.dataset_vald = None
168 |         if validation_data is not None:
169 |             if len(validation_data) != 2:
170 |                 raise MljarException('Wrong format of validation data. It should be tuple (X,y)')
171 |             logger.info('MLJAR: add validation dataset')
172 |             X_vald, y_vald = validation_data
173 |             self.dataset_vald = DatasetClient(self.project.hid).add_dataset_if_not_exists(X_vald, y_vald, title_prefix = 'Validation-')
174 |         #
175 |         # add experiment to project
176 |         #
177 |         logger.info('MLJAR: add experiment')
178 |         self.experiment = ExperimentClient(self.project.hid).add_experiment_if_not_exists(self.dataset, self.dataset_vald, \
179 |                                                     self.experiment_title, self.project_task, \
180 |                                                     self.validation_kfolds, self.validation_shuffle, \
181 |                                                     self.validation_stratify, self.validation_train_split, \
182 |                                                     self.algorithms, self.metric, \
183 |                                                     self.tuning_mode, self.single_algorithm_time_limit, self.create_ensemble)
184 |         if self.experiment is None:
185 |             raise UndefinedExperimentException()
186 |         #
187 |         # get results
188 |         #
189 |         # results = ResultClient(self.project.hid).get_results(self.experiment.hid)
190 |         #
191 |         # wait for models ...
192 |         #
193 |         if self.wait_till_all_done:
194 |             self.selected_algorithm = self._wait_till_all_models_trained()
195 | 
196 |     def _wait_till_all_models_trained(self):
197 |         WAIT_INTERVAL = 10.0
198 |         loop_max_counter = 24*360 # 24 hours of max waiting, is enough ;)
199 |         results = None
200 |         max_error_cnt = 5
201 |         current_error_cnt = 0
202 |         while True:
203 |             loop_max_counter -= 1
204 |             if loop_max_counter <= 0:
205 |                 break
206 |             try:
207 |                 # get current state of the results
208 |                 results = ResultClient(self.project.hid).get_results(self.experiment.hid)
209 |                 # check if experiment is done, if yes then stop training
210 |                 self.experiment = ExperimentClient(self.project.hid).get_experiment(self.experiment.hid)
211 |                 if self.experiment.compute_now == 2:
212 |                     break
213 |                 # print current state of the results
214 |                 initiated_cnt, learning_cnt, done_cnt, error_cnt = self._get_results_stats(results)
215 |                 eta = self._asses_total_training_time(results)
216 |                 if initiated_cnt + learning_cnt + done_cnt + error_cnt == 0:
217 |                     eta = 'estimating'
218 |                 else:
219 |                     eta = round(eta, 2)
220 |                 sys.stdout.write("\rinitiated: {}, learning: {}, done: {}, error: {} | ETA: {} minutes                         ".format(initiated_cnt, learning_cnt, done_cnt, error_cnt, eta))
221 |                 sys.stdout.flush()
222 | 
223 |                 time.sleep(WAIT_INTERVAL)
224 |             except KeyboardInterrupt:
225 |                 break
226 |             except Exception as e:
227 |                 logger.error('There is some problem while waiting for models, %s' % str(e))
228 |                 current_error_cnt += 1
229 |                 if current_error_cnt >= max_error_cnt:
230 |                     break
231 |         logger.info('Get the best result')
232 |         print('') # add new line
233 |         # get the best result!
234 |         return self._get_the_best_result(results)
235 | 
236 | 
237 | 
238 |     def _asses_total_training_time(self, results):
239 |         '''
240 |             Estimated time of models arrival, in minutes.
241 |         '''
242 |         single_alg_limit = float(self.experiment.params.get('single_limit', 5.0))
243 |         initiated_cnt, learning_cnt, done_cnt, error_cnt = self._get_results_stats(results)
244 |         total = (initiated_cnt * single_alg_limit) / float(max(learning_cnt,1))
245 |         total += 0.5 * single_alg_limit
246 |         return total
247 | 
248 |     def _get_results_stats(self, results):
249 |         initiated_cnt, learning_cnt, done_cnt, error_cnt = 0, 0, 0, 0
250 |         for r in results:
251 |             if r.status == 'Initiated':
252 |                 initiated_cnt += 1
253 |             elif r.status == 'Learning':
254 |                 learning_cnt += 1
255 |             elif r.status == 'Done':
256 |                 done_cnt += 1
257 |             else: # error
258 |                 error_cnt += 1
259 |         return initiated_cnt, learning_cnt, done_cnt, error_cnt
260 | 
261 |     def _get_the_best_result(self, results):
262 |         the_best_result = None
263 |         if self.experiment.compute_now in [1, 2]:
264 |             opt_direction = 1 if self.experiment.metric \
265 |                                         not in MLJAR_OPT_MAXIMIZE else -1
266 |             min_value = 10e12
267 |             for r in results:
268 |                 if r.metric_value is None:
269 |                     continue
270 |                 if r.metric_value*opt_direction < min_value:
271 |                     min_value = r.metric_value*opt_direction
272 |                     the_best_result = r
273 |         return the_best_result
274 | 
275 | 
276 |     def predict(self, X):
277 |         if self.project is None or self.experiment is None:
278 |             print('Can not run prediction.')
279 |             print('Please run fit method first, to start models training and to retrieve them ;)')
280 |             return None
281 |         if self.selected_algorithm is None:
282 |             results = ResultClient(self.project.hid).get_results(self.experiment.hid)
283 |             self.selected_algorithm = self._get_the_best_result(results)
284 |             if self.experiment.compute_now != 2:
285 |                 if self.selected_algorithm is not None:
286 |                     print('DISCLAIMER:')
287 |                     print('Your experiment is not yet finished.')
288 |                     print('You will use the best model up to now.')
289 |                     print('You can obtain better results if you wait till experiment is finished.')
290 |                 else:
291 |                     print('There is no ready model to use for prediction.')
292 |                     print('Please wait and try in a moment')
293 |                     return None
294 | 
295 |         if self.selected_algorithm is not None:
296 | 
297 |             return Mljar.compute_prediction(X, self.selected_algorithm.hid, self.project.hid)
298 |             '''
299 |             # chack if dataset exists in mljar if not upload dataset for prediction
300 |             dataset = DatasetClient(self.project.hid).add_dataset_if_not_exists(X, y = None)
301 | 
302 |             # check if prediction is available
303 |             total_checks = 100
304 |             for i in xrange(total_checks):
305 |                 prediction = PredictionClient(self.project.hid).\
306 |                                 get_prediction(dataset.hid, self.selected_algorithm.hid)
307 | 
308 |                 # prediction is not available, first check so submit job
309 |                 if i == 0 and prediction is None:
310 |                     # create prediction job
311 |                     submitted = PredictJobClient().submit(self.project.hid, dataset.hid,
312 |                                                             self.selected_algorithm.hid)
313 |                     if not submitted:
314 |                         logger.error('Problem with prediction for your dataset')
315 |                         return None
316 | 
317 |                 if prediction is not None:
318 |                     pred = PredictionDownloadClient().download(prediction.hid)
319 |                     #sys.stdout.write('\r\n')
320 |                     return pred
321 | 
322 |                 #sys.stdout.write('\rFetch predictions: {0}%'.format(round(i/(total_checks*0.01))))
323 |                 #sys.stdout.flush()
324 |                 time.sleep(5)
325 | 
326 |             #sys.stdout.write('\r\n')
327 |             logger.error('Sorry, there was some problem with computing prediction for your dataset. \
328 |                             Please login to mljar.com to your account and check details.')
329 |             return None
330 |             '''
331 | 
332 | 
333 |     @staticmethod
334 |     def compute_prediction(X, model_id, project_id, keep_dataset = False, dataset_title = None):
335 | 
336 | 
337 |         # chack if dataset exists in mljar if not upload dataset for prediction
338 |         dataset = DatasetClient(project_id).add_dataset_if_not_exists(X, y = None, title_prefix = 'Testing-', dataset_title = dataset_title)
339 | 
340 |         # check if prediction is available
341 |         total_checks = 1000
342 |         for i in range(total_checks):
343 |             prediction = PredictionClient(project_id).\
344 |                             get_prediction(dataset.hid, model_id)
345 | 
346 |             # prediction is not available, first check so submit job
347 |             if i == 0 and prediction is None:
348 |                 # create prediction job
349 |                 submitted = PredictJobClient().submit(project_id, dataset.hid,
350 |                                                         model_id)
351 |                 if not submitted:
352 |                     logger.error('Problem with prediction for your dataset')
353 |                     return None
354 | 
355 |             if prediction is not None:
356 |                 pred = PredictionDownloadClient().download(prediction.hid)
357 |                 #sys.stdout.write('\r\n')
358 |                 if not keep_dataset:
359 |                     DatasetClient(project_id).delete_dataset(dataset.hid)
360 |                 return pred
361 | 
362 |             #sys.stdout.write('\rFetch predictions: {0}%'.format(round(i/(total_checks*0.01))))
363 |             #sys.stdout.flush()
364 |             time.sleep(10)
365 | 
366 |         #sys.stdout.write('\r\n')
367 |         logger.error('Sorry, there was some problem with computing prediction for your dataset. \
368 |                         Please login to mljar.com to your account and check details.')
369 |         return None
370 | 


--------------------------------------------------------------------------------