├── .gitignore ├── LICENSE.txt ├── PythonSDK.pyproj ├── PythonSDK.sln ├── README.md ├── azureml ├── __init__.py ├── errors.py ├── http.py ├── serialization.py └── services.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── README.md ├── __init__.py ├── coverage.bat ├── foo.txt ├── lib.py ├── performancetests.py ├── roundtriptests.py ├── serialize_test.py ├── servicestests.py └── unittests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python cache 2 | __pycache__/ 3 | *.pyc 4 | 5 | # PTVS analysis 6 | .ptvs/ 7 | 8 | # Build results 9 | /bin/ 10 | /obj/ 11 | 12 | # Python setup.py output 13 | /azureml.egg-info/ 14 | /dist/ 15 | /build/ 16 | 17 | # Test results 18 | /TestResults/ 19 | 20 | # Credentials 21 | azuremltestsettings.json 22 | 23 | # User-specific files 24 | *.suo 25 | *.user 26 | *.sln.docstates 27 | 28 | # Windows image file caches 29 | Thumbs.db 30 | ehthumbs.db 31 | 32 | # Folder config file 33 | Desktop.ini 34 | 35 | # Recycle Bin used on file shares 36 | $RECYCLE.BIN/ 37 | 38 | # Mac desktop service store files 39 | .DS_Store 40 | 41 | .idea 42 | src/build 43 | *.iml 44 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation 2 | All rights reserved. 3 | 4 | MIT License: 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /PythonSDK.pyproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | 2.0 6 | eb114967-b952-4108-b806-f72c334ff3be 7 | 8 | 9 | 10 | 11 | . 12 | . 13 | . 
14 | PythonSDK 15 | PythonSDK 16 | {6d533506-2bd2-4a3f-ba63-0a02b57e03ad} 17 | 2.7 18 | False 19 | SAK 20 | SAK 21 | SAK 22 | SAK 23 | 24 | 25 | true 26 | false 27 | 28 | 29 | true 30 | false 31 | 32 | 33 | 10.0 34 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | Code 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | Code 55 | 56 | 57 | Code 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /PythonSDK.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.31101.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PythonSDK", "PythonSDK.pyproj", "{EB114967-B952-4108-B806-F72C334FF3BE}" 7 | EndProject 8 | Global 9 | GlobalSection(TeamFoundationVersionControl) = preSolution 10 | SccNumberOfProjects = 2 11 | SccEnterpriseProvider = {4CA58AB2-18FA-4F8D-95D4-32DDF27D184C} 12 | SccTeamFoundationServer = http://sqlbuvsts01:8080/main 13 | SccLocalPath0 = . 14 | SccProjectUniqueName1 = PythonSDK.pyproj 15 | SccLocalPath1 = . 16 | EndGlobalSection 17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 18 | Debug|Any CPU = Debug|Any CPU 19 | Release|Any CPU = Release|Any CPU 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {EB114967-B952-4108-B806-F72C334FF3BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {EB114967-B952-4108-B806-F72C334FF3BE}.Release|Any CPU.ActiveCfg = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Microsoft Azure Machine Learning Python client library for Azure ML Studio 2 | ========================================================================== 3 | 4 | > **NOTE** This content is no longer maintained. Visit the [Azure Machine Learning Notebook](https://github.com/Azure/MachineLearningNotebooks) project for sample Jupyter notebooks for ML and deep learning with Azure Machine Learning using the Python SDK. 5 | 6 | The preview of Azure Machine Learning Python client library lets you access your Azure ML Studio datasets from your local Python environment. 7 | 8 | You can download datasets that are available in your ML Studio workspace, or intermediate datasets from experiments that were run. You can upload new datasets and update existing datasets. The data is optionally converted to/from a Pandas DataFrame. 9 | 10 | This is a technology preview. The APIs exposed by the library and the REST endpoints it connects to are subject to change. 11 | 12 | 13 | Installation 14 | ============ 15 | 16 | The SDK has been tested with Python 2.7, 3.3 and 3.4. 
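Once the package is installed (the pip command is shown below), a quick sanity check is to import the library and print its version; `azureml.__version__` is re-exported from `azureml.http`, so this is a minimal sketch of verifying the install:

```python
# Quick post-install sanity check: import the library and show its version.
import azureml

print(azureml.__version__)   # e.g. '0.2.7'
```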
17 | 18 | It has a dependency on the following packages: 19 | 20 | - requests 21 | - python-dateutil 22 | - pandas 23 | 24 | 25 | You can install it from [PyPI](https://pypi.python.org/pypi/azureml): 26 | 27 | ``` 28 | pip install azureml 29 | ``` 30 | 31 | 32 | Usage 33 | ===== 34 | 35 | Note: We recommend that you use the **Generate Data Access Code** feature from [Azure Machine Learning Studio](https://studio.azureml.net) in order to get Python code snippets that give you access to your datasets. The code snippets include your workspace id, authorization token, and other necessary identifiers to get to your datasets. 36 | 37 | Accessing your workspace 38 | ------------------------ 39 | 40 | You'll need to obtain your workspace id and token in order to get access to your workspace. 41 | 42 | ```python 43 | from azureml import Workspace 44 | 45 | ws = Workspace(workspace_id='4c29e1adeba2e5a7cbeb0e4f4adfb4df', 46 | authorization_token='f4f3ade2c6aefdb1afb043cd8bcf3daf') 47 | ``` 48 | 49 | If you're using AzureML in a region other than South Central US you'll also need to specify the endpoint: 50 | 51 | ```python 52 | from azureml import Workspace 53 | 54 | ws = Workspace(workspace_id='4c29e1adeba2e5a7cbeb0e4f4adfb4df', 55 | authorization_token='f4f3ade2c6aefdb1afb043cd8bcf3daf', 56 | endpoint='https://europewest.studio.azureml.net/') 57 | ``` 58 | 59 | Specify workspace via config 60 | ---------------------------- 61 | If you don't want to store your access tokens in code you can also put them in a configuration file. The SDK will look for ~/.azureml/settings.ini and if available use that: 62 | 63 | ``` 64 | [workspace] 65 | id=4c29e1adeba2e5a7cbeb0e4f4adfb4df 66 | authorization_token=f4f3ade2c6aefdb1afb043cd8bcf3daf 67 | api_endpoint=https://studio.azureml.net 68 | management_endpoint=https://management.azureml.net 69 | ``` 70 | 71 | And then the workspace can be created without arguments: 72 | 73 | ```python 74 | from azureml import Workspace 75 | 76 | ws = Workspace() 77 | ``` 78 | 79 | 80 | Accessing datasets 81 | ------------------ 82 | 83 | To enumerate all datasets in a given workspace: 84 | 85 | ```python 86 | for ds in ws.datasets: 87 | print(ds.name) 88 | ``` 89 | 90 | Just the user-created datasets: 91 | 92 | ```python 93 | for ds in ws.user_datasets: 94 | print(ds.name) 95 | ``` 96 | 97 | Just the example datasets: 98 | 99 | ```python 100 | for ds in ws.example_datasets: 101 | print(ds.name) 102 | ``` 103 | 104 | You can access a dataset by name (which is case-sensitive): 105 | 106 | ```python 107 | ds = ws.datasets['my dataset name'] 108 | ``` 109 | 110 | By index: 111 | 112 | ```python 113 | ds = ws.datasets[0] 114 | ``` 115 | 116 | 117 | Dataset metadata 118 | ---------------- 119 | 120 | Every dataset has metadata in addition to its content. 121 | 122 | Some metadata values are assigned by the user at creation time: 123 | 124 | ```python 125 | print(ds.name) 126 | print(ds.description) 127 | print(ds.family_id) 128 | print(ds.data_type_id) 129 | ``` 130 | 131 | Others are values assigned by Azure ML: 132 | 133 | ```python 134 | print(ds.id) 135 | print(ds.created_date) 136 | print(ds.size) 137 | ``` 138 | 139 | See the `SourceDataset` class for more on the available metadata. 140 | 141 | 142 | Reading contents 143 | ---------------- 144 | 145 | You can import the dataset contents as a pandas DataFrame object. 146 | The `data_type_id` metadata on the dataset is used to determine how to import the contents. 
147 | 148 | ```python 149 | frame = ds.to_dataframe() 150 | ``` 151 | 152 | If a dataset is in a format that cannot be deserialized to a pandas DataFrame, the dataset object will not have a to_dataframe method. 153 | 154 | You can still read those datasets as text or binary, then parse the data manually. 155 | 156 | Read the contents as text: 157 | 158 | ```python 159 | text_data = ds.read_as_text() 160 | ``` 161 | 162 | Read the contents as binary: 163 | 164 | ```python 165 | binary_data = ds.read_as_binary() 166 | ``` 167 | 168 | You can also just open a stream to the contents: 169 | 170 | ```python 171 | with ds.open() as file: 172 | binary_data_chunk = file.read(1000) 173 | ``` 174 | 175 | This gives you more control over the memory usage, as you can read and parse the data in chunks. 176 | 177 | 178 | Accessing intermediate datasets 179 | ------------------------------- 180 | 181 | You can access the intermediate datasets at the output ports of the nodes in your experiments. 182 | 183 | Note that the default binary serialization format (.dataset) for intermediate datasets is not supported. Make sure to use a Convert to TSV or Convert to CSV module and read the intermediate dataset from its output port. 184 | 185 | First, get the experiment, using the experiment id: 186 | 187 | ```python 188 | experiment = ws.experiments['my experiment id'] 189 | ``` 190 | 191 | Then get the intermediate dataset object: 192 | 193 | ```python 194 | ds = experiment.get_intermediate_dataset( 195 | node_id='5c457225-68e3-4b60-9e3a-bc55f9f029a4-565', 196 | port_name='Results dataset', 197 | data_type_id=DataTypeIds.GenericCSV 198 | ) 199 | ``` 200 | 201 | To determine the values to pass to `get_intermediate_dataset`, use the **Generate Data Access Code** command on the module output port in ML Studio. 202 | 203 | You can then read the intermediate dataset contents just like you do for a regular dataset: 204 | 205 | ```python 206 | frame = ds.to_dataframe() 207 | ``` 208 | 209 | You can also use `open`, `read_as_text` and `read_as_binary`. 210 | 211 | Note that intermediate datasets do not have any metadata available. 212 | 213 | 214 | Creating a new dataset 215 | ---------------------- 216 | 217 | After you've manipulated the data, you can upload it as a new dataset on Azure ML. 218 | 219 | This will serialize the pandas DataFrame object to the format specified in the 220 | `data_type_id` parameter, then upload it to Azure ML. 221 | 222 | ```python 223 | dataset = workspace.datasets.add_from_dataframe( 224 | dataframe=frame, 225 | data_type_id=DataTypeIds.GenericCSV, 226 | name='my new dataset', 227 | description='my description' 228 | ) 229 | ``` 230 | 231 | If you want to serialize the data yourself, you can upload the raw data. Note 232 | that you still have to indicate the format of the data. 233 | 234 | ```python 235 | raw_data = my_own_csv_serialization_function(frame) 236 | dataset = workspace.datasets.add_from_raw_data( 237 | raw_data=raw_data, 238 | data_type_id=DataTypeIds.GenericCSV, 239 | name='my new dataset', 240 | description='my description' 241 | ) 242 | ``` 243 | 244 | After it's added, it's immediately accessible from the datasets collection. 245 | 246 | If you attempt to create a new dataset with a name that matches an existing dataset, an AzureMLConflictHttpError will be raised. 
247 |
248 | ```python
249 | from azureml import AzureMLConflictHttpError
250 |
251 | try:
252 |     workspace.datasets.add_from_dataframe(
253 |         dataframe=frame,
254 |         data_type_id=DataTypeIds.GenericCSV,
255 |         name='not a unique name',
256 |         description='my description'
257 |     )
258 | except AzureMLConflictHttpError:
259 |     print('Try again with a unique name!')
260 | ```
261 |
262 | To update an existing dataset, you can use `update_from_dataframe` or `update_from_raw_data`:
263 |
264 | ```python
265 | name = 'my existing dataset'
266 | dataset = workspace.datasets[name]
267 |
268 | dataset.update_from_dataframe(dataframe=frame)
269 | ```
270 |
271 | You can optionally change the name, description or the format of the data too:
272 |
273 | ```python
274 | name = 'my existing dataset'
275 | dataset = workspace.datasets[name]
276 |
277 | dataset.update_from_dataframe(
278 |     dataframe=frame,
279 |     data_type_id=DataTypeIds.GenericCSV,
280 |     name='my new name',
281 |     description='my new description'
282 | )
283 | ```
284 |
285 | If you attempt to create a new dataset with an invalid name, or if Azure ML rejects the dataset for any other reason, an AzureMLHttpError will be raised. AzureMLHttpError is raised whenever the HTTP status code indicates a failure. A detailed error message can be displayed by printing the exception, and the HTTP status code is stored in the `status_code` field.
286 |
287 | ```python
288 | from azureml import AzureMLHttpError
289 |
290 | try:
291 |     workspace.datasets.add_from_dataframe(
292 |         dataframe=frame,
293 |         data_type_id=DataTypeIds.GenericCSV,
294 |         name='invalid:name',
295 |         description='my description'
296 |     )
297 | except AzureMLHttpError as error:
298 |     print(error.status_code)
299 |     print(error)
300 | ```
301 |
302 | Services Usage
303 | ==============
304 | The services subpackage allows you to easily publish and consume AzureML web services. Currently only Python 2.7 is supported for services, because the back end only has Python 2.7 installed.
305 |
306 | Publishing
307 | ----------
308 |
309 | Python functions can be published either by using the @publish decorator or by calling the publish method directly. To publish a function using the decorator:
310 |
311 | ```python
312 | from azureml import services
313 |
314 | @services.publish(workspace, workspace_token)
315 | @services.types(a = float, b = float)
316 | @services.returns(float)
317 | def func(a, b):
318 |     return a / b
319 | ```
320 |
321 | This publishes a function which takes two floating point values and divides them. Alternatively, you can publish a function by calling the publish method directly:
322 |
323 | ```python
324 | my_func = publish(my_func, workspace, workspace_token, files_list, endpoint=None)
325 | ```
326 |
327 | If a function has no source file associated with it (for example, you're developing inside a REPL environment) then the function's byte code is serialized. If the function refers to any global variables, those are also serialized using Pickle. In this mode, all of the state the function refers to needs to be defined already (e.g. your published function should come after any other functions it calls).
328 |
329 | If a function is saved on disk then the entire module the function is defined in is serialized and re-executed on the server to get the function back. In this mode the entire contents of the file are serialized and the order of the function definitions doesn't matter.
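For example, when publishing from a REPL, a helper that the published function calls must already be defined so that it is pickled along with the function's byte code. This is only a sketch; the `workspace`, `workspace_token` and function names below are placeholders, not part of the SDK:

```python
from azureml import services

def _to_fahrenheit(celsius):
    # Defined before the published function so it is captured (pickled) with it.
    return celsius * 9.0 / 5.0 + 32.0

@services.publish(workspace, workspace_token)
@services.types(celsius = float)
@services.returns(float)
def to_fahrenheit(celsius):
    return _to_fahrenheit(celsius)
```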
330 |
331 | After the function is published there will be a "service" property on the function. This object has several properties of interest:
332 |
333 | | Property | Description |
334 | | ------------- |:-------------:|
335 | | url | the endpoint for executing the function |
336 | | api_key | the API key which is required to invoke the function |
337 | | help_url | a human-readable page which describes the parameters and results of the function. It also includes sample code for executing it from various languages. |
338 | | service_id | a unique GUID identifying the service in your workspace. You can re-use this ID to update the service once it's published. |
339 |
340 | You can specify a list of files which should be published along with the function.
341 | The resulting files will be stored in a subdirectory called 'Script Bundle'. The
342 | list of files can be one of:
343 |
344 | | Format | Description |
345 | | ------------------------------------------ |:---------------------------------------------------------------:|
346 | | (('file1.txt', None), ) | file is read from disk |
347 | | (('file1.txt', b'contents'), ) | file contents are provided |
348 | | ('file1.txt', 'file2.txt') | files are read from disk, written with the same filenames |
349 | | ((('file1.txt', 'destname.txt'), None), ) | file is read from disk, written with a different destination filename |
350 |
351 |
352 | The various formats for each filename can be freely mixed and matched. Files can also be attached using the @attach decorator:
353 |
354 | ```python
355 | @publish(...)
356 | @attach('file1.txt')
357 | def f(x):
358 |     pass
359 | ```
360 |
361 | The decorator supports the same file formats as the list.
362 |
363 | If you are using AzureML from a different geography (for example West Europe or East Asia) you'll need to specify the endpoint to connect to. The endpoint is your region plus "management.azureml.net", for example: https://europewest.management.azureml.net
364 |
365 | Consumption
366 | -----------
367 |
368 | Existing services can be consumed using the service decorator. An empty function body is supplied, and the resulting function becomes invokable and calls the published service:
369 |
370 | ```python
371 | from azureml import services
372 |
373 | @services.service(url, api_key)
374 | @services.types(a = float, b = float)
375 | @services.returns(float)
376 | def func(a, b):
377 |     pass
378 | ```
379 |
380 | Controlling publishing / consumption
381 | ------------------------------------
382 |
383 | There are several decorators which are used to control how the invocation occurs.
384 |
385 | ### types(**kwargs)
386 | Specifies the types used for the arguments of a published or consumed service.
387 |
388 | The type annotations are optional and are used to provide information which allows the service to interoperate with other languages. The type information appears on the help page of the published service. If the type information is not provided, a Python-specific format is used and other languages may not be able to call the service.
389 |
390 | Supported types are: int, bool, float, unicode.
391 |
392 | When an unsupported type is specified, the type will be serialized using an internal representation based upon Python's Pickle protocol. This will prevent the web service from being used with other languages.
393 |
394 | When working with strings you need to use the unicode data type. This is because the string data type used for interop is actually a Unicode string, while Python 2's "str" objects are actually byte arrays.
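As a sketch (reusing the same placeholder `workspace` and `workspace_token` as the earlier examples), a string-handling service would therefore be declared with unicode rather than str:

```python
from azureml import services

@services.publish(workspace, workspace_token)
@services.types(first = unicode, last = unicode)
@services.returns(unicode)
def full_name(first, last):
    # unicode in, unicode out, so the service interoperates with other languages
    return first + u' ' + last
```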
395 |
397 |
398 | ### returns(return_type)
399 | Specifies the return type for a published service.
400 |
401 | Like the parameter types, this is also optional; when omitted, an internal Python format is used and interoperability with other languages may be reduced.
402 |
403 | Supported types are: int, bool, float, unicode.
404 |
405 | When an unsupported type is specified, the type will be serialized using an internal representation based upon Python's Pickle protocol. This will prevent the web service from being used with other languages.
406 |
407 | When working with strings you need to use the unicode data type. This is because the string data type used for interop is actually a Unicode string, while Python 2's "str" objects are actually byte arrays.
408 |
409 | ### service_id(id)
410 | Specifies the service ID for a service. When publishing to the same service ID, the service is updated instead of a new service being created.
411 |
412 | ### name(name)
413 | Specifies a friendly name for a service. By default the name is the function name, but this allows names with spaces or
414 | other characters which are not allowed in function names.
415 |
416 | ### attach(name, contents)
417 | Attaches a file to the payload to be uploaded.
418 |
419 | If contents is omitted, the file is read from disk.
420 | If name is a tuple, it specifies the on-disk filename and the destination filename.
421 |
422 | ### dataframe_service
423 | Indicates that the function operates on a data frame. The function
424 | will receive a single input in the form of a data frame, and should return
425 | a data frame object. The schema of the data frame is specified with this
426 | decorator.
427 |
428 | ```python
429 | @publish(...)
430 | @dataframe_service(a = int, b = int)
431 | @returns(int)
432 | def myfunc(df):
433 |     return pandas.DataFrame([df['a'][i] + df['b'][i] for i in range(df.shape[0])])
434 | ```
435 |
436 | This code can then be invoked either with:
437 | ```python
438 | myfunc(1, 2)
439 | ```
440 |
441 | or:
442 |
443 | ```python
444 | myfunc.map([[1,2], [3,4]])
445 | ```
446 |
447 | ### input_name
448 | Specifies the name of the input the web service expects to receive. Defaults to 'input1'. Currently this is only
449 | supported on consumption.
450 |
451 | ### output_name
452 | Specifies the name of the output the web service returns. Defaults to 'output1'. Currently this is only
453 | supported on consumption.
454 |
455 |
456 |
--------------------------------------------------------------------------------
/azureml/__init__.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | from datetime import datetime 27 | 28 | import numbers 29 | import re 30 | import sys 31 | import json 32 | try: 33 | import ConfigParser 34 | except ImportError: 35 | import configparser as ConfigParser 36 | 37 | from os import path 38 | 39 | try: 40 | from cStringIO import BytesIO 41 | except ImportError: 42 | from io import BytesIO 43 | 44 | from azureml.errors import ( 45 | AzureMLConflictHttpError, 46 | AzureMLError, 47 | AzureMLHttpError, 48 | UnsupportedDatasetTypeError, 49 | _not_none, 50 | _not_none_or_empty, 51 | ) 52 | from azureml.http import ( 53 | _RestClient, 54 | __author__, 55 | __version__, 56 | ) 57 | from azureml.serialization import ( 58 | DataTypeIds, 59 | deserialize_dataframe, 60 | serialize_dataframe, 61 | is_supported, 62 | ) 63 | 64 | 65 | _GLOBAL_WORKSPACE_ID = '506153734175476c4f62416c57734963' 66 | 67 | 68 | class Endpoints(object): 69 | """Constants for the known REST API endpoints.""" 70 | default = 'https://studio.azureml.net' 71 | management_default = 'https://management.azureml.net' 72 | 73 | 74 | class Dataset(object): 75 | """Abstract base class for Azure ML datasets.""" 76 | pass 77 | 78 | 79 | class SourceDataset(Dataset): 80 | """Metadata for a dataset and methods to read its contents.""" 81 | 82 | def __init__(self, workspace=None, metadata=None): 83 | """ 84 | INTERNAL USE ONLY. Initialize a dataset. 85 | 86 | Parameters 87 | ---------- 88 | workspace : Workspace 89 | Parent workspace of the dataset. 90 | metadata : dict 91 | Dictionary of dataset metadata as returned by the REST API. 
92 | """ 93 | _not_none('metadata', metadata) 94 | _not_none('workspace', workspace) 95 | 96 | self.workspace = workspace 97 | self._metadata = metadata 98 | 99 | if is_supported(self.data_type_id): 100 | self.to_dataframe = self._to_dataframe 101 | 102 | if not self.is_example: 103 | self.update_from_raw_data = self._update_from_raw_data 104 | self.update_from_dataframe = self._update_from_dataframe 105 | 106 | @staticmethod 107 | def _metadata_repr(metadata): 108 | val = metadata['Name'] 109 | if sys.version_info < (3,): 110 | return val.encode('ascii','ignore') 111 | else: 112 | return val 113 | 114 | def __repr__(self): 115 | return SourceDataset._metadata_repr(self._metadata) 116 | 117 | def open(self): 118 | '''Open and return a stream for the dataset contents.''' 119 | return self.workspace._rest.open_dataset_contents(self.contents_url) 120 | 121 | def read_as_binary(self): 122 | '''Read and return the dataset contents as binary.''' 123 | return self.workspace._rest.read_dataset_contents_binary(self.contents_url) 124 | 125 | def read_as_text(self): 126 | '''Read and return the dataset contents as text.''' 127 | return self.workspace._rest.read_dataset_contents_text(self.contents_url) 128 | 129 | def _to_dataframe(self): 130 | """Read and return the dataset contents as a pandas DataFrame.""" 131 | with self.open() as reader: 132 | return deserialize_dataframe(reader, self.data_type_id) 133 | 134 | def _update_from_dataframe(self, dataframe, data_type_id=None, name=None, 135 | description=None): 136 | """ 137 | Serialize the specified DataFrame and replace the existing dataset. 138 | 139 | Parameters 140 | ---------- 141 | dataframe : pandas.DataFrame 142 | Data to serialize. 143 | data_type_id : str, optional 144 | Format to serialize to. 145 | If None, the existing format is preserved. 146 | Supported formats are: 147 | 'PlainText' 148 | 'GenericCSV' 149 | 'GenericTSV' 150 | 'GenericCSVNoHeader' 151 | 'GenericTSVNoHeader' 152 | See the azureml.DataTypeIds class for constants. 153 | name : str, optional 154 | Name for the dataset. 155 | If None, the name of the existing dataset is used. 156 | description : str, optional 157 | Description for the dataset. 158 | If None, the name of the existing dataset is used. 159 | """ 160 | _not_none('dataframe', dataframe) 161 | 162 | if data_type_id is None: 163 | data_type_id = self.data_type_id 164 | if name is None: 165 | name = self.name 166 | if description is None: 167 | description = self.description 168 | 169 | try: 170 | output = BytesIO() 171 | serialize_dataframe(output, data_type_id, dataframe) 172 | raw_data = output.getvalue() 173 | finally: 174 | output.close() 175 | 176 | self._upload_and_refresh(raw_data, data_type_id, name, description) 177 | 178 | def _update_from_raw_data(self, raw_data, data_type_id=None, name=None, 179 | description=None): 180 | """ 181 | Upload already serialized raw data and replace the existing dataset. 182 | 183 | Parameters 184 | ---------- 185 | raw_data: bytes 186 | Dataset contents to upload. 187 | data_type_id : str 188 | Serialization format of the raw data. 189 | If None, the format of the existing dataset is used. 190 | Supported formats are: 191 | 'PlainText' 192 | 'GenericCSV' 193 | 'GenericTSV' 194 | 'GenericCSVNoHeader' 195 | 'GenericTSVNoHeader' 196 | 'ARFF' 197 | See the azureml.DataTypeIds class for constants. 198 | name : str, optional 199 | Name for the dataset. 200 | If None, the name of the existing dataset is used. 201 | description : str, optional 202 | Description for the dataset. 
203 | If None, the name of the existing dataset is used. 204 | """ 205 | _not_none('raw_data', raw_data) 206 | 207 | if data_type_id is None: 208 | data_type_id = self.data_type_id 209 | if name is None: 210 | name = self.name 211 | if description is None: 212 | description = self.description 213 | 214 | self._upload_and_refresh(raw_data, data_type_id, name, description) 215 | 216 | def _upload_and_refresh(self, raw_data, data_type_id, name, description): 217 | dataset_id = self.workspace._rest.upload_dataset( 218 | self.workspace.workspace_id, 219 | name, 220 | description, 221 | data_type_id, 222 | raw_data, 223 | self.family_id 224 | ) 225 | 226 | self._metadata = self.workspace._rest.get_dataset( 227 | self.workspace.workspace_id, 228 | dataset_id 229 | ) 230 | 231 | class Location(object): 232 | def __init__(self, metadata): 233 | self._metadata = metadata 234 | 235 | @property 236 | def base_uri(self): 237 | """TODO.""" 238 | return self._metadata['BaseUri'] 239 | 240 | @property 241 | def size(self): 242 | """TODO.""" 243 | return self._metadata['Size'] 244 | 245 | @property 246 | def endpoint_type(self): 247 | """TODO.""" 248 | return self._metadata['EndpointType'] 249 | 250 | @property 251 | def credential_container(self): 252 | """TODO.""" 253 | return self._metadata['CredentialContainer'] 254 | 255 | @property 256 | def access_credential(self): 257 | """TODO.""" 258 | return self._metadata['AccessCredential'] 259 | 260 | @property 261 | def location(self): 262 | """TODO.""" 263 | return self._metadata['Location'] 264 | 265 | @property 266 | def file_type(self): 267 | """TODO.""" 268 | return self._metadata['FileType'] 269 | 270 | @property 271 | def is_auxiliary(self): 272 | """TODO.""" 273 | return self._metadata['IsAuxiliary'] 274 | 275 | @property 276 | def name(self): 277 | """TODO.""" 278 | return self._metadata['Name'] 279 | 280 | @property 281 | def visualize_end_point(self): 282 | """TODO.""" 283 | return SourceDataset.Location(self._metadata['VisualizeEndPoint']) 284 | 285 | @property 286 | def schema_end_point(self): 287 | """TODO.""" 288 | return SourceDataset.Location(self._metadata['SchemaEndPoint']) 289 | 290 | @property 291 | def schema_status(self): 292 | """TODO.""" 293 | return self._metadata['SchemaStatus'] 294 | 295 | @property 296 | def dataset_id(self): 297 | """Unique identifier for the dataset.""" 298 | return self._metadata['Id'] 299 | 300 | @property 301 | def name(self): 302 | """Unique name for the dataset.""" 303 | return self._metadata['Name'] 304 | 305 | @property 306 | def data_type_id(self): 307 | """ 308 | Serialization format for the dataset. 309 | See the azureml.DataTypeIds class for constants. 
310 | """ 311 | return self._metadata['DataTypeId'] 312 | 313 | @property 314 | def description(self): 315 | """Description for the dataset.""" 316 | return self._metadata['Description'] 317 | 318 | @property 319 | def resource_upload_id(self): 320 | """TODO.""" 321 | return self._metadata['ResourceUploadId'] 322 | 323 | @property 324 | def family_id(self): 325 | """TODO.""" 326 | return self._metadata['FamilyId'] 327 | 328 | @property 329 | def size(self): 330 | """Size in bytes of the serialized dataset contents.""" 331 | return self._metadata['Size'] 332 | 333 | @property 334 | def source_origin(self): 335 | """TODO.""" 336 | return self._metadata['SourceOrigin'] 337 | 338 | @property 339 | def created_date(self): 340 | # Example format of date to parse: 341 | # /Date(1418444668177)/ 342 | match = re.search(r"/Date\((\d+)\)/", self._metadata['CreatedDate']) 343 | return datetime.fromtimestamp(int(match.group(1)) / 1000.0) 344 | 345 | @property 346 | def owner(self): 347 | """TODO.""" 348 | return self._metadata['Owner'] 349 | 350 | @property 351 | def experiment_id(self): 352 | """TODO.""" 353 | return self._metadata['ExperimentId'] 354 | 355 | @property 356 | def client_version(self): 357 | """TODO.""" 358 | return self._metadata['ClientVersion'] 359 | 360 | @property 361 | def promoted_from(self): 362 | """TODO.""" 363 | return self._metadata['PromotedFrom'] 364 | 365 | @property 366 | def uploaded_from_filename(self): 367 | """TODO.""" 368 | return self._metadata['UploadedFromFilename'] 369 | 370 | @property 371 | def service_version(self): 372 | """TODO.""" 373 | return self._metadata['ServiceVersion'] 374 | 375 | @property 376 | def is_latest(self): 377 | """TODO.""" 378 | return self._metadata['IsLatest'] 379 | 380 | @property 381 | def category(self): 382 | """TODO.""" 383 | return self._metadata['Category'] 384 | 385 | @property 386 | def download_location(self): 387 | """TODO.""" 388 | return SourceDataset.Location(self._metadata['DownloadLocation']) 389 | 390 | @property 391 | def is_deprecated(self): 392 | """TODO.""" 393 | return self._metadata['IsDeprecated'] 394 | 395 | @property 396 | def culture(self): 397 | """TODO.""" 398 | return self._metadata['Culture'] 399 | 400 | @property 401 | def batch(self): 402 | """TODO.""" 403 | return self._metadata['Batch'] 404 | 405 | @property 406 | def created_date_ticks(self): 407 | """TODO.""" 408 | return self._metadata['CreatedDateTicks'] 409 | 410 | @property 411 | def contents_url(self): 412 | """Full URL to the dataset contents.""" 413 | loc = self.download_location 414 | return loc.base_uri + loc.location + loc.access_credential 415 | 416 | @property 417 | def is_example(self): 418 | """True for an example dataset, False for user created.""" 419 | return self.dataset_id.startswith(_GLOBAL_WORKSPACE_ID) 420 | 421 | 422 | class Datasets(object): 423 | def __init__(self, workspace, example_filter=None): 424 | """ 425 | INTERNAL USE ONLY. Initialize a dataset collection. 426 | 427 | Parameters 428 | ---------- 429 | workspace : Workspace 430 | Parent workspace of the datasets. 431 | example_filter : bool 432 | True to include only examples. 433 | False to include only user-created. 434 | None to include all. 
435 | """ 436 | _not_none('workspace', workspace) 437 | 438 | self.workspace = workspace 439 | self._example_filter = example_filter 440 | 441 | def __repr__(self): 442 | return '\n'.join((SourceDataset._metadata_repr(dataset) for dataset in self._get_datasets())) 443 | 444 | def __iter__(self): 445 | for dataset in self._get_datasets(): 446 | yield self._create_dataset(dataset) 447 | 448 | def __len__(self): 449 | return sum(1 for _ in self._get_datasets()) 450 | 451 | def __getitem__(self, index): 452 | '''Retrieve a dataset by index or by name (case-sensitive).''' 453 | _not_none('index', index) 454 | 455 | datasets = self._get_datasets() 456 | if isinstance(index, numbers.Integral): 457 | return self._create_dataset(list(datasets)[index]) 458 | else: 459 | for dataset in datasets: 460 | if dataset['Name'] == index: 461 | return self._create_dataset(dataset) 462 | 463 | raise IndexError('A data set named "{}" does not exist'.format(index)) 464 | 465 | def add_from_dataframe(self, dataframe, data_type_id, name, description): 466 | """ 467 | Serialize the specified DataFrame and upload it as a new dataset. 468 | 469 | Parameters 470 | ---------- 471 | dataframe : pandas.DataFrame 472 | Data to serialize. 473 | data_type_id : str 474 | Format to serialize to. 475 | Supported formats are: 476 | 'PlainText' 477 | 'GenericCSV' 478 | 'GenericTSV' 479 | 'GenericCSVNoHeader' 480 | 'GenericTSVNoHeader' 481 | See the azureml.DataTypeIds class for constants. 482 | name : str 483 | Name for the new dataset. 484 | description : str 485 | Description for the new dataset. 486 | 487 | Returns 488 | ------- 489 | SourceDataset 490 | Dataset that was just created. 491 | Use open(), read_as_binary(), read_as_text() or to_dataframe() on 492 | the dataset object to get its contents as a stream, bytes, str or 493 | pandas DataFrame. 494 | """ 495 | _not_none('dataframe', dataframe) 496 | _not_none_or_empty('data_type_id', data_type_id) 497 | _not_none_or_empty('name', name) 498 | _not_none_or_empty('description', description) 499 | 500 | try: 501 | output = BytesIO() 502 | serialize_dataframe(output, data_type_id, dataframe) 503 | raw_data = output.getvalue() 504 | finally: 505 | output.close() 506 | 507 | return self._upload(raw_data, data_type_id, name, description) 508 | 509 | def add_from_raw_data(self, raw_data, data_type_id, name, description): 510 | """ 511 | Upload already serialized raw data as a new dataset. 512 | 513 | Parameters 514 | ---------- 515 | raw_data: bytes 516 | Dataset contents to upload. 517 | data_type_id : str 518 | Serialization format of the raw data. 519 | Supported formats are: 520 | 'PlainText' 521 | 'GenericCSV' 522 | 'GenericTSV' 523 | 'GenericCSVNoHeader' 524 | 'GenericTSVNoHeader' 525 | 'ARFF' 526 | See the azureml.DataTypeIds class for constants. 527 | name : str 528 | Name for the new dataset. 529 | description : str 530 | Description for the new dataset. 531 | 532 | Returns 533 | ------- 534 | SourceDataset 535 | Dataset that was just created. 536 | Use open(), read_as_binary(), read_as_text() or to_dataframe() on 537 | the dataset object to get its contents as a stream, bytes, str or 538 | pandas DataFrame. 
539 | """ 540 | _not_none('raw_data', raw_data) 541 | _not_none_or_empty('data_type_id', data_type_id) 542 | _not_none_or_empty('name', name) 543 | _not_none_or_empty('description', description) 544 | 545 | return self._upload(raw_data, data_type_id, name, description) 546 | 547 | def _upload(self, raw_data, data_type_id, name, description): 548 | dataset_id = self.workspace._rest.upload_dataset( 549 | self.workspace.workspace_id, name, description, data_type_id, 550 | raw_data, None) 551 | 552 | metadata = self.workspace._rest.get_dataset( 553 | self.workspace.workspace_id, dataset_id) 554 | 555 | return self._create_dataset(metadata) 556 | 557 | def _get_datasets(self): 558 | datasets = self.workspace._rest.get_datasets(self.workspace.workspace_id) 559 | return datasets if self._example_filter is None else \ 560 | (d for d in datasets if d['Id'].startswith( 561 | _GLOBAL_WORKSPACE_ID) == self._example_filter) 562 | 563 | def _create_dataset(self, metadata): 564 | return SourceDataset(self.workspace, metadata) 565 | 566 | 567 | class IntermediateDataset(Dataset): 568 | """Represents an intermediate dataset and methods to read its contents.""" 569 | 570 | def __init__(self, workspace, experiment, node_id, port_name, data_type_id): 571 | """ 572 | INTERNAL USE ONLY. Initialize an intermediate dataset. 573 | 574 | Parameters 575 | ---------- 576 | workspace : Workspace 577 | Parent workspace of the dataset. 578 | experiment : Experiment 579 | Parent experiment of the dataset. 580 | node_id : str 581 | Module node id from the experiment graph. 582 | port_name : str 583 | Output port of the module. 584 | data_type_id : str 585 | Serialization format of the raw data. 586 | See the azureml.DataTypeIds class for constants. 587 | """ 588 | _not_none('workspace', workspace) 589 | _not_none('experiment', experiment) 590 | _not_none_or_empty('node_id', node_id) 591 | _not_none_or_empty('port_name', port_name) 592 | _not_none_or_empty('data_type_id', data_type_id) 593 | 594 | self.workspace = workspace 595 | self.experiment = experiment 596 | self.node_id = node_id 597 | self.port_name = port_name 598 | self.data_type_id = data_type_id 599 | 600 | if is_supported(self.data_type_id): 601 | self.to_dataframe = self._to_dataframe 602 | 603 | def open(self): 604 | '''Open and return a stream for the dataset contents.''' 605 | return self.workspace._rest.open_intermediate_dataset_contents( 606 | self.workspace.workspace_id, 607 | self.experiment.experiment_id, 608 | self.node_id, 609 | self.port_name 610 | ) 611 | 612 | def read_as_binary(self): 613 | '''Read and return the dataset contents as binary.''' 614 | return self.workspace._rest.read_intermediate_dataset_contents_binary( 615 | self.workspace.workspace_id, 616 | self.experiment.experiment_id, 617 | self.node_id, 618 | self.port_name 619 | ) 620 | 621 | def read_as_text(self): 622 | '''Read and return the dataset contents as text.''' 623 | return self.workspace._rest.read_intermediate_dataset_contents_text( 624 | self.workspace.workspace_id, 625 | self.experiment.experiment_id, 626 | self.node_id, 627 | self.port_name 628 | ) 629 | 630 | def _to_dataframe(self): 631 | """Read and return the dataset contents as a pandas DataFrame.""" 632 | #TODO: figure out why passing in the opened stream directly gives invalid data 633 | data = self.read_as_binary() 634 | reader = BytesIO(data) 635 | return deserialize_dataframe(reader, self.data_type_id) 636 | 637 | 638 | class Experiment(object): 639 | 640 | def __init__(self, workspace, metadata): 641 | """ 642 
| INTERNAL USE ONLY. Initialize an experiment. 643 | 644 | Parameters 645 | ---------- 646 | workspace : Workspace 647 | Parent workspace of the experiment. 648 | metadata : dict 649 | Dictionary of experiment metadata as returned by the REST API. 650 | """ 651 | _not_none('workspace', workspace) 652 | _not_none('metadata', metadata) 653 | 654 | self.workspace = workspace 655 | self._metadata = metadata 656 | 657 | @staticmethod 658 | def _metadata_repr(metadata): 659 | val = u'{0}\t{1}'.format(metadata['ExperimentId'], metadata['Description']) 660 | if sys.version_info < (3,): 661 | return val.encode('ascii','ignore') 662 | else: 663 | return val 664 | 665 | def __repr__(self): 666 | return Experiment._metadata_repr(self._metadata) 667 | 668 | class Status(object): 669 | def __init__(self, metadata): 670 | self._metadata = metadata 671 | 672 | @property 673 | def status_code(self): 674 | """TODO.""" 675 | return self._metadata['StatusCode'] 676 | 677 | @property 678 | def status_detail(self): 679 | """TODO.""" 680 | return self._metadata['StatusDetail'] 681 | 682 | @property 683 | def creation_time(self): 684 | """TODO.""" 685 | # Example format of date to parse: 686 | # /Date(1418444668177)/ 687 | match = re.search(r"/Date\((\d+)\)/", self._metadata['CreationTime']) 688 | return datetime.fromtimestamp(int(match.group(1)) / 1000.0) 689 | 690 | @property 691 | def status(self): 692 | """TODO.""" 693 | return Experiment.Status(self._metadata['Status']) 694 | 695 | @property 696 | def description(self): 697 | """TODO.""" 698 | return self._metadata['Description'] 699 | 700 | @property 701 | def creator(self): 702 | """TODO.""" 703 | return self._metadata['Creator'] 704 | 705 | @property 706 | def experiment_id(self): 707 | """TODO.""" 708 | return self._metadata['ExperimentId'] 709 | 710 | @property 711 | def job_id(self): 712 | """TODO.""" 713 | return self._metadata['JobId'] 714 | 715 | @property 716 | def version_id(self): 717 | """TODO.""" 718 | return self._metadata['VersionId'] 719 | 720 | @property 721 | def etag(self): 722 | """TODO.""" 723 | return self._metadata['Etag'] 724 | 725 | @property 726 | def run_id(self): 727 | """TODO.""" 728 | return self._metadata['RunId'] 729 | 730 | @property 731 | def is_archived(self): 732 | """TODO.""" 733 | return self._metadata['IsArchived'] 734 | 735 | @property 736 | def is_example(self): 737 | """True for an example experiment, False for user created.""" 738 | return self.experiment_id.startswith(_GLOBAL_WORKSPACE_ID) 739 | 740 | def get_intermediate_dataset(self, node_id, port_name, data_type_id): 741 | """ 742 | Get an intermediate dataset. 743 | 744 | Parameters 745 | ---------- 746 | node_id : str 747 | Module node id from the experiment graph. 748 | port_name : str 749 | Output port of the module. 750 | data_type_id : str 751 | Serialization format of the raw data. 752 | See the azureml.DataTypeIds class for constants. 753 | 754 | Returns 755 | ------- 756 | IntermediateDataset 757 | Dataset object. 758 | Use open(), read_as_binary(), read_as_text() or to_dataframe() on 759 | the dataset object to get its contents as a stream, bytes, str or 760 | pandas DataFrame. 761 | """ 762 | return IntermediateDataset(self.workspace, self, node_id, port_name, data_type_id) 763 | 764 | 765 | class Experiments(object): 766 | def __init__(self, workspace, example_filter=None): 767 | """ 768 | INTERNAL USE ONLY. Initialize an experiment collection. 
769 | 770 | Parameters 771 | ---------- 772 | workspace : Workspace 773 | Parent workspace of the experiments. 774 | example_filter : bool 775 | True to include only examples. 776 | False to include only user-created. 777 | None to include all. 778 | """ 779 | _not_none('workspace', workspace) 780 | 781 | self.workspace = workspace 782 | self._example_filter = example_filter 783 | 784 | def __repr__(self): 785 | return '\n'.join((Experiment._metadata_repr(experiment) for experiment in self._get_experiments())) 786 | 787 | def __iter__(self): 788 | for experiment in self._get_experiments(): 789 | yield self._create_experiment(experiment) 790 | 791 | def __len__(self): 792 | return sum(1 for _ in self._get_experiments()) 793 | 794 | def __getitem__(self, index): 795 | '''Retrieve an experiment by index or by id.''' 796 | _not_none('index', index) 797 | 798 | experiments = self._get_experiments() 799 | if isinstance(index, numbers.Integral): 800 | return self._create_experiment(list(experiments)[index]) 801 | else: 802 | for experiment in experiments: 803 | if experiment['ExperimentId'] == index: 804 | return self._create_experiment(experiment) 805 | 806 | raise IndexError('An experiment with the id "{}" does not exist'.format(index)) 807 | 808 | def _get_experiments(self): 809 | experiments = self.workspace._rest.get_experiments(self.workspace.workspace_id) 810 | return experiments if self._example_filter is None else \ 811 | (e for e in experiments if e['ExperimentId'].startswith(_GLOBAL_WORKSPACE_ID) == self._example_filter) 812 | 813 | def _create_experiment(self, metadata): 814 | return Experiment(self.workspace, metadata) 815 | 816 | 817 | _CONFIG_WORKSPACE_SECTION = 'workspace' 818 | _CONFIG_WORKSPACE_ID = 'id' 819 | _CONFIG_AUTHORIZATION_TOKEN = 'authorization_token' 820 | _CONFIG_API_ENDPOINT = 'api_endpoint' 821 | _CONFIG_MANAGEMENT_ENDPOINT = 'management_endpoint' 822 | 823 | def _get_workspace_info(workspace_id, authorization_token, endpoint, management_endpoint): 824 | if workspace_id is None or authorization_token is None or endpoint is None or management_endpoint is None: 825 | # read the settings from config 826 | jsonConfig = path.expanduser('~/.azureml/settings.json') 827 | if path.exists(jsonConfig): 828 | with open(jsonConfig) as cfgFile: 829 | config = json.load(cfgFile) 830 | if _CONFIG_WORKSPACE_SECTION in config: 831 | ws = config[_CONFIG_WORKSPACE_SECTION] 832 | workspace_id = ws.get(_CONFIG_WORKSPACE_ID, workspace_id) 833 | authorization_token = ws.get(_CONFIG_AUTHORIZATION_TOKEN, authorization_token) 834 | endpoint = ws.get(_CONFIG_API_ENDPOINT, endpoint) 835 | management_endpoint = ws.get(_CONFIG_MANAGEMENT_ENDPOINT, management_endpoint) 836 | else: 837 | config = ConfigParser.ConfigParser() 838 | config.read(path.expanduser('~/.azureml/settings.ini')) 839 | 840 | if config.has_section(_CONFIG_WORKSPACE_SECTION): 841 | if workspace_id is None and config.has_option(_CONFIG_WORKSPACE_SECTION, _CONFIG_WORKSPACE_ID): 842 | workspace_id = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_WORKSPACE_ID) 843 | if authorization_token is None and config.has_option(_CONFIG_WORKSPACE_SECTION, _CONFIG_AUTHORIZATION_TOKEN): 844 | authorization_token = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_AUTHORIZATION_TOKEN) 845 | if endpoint is None and config.has_option(_CONFIG_WORKSPACE_SECTION, _CONFIG_API_ENDPOINT): 846 | endpoint = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_API_ENDPOINT) 847 | if management_endpoint is None and config.has_option(_CONFIG_WORKSPACE_SECTION, 
_CONFIG_MANAGEMENT_ENDPOINT): 848 | management_endpoint = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_MANAGEMENT_ENDPOINT) 849 | 850 | if workspace_id is None: 851 | raise ValueError('workspace_id not provided and not available via config') 852 | if authorization_token is None: 853 | raise ValueError('authorization_token not provided and not available via config') 854 | if endpoint is None: 855 | endpoint = Endpoints.default 856 | if management_endpoint is None: 857 | management_endpoint = Endpoints.management_default 858 | 859 | return workspace_id, authorization_token, endpoint, management_endpoint 860 | 861 | class Workspace(object): 862 | 863 | def __init__(self, workspace_id = None, authorization_token = None, endpoint=None): 864 | """ 865 | Initialize a workspace. 866 | 867 | Parameters 868 | ---------- 869 | workspace_id : str 870 | Unique identifier for the existing workspace. Can be obtained from 871 | the URL in ML Studio when editing a workspace. 872 | authorization_token: str 873 | Access token for the workspace. Can be the primary or secondary 874 | token managed in ML Studio. 875 | endpoint: str 876 | URL of the endpoint to connect to. Specify this only if you host 877 | ML Studio on your own server(s). 878 | 879 | Parameters that are omitted will be read from ~/.azureml/settings.ini: 880 | [workspace] 881 | id = abcd1234 882 | authorization_token = abcd1234 883 | endpoint = https://studio.azureml.net 884 | """ 885 | workspace_id, authorization_token, endpoint, management_endpoint = _get_workspace_info(workspace_id, authorization_token, endpoint, None) 886 | 887 | _not_none_or_empty('workspace_id', workspace_id) 888 | _not_none_or_empty('authorization_token', authorization_token) 889 | _not_none_or_empty('endpoint', endpoint) 890 | 891 | self.workspace_id = workspace_id 892 | self.authorization_token = authorization_token 893 | self.api_endpoint = endpoint 894 | self.management_endpoint = management_endpoint 895 | self._rest = _RestClient(endpoint, authorization_token) 896 | self.datasets = Datasets(workspace=self) 897 | self.user_datasets = Datasets(workspace=self, example_filter=False) 898 | self.example_datasets = Datasets(workspace=self, example_filter=True) 899 | self.experiments = Experiments(workspace=self) 900 | self.user_experiments = Experiments(workspace=self, example_filter=False) 901 | self.example_experiments = Experiments(workspace=self, example_filter=True) 902 | 903 | 904 | _manglingPattern = re.compile(r'[\W_]+') 905 | 906 | def _mangled(name): 907 | result = _manglingPattern.sub('_', name) 908 | return result.lower() 909 | -------------------------------------------------------------------------------- /azureml/errors.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 
4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | 27 | class _ErrorMessages(object): 28 | unsupported_type = 'Dataset type "{0}" is not supported' 29 | not_none = '"{0}" should not be None.' 30 | not_none_or_empty = '"{0}" should not be None or empty.' 31 | 32 | 33 | class AzureMLError(Exception): 34 | '''AzureML Exception base class.''' 35 | def __init__(self, message): 36 | super(AzureMLError, self).__init__(message) 37 | 38 | 39 | class AzureMLHttpError(AzureMLError): 40 | '''Error from Azure ML REST API.''' 41 | def __init__(self, message, status_code): 42 | super(AzureMLHttpError, self).__init__(message) 43 | self.status_code = status_code 44 | 45 | def __new__(cls, message, status_code, *args, **kwargs): 46 | if status_code == 409: 47 | cls = AzureMLConflictHttpError 48 | elif status_code == 401: 49 | cls = AzureMLUnauthorizedError 50 | return AzureMLError.__new__(cls, message, status_code, *args, **kwargs) 51 | 52 | 53 | class AzureMLUnauthorizedError(AzureMLHttpError): 54 | '''Unauthorized error from Azure ML REST API.''' 55 | def __init__(self, message, status_code): 56 | message = 'Unauthorized, please check your workspace ID and authorization token ({})'.format(message) 57 | super(AzureMLUnauthorizedError, self).__init__(message, status_code) 58 | 59 | 60 | class AzureMLConflictHttpError(AzureMLHttpError): 61 | '''Conflict error from Azure ML REST API.''' 62 | def __init__(self, message, status_code): 63 | super(AzureMLConflictHttpError, self).__init__(message, status_code) 64 | 65 | class UnsupportedDatasetTypeError(AzureMLError): 66 | '''Dataset type is not supported.''' 67 | def __init__(self, data_type_id): 68 | super(UnsupportedDatasetTypeError, self).__init__( 69 | _ErrorMessages.unsupported_type.format(data_type_id)) 70 | 71 | 72 | def _not_none(param_name, param): 73 | if param is None: 74 | raise TypeError(_ErrorMessages.not_none.format(param_name)) 75 | 76 | 77 | def _not_none_or_empty(param_name, param): 78 | if not param: 79 | raise TypeError(_ErrorMessages.not_none_or_empty.format(param_name)) 80 | -------------------------------------------------------------------------------- /azureml/http.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) 
Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | import json 27 | import requests 28 | from azureml.errors import AzureMLConflictHttpError 29 | 30 | try: 31 | from urlparse import urljoin 32 | except ImportError: 33 | from urllib.parse import urljoin 34 | 35 | from azureml.errors import ( 36 | AzureMLHttpError, 37 | ) 38 | 39 | __author__ = 'Microsoft Corp. ' 40 | __version__ = '0.2.7' 41 | 42 | 43 | class _RestClient(object): 44 | SERVICE_ROOT = 'api/' 45 | INTERMEDIATE_DATASET_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/experiments/{1}/outputdata/{2}/{3}' 46 | EXPERIMENTS_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/experiments' 47 | DATASOURCES_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/datasources' 48 | DATASOURCE_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/datasources/{1}' 49 | UPLOAD_URI_FMI = SERVICE_ROOT + 'resourceuploads/workspaces/{0}/?userStorage=true&dataTypeId={1}' 50 | UPLOAD_CHUNK_URI_FMT = SERVICE_ROOT + 'blobuploads/workspaces/{0}/?numberOfBlocks={1}&blockId={2}&uploadId={3}&dataTypeId={4}' 51 | SESSION_ID_HEADER_NAME = 'x-ms-client-session-id' 52 | SESSION_ID_HEADER_VALUE = 'DefaultSession' 53 | ACCESS_TOKEN_HEADER_NAME = 'x-ms-metaanalytics-authorizationtoken' 54 | CONTENT_TYPE_HEADER_NAME = 'Content-Type' 55 | CONTENT_TYPE_HEADER_VALUE_JSON = 'application/json;charset=UTF8' 56 | CHUNK_SIZE = 0x200000 57 | DEFAULT_OWNER = 'Python SDK' 58 | USER_AGENT_HEADER_NAME = 'User-Agent' 59 | USER_AGENT_HEADER_VALUE = 'pyazureml/' + __version__ 60 | 61 | def __init__(self, service_endpoint, access_token): 62 | self._service_endpoint = service_endpoint 63 | self._access_token = access_token 64 | 65 | def get_experiments(self, workspace_id): 66 | """Runs HTTP GET request to retrieve the list of experiments.""" 67 | api_path = self.EXPERIMENTS_URI_FMT.format(workspace_id) 68 | return self._send_get_req(api_path) 69 | 70 | def get_datasets(self, workspace_id): 71 | """Runs HTTP GET request to retrieve the list of datasets.""" 72 | api_path = self.DATASOURCES_URI_FMT.format(workspace_id) 73 | return self._send_get_req(api_path) 74 | 75 | def get_dataset(self, workspace_id, dataset_id): 76 | """Runs HTTP GET request to retrieve a single dataset.""" 77 | api_path = self.DATASOURCE_URI_FMT.format(workspace_id, dataset_id) 78 | return self._send_get_req(api_path) 79 
| 80 | def open_intermediate_dataset_contents(self, workspace_id, experiment_id, 81 | node_id, port_name): 82 | return self._get_intermediate_dataset_contents( 83 | workspace_id, 84 | experiment_id, 85 | node_id, 86 | port_name, 87 | stream=True).raw 88 | 89 | def read_intermediate_dataset_contents_binary(self, workspace_id, 90 | experiment_id, node_id, 91 | port_name): 92 | return self._get_intermediate_dataset_contents( 93 | workspace_id, 94 | experiment_id, 95 | node_id, 96 | port_name, 97 | stream=False).content 98 | 99 | def read_intermediate_dataset_contents_text(self, workspace_id, 100 | experiment_id, node_id, 101 | port_name): 102 | return self._get_intermediate_dataset_contents( 103 | workspace_id, 104 | experiment_id, 105 | node_id, 106 | port_name, 107 | stream=False).text 108 | 109 | def _get_intermediate_dataset_contents(self, workspace_id, experiment_id, 110 | node_id, port_name, stream): 111 | api_path = self.INTERMEDIATE_DATASET_URI_FMT.format( 112 | workspace_id, experiment_id, node_id, port_name) 113 | response = requests.get( 114 | url=urljoin(self._service_endpoint, api_path), 115 | headers=self._get_headers(), 116 | stream=stream, 117 | ) 118 | return response 119 | 120 | def open_dataset_contents(self, url): 121 | response = requests.get(url, stream=True) 122 | return response.raw 123 | 124 | def read_dataset_contents_binary(self, url): 125 | response = requests.get(url) 126 | return response.content 127 | 128 | def read_dataset_contents_text(self, url): 129 | response = requests.get(url) 130 | return response.text 131 | 132 | def upload_dataset(self, workspace_id, name, description, data_type_id, 133 | raw_data, family_id): 134 | # uploading data is a two step process. First we upload the raw data 135 | api_path = self.UPLOAD_URI_FMI.format(workspace_id, data_type_id) 136 | upload_result = self._send_post_req(api_path, data=b'') 137 | 138 | # now get the id that was generated 139 | upload_id = upload_result["Id"] 140 | 141 | # Upload the data in chunks... 
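        # (Illustrative note, not from the original source: the chunk count below is a
        # ceiling division. With CHUNK_SIZE = 0x200000 (2 MB), a 5 MB payload is sent
        # as 3 blocks -- two full 2 MB blocks followed by a 1 MB remainder -- with the
        # block index and total block count encoded in each chunk upload URL.)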
142 | total_chunks = int((len(raw_data) + (self.CHUNK_SIZE-1)) / self.CHUNK_SIZE) 143 | for chunk in range(total_chunks): 144 | chunk_url = self.UPLOAD_CHUNK_URI_FMT.format( 145 | workspace_id, 146 | total_chunks, # number of blocks 147 | chunk, # block id 148 | upload_id, 149 | data_type_id, 150 | ) 151 | chunk_data = raw_data[chunk*self.CHUNK_SIZE:(chunk + 1)*self.CHUNK_SIZE] 152 | self._send_post_req(chunk_url, data=chunk_data) 153 | 154 | # use that to construct the DataSource metadata 155 | metadata = { 156 | "DataSource": { 157 | "Name": name, 158 | "DataTypeId":data_type_id, 159 | "Description":description, 160 | "FamilyId":family_id, 161 | "Owner": self.DEFAULT_OWNER, 162 | "SourceOrigin":"FromResourceUpload" 163 | }, 164 | "UploadId": upload_id, 165 | "UploadedFromFileName":"", 166 | "ClientPoll": True 167 | } 168 | 169 | try: 170 | api_path = self.DATASOURCES_URI_FMT.format(workspace_id) 171 | except AzureMLConflictHttpError as e: 172 | raise AzureMLConflictHttpError( 173 | 'A data set named "{}" already exists'.format(name), 174 | e.status_code 175 | ) 176 | 177 | datasource_id = self._send_post_req( 178 | api_path, json.dumps(metadata), self.CONTENT_TYPE_HEADER_VALUE_JSON) 179 | return datasource_id 180 | 181 | def _send_get_req(self, api_path): 182 | response = requests.get( 183 | url=urljoin(self._service_endpoint, api_path), 184 | headers=self._get_headers() 185 | ) 186 | 187 | if response.status_code >= 400: 188 | raise AzureMLHttpError(response.text, response.status_code) 189 | 190 | return response.json() 191 | 192 | def _send_post_req(self, api_path, data, content_type=None): 193 | response = requests.post( 194 | url=urljoin(self._service_endpoint, api_path), 195 | data=data, 196 | headers=self._get_headers(content_type) 197 | ) 198 | 199 | if response.status_code >= 400: 200 | raise AzureMLHttpError(response.text, response.status_code) 201 | 202 | return response.json() 203 | 204 | def _get_headers(self, content_type=None): 205 | headers = { 206 | self.USER_AGENT_HEADER_NAME: self.USER_AGENT_HEADER_VALUE, 207 | self.CONTENT_TYPE_HEADER_NAME: self.CONTENT_TYPE_HEADER_VALUE_JSON, 208 | self.SESSION_ID_HEADER_NAME: self.SESSION_ID_HEADER_VALUE, 209 | self.ACCESS_TOKEN_HEADER_NAME: self._access_token 210 | } 211 | if content_type: 212 | headers[self.CONTENT_TYPE_HEADER_NAME] = content_type 213 | return headers 214 | -------------------------------------------------------------------------------- /azureml/serialization.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 
16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | from functools import partial 27 | import codecs 28 | import pandas as pd 29 | 30 | from azureml.errors import ( 31 | UnsupportedDatasetTypeError, 32 | _not_none, 33 | _not_none_or_empty, 34 | ) 35 | 36 | 37 | class DataTypeIds(object): 38 | """Constants for the known dataset data type id strings.""" 39 | ARFF = 'ARFF' 40 | PlainText = 'PlainText' 41 | GenericCSV = 'GenericCSV' 42 | GenericTSV = 'GenericTSV' 43 | GenericCSVNoHeader = 'GenericCSVNoHeader' 44 | GenericTSVNoHeader = 'GenericTSVNoHeader' 45 | 46 | 47 | def _dataframe_to_csv(writer, dataframe, delimiter, with_header): 48 | """serialize the dataframe with different delimiters""" 49 | encoding_writer = codecs.getwriter('utf-8')(writer) 50 | dataframe.to_csv( 51 | path_or_buf=encoding_writer, 52 | sep=delimiter, 53 | header=with_header, 54 | index=False 55 | ) 56 | 57 | def _dataframe_to_txt(writer, dataframe): 58 | encoding_writer = codecs.getwriter('utf-8')(writer) 59 | for row in dataframe.iterrows(): 60 | encoding_writer.write("".join(row[1].tolist())) 61 | encoding_writer.write('\n') 62 | 63 | def _dataframe_from_csv(reader, delimiter, with_header, skipspace): 64 | """Returns csv data as a pandas Dataframe object""" 65 | sep = delimiter 66 | header = 0 67 | if not with_header: 68 | header = None 69 | 70 | return pd.read_csv( 71 | reader, 72 | header=header, 73 | sep=sep, 74 | skipinitialspace=skipspace, 75 | encoding='utf-8-sig' 76 | ) 77 | 78 | def _dataframe_from_txt(reader): 79 | """Returns PlainText data as a pandas Dataframe object""" 80 | return pd.read_csv(reader, header=None, sep="\n", encoding='utf-8-sig') 81 | 82 | 83 | _SERIALIZERS = { 84 | DataTypeIds.PlainText: ( 85 | _dataframe_to_txt, 86 | _dataframe_from_txt, 87 | ), 88 | DataTypeIds.GenericCSV: ( 89 | partial(_dataframe_to_csv, delimiter=',', with_header=True), 90 | partial(_dataframe_from_csv, delimiter=',', with_header=True, skipspace=True), 91 | ), 92 | DataTypeIds.GenericCSVNoHeader: ( 93 | partial(_dataframe_to_csv, delimiter=',', with_header=False), 94 | partial(_dataframe_from_csv, delimiter=',', with_header=False, skipspace=True), 95 | ), 96 | DataTypeIds.GenericTSV: ( 97 | partial(_dataframe_to_csv, delimiter='\t', with_header=True), 98 | partial(_dataframe_from_csv, delimiter='\t', with_header=True, skipspace=False), 99 | ), 100 | DataTypeIds.GenericTSVNoHeader: ( 101 | partial(_dataframe_to_csv, delimiter='\t', with_header=False), 102 | partial(_dataframe_from_csv, delimiter='\t', with_header=False, skipspace=False), 103 | ), 104 | } 105 | 106 | 107 | def serialize_dataframe(writer, data_type_id, dataframe): 108 | """ 109 | Serialize a dataframe. 110 | 111 | Parameters 112 | ---------- 113 | writer : file 114 | File-like object to write to. Must be opened in binary mode. 115 | data_type_id : dict 116 | Serialization format to use. 117 | See the azureml.DataTypeIds class for constants. 118 | dataframe: pandas.DataFrame 119 | Dataframe to serialize. 
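    Example (an illustrative sketch, not from the original docstring; assumes a
    pandas DataFrame named `frame`):
        with open('dataset.csv', 'wb') as f:
            serialize_dataframe(f, DataTypeIds.GenericCSV, frame)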
120 | """ 121 | _not_none('writer', writer) 122 | _not_none_or_empty('data_type_id', data_type_id) 123 | _not_none('dataframe', dataframe) 124 | 125 | serializer = _SERIALIZERS.get(data_type_id) 126 | if serializer is None: 127 | raise UnsupportedDatasetTypeError(data_type_id) 128 | serializer[0](writer=writer, dataframe=dataframe) 129 | 130 | def deserialize_dataframe(reader, data_type_id): 131 | """ 132 | Deserialize a dataframe. 133 | 134 | Parameters 135 | ---------- 136 | reader : file 137 | File-like object to read from. Must be opened in binary mode. 138 | data_type_id : dict 139 | Serialization format of the raw data. 140 | See the azureml.DataTypeIds class for constants. 141 | 142 | Returns 143 | ------- 144 | pandas.DataFrame 145 | Dataframe object. 146 | """ 147 | _not_none('reader', reader) 148 | _not_none_or_empty('data_type_id', data_type_id) 149 | 150 | serializer = _SERIALIZERS.get(data_type_id) 151 | if serializer is None: 152 | raise UnsupportedDatasetTypeError(data_type_id) 153 | return serializer[1](reader=reader) 154 | 155 | def is_supported(data_type_id): 156 | """Return if a serializer is available for the specified format.""" 157 | _not_none_or_empty('data_type_id', data_type_id) 158 | 159 | return _SERIALIZERS.get(data_type_id) is not None 160 | -------------------------------------------------------------------------------- /azureml/services.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | """ 27 | Supports publishing and consuming published services that execute within the AzureML 28 | web service execution framework. 29 | 30 | Existing services can be consumed using the service decorator: 31 | 32 | from azureml import services 33 | 34 | @services.service(url, api_key) 35 | @services.types(a = float, b = float) 36 | @services.returns(float) 37 | def some_service(a, b): 38 | pass 39 | 40 | Where the url and api_key are specified for the published web service. 
41 | 42 | Python functions can be published using the @publish decorator: 43 | 44 | @services.publish(workspace, workspace_key) 45 | @services.types(a = float, b = float) 46 | @services.returns(float) 47 | def float_typed(a, b): 48 | return a / b 49 | 50 | 51 | The function will be published under a newly created endpoint. 52 | 53 | Publish can also be called programmatically instead: 54 | 55 | published = services.publish(myfunc2, workspace, workspace_key) 56 | 57 | The types and returns decorators can be used to provide type information about the 58 | inputs and outputs. These types will be visible on the help page and enable clients 59 | written in other languages to call published Python functions. 60 | 61 | If types aren't specified then core Python types will be serialized in a custom manner. 62 | This allows working with many common types such as lists, dictionaries, numpy types, etc... 63 | But interop with other languages will be much more difficult. 64 | 65 | Files can also be attached to published functions using the @attach decorator: 66 | 67 | @services.publish(workspace, workspace_key) 68 | @services.attach('foo.txt') 69 | def attached(): 70 | return ''.join(file('foo.txt').readlines()) 71 | 72 | """ 73 | from functools import update_wrapper 74 | import codecs 75 | import inspect 76 | import re 77 | import requests 78 | import uuid 79 | import sys 80 | import json 81 | import base64 82 | import zipfile 83 | import dis 84 | from collections import deque, OrderedDict 85 | from types import CodeType, FunctionType, ModuleType 86 | import types as typesmod 87 | try: 88 | import cPickle as pickle 89 | except: 90 | import pickle 91 | try: 92 | from io import BytesIO 93 | except: 94 | from cStringIO import StringIO as BytesIO 95 | try: 96 | import azureml 97 | except: 98 | # We are published, we won't call publish_worker again. 99 | pass 100 | 101 | try: 102 | import numpy 103 | except: 104 | numpy = None 105 | 106 | try: 107 | import pandas 108 | except: 109 | pandas = None 110 | 111 | _LOAD_GLOBAL = dis.opmap['LOAD_GLOBAL'] 112 | ################################################# 113 | # Serialization/Deserialization of inputs. This code is distinct from the 114 | # serialization of the user defined function. The user defined function can contain 115 | # arbitrary objects and is fully trusted (so we can use pickle). The inputs to the function 116 | # are coming from arbitrary user input and so need to support a more limited form 117 | # of serialization. 118 | # 119 | # Serialization of the arguments is done using JSON. Each argument is serialized with 120 | # a type and a value. The type is a known type name (int, bool, float, etc...) and the 121 | # value is the serialized value in string format. Usually this is the simplest possible 122 | # representation. Strings are serialized as is, ints/floats we just call str() on, etc... 123 | # For byte arrays we base64 encode them. For data structures we store a list of the elements 124 | # which are encoded in the same way. For example a list would have a list of dictionaries 125 | # in JSON which each have a type and value member. 
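# As an illustration (a sketch of the format described above, not output captured
# from a real service call): _encode([1, 2.5]) produces JSON along the lines of
#   {"type": "list", "value": [{"type": "int", "value": "1"},
#                              {"type": "float", "value": "2.5"}]}
# and _decode() reverses the process by dispatching on each "type" field.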
126 | 127 | _serializers = {} 128 | _deserializers = {} 129 | 130 | def serializer(type): 131 | def l(func): 132 | _serializers[type] = func 133 | return func 134 | return l 135 | 136 | def deserializer(type): 137 | def l(func): 138 | _deserializers[type] = func 139 | return func 140 | return l 141 | 142 | # Type: bool 143 | @serializer(bool) 144 | def _serialize_bool(inp, memo): 145 | return {'type': 'bool', 'value': 'true' if inp else 'false' } 146 | 147 | @deserializer('bool') 148 | def _deserialize_bool(value): 149 | if value['value'] == 'true': 150 | return True 151 | else: 152 | return False 153 | 154 | # Type: int 155 | @serializer(int) 156 | def _serialize_int(inp, memo): 157 | return {'type': 'int', 'value': str(inp) } 158 | 159 | @deserializer('int') 160 | def _deserialize_int(value): 161 | return int(value['value']) 162 | 163 | if sys.version_info < (3, ): 164 | # long 165 | @serializer(long) 166 | def _serialize_long(inp, memo): 167 | return {'type': 'long', 'value': str(inp) } 168 | 169 | @deserializer('long') 170 | def _deserialize_long(value): 171 | return long(value['value']) 172 | 173 | # Type: float 174 | @serializer(float) 175 | def _serialize_float(inp, memo): 176 | return {'type': 'float', 'value': str(inp) } 177 | 178 | @deserializer('float') 179 | def _deserialize_float(value): 180 | return float(value['value']) 181 | 182 | 183 | # Type: complex 184 | @serializer(complex) 185 | def _serialize_complex(inp, memo): 186 | return {'type': 'complex', 'value': str(inp) } 187 | 188 | @deserializer('complex') 189 | def _deserialize_bool(value): 190 | return complex(value['value']) 191 | 192 | 193 | # Type: unicode 194 | @serializer(str if sys.version_info >= (3,) else unicode) 195 | def _serialize_unicode(inp, memo): 196 | return {'type': 'unicode', 'value': str(inp) } 197 | 198 | @deserializer('unicode') 199 | def _deserialize_unicode(value): 200 | return value['value'] 201 | 202 | 203 | # Type: byte arrays 204 | @serializer(bytes if sys.version_info >= (3,) else str) 205 | def _serialize_bytes(inp, memo): 206 | data = base64.encodestring(inp) 207 | if sys.version_info >= (3, ): 208 | data = data.decode('utf8') 209 | return {'type': 'bytes', 'value': data.replace(chr(10), '') } 210 | 211 | @deserializer('bytes') 212 | def _deserialize_bytes(value): 213 | data = value['value'] 214 | if sys.version_info >= (3, ): 215 | data = data.encode('utf8') 216 | return base64.decodestring(data) 217 | 218 | # Type: dictionaries 219 | @serializer(dict) 220 | def serialize_dict(inp, memo): 221 | return { 222 | 'type': 'dict', 223 | 'value' : [(_encode(k, memo), _encode(inp[k], memo)) for k in inp] 224 | } 225 | 226 | 227 | @deserializer('dict') 228 | def _deserialize_dict(value): 229 | return { _decode_inner(k):_decode_inner(v) for k, v in value['value'] } 230 | 231 | # Type: None/null 232 | 233 | @serializer(type(None)) 234 | def serialize_none(inp, memo): 235 | return {'type':'null', 'value':'null'} 236 | 237 | @deserializer('null') 238 | def _deserialize_null(value): 239 | return None 240 | 241 | 242 | # Type: list and tuple 243 | @serializer(list) 244 | @serializer(tuple) 245 | def _serialize_list_or_tuple(inp, memo): 246 | res = [] 247 | for value in inp: 248 | res.append(_encode(value, memo)) 249 | 250 | return {'type': type(inp).__name__, 'value': res } 251 | 252 | @deserializer('list') 253 | def _deserialize_list(value): 254 | return [_decode_inner(x) for x in value['value']] 255 | 256 | @deserializer('tuple') 257 | def _deserialize_tuple(value): 258 | return tuple(_decode_inner(x) 
for x in value['value']) 259 | 260 | 261 | if numpy is not None: 262 | # ndarray is serialized as (shape, datatype, data) 263 | @serializer(numpy.ndarray) 264 | def serialize_ndarray(inp, memo): 265 | return { 266 | 'type':'numpy.ndarray', 267 | 'value': ( 268 | _encode(inp.shape, memo), 269 | _encode(inp.dtype.name, memo), 270 | _encode(inp.tostring(), memo) 271 | ) 272 | } 273 | 274 | @deserializer('numpy.ndarray') 275 | def deserialize_ndarray(value): 276 | shape, dtype, data = value['value'] 277 | return numpy.ndarray( 278 | _decode_inner(shape), _decode_inner(dtype), _decode_inner(data) 279 | ) 280 | 281 | # TODO: Need better story here... 282 | @serializer(numpy.int32) 283 | def serialize_numpy_int32(inp, memo): 284 | return _serialize_int(inp, memo) 285 | 286 | @serializer(numpy.int64) 287 | def serialize_numpy_int64(inp, memo): 288 | if sys.version_info >= (3, ): 289 | return _serialize_int(inp, memo) 290 | 291 | return _serialize_long(inp, memo) 292 | 293 | @serializer(numpy.float64) 294 | def serialize_numpy_float64(inp, memo): 295 | return _serialize_float(inp, memo) 296 | 297 | # Core deserialization functions. There's a top-level one used when 298 | # actually reading/writing values, and an inner one when we're doing the 299 | # recursive serialization/deserialization. 300 | 301 | def _decode_inner(value): 302 | val_type = value['type'] 303 | deserializer = _deserializers.get(value['type']) 304 | if deserializer is None: 305 | raise ValueError("unsupported type: " + value['type']) 306 | 307 | return deserializer(value) 308 | 309 | def _encode(inp, memo = None): 310 | outer = False 311 | if memo is None: 312 | outer = True 313 | memo = {} 314 | if id(inp) in memo and type(inp) in [list, tuple, dict]: 315 | raise ValueError('circular reference detected') 316 | memo[id(inp)] = inp 317 | 318 | serializer = _serializers.get(type(inp)) 319 | if serializer is None: 320 | raise TypeError("Unsupported type for invocation: " + type(inp).__module__ + '.' + type(inp).__name__) 321 | 322 | res = serializer(inp, memo) 323 | if outer: 324 | return json.dumps(res) 325 | return res 326 | 327 | 328 | def _decode(inp): 329 | value = json.loads(inp) 330 | 331 | if isinstance(value, dict): 332 | return _decode_inner(value) 333 | 334 | raise TypeError('expected a dictionary, got ' + type(inp).__name__) 335 | 336 | PUBLISH_URL_FORMAT = '{}/workspaces/{}/webservices/{}' 337 | 338 | if sys.version_info >= (3, 0): 339 | _code_args = ['co_argcount', 'co_kwonlyargcount', 'co_nlocals', 'co_stacksize', 'co_flags', 340 | 'co_code', 'co_consts', 'co_names', 'co_varnames', 'co_filename', 'co_name', 341 | 'co_firstlineno', 'co_lnotab', 'co_freevars', 'co_cellvars'] 342 | _func_args = ['__name__', '__defaults__', '__closure__'] 343 | else: 344 | _code_args = ['co_argcount', 'co_nlocals', 'co_stacksize', 'co_flags', 'co_code', 'co_consts', 345 | 'co_names', 'co_varnames', 'co_filename', 'co_name', 'co_firstlineno', 'co_lnotab', 346 | 'co_freevars', 'co_cellvars'] 347 | _func_args = ['func_name', 'func_defaults', 'func_closure'] 348 | 349 | 350 | class _Serializer(object): 351 | '''serializes the specified functions, and the globals it uses as well. 352 | 353 | normal globals are just serialized as-is, they must be picklable to do so. 354 | 355 | other functions which are referenced are serialized as an additional function, and 356 | will be repopulated in globals. This allows things like mutually recursive functions 357 | to exist. 
358 | ''' 359 | def __init__(self): 360 | self.functions = set() 361 | self.queue = deque() 362 | 363 | if sys.version_info < (3, ): 364 | CLASS_TYPES = (typesmod.ClassType, type) 365 | else: 366 | CLASS_TYPES = type 367 | 368 | def serialize(self, obj): 369 | self.queue.append(('func', obj.__name__, obj)) 370 | self.functions.add((obj.__name__, obj)) 371 | self.mod = obj.__module__ 372 | 373 | return self.serialize_obj(obj) 374 | 375 | def serialize_obj(self, obj): 376 | res = [] 377 | while self.queue: 378 | objType, name, cur = self.queue.popleft() 379 | 380 | if objType == 'func': 381 | res.append((objType, name, self.get_code_args(cur))) 382 | elif objType == 'mod': 383 | res.append((objType, name, cur.__name__)) 384 | elif objType == 'type': 385 | raise NotImplementedError('new style class not supported') 386 | elif objType == 'oldclass': 387 | res.append((objType, name, [cur.__name__, cur.__module__, cur.__bases__, {n:self.serialize_obj(v) for n, v in cur.__dict__.items()}])) 388 | else: 389 | raise Exception('Unknown serialization type') 390 | 391 | return pickle.dumps(res) 392 | 393 | @staticmethod 394 | def find_globals(code): 395 | """walks the byte code to find the variables which are actually globals""" 396 | cur_byte = 0 397 | byte_code = code.co_code 398 | 399 | names = set() 400 | while cur_byte < len(byte_code): 401 | op = ord(byte_code[cur_byte]) 402 | 403 | if op >= dis.HAVE_ARGUMENT: 404 | if op == _LOAD_GLOBAL: 405 | oparg = ord(byte_code[cur_byte + 1]) + (ord(byte_code[cur_byte + 2]) << 8) 406 | name = code.co_names[oparg] 407 | names.add(name) 408 | 409 | cur_byte += 2 410 | cur_byte += 1 411 | 412 | return names 413 | 414 | def get_code_args(self, func): 415 | code = func.__code__ 416 | 417 | codeArgs = [getattr(code, name) for name in _code_args] 418 | funcArgs = [getattr(func, name) for name in _func_args] 419 | globals = {} 420 | 421 | for name in self.find_globals(code): 422 | if name in func.__globals__: 423 | value = func.__globals__[name] 424 | if isinstance(value, FunctionType): 425 | if (name, value) not in self.functions: 426 | self.queue.append(('func', name, value)) 427 | self.functions.add((name, value)) 428 | elif isinstance(value, ModuleType): 429 | self.queue.append(('mod', name, value)) 430 | elif isinstance(value, _Serializer.CLASS_TYPES) and value.__module__ == self.mod: 431 | # class that needs to be serialized... 
432 | if isinstance(value, type): 433 | # new-style class 434 | self.queue.append(('type', name, value)) 435 | else: 436 | # old-style class 437 | self.queue.append(('oldclass', name, value)) 438 | else: 439 | globals[name] = value 440 | 441 | return pickle.dumps((codeArgs, funcArgs, globals)) 442 | 443 | def _serialize_func(func): 444 | return _Serializer().serialize(func) 445 | 446 | def _deserialize_func(funcs, globalDict): 447 | items = pickle.loads(funcs) 448 | res = None 449 | for objType, name, data in items: 450 | if objType == 'func': 451 | codeArgs, funcArgs, updatedGlobals = pickle.loads(data) 452 | code = CodeType(*codeArgs) 453 | 454 | globalDict.update(**updatedGlobals) 455 | 456 | value = FunctionType(code, globalDict, *funcArgs) 457 | elif objType == 'mod': 458 | value = __import__(data) 459 | elif objType == 'oldclass': 460 | class_name, module, bases, class_dict = data 461 | value = typesmod.ClassType(class_name, bases, {k:_deserialize_func(v, globalDict) for k, v in class_dict.items()}) 462 | value.__module__ = module 463 | elif objType == 'type': 464 | raise Exception('deserialize type') 465 | else: 466 | raise Exception('Unknown serialization type') 467 | globalDict[name] = value 468 | 469 | if res is None: 470 | res = value 471 | 472 | return res 473 | 474 | def _get_args(func): 475 | raw_schema = _get_dataframe_schema(func) 476 | if raw_schema is not None: 477 | return list(raw_schema.keys()) 478 | 479 | args = inspect.getargs(func.__code__) 480 | all_args = args.args 481 | if args.varargs is not None: 482 | all_args.append(args.varargs) 483 | if args.keywords is not None: 484 | all_args.append(args.keywords) 485 | return all_args 486 | 487 | def _encode_arg(arg, type): 488 | if type == OBJECT_NAME: 489 | return _encode(arg) 490 | elif type['type'].lower() == 'string': 491 | return arg 492 | 493 | return json.dumps(arg) 494 | 495 | def _decode_one_response(response, real_type): 496 | if real_type == OBJECT_NAME: 497 | return _decode(response[0]) 498 | elif real_type['type'].lower() == 'string': 499 | return response[0] 500 | 501 | # TODO: These shouldn't be necessary, AzureML is returning things to us oddly... 502 | if response[0] == 'True': 503 | return True 504 | elif response[0] == 'False': 505 | return False 506 | return json.loads(response[0]) 507 | 508 | def _get_dict_type(column, index, type, types): 509 | if type is not None and column in type: 510 | return _annotation_to_type(type[column]) 511 | 512 | return {'type': types[index]} 513 | 514 | def _decode_response(columns, types, response, type): 515 | if isinstance(type, tuple): 516 | # multi-value decode... 517 | return tuple(_decode_one_response((r, ), _annotation_to_type(t)) for r, t in zip(response, type)) 518 | elif isinstance(type, dict): 519 | return {c:_decode_one_response((r, ), _get_dict_type(c, i, type, types)) for (i, c), r in zip(enumerate(columns), response)} 520 | elif columns is not None and len(columns) > 1: 521 | return {c:_decode_one_response((r, ), {'type': types[i]}) for (i, c), r in zip(enumerate(columns), response)} 522 | 523 | return _decode_one_response(response, _annotation_to_type(type)) 524 | 525 | class published(object): 526 | """The result of publishing a service or marking a method as being published. 527 | 528 | Supports being called to invoke the remote service, iteration for unpacking the url, 529 | api key, and help url, or the url, api_key, and help_url can be accessed directly 530 | as attributes. 
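    For illustration (assuming `svc` is an instance returned by publish or service):

        result = svc(1.0, 2.0)           # invoke the remote service
        url, api_key, help_url = svc     # unpack the endpoint details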
531 | """ 532 | 533 | def __init__(self, url, api_key, help_url, func, service_id): 534 | self.url = url 535 | self.api_key = api_key 536 | self.help_url = help_url 537 | self.func = func 538 | self.service_id = service_id 539 | 540 | def __repr__(self): 541 | return ''.format(self.func.__name__, self.url) 542 | 543 | def _invoke(self, call_args): 544 | body = { 545 | "Inputs": { 546 | getattr(self.func, '__input_name__', 'input1'): { 547 | "ColumnNames": _get_args(self.func), 548 | "Values": call_args, 549 | } 550 | }, 551 | "GlobalParameters": {} 552 | } 553 | 554 | resp = requests.post( 555 | self.url, 556 | json=body, 557 | headers={ 558 | 'authorization': 'bearer ' + self.api_key, 559 | } 560 | ) 561 | 562 | r = resp.json() 563 | if resp.status_code >= 300: 564 | try: 565 | code = r['error']['code'] 566 | except LookupError: 567 | code = None 568 | if code in ('ModuleExecutionError', 'Unauthorized'): 569 | raise RuntimeError(r['error']['details'][0]['message']) 570 | raise ValueError(str(r)) 571 | return r 572 | 573 | def _map_args(self, *args, **kwargs): 574 | args = inspect.getcallargs(self.func, *args, **kwargs) 575 | return [ _encode_arg(args[name], _get_arg_type(name, self.func)) for name in _get_args(self.func) ] 576 | 577 | def __call__(self, *args, **kwargs): 578 | # Call remote function 579 | r = self._invoke([ self._map_args(*args, **kwargs) ]) 580 | output_name = getattr(self.func, '__output_name__', 'output1') 581 | return _decode_response( 582 | r["Results"][output_name]["value"].get("ColumnNames"), 583 | r["Results"][output_name]["value"].get("ColumnTypes"), 584 | r["Results"][output_name]["value"]["Values"][0], 585 | _get_annotation('return', self.func) 586 | ) 587 | 588 | def map(self, *args): 589 | """maps the function onto multiple inputs. The input should be multiple sequences. The 590 | sequences will be zipped together forming the positional arguments for the call. This is 591 | equivalent to map(func, ...) but is executed with a single network call.""" 592 | call_args = [self._map_args(*cur_args) for cur_args in zip(*args)] 593 | r = self._invoke(call_args) 594 | 595 | ret_type = _get_annotation('return', self.func) 596 | output_name = getattr(self.func, '__output_name__', 'output1') 597 | return [_decode_response( 598 | r['Results'][output_name]['value'].get("ColumnNames"), 599 | r['Results'][output_name]['value'].get("ColumnTypes"), 600 | x, 601 | ret_type) 602 | for x in r['Results']['output1']['value']['Values']] 603 | 604 | def delete(self): 605 | """unpublishes the service""" 606 | raise NotImplementedError('delete not implemented yet') 607 | 608 | def __iter__(self): 609 | yield self.url 610 | yield self.api_key 611 | yield self.help_url 612 | 613 | 614 | def _get_dataframe_schema(function): 615 | return getattr(function, '__dataframe_schema__', None) 616 | 617 | def _get_main_source(function): 618 | 619 | main_source = u'def azureml_main(df1 = None, df2 = None):\n' 620 | main_source += u' results = []\n' 621 | 622 | if _get_dataframe_schema(function): 623 | # function just takes a dataframe... 624 | main_source += u' results.append(__user_function(df1))' + chr(10) 625 | else: 626 | # we're marshalling the arguments in. 
627 | main_source += u' for i in range(df1.shape[0]):' + chr(10) 628 | for arg in _get_args(function): 629 | arg_type = _get_arg_type(arg, function) 630 | if pandas is not None and arg_type is pandas.DataFrame: 631 | raise Exception('Only a single DataFrame argument is supported') 632 | 633 | if _get_arg_type(arg, function) == OBJECT_NAME: 634 | main_source += ' ' + arg + u' = ' + u'_decode(df1["' + arg + u'"][i])' + chr(10) 635 | else: 636 | main_source += ' ' + arg + u' = ' + u'df1["' + arg + u'"][i]' + chr(10) 637 | 638 | main_source += u' results.append(__user_function(' 639 | 640 | args = inspect.getargs(function.__code__) 641 | all_args = args.args 642 | if args.varargs is not None: 643 | all_args.append(u'*' + args.varargs) 644 | if args.keywords is not None: 645 | all_args.append(u'**' + args.keywords) 646 | 647 | # pass position arguments... 648 | main_source += u', '.join(all_args) 649 | main_source += u'))' + chr(10) 650 | 651 | ret_annotation = _get_annotation('return', function) 652 | if _get_dataframe_schema(function): 653 | # function just returns a data frame directly 654 | main_source += u' if len(results) == 1:' + chr(10) 655 | main_source += u' return results[0]' + chr(10) 656 | main_source += u' return pandas.DataFrame(results)' + chr(10) 657 | elif isinstance(ret_annotation, tuple): 658 | # multi-value return support... 659 | format = [] 660 | arg_names = [] 661 | for index, ret_type in enumerate(ret_annotation): 662 | arg_names.append(u'r' + str(index)) 663 | t = _annotation_to_type(ret_type) 664 | if t == OBJECT_NAME: 665 | format.append(u'_encode(r' + str(index) + u')') 666 | else: 667 | format.append(u'r' + str(index)) 668 | main_source += u' return pandas.DataFrame([(' + u', '.join(format) + u') for ' + ', '.join(arg_names) + u' in results])' + chr(10) 669 | elif _get_arg_type('return', function) == OBJECT_NAME: 670 | main_source += u' return pandas.DataFrame([_encode(r) for r in results])' + chr(10) 671 | else: 672 | main_source += u' return pandas.DataFrame(results)' + chr(10) 673 | 674 | return main_source 675 | 676 | def _get_source(function): 677 | source_file = inspect.getsourcefile(function) 678 | encoding = '' 679 | try: 680 | with open(source_file, 'rb') as source_file: 681 | line1 = source_file.readline() 682 | line2 = source_file.readline() 683 | if line1[:3] == '\xef\xbb\xbf': 684 | encoding = 'utf-8-sig' 685 | else: 686 | match = re.search(b"coding[:=]\s*([-\w.]+)", line1) or re.search(b"coding[:=]\s*([-\w.]+)", line2) 687 | if match: 688 | encoding = match.groups()[0] 689 | with codecs.open(source_file, 'r', encoding) as source_file: 690 | source_text = source_file.read() 691 | except: 692 | source_text = None 693 | 694 | # include our source code... 695 | ourfile = __file__ 696 | if ourfile.endswith('.pyc'): 697 | ourfile = ourfile[:-1] 698 | if encoding: 699 | source = u'# coding=' + encoding.decode('ascii') 700 | 701 | with codecs.open(ourfile, 'r', 'ascii') as services_file: 702 | source = services_file.read() 703 | 704 | main_source = _get_main_source(function) 705 | 706 | source += chr(10) + main_source 707 | 708 | if source_text is None: 709 | # we're in a REPL environment, we need to serialize the code... 
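        # (Clarifying note, not in the original: with no source file available the
        # function's code object is pickled via _serialize_func, base64-encoded, and
        # embedded in the generated script, where _deserialize_func rebuilds it.)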
710 | #TODO: Remove base64 encoding when json double escape issue is fixed 711 | source += inspect.getsource(_deserialize_func) 712 | source += chr(10) 713 | source += u'__user_function = _deserialize_func(base64.decodestring(' + repr(base64.encodestring(_serialize_func(function)).replace(chr(10), '')) + '), globals())' 714 | else: 715 | # we can upload the source code itself... 716 | source += u''' 717 | # overwrite publish/service with ones which won't re-publish... 718 | import sys 719 | sys.modules['azureml'] = azureml = type(sys)('azureml') 720 | sys.modules['azureml.services'] = services = type(sys)('services') 721 | azureml.services = services 722 | 723 | def publish(func, *args, **kwargs): 724 | if callable(func): 725 | return func 726 | def wrapper(func): 727 | return func 728 | return wrapper 729 | services.publish = publish 730 | 731 | def service(*args): 732 | def wrapper(func): 733 | return func 734 | return wrapper 735 | 736 | def attach(*args, **kwargs): 737 | def wrapper(func): 738 | return func 739 | return wrapper 740 | 741 | services.service = service 742 | services.types = types 743 | services.returns = returns 744 | services.attach = attach 745 | services.dataframe_service = attach 746 | services.service_id = attach 747 | 748 | ''' 749 | source += source_text 750 | source += chr(10) 751 | source += u'__user_function = ' + function.__name__ 752 | 753 | return source 754 | 755 | _known_types = { 756 | int: {'type':'integer', 'format':'int64'}, 757 | bool: {'type' : 'Boolean'}, 758 | float: {'type': 'number', 'format':'double'}, 759 | str if sys.version_info > (3, ) else unicode: {'type':'string'}, 760 | #complex:'Complex64', 761 | } 762 | 763 | OBJECT_NAME = {"type":"string", "format":"string"} # "description":"Python custom serialization" 764 | 765 | def _get_annotation(name, func): 766 | try: 767 | annotations = func.__annotations__ 768 | except AttributeError: 769 | return None 770 | 771 | return annotations.get(name) 772 | 773 | def _annotation_to_type(annotation): 774 | if annotation is None: 775 | return OBJECT_NAME 776 | 777 | if isinstance(annotation, str): 778 | # allow the user to specify the raw string value that will be passed... 
779 | return annotation 780 | 781 | return _known_types.get(annotation) or OBJECT_NAME 782 | 783 | def _get_arg_type(name, func): 784 | if name != "return": 785 | raw_schema = _get_dataframe_schema(func) 786 | if raw_schema is not None: 787 | return _annotation_to_type(raw_schema[name]) 788 | 789 | annotation = _get_annotation(name, func) 790 | return _annotation_to_type(annotation) 791 | 792 | 793 | def _add_file(adding, zip_file): 794 | if isinstance(adding, tuple): 795 | name, contents = adding 796 | else: 797 | name = adding 798 | contents = None 799 | 800 | if isinstance(name, tuple): 801 | name, dest_name = name 802 | else: 803 | name = dest_name = name 804 | 805 | if contents is None: 806 | contents = file(name, 'rb').read() 807 | 808 | zip_file.writestr(dest_name, contents) 809 | 810 | _DEBUG = False 811 | def _publish_worker(func, files, workspace_id = None, workspace_token = None, management_endpoint = None): 812 | workspace_id, workspace_token, _, management_endpoint = azureml._get_workspace_info(workspace_id, workspace_token, None, management_endpoint) 813 | 814 | script_code = _get_source(func) + chr(10) 815 | ret_type = _get_annotation('return', func) 816 | 817 | if isinstance(ret_type, tuple): 818 | # multi-value return 819 | results = OrderedDict() 820 | for index, obj_type in enumerate(ret_type): 821 | results['result' + str(index)] = _annotation_to_type(obj_type) 822 | elif isinstance(ret_type, dict): 823 | # multi-value return 824 | results = OrderedDict() 825 | for name, obj_type in ret_type.items(): 826 | results[name] = _annotation_to_type(obj_type) 827 | else: 828 | results = {"result": _get_arg_type('return', func)} 829 | 830 | code_bundle = { 831 | "InputSchema": {name: _get_arg_type(name, func) for name in _get_args(func)}, 832 | "OutputSchema": results, 833 | "Language" : "python-2.7-64", 834 | "SourceCode": script_code, 835 | } 836 | 837 | attachments = getattr(func, '__attachments__', None) 838 | if attachments or files: 839 | data = BytesIO() 840 | zip_file = zipfile.PyZipFile(data, 'w') 841 | if attachments: 842 | for adding in attachments: 843 | _add_file(adding, zip_file) 844 | 845 | if files: 846 | for adding in files: 847 | _add_file(adding, zip_file) 848 | 849 | zip_file.close() 850 | 851 | code_bundle['ZipContents'] = base64.b64encode(data.getvalue()) 852 | 853 | name = getattr(func, '__service_name__', func.__name__) 854 | body = { 855 | "Name": name, 856 | "Type":"Code", 857 | "CodeBundle" : code_bundle 858 | } 859 | id = str(getattr(func, '__service_id__', uuid.uuid4())).replace('-', '') 860 | url = PUBLISH_URL_FORMAT.format(management_endpoint, workspace_id, id) 861 | headers = {'authorization': 'bearer ' + workspace_token} 862 | resp = requests.put( 863 | url, 864 | json=body, 865 | headers=headers 866 | ) 867 | 868 | if _DEBUG: 869 | with open(func.__name__ + '.req', 'w') as f: 870 | f.write(url + chr(10)) 871 | f.write(json.dumps(body)) 872 | f.close() 873 | 874 | with open(func.__name__ + '.res', 'w') as f: 875 | f.write(str(resp.status_code) + chr(10)) 876 | f.write(resp.text + chr(10)) 877 | f.close() 878 | 879 | if resp.status_code < 200 or resp.status_code > 299: 880 | try: 881 | msg = resp.json()['error']['message'] 882 | except: 883 | msg = str(resp.status_code) 884 | raise ValueError('Failed to publish function: ' + msg + chr(10) + 885 | 'Set azureml.services._DEBUG = True to enable writing {}.req/{}.res files'.format(func.__name__, func.__name__)) 886 | 887 | j = resp.json() 888 | epUrl = url + '/endpoints/' + j['DefaultEndpointName'] 
889 | epResp = requests.get(epUrl, headers=headers) 890 | endpoints = epResp.json() 891 | 892 | url = endpoints['ApiLocation'] + '/execute?api-version=2.0' 893 | 894 | return published(url, endpoints['PrimaryKey'], endpoints['HelpLocation'] + '/score', func, id) 895 | 896 | def publish(func_or_workspace_id, workspace_id_or_token = None, workspace_token_or_none = None, files=(), endpoint=None): 897 | '''publishes a callable function or decorates a function to be published. 898 | 899 | Returns a callable, iterable object. Calling the object will invoke the published service. 900 | Iterating the object will give the API URL, API key, and API help url. 901 | 902 | To define a function which will be published to Azure you can simply decorate it with 903 | the @publish decorator. This will publish the service, and then future calls to the 904 | function will run against the operationalized version of the service in the cloud. 905 | 906 | >>> @publish(workspace_id, workspace_token) 907 | >>> def func(a, b): 908 | >>> return a + b 909 | 910 | After publishing you can then invoke the function using: 911 | func.service(1, 2) 912 | 913 | Or continue to invoke the function locally: 914 | func(1, 2) 915 | 916 | You can also just call publish directly to publish a function: 917 | 918 | >>> def func(a, b): return a + b 919 | >>> 920 | >>> res = publish(func, workspace_id, workspace_token) 921 | >>> 922 | >>> url, api_key, help_url = res 923 | >>> res(2, 3) 924 | 5 925 | >>> url, api_key, help_url = res.url, res.api_key, res.help_url 926 | 927 | The returned result will be the published service. 928 | 929 | You can specify a list of files which should be published along with the function. 930 | The resulting files will be stored in a subdirectory called 'Script Bundle'. The 931 | list of files can be one of: 932 | (('file1.txt', None), ) # file is read from disk 933 | (('file1.txt', b'contents'), ) # file contents are provided 934 | ('file1.txt', 'file2.txt') # files are read from disk, written with same filename 935 | ((('file1.txt', 'destname.txt'), None), ) # file is read from disk, written with different destination name 936 | 937 | The various formats for each filename can be freely mixed and matched. 938 | ''' 939 | if not callable(func_or_workspace_id): 940 | def do_publish(func): 941 | func.service = _publish_worker(func, files, func_or_workspace_id, workspace_id_or_token, endpoint) 942 | return func 943 | return do_publish 944 | 945 | return _publish_worker(func_or_workspace_id, files, workspace_id_or_token, workspace_token_or_none, endpoint) 946 | 947 | def service(url, api_key, help_url = None): 948 | '''Marks a function as having been published and causes all invocations to go to the remote 949 | operationalized service. 950 | 951 | >>> @service(url, api_key) 952 | >>> def f(a, b): 953 | >>> pass 954 | ''' 955 | def do_publish(func): 956 | return published(url, api_key, help_url, func, None) 957 | return do_publish 958 | 959 | def types(**args): 960 | """Specifies the types used for the arguments of a published service. 961 | 962 | @types(a=int, b = str) 963 | def f(a, b): 964 | pass 965 | """ 966 | def l(func): 967 | if hasattr(func, '__annotations__'): 968 | func.__annotations__.update(args) 969 | else: 970 | func.__annotations__ = args 971 | return func 972 | return l 973 | 974 | def returns(type): 975 | """Specifies the return type for a published service. 
976 | 977 | @returns(int) 978 | def f(...): 979 | pass 980 | """ 981 | def l(func): 982 | if hasattr(func, '__annotations__'): 983 | func.__annotations__['return'] = type 984 | else: 985 | func.__annotations__ = {'return': type} 986 | return func 987 | return l 988 | 989 | def attach(name, contents = None): 990 | """attaches a file to the payload to be uploaded. 991 | 992 | If contents is omitted the file is read from disk. 993 | If name is a tuple it specifies the on-disk filename and the destination filename. 994 | """ 995 | def do_attach(func): 996 | if hasattr(func, '__attachments__'): 997 | func.__attachments__.append((name, contents)) 998 | else: 999 | func.__attachments__ = [(name, contents)] 1000 | return func 1001 | return do_attach 1002 | 1003 | def service_id(id): 1004 | """Specifies the service ID to enable re-publishing to the same end point. 1005 | Can be applied to the function which is being published: 1006 | 1007 | @publish(...) 1008 | @service_id('e5dd3903-796f-4544-b7aa-f4e08b2cc639') 1009 | def myfunc(): 1010 | return 42 1011 | 1012 | When the function is published it will replace any existing instances of the 1013 | function. 1014 | """ 1015 | def l(func): 1016 | func.__service_id__ = id 1017 | return func 1018 | 1019 | return l 1020 | 1021 | def name(name): 1022 | """Provides a friendly name for the published web service which can include spaces and other characters illegal for Python functions. 1023 | """ 1024 | def l(func): 1025 | func.__service_name__ = name 1026 | return func 1027 | 1028 | return l 1029 | 1030 | def dataframe_service(**args): 1031 | """Indicates that the function operations on a data frame. The function 1032 | will receive a single input in the form of a data frame, and should return 1033 | a data frame object. The schema of the data frame is specified with this 1034 | decorator. 1035 | 1036 | @publish(...) 1037 | @dataframe_service(a = int, b = int) 1038 | def myfunc(df): 1039 | return pandas.DataFrame([df['a'][i] + df['b'][i] for i in range(df.shape[0])]) 1040 | """ 1041 | def l(func): 1042 | func.__dataframe_schema__ = args 1043 | return func 1044 | 1045 | return l 1046 | 1047 | def input_name(name): 1048 | """specifies the name of the input the web service expects to receive. Defaults to 'input1'""" 1049 | def l(func): 1050 | func.__input_name__ = name 1051 | return func 1052 | 1053 | return l 1054 | 1055 | def output_name(name): 1056 | """specifies the name of the input the web service expects to receive. Defaults to 'input1'""" 1057 | def l(func): 1058 | func.__output_name__ = name 1059 | return func 1060 | 1061 | return l -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | python-dateutil 3 | pandas 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | #------------------------------------------------------------------------- 4 | # Copyright (c) Microsoft Corporation 5 | # All rights reserved. 
6 | # 7 | # MIT License: 8 | # Permission is hereby granted, free of charge, to any person obtaining 9 | # a copy of this software and associated documentation files (the 10 | # "Software"), to deal in the Software without restriction, including 11 | # without limitation the rights to use, copy, modify, merge, publish, 12 | # distribute, sublicense, and/or sell copies of the Software, and to 13 | # permit persons to whom the Software is furnished to do so, subject to 14 | # the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be 17 | # included in all copies or substantial portions of the Software. 18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 23 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 25 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 | #-------------------------------------------------------------------------- 27 | 28 | from setuptools import setup 29 | 30 | # To build: 31 | # python setup.py sdist 32 | # python setup.py bdist_wheel 33 | # 34 | # To install: 35 | # python setup.py install 36 | # 37 | # To register (only needed once): 38 | # python setup.py register 39 | # 40 | # To upload: 41 | # python setup.py sdist upload 42 | # python setup.py bdist_wheel upload 43 | 44 | setup( 45 | name='azureml', 46 | version='0.2.7', 47 | description='Microsoft Azure Machine Learning Python client library', 48 | license='MIT License', 49 | author='Microsoft Corporation', 50 | author_email='ptvshelp@microsoft.com', 51 | url='https://github.com/Azure/Azure-MachineLearning-ClientLibrary-Python', 52 | classifiers=[ 53 | 'Development Status :: 3 - Alpha', 54 | 'Programming Language :: Python', 55 | 'Programming Language :: Python :: 2', 56 | 'Programming Language :: Python :: 2.7', 57 | 'Programming Language :: Python :: 3', 58 | 'Programming Language :: Python :: 3.3', 59 | 'Programming Language :: Python :: 3.4', 60 | 'License :: OSI Approved :: MIT License', 61 | ], 62 | packages=['azureml'], 63 | install_requires=[ 64 | 'python-dateutil', 65 | 'requests', 66 | 'pandas', 67 | ] 68 | ) 69 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | Test settings 2 | ============= 3 | 4 | To successfully run tests, you'll need to create an **azuremltestsettings.json** file in this folder. 5 | 6 | This file contains credentials and lists various Azure resources to use when running the tests. 
7 | 8 | 9 | Example 10 | ------- 11 | 12 | ``` 13 | { 14 | "workspace": { 15 | "id": "11111111111111111111111111111111", 16 | "token": "00000000000000000000000000000000", 17 | "endpoint": "https://studio.azureml.net" 18 | }, 19 | "storage": { 20 | "accountName": "mystorageaccount", 21 | "accountKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==", 22 | "container": "mydatasettestcontainer", 23 | "mediumSizeBlob": "MediumSizeDataset_NH.csv", 24 | "unicodeBomBlob": "DatasetWithUnicodeBOM.txt", 25 | "blobs": [ 26 | "Dataset_NH.csv", 27 | "Dataset_NH.tsv", 28 | "Dataset_WH.csv", 29 | "Dataset_WH.tsv", 30 | "Dataset.txt" 31 | ] 32 | }, 33 | "intermediateDataset": { 34 | "experimentId": "11111111111111111111111111111111.f-id.22222222222222222222222222222222", 35 | "nodeId": "33333333-3333-3333-3333-333333333333-333", 36 | "portName": "Results dataset", 37 | "dataTypeId": "GenericCSV" 38 | }, 39 | "diagnostics": { 40 | "writeBlobContents": "True", 41 | "writeSerializedFrame": "True" 42 | } 43 | } 44 | ``` 45 | 46 | 47 | Workspace 48 | --------- 49 | 50 | From the Azure portal, create a new ML workspace. Open the new workspace in Studio. From the URL, you'll find your workspace id. 51 | 52 | In the settings page, you'll find 2 authorization tokens, you can use either one. 53 | 54 | Set the id and token in the json: 55 | 56 | ``` 57 | "workspace": { 58 | "id": "11111111111111111111111111111111", 59 | "token": "00000000000000000000000000000000", 60 | "endpoint": "https://studio.azureml.net" 61 | }, 62 | ``` 63 | 64 | 65 | Storage account 66 | --------------- 67 | 68 | The storage section is used for some tests that load dataset files from Azure blob storage. 69 | 70 | You'll need to create an Azure storage account, create a container and upload dataset files to it. 71 | 72 | The round-trip tests rely on a naming convention for the ones in the blobs array: 73 | ``` 74 | "blobs": [ 75 | "Dataset_NH.csv", 76 | "Dataset_NH.tsv", 77 | "Dataset_WH.csv", 78 | "Dataset_WH.tsv", 79 | "Dataset.txt" 80 | ] 81 | ``` 82 | 83 | NH means no header, WH means with header. 84 | 85 | 86 | Experiment 87 | ---------- 88 | 89 | Create a new experiment. Add the following modules and connect them: 90 | 91 | - Airport Codes Dataset 92 | - Split 93 | - Convert to CSV 94 | 95 | Play the experiment and save. 96 | 97 | You'll need the experiment id (appears in URL), the node id (can be found in the HTML DOM), the port name (displayed as a tooltip when you hover on the output port) and the data type id. 98 | 99 | ``` 100 | "intermediateDataset": { 101 | "experimentId": "11111111111111111111111111111111.f-id.22222222222222222222222222222222", 102 | "nodeId": "33333333-3333-3333-3333-333333333333-333", 103 | "portName": "Results dataset", 104 | "dataTypeId": "GenericCSV" 105 | }, 106 | ``` 107 | 108 | 109 | Diagnostics 110 | ----------- 111 | 112 | Some of the tests can write intermediate results to disk, which can help with debugging. 113 | 114 | "diagnostics": { 115 | "writeBlobContents": "True", 116 | "writeSerializedFrame": "True" 117 | } 118 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 
4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | from os import path 27 | import json 28 | import numpy as np 29 | import random 30 | import string 31 | 32 | 33 | class TestSettings(object): 34 | class Workspace(object): 35 | def __init__(self, settings): 36 | self.settings = settings 37 | 38 | @property 39 | def id(self): 40 | return self.settings['id'] 41 | 42 | @property 43 | def token(self): 44 | return self.settings['token'] 45 | 46 | @property 47 | def endpoint(self): 48 | return self.settings['endpoint'] 49 | 50 | @property 51 | def management_endpoint(self): 52 | return self.settings['management_endpoint'] 53 | 54 | class Storage(object): 55 | def __init__(self, settings): 56 | self.settings = settings 57 | 58 | @property 59 | def account_name(self): 60 | return self.settings['accountName'] 61 | 62 | @property 63 | def account_key(self): 64 | return self.settings['accountKey'] 65 | 66 | @property 67 | def container(self): 68 | return self.settings['container'] 69 | 70 | @property 71 | def medium_size_blob(self): 72 | return self.settings['mediumSizeBlob'] 73 | 74 | @property 75 | def blobs(self): 76 | return self.settings['blobs'] 77 | 78 | class IntermediateDataset(object): 79 | def __init__(self, settings): 80 | self.settings = settings 81 | 82 | @property 83 | def experiment_id(self): 84 | return self.settings['experimentId'] 85 | 86 | @property 87 | def node_id(self): 88 | return self.settings['nodeId'] 89 | 90 | @property 91 | def port_name(self): 92 | return self.settings['portName'] 93 | 94 | @property 95 | def data_type_id(self): 96 | return self.settings['dataTypeId'] 97 | 98 | class Diagnostics(object): 99 | def __init__(self, settings): 100 | self.settings = settings 101 | 102 | @property 103 | def write_blob_contents(self): 104 | return self.settings['writeBlobContents'] 105 | 106 | @property 107 | def write_serialized_frame(self): 108 | return self.settings['writeSerializedFrame'] 109 | 110 | def __init__(self, settings): 111 | self.workspace = TestSettings.Workspace(settings['workspace']) 112 | self.storage = TestSettings.Storage(settings['storage']) 113 | self.intermediateDataset = TestSettings.IntermediateDataset(settings['intermediateDataset']) 114 | self.diagnostics = TestSettings.Diagnostics(settings['diagnostics']) 115 | 116 | 117 | def load_test_settings(): 118 | name = 
"azuremltestsettings.json" 119 | full_path = path.join(path.abspath(path.dirname(__file__)), name) 120 | if not path.exists(full_path): 121 | raise RuntimeError("Cannot run AzureML tests when the expected settings file , '{0}', does not exist!".format(full_path)) 122 | with open(full_path, "r") as f: 123 | settings = json.load(f) 124 | return TestSettings(settings) 125 | 126 | def id_generator(size=10, chars=string.ascii_uppercase + string.digits): 127 | return ''.join(random.choice(chars) for _ in range(size)) 128 | -------------------------------------------------------------------------------- /tests/coverage.bat: -------------------------------------------------------------------------------- 1 | @echo OFF 2 | SETLOCAL 3 | cls 4 | 5 | if "%1%" == "" ( 6 | set PYTHONDIR=%SystemDrive%\Anaconda 7 | ) else ( 8 | set PYTHONDIR=%1% 9 | ) 10 | 11 | if "%2%" == "" ( 12 | set COVERAGEDIR=htmlcov 13 | ) else ( 14 | set COVERAGEDIR=%2% 15 | ) 16 | 17 | if "%PYTHONPATH%" == "" ( 18 | set PYTHONPATH=.. 19 | ) else ( 20 | set PYTHONPATH=%PYTHONPATH%;.. 21 | ) 22 | 23 | if exist "%PYTHONDIR%\Scripts\coverage.exe" ( 24 | goto :coverage 25 | ) 26 | 27 | 28 | REM --------------------------------------------------------------------------- 29 | if not exist "%PYTHONDIR%\Scripts\pip.exe" ( 30 | echo Cannot do a code coverage run when neither 'coverage' nor 'pip' are installed. 31 | goto :exit_door 32 | ) 33 | 34 | echo Installing 'coverage' package... 35 | %PYTHONDIR%\Scripts\pip.exe install coverage 36 | echo Finished installing 'coverage' package 37 | 38 | REM --------------------------------------------------------------------------- 39 | :coverage 40 | echo Starting coverage run using %PYTHONDIR% 41 | %PYTHONDIR%\Scripts\coverage.exe run -m unittest discover -p "unittests.py" 42 | %PYTHONDIR%\Scripts\coverage.exe html -d %COVERAGEDIR% 43 | start %CD%\%COVERAGEDIR%\index.html 44 | echo Finished coverage run! 45 | 46 | REM --------------------------------------------------------------------------- 47 | :exit_door 48 | exit /B %UNITTEST_EC% -------------------------------------------------------------------------------- /tests/foo.txt: -------------------------------------------------------------------------------- 1 | hello world! -------------------------------------------------------------------------------- /tests/lib.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | from azureml import services 27 | import pandas 28 | from os import path 29 | import os 30 | try: 31 | import tests 32 | from tests.settings import load_test_settings 33 | settings = load_test_settings() 34 | TEST_WS = settings.workspace.id 35 | TEST_KEY = settings.workspace.token 36 | ENDPOINT = settings.workspace.management_endpoint 37 | except: 38 | TEST_WS = '' 39 | TEST_KEY = '' 40 | ENDPOINT = '' 41 | 42 | 43 | #@services.publish(TEST_WS, TEST_KEY) 44 | #def noparams(): 45 | # return 'hello world!' 46 | 47 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 48 | @services.types(a = unicode, b = unicode) 49 | @services.returns(unicode) 50 | def str_typed(a, b): 51 | return a + b 52 | 53 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 54 | def untyped_identity(a): 55 | return a 56 | 57 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 58 | @services.attach((path.join(path.dirname(__file__), 'foo.txt'), 'foo.txt')) 59 | @services.types(a = unicode) 60 | @services.returns(unicode) 61 | def attached(a): 62 | return a + ''.join(file('Script Bundle\\foo.txt', 'rU').readlines()) 63 | 64 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 65 | @services.types(a = float, b = float) 66 | @services.returns(float) 67 | def float_typed(a, b): 68 | return a / b 69 | 70 | 71 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 72 | @services.types(a = int, b = int) 73 | @services.returns((int, int)) 74 | def multivalue_return(a, b): 75 | return a + b, a - b 76 | 77 | 78 | # style 1, var args 79 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 80 | def mysum(*args): 81 | return sum(args) 82 | 83 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 84 | @services.types(a = int, b = int) 85 | @services.returns(int) 86 | def typed(a, b): 87 | return a + b 88 | 89 | 90 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 91 | @services.types(a = bool, b = bool) 92 | @services.returns(bool) 93 | def bool_typed(a, b): 94 | return a and b 95 | 96 | ##@services.publish(TEST_WS, TEST_KEY) 97 | ##@services.types(a = complex, b = complex) 98 | ##@services.returns(complex) 99 | ##def complex_typed(a, b): 100 | ## return a * b 101 | 102 | 103 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 104 | @services.dataframe_service(a = int, b = int) 105 | @services.returns(int) 106 | def dataframe(df): 107 | return pandas.DataFrame([df['a'][i] + df['b'][i] for i in range(df.shape[0])]) 108 | 109 | 110 | if hasattr(dataframe, 'service'): 111 | @services.service(dataframe.service.url, dataframe.service.api_key) 112 | @services.types(a = int, b = int) 113 | @services.returns(int) 114 | def dataframe_int(a, b): 115 | pass 116 | 117 | ## style 1, define a function and call the publish API explicitly. 118 | 119 | # style 1, define a function and publish it with a decorator 120 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 121 | def myfunc(a, b): 122 | return [a + b + a, a - b * b, a * b * a, a / b] 123 | 124 | 125 | # style 2, define a function and call the publish API explicitly. 
126 | def myfunc2(a, b): 127 | return [a + b, a - b, a * b, a / b] 128 | 129 | published = services.publish(myfunc2, TEST_WS, TEST_KEY, endpoint=ENDPOINT) 130 | 131 | 132 | 133 | 134 | # style 1, kw args 135 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT) 136 | def kwargs(**args): 137 | return args 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /tests/performancetests.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | #-------------------------------------------------------------------------- 25 | 26 | import unittest 27 | import pandas as pd 28 | from datetime import datetime 29 | from pandas.util.testing import assert_frame_equal 30 | 31 | from azure.storage import BlobService 32 | from azureml import ( 33 | BytesIO, 34 | Workspace, 35 | DataTypeIds, 36 | serialize_dataframe, 37 | ) 38 | from tests import ( 39 | load_test_settings, 40 | ) 41 | 42 | 43 | settings = load_test_settings() 44 | 45 | 46 | class PerformanceTests(unittest.TestCase): 47 | def setUp(self): 48 | self.workspace = Workspace( 49 | settings.workspace.id, 50 | settings.workspace.token, 51 | settings.workspace.endpoint 52 | ) 53 | self.blob = BlobService( 54 | settings.storage.account_name, 55 | settings.storage.account_key 56 | ) 57 | 58 | def _write_blob_contents(self, filename, data): 59 | if settings.diagnostics.write_blob_contents: 60 | with open('original-blob-' + filename, 'wb') as data_file: 61 | data_file.write(data) 62 | 63 | def _write_serialized_frame(self, filename, data): 64 | if settings.diagnostics.write_serialized_frame: 65 | with open('serialized-frame-' + filename, 'wb') as data_file: 66 | data_file.write(data) 67 | 68 | def test_serialize_40mb_dataframe(self): 69 | # Arrange 70 | blob_name = settings.storage.medium_size_blob 71 | original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name) 72 | original_dataframe = pd.read_csv(BytesIO(original_data), header=0, sep=",", encoding='utf-8-sig') 73 | 74 | self._write_blob_contents(blob_name, original_data) 75 | 76 | # Act 77 | start_time = datetime.now() 78 | writer = BytesIO() 79 | serialize_dataframe(writer, DataTypeIds.GenericCSV, original_dataframe) 80 | elapsed_time = datetime.now() - start_time 81 | result_data = writer.getvalue() 82 | 83 | self._write_serialized_frame(blob_name, result_data) 84 | 85 | # Assert 86 | result_dataframe = pd.read_csv(BytesIO(result_data), header=0, sep=",", encoding='utf-8-sig') 87 | assert_frame_equal(original_dataframe, result_dataframe) 88 | self.assertLess(elapsed_time.total_seconds(), 10) 89 | 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /tests/roundtriptests.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | import unittest 27 | import pandas as pd 28 | from pandas.util.testing import assert_frame_equal 29 | 30 | from azure.storage import BlobService 31 | from azureml import ( 32 | BytesIO, 33 | Workspace, 34 | DataTypeIds, 35 | ) 36 | from tests import ( 37 | id_generator, 38 | load_test_settings, 39 | ) 40 | 41 | 42 | settings = load_test_settings() 43 | 44 | 45 | class RoundTripTests(unittest.TestCase): 46 | def setUp(self): 47 | self.workspace = Workspace( 48 | settings.workspace.id, 49 | settings.workspace.token, 50 | settings.workspace.endpoint 51 | ) 52 | self.blob = BlobService( 53 | settings.storage.account_name, 54 | settings.storage.account_key 55 | ) 56 | 57 | def _write_blob_contents(self, filename, data): 58 | if settings.diagnostics.write_blob_contents: 59 | with open('original-blob-' + filename, 'wb') as data_file: 60 | data_file.write(data) 61 | 62 | def _write_serialized_frame(self, filename, data): 63 | if settings.diagnostics.write_serialized_frame: 64 | with open('serialized-frame-' + filename, 'wb') as data_file: 65 | data_file.write(data) 66 | 67 | def test_download_blob_then_upload_as_dataframe_then_read_dataset(self): 68 | def datatypeid_from_header_and_format(header, format): 69 | if format == 'csv': 70 | if header == 'wh': 71 | return DataTypeIds.GenericCSV 72 | else: 73 | return DataTypeIds.GenericCSVNoHeader 74 | elif format == 'tsv': 75 | if header == 'wh': 76 | return DataTypeIds.GenericTSV 77 | else: 78 | return DataTypeIds.GenericTSVNoHeader 79 | elif format == 'txt': 80 | return DataTypeIds.PlainText 81 | else: 82 | self.assertTrue(False, 'Unexpected format') 83 | 84 | def split_blob_name(blob_name): 85 | # blob naming convention: 86 | # name_
<header>.<format> 87 | # <header>
: WH: with header 88 | # NH: no header 89 | # : CSV: comma separated 90 | # TSV: tab separated 91 | # TXT: newline separated 92 | name, format = blob_name.lower().split('.') 93 | if format != 'txt': 94 | name, header = name.split('_') 95 | else: 96 | header = 'nh' 97 | 98 | return name, format, header 99 | 100 | for blob_name in settings.storage.blobs: 101 | print(blob_name) 102 | 103 | name, format, header = split_blob_name(blob_name) 104 | 105 | # Read the data from blob storage 106 | original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name) 107 | self._write_blob_contents(blob_name, original_data) 108 | 109 | # Parse the data to a dataframe using Pandas 110 | original_dataframe = pd.read_csv( 111 | BytesIO(original_data), 112 | header=0 if header == 'wh' else None, 113 | sep=',' if format == 'csv' else '\t' if format == 'tsv' else '\n', 114 | encoding='utf-8-sig' 115 | ) 116 | 117 | # Upload the dataframe as a new dataset 118 | dataset_name = 'unittest' + name + id_generator() 119 | description = 'safe to be deleted - ' + dataset_name 120 | data_type_id = datatypeid_from_header_and_format(header, format) 121 | self.workspace.datasets.add_from_dataframe( 122 | original_dataframe, 123 | data_type_id, 124 | dataset_name, 125 | description, 126 | ) 127 | 128 | # Get the new dataset 129 | dataset = self.workspace.datasets[dataset_name] 130 | self.assertIsNotNone(dataset) 131 | 132 | # Read the dataset as a dataframe 133 | result_data = dataset.read_as_binary() 134 | self._write_serialized_frame(blob_name, result_data) 135 | result_dataframe = dataset.to_dataframe() 136 | 137 | # Verify that the dataframes are equal 138 | assert_frame_equal(original_dataframe, result_dataframe) 139 | 140 | def test_azureml_example_datasets(self): 141 | max_size = 10 * 1024 * 1024 142 | skip = [ 143 | 'Restaurant feature data', 144 | 'IMDB Movie Titles', 145 | 'Book Reviews from Amazon', 146 | ] 147 | 148 | for dataset in self.workspace.example_datasets: 149 | if not hasattr(dataset, 'to_dataframe'): 150 | print('skipped (unsupported format): {0}'.format(dataset.name)) 151 | continue 152 | 153 | if dataset.size > max_size: 154 | print('skipped (max size): {0}'.format(dataset.name)) 155 | continue 156 | 157 | if dataset.name in skip: 158 | print('skipped: {0}'.format(dataset.name)) 159 | continue 160 | 161 | print('downloading: ' + dataset.name) 162 | frame = dataset.to_dataframe() 163 | 164 | print('uploading: ' + dataset.name) 165 | dataset_name = 'unittest' + dataset.name + id_generator() 166 | description = 'safe to be deleted - ' + dataset_name 167 | self.workspace.datasets.add_from_dataframe(frame, dataset.data_type_id, dataset_name, description) 168 | 169 | 170 | if __name__ == '__main__': 171 | unittest.main() 172 | -------------------------------------------------------------------------------- /tests/serialize_test.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 
4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #-------------------------------------------------------------------------- 25 | 26 | import unittest 27 | import azureml 28 | import sys 29 | from azureml.services import _serialize_func, _deserialize_func, _encode, _decode 30 | 31 | def mutually_ref_f(): 32 | mutually_ref_g 33 | return 42, mutually_ref_g 34 | 35 | def mutually_ref_g(): 36 | return 100, mutually_ref_f 37 | 38 | abc = 200 39 | def reads_global(): 40 | return abc 41 | 42 | 43 | class MyClass(): 44 | pass 45 | 46 | class BaseClass: pass 47 | 48 | class DerivedClass(BaseClass): 49 | pass 50 | 51 | def reads_class(): 52 | return MyClass() 53 | 54 | def reads_derived_class(): 55 | return DerivedClass() 56 | 57 | def aliased_function(): 58 | return 42 59 | 60 | alias = aliased_function 61 | def calls_aliased_function(): 62 | return alias() 63 | 64 | def reads_module(): 65 | return sys.version 66 | 67 | class Test_serialize_test(unittest.TestCase): 68 | def make_globals(self): 69 | return {'__builtins__' : __builtins__} 70 | 71 | def test_module(self): 72 | serialized = _serialize_func(reads_module) 73 | glbs = self.make_globals() 74 | f = _deserialize_func(serialized, glbs) 75 | self.assertEqual(f(), sys.version) 76 | 77 | def test_aliasing(self): 78 | serialized = _serialize_func(calls_aliased_function) 79 | glbs = self.make_globals() 80 | f = _deserialize_func(serialized, glbs) 81 | self.assertEqual(f(), 42) 82 | 83 | def test_mutually_ref(self): 84 | global mutually_ref_f, mutually_ref_g 85 | 86 | glbs = self.make_globals() 87 | serialized = _serialize_func(mutually_ref_f) 88 | del mutually_ref_f, mutually_ref_g 89 | 90 | f = _deserialize_func(serialized, glbs) 91 | self.assertEqual(f()[0], 42) 92 | 93 | self.assertEqual(f()[1]()[0], 100) 94 | 95 | def test_reads_global(self): 96 | global abc, reads_global 97 | 98 | glbs = self.make_globals() 99 | s = _serialize_func(reads_global) 100 | del abc, reads_global 101 | f = _deserialize_func(s, glbs) 102 | 103 | self.assertEqual(f(), 200) 104 | pass 105 | 106 | def test_core_types(self): 107 | values = [42, 'abc', b'abc', 100.0, True, False, 3j, None, [1,2,3], (1,2,3), {2:3}] 108 | 109 | for value in values: 110 | self.assertEqual(_decode(_encode(value)), value) 111 | 112 | def test_other_types(self): 113 | try: 114 | import numpy 115 | self.assertTrue(_decode(_encode(numpy.ndarray(42))).all()) 116 | except: 117 | return 118 | 119 
| def test_reads_class(self): 120 | global reads_class, MyClass 121 | 122 | s = _serialize_func(reads_class) 123 | del reads_class, MyClass 124 | 125 | glbs = self.make_globals() 126 | f = _deserialize_func(s, glbs) 127 | 128 | self.assertTrue(repr(f()).startswith('<__main__.MyClass instance at')) 129 | 130 | #def test_reads_derived_class(self): 131 | # global reads_derived_class, BaseClass, DerivedClass 132 | 133 | # s = _serialize_func(reads_derived_class) 134 | # del reads_derived_class, BaseClass, DerivedClass 135 | 136 | # glbs = self.make_globals() 137 | # f = _deserialize_func(s, glbs) 138 | 139 | # print(glbs) 140 | # print(repr(f())) 141 | 142 | if __name__ == '__main__': 143 | unittest.main() 144 | -------------------------------------------------------------------------------- /tests/servicestests.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | #-------------------------------------------------------------------------- 25 | 26 | from azureml import services 27 | import time 28 | import tests 29 | import traceback 30 | import unittest 31 | import lib 32 | import uuid 33 | 34 | def invoke(published_func, *args, **kwargs): 35 | '''helper to repeatedly invoke the function until it becomes available...''' 36 | for i in xrange(100): 37 | time.sleep(5) 38 | try: 39 | return published_func(*args, **kwargs) 40 | break 41 | except Exception as e: 42 | traceback.print_exc() 43 | print(e) 44 | 45 | def invoke_map(published_func, *args): 46 | '''helper to repeatedly invoke the function until it becomes available...''' 47 | for i in xrange(100): 48 | time.sleep(5) 49 | try: 50 | return published_func.map(*args) 51 | break 52 | except Exception as e: 53 | traceback.print_exc() 54 | print(e) 55 | 56 | class Test_services(unittest.TestCase): 57 | def test_service_id(self): 58 | service_id = uuid.UUID(lib.str_typed.service.service_id) 59 | self.assertNotEqual(service_id, uuid.UUID('00000000000000000000000000000000')) 60 | 61 | def test_str_typed(self): 62 | self.assertEqual(invoke(lib.str_typed.service, 'abc', 'def'), 'abcdef') 63 | 64 | def test_attached(self): 65 | self.assertEqual(invoke(lib.attached.service, 'test '), 'test hello world!') 66 | 67 | def test_bool_typed(self): 68 | self.assertEqual(invoke(lib.bool_typed.service, True, False), False) 69 | 70 | def test_float_typed(self): 71 | self.assertEqual(invoke(lib.float_typed.service, 3.0, 5.0), .6) 72 | 73 | def test_multivalue_return(self): 74 | self.assertEqual(invoke(lib.multivalue_return.service, 1, 2), (3, -1)) 75 | 76 | def test_map(self): 77 | # invoking via map 78 | self.assertEqual(invoke_map(lib.typed.service, [1, 1], [2, 4]), [3, 5]) 79 | 80 | def test_varargs(self): 81 | # style 1, var args 82 | self.assertEqual(invoke(lib.mysum.service, 1, 2, 3), 6) 83 | 84 | def test_interned_values(self): 85 | # style 1, var args 86 | self.assertEqual(invoke(lib.untyped_identity.service, [1, 1, None, None]), [1, 1, None, None]) 87 | 88 | def test_kwargs(self): 89 | self.assertEqual(invoke(lib.kwargs.service, x = 1, y = 2), {'y': 2, 'x': 1}) 90 | 91 | def test_simple_decorator(self): 92 | # style 1, define a function and publish it with a decorator 93 | self.assertEqual(invoke(lib.myfunc.service, 1, 2), [4, -3, 2, 0]) 94 | 95 | def test_publish_explicitly(self): 96 | # style 2, define a function and call the publish API explicitly. 97 | self.assertEqual(invoke(lib.published, 1, 2), [3, -1, 2, 0]) 98 | 99 | def test_strongly_typed(self): 100 | # a strongly typed version... 101 | self.assertEqual(invoke(lib.typed.service, 1, 2), 3) 102 | 103 | def test_data_frame_input(self): 104 | # style 2, define a function and call the publish API explicitly. 
105 | self.assertEqual(invoke(lib.dataframe_int, 1, 2), 3.0) 106 | 107 | 108 | #def test_complex_typed(self): 109 | # print(invoke(lib.complex_typed, 3j, 5j)) 110 | 111 | def test_consume_published(self): 112 | # style 3, consume an already published service 113 | url, api_key, help_url = lib.published 114 | 115 | @services.service(url, api_key) 116 | def published_func(a, b): 117 | pass 118 | 119 | self.assertEqual(invoke(published_func, 1, 2), [3, -1, 2, 0]) 120 | 121 | if __name__ == '__main__': 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /tests/unittests.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation 3 | # All rights reserved. 4 | # 5 | # MIT License: 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | #-------------------------------------------------------------------------- 25 | 26 | import math 27 | import sys 28 | import unittest 29 | import os 30 | import pandas as pd 31 | import numpy as np 32 | from os import path 33 | from pandas.util.testing import assert_frame_equal 34 | import random 35 | 36 | from azureml import ( 37 | BytesIO, 38 | Workspace, 39 | DataTypeIds, 40 | AzureMLConflictHttpError, 41 | AzureMLHttpError, 42 | UnsupportedDatasetTypeError, 43 | serialize_dataframe, 44 | deserialize_dataframe, 45 | ) 46 | from tests import ( 47 | id_generator, 48 | load_test_settings, 49 | ) 50 | 51 | 52 | EXAMPLE_EXPERIMENT_ID = '506153734175476c4f62416c57734963.f-id.1f022fd4578847dc867d662a51f0a105' 53 | EXAMPLE_EXPERIMENT_DESC = 'Binary Classification: Breast cancer detection' 54 | 55 | EXAMPLE_DATASET_NAME = 'Airport Codes Dataset' 56 | EXAMPLE_UNSUPPORTED_DATASET_NAME = 'Breast cancer data' 57 | 58 | settings = load_test_settings() 59 | 60 | 61 | class WorkspaceTests(unittest.TestCase): 62 | def test_create(self): 63 | # Arrange 64 | 65 | # Act 66 | workspace = Workspace( 67 | workspace_id=settings.workspace.id, 68 | authorization_token=settings.workspace.token, 69 | endpoint=settings.workspace.endpoint 70 | ) 71 | 72 | # Assert 73 | 74 | def test_create_ini(self): 75 | # Arrange 76 | try: 77 | with open(path.expanduser('~/.azureml/settings.ini'), 'w') as config: 78 | config.write(''' 79 | [workspace] 80 | id=test_id 81 | authorization_token=test_token 82 | api_endpoint=api_endpoint 83 | management_endpoint=management_endpoint 84 | ''') 85 | 86 | workspace = Workspace() 87 | # Assert 88 | self.assertEqual(workspace.workspace_id, 'test_id') 89 | self.assertEqual(workspace.authorization_token, 'test_token') 90 | self.assertEqual(workspace.api_endpoint, 'api_endpoint') 91 | self.assertEqual(workspace.management_endpoint, 'management_endpoint') 92 | finally: 93 | if path.exists(path.expanduser('~/.azureml/settings.ini')): 94 | os.unlink(path.expanduser('~/.azureml/settings.ini')) 95 | 96 | def test_create_json(self): 97 | # Arrange 98 | 99 | # Act 100 | 101 | try: 102 | with open(path.expanduser('~/.azureml/settings.json'), 'w') as config: 103 | config.write(''' 104 | {"workspace":{ 105 | "id":"test_id", 106 | "authorization_token": "test_token", 107 | "api_endpoint":"api_endpoint", 108 | "management_endpoint":"management_endpoint" 109 | }}''') 110 | 111 | workspace = Workspace() 112 | # Assert 113 | self.assertEqual(workspace.workspace_id, 'test_id') 114 | self.assertEqual(workspace.authorization_token, 'test_token') 115 | self.assertEqual(workspace.api_endpoint, 'api_endpoint') 116 | self.assertEqual(workspace.management_endpoint, 'management_endpoint') 117 | finally: 118 | if path.exists(path.expanduser('~/.azureml/settings.json')): 119 | os.unlink(path.expanduser('~/.azureml/settings.json')) 120 | 121 | 122 | def test_create_no_workspace_id(self): 123 | # Arrange 124 | 125 | # Act 126 | with self.assertRaises(TypeError): 127 | workspace = Workspace( 128 | workspace_id='', 129 | authorization_token=settings.workspace.token, 130 | ) 131 | 132 | # Assert 133 | 134 | def test_create_no_workspace_token(self): 135 | # Arrange 136 | 137 | # Act 138 | with self.assertRaises(TypeError): 139 | workspace = Workspace( 140 | workspace_id=settings.workspace.id, 141 | authorization_token='', 142 | ) 143 | 144 | # Assert 145 | 146 | def test_create_no_endpoint(self): 147 | # Arrange 148 | 149 | # Act 150 | with self.assertRaises(TypeError): 151 | workspace = Workspace( 152 | 
workspace_id=settings.workspace.id, 153 | authorization_token=settings.workspace.token, 154 | endpoint=None 155 | ) 156 | 157 | # Assert 158 | 159 | 160 | class ExperimentsTests(unittest.TestCase): 161 | def setUp(self): 162 | self.workspace = Workspace( 163 | settings.workspace.id, 164 | settings.workspace.token, 165 | settings.workspace.endpoint 166 | ) 167 | 168 | def test_iter(self): 169 | # Arrange 170 | 171 | # Act 172 | all = [] 173 | for experiment in self.workspace.experiments: 174 | all.append(experiment) 175 | print(experiment.experiment_id) 176 | print(experiment.description.encode('ascii', 'ignore')) 177 | print('') 178 | 179 | # Assert 180 | self.assertGreater(len(all), 0) 181 | 182 | def test_iter_example_experiments(self): 183 | # Arrange 184 | 185 | # Act 186 | all = [] 187 | for experiment in self.workspace.example_experiments: 188 | all.append(experiment) 189 | print(experiment.experiment_id) 190 | print(experiment.description.encode('ascii', 'ignore')) 191 | print('') 192 | self.assertTrue(experiment.is_example) 193 | 194 | # Assert 195 | self.assertGreater(len(all), 0) 196 | self.assertEqual(1, len([e for e in all if e.description == EXAMPLE_EXPERIMENT_DESC])) 197 | 198 | def test_iter_user_experiments(self): 199 | # Arrange 200 | 201 | # Act 202 | all = [] 203 | for experiment in self.workspace.user_experiments: 204 | all.append(experiment) 205 | print(experiment.experiment_id) 206 | print(experiment.description.encode('ascii', 'ignore')) 207 | print('') 208 | self.assertFalse(experiment.is_example) 209 | 210 | # Assert 211 | self.assertGreater(len(all), 0) 212 | self.assertEqual(0, len([e for e in all if e.description == EXAMPLE_EXPERIMENT_DESC])) 213 | 214 | def test_len(self): 215 | # Arrange 216 | 217 | # Act 218 | result = len(self.workspace.experiments) 219 | 220 | # Assert 221 | self.assertGreater(result, 0) 222 | 223 | def test_getitem_by_index(self): 224 | # Arrange 225 | 226 | # Act 227 | result = self.workspace.experiments[0] 228 | 229 | # Assert 230 | self.assertIsNotNone(result) 231 | 232 | def test_getitem_by_index_long(self): 233 | if sys.version_info >= (3,): 234 | return 235 | 236 | # Arrange 237 | 238 | # Act 239 | index = long(0) # can't use 0L as that breaks 3.x parsing 240 | result = self.workspace.experiments[index] 241 | 242 | # Assert 243 | self.assertIsNotNone(result) 244 | 245 | def test_getitem_by_index_out_of_range(self): 246 | # Arrange 247 | 248 | # Act 249 | with self.assertRaises(IndexError): 250 | result = self.workspace.experiments[32700] 251 | 252 | # Assert 253 | 254 | def test_getitem_by_id(self): 255 | # Arrange 256 | 257 | # Act 258 | id = settings.intermediateDataset.experiment_id 259 | result = self.workspace.experiments[id] 260 | 261 | # Assert 262 | self.assertIsNotNone(result) 263 | self.assertEqual(result.experiment_id, id) 264 | 265 | def test_getitem_by_id_does_not_exist(self): 266 | # Arrange 267 | 268 | # Act 269 | with self.assertRaises(IndexError): 270 | result = self.workspace.experiments['Does Not Exist'] 271 | 272 | # Assert 273 | 274 | def test_repr(self): 275 | # Arrange 276 | 277 | # Act 278 | result = repr(self.workspace.example_experiments) 279 | 280 | # Assert 281 | self.assertIn(EXAMPLE_EXPERIMENT_DESC, result) 282 | 283 | 284 | class ExperimentTests(unittest.TestCase): 285 | def setUp(self): 286 | self.workspace = Workspace( 287 | settings.workspace.id, 288 | settings.workspace.token, 289 | settings.workspace.endpoint 290 | ) 291 | 292 | def assertArrayEqual(self, a, b): 293 | if sys.version_info < (3,): 
294 | self.assertItemsEqual(a, b) 295 | else: 296 | self.assertCountEqual(a, b) 297 | 298 | def test_metadata(self): 299 | # Arrange 300 | experiment = self.workspace.experiments[ 301 | settings.intermediateDataset.experiment_id] 302 | 303 | # Act 304 | print('status.status_code: {0}'.format(experiment.status.status_code)) 305 | print('status.status_detail: {0}'.format(experiment.status.status_detail)) 306 | print('status.creation_time: {0}'.format(experiment.status.creation_time)) 307 | print('description: {0}'.format(experiment.description.encode('ascii','ignore'))) 308 | print('creator: {0}'.format(experiment.creator)) 309 | print('experiment_id: {0}'.format(experiment.experiment_id)) 310 | print('job_id: {0}'.format(experiment.job_id)) 311 | print('version_id: {0}'.format(experiment.version_id)) 312 | print('etag: {0}'.format(experiment.etag)) 313 | print('run_id: {0}'.format(experiment.run_id)) 314 | print('is_archived: {0}'.format(experiment.is_archived)) 315 | print('is_example: {0}'.format(experiment.is_example)) 316 | 317 | # Assert 318 | 319 | def test_repr(self): 320 | # Arrange 321 | experiment = self.workspace.experiments[ 322 | settings.intermediateDataset.experiment_id] 323 | 324 | # Act 325 | result = repr(experiment) 326 | 327 | # Assert 328 | expected = u'{0}\t{1}'.format(experiment.experiment_id, experiment.description) 329 | if sys.version_info < (3,): 330 | self.assertEqual(type(result), bytes) 331 | self.assertEqual(result, expected.encode('ascii', 'ignore')) 332 | else: 333 | self.assertEqual(type(result), str) 334 | self.assertEqual(result, expected) 335 | 336 | def test_get_intermediate_dataset(self): 337 | # Arrange 338 | experiment = self.workspace.experiments[ 339 | settings.intermediateDataset.experiment_id] 340 | 341 | # Act 342 | result = experiment.get_intermediate_dataset( 343 | settings.intermediateDataset.node_id, 344 | settings.intermediateDataset.port_name, 345 | settings.intermediateDataset.data_type_id 346 | ) 347 | 348 | # Assert 349 | self.assertIsNotNone(result) 350 | self.assertEqual(result.workspace, self.workspace) 351 | self.assertEqual(result.experiment, experiment) 352 | self.assertEqual(result.node_id, settings.intermediateDataset.node_id) 353 | self.assertEqual(result.port_name, settings.intermediateDataset.port_name) 354 | self.assertEqual(result.data_type_id, settings.intermediateDataset.data_type_id) 355 | 356 | 357 | class IntermediateDatasetTests(unittest.TestCase): 358 | def setUp(self): 359 | self.workspace = Workspace( 360 | settings.workspace.id, 361 | settings.workspace.token, 362 | settings.workspace.endpoint 363 | ) 364 | 365 | self.experiment = self.workspace.experiments[ 366 | settings.intermediateDataset.experiment_id] 367 | 368 | self.dataset = self.experiment.get_intermediate_dataset( 369 | settings.intermediateDataset.node_id, 370 | settings.intermediateDataset.port_name, 371 | settings.intermediateDataset.data_type_id 372 | ) 373 | 374 | def test_to_dataframe(self): 375 | # Arrange 376 | 377 | # Act 378 | result = self.dataset.to_dataframe() 379 | 380 | # Assert 381 | self.assertGreater(len(result.columns), 0) 382 | self.assertGreater(len(result.values[0]), 0) 383 | 384 | def test_to_dataframe_unsupported_data_type_id(self): 385 | # Arrange 386 | dataset = self.experiment.get_intermediate_dataset( 387 | settings.intermediateDataset.node_id, 388 | settings.intermediateDataset.port_name, 389 | 'Unsupported' 390 | ) 391 | 392 | # Act 393 | result = hasattr(dataset, 'to_dataframe') 394 | 395 | # Assert 396 | 
self.assertFalse(result) 397 | 398 | def test_open(self): 399 | # Arrange 400 | 401 | # Act 402 | result = self.dataset.open() 403 | 404 | # Assert 405 | self.assertIsNotNone(result) 406 | raw_data = result.read() 407 | self.assertGreater(len(raw_data), 0) 408 | 409 | def test_read_as_binary(self): 410 | # Arrange 411 | 412 | # Act 413 | result = self.dataset.read_as_binary() 414 | 415 | # Assert 416 | self.assertGreater(len(result), 0) 417 | 418 | def test_read_as_text(self): 419 | # Arrange 420 | 421 | # Act 422 | result = self.dataset.read_as_text() 423 | 424 | # Assert 425 | self.assertGreater(len(result), 0) 426 | 427 | 428 | class DatasetsTests(unittest.TestCase): 429 | def setUp(self): 430 | self.workspace = Workspace( 431 | settings.workspace.id, 432 | settings.workspace.token, 433 | settings.workspace.endpoint 434 | ) 435 | 436 | def test_len(self): 437 | # Arrange 438 | 439 | # Act 440 | result = len(self.workspace.datasets) 441 | 442 | # Assert 443 | self.assertGreater(result, 0) 444 | 445 | def test_getitem_by_index(self): 446 | # Arrange 447 | 448 | # Act 449 | result = self.workspace.datasets[0] 450 | 451 | # Assert 452 | self.assertIsNotNone(result) 453 | 454 | def test_getitem_by_index_long(self): 455 | if sys.version_info >= (3,): 456 | return 457 | 458 | # Arrange 459 | 460 | # Act 461 | index = long(0) # can't use 0L as that breaks 3.x parsing 462 | result = self.workspace.datasets[index] 463 | 464 | # Assert 465 | self.assertIsNotNone(result) 466 | 467 | def test_getitem_by_index_out_of_range(self): 468 | # Arrange 469 | 470 | # Act 471 | with self.assertRaises(IndexError): 472 | result = self.workspace.datasets[32700] 473 | 474 | # Assert 475 | 476 | def test_getitem_by_name(self): 477 | # Arrange 478 | 479 | # Act 480 | result = self.workspace.datasets[EXAMPLE_DATASET_NAME] 481 | 482 | # Assert 483 | self.assertIsNotNone(result) 484 | self.assertEqual(result.name, EXAMPLE_DATASET_NAME) 485 | 486 | def test_getitem_by_name_wrong_case(self): 487 | # Arrange 488 | 489 | # Act 490 | with self.assertRaises(IndexError): 491 | result = self.workspace.datasets[EXAMPLE_DATASET_NAME.upper()] 492 | 493 | # Assert 494 | 495 | def test_getitem_by_name_does_not_exist(self): 496 | # Arrange 497 | 498 | # Act 499 | with self.assertRaises(IndexError): 500 | result = self.workspace.datasets['Does Not Exist'] 501 | 502 | # Assert 503 | 504 | def test_iter(self): 505 | # Arrange 506 | 507 | # Act 508 | all = [] 509 | for dataset in self.workspace.datasets: 510 | all.append(dataset) 511 | print(dataset.name) 512 | 513 | # Assert 514 | self.assertGreater(len(all), 0) 515 | 516 | def test_iter_example_datasets(self): 517 | # Arrange 518 | 519 | # Act 520 | all = [] 521 | for dataset in self.workspace.example_datasets: 522 | all.append(dataset) 523 | print(dataset.dataset_id) 524 | print(dataset.name) 525 | print(dataset.data_type_id) 526 | print('') 527 | self.assertTrue(dataset.is_example) 528 | 529 | # Assert 530 | self.assertGreater(len(all), 0) 531 | self.assertEqual(1, len([a for a in all if a.name ==EXAMPLE_DATASET_NAME])) 532 | 533 | def test_iter_user_datasets(self): 534 | # Arrange 535 | 536 | # Act 537 | all = [] 538 | for dataset in self.workspace.user_datasets: 539 | all.append(dataset) 540 | print(dataset.dataset_id) 541 | print(dataset.name) 542 | print(dataset.data_type_id) 543 | print('') 544 | self.assertFalse(dataset.is_example) 545 | 546 | # Assert 547 | self.assertGreater(len(all), 0) 548 | self.assertEqual(0, len([a for a in all if a.name ==EXAMPLE_DATASET_NAME])) 549 | 
550 | def test_repr(self): 551 | # Arrange 552 | 553 | # Act 554 | result = repr(self.workspace.example_datasets) 555 | 556 | # Assert 557 | self.assertIn('{0}\n'.format(EXAMPLE_DATASET_NAME), result) 558 | 559 | 560 | class UploadTests(unittest.TestCase): 561 | def setUp(self): 562 | self.workspace = Workspace( 563 | settings.workspace.id, 564 | settings.workspace.token, 565 | settings.workspace.endpoint 566 | ) 567 | 568 | self.original_data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}] 569 | self.original_dataframe = pd.DataFrame(self.original_data) 570 | self.original_name = 'unittestcsvwh' + id_generator() 571 | self.original_description = 'safe to be deleted - ' + self.original_name 572 | 573 | self.updated_data = [{'a': 101, 'b': 102}, {'a': 105, 'b': 110, 'c': 120}] 574 | self.updated_dataframe = pd.DataFrame(self.updated_data) 575 | self.updated_name = 'unittestcsvwhupdate' + id_generator() 576 | self.updated_description = 'updated' 577 | 578 | 579 | def test_add_from_dataframe(self): 580 | # Arrange 581 | 582 | # Act 583 | result = self.workspace.datasets.add_from_dataframe( 584 | self.original_dataframe, 585 | DataTypeIds.GenericCSV, 586 | self.original_name, 587 | self.original_description, 588 | ) 589 | 590 | # Assert 591 | self.assertIsNotNone(result) 592 | self.assertEqual(result.name, self.original_name) 593 | self.assertEqual(result.description, self.original_description) 594 | self.assertEqual(result.data_type_id, DataTypeIds.GenericCSV) 595 | self.assertEqual(result.owner, 'Python SDK') 596 | self.assertIsNotNone(self.workspace.datasets[self.original_name]) 597 | 598 | def test_add_from_dataframe_conflict(self): 599 | # Arrange 600 | self.workspace.datasets.add_from_dataframe( 601 | self.original_dataframe, 602 | DataTypeIds.GenericCSV, 603 | self.original_name, 604 | self.original_description, 605 | ) 606 | 607 | # Act 608 | with self.assertRaises(AzureMLConflictHttpError): 609 | result = self.workspace.datasets.add_from_dataframe( 610 | self.original_dataframe, 611 | DataTypeIds.GenericCSV, 612 | self.original_name, 613 | self.original_description, 614 | ) 615 | 616 | # Assert 617 | 618 | def test_update_from_dataframe(self): 619 | # Arrange 620 | dataset = self.workspace.datasets.add_from_dataframe( 621 | self.original_dataframe, 622 | DataTypeIds.GenericCSV, 623 | self.original_name, 624 | self.original_description, 625 | ) 626 | 627 | # Act 628 | result = dataset.update_from_dataframe(self.updated_dataframe) 629 | 630 | # Assert 631 | self.assertIsNone(result) 632 | actual_dataframe = dataset.to_dataframe() 633 | self.assertEqual(dataset.name, self.original_name) 634 | self.assertEqual(dataset.description, self.original_description) 635 | self.assertEqual(dataset.data_type_id, DataTypeIds.GenericCSV) 636 | assert_frame_equal(actual_dataframe, self.updated_dataframe) 637 | 638 | def test_update_from_dataframe_with_type_id_name_description(self): 639 | # Arrange 640 | dataset = self.workspace.datasets.add_from_dataframe( 641 | self.original_dataframe, 642 | DataTypeIds.GenericCSV, 643 | self.original_name, 644 | self.original_description, 645 | ) 646 | 647 | # Act 648 | result = dataset.update_from_dataframe( 649 | self.updated_dataframe, 650 | DataTypeIds.GenericTSV, 651 | self.updated_name, 652 | self.updated_description) 653 | 654 | # Assert 655 | self.assertIsNone(result) 656 | actual_dataframe = dataset.to_dataframe() 657 | self.assertEqual(dataset.name, self.updated_name) 658 | self.assertEqual(dataset.description, self.updated_description) 659 | 
self.assertEqual(dataset.data_type_id, DataTypeIds.GenericTSV) 660 | assert_frame_equal(actual_dataframe, self.updated_dataframe) 661 | 662 | def test_add_from_dataframe_invalid_name(self): 663 | # Arrange 664 | invalid_name = 'unittestcsvwh:' + id_generator() 665 | 666 | # Act 667 | try: 668 | result = self.workspace.datasets.add_from_dataframe( 669 | self.original_dataframe, 670 | DataTypeIds.GenericCSV, 671 | invalid_name, 672 | self.original_description, 673 | ) 674 | self.assertTrue(False, 'Failed to raise AzureMLHttpError.') 675 | except AzureMLHttpError as error: 676 | self.assertIn('forbidden characters', str(error)) 677 | self.assertEqual(error.status_code, 400) 678 | 679 | # Assert 680 | 681 | def test_add_from_raw_data(self): 682 | # Arrange 683 | original_raw_data = _frame_to_raw_data(self.original_dataframe, ',', True) 684 | 685 | # Act 686 | result = self.workspace.datasets.add_from_raw_data( 687 | original_raw_data, 688 | DataTypeIds.GenericCSV, 689 | self.original_name, 690 | self.original_description, 691 | ) 692 | 693 | # Assert 694 | self.assertIsNotNone(result) 695 | self.assertIsNotNone(self.workspace.datasets[self.original_name]) 696 | self.assertEqual(result.name, self.original_name) 697 | self.assertEqual(result.description, self.original_description) 698 | 699 | def test_add_from_raw_data_chunked(self): 700 | original_name = 'unittestcsvwh' + id_generator() 701 | 702 | # Arrange 703 | original_raw_data = b''.join(chr(random.randint(0, 255)) for x in range(0x800000)) 704 | 705 | # Act 706 | result = self.workspace.datasets.add_from_raw_data( 707 | original_raw_data, 708 | DataTypeIds.GenericCSV, 709 | original_name, 710 | 'test description', 711 | ) 712 | 713 | # Assert 714 | self.assertIsNotNone(result) 715 | self.assertIsNotNone(self.workspace.datasets[original_name]) 716 | self.assertEqual(result.name, original_name) 717 | 718 | new_data = self.workspace.datasets[original_name].read_as_binary() 719 | self.assertEqual(original_raw_data, new_data) 720 | 721 | 722 | def test_update_from_raw_data(self): 723 | # Arrange 724 | dataset = self.workspace.datasets.add_from_dataframe( 725 | self.original_dataframe, 726 | DataTypeIds.GenericCSV, 727 | self.original_name, 728 | self.original_description, 729 | ) 730 | 731 | updated_raw_data = _frame_to_raw_data(self.updated_dataframe, ',', True) 732 | 733 | # Act 734 | result = dataset.update_from_raw_data(updated_raw_data) 735 | 736 | # Assert 737 | self.assertIsNone(result) 738 | actual_dataframe = dataset.to_dataframe() 739 | self.assertEqual(dataset.name, self.original_name) 740 | self.assertEqual(dataset.description, self.original_description) 741 | self.assertEqual(dataset.data_type_id, DataTypeIds.GenericCSV) 742 | assert_frame_equal(actual_dataframe, self.updated_dataframe) 743 | 744 | def test_update_from_raw_data_with_data_type_id_name_description(self): 745 | # Arrange 746 | dataset = self.workspace.datasets.add_from_dataframe( 747 | self.original_dataframe, 748 | DataTypeIds.GenericCSV, 749 | self.original_name, 750 | self.original_description, 751 | ) 752 | 753 | updated_raw_data = _frame_to_raw_data(self.updated_dataframe, '\t', True) 754 | 755 | # Act 756 | result = dataset.update_from_raw_data( 757 | updated_raw_data, 758 | DataTypeIds.GenericTSV, 759 | self.updated_name, 760 | self.updated_description, 761 | ) 762 | 763 | # Assert 764 | self.assertIsNone(result) 765 | actual_dataframe = dataset.to_dataframe() 766 | self.assertEqual(dataset.name, self.updated_name) 767 | self.assertEqual(dataset.description, 
self.updated_description) 768 | self.assertEqual(dataset.data_type_id, DataTypeIds.GenericTSV) 769 | assert_frame_equal(actual_dataframe, self.updated_dataframe) 770 | 771 | def test_update_from_dataframe_example_dataset(self): 772 | # Arrange 773 | dataset = self.workspace.example_datasets[0] 774 | 775 | # Act 776 | result = hasattr(dataset, 'update_from_dataframe') 777 | 778 | # Assert 779 | self.assertFalse(result) 780 | 781 | def test_update_from_raw_data_example_dataset(self): 782 | # Arrange 783 | dataset = self.workspace.example_datasets[0] 784 | 785 | # Act 786 | result = hasattr(dataset, 'update_from_raw_data') 787 | 788 | # Assert 789 | self.assertFalse(result) 790 | 791 | 792 | class DatasetTests(unittest.TestCase): 793 | def setUp(self): 794 | self.workspace = Workspace( 795 | settings.workspace.id, 796 | settings.workspace.token, 797 | settings.workspace.endpoint 798 | ) 799 | 800 | def assertArrayEqual(self, a, b): 801 | if sys.version_info < (3,): 802 | self.assertItemsEqual(a, b) 803 | else: 804 | self.assertCountEqual(a, b) 805 | 806 | def test_metadata(self): 807 | # Arrange 808 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME] 809 | 810 | # Act 811 | print('visualize_end_point.base_uri: {0}'.format(dataset.visualize_end_point.base_uri)) 812 | print('visualize_end_point.size: {0}'.format(dataset.visualize_end_point.size)) 813 | print('visualize_end_point.endpoint_type: {0}'.format(dataset.visualize_end_point.endpoint_type)) 814 | print('visualize_end_point.credential_container: {0}'.format(dataset.visualize_end_point.credential_container)) 815 | print('visualize_end_point.access_credential: {0}'.format(dataset.visualize_end_point.access_credential)) 816 | print('visualize_end_point.location: {0}'.format(dataset.visualize_end_point.location)) 817 | print('visualize_end_point.file_type: {0}'.format(dataset.visualize_end_point.file_type)) 818 | print('visualize_end_point.is_auxiliary: {0}'.format(dataset.visualize_end_point.is_auxiliary)) 819 | print('visualize_end_point.name: {0}'.format(dataset.visualize_end_point.name)) 820 | print('schema_end_point.base_uri: {0}'.format(dataset.schema_end_point.base_uri)) 821 | print('schema_end_point.size: {0}'.format(dataset.schema_end_point.size)) 822 | print('schema_end_point.endpoint_type: {0}'.format(dataset.schema_end_point.endpoint_type)) 823 | print('schema_end_point.credential_container: {0}'.format(dataset.schema_end_point.credential_container)) 824 | print('schema_end_point.access_credential: {0}'.format(dataset.schema_end_point.access_credential)) 825 | print('schema_end_point.location: {0}'.format(dataset.schema_end_point.location)) 826 | print('schema_end_point.file_type: {0}'.format(dataset.schema_end_point.file_type)) 827 | print('schema_end_point.is_auxiliary: {0}'.format(dataset.schema_end_point.is_auxiliary)) 828 | print('schema_end_point.name: {0}'.format(dataset.schema_end_point.name)) 829 | print('schema_status: {0}'.format(dataset.schema_status)) 830 | print('dataset_id: {0}'.format(dataset.dataset_id)) 831 | print('data_type_id: {0}'.format(dataset.data_type_id)) 832 | print('name: {0}'.format(dataset.name)) 833 | print('description: {0}'.format(dataset.description)) 834 | print('family_id: {0}'.format(dataset.family_id)) 835 | print('resource_upload_id: {0}'.format(dataset.resource_upload_id)) 836 | print('source_origin: {0}'.format(dataset.source_origin)) 837 | print('size: {0}'.format(dataset.size)) 838 | print('created_date: {0}'.format(dataset.created_date)) 839 | print('owner: 
{0}'.format(dataset.owner)) 840 | print('experiment_id: {0}'.format(dataset.experiment_id)) 841 | print('client_version: {0}'.format(dataset.client_version)) 842 | print('promoted_from: {0}'.format(dataset.promoted_from)) 843 | print('uploaded_from_filename: {0}'.format(dataset.uploaded_from_filename)) 844 | print('service_version: {0}'.format(dataset.service_version)) 845 | print('is_latest: {0}'.format(dataset.is_latest)) 846 | print('category: {0}'.format(dataset.category)) 847 | print('download_location.base_uri: {0}'.format(dataset.download_location.base_uri)) 848 | print('download_location.size: {0}'.format(dataset.download_location.size)) 849 | print('download_location.endpoint_type: {0}'.format(dataset.download_location.endpoint_type)) 850 | print('download_location.credential_container: {0}'.format(dataset.download_location.credential_container)) 851 | print('download_location.access_credential: {0}'.format(dataset.download_location.access_credential)) 852 | print('download_location.location: {0}'.format(dataset.download_location.location)) 853 | print('download_location.file_type: {0}'.format(dataset.download_location.file_type)) 854 | print('download_location.is_auxiliary: {0}'.format(dataset.download_location.is_auxiliary)) 855 | print('download_location.name: {0}'.format(dataset.download_location.name)) 856 | print('is_deprecated: {0}'.format(dataset.is_deprecated)) 857 | print('culture: {0}'.format(dataset.culture)) 858 | print('batch: {0}'.format(dataset.batch)) 859 | print('created_date_ticks: {0}'.format(dataset.created_date_ticks)) 860 | 861 | # Assert 862 | 863 | def test_repr(self): 864 | # Arrange 865 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME] 866 | 867 | # Act 868 | result = repr(dataset) 869 | 870 | # Assert 871 | self.assertEqual(dataset.name, result) 872 | 873 | def test_to_dataframe(self): 874 | # Arrange 875 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME] 876 | 877 | # Act 878 | result = dataset.to_dataframe() 879 | 880 | # Assert 881 | self.assertArrayEqual( 882 | result.columns, 883 | [u'airport_id', u'city', u'state', u'name']) 884 | self.assertArrayEqual( 885 | result.values[0], 886 | [10165, 'Adak Island', 'AK', 'Adak']) 887 | self.assertArrayEqual( 888 | result.values[-1], 889 | [14543, 'Rock Springs', 'WY', 'Rock Springs Sweetwater County']) 890 | 891 | def test_to_dataframe_unsupported_data_type_id(self): 892 | # Arrange 893 | dataset = self.workspace.datasets[EXAMPLE_UNSUPPORTED_DATASET_NAME] 894 | 895 | # Act 896 | result = hasattr(dataset, 'to_dataframe') 897 | 898 | # Assert 899 | self.assertFalse(result) 900 | 901 | def test_open(self): 902 | # Arrange 903 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME] 904 | 905 | # Act 906 | result = dataset.open() 907 | 908 | # Assert 909 | self.assertIsNotNone(result) 910 | raw_data = result.read() 911 | expected = b'airport_id,city,state,name\r\n10165,Adak Island, AK, Adak' 912 | self.assertEqual(raw_data[:len(expected)], expected) 913 | 914 | def test_read_as_binary(self): 915 | # Arrange 916 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME] 917 | 918 | # Act 919 | result = dataset.read_as_binary() 920 | 921 | # Assert 922 | expected = b'airport_id,city,state,name\r\n10165,Adak Island, AK, Adak' 923 | self.assertEqual(result[:len(expected)], expected) 924 | 925 | def test_read_as_text(self): 926 | # Arrange 927 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME] 928 | 929 | # Act 930 | result = dataset.read_as_text() 931 | 932 | # Assert 933 | lines = 
result.splitlines() 934 | self.assertEqual(lines[0], 'airport_id,city,state,name') 935 | self.assertEqual(lines[1], '10165,Adak Island, AK, Adak') 936 | self.assertEqual(lines[-1], '14543,Rock Springs, WY, Rock Springs Sweetwater County') 937 | 938 | 939 | class SerializationTests(unittest.TestCase): 940 | def assertArrayEqual(self, a, b): 941 | if sys.version_info < (3,): 942 | self.assertItemsEqual(a, b) 943 | else: 944 | self.assertCountEqual(a, b) 945 | 946 | def test_serialize_to_csv(self): 947 | # Arrange 948 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}] 949 | dataframe = pd.DataFrame(data) 950 | 951 | # Act 952 | writer = BytesIO() 953 | serialize_dataframe(writer, DataTypeIds.GenericCSV, dataframe) 954 | result = writer.getvalue() 955 | 956 | # Assert 957 | self.assertGreater(len(result), 0) 958 | self.assertEqual(result, b'a,b,c\n1.0,2.0,\n5.1,10.1,20.1\n') 959 | 960 | def test_serialize_to_csv_no_header(self): 961 | # Arrange 962 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}] 963 | dataframe = pd.DataFrame(data) 964 | 965 | # Act 966 | writer = BytesIO() 967 | serialize_dataframe(writer, DataTypeIds.GenericCSVNoHeader, dataframe) 968 | result = writer.getvalue() 969 | 970 | # Assert 971 | self.assertGreater(len(result), 0) 972 | self.assertEqual(result, b'1.0,2.0,\n5.1,10.1,20.1\n') 973 | 974 | def test_serialize_to_tsv(self): 975 | # Arrange 976 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}] 977 | dataframe = pd.DataFrame(data) 978 | 979 | # Act 980 | writer = BytesIO() 981 | serialize_dataframe(writer, DataTypeIds.GenericTSV, dataframe) 982 | result = writer.getvalue() 983 | 984 | # Assert 985 | self.assertGreater(len(result), 0) 986 | self.assertEqual(result, b'a\tb\tc\n1.0\t2.0\t\n5.1\t10.1\t20.1\n') 987 | 988 | def test_serialize_to_tsv_no_header(self): 989 | # Arrange 990 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}] 991 | dataframe = pd.DataFrame(data) 992 | 993 | # Act 994 | writer = BytesIO() 995 | serialize_dataframe(writer, DataTypeIds.GenericTSVNoHeader, dataframe) 996 | result = writer.getvalue() 997 | 998 | # Assert 999 | self.assertGreater(len(result), 0) 1000 | self.assertEqual(result, b'1.0\t2.0\t\n5.1\t10.1\t20.1\n') 1001 | 1002 | def test_serialize_to_plain_text(self): 1003 | # Arrange 1004 | data = ['This is the first', 'This is second line'] 1005 | dataframe = pd.DataFrame(data) 1006 | 1007 | # Act 1008 | writer = BytesIO() 1009 | serialize_dataframe(writer, DataTypeIds.PlainText, dataframe) 1010 | result = writer.getvalue() 1011 | 1012 | # Assert 1013 | self.assertGreater(len(result), 0) 1014 | self.assertEqual(result, b'This is the first\nThis is second line\n') 1015 | 1016 | def test_deserialize_from_plain_text_bom(self): 1017 | # Arrange 1018 | data = b'\xef\xbb\xbfJohn enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.\r\nMicrosoft announced upgrades to their line of products for information workers. The announcement was made at a partner conference at Boston.' 1019 | 1020 | # Act 1021 | reader = BytesIO(data) 1022 | result = deserialize_dataframe(reader, DataTypeIds.PlainText) 1023 | 1024 | # Assert 1025 | self.assertIsNotNone(result) 1026 | expected = [ 1027 | {0: 'John enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.'}, 1028 | {0: 'Microsoft announced upgrades to their line of products for information workers. 
1029 |         ]
1030 |         assert_frame_equal(pd.DataFrame(expected), result)
1031 |
1032 |     def test_deserialize_from_csv(self):
1033 |         # Arrange
1034 |         data = b'a,b,c\n1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1035 |
1036 |         # Act
1037 |         reader = BytesIO(data)
1038 |         result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)
1039 |
1040 |         # Assert
1041 |         self.assertIsNotNone(result)
1042 |         expected = [
1043 |             {'a': 1.0, 'b': 2.0},
1044 |             {'a': 5.1, 'b': 10.1, 'c': 20.1},
1045 |             {'a': 50.2, 'c': 50.3},
1046 |         ]
1047 |         assert_frame_equal(pd.DataFrame(expected), result)
1048 |
1049 |     def test_deserialize_from_csv_bom(self):
1050 |         # Arrange
1051 |         data = b'\xef\xbb\xbfa,b,c\n1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1052 |
1053 |         # Act
1054 |         reader = BytesIO(data)
1055 |         result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)
1056 |
1057 |         # Assert
1058 |         self.assertIsNotNone(result)
1059 |         expected = [
1060 |             {'a': 1.0, 'b': 2.0},
1061 |             {'a': 5.1, 'b': 10.1, 'c': 20.1},
1062 |             {'a': 50.2, 'c': 50.3},
1063 |         ]
1064 |         assert_frame_equal(pd.DataFrame(expected), result)
1065 |
1066 |     def test_deserialize_from_csv_spaces(self):
1067 |         # Arrange
1068 |         data = b'a, b, c\n1.0, two, nan\n5.1, "ten point one", 20.1\n50.2, , 50.3\n'
1069 |
1070 |         # Act
1071 |         reader = BytesIO(data)
1072 |         result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)
1073 |
1074 |         # Assert
1075 |         self.assertIsNotNone(result)
1076 |         expected = [
1077 |             {'a': 1.0, 'b': 'two'},
1078 |             {'a': 5.1, 'b': 'ten point one', 'c': 20.1},
1079 |             {'a': 50.2, 'c': 50.3},
1080 |         ]
1081 |         assert_frame_equal(pd.DataFrame(expected), result)
1082 |
1083 |     def test_deserialize_from_csv_no_header(self):
1084 |         # Arrange
1085 |         data = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1086 |
1087 |         # Act
1088 |         reader = BytesIO(data)
1089 |         result = deserialize_dataframe(reader, DataTypeIds.GenericCSVNoHeader)
1090 |
1091 |         # Assert
1092 |         self.assertIsNotNone(result)
1093 |         expected = [
1094 |             {0: 1.0, 1: 2.0},
1095 |             {0: 5.1, 1: 10.1, 2: 20.1},
1096 |             {0: 50.2, 2: 50.3},
1097 |         ]
1098 |         assert_frame_equal(pd.DataFrame(expected), result)
1099 |
1100 |     @unittest.skip('ARFF is not supported yet.')
1101 |     def test_deserialize_from_arff(self):
1102 |         # Arrange
1103 |         data = b"""@RELATION Unnamed
1104 |
1105 | @ATTRIBUTE Class NUMERIC
1106 | @ATTRIBUTE age NUMERIC
1107 | @ATTRIBUTE menopause NUMERIC
1108 | @ATTRIBUTE tumor-size NUMERIC
1109 |
1110 | @DATA
1111 | 0,5,1,1
1112 | 0,5,4,4
1113 | 1,4,8,8
1114 |
1115 | """
1116 |
1117 |         # Act
1118 |         reader = BytesIO(data)
1119 |         result = deserialize_dataframe(reader, DataTypeIds.ARFF)
1120 |         print(result)
1121 |
1122 |         # Assert
1123 |         self.assertIsNotNone(result)
1124 |         expected = [
1125 |             {'Class': 0., 'age': 5., 'menopause': 1., 'tumor-size': 1.},
1126 |             {'Class': 0., 'age': 5., 'menopause': 4., 'tumor-size': 4.},
1127 |             {'Class': 1., 'age': 4., 'menopause': 8., 'tumor-size': 8.},
1128 |         ]
1129 |         assert_frame_equal(pd.DataFrame(expected), result)
1130 |
1131 |     def test_deserialize_from_unsupported_data_type_id(self):
1132 |         # Arrange
1133 |         data = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1134 |
1135 |         # Act
1136 |         reader = BytesIO(data)
1137 |         with self.assertRaises(UnsupportedDatasetTypeError):
1138 |             result = deserialize_dataframe(reader, 'Unsupported')
1139 |
1140 |         # Assert
1141 |
1142 |
1143 | def _frame_to_raw_data(dataframe, sep, header):
1144 |     return dataframe.to_csv(sep=sep, header=header, index=False, encoding='utf-8')
1145 |
1146 |
1147 | if __name__ == '__main__':
1148 |     unittest.main()
1149 |
--------------------------------------------------------------------------------
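For readers skimming the tests above, here is a minimal usage sketch, not part of the repository, showing how the serialization helpers exercised by SerializationTests round-trip a pandas DataFrame in memory. The azureml.serialization import path is an assumption based on the azureml/serialization.py module in the package layout; the function signatures mirror the calls made in the tests.

    # Illustrative sketch (assumed import path), not repository code.
    from io import BytesIO

    import pandas as pd

    from azureml.serialization import (
        DataTypeIds,
        deserialize_dataframe,
        serialize_dataframe,
    )

    frame = pd.DataFrame([{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1}])

    # Serialize the frame to CSV bytes, then read the bytes back into a DataFrame.
    buffer = BytesIO()
    serialize_dataframe(buffer, DataTypeIds.GenericCSV, frame)
    buffer.seek(0)
    restored = deserialize_dataframe(buffer, DataTypeIds.GenericCSV)
    print(restored)

The same pattern applies to the TSV and plain-text type ids covered by the tests; only the DataTypeIds value changes.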