├── .gitignore
├── LICENSE.txt
├── PythonSDK.pyproj
├── PythonSDK.sln
├── README.md
├── azureml
├── __init__.py
├── errors.py
├── http.py
├── serialization.py
└── services.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
├── README.md
├── __init__.py
├── coverage.bat
├── foo.txt
├── lib.py
├── performancetests.py
├── roundtriptests.py
├── serialize_test.py
├── servicestests.py
└── unittests.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python cache
2 | __pycache__/
3 | *.pyc
4 |
5 | # PTVS analysis
6 | .ptvs/
7 |
8 | # Build results
9 | /bin/
10 | /obj/
11 |
12 | # Python setup.py output
13 | /azureml.egg-info/
14 | /dist/
15 | /build/
16 |
17 | # Test results
18 | /TestResults/
19 |
20 | # Credentials
21 | azuremltestsettings.json
22 |
23 | # User-specific files
24 | *.suo
25 | *.user
26 | *.sln.docstates
27 |
28 | # Windows image file caches
29 | Thumbs.db
30 | ehthumbs.db
31 |
32 | # Folder config file
33 | Desktop.ini
34 |
35 | # Recycle Bin used on file shares
36 | $RECYCLE.BIN/
37 |
38 | # Mac desktop service store files
39 | .DS_Store
40 |
41 | .idea
42 | src/build
43 | *.iml
44 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) Microsoft Corporation
2 | All rights reserved.
3 |
4 | MIT License:
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/PythonSDK.pyproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Debug
5 | 2.0
6 | eb114967-b952-4108-b806-f72c334ff3be
7 |
8 |
9 |
10 |
11 | .
12 | .
13 | .
14 | PythonSDK
15 | PythonSDK
16 | {6d533506-2bd2-4a3f-ba63-0a02b57e03ad}
17 | 2.7
18 | False
19 | SAK
20 | SAK
21 | SAK
22 | SAK
23 |
24 |
25 | true
26 | false
27 |
28 |
29 | true
30 | false
31 |
32 |
33 | 10.0
34 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 | Code
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 | Code
55 |
56 |
57 | Code
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
77 |
78 |
79 |
80 |
81 |
82 |
--------------------------------------------------------------------------------
/PythonSDK.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2013
4 | VisualStudioVersion = 12.0.31101.0
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PythonSDK", "PythonSDK.pyproj", "{EB114967-B952-4108-B806-F72C334FF3BE}"
7 | EndProject
8 | Global
9 | GlobalSection(TeamFoundationVersionControl) = preSolution
10 | SccNumberOfProjects = 2
11 | SccEnterpriseProvider = {4CA58AB2-18FA-4F8D-95D4-32DDF27D184C}
12 | SccTeamFoundationServer = http://sqlbuvsts01:8080/main
13 | SccLocalPath0 = .
14 | SccProjectUniqueName1 = PythonSDK.pyproj
15 | SccLocalPath1 = .
16 | EndGlobalSection
17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
18 | Debug|Any CPU = Debug|Any CPU
19 | Release|Any CPU = Release|Any CPU
20 | EndGlobalSection
21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
22 | {EB114967-B952-4108-B806-F72C334FF3BE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
23 | {EB114967-B952-4108-B806-F72C334FF3BE}.Release|Any CPU.ActiveCfg = Release|Any CPU
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | EndGlobal
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Microsoft Azure Machine Learning Python client library for Azure ML Studio
2 | ==========================================================================
3 |
4 | > **NOTE** This content is no longer maintained. Visit the [Azure Machine Learning Notebook](https://github.com/Azure/MachineLearningNotebooks) project for sample Jupyter notebooks for ML and deep learning with Azure Machine Learning using the Python SDK.
5 |
6 | The preview of Azure Machine Learning Python client library lets you access your Azure ML Studio datasets from your local Python environment.
7 |
8 | You can download datasets that are available in your ML Studio workspace, or intermediate datasets from experiments that were run. You can upload new datasets and update existing datasets. The data is optionally converted to/from a Pandas DataFrame.
9 |
10 | This is a technology preview. The APIs exposed by the library and the REST endpoints it connects to are subject to change.
11 |
12 |
13 | Installation
14 | ============
15 |
16 | The SDK has been tested with Python 2.7, 3.3 and 3.4.
17 |
18 | It has a dependency on the following packages:
19 |
20 | - requests
21 | - python-dateutil
22 | - pandas
23 |
24 |
25 | You can install it from [PyPI](https://pypi.python.org/pypi/azureml):
26 |
27 | ```
28 | pip install azureml
29 | ```
30 |
31 |
32 | Usage
33 | =====
34 |
35 | Note: We recommend that you use the **Generate Data Access Code** feature from [Azure Machine Learning Studio](https://studio.azureml.net) in order to get Python code snippets that give you access to your datasets. The code snippets include your workspace id, authorization token, and other necessary identifiers to get to your datasets.
36 |
37 | Accessing your workspace
38 | ------------------------
39 |
40 | You'll need to obtain your workspace id and token in order to get access to your workspace.
41 |
42 | ```python
43 | from azureml import Workspace
44 |
45 | ws = Workspace(workspace_id='4c29e1adeba2e5a7cbeb0e4f4adfb4df',
46 | authorization_token='f4f3ade2c6aefdb1afb043cd8bcf3daf')
47 | ```
48 |
49 | If you're using AzureML in a region other than South Central US you'll also need to specify the endpoint:
50 |
51 | ```python
52 | from azureml import Workspace
53 |
54 | ws = Workspace(workspace_id='4c29e1adeba2e5a7cbeb0e4f4adfb4df',
55 | authorization_token='f4f3ade2c6aefdb1afb043cd8bcf3daf',
56 | endpoint='https://europewest.studio.azureml.net/')
57 | ```
58 |
59 | Specify workspace via config
60 | ----------------------------
61 | If you don't want to store your access tokens in code, you can put them in a configuration file instead. The SDK looks for ~/.azureml/settings.ini and, if it exists, uses it:
62 |
63 | ```
64 | [workspace]
65 | id=4c29e1adeba2e5a7cbeb0e4f4adfb4df
66 | authorization_token=f4f3ade2c6aefdb1afb043cd8bcf3daf
67 | api_endpoint=https://studio.azureml.net
68 | management_endpoint=https://management.azureml.net
69 | ```
70 |
71 | And then the workspace can be created without arguments:
72 |
73 | ```python
74 | from azureml import Workspace
75 |
76 | ws = Workspace()
77 | ```
78 |
79 |
80 | Accessing datasets
81 | ------------------
82 |
83 | To enumerate all datasets in a given workspace:
84 |
85 | ```python
86 | for ds in ws.datasets:
87 | print(ds.name)
88 | ```
89 |
90 | Just the user-created datasets:
91 |
92 | ```python
93 | for ds in ws.user_datasets:
94 | print(ds.name)
95 | ```
96 |
97 | Just the example datasets:
98 |
99 | ```python
100 | for ds in ws.example_datasets:
101 | print(ds.name)
102 | ```
103 |
104 | You can access a dataset by name (which is case-sensitive):
105 |
106 | ```python
107 | ds = ws.datasets['my dataset name']
108 | ```
109 |
110 | By index:
111 |
112 | ```python
113 | ds = ws.datasets[0]
114 | ```
115 |
116 |
117 | Dataset metadata
118 | ----------------
119 |
120 | Every dataset has metadata in addition to its content.
121 |
122 | Some metadata values are assigned by the user at creation time:
123 |
124 | ```python
125 | print(ds.name)
126 | print(ds.description)
127 | print(ds.family_id)
128 | print(ds.data_type_id)
129 | ```
130 |
131 | Others are values assigned by Azure ML:
132 |
133 | ```python
134 | print(ds.id)
135 | print(ds.created_date)
136 | print(ds.size)
137 | ```
138 |
139 | See the `SourceDataset` class for more on the available metadata.
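
For instance, two more of the attributes exposed by `SourceDataset` (a small illustration; see the class for the full list):

```python
print(ds.owner)
print(ds.is_example)
```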
140 |
141 |
142 | Reading contents
143 | ----------------
144 |
145 | You can import the dataset contents as a pandas DataFrame object.
146 | The `data_type_id` metadata on the dataset is used to determine how to import the contents.
147 |
148 | ```python
149 | frame = ds.to_dataframe()
150 | ```
151 |
152 | If a dataset is in a format that cannot be deserialized to a pandas DataFrame, the dataset object will not have a `to_dataframe` method.
153 |
154 | You can still read those datasets as text or binary, then parse the data manually.
155 |
156 | Read the contents as text:
157 |
158 | ```python
159 | text_data = ds.read_as_text()
160 | ```
161 |
162 | Read the contents as binary:
163 |
164 | ```python
165 | binary_data = ds.read_as_binary()
166 | ```
167 |
168 | You can also just open a stream to the contents:
169 |
170 | ```python
171 | with ds.open() as file:
172 | binary_data_chunk = file.read(1000)
173 | ```
174 |
175 | This gives you more control over the memory usage, as you can read and parse the data in chunks.
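
For example, a minimal sketch of chunked processing (`process` is a placeholder for your own parsing logic):

```python
with ds.open() as file:
    while True:
        chunk = file.read(1000)  # read up to 1000 bytes at a time
        if not chunk:            # an empty result means end of stream
            break
        process(chunk)           # placeholder: parse or accumulate the chunk
```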
176 |
177 |
178 | Accessing intermediate datasets
179 | -------------------------------
180 |
181 | You can access the intermediate datasets at the output ports of the nodes in your experiments.
182 |
183 | Note that the default binary serialization format (.dataset) for intermediate datasets is not supported. Make sure to use a Convert to TSV or Convert to CSV module and read the intermediate dataset from its output port.
184 |
185 | First, get the experiment, using the experiment id:
186 |
187 | ```python
188 | experiment = ws.experiments['my experiment id']
189 | ```
190 |
191 | Then get the intermediate dataset object:
192 |
193 | ```python
194 | ds = experiment.get_intermediate_dataset(
195 | node_id='5c457225-68e3-4b60-9e3a-bc55f9f029a4-565',
196 | port_name='Results dataset',
197 | data_type_id=DataTypeIds.GenericCSV
198 | )
199 | ```
200 |
201 | To determine the values to pass to `get_intermediate_dataset`, use the **Generate Data Access Code** command on the module output port in ML Studio.
202 |
203 | You can then read the intermediate dataset contents just like you do for a regular dataset:
204 |
205 | ```python
206 | frame = ds.to_dataframe()
207 | ```
208 |
209 | You can also use `open`, `read_as_text` and `read_as_binary`.
210 |
211 | Note that intermediate datasets do not have any metadata available.
212 |
213 |
214 | Creating a new dataset
215 | ----------------------
216 |
217 | After you've manipulated the data, you can upload it as a new dataset on Azure ML.
218 |
219 | This will serialize the pandas DataFrame object to the format specified in the
220 | `data_type_id` parameter, then upload it to Azure ML.
221 |
222 | ```python
223 | dataset = workspace.datasets.add_from_dataframe(
224 | dataframe=frame,
225 | data_type_id=DataTypeIds.GenericCSV,
226 | name='my new dataset',
227 | description='my description'
228 | )
229 | ```
230 |
231 | If you want to serialize the data yourself, you can upload the raw data. Note
232 | that you still have to indicate the format of the data.
233 |
234 | ```python
235 | raw_data = my_own_csv_serialization_function(frame)
236 | dataset = workspace.datasets.add_from_raw_data(
237 | raw_data=raw_data,
238 | data_type_id=DataTypeIds.GenericCSV,
239 | name='my new dataset',
240 | description='my description'
241 | )
242 | ```
243 |
244 | After it's added, it's immediately accessible from the datasets collection.
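
For example, using the name from the snippet above:

```python
dataset = workspace.datasets['my new dataset']
frame = dataset.to_dataframe()
```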
245 |
246 | If you attempt to create a new dataset with a name that matches an existing dataset, an AzureMLConflictHttpError will be raised.
247 |
248 | ```python
249 | from azureml import AzureMLConflictHttpError
250 |
251 | try:
252 | workspace.datasets.add_from_dataframe(
253 | dataframe=frame,
254 | data_type_id=DataTypeIds.GenericCSV,
255 | name='not a unique name',
256 | description='my description'
257 | )
258 | except AzureMLConflictHttpError:
259 | print('Try again with a unique name!')
260 | ```
261 |
262 | To update an existing dataset, you can use `update_from_dataframe` or `update_from_raw_data`:
263 |
264 | ```python
265 | name = 'my existing dataset'
266 | dataset = workspace.datasets[name]
267 |
268 | dataset.update_from_dataframe(dataframe=frame)
269 | ```
270 |
271 | You can optionally change the name, description or the format of the data too:
272 |
273 | ```python
274 | name = 'my existing dataset'
275 | dataset = workspace.datasets[name]
276 |
277 | dataset.update_from_dataframe(
278 | dataframe=frame,
279 | data_type_id=DataTypeIds.GenericCSV,
280 | name='my new name',
281 | description='my new description'
282 | )
283 | ```
284 |
285 | If you attempt to create a new dataset with an invalid name, or if Azure ML rejects the dataset for any other reason, an `AzureMLHttpError` will be raised. `AzureMLHttpError` is raised whenever the HTTP status code indicates a failure. A detailed error message can be displayed by printing the exception, and the HTTP status code is stored in the `status_code` field.
286 |
287 | ```python
288 | from azureml import AzureMLHttpError
289 |
290 | try:
291 | workspace.datasets.add_from_dataframe(
292 | dataframe=frame,
293 | data_type_id=DataTypeIds.GenericCSV,
294 | name='invalid:name',
295 | description='my description'
296 | )
297 | except AzureMLHttpError as error:
298 | print(error.status_code)
299 | print(error)
300 | ```
301 |
302 | Services Usage
303 | ==============
304 | The services subpackage allows you to easily publish and consume AzureML Web Services. Currently only Python 2.7 is supported for services because the back end only has Python 2.7 installed.
305 |
306 | Publishing
307 | ----------
308 |
309 | Python functions can be published either by using the @publish decorator or by calling publish directly. To publish a function using the decorator:
310 |
311 | ```python
312 | from azureml import services
313 |
314 | @services.publish(workspace, workspace_token)
315 | @services.types(a = float, b = float)
316 | @services.returns(float)
317 | def func(a, b):
318 | return a / b
319 | ```
320 |
321 | This publishes a function which takes two floating point values and divides them. Alternatively, you can publish a function by calling publish directly:
322 |
323 | ```python
324 | my_func = publish(my_func, workspace, workspace_token, files_list, endpoint=None)
325 | ```
326 |
327 | If a function has no source file associated with it (for example, you're developing inside of a REPL environment), then the function's bytecode is serialized. If the function refers to any global variables, those will also be serialized using Pickle. In this mode, all of the state you refer to needs to already be defined (e.g. your published function should come after any other functions you are calling).
328 |
329 | If a function is saved on disk then the entire module the function is defined in will be serialized and re-executed on the server to get the function back. In this mode the entire contents of the file are serialized and the order of the function definitions doesn't matter.
330 |
331 | After the function is published there will be a "service" property on the function. This object has several properties of interest:
332 |
333 | | Property | Description |
334 | | ------------- |:-------------:|
335 | | url | this is the endpoint for executing the function |
336 | | api_key | this is the API key which is required to invoke the function |
337 | | help_url | this is a human-readable page which describes the parameters and results of the function. It also includes sample code for executing it from various languages. |
338 | | service_id | this is a unique GUID identifying the service in your workspace. You can reuse this ID to update the service once it's published. |
339 |
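For example, you can inspect these values after publishing (a quick sketch using the `func` defined above):

```python
print(func.service.url)
print(func.service.api_key)
print(func.service.help_url)
print(func.service.service_id)
```
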
340 | You can specify a list of files which should be published along with the function.
341 | The resulting files will be stored in a subdirectory called 'Script Bundle'. The
342 | list of files can be one of:
343 |
344 | | Format | Description |
345 | | ------------------------------------------ |:---------------------------------------------------------------:|
346 | | (('file1.txt', None), ) | file is read from disk |
347 | | (('file1.txt', b'contents'), ) | file contents are provided |
348 | | ('file1.txt', 'file2.txt') | files are read from disk, written with same filename |
349 | | ((('file1.txt', 'destname.txt'), None), )  | file is read from disk, written with a different destination filename |
350 |
351 |
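For example, a sketch of a mixed files list passed to publish (the filenames are illustrative):

```python
files_list = (
    ('file1.txt', None),                    # read from disk
    ('file2.txt', b'some contents'),        # contents provided inline
    (('file3.txt', 'destname.txt'), None),  # read from disk, stored as destname.txt
)
my_func = services.publish(my_func, workspace, workspace_token, files_list)
```
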
352 | The various formats for each filename can be freely mixed and matched. Files can also be attached using the @attach decorator:
353 |
354 | ```python
355 | @publish(...)
356 | @attach('file1.txt')
357 | def f(x):
358 | pass
359 | ```
360 |
361 | And this supports the same file formats as the list.
362 |
363 | If you are using AzureML from a different geography (for example West Europe or East Asia) you'll need to specify the endpoint to connect to. The endpoint is your region plus "management.azureml.net", for example: https://europewest.management.azureml.net
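
For example, reusing the direct publish call from above with the West Europe management endpoint:

```python
my_func = services.publish(my_func, workspace, workspace_token, files_list,
                           endpoint='https://europewest.management.azureml.net')
```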
364 |
365 | Consumption
366 | -----------
367 |
368 | Existing services can be consumed using the service decorator. An empty function body is supplied, and the resulting function becomes invokable and calls the published service:
369 |
370 | ```python
371 | from azureml import services
372 |
373 | @services.service(url, api_key)
374 | @services.types(a = float, b = float)
375 | @services.returns(float)
376 | def func(a, b):
377 | pass
378 | ```
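
The resulting function can then be called like a normal Python function; the call is forwarded to the published web service:

```python
result = func(10.0, 2.0)
print(result)
```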
379 |
380 | Controlling publishing / consumption
381 | ------------------------------------
382 |
383 | There are several decorators which are used to control how the invocation occurs.
384 |
385 | ### types(**kwargs)
386 | Specifies the types used for the arguments of a published or consumed service.
387 |
388 | The type annotations are optional and are used for providing information which allows the service to interoperate with other languages. The type information will be seen on the help page of the published service. If the type information is not provided, a Python-specific format will be used and other languages may not be able to call the service.
389 |
390 | Supported types are: int, bool, float, unicode.
391 |
392 | When an unsupported type is specified the type will be serialized using an internal representation based upon Python's Pickle protocol. This will prevent the web service from being used with other languages.
393 |
394 | When working with strings you need to use the unicode data type. This is because the string data type used for interop is actually a Unicode string and Python's "str" objects are actually byte arrays.
395 |
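For example, a minimal sketch of a published service that works with strings (the function itself is illustrative; note the unicode annotations):

```python
@services.publish(workspace, workspace_token)
@services.types(name = unicode)
@services.returns(unicode)
def greet(name):
    return u'Hello, ' + name
```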
396 |
397 |
398 | ### returns(return_type)
399 | Specifies the return type for a published service.
400 |
401 | Like the parameter types this is also optional, and when omitted an internal Python format will be used and interoperability with other languages may be reduced.
402 |
403 | Supported types are: int, bool, float, unicode.
404 |
405 | When an unsupported type is specified the type will be serialized using an internal representation based upon Python's Pickle protocol. This will prevent the web service from being used with other languages.
406 |
407 | When working with strings you need to use the unicode data type. This is because the string data type used for interop is actually a Unicode string and Python's "str" objects are actually byte arrays.
408 |
409 | ### service_id(id)
410 | Specifies the service ID for a service. When publishing to the same service ID the service is updated instead of having a new service created.
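
For example, a sketch of republishing to an existing service (the GUID here is illustrative):

```python
@services.publish(workspace, workspace_token)
@services.service_id('00000000-1111-2222-3333-444444444444')
@services.types(a = float, b = float)
@services.returns(float)
def func(a, b):
    return a / b
```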
411 |
412 | ### name(name)
413 | Specifies a friendly name for a service. By default the name is the function name, but this allows names with spaces or
414 | other characters which are not allowed in Python function names.
415 |
416 | ### attach(name, contents)
417 | Attaches a file to the payload to be uploaded.
418 |
419 | If contents is omitted, the file is read from disk.
420 | If name is a tuple, it specifies the on-disk filename and the destination filename.
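
For example, a sketch attaching a file under a different destination name:

```python
@services.publish(workspace, workspace_token)
@services.attach(('localname.txt', 'destname.txt'))
def f(x):
    pass
```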
421 |
422 | ### dataframe_service
423 | Indicates that the function operates on a data frame. The function
424 | will receive a single input in the form of a data frame, and should return
425 | a data frame object. The schema of the data frame is specified with this
426 | decorator.
427 |
428 | ```python
429 | @publish(...)
430 | @dataframe_service(a = int, b = int)
431 | @returns(int)
432 | def myfunc(df):
433 | return pandas.DataFrame([df['a'][i] + df['b'][i] for i in range(df.shape[0])])
434 | ```
435 |
436 | This code can then be invoked either with:
437 | ```python
438 | myfunc(1, 2)
439 | ```
440 |
441 | or:
442 |
443 | ```python
444 | myfunc.map([[1,2], [3,4]])
445 | ```
446 |
447 | ### input_name
448 | Specifies the name of the input the web service expects to receive. Defaults to 'input1'. Currently this is only
449 | supported on consumption.
450 |
451 | ### output_name
452 | Specifies the name of the output the web service returns. Defaults to 'output1'. Currently this is only
453 | supported on consumption.
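
For example, a sketch of consuming a service with non-default port names (assuming these are applied as decorators like the others; the names are illustrative):

```python
@services.service(url, api_key)
@services.input_name('input2')
@services.output_name('output2')
@services.types(a = float, b = float)
@services.returns(float)
def func(a, b):
    pass
```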
454 |
455 | These include the types decorator for specifying the format of the inputs, the returns decorator for specifying the return value, and the attach decorator for attaching files to a published function.
456 |
--------------------------------------------------------------------------------
/azureml/__init__.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | from datetime import datetime
27 |
28 | import numbers
29 | import re
30 | import sys
31 | import json
32 | try:
33 | import ConfigParser
34 | except ImportError:
35 | import configparser as ConfigParser
36 |
37 | from os import path
38 |
39 | try:
40 | from cStringIO import BytesIO
41 | except ImportError:
42 | from io import BytesIO
43 |
44 | from azureml.errors import (
45 | AzureMLConflictHttpError,
46 | AzureMLError,
47 | AzureMLHttpError,
48 | UnsupportedDatasetTypeError,
49 | _not_none,
50 | _not_none_or_empty,
51 | )
52 | from azureml.http import (
53 | _RestClient,
54 | __author__,
55 | __version__,
56 | )
57 | from azureml.serialization import (
58 | DataTypeIds,
59 | deserialize_dataframe,
60 | serialize_dataframe,
61 | is_supported,
62 | )
63 |
64 |
65 | _GLOBAL_WORKSPACE_ID = '506153734175476c4f62416c57734963'
66 |
67 |
68 | class Endpoints(object):
69 | """Constants for the known REST API endpoints."""
70 | default = 'https://studio.azureml.net'
71 | management_default = 'https://management.azureml.net'
72 |
73 |
74 | class Dataset(object):
75 | """Abstract base class for Azure ML datasets."""
76 | pass
77 |
78 |
79 | class SourceDataset(Dataset):
80 | """Metadata for a dataset and methods to read its contents."""
81 |
82 | def __init__(self, workspace=None, metadata=None):
83 | """
84 | INTERNAL USE ONLY. Initialize a dataset.
85 |
86 | Parameters
87 | ----------
88 | workspace : Workspace
89 | Parent workspace of the dataset.
90 | metadata : dict
91 | Dictionary of dataset metadata as returned by the REST API.
92 | """
93 | _not_none('metadata', metadata)
94 | _not_none('workspace', workspace)
95 |
96 | self.workspace = workspace
97 | self._metadata = metadata
98 |
99 | if is_supported(self.data_type_id):
100 | self.to_dataframe = self._to_dataframe
101 |
102 | if not self.is_example:
103 | self.update_from_raw_data = self._update_from_raw_data
104 | self.update_from_dataframe = self._update_from_dataframe
105 |
106 | @staticmethod
107 | def _metadata_repr(metadata):
108 | val = metadata['Name']
109 | if sys.version_info < (3,):
110 | return val.encode('ascii','ignore')
111 | else:
112 | return val
113 |
114 | def __repr__(self):
115 | return SourceDataset._metadata_repr(self._metadata)
116 |
117 | def open(self):
118 | '''Open and return a stream for the dataset contents.'''
119 | return self.workspace._rest.open_dataset_contents(self.contents_url)
120 |
121 | def read_as_binary(self):
122 | '''Read and return the dataset contents as binary.'''
123 | return self.workspace._rest.read_dataset_contents_binary(self.contents_url)
124 |
125 | def read_as_text(self):
126 | '''Read and return the dataset contents as text.'''
127 | return self.workspace._rest.read_dataset_contents_text(self.contents_url)
128 |
129 | def _to_dataframe(self):
130 | """Read and return the dataset contents as a pandas DataFrame."""
131 | with self.open() as reader:
132 | return deserialize_dataframe(reader, self.data_type_id)
133 |
134 | def _update_from_dataframe(self, dataframe, data_type_id=None, name=None,
135 | description=None):
136 | """
137 | Serialize the specified DataFrame and replace the existing dataset.
138 |
139 | Parameters
140 | ----------
141 | dataframe : pandas.DataFrame
142 | Data to serialize.
143 | data_type_id : str, optional
144 | Format to serialize to.
145 | If None, the existing format is preserved.
146 | Supported formats are:
147 | 'PlainText'
148 | 'GenericCSV'
149 | 'GenericTSV'
150 | 'GenericCSVNoHeader'
151 | 'GenericTSVNoHeader'
152 | See the azureml.DataTypeIds class for constants.
153 | name : str, optional
154 | Name for the dataset.
155 | If None, the name of the existing dataset is used.
156 | description : str, optional
157 | Description for the dataset.
158 | If None, the description of the existing dataset is used.
159 | """
160 | _not_none('dataframe', dataframe)
161 |
162 | if data_type_id is None:
163 | data_type_id = self.data_type_id
164 | if name is None:
165 | name = self.name
166 | if description is None:
167 | description = self.description
168 |
169 | try:
170 | output = BytesIO()
171 | serialize_dataframe(output, data_type_id, dataframe)
172 | raw_data = output.getvalue()
173 | finally:
174 | output.close()
175 |
176 | self._upload_and_refresh(raw_data, data_type_id, name, description)
177 |
178 | def _update_from_raw_data(self, raw_data, data_type_id=None, name=None,
179 | description=None):
180 | """
181 | Upload already serialized raw data and replace the existing dataset.
182 |
183 | Parameters
184 | ----------
185 | raw_data: bytes
186 | Dataset contents to upload.
187 | data_type_id : str, optional
188 | Serialization format of the raw data.
189 | If None, the format of the existing dataset is used.
190 | Supported formats are:
191 | 'PlainText'
192 | 'GenericCSV'
193 | 'GenericTSV'
194 | 'GenericCSVNoHeader'
195 | 'GenericTSVNoHeader'
196 | 'ARFF'
197 | See the azureml.DataTypeIds class for constants.
198 | name : str, optional
199 | Name for the dataset.
200 | If None, the name of the existing dataset is used.
201 | description : str, optional
202 | Description for the dataset.
203 | If None, the description of the existing dataset is used.
204 | """
205 | _not_none('raw_data', raw_data)
206 |
207 | if data_type_id is None:
208 | data_type_id = self.data_type_id
209 | if name is None:
210 | name = self.name
211 | if description is None:
212 | description = self.description
213 |
214 | self._upload_and_refresh(raw_data, data_type_id, name, description)
215 |
216 | def _upload_and_refresh(self, raw_data, data_type_id, name, description):
217 | dataset_id = self.workspace._rest.upload_dataset(
218 | self.workspace.workspace_id,
219 | name,
220 | description,
221 | data_type_id,
222 | raw_data,
223 | self.family_id
224 | )
225 |
226 | self._metadata = self.workspace._rest.get_dataset(
227 | self.workspace.workspace_id,
228 | dataset_id
229 | )
230 |
231 | class Location(object):
232 | def __init__(self, metadata):
233 | self._metadata = metadata
234 |
235 | @property
236 | def base_uri(self):
237 | """TODO."""
238 | return self._metadata['BaseUri']
239 |
240 | @property
241 | def size(self):
242 | """TODO."""
243 | return self._metadata['Size']
244 |
245 | @property
246 | def endpoint_type(self):
247 | """TODO."""
248 | return self._metadata['EndpointType']
249 |
250 | @property
251 | def credential_container(self):
252 | """TODO."""
253 | return self._metadata['CredentialContainer']
254 |
255 | @property
256 | def access_credential(self):
257 | """TODO."""
258 | return self._metadata['AccessCredential']
259 |
260 | @property
261 | def location(self):
262 | """TODO."""
263 | return self._metadata['Location']
264 |
265 | @property
266 | def file_type(self):
267 | """TODO."""
268 | return self._metadata['FileType']
269 |
270 | @property
271 | def is_auxiliary(self):
272 | """TODO."""
273 | return self._metadata['IsAuxiliary']
274 |
275 | @property
276 | def name(self):
277 | """TODO."""
278 | return self._metadata['Name']
279 |
280 | @property
281 | def visualize_end_point(self):
282 | """TODO."""
283 | return SourceDataset.Location(self._metadata['VisualizeEndPoint'])
284 |
285 | @property
286 | def schema_end_point(self):
287 | """TODO."""
288 | return SourceDataset.Location(self._metadata['SchemaEndPoint'])
289 |
290 | @property
291 | def schema_status(self):
292 | """TODO."""
293 | return self._metadata['SchemaStatus']
294 |
295 | @property
296 | def dataset_id(self):
297 | """Unique identifier for the dataset."""
298 | return self._metadata['Id']
299 |
300 | @property
301 | def name(self):
302 | """Unique name for the dataset."""
303 | return self._metadata['Name']
304 |
305 | @property
306 | def data_type_id(self):
307 | """
308 | Serialization format for the dataset.
309 | See the azureml.DataTypeIds class for constants.
310 | """
311 | return self._metadata['DataTypeId']
312 |
313 | @property
314 | def description(self):
315 | """Description for the dataset."""
316 | return self._metadata['Description']
317 |
318 | @property
319 | def resource_upload_id(self):
320 | """TODO."""
321 | return self._metadata['ResourceUploadId']
322 |
323 | @property
324 | def family_id(self):
325 | """TODO."""
326 | return self._metadata['FamilyId']
327 |
328 | @property
329 | def size(self):
330 | """Size in bytes of the serialized dataset contents."""
331 | return self._metadata['Size']
332 |
333 | @property
334 | def source_origin(self):
335 | """TODO."""
336 | return self._metadata['SourceOrigin']
337 |
338 | @property
339 | def created_date(self):
340 | # Example format of date to parse:
341 | # /Date(1418444668177)/
342 | match = re.search(r"/Date\((\d+)\)/", self._metadata['CreatedDate'])
343 | return datetime.fromtimestamp(int(match.group(1)) / 1000.0)
344 |
345 | @property
346 | def owner(self):
347 | """TODO."""
348 | return self._metadata['Owner']
349 |
350 | @property
351 | def experiment_id(self):
352 | """TODO."""
353 | return self._metadata['ExperimentId']
354 |
355 | @property
356 | def client_version(self):
357 | """TODO."""
358 | return self._metadata['ClientVersion']
359 |
360 | @property
361 | def promoted_from(self):
362 | """TODO."""
363 | return self._metadata['PromotedFrom']
364 |
365 | @property
366 | def uploaded_from_filename(self):
367 | """TODO."""
368 | return self._metadata['UploadedFromFilename']
369 |
370 | @property
371 | def service_version(self):
372 | """TODO."""
373 | return self._metadata['ServiceVersion']
374 |
375 | @property
376 | def is_latest(self):
377 | """TODO."""
378 | return self._metadata['IsLatest']
379 |
380 | @property
381 | def category(self):
382 | """TODO."""
383 | return self._metadata['Category']
384 |
385 | @property
386 | def download_location(self):
387 | """TODO."""
388 | return SourceDataset.Location(self._metadata['DownloadLocation'])
389 |
390 | @property
391 | def is_deprecated(self):
392 | """TODO."""
393 | return self._metadata['IsDeprecated']
394 |
395 | @property
396 | def culture(self):
397 | """TODO."""
398 | return self._metadata['Culture']
399 |
400 | @property
401 | def batch(self):
402 | """TODO."""
403 | return self._metadata['Batch']
404 |
405 | @property
406 | def created_date_ticks(self):
407 | """TODO."""
408 | return self._metadata['CreatedDateTicks']
409 |
410 | @property
411 | def contents_url(self):
412 | """Full URL to the dataset contents."""
413 | loc = self.download_location
414 | return loc.base_uri + loc.location + loc.access_credential
415 |
416 | @property
417 | def is_example(self):
418 | """True for an example dataset, False for user created."""
419 | return self.dataset_id.startswith(_GLOBAL_WORKSPACE_ID)
420 |
421 |
422 | class Datasets(object):
423 | def __init__(self, workspace, example_filter=None):
424 | """
425 | INTERNAL USE ONLY. Initialize a dataset collection.
426 |
427 | Parameters
428 | ----------
429 | workspace : Workspace
430 | Parent workspace of the datasets.
431 | example_filter : bool
432 | True to include only examples.
433 | False to include only user-created.
434 | None to include all.
435 | """
436 | _not_none('workspace', workspace)
437 |
438 | self.workspace = workspace
439 | self._example_filter = example_filter
440 |
441 | def __repr__(self):
442 | return '\n'.join((SourceDataset._metadata_repr(dataset) for dataset in self._get_datasets()))
443 |
444 | def __iter__(self):
445 | for dataset in self._get_datasets():
446 | yield self._create_dataset(dataset)
447 |
448 | def __len__(self):
449 | return sum(1 for _ in self._get_datasets())
450 |
451 | def __getitem__(self, index):
452 | '''Retrieve a dataset by index or by name (case-sensitive).'''
453 | _not_none('index', index)
454 |
455 | datasets = self._get_datasets()
456 | if isinstance(index, numbers.Integral):
457 | return self._create_dataset(list(datasets)[index])
458 | else:
459 | for dataset in datasets:
460 | if dataset['Name'] == index:
461 | return self._create_dataset(dataset)
462 |
463 | raise IndexError('A data set named "{}" does not exist'.format(index))
464 |
465 | def add_from_dataframe(self, dataframe, data_type_id, name, description):
466 | """
467 | Serialize the specified DataFrame and upload it as a new dataset.
468 |
469 | Parameters
470 | ----------
471 | dataframe : pandas.DataFrame
472 | Data to serialize.
473 | data_type_id : str
474 | Format to serialize to.
475 | Supported formats are:
476 | 'PlainText'
477 | 'GenericCSV'
478 | 'GenericTSV'
479 | 'GenericCSVNoHeader'
480 | 'GenericTSVNoHeader'
481 | See the azureml.DataTypeIds class for constants.
482 | name : str
483 | Name for the new dataset.
484 | description : str
485 | Description for the new dataset.
486 |
487 | Returns
488 | -------
489 | SourceDataset
490 | Dataset that was just created.
491 | Use open(), read_as_binary(), read_as_text() or to_dataframe() on
492 | the dataset object to get its contents as a stream, bytes, str or
493 | pandas DataFrame.
494 | """
495 | _not_none('dataframe', dataframe)
496 | _not_none_or_empty('data_type_id', data_type_id)
497 | _not_none_or_empty('name', name)
498 | _not_none_or_empty('description', description)
499 |
500 | try:
501 | output = BytesIO()
502 | serialize_dataframe(output, data_type_id, dataframe)
503 | raw_data = output.getvalue()
504 | finally:
505 | output.close()
506 |
507 | return self._upload(raw_data, data_type_id, name, description)
508 |
509 | def add_from_raw_data(self, raw_data, data_type_id, name, description):
510 | """
511 | Upload already serialized raw data as a new dataset.
512 |
513 | Parameters
514 | ----------
515 | raw_data: bytes
516 | Dataset contents to upload.
517 | data_type_id : str
518 | Serialization format of the raw data.
519 | Supported formats are:
520 | 'PlainText'
521 | 'GenericCSV'
522 | 'GenericTSV'
523 | 'GenericCSVNoHeader'
524 | 'GenericTSVNoHeader'
525 | 'ARFF'
526 | See the azureml.DataTypeIds class for constants.
527 | name : str
528 | Name for the new dataset.
529 | description : str
530 | Description for the new dataset.
531 |
532 | Returns
533 | -------
534 | SourceDataset
535 | Dataset that was just created.
536 | Use open(), read_as_binary(), read_as_text() or to_dataframe() on
537 | the dataset object to get its contents as a stream, bytes, str or
538 | pandas DataFrame.
539 | """
540 | _not_none('raw_data', raw_data)
541 | _not_none_or_empty('data_type_id', data_type_id)
542 | _not_none_or_empty('name', name)
543 | _not_none_or_empty('description', description)
544 |
545 | return self._upload(raw_data, data_type_id, name, description)
546 |
547 | def _upload(self, raw_data, data_type_id, name, description):
548 | dataset_id = self.workspace._rest.upload_dataset(
549 | self.workspace.workspace_id, name, description, data_type_id,
550 | raw_data, None)
551 |
552 | metadata = self.workspace._rest.get_dataset(
553 | self.workspace.workspace_id, dataset_id)
554 |
555 | return self._create_dataset(metadata)
556 |
557 | def _get_datasets(self):
558 | datasets = self.workspace._rest.get_datasets(self.workspace.workspace_id)
559 | return datasets if self._example_filter is None else \
560 | (d for d in datasets if d['Id'].startswith(
561 | _GLOBAL_WORKSPACE_ID) == self._example_filter)
562 |
563 | def _create_dataset(self, metadata):
564 | return SourceDataset(self.workspace, metadata)
565 |
566 |
567 | class IntermediateDataset(Dataset):
568 | """Represents an intermediate dataset and methods to read its contents."""
569 |
570 | def __init__(self, workspace, experiment, node_id, port_name, data_type_id):
571 | """
572 | INTERNAL USE ONLY. Initialize an intermediate dataset.
573 |
574 | Parameters
575 | ----------
576 | workspace : Workspace
577 | Parent workspace of the dataset.
578 | experiment : Experiment
579 | Parent experiment of the dataset.
580 | node_id : str
581 | Module node id from the experiment graph.
582 | port_name : str
583 | Output port of the module.
584 | data_type_id : str
585 | Serialization format of the raw data.
586 | See the azureml.DataTypeIds class for constants.
587 | """
588 | _not_none('workspace', workspace)
589 | _not_none('experiment', experiment)
590 | _not_none_or_empty('node_id', node_id)
591 | _not_none_or_empty('port_name', port_name)
592 | _not_none_or_empty('data_type_id', data_type_id)
593 |
594 | self.workspace = workspace
595 | self.experiment = experiment
596 | self.node_id = node_id
597 | self.port_name = port_name
598 | self.data_type_id = data_type_id
599 |
600 | if is_supported(self.data_type_id):
601 | self.to_dataframe = self._to_dataframe
602 |
603 | def open(self):
604 | '''Open and return a stream for the dataset contents.'''
605 | return self.workspace._rest.open_intermediate_dataset_contents(
606 | self.workspace.workspace_id,
607 | self.experiment.experiment_id,
608 | self.node_id,
609 | self.port_name
610 | )
611 |
612 | def read_as_binary(self):
613 | '''Read and return the dataset contents as binary.'''
614 | return self.workspace._rest.read_intermediate_dataset_contents_binary(
615 | self.workspace.workspace_id,
616 | self.experiment.experiment_id,
617 | self.node_id,
618 | self.port_name
619 | )
620 |
621 | def read_as_text(self):
622 | '''Read and return the dataset contents as text.'''
623 | return self.workspace._rest.read_intermediate_dataset_contents_text(
624 | self.workspace.workspace_id,
625 | self.experiment.experiment_id,
626 | self.node_id,
627 | self.port_name
628 | )
629 |
630 | def _to_dataframe(self):
631 | """Read and return the dataset contents as a pandas DataFrame."""
632 | #TODO: figure out why passing in the opened stream directly gives invalid data
633 | data = self.read_as_binary()
634 | reader = BytesIO(data)
635 | return deserialize_dataframe(reader, self.data_type_id)
636 |
637 |
638 | class Experiment(object):
639 |
640 | def __init__(self, workspace, metadata):
641 | """
642 | INTERNAL USE ONLY. Initialize an experiment.
643 |
644 | Parameters
645 | ----------
646 | workspace : Workspace
647 | Parent workspace of the experiment.
648 | metadata : dict
649 | Dictionary of experiment metadata as returned by the REST API.
650 | """
651 | _not_none('workspace', workspace)
652 | _not_none('metadata', metadata)
653 |
654 | self.workspace = workspace
655 | self._metadata = metadata
656 |
657 | @staticmethod
658 | def _metadata_repr(metadata):
659 | val = u'{0}\t{1}'.format(metadata['ExperimentId'], metadata['Description'])
660 | if sys.version_info < (3,):
661 | return val.encode('ascii','ignore')
662 | else:
663 | return val
664 |
665 | def __repr__(self):
666 | return Experiment._metadata_repr(self._metadata)
667 |
668 | class Status(object):
669 | def __init__(self, metadata):
670 | self._metadata = metadata
671 |
672 | @property
673 | def status_code(self):
674 | """TODO."""
675 | return self._metadata['StatusCode']
676 |
677 | @property
678 | def status_detail(self):
679 | """TODO."""
680 | return self._metadata['StatusDetail']
681 |
682 | @property
683 | def creation_time(self):
684 | """TODO."""
685 | # Example format of date to parse:
686 | # /Date(1418444668177)/
687 | match = re.search(r"/Date\((\d+)\)/", self._metadata['CreationTime'])
688 | return datetime.fromtimestamp(int(match.group(1)) / 1000.0)
689 |
690 | @property
691 | def status(self):
692 | """TODO."""
693 | return Experiment.Status(self._metadata['Status'])
694 |
695 | @property
696 | def description(self):
697 | """TODO."""
698 | return self._metadata['Description']
699 |
700 | @property
701 | def creator(self):
702 | """TODO."""
703 | return self._metadata['Creator']
704 |
705 | @property
706 | def experiment_id(self):
707 | """TODO."""
708 | return self._metadata['ExperimentId']
709 |
710 | @property
711 | def job_id(self):
712 | """TODO."""
713 | return self._metadata['JobId']
714 |
715 | @property
716 | def version_id(self):
717 | """TODO."""
718 | return self._metadata['VersionId']
719 |
720 | @property
721 | def etag(self):
722 | """TODO."""
723 | return self._metadata['Etag']
724 |
725 | @property
726 | def run_id(self):
727 | """TODO."""
728 | return self._metadata['RunId']
729 |
730 | @property
731 | def is_archived(self):
732 | """TODO."""
733 | return self._metadata['IsArchived']
734 |
735 | @property
736 | def is_example(self):
737 | """True for an example experiment, False for user created."""
738 | return self.experiment_id.startswith(_GLOBAL_WORKSPACE_ID)
739 |
740 | def get_intermediate_dataset(self, node_id, port_name, data_type_id):
741 | """
742 | Get an intermediate dataset.
743 |
744 | Parameters
745 | ----------
746 | node_id : str
747 | Module node id from the experiment graph.
748 | port_name : str
749 | Output port of the module.
750 | data_type_id : str
751 | Serialization format of the raw data.
752 | See the azureml.DataTypeIds class for constants.
753 |
754 | Returns
755 | -------
756 | IntermediateDataset
757 | Dataset object.
758 | Use open(), read_as_binary(), read_as_text() or to_dataframe() on
759 | the dataset object to get its contents as a stream, bytes, str or
760 | pandas DataFrame.
761 | """
762 | return IntermediateDataset(self.workspace, self, node_id, port_name, data_type_id)
763 |
764 |
765 | class Experiments(object):
766 | def __init__(self, workspace, example_filter=None):
767 | """
768 | INTERNAL USE ONLY. Initialize an experiment collection.
769 |
770 | Parameters
771 | ----------
772 | workspace : Workspace
773 | Parent workspace of the experiments.
774 | example_filter : bool
775 | True to include only examples.
776 | False to include only user-created.
777 | None to include all.
778 | """
779 | _not_none('workspace', workspace)
780 |
781 | self.workspace = workspace
782 | self._example_filter = example_filter
783 |
784 | def __repr__(self):
785 | return '\n'.join((Experiment._metadata_repr(experiment) for experiment in self._get_experiments()))
786 |
787 | def __iter__(self):
788 | for experiment in self._get_experiments():
789 | yield self._create_experiment(experiment)
790 |
791 | def __len__(self):
792 | return sum(1 for _ in self._get_experiments())
793 |
794 | def __getitem__(self, index):
795 | '''Retrieve an experiment by index or by id.'''
796 | _not_none('index', index)
797 |
798 | experiments = self._get_experiments()
799 | if isinstance(index, numbers.Integral):
800 | return self._create_experiment(list(experiments)[index])
801 | else:
802 | for experiment in experiments:
803 | if experiment['ExperimentId'] == index:
804 | return self._create_experiment(experiment)
805 |
806 | raise IndexError('An experiment with the id "{}" does not exist'.format(index))
807 |
808 | def _get_experiments(self):
809 | experiments = self.workspace._rest.get_experiments(self.workspace.workspace_id)
810 | return experiments if self._example_filter is None else \
811 | (e for e in experiments if e['ExperimentId'].startswith(_GLOBAL_WORKSPACE_ID) == self._example_filter)
812 |
813 | def _create_experiment(self, metadata):
814 | return Experiment(self.workspace, metadata)
815 |
816 |
817 | _CONFIG_WORKSPACE_SECTION = 'workspace'
818 | _CONFIG_WORKSPACE_ID = 'id'
819 | _CONFIG_AUTHORIZATION_TOKEN = 'authorization_token'
820 | _CONFIG_API_ENDPOINT = 'api_endpoint'
821 | _CONFIG_MANAGEMENT_ENDPOINT = 'management_endpoint'
822 |
823 | def _get_workspace_info(workspace_id, authorization_token, endpoint, management_endpoint):
824 | if workspace_id is None or authorization_token is None or endpoint is None or management_endpoint is None:
825 | # read the settings from config
826 | jsonConfig = path.expanduser('~/.azureml/settings.json')
827 | if path.exists(jsonConfig):
828 | with open(jsonConfig) as cfgFile:
829 | config = json.load(cfgFile)
830 | if _CONFIG_WORKSPACE_SECTION in config:
831 | ws = config[_CONFIG_WORKSPACE_SECTION]
832 | workspace_id = ws.get(_CONFIG_WORKSPACE_ID, workspace_id)
833 | authorization_token = ws.get(_CONFIG_AUTHORIZATION_TOKEN, authorization_token)
834 | endpoint = ws.get(_CONFIG_API_ENDPOINT, endpoint)
835 | management_endpoint = ws.get(_CONFIG_MANAGEMENT_ENDPOINT, management_endpoint)
836 | else:
837 | config = ConfigParser.ConfigParser()
838 | config.read(path.expanduser('~/.azureml/settings.ini'))
839 |
840 | if config.has_section(_CONFIG_WORKSPACE_SECTION):
841 | if workspace_id is None and config.has_option(_CONFIG_WORKSPACE_SECTION, _CONFIG_WORKSPACE_ID):
842 | workspace_id = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_WORKSPACE_ID)
843 | if authorization_token is None and config.has_option(_CONFIG_WORKSPACE_SECTION, _CONFIG_AUTHORIZATION_TOKEN):
844 | authorization_token = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_AUTHORIZATION_TOKEN)
845 | if endpoint is None and config.has_option(_CONFIG_WORKSPACE_SECTION, _CONFIG_API_ENDPOINT):
846 | endpoint = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_API_ENDPOINT)
847 | if management_endpoint is None and config.has_option(_CONFIG_WORKSPACE_SECTION, _CONFIG_MANAGEMENT_ENDPOINT):
848 | management_endpoint = config.get(_CONFIG_WORKSPACE_SECTION, _CONFIG_MANAGEMENT_ENDPOINT)
849 |
850 | if workspace_id is None:
851 | raise ValueError('workspace_id not provided and not available via config')
852 | if authorization_token is None:
853 | raise ValueError('authorization_token not provided and not available via config')
854 | if endpoint is None:
855 | endpoint = Endpoints.default
856 | if management_endpoint is None:
857 | management_endpoint = Endpoints.management_default
858 |
859 | return workspace_id, authorization_token, endpoint, management_endpoint
860 |
861 | class Workspace(object):
862 |
863 | def __init__(self, workspace_id = None, authorization_token = None, endpoint=None):
864 | """
865 | Initialize a workspace.
866 |
867 | Parameters
868 | ----------
869 | workspace_id : str
870 | Unique identifier for the existing workspace. Can be obtained from
871 | the URL in ML Studio when editing a workspace.
872 | authorization_token: str
873 | Access token for the workspace. Can be the primary or secondary
874 | token managed in ML Studio.
875 | endpoint: str
876 | URL of the endpoint to connect to. Specify this only if you host
877 | ML Studio on your own server(s).
878 |
879 | Parameters that are omitted will be read from ~/.azureml/settings.ini:
880 | [workspace]
881 | id = abcd1234
882 | authorization_token = abcd1234
883 | endpoint = https://studio.azureml.net
884 | """
885 | workspace_id, authorization_token, endpoint, management_endpoint = _get_workspace_info(workspace_id, authorization_token, endpoint, None)
886 |
887 | _not_none_or_empty('workspace_id', workspace_id)
888 | _not_none_or_empty('authorization_token', authorization_token)
889 | _not_none_or_empty('endpoint', endpoint)
890 |
891 | self.workspace_id = workspace_id
892 | self.authorization_token = authorization_token
893 | self.api_endpoint = endpoint
894 | self.management_endpoint = management_endpoint
895 | self._rest = _RestClient(endpoint, authorization_token)
896 | self.datasets = Datasets(workspace=self)
897 | self.user_datasets = Datasets(workspace=self, example_filter=False)
898 | self.example_datasets = Datasets(workspace=self, example_filter=True)
899 | self.experiments = Experiments(workspace=self)
900 | self.user_experiments = Experiments(workspace=self, example_filter=False)
901 | self.example_experiments = Experiments(workspace=self, example_filter=True)
902 |
903 |
904 | _manglingPattern = re.compile(r'[\W_]+')
905 |
906 | def _mangled(name):
907 | result = _manglingPattern.sub('_', name)
908 | return result.lower()
909 |
--------------------------------------------------------------------------------
/azureml/errors.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 |
27 | class _ErrorMessages(object):
28 | unsupported_type = 'Dataset type "{0}" is not supported'
29 | not_none = '"{0}" should not be None.'
30 | not_none_or_empty = '"{0}" should not be None or empty.'
31 |
32 |
33 | class AzureMLError(Exception):
34 | '''AzureML Exception base class.'''
35 | def __init__(self, message):
36 | super(AzureMLError, self).__init__(message)
37 |
38 |
39 | class AzureMLHttpError(AzureMLError):
40 | '''Error from Azure ML REST API.'''
41 | def __init__(self, message, status_code):
42 | super(AzureMLHttpError, self).__init__(message)
43 | self.status_code = status_code
44 |
45 | def __new__(cls, message, status_code, *args, **kwargs):
46 | if status_code == 409:
47 | cls = AzureMLConflictHttpError
48 | elif status_code == 401:
49 | cls = AzureMLUnauthorizedError
50 | return AzureMLError.__new__(cls, message, status_code, *args, **kwargs)
51 |
52 |
53 | class AzureMLUnauthorizedError(AzureMLHttpError):
54 | '''Unauthorized error from Azure ML REST API.'''
55 | def __init__(self, message, status_code):
56 | message = 'Unauthorized, please check your workspace ID and authorization token ({})'.format(message)
57 | super(AzureMLUnauthorizedError, self).__init__(message, status_code)
58 |
59 |
60 | class AzureMLConflictHttpError(AzureMLHttpError):
61 | '''Conflict error from Azure ML REST API.'''
62 | def __init__(self, message, status_code):
63 | super(AzureMLConflictHttpError, self).__init__(message, status_code)
64 |
65 | class UnsupportedDatasetTypeError(AzureMLError):
66 | '''Dataset type is not supported.'''
67 | def __init__(self, data_type_id):
68 | super(UnsupportedDatasetTypeError, self).__init__(
69 | _ErrorMessages.unsupported_type.format(data_type_id))
70 |
71 |
72 | def _not_none(param_name, param):
73 | if param is None:
74 | raise TypeError(_ErrorMessages.not_none.format(param_name))
75 |
76 |
77 | def _not_none_or_empty(param_name, param):
78 | if not param:
79 | raise TypeError(_ErrorMessages.not_none_or_empty.format(param_name))
80 |
--------------------------------------------------------------------------------
/azureml/http.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | import json
27 | import requests
28 | from azureml.errors import AzureMLConflictHttpError
29 |
30 | try:
31 | from urlparse import urljoin
32 | except ImportError:
33 | from urllib.parse import urljoin
34 |
35 | from azureml.errors import (
36 | AzureMLHttpError,
37 | )
38 |
39 | __author__ = 'Microsoft Corp. <ptvshelp@microsoft.com>'
40 | __version__ = '0.2.7'
41 |
42 |
43 | class _RestClient(object):
44 | SERVICE_ROOT = 'api/'
45 | INTERMEDIATE_DATASET_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/experiments/{1}/outputdata/{2}/{3}'
46 | EXPERIMENTS_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/experiments'
47 | DATASOURCES_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/datasources'
48 | DATASOURCE_URI_FMT = SERVICE_ROOT + 'workspaces/{0}/datasources/{1}'
49 |     UPLOAD_URI_FMT = SERVICE_ROOT + 'resourceuploads/workspaces/{0}/?userStorage=true&dataTypeId={1}'
50 | UPLOAD_CHUNK_URI_FMT = SERVICE_ROOT + 'blobuploads/workspaces/{0}/?numberOfBlocks={1}&blockId={2}&uploadId={3}&dataTypeId={4}'
51 | SESSION_ID_HEADER_NAME = 'x-ms-client-session-id'
52 | SESSION_ID_HEADER_VALUE = 'DefaultSession'
53 | ACCESS_TOKEN_HEADER_NAME = 'x-ms-metaanalytics-authorizationtoken'
54 | CONTENT_TYPE_HEADER_NAME = 'Content-Type'
55 | CONTENT_TYPE_HEADER_VALUE_JSON = 'application/json;charset=UTF8'
56 | CHUNK_SIZE = 0x200000
57 | DEFAULT_OWNER = 'Python SDK'
58 | USER_AGENT_HEADER_NAME = 'User-Agent'
59 | USER_AGENT_HEADER_VALUE = 'pyazureml/' + __version__
60 |
61 | def __init__(self, service_endpoint, access_token):
62 | self._service_endpoint = service_endpoint
63 | self._access_token = access_token
64 |
65 | def get_experiments(self, workspace_id):
66 |         """Runs an HTTP GET request to retrieve the list of experiments."""
67 | api_path = self.EXPERIMENTS_URI_FMT.format(workspace_id)
68 | return self._send_get_req(api_path)
69 |
70 | def get_datasets(self, workspace_id):
71 |         """Runs an HTTP GET request to retrieve the list of datasets."""
72 | api_path = self.DATASOURCES_URI_FMT.format(workspace_id)
73 | return self._send_get_req(api_path)
74 |
75 | def get_dataset(self, workspace_id, dataset_id):
76 |         """Runs an HTTP GET request to retrieve a single dataset."""
77 | api_path = self.DATASOURCE_URI_FMT.format(workspace_id, dataset_id)
78 | return self._send_get_req(api_path)
79 |
80 | def open_intermediate_dataset_contents(self, workspace_id, experiment_id,
81 | node_id, port_name):
82 | return self._get_intermediate_dataset_contents(
83 | workspace_id,
84 | experiment_id,
85 | node_id,
86 | port_name,
87 | stream=True).raw
88 |
89 | def read_intermediate_dataset_contents_binary(self, workspace_id,
90 | experiment_id, node_id,
91 | port_name):
92 | return self._get_intermediate_dataset_contents(
93 | workspace_id,
94 | experiment_id,
95 | node_id,
96 | port_name,
97 | stream=False).content
98 |
99 | def read_intermediate_dataset_contents_text(self, workspace_id,
100 | experiment_id, node_id,
101 | port_name):
102 | return self._get_intermediate_dataset_contents(
103 | workspace_id,
104 | experiment_id,
105 | node_id,
106 | port_name,
107 | stream=False).text
108 |
109 | def _get_intermediate_dataset_contents(self, workspace_id, experiment_id,
110 | node_id, port_name, stream):
111 | api_path = self.INTERMEDIATE_DATASET_URI_FMT.format(
112 | workspace_id, experiment_id, node_id, port_name)
113 | response = requests.get(
114 | url=urljoin(self._service_endpoint, api_path),
115 | headers=self._get_headers(),
116 | stream=stream,
117 | )
118 | return response
119 |
120 | def open_dataset_contents(self, url):
121 | response = requests.get(url, stream=True)
122 | return response.raw
123 |
124 | def read_dataset_contents_binary(self, url):
125 | response = requests.get(url)
126 | return response.content
127 |
128 | def read_dataset_contents_text(self, url):
129 | response = requests.get(url)
130 | return response.text
131 |
132 | def upload_dataset(self, workspace_id, name, description, data_type_id,
133 | raw_data, family_id):
134 | # uploading data is a two step process. First we upload the raw data
135 |         api_path = self.UPLOAD_URI_FMT.format(workspace_id, data_type_id)
136 | upload_result = self._send_post_req(api_path, data=b'')
137 |
138 | # now get the id that was generated
139 | upload_id = upload_result["Id"]
140 |
141 | # Upload the data in chunks...
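        # total_chunks is ceil(len(raw_data) / CHUNK_SIZE); e.g. a 5 MiB payload with
        # the 2 MiB (0x200000 byte) CHUNK_SIZE above is sent as 3 chunks.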
142 | total_chunks = int((len(raw_data) + (self.CHUNK_SIZE-1)) / self.CHUNK_SIZE)
143 | for chunk in range(total_chunks):
144 | chunk_url = self.UPLOAD_CHUNK_URI_FMT.format(
145 | workspace_id,
146 | total_chunks, # number of blocks
147 | chunk, # block id
148 | upload_id,
149 | data_type_id,
150 | )
151 | chunk_data = raw_data[chunk*self.CHUNK_SIZE:(chunk + 1)*self.CHUNK_SIZE]
152 | self._send_post_req(chunk_url, data=chunk_data)
153 |
154 | # use that to construct the DataSource metadata
155 | metadata = {
156 | "DataSource": {
157 | "Name": name,
158 | "DataTypeId":data_type_id,
159 | "Description":description,
160 | "FamilyId":family_id,
161 | "Owner": self.DEFAULT_OWNER,
162 | "SourceOrigin":"FromResourceUpload"
163 | },
164 | "UploadId": upload_id,
165 | "UploadedFromFileName":"",
166 | "ClientPoll": True
167 | }
168 |
169 |         api_path = self.DATASOURCES_URI_FMT.format(workspace_id)
170 |         try:
171 |             datasource_id = self._send_post_req(
172 |                 api_path, json.dumps(metadata), self.CONTENT_TYPE_HEADER_VALUE_JSON)
173 |         except AzureMLConflictHttpError as e:
174 |             raise AzureMLConflictHttpError(
175 |                 'A data set named "{}" already exists'.format(name),
176 |                 e.status_code
177 |             )
178 |
179 |         return datasource_id
180 |
181 | def _send_get_req(self, api_path):
182 | response = requests.get(
183 | url=urljoin(self._service_endpoint, api_path),
184 | headers=self._get_headers()
185 | )
186 |
187 | if response.status_code >= 400:
188 | raise AzureMLHttpError(response.text, response.status_code)
189 |
190 | return response.json()
191 |
192 | def _send_post_req(self, api_path, data, content_type=None):
193 | response = requests.post(
194 | url=urljoin(self._service_endpoint, api_path),
195 | data=data,
196 | headers=self._get_headers(content_type)
197 | )
198 |
199 | if response.status_code >= 400:
200 | raise AzureMLHttpError(response.text, response.status_code)
201 |
202 | return response.json()
203 |
204 | def _get_headers(self, content_type=None):
205 | headers = {
206 | self.USER_AGENT_HEADER_NAME: self.USER_AGENT_HEADER_VALUE,
207 | self.CONTENT_TYPE_HEADER_NAME: self.CONTENT_TYPE_HEADER_VALUE_JSON,
208 | self.SESSION_ID_HEADER_NAME: self.SESSION_ID_HEADER_VALUE,
209 | self.ACCESS_TOKEN_HEADER_NAME: self._access_token
210 | }
211 | if content_type:
212 | headers[self.CONTENT_TYPE_HEADER_NAME] = content_type
213 | return headers
214 |
--------------------------------------------------------------------------------
/azureml/serialization.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | from functools import partial
27 | import codecs
28 | import pandas as pd
29 |
30 | from azureml.errors import (
31 | UnsupportedDatasetTypeError,
32 | _not_none,
33 | _not_none_or_empty,
34 | )
35 |
36 |
37 | class DataTypeIds(object):
38 | """Constants for the known dataset data type id strings."""
39 | ARFF = 'ARFF'
40 | PlainText = 'PlainText'
41 | GenericCSV = 'GenericCSV'
42 | GenericTSV = 'GenericTSV'
43 | GenericCSVNoHeader = 'GenericCSVNoHeader'
44 | GenericTSVNoHeader = 'GenericTSVNoHeader'
45 |
46 |
47 | def _dataframe_to_csv(writer, dataframe, delimiter, with_header):
48 | """serialize the dataframe with different delimiters"""
49 | encoding_writer = codecs.getwriter('utf-8')(writer)
50 | dataframe.to_csv(
51 | path_or_buf=encoding_writer,
52 | sep=delimiter,
53 | header=with_header,
54 | index=False
55 | )
56 |
57 | def _dataframe_to_txt(writer, dataframe):
58 | encoding_writer = codecs.getwriter('utf-8')(writer)
59 | for row in dataframe.iterrows():
60 | encoding_writer.write("".join(row[1].tolist()))
61 | encoding_writer.write('\n')
62 |
63 | def _dataframe_from_csv(reader, delimiter, with_header, skipspace):
64 | """Returns csv data as a pandas Dataframe object"""
65 | sep = delimiter
66 | header = 0
67 | if not with_header:
68 | header = None
69 |
70 | return pd.read_csv(
71 | reader,
72 | header=header,
73 | sep=sep,
74 | skipinitialspace=skipspace,
75 | encoding='utf-8-sig'
76 | )
77 |
78 | def _dataframe_from_txt(reader):
79 | """Returns PlainText data as a pandas Dataframe object"""
80 | return pd.read_csv(reader, header=None, sep="\n", encoding='utf-8-sig')
81 |
82 |
83 | _SERIALIZERS = {
84 | DataTypeIds.PlainText: (
85 | _dataframe_to_txt,
86 | _dataframe_from_txt,
87 | ),
88 | DataTypeIds.GenericCSV: (
89 | partial(_dataframe_to_csv, delimiter=',', with_header=True),
90 | partial(_dataframe_from_csv, delimiter=',', with_header=True, skipspace=True),
91 | ),
92 | DataTypeIds.GenericCSVNoHeader: (
93 | partial(_dataframe_to_csv, delimiter=',', with_header=False),
94 | partial(_dataframe_from_csv, delimiter=',', with_header=False, skipspace=True),
95 | ),
96 | DataTypeIds.GenericTSV: (
97 | partial(_dataframe_to_csv, delimiter='\t', with_header=True),
98 | partial(_dataframe_from_csv, delimiter='\t', with_header=True, skipspace=False),
99 | ),
100 | DataTypeIds.GenericTSVNoHeader: (
101 | partial(_dataframe_to_csv, delimiter='\t', with_header=False),
102 | partial(_dataframe_from_csv, delimiter='\t', with_header=False, skipspace=False),
103 | ),
104 | }
105 |
106 |
107 | def serialize_dataframe(writer, data_type_id, dataframe):
108 | """
109 | Serialize a dataframe.
110 |
111 | Parameters
112 | ----------
113 | writer : file
114 | File-like object to write to. Must be opened in binary mode.
115 |     data_type_id : str
116 | Serialization format to use.
117 | See the azureml.DataTypeIds class for constants.
118 | dataframe: pandas.DataFrame
119 | Dataframe to serialize.
120 | """
121 | _not_none('writer', writer)
122 | _not_none_or_empty('data_type_id', data_type_id)
123 | _not_none('dataframe', dataframe)
124 |
125 | serializer = _SERIALIZERS.get(data_type_id)
126 | if serializer is None:
127 | raise UnsupportedDatasetTypeError(data_type_id)
128 | serializer[0](writer=writer, dataframe=dataframe)
129 |
130 | def deserialize_dataframe(reader, data_type_id):
131 | """
132 | Deserialize a dataframe.
133 |
134 | Parameters
135 | ----------
136 | reader : file
137 | File-like object to read from. Must be opened in binary mode.
138 |     data_type_id : str
139 | Serialization format of the raw data.
140 | See the azureml.DataTypeIds class for constants.
141 |
142 | Returns
143 | -------
144 | pandas.DataFrame
145 | Dataframe object.
146 | """
147 | _not_none('reader', reader)
148 | _not_none_or_empty('data_type_id', data_type_id)
149 |
150 | serializer = _SERIALIZERS.get(data_type_id)
151 | if serializer is None:
152 | raise UnsupportedDatasetTypeError(data_type_id)
153 | return serializer[1](reader=reader)
154 |
155 | def is_supported(data_type_id):
156 |     """Return True if a serializer is available for the specified format, False otherwise."""
157 | _not_none_or_empty('data_type_id', data_type_id)
158 |
159 | return _SERIALIZERS.get(data_type_id) is not None
160 |
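# A minimal usage sketch (illustrative only, not part of the API surface): round-trip a
# DataFrame through an in-memory buffer with one of the formats above.
#
#     from io import BytesIO
#     import pandas as pd
#
#     df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
#     buf = BytesIO()
#     serialize_dataframe(buf, DataTypeIds.GenericCSV, df)
#     buf.seek(0)
#     df2 = deserialize_dataframe(buf, DataTypeIds.GenericCSV)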
--------------------------------------------------------------------------------
/azureml/services.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | """
27 | Supports publishing and consuming published services that execute within the AzureML
28 | web service execution framework.
29 |
30 | Existing services can be consumed using the service decorator:
31 |
32 | from azureml import services
33 |
34 | @services.service(url, api_key)
35 | @services.types(a = float, b = float)
36 | @services.returns(float)
37 | def some_service(a, b):
38 | pass
39 |
40 | Where the url and api_key are specified for the published web service.
41 |
42 | Python functions can be published using the @publish decorator:
43 |
44 | @services.publish(workspace, workspace_key)
45 | @services.types(a = float, b = float)
46 | @services.returns(float)
47 | def float_typed(a, b):
48 | return a / b
49 |
50 |
51 | The function will be published under a newly created endpoint.
52 |
53 | Publish can also be called programmatically instead:
54 |
55 | published = services.publish(myfunc2, workspace, workspace_key)
56 |
57 | The types and returns decorators can be used to provide type information about the
58 | inputs and outputs. These types will be visible on the help page and enable clients
59 | written in other languages to call published Python functions.
60 |
61 | If types aren't specified then core Python types will be serialized in a custom manner.
62 | This allows working with many common types such as lists, dictionaries, numpy types, etc...
63 | But interop with other languages will be much more difficult.
64 |
65 | Files can also be attached to published functions using the @attach decorator:
66 |
67 | @services.publish(workspace, workspace_key)
68 | @services.attach('foo.txt')
69 | def attached():
70 | return ''.join(file('foo.txt').readlines())
71 |
72 | """
73 | from functools import update_wrapper
74 | import codecs
75 | import inspect
76 | import re
77 | import requests
78 | import uuid
79 | import sys
80 | import json
81 | import base64
82 | import zipfile
83 | import dis
84 | from collections import deque, OrderedDict
85 | from types import CodeType, FunctionType, ModuleType
86 | import types as typesmod
87 | try:
88 | import cPickle as pickle
89 | except:
90 | import pickle
91 | try:
92 | from io import BytesIO
93 | except:
94 | from cStringIO import StringIO as BytesIO
95 | try:
96 | import azureml
97 | except:
98 | # We are published, we won't call publish_worker again.
99 | pass
100 |
101 | try:
102 | import numpy
103 | except:
104 | numpy = None
105 |
106 | try:
107 | import pandas
108 | except:
109 | pandas = None
110 |
111 | _LOAD_GLOBAL = dis.opmap['LOAD_GLOBAL']
112 | #################################################
113 | # Serialization/Deserialization of inputs. This code is distinct from the
114 | # serialization of the user defined function. The user defined function can contain
115 | # arbitrary objects and is fully trusted (so we can use pickle). The inputs to the function
116 | # are coming from arbitrary user input and so need to support a more limited form
117 | # of serialization.
118 | #
119 | # Serialization of the arguments is done using JSON. Each argument is serialized with
120 | # a type and a value. The type is a known type name (int, bool, float, etc...) and the
121 | # value is the serialized value in string format. Usually this is the simplest possible
122 | # representation. Strings are serialized as is, ints/floats we just call str() on, etc...
123 | # For byte arrays we base64 encode them. For data structures we store a list of the elements
124 | # which are encoded in the same way. For example a list would have a list of dictionaries
125 | # in JSON which each have a type and value member.
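#
# For illustration, _encode([1, u'a']) (defined below) produces JSON roughly of the form
#   {"type": "list", "value": [{"type": "int", "value": "1"},
#                              {"type": "unicode", "value": "a"}]}
# and _decode reverses the process by dispatching on each "type" field.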
126 |
127 | _serializers = {}
128 | _deserializers = {}
129 |
130 | def serializer(type):
131 | def l(func):
132 | _serializers[type] = func
133 | return func
134 | return l
135 |
136 | def deserializer(type):
137 | def l(func):
138 | _deserializers[type] = func
139 | return func
140 | return l
141 |
142 | # Type: bool
143 | @serializer(bool)
144 | def _serialize_bool(inp, memo):
145 | return {'type': 'bool', 'value': 'true' if inp else 'false' }
146 |
147 | @deserializer('bool')
148 | def _deserialize_bool(value):
149 | if value['value'] == 'true':
150 | return True
151 | else:
152 | return False
153 |
154 | # Type: int
155 | @serializer(int)
156 | def _serialize_int(inp, memo):
157 | return {'type': 'int', 'value': str(inp) }
158 |
159 | @deserializer('int')
160 | def _deserialize_int(value):
161 | return int(value['value'])
162 |
163 | if sys.version_info < (3, ):
164 | # long
165 | @serializer(long)
166 | def _serialize_long(inp, memo):
167 | return {'type': 'long', 'value': str(inp) }
168 |
169 | @deserializer('long')
170 | def _deserialize_long(value):
171 | return long(value['value'])
172 |
173 | # Type: float
174 | @serializer(float)
175 | def _serialize_float(inp, memo):
176 | return {'type': 'float', 'value': str(inp) }
177 |
178 | @deserializer('float')
179 | def _deserialize_float(value):
180 | return float(value['value'])
181 |
182 |
183 | # Type: complex
184 | @serializer(complex)
185 | def _serialize_complex(inp, memo):
186 | return {'type': 'complex', 'value': str(inp) }
187 |
188 | @deserializer('complex')
189 | def _deserialize_complex(value):
190 | return complex(value['value'])
191 |
192 |
193 | # Type: unicode
194 | @serializer(str if sys.version_info >= (3,) else unicode)
195 | def _serialize_unicode(inp, memo):
196 | return {'type': 'unicode', 'value': str(inp) }
197 |
198 | @deserializer('unicode')
199 | def _deserialize_unicode(value):
200 | return value['value']
201 |
202 |
203 | # Type: byte arrays
204 | @serializer(bytes if sys.version_info >= (3,) else str)
205 | def _serialize_bytes(inp, memo):
206 | data = base64.encodestring(inp)
207 | if sys.version_info >= (3, ):
208 | data = data.decode('utf8')
209 | return {'type': 'bytes', 'value': data.replace(chr(10), '') }
210 |
211 | @deserializer('bytes')
212 | def _deserialize_bytes(value):
213 | data = value['value']
214 | if sys.version_info >= (3, ):
215 | data = data.encode('utf8')
216 | return base64.decodestring(data)
217 |
218 | # Type: dictionaries
219 | @serializer(dict)
220 | def serialize_dict(inp, memo):
221 | return {
222 | 'type': 'dict',
223 | 'value' : [(_encode(k, memo), _encode(inp[k], memo)) for k in inp]
224 | }
225 |
226 |
227 | @deserializer('dict')
228 | def _deserialize_dict(value):
229 | return { _decode_inner(k):_decode_inner(v) for k, v in value['value'] }
230 |
231 | # Type: None/null
232 |
233 | @serializer(type(None))
234 | def serialize_none(inp, memo):
235 | return {'type':'null', 'value':'null'}
236 |
237 | @deserializer('null')
238 | def _deserialize_null(value):
239 | return None
240 |
241 |
242 | # Type: list and tuple
243 | @serializer(list)
244 | @serializer(tuple)
245 | def _serialize_list_or_tuple(inp, memo):
246 | res = []
247 | for value in inp:
248 | res.append(_encode(value, memo))
249 |
250 | return {'type': type(inp).__name__, 'value': res }
251 |
252 | @deserializer('list')
253 | def _deserialize_list(value):
254 | return [_decode_inner(x) for x in value['value']]
255 |
256 | @deserializer('tuple')
257 | def _deserialize_tuple(value):
258 | return tuple(_decode_inner(x) for x in value['value'])
259 |
260 |
261 | if numpy is not None:
262 | # ndarray is serialized as (shape, datatype, data)
263 | @serializer(numpy.ndarray)
264 | def serialize_ndarray(inp, memo):
265 | return {
266 | 'type':'numpy.ndarray',
267 | 'value': (
268 | _encode(inp.shape, memo),
269 | _encode(inp.dtype.name, memo),
270 | _encode(inp.tostring(), memo)
271 | )
272 | }
273 |
274 | @deserializer('numpy.ndarray')
275 | def deserialize_ndarray(value):
276 | shape, dtype, data = value['value']
277 | return numpy.ndarray(
278 | _decode_inner(shape), _decode_inner(dtype), _decode_inner(data)
279 | )
280 |
281 | # TODO: Need better story here...
282 | @serializer(numpy.int32)
283 | def serialize_numpy_int32(inp, memo):
284 | return _serialize_int(inp, memo)
285 |
286 | @serializer(numpy.int64)
287 | def serialize_numpy_int64(inp, memo):
288 | if sys.version_info >= (3, ):
289 | return _serialize_int(inp, memo)
290 |
291 | return _serialize_long(inp, memo)
292 |
293 | @serializer(numpy.float64)
294 | def serialize_numpy_float64(inp, memo):
295 | return _serialize_float(inp, memo)
296 |
297 | # Core deserialization functions. There's a top-level one used when
298 | # actually reading/writing values, and an inner one when we're doing the
299 | # recursive serialization/deserialization.
300 |
301 | def _decode_inner(value):
302 | val_type = value['type']
303 |     deserializer = _deserializers.get(val_type)
304 |     if deserializer is None:
305 |         raise ValueError("unsupported type: " + val_type)
306 |
307 | return deserializer(value)
308 |
309 | def _encode(inp, memo = None):
310 | outer = False
311 | if memo is None:
312 | outer = True
313 | memo = {}
314 | if id(inp) in memo and type(inp) in [list, tuple, dict]:
315 | raise ValueError('circular reference detected')
316 | memo[id(inp)] = inp
317 |
318 | serializer = _serializers.get(type(inp))
319 | if serializer is None:
320 | raise TypeError("Unsupported type for invocation: " + type(inp).__module__ + '.' + type(inp).__name__)
321 |
322 | res = serializer(inp, memo)
323 | if outer:
324 | return json.dumps(res)
325 | return res
326 |
327 |
328 | def _decode(inp):
329 | value = json.loads(inp)
330 |
331 | if isinstance(value, dict):
332 | return _decode_inner(value)
333 |
334 |     raise TypeError('expected a dictionary, got ' + type(value).__name__)
335 |
336 | PUBLISH_URL_FORMAT = '{}/workspaces/{}/webservices/{}'
337 |
338 | if sys.version_info >= (3, 0):
339 | _code_args = ['co_argcount', 'co_kwonlyargcount', 'co_nlocals', 'co_stacksize', 'co_flags',
340 | 'co_code', 'co_consts', 'co_names', 'co_varnames', 'co_filename', 'co_name',
341 | 'co_firstlineno', 'co_lnotab', 'co_freevars', 'co_cellvars']
342 | _func_args = ['__name__', '__defaults__', '__closure__']
343 | else:
344 | _code_args = ['co_argcount', 'co_nlocals', 'co_stacksize', 'co_flags', 'co_code', 'co_consts',
345 | 'co_names', 'co_varnames', 'co_filename', 'co_name', 'co_firstlineno', 'co_lnotab',
346 | 'co_freevars', 'co_cellvars']
347 | _func_args = ['func_name', 'func_defaults', 'func_closure']
348 |
349 |
350 | class _Serializer(object):
351 | '''serializes the specified functions, and the globals it uses as well.
352 |
353 | normal globals are just serialized as-is, they must be picklable to do so.
354 |
355 | other functions which are referenced are serialized as an additional function, and
356 | will be repopulated in globals. This allows things like mutually recursive functions
357 | to exist.
358 | '''
359 | def __init__(self):
360 | self.functions = set()
361 | self.queue = deque()
362 |
363 | if sys.version_info < (3, ):
364 | CLASS_TYPES = (typesmod.ClassType, type)
365 | else:
366 | CLASS_TYPES = type
367 |
368 | def serialize(self, obj):
369 | self.queue.append(('func', obj.__name__, obj))
370 | self.functions.add((obj.__name__, obj))
371 | self.mod = obj.__module__
372 |
373 | return self.serialize_obj(obj)
374 |
375 | def serialize_obj(self, obj):
376 | res = []
377 | while self.queue:
378 | objType, name, cur = self.queue.popleft()
379 |
380 | if objType == 'func':
381 | res.append((objType, name, self.get_code_args(cur)))
382 | elif objType == 'mod':
383 | res.append((objType, name, cur.__name__))
384 | elif objType == 'type':
385 | raise NotImplementedError('new style class not supported')
386 | elif objType == 'oldclass':
387 | res.append((objType, name, [cur.__name__, cur.__module__, cur.__bases__, {n:self.serialize_obj(v) for n, v in cur.__dict__.items()}]))
388 | else:
389 | raise Exception('Unknown serialization type')
390 |
391 | return pickle.dumps(res)
392 |
393 | @staticmethod
394 | def find_globals(code):
395 | """walks the byte code to find the variables which are actually globals"""
396 | cur_byte = 0
397 |         byte_code = bytearray(code.co_code)  # bytearray indexing yields ints on both Python 2 and 3
398 |
399 | names = set()
400 | while cur_byte < len(byte_code):
401 |             op = byte_code[cur_byte]
402 |
403 | if op >= dis.HAVE_ARGUMENT:
404 | if op == _LOAD_GLOBAL:
405 |                     oparg = byte_code[cur_byte + 1] + (byte_code[cur_byte + 2] << 8)
406 | name = code.co_names[oparg]
407 | names.add(name)
408 |
409 | cur_byte += 2
410 | cur_byte += 1
411 |
412 | return names
413 |
414 | def get_code_args(self, func):
415 | code = func.__code__
416 |
417 | codeArgs = [getattr(code, name) for name in _code_args]
418 | funcArgs = [getattr(func, name) for name in _func_args]
419 | globals = {}
420 |
421 | for name in self.find_globals(code):
422 | if name in func.__globals__:
423 | value = func.__globals__[name]
424 | if isinstance(value, FunctionType):
425 | if (name, value) not in self.functions:
426 | self.queue.append(('func', name, value))
427 | self.functions.add((name, value))
428 | elif isinstance(value, ModuleType):
429 | self.queue.append(('mod', name, value))
430 | elif isinstance(value, _Serializer.CLASS_TYPES) and value.__module__ == self.mod:
431 | # class that needs to be serialized...
432 | if isinstance(value, type):
433 | # new-style class
434 | self.queue.append(('type', name, value))
435 | else:
436 | # old-style class
437 | self.queue.append(('oldclass', name, value))
438 | else:
439 | globals[name] = value
440 |
441 | return pickle.dumps((codeArgs, funcArgs, globals))
442 |
443 | def _serialize_func(func):
444 | return _Serializer().serialize(func)
445 |
446 | def _deserialize_func(funcs, globalDict):
447 | items = pickle.loads(funcs)
448 | res = None
449 | for objType, name, data in items:
450 | if objType == 'func':
451 | codeArgs, funcArgs, updatedGlobals = pickle.loads(data)
452 | code = CodeType(*codeArgs)
453 |
454 | globalDict.update(**updatedGlobals)
455 |
456 | value = FunctionType(code, globalDict, *funcArgs)
457 | elif objType == 'mod':
458 | value = __import__(data)
459 | elif objType == 'oldclass':
460 | class_name, module, bases, class_dict = data
461 | value = typesmod.ClassType(class_name, bases, {k:_deserialize_func(v, globalDict) for k, v in class_dict.items()})
462 | value.__module__ = module
463 | elif objType == 'type':
464 | raise Exception('deserialize type')
465 | else:
466 | raise Exception('Unknown serialization type')
467 | globalDict[name] = value
468 |
469 | if res is None:
470 | res = value
471 |
472 | return res
473 |
474 | def _get_args(func):
475 | raw_schema = _get_dataframe_schema(func)
476 | if raw_schema is not None:
477 | return list(raw_schema.keys())
478 |
479 | args = inspect.getargs(func.__code__)
480 | all_args = args.args
481 | if args.varargs is not None:
482 | all_args.append(args.varargs)
483 |     varkw = getattr(args, 'varkw', None) or getattr(args, 'keywords', None)  # Py3 vs Py2 attribute name
484 |     if varkw is not None: all_args.append(varkw)
485 | return all_args
486 |
487 | def _encode_arg(arg, type):
488 | if type == OBJECT_NAME:
489 | return _encode(arg)
490 | elif type['type'].lower() == 'string':
491 | return arg
492 |
493 | return json.dumps(arg)
494 |
495 | def _decode_one_response(response, real_type):
496 | if real_type == OBJECT_NAME:
497 | return _decode(response[0])
498 | elif real_type['type'].lower() == 'string':
499 | return response[0]
500 |
501 | # TODO: These shouldn't be necessary, AzureML is returning things to us oddly...
502 | if response[0] == 'True':
503 | return True
504 | elif response[0] == 'False':
505 | return False
506 | return json.loads(response[0])
507 |
508 | def _get_dict_type(column, index, type, types):
509 | if type is not None and column in type:
510 | return _annotation_to_type(type[column])
511 |
512 | return {'type': types[index]}
513 |
514 | def _decode_response(columns, types, response, type):
515 | if isinstance(type, tuple):
516 | # multi-value decode...
517 | return tuple(_decode_one_response((r, ), _annotation_to_type(t)) for r, t in zip(response, type))
518 | elif isinstance(type, dict):
519 | return {c:_decode_one_response((r, ), _get_dict_type(c, i, type, types)) for (i, c), r in zip(enumerate(columns), response)}
520 | elif columns is not None and len(columns) > 1:
521 | return {c:_decode_one_response((r, ), {'type': types[i]}) for (i, c), r in zip(enumerate(columns), response)}
522 |
523 | return _decode_one_response(response, _annotation_to_type(type))
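# For example, a function declared with @returns((int, float)) gets its two response
# columns decoded individually and returned to the caller as a Python tuple.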
524 |
525 | class published(object):
526 | """The result of publishing a service or marking a method as being published.
527 |
528 | Supports being called to invoke the remote service, iteration for unpacking the url,
529 | api key, and help url, or the url, api_key, and help_url can be accessed directly
530 | as attributes.
531 | """
532 |
533 | def __init__(self, url, api_key, help_url, func, service_id):
534 | self.url = url
535 | self.api_key = api_key
536 | self.help_url = help_url
537 | self.func = func
538 | self.service_id = service_id
539 |
540 | def __repr__(self):
541 |         return '<published function {0} at {1}>'.format(self.func.__name__, self.url)
542 |
543 | def _invoke(self, call_args):
544 | body = {
545 | "Inputs": {
546 | getattr(self.func, '__input_name__', 'input1'): {
547 | "ColumnNames": _get_args(self.func),
548 | "Values": call_args,
549 | }
550 | },
551 | "GlobalParameters": {}
552 | }
553 |
554 | resp = requests.post(
555 | self.url,
556 | json=body,
557 | headers={
558 | 'authorization': 'bearer ' + self.api_key,
559 | }
560 | )
561 |
562 | r = resp.json()
563 | if resp.status_code >= 300:
564 | try:
565 | code = r['error']['code']
566 | except LookupError:
567 | code = None
568 | if code in ('ModuleExecutionError', 'Unauthorized'):
569 | raise RuntimeError(r['error']['details'][0]['message'])
570 | raise ValueError(str(r))
571 | return r
572 |
573 | def _map_args(self, *args, **kwargs):
574 | args = inspect.getcallargs(self.func, *args, **kwargs)
575 | return [ _encode_arg(args[name], _get_arg_type(name, self.func)) for name in _get_args(self.func) ]
576 |
577 | def __call__(self, *args, **kwargs):
578 | # Call remote function
579 | r = self._invoke([ self._map_args(*args, **kwargs) ])
580 | output_name = getattr(self.func, '__output_name__', 'output1')
581 | return _decode_response(
582 | r["Results"][output_name]["value"].get("ColumnNames"),
583 | r["Results"][output_name]["value"].get("ColumnTypes"),
584 | r["Results"][output_name]["value"]["Values"][0],
585 | _get_annotation('return', self.func)
586 | )
587 |
588 | def map(self, *args):
589 | """maps the function onto multiple inputs. The input should be multiple sequences. The
590 | sequences will be zipped together forming the positional arguments for the call. This is
591 | equivalent to map(func, ...) but is executed with a single network call."""
592 | call_args = [self._map_args(*cur_args) for cur_args in zip(*args)]
593 | r = self._invoke(call_args)
594 |
595 | ret_type = _get_annotation('return', self.func)
596 | output_name = getattr(self.func, '__output_name__', 'output1')
597 | return [_decode_response(
598 | r['Results'][output_name]['value'].get("ColumnNames"),
599 | r['Results'][output_name]['value'].get("ColumnTypes"),
600 | x,
601 | ret_type)
602 |             for x in r['Results'][output_name]['value']['Values']]
603 |
604 | def delete(self):
605 | """unpublishes the service"""
606 | raise NotImplementedError('delete not implemented yet')
607 |
608 | def __iter__(self):
609 | yield self.url
610 | yield self.api_key
611 | yield self.help_url
612 |
613 |
614 | def _get_dataframe_schema(function):
615 | return getattr(function, '__dataframe_schema__', None)
616 |
617 | def _get_main_source(function):
618 |
619 | main_source = u'def azureml_main(df1 = None, df2 = None):\n'
620 | main_source += u' results = []\n'
621 |
622 | if _get_dataframe_schema(function):
623 | # function just takes a dataframe...
624 | main_source += u' results.append(__user_function(df1))' + chr(10)
625 | else:
626 | # we're marshalling the arguments in.
627 | main_source += u' for i in range(df1.shape[0]):' + chr(10)
628 | for arg in _get_args(function):
629 | arg_type = _get_arg_type(arg, function)
630 | if pandas is not None and arg_type is pandas.DataFrame:
631 | raise Exception('Only a single DataFrame argument is supported')
632 |
633 | if _get_arg_type(arg, function) == OBJECT_NAME:
634 | main_source += ' ' + arg + u' = ' + u'_decode(df1["' + arg + u'"][i])' + chr(10)
635 | else:
636 | main_source += ' ' + arg + u' = ' + u'df1["' + arg + u'"][i]' + chr(10)
637 |
638 | main_source += u' results.append(__user_function('
639 |
640 | args = inspect.getargs(function.__code__)
641 | all_args = args.args
642 | if args.varargs is not None:
643 | all_args.append(u'*' + args.varargs)
644 |     varkw = getattr(args, 'varkw', None) or getattr(args, 'keywords', None)  # Py3 vs Py2 attribute name
645 |     if varkw is not None: all_args.append(u'**' + varkw)
646 |
647 | # pass position arguments...
648 | main_source += u', '.join(all_args)
649 | main_source += u'))' + chr(10)
650 |
651 | ret_annotation = _get_annotation('return', function)
652 | if _get_dataframe_schema(function):
653 | # function just returns a data frame directly
654 | main_source += u' if len(results) == 1:' + chr(10)
655 | main_source += u' return results[0]' + chr(10)
656 | main_source += u' return pandas.DataFrame(results)' + chr(10)
657 | elif isinstance(ret_annotation, tuple):
658 | # multi-value return support...
659 | format = []
660 | arg_names = []
661 | for index, ret_type in enumerate(ret_annotation):
662 | arg_names.append(u'r' + str(index))
663 | t = _annotation_to_type(ret_type)
664 | if t == OBJECT_NAME:
665 | format.append(u'_encode(r' + str(index) + u')')
666 | else:
667 | format.append(u'r' + str(index))
668 | main_source += u' return pandas.DataFrame([(' + u', '.join(format) + u') for ' + ', '.join(arg_names) + u' in results])' + chr(10)
669 | elif _get_arg_type('return', function) == OBJECT_NAME:
670 | main_source += u' return pandas.DataFrame([_encode(r) for r in results])' + chr(10)
671 | else:
672 | main_source += u' return pandas.DataFrame(results)' + chr(10)
673 |
674 | return main_source
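# Illustrative sketch only (not generated verbatim): for a published function f(a, b)
# with no type annotations, _get_main_source above builds wrapper code along these lines:
#
#     def azureml_main(df1 = None, df2 = None):
#         results = []
#         for i in range(df1.shape[0]):
#             a = _decode(df1["a"][i])
#             b = _decode(df1["b"][i])
#             results.append(__user_function(a, b))
#         return pandas.DataFrame([_encode(r) for r in results])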
675 |
676 | def _get_source(function):
677 | source_file = inspect.getsourcefile(function)
678 | encoding = ''
679 | try:
680 |         with open(source_file, 'rb') as f:
681 |             line1 = f.readline()
682 |             line2 = f.readline()
683 |             if line1[:3] == b'\xef\xbb\xbf':
684 |                 encoding = 'utf-8-sig'
685 |             else:
686 |                 match = re.search(b"coding[:=]\s*([-\w.]+)", line1) or re.search(b"coding[:=]\s*([-\w.]+)", line2)
687 |                 if match:
688 |                     encoding = match.groups()[0].decode('ascii')
689 |         with codecs.open(source_file, 'r', encoding or 'utf-8') as f:
690 |             source_text = f.read()
691 | except:
692 | source_text = None
693 |
694 | # include our source code...
695 | ourfile = __file__
696 | if ourfile.endswith('.pyc'):
697 | ourfile = ourfile[:-1]
698 |     source = u''
699 |     if encoding:
700 |         source = u'# coding=' + (encoding.decode('ascii') if isinstance(encoding, bytes) else encoding) + u'\n'
701 |     with codecs.open(ourfile, 'r', 'ascii') as services_file:
702 |         source += services_file.read()
703 |
704 | main_source = _get_main_source(function)
705 |
706 | source += chr(10) + main_source
707 |
708 | if source_text is None:
709 | # we're in a REPL environment, we need to serialize the code...
710 | #TODO: Remove base64 encoding when json double escape issue is fixed
711 | source += inspect.getsource(_deserialize_func)
712 | source += chr(10)
713 | source += u'__user_function = _deserialize_func(base64.decodestring(' + repr(base64.encodestring(_serialize_func(function)).replace(chr(10), '')) + '), globals())'
714 | else:
715 | # we can upload the source code itself...
716 | source += u'''
717 | # overwrite publish/service with ones which won't re-publish...
718 | import sys
719 | sys.modules['azureml'] = azureml = type(sys)('azureml')
720 | sys.modules['azureml.services'] = services = type(sys)('services')
721 | azureml.services = services
722 |
723 | def publish(func, *args, **kwargs):
724 | if callable(func):
725 | return func
726 | def wrapper(func):
727 | return func
728 | return wrapper
729 | services.publish = publish
730 |
731 | def service(*args):
732 | def wrapper(func):
733 | return func
734 | return wrapper
735 |
736 | def attach(*args, **kwargs):
737 | def wrapper(func):
738 | return func
739 | return wrapper
740 |
741 | services.service = service
742 | services.types = types
743 | services.returns = returns
744 | services.attach = attach
745 | services.dataframe_service = attach
746 | services.service_id = attach
747 |
748 | '''
749 | source += source_text
750 | source += chr(10)
751 | source += u'__user_function = ' + function.__name__
752 |
753 | return source
754 |
755 | _known_types = {
756 | int: {'type':'integer', 'format':'int64'},
757 | bool: {'type' : 'Boolean'},
758 | float: {'type': 'number', 'format':'double'},
759 | str if sys.version_info > (3, ) else unicode: {'type':'string'},
760 | #complex:'Complex64',
761 | }
762 |
763 | OBJECT_NAME = {"type":"string", "format":"string"} # "description":"Python custom serialization"
764 |
765 | def _get_annotation(name, func):
766 | try:
767 | annotations = func.__annotations__
768 | except AttributeError:
769 | return None
770 |
771 | return annotations.get(name)
772 |
773 | def _annotation_to_type(annotation):
774 | if annotation is None:
775 | return OBJECT_NAME
776 |
777 | if isinstance(annotation, str):
778 | # allow the user to specify the raw string value that will be passed...
779 | return annotation
780 |
781 | return _known_types.get(annotation) or OBJECT_NAME
782 |
783 | def _get_arg_type(name, func):
784 | if name != "return":
785 | raw_schema = _get_dataframe_schema(func)
786 | if raw_schema is not None:
787 | return _annotation_to_type(raw_schema[name])
788 |
789 | annotation = _get_annotation(name, func)
790 | return _annotation_to_type(annotation)
791 |
792 |
793 | def _add_file(adding, zip_file):
794 | if isinstance(adding, tuple):
795 | name, contents = adding
796 | else:
797 | name = adding
798 | contents = None
799 |
800 | if isinstance(name, tuple):
801 | name, dest_name = name
802 | else:
803 | name = dest_name = name
804 |
805 | if contents is None:
806 |         contents = open(name, 'rb').read()
807 |
808 | zip_file.writestr(dest_name, contents)
809 |
810 | _DEBUG = False
811 | def _publish_worker(func, files, workspace_id = None, workspace_token = None, management_endpoint = None):
812 | workspace_id, workspace_token, _, management_endpoint = azureml._get_workspace_info(workspace_id, workspace_token, None, management_endpoint)
813 |
814 | script_code = _get_source(func) + chr(10)
815 | ret_type = _get_annotation('return', func)
816 |
817 | if isinstance(ret_type, tuple):
818 | # multi-value return
819 | results = OrderedDict()
820 | for index, obj_type in enumerate(ret_type):
821 | results['result' + str(index)] = _annotation_to_type(obj_type)
822 | elif isinstance(ret_type, dict):
823 | # multi-value return
824 | results = OrderedDict()
825 | for name, obj_type in ret_type.items():
826 | results[name] = _annotation_to_type(obj_type)
827 | else:
828 | results = {"result": _get_arg_type('return', func)}
829 |
830 | code_bundle = {
831 | "InputSchema": {name: _get_arg_type(name, func) for name in _get_args(func)},
832 | "OutputSchema": results,
833 | "Language" : "python-2.7-64",
834 | "SourceCode": script_code,
835 | }
836 |
837 | attachments = getattr(func, '__attachments__', None)
838 | if attachments or files:
839 | data = BytesIO()
840 | zip_file = zipfile.PyZipFile(data, 'w')
841 | if attachments:
842 | for adding in attachments:
843 | _add_file(adding, zip_file)
844 |
845 | if files:
846 | for adding in files:
847 | _add_file(adding, zip_file)
848 |
849 | zip_file.close()
850 |
851 |         code_bundle['ZipContents'] = base64.b64encode(data.getvalue()).decode('ascii')
852 |
853 | name = getattr(func, '__service_name__', func.__name__)
854 | body = {
855 | "Name": name,
856 | "Type":"Code",
857 | "CodeBundle" : code_bundle
858 | }
859 | id = str(getattr(func, '__service_id__', uuid.uuid4())).replace('-', '')
860 | url = PUBLISH_URL_FORMAT.format(management_endpoint, workspace_id, id)
861 | headers = {'authorization': 'bearer ' + workspace_token}
862 | resp = requests.put(
863 | url,
864 | json=body,
865 | headers=headers
866 | )
867 |
868 | if _DEBUG:
869 | with open(func.__name__ + '.req', 'w') as f:
870 | f.write(url + chr(10))
871 | f.write(json.dumps(body))
872 | f.close()
873 |
874 | with open(func.__name__ + '.res', 'w') as f:
875 | f.write(str(resp.status_code) + chr(10))
876 | f.write(resp.text + chr(10))
877 | f.close()
878 |
879 | if resp.status_code < 200 or resp.status_code > 299:
880 | try:
881 | msg = resp.json()['error']['message']
882 | except:
883 | msg = str(resp.status_code)
884 | raise ValueError('Failed to publish function: ' + msg + chr(10) +
885 | 'Set azureml.services._DEBUG = True to enable writing {}.req/{}.res files'.format(func.__name__, func.__name__))
886 |
887 | j = resp.json()
888 | epUrl = url + '/endpoints/' + j['DefaultEndpointName']
889 | epResp = requests.get(epUrl, headers=headers)
890 | endpoints = epResp.json()
891 |
892 | url = endpoints['ApiLocation'] + '/execute?api-version=2.0'
893 |
894 | return published(url, endpoints['PrimaryKey'], endpoints['HelpLocation'] + '/score', func, id)
895 |
896 | def publish(func_or_workspace_id, workspace_id_or_token = None, workspace_token_or_none = None, files=(), endpoint=None):
897 | '''publishes a callable function or decorates a function to be published.
898 |
899 | Returns a callable, iterable object. Calling the object will invoke the published service.
900 | Iterating the object will give the API URL, API key, and API help url.
901 |
902 | To define a function which will be published to Azure you can simply decorate it with
903 | the @publish decorator. This will publish the service, and then future calls to the
904 | function will run against the operationalized version of the service in the cloud.
905 |
906 | >>> @publish(workspace_id, workspace_token)
907 | >>> def func(a, b):
908 | >>> return a + b
909 |
910 | After publishing you can then invoke the function using:
911 | func.service(1, 2)
912 |
913 | Or continue to invoke the function locally:
914 | func(1, 2)
915 |
916 | You can also just call publish directly to publish a function:
917 |
918 | >>> def func(a, b): return a + b
919 | >>>
920 | >>> res = publish(func, workspace_id, workspace_token)
921 | >>>
922 | >>> url, api_key, help_url = res
923 | >>> res(2, 3)
924 | 5
925 | >>> url, api_key, help_url = res.url, res.api_key, res.help_url
926 |
927 | The returned result will be the published service.
928 |
929 | You can specify a list of files which should be published along with the function.
930 | The resulting files will be stored in a subdirectory called 'Script Bundle'. The
931 | list of files can be one of:
932 | (('file1.txt', None), ) # file is read from disk
933 | (('file1.txt', b'contents'), ) # file contents are provided
934 | ('file1.txt', 'file2.txt') # files are read from disk, written with same filename
935 | ((('file1.txt', 'destname.txt'), None), ) # file is read from disk, written with different destination name
936 |
937 | The various formats for each filename can be freely mixed and matched.
938 | '''
939 | if not callable(func_or_workspace_id):
940 | def do_publish(func):
941 | func.service = _publish_worker(func, files, func_or_workspace_id, workspace_id_or_token, endpoint)
942 | return func
943 | return do_publish
944 |
945 | return _publish_worker(func_or_workspace_id, files, workspace_id_or_token, workspace_token_or_none, endpoint)
946 |
947 | def service(url, api_key, help_url = None):
948 | '''Marks a function as having been published and causes all invocations to go to the remote
949 | operationalized service.
950 |
951 | >>> @service(url, api_key)
952 | >>> def f(a, b):
953 | >>> pass
954 | '''
955 | def do_publish(func):
956 | return published(url, api_key, help_url, func, None)
957 | return do_publish
958 |
959 | def types(**args):
960 | """Specifies the types used for the arguments of a published service.
961 |
962 | @types(a=int, b = str)
963 | def f(a, b):
964 | pass
965 | """
966 | def l(func):
967 | if hasattr(func, '__annotations__'):
968 | func.__annotations__.update(args)
969 | else:
970 | func.__annotations__ = args
971 | return func
972 | return l
973 |
974 | def returns(type):
975 | """Specifies the return type for a published service.
976 |
977 | @returns(int)
978 | def f(...):
979 | pass
980 | """
981 | def l(func):
982 | if hasattr(func, '__annotations__'):
983 | func.__annotations__['return'] = type
984 | else:
985 | func.__annotations__ = {'return': type}
986 | return func
987 | return l
988 |
989 | def attach(name, contents = None):
990 | """attaches a file to the payload to be uploaded.
991 |
992 | If contents is omitted the file is read from disk.
993 | If name is a tuple it specifies the on-disk filename and the destination filename.
994 | """
995 | def do_attach(func):
996 | if hasattr(func, '__attachments__'):
997 | func.__attachments__.append((name, contents))
998 | else:
999 | func.__attachments__ = [(name, contents)]
1000 | return func
1001 | return do_attach
1002 |
1003 | def service_id(id):
1004 | """Specifies the service ID to enable re-publishing to the same end point.
1005 | Can be applied to the function which is being published:
1006 |
1007 | @publish(...)
1008 | @service_id('e5dd3903-796f-4544-b7aa-f4e08b2cc639')
1009 | def myfunc():
1010 | return 42
1011 |
1012 | When the function is published it will replace any existing instances of the
1013 | function.
1014 | """
1015 | def l(func):
1016 | func.__service_id__ = id
1017 | return func
1018 |
1019 | return l
1020 |
1021 | def name(name):
1022 |     """Provides a friendly name for the published web service, which can include spaces and other characters that are not legal in Python function names.
1023 | """
1024 | def l(func):
1025 | func.__service_name__ = name
1026 | return func
1027 |
1028 | return l
1029 |
1030 | def dataframe_service(**args):
1031 |     """Indicates that the function operates on a data frame. The function
1032 | will receive a single input in the form of a data frame, and should return
1033 | a data frame object. The schema of the data frame is specified with this
1034 | decorator.
1035 |
1036 | @publish(...)
1037 | @dataframe_service(a = int, b = int)
1038 | def myfunc(df):
1039 | return pandas.DataFrame([df['a'][i] + df['b'][i] for i in range(df.shape[0])])
1040 | """
1041 | def l(func):
1042 | func.__dataframe_schema__ = args
1043 | return func
1044 |
1045 | return l
1046 |
1047 | def input_name(name):
1048 | """specifies the name of the input the web service expects to receive. Defaults to 'input1'"""
1049 | def l(func):
1050 | func.__input_name__ = name
1051 | return func
1052 |
1053 | return l
1054 |
1055 | def output_name(name):
1056 |     """Specifies the name of the output returned by the web service. Defaults to 'output1'."""
1057 | def l(func):
1058 | func.__output_name__ = name
1059 | return func
1060 |
1061 | return l
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | python-dateutil
3 | pandas
4 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | #-------------------------------------------------------------------------
4 | # Copyright (c) Microsoft Corporation
5 | # All rights reserved.
6 | #
7 | # MIT License:
8 | # Permission is hereby granted, free of charge, to any person obtaining
9 | # a copy of this software and associated documentation files (the
10 | # "Software"), to deal in the Software without restriction, including
11 | # without limitation the rights to use, copy, modify, merge, publish,
12 | # distribute, sublicense, and/or sell copies of the Software, and to
13 | # permit persons to whom the Software is furnished to do so, subject to
14 | # the following conditions:
15 | #
16 | # The above copyright notice and this permission notice shall be
17 | # included in all copies or substantial portions of the Software.
18 | #
19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 | #--------------------------------------------------------------------------
27 |
28 | from setuptools import setup
29 |
30 | # To build:
31 | # python setup.py sdist
32 | # python setup.py bdist_wheel
33 | #
34 | # To install:
35 | # python setup.py install
36 | #
37 | # To register (only needed once):
38 | # python setup.py register
39 | #
40 | # To upload:
41 | # python setup.py sdist upload
42 | # python setup.py bdist_wheel upload
43 |
44 | setup(
45 | name='azureml',
46 | version='0.2.7',
47 | description='Microsoft Azure Machine Learning Python client library',
48 | license='MIT License',
49 | author='Microsoft Corporation',
50 | author_email='ptvshelp@microsoft.com',
51 | url='https://github.com/Azure/Azure-MachineLearning-ClientLibrary-Python',
52 | classifiers=[
53 | 'Development Status :: 3 - Alpha',
54 | 'Programming Language :: Python',
55 | 'Programming Language :: Python :: 2',
56 | 'Programming Language :: Python :: 2.7',
57 | 'Programming Language :: Python :: 3',
58 | 'Programming Language :: Python :: 3.3',
59 | 'Programming Language :: Python :: 3.4',
60 | 'License :: OSI Approved :: MIT License',
61 | ],
62 | packages=['azureml'],
63 | install_requires=[
64 | 'python-dateutil',
65 | 'requests',
66 | 'pandas',
67 | ]
68 | )
69 |
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | Test settings
2 | =============
3 |
4 | To successfully run tests, you'll need to create an **azuremltestsettings.json** file in this folder.
5 |
6 | This file contains credentials and lists various Azure resources to use when running the tests.
7 |
8 |
9 | Example
10 | -------
11 |
12 | ```
13 | {
14 | "workspace": {
15 | "id": "11111111111111111111111111111111",
16 | "token": "00000000000000000000000000000000",
17 | "endpoint": "https://studio.azureml.net"
18 | },
19 | "storage": {
20 | "accountName": "mystorageaccount",
21 | "accountKey": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==",
22 | "container": "mydatasettestcontainer",
23 | "mediumSizeBlob": "MediumSizeDataset_NH.csv",
24 | "unicodeBomBlob": "DatasetWithUnicodeBOM.txt",
25 | "blobs": [
26 | "Dataset_NH.csv",
27 | "Dataset_NH.tsv",
28 | "Dataset_WH.csv",
29 | "Dataset_WH.tsv",
30 | "Dataset.txt"
31 | ]
32 | },
33 | "intermediateDataset": {
34 | "experimentId": "11111111111111111111111111111111.f-id.22222222222222222222222222222222",
35 | "nodeId": "33333333-3333-3333-3333-333333333333-333",
36 | "portName": "Results dataset",
37 | "dataTypeId": "GenericCSV"
38 | },
39 | "diagnostics": {
40 | "writeBlobContents": "True",
41 | "writeSerializedFrame": "True"
42 | }
43 | }
44 | ```
45 |
46 |
47 | Workspace
48 | ---------
49 |
50 | From the Azure portal, create a new ML workspace, then open it in Studio. You'll find your workspace id in the URL.
51 |
52 | On the settings page, you'll find two authorization tokens; you can use either one.
53 |
54 | Set the id and token in the json:
55 |
56 | ```
57 | "workspace": {
58 | "id": "11111111111111111111111111111111",
59 | "token": "00000000000000000000000000000000",
60 | "endpoint": "https://studio.azureml.net"
61 | },
62 | ```
63 |
64 |
65 | Storage account
66 | ---------------
67 |
68 | The storage section is used for some tests that load dataset files from Azure blob storage.
69 |
70 | You'll need to create an Azure storage account, create a container in it, and upload the dataset files to that container.
71 |
72 | The round-trip tests rely on a naming convention for the files listed in the blobs array:
73 | ```
74 | "blobs": [
75 | "Dataset_NH.csv",
76 | "Dataset_NH.tsv",
77 | "Dataset_WH.csv",
78 | "Dataset_WH.tsv",
79 | "Dataset.txt"
80 | ]
81 | ```
82 |
83 | NH means no header, WH means with header.
84 |
85 |
86 | Experiment
87 | ----------
88 |
89 | Create a new experiment. Add the following modules and connect them:
90 |
91 | - Airport Codes Dataset
92 | - Split
93 | - Convert to CSV
94 |
95 | Run the experiment and save it.
96 |
97 | You'll need the experiment id (it appears in the URL), the node id (it can be found in the HTML DOM), the port name (displayed as a tooltip when you hover over the output port), and the data type id.
98 |
99 | ```
100 | "intermediateDataset": {
101 | "experimentId": "11111111111111111111111111111111.f-id.22222222222222222222222222222222",
102 | "nodeId": "33333333-3333-3333-3333-333333333333-333",
103 | "portName": "Results dataset",
104 | "dataTypeId": "GenericCSV"
105 | },
106 | ```
107 |
108 |
109 | Diagnostics
110 | -----------
111 |
112 | Some of the tests can write intermediate results to disk, which can help with debugging.
113 |
114 | "diagnostics": {
115 | "writeBlobContents": "True",
116 | "writeSerializedFrame": "True"
117 | }
118 |
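Loading the settings
--------------------

The test modules consume this file through the helper in `tests/__init__.py`. As a quick
sanity check (assuming the repository root is on `PYTHONPATH`), the file can be loaded
manually:

```
from tests import load_test_settings

settings = load_test_settings()
print(settings.workspace.id)
```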
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | from os import path
27 | import json
28 | import numpy as np
29 | import random
30 | import string
31 |
32 |
33 | class TestSettings(object):
34 | class Workspace(object):
35 | def __init__(self, settings):
36 | self.settings = settings
37 |
38 | @property
39 | def id(self):
40 | return self.settings['id']
41 |
42 | @property
43 | def token(self):
44 | return self.settings['token']
45 |
46 | @property
47 | def endpoint(self):
48 | return self.settings['endpoint']
49 |
50 | @property
51 | def management_endpoint(self):
52 | return self.settings['management_endpoint']
53 |
54 | class Storage(object):
55 | def __init__(self, settings):
56 | self.settings = settings
57 |
58 | @property
59 | def account_name(self):
60 | return self.settings['accountName']
61 |
62 | @property
63 | def account_key(self):
64 | return self.settings['accountKey']
65 |
66 | @property
67 | def container(self):
68 | return self.settings['container']
69 |
70 | @property
71 | def medium_size_blob(self):
72 | return self.settings['mediumSizeBlob']
73 |
74 | @property
75 | def blobs(self):
76 | return self.settings['blobs']
77 |
78 | class IntermediateDataset(object):
79 | def __init__(self, settings):
80 | self.settings = settings
81 |
82 | @property
83 | def experiment_id(self):
84 | return self.settings['experimentId']
85 |
86 | @property
87 | def node_id(self):
88 | return self.settings['nodeId']
89 |
90 | @property
91 | def port_name(self):
92 | return self.settings['portName']
93 |
94 | @property
95 | def data_type_id(self):
96 | return self.settings['dataTypeId']
97 |
98 | class Diagnostics(object):
99 | def __init__(self, settings):
100 | self.settings = settings
101 |
102 | @property
103 | def write_blob_contents(self):
104 | return self.settings['writeBlobContents']
105 |
106 | @property
107 | def write_serialized_frame(self):
108 | return self.settings['writeSerializedFrame']
109 |
110 | def __init__(self, settings):
111 | self.workspace = TestSettings.Workspace(settings['workspace'])
112 | self.storage = TestSettings.Storage(settings['storage'])
113 | self.intermediateDataset = TestSettings.IntermediateDataset(settings['intermediateDataset'])
114 | self.diagnostics = TestSettings.Diagnostics(settings['diagnostics'])
115 |
116 |
117 | def load_test_settings():
118 | name = "azuremltestsettings.json"
119 | full_path = path.join(path.abspath(path.dirname(__file__)), name)
120 | if not path.exists(full_path):
121 |         raise RuntimeError("Cannot run AzureML tests when the expected settings file, '{0}', does not exist!".format(full_path))
122 | with open(full_path, "r") as f:
123 | settings = json.load(f)
124 | return TestSettings(settings)
125 |
126 | def id_generator(size=10, chars=string.ascii_uppercase + string.digits):
127 | return ''.join(random.choice(chars) for _ in range(size))
128 |
--------------------------------------------------------------------------------
/tests/coverage.bat:
--------------------------------------------------------------------------------
1 | @echo OFF
2 | SETLOCAL
3 | cls
4 |
5 | if "%1%" == "" (
6 | set PYTHONDIR=%SystemDrive%\Anaconda
7 | ) else (
8 | set PYTHONDIR=%1%
9 | )
10 |
11 | if "%2%" == "" (
12 | set COVERAGEDIR=htmlcov
13 | ) else (
14 | set COVERAGEDIR=%2%
15 | )
16 |
17 | if "%PYTHONPATH%" == "" (
18 | set PYTHONPATH=..
19 | ) else (
20 | set PYTHONPATH=%PYTHONPATH%;..
21 | )
22 |
23 | if exist "%PYTHONDIR%\Scripts\coverage.exe" (
24 | goto :coverage
25 | )
26 |
27 |
28 | REM ---------------------------------------------------------------------------
29 | if not exist "%PYTHONDIR%\Scripts\pip.exe" (
30 | echo Cannot do a code coverage run when neither 'coverage' nor 'pip' are installed.
31 | goto :exit_door
32 | )
33 |
34 | echo Installing 'coverage' package...
35 | %PYTHONDIR%\Scripts\pip.exe install coverage
36 | echo Finished installing 'coverage' package
37 |
38 | REM ---------------------------------------------------------------------------
39 | :coverage
40 | echo Starting coverage run using %PYTHONDIR%
41 | %PYTHONDIR%\Scripts\coverage.exe run -m unittest discover -p "unittests.py"
42 | set UNITTEST_EC=%ERRORLEVEL%
43 | %PYTHONDIR%\Scripts\coverage.exe html -d %COVERAGEDIR%
44 | start %CD%\%COVERAGEDIR%\index.html
45 | echo Finished coverage run!
46 | 
47 | REM ---------------------------------------------------------------------------
48 | :exit_door
49 | exit /B %UNITTEST_EC%
--------------------------------------------------------------------------------
/tests/foo.txt:
--------------------------------------------------------------------------------
1 | hello world!
--------------------------------------------------------------------------------
/tests/lib.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | from azureml import services
27 | import pandas
28 | from os import path
29 | import os
30 | try:
31 | import tests
32 |     from tests import load_test_settings
33 | settings = load_test_settings()
34 | TEST_WS = settings.workspace.id
35 | TEST_KEY = settings.workspace.token
36 | ENDPOINT = settings.workspace.management_endpoint
37 | except:
38 | TEST_WS = ''
39 | TEST_KEY = ''
40 | ENDPOINT = ''
41 |
42 |
43 | #@services.publish(TEST_WS, TEST_KEY)
44 | #def noparams():
45 | # return 'hello world!'
46 |
47 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
48 | @services.types(a = unicode, b = unicode)
49 | @services.returns(unicode)
50 | def str_typed(a, b):
51 | return a + b
52 |
53 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
54 | def untyped_identity(a):
55 | return a
56 |
57 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
58 | @services.attach((path.join(path.dirname(__file__), 'foo.txt'), 'foo.txt'))
59 | @services.types(a = unicode)
60 | @services.returns(unicode)
61 | def attached(a):
62 | return a + ''.join(file('Script Bundle\\foo.txt', 'rU').readlines())
63 |
64 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
65 | @services.types(a = float, b = float)
66 | @services.returns(float)
67 | def float_typed(a, b):
68 | return a / b
69 |
70 |
71 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
72 | @services.types(a = int, b = int)
73 | @services.returns((int, int))
74 | def multivalue_return(a, b):
75 | return a + b, a - b
76 |
77 |
78 | # style 1, var args
79 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
80 | def mysum(*args):
81 | return sum(args)
82 |
83 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
84 | @services.types(a = int, b = int)
85 | @services.returns(int)
86 | def typed(a, b):
87 | return a + b
88 |
89 |
90 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
91 | @services.types(a = bool, b = bool)
92 | @services.returns(bool)
93 | def bool_typed(a, b):
94 | return a and b
95 |
96 | ##@services.publish(TEST_WS, TEST_KEY)
97 | ##@services.types(a = complex, b = complex)
98 | ##@services.returns(complex)
99 | ##def complex_typed(a, b):
100 | ## return a * b
101 |
102 |
103 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
104 | @services.dataframe_service(a = int, b = int)
105 | @services.returns(int)
106 | def dataframe(df):
107 | return pandas.DataFrame([df['a'][i] + df['b'][i] for i in range(df.shape[0])])
108 |
109 |
110 | if hasattr(dataframe, 'service'):
111 | @services.service(dataframe.service.url, dataframe.service.api_key)
112 | @services.types(a = int, b = int)
113 | @services.returns(int)
114 | def dataframe_int(a, b):
115 | pass
116 |
117 | ## style 1, define a function and call the publish API explicitly.
118 |
119 | # style 1, define a function and publish it with a decorator
120 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
121 | def myfunc(a, b):
122 | return [a + b + a, a - b * b, a * b * a, a / b]
123 |
124 |
125 | # style 2, define a function and call the publish API explicitly.
126 | def myfunc2(a, b):
127 | return [a + b, a - b, a * b, a / b]
128 |
129 | published = services.publish(myfunc2, TEST_WS, TEST_KEY, endpoint=ENDPOINT)
130 |
131 |
132 |
133 |
134 | # style 1, kw args
135 | @services.publish(TEST_WS, TEST_KEY, endpoint=ENDPOINT)
136 | def kwargs(**args):
137 | return args
138 |
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/tests/performancetests.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | import unittest
27 | import pandas as pd
28 | from datetime import datetime
29 | from pandas.util.testing import assert_frame_equal
30 |
31 | from azure.storage import BlobService
32 | from azureml import (
33 | BytesIO,
34 | Workspace,
35 | DataTypeIds,
36 | serialize_dataframe,
37 | )
38 | from tests import (
39 | load_test_settings,
40 | )
41 |
42 |
43 | settings = load_test_settings()
44 |
45 |
46 | class PerformanceTests(unittest.TestCase):
47 | def setUp(self):
48 | self.workspace = Workspace(
49 | settings.workspace.id,
50 | settings.workspace.token,
51 | settings.workspace.endpoint
52 | )
53 | self.blob = BlobService(
54 | settings.storage.account_name,
55 | settings.storage.account_key
56 | )
57 |
58 | def _write_blob_contents(self, filename, data):
59 | if settings.diagnostics.write_blob_contents:
60 | with open('original-blob-' + filename, 'wb') as data_file:
61 | data_file.write(data)
62 |
63 | def _write_serialized_frame(self, filename, data):
64 | if settings.diagnostics.write_serialized_frame:
65 | with open('serialized-frame-' + filename, 'wb') as data_file:
66 | data_file.write(data)
67 |
68 | def test_serialize_40mb_dataframe(self):
69 | # Arrange
70 | blob_name = settings.storage.medium_size_blob
71 | original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
72 | original_dataframe = pd.read_csv(BytesIO(original_data), header=0, sep=",", encoding='utf-8-sig')
73 |
74 | self._write_blob_contents(blob_name, original_data)
75 |
76 | # Act
77 | start_time = datetime.now()
78 | writer = BytesIO()
79 | serialize_dataframe(writer, DataTypeIds.GenericCSV, original_dataframe)
80 | elapsed_time = datetime.now() - start_time
81 | result_data = writer.getvalue()
82 |
83 | self._write_serialized_frame(blob_name, result_data)
84 |
85 | # Assert
86 | result_dataframe = pd.read_csv(BytesIO(result_data), header=0, sep=",", encoding='utf-8-sig')
87 | assert_frame_equal(original_dataframe, result_dataframe)
88 | self.assertLess(elapsed_time.total_seconds(), 10)
89 |
90 |
91 | if __name__ == '__main__':
92 | unittest.main()
93 |
--------------------------------------------------------------------------------
/tests/roundtriptests.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | import unittest
27 | import pandas as pd
28 | from pandas.util.testing import assert_frame_equal
29 |
30 | from azure.storage import BlobService
31 | from azureml import (
32 | BytesIO,
33 | Workspace,
34 | DataTypeIds,
35 | )
36 | from tests import (
37 | id_generator,
38 | load_test_settings,
39 | )
40 |
41 |
42 | settings = load_test_settings()
43 |
44 |
45 | class RoundTripTests(unittest.TestCase):
46 | def setUp(self):
47 | self.workspace = Workspace(
48 | settings.workspace.id,
49 | settings.workspace.token,
50 | settings.workspace.endpoint
51 | )
52 | self.blob = BlobService(
53 | settings.storage.account_name,
54 | settings.storage.account_key
55 | )
56 |
57 | def _write_blob_contents(self, filename, data):
58 | if settings.diagnostics.write_blob_contents:
59 | with open('original-blob-' + filename, 'wb') as data_file:
60 | data_file.write(data)
61 |
62 | def _write_serialized_frame(self, filename, data):
63 | if settings.diagnostics.write_serialized_frame:
64 | with open('serialized-frame-' + filename, 'wb') as data_file:
65 | data_file.write(data)
66 |
67 | def test_download_blob_then_upload_as_dataframe_then_read_dataset(self):
68 | def datatypeid_from_header_and_format(header, format):
69 | if format == 'csv':
70 | if header == 'wh':
71 | return DataTypeIds.GenericCSV
72 | else:
73 | return DataTypeIds.GenericCSVNoHeader
74 | elif format == 'tsv':
75 | if header == 'wh':
76 | return DataTypeIds.GenericTSV
77 | else:
78 | return DataTypeIds.GenericTSVNoHeader
79 | elif format == 'txt':
80 | return DataTypeIds.PlainText
81 | else:
82 | self.assertTrue(False, 'Unexpected format')
83 |
84 | def split_blob_name(blob_name):
85 |         # blob naming convention:
86 |         # name_<header>.<format>
87 |         # <header>: WH: with header
88 |         #           NH: no header
89 |         # <format>: CSV: comma separated
90 |         #           TSV: tab separated
91 |         #           TXT: newline separated
92 | name, format = blob_name.lower().split('.')
93 | if format != 'txt':
94 | name, header = name.split('_')
95 | else:
96 | header = 'nh'
97 |
98 | return name, format, header
99 |
100 | for blob_name in settings.storage.blobs:
101 | print(blob_name)
102 |
103 | name, format, header = split_blob_name(blob_name)
104 |
105 | # Read the data from blob storage
106 | original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
107 | self._write_blob_contents(blob_name, original_data)
108 |
109 | # Parse the data to a dataframe using Pandas
110 | original_dataframe = pd.read_csv(
111 | BytesIO(original_data),
112 | header=0 if header == 'wh' else None,
113 | sep=',' if format == 'csv' else '\t' if format == 'tsv' else '\n',
114 | encoding='utf-8-sig'
115 | )
116 |
117 | # Upload the dataframe as a new dataset
118 | dataset_name = 'unittest' + name + id_generator()
119 | description = 'safe to be deleted - ' + dataset_name
120 | data_type_id = datatypeid_from_header_and_format(header, format)
121 | self.workspace.datasets.add_from_dataframe(
122 | original_dataframe,
123 | data_type_id,
124 | dataset_name,
125 | description,
126 | )
127 |
128 | # Get the new dataset
129 | dataset = self.workspace.datasets[dataset_name]
130 | self.assertIsNotNone(dataset)
131 |
132 | # Read the dataset as a dataframe
133 | result_data = dataset.read_as_binary()
134 | self._write_serialized_frame(blob_name, result_data)
135 | result_dataframe = dataset.to_dataframe()
136 |
137 | # Verify that the dataframes are equal
138 | assert_frame_equal(original_dataframe, result_dataframe)
139 |
140 | def test_azureml_example_datasets(self):
141 | max_size = 10 * 1024 * 1024
142 | skip = [
143 | 'Restaurant feature data',
144 | 'IMDB Movie Titles',
145 | 'Book Reviews from Amazon',
146 | ]
147 |
148 | for dataset in self.workspace.example_datasets:
149 | if not hasattr(dataset, 'to_dataframe'):
150 | print('skipped (unsupported format): {0}'.format(dataset.name))
151 | continue
152 |
153 | if dataset.size > max_size:
154 | print('skipped (max size): {0}'.format(dataset.name))
155 | continue
156 |
157 | if dataset.name in skip:
158 | print('skipped: {0}'.format(dataset.name))
159 | continue
160 |
161 | print('downloading: ' + dataset.name)
162 | frame = dataset.to_dataframe()
163 |
164 | print('uploading: ' + dataset.name)
165 | dataset_name = 'unittest' + dataset.name + id_generator()
166 | description = 'safe to be deleted - ' + dataset_name
167 | self.workspace.datasets.add_from_dataframe(frame, dataset.data_type_id, dataset_name, description)
168 |
169 |
170 | if __name__ == '__main__':
171 | unittest.main()
172 |
--------------------------------------------------------------------------------
/tests/serialize_test.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | import unittest
27 | import azureml
28 | import sys
29 | from azureml.services import _serialize_func, _deserialize_func, _encode, _decode
30 |
31 | def mutually_ref_f():
32 | mutually_ref_g
33 | return 42, mutually_ref_g
34 |
35 | def mutually_ref_g():
36 | return 100, mutually_ref_f
37 |
38 | abc = 200
39 | def reads_global():
40 | return abc
41 |
42 |
43 | class MyClass():
44 | pass
45 |
46 | class BaseClass: pass
47 |
48 | class DerivedClass(BaseClass):
49 | pass
50 |
51 | def reads_class():
52 | return MyClass()
53 |
54 | def reads_derived_class():
55 | return DerivedClass()
56 |
57 | def aliased_function():
58 | return 42
59 |
60 | alias = aliased_function
61 | def calls_aliased_function():
62 | return alias()
63 |
64 | def reads_module():
65 | return sys.version
66 |
67 | class Test_serialize_test(unittest.TestCase):
68 | def make_globals(self):
69 | return {'__builtins__' : __builtins__}
70 |
71 | def test_module(self):
72 | serialized = _serialize_func(reads_module)
73 | glbs = self.make_globals()
74 | f = _deserialize_func(serialized, glbs)
75 | self.assertEqual(f(), sys.version)
76 |
77 | def test_aliasing(self):
78 | serialized = _serialize_func(calls_aliased_function)
79 | glbs = self.make_globals()
80 | f = _deserialize_func(serialized, glbs)
81 | self.assertEqual(f(), 42)
82 |
83 | def test_mutually_ref(self):
84 | global mutually_ref_f, mutually_ref_g
85 |
86 | glbs = self.make_globals()
87 | serialized = _serialize_func(mutually_ref_f)
88 | del mutually_ref_f, mutually_ref_g
89 |
90 | f = _deserialize_func(serialized, glbs)
91 | self.assertEqual(f()[0], 42)
92 |
93 | self.assertEqual(f()[1]()[0], 100)
94 |
95 | def test_reads_global(self):
96 | global abc, reads_global
97 |
98 | glbs = self.make_globals()
99 | s = _serialize_func(reads_global)
100 | del abc, reads_global
101 | f = _deserialize_func(s, glbs)
102 |
103 | self.assertEqual(f(), 200)
104 | pass
105 |
106 | def test_core_types(self):
107 | values = [42, 'abc', b'abc', 100.0, True, False, 3j, None, [1,2,3], (1,2,3), {2:3}]
108 |
109 | for value in values:
110 | self.assertEqual(_decode(_encode(value)), value)
111 |
112 | def test_other_types(self):
113 | try:
114 | import numpy
115 | self.assertTrue(_decode(_encode(numpy.ndarray(42))).all())
116 | except:
117 | return
118 |
119 | def test_reads_class(self):
120 | global reads_class, MyClass
121 |
122 | s = _serialize_func(reads_class)
123 | del reads_class, MyClass
124 |
125 | glbs = self.make_globals()
126 | f = _deserialize_func(s, glbs)
127 |
128 | self.assertTrue(repr(f()).startswith('<__main__.MyClass instance at'))
129 |
130 | #def test_reads_derived_class(self):
131 | # global reads_derived_class, BaseClass, DerivedClass
132 |
133 | # s = _serialize_func(reads_derived_class)
134 | # del reads_derived_class, BaseClass, DerivedClass
135 |
136 | # glbs = self.make_globals()
137 | # f = _deserialize_func(s, glbs)
138 |
139 | # print(glbs)
140 | # print(repr(f()))
141 |
142 | if __name__ == '__main__':
143 | unittest.main()
144 |
--------------------------------------------------------------------------------
/tests/servicestests.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | from azureml import services
27 | import time
28 | import tests
29 | import traceback
30 | import unittest
31 | import lib
32 | import uuid
33 |
34 | def invoke(published_func, *args, **kwargs):
35 | '''helper to repeatedly invoke the function until it becomes available...'''
36 | for i in xrange(100):
37 | time.sleep(5)
38 | try:
39 | return published_func(*args, **kwargs)
40 | break
41 | except Exception as e:
42 | traceback.print_exc()
43 | print(e)
44 |
45 | def invoke_map(published_func, *args):
46 | '''helper to repeatedly invoke the function until it becomes available...'''
47 | for i in xrange(100):
48 | time.sleep(5)
49 | try:
50 | return published_func.map(*args)
51 | break
52 | except Exception as e:
53 | traceback.print_exc()
54 | print(e)
55 |
56 | class Test_services(unittest.TestCase):
57 | def test_service_id(self):
58 | service_id = uuid.UUID(lib.str_typed.service.service_id)
59 | self.assertNotEqual(service_id, uuid.UUID('00000000000000000000000000000000'))
60 |
61 | def test_str_typed(self):
62 | self.assertEqual(invoke(lib.str_typed.service, 'abc', 'def'), 'abcdef')
63 |
64 | def test_attached(self):
65 | self.assertEqual(invoke(lib.attached.service, 'test '), 'test hello world!')
66 |
67 | def test_bool_typed(self):
68 | self.assertEqual(invoke(lib.bool_typed.service, True, False), False)
69 |
70 | def test_float_typed(self):
71 | self.assertEqual(invoke(lib.float_typed.service, 3.0, 5.0), .6)
72 |
73 | def test_multivalue_return(self):
74 | self.assertEqual(invoke(lib.multivalue_return.service, 1, 2), (3, -1))
75 |
76 | def test_map(self):
77 | # invoking via map
78 | self.assertEqual(invoke_map(lib.typed.service, [1, 1], [2, 4]), [3, 5])
79 |
80 | def test_varargs(self):
81 | # style 1, var args
82 | self.assertEqual(invoke(lib.mysum.service, 1, 2, 3), 6)
83 |
84 | def test_interned_values(self):
85 |         # round-trip interned values (small ints and None) through the untyped identity service
86 | self.assertEqual(invoke(lib.untyped_identity.service, [1, 1, None, None]), [1, 1, None, None])
87 |
88 | def test_kwargs(self):
89 | self.assertEqual(invoke(lib.kwargs.service, x = 1, y = 2), {'y': 2, 'x': 1})
90 |
91 | def test_simple_decorator(self):
92 | # style 1, define a function and publish it with a decorator
93 | self.assertEqual(invoke(lib.myfunc.service, 1, 2), [4, -3, 2, 0])
94 |
95 | def test_publish_explicitly(self):
96 | # style 2, define a function and call the publish API explicitly.
97 | self.assertEqual(invoke(lib.published, 1, 2), [3, -1, 2, 0])
98 |
99 | def test_strongly_typed(self):
100 | # a strongly typed version...
101 | self.assertEqual(invoke(lib.typed.service, 1, 2), 3)
102 |
103 | def test_data_frame_input(self):
104 |         # invoke the published dataframe service through the typed wrapper defined in lib.py
105 | self.assertEqual(invoke(lib.dataframe_int, 1, 2), 3.0)
106 |
107 |
108 | #def test_complex_typed(self):
109 | # print(invoke(lib.complex_typed, 3j, 5j))
110 |
111 | def test_consume_published(self):
112 | # style 3, consume an already published service
113 | url, api_key, help_url = lib.published
114 |
115 | @services.service(url, api_key)
116 | def published_func(a, b):
117 | pass
118 |
119 | self.assertEqual(invoke(published_func, 1, 2), [3, -1, 2, 0])
120 |
121 | if __name__ == '__main__':
122 | unittest.main()
123 |
--------------------------------------------------------------------------------
/tests/unittests.py:
--------------------------------------------------------------------------------
1 | #-------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation
3 | # All rights reserved.
4 | #
5 | # MIT License:
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #--------------------------------------------------------------------------
25 |
26 | import math
27 | import sys
28 | import unittest
29 | import os
30 | import pandas as pd
31 | import numpy as np
32 | from os import path
33 | from pandas.util.testing import assert_frame_equal
34 | import random
35 |
36 | from azureml import (
37 | BytesIO,
38 | Workspace,
39 | DataTypeIds,
40 | AzureMLConflictHttpError,
41 | AzureMLHttpError,
42 | UnsupportedDatasetTypeError,
43 | serialize_dataframe,
44 | deserialize_dataframe,
45 | )
46 | from tests import (
47 | id_generator,
48 | load_test_settings,
49 | )
50 |
51 |
52 | EXAMPLE_EXPERIMENT_ID = '506153734175476c4f62416c57734963.f-id.1f022fd4578847dc867d662a51f0a105'
53 | EXAMPLE_EXPERIMENT_DESC = 'Binary Classification: Breast cancer detection'
54 |
55 | EXAMPLE_DATASET_NAME = 'Airport Codes Dataset'
56 | EXAMPLE_UNSUPPORTED_DATASET_NAME = 'Breast cancer data'
57 |
58 | settings = load_test_settings()
59 |
60 |
61 | class WorkspaceTests(unittest.TestCase):
62 | def test_create(self):
63 | # Arrange
64 |
65 | # Act
66 | workspace = Workspace(
67 | workspace_id=settings.workspace.id,
68 | authorization_token=settings.workspace.token,
69 | endpoint=settings.workspace.endpoint
70 | )
71 |
72 | # Assert
73 |
74 | def test_create_ini(self):
75 | # Arrange
76 | try:
77 | with open(path.expanduser('~/.azureml/settings.ini'), 'w') as config:
78 | config.write('''
79 | [workspace]
80 | id=test_id
81 | authorization_token=test_token
82 | api_endpoint=api_endpoint
83 | management_endpoint=management_endpoint
84 | ''')
85 |
86 | workspace = Workspace()
87 | # Assert
88 | self.assertEqual(workspace.workspace_id, 'test_id')
89 | self.assertEqual(workspace.authorization_token, 'test_token')
90 | self.assertEqual(workspace.api_endpoint, 'api_endpoint')
91 | self.assertEqual(workspace.management_endpoint, 'management_endpoint')
92 | finally:
93 | if path.exists(path.expanduser('~/.azureml/settings.ini')):
94 | os.unlink(path.expanduser('~/.azureml/settings.ini'))
95 |
96 | def test_create_json(self):
97 | # Arrange
98 |
99 | # Act
100 |
101 | try:
102 | with open(path.expanduser('~/.azureml/settings.json'), 'w') as config:
103 | config.write('''
104 | {"workspace":{
105 | "id":"test_id",
106 | "authorization_token": "test_token",
107 | "api_endpoint":"api_endpoint",
108 | "management_endpoint":"management_endpoint"
109 | }}''')
110 |
111 | workspace = Workspace()
112 | # Assert
113 | self.assertEqual(workspace.workspace_id, 'test_id')
114 | self.assertEqual(workspace.authorization_token, 'test_token')
115 | self.assertEqual(workspace.api_endpoint, 'api_endpoint')
116 | self.assertEqual(workspace.management_endpoint, 'management_endpoint')
117 | finally:
118 | if path.exists(path.expanduser('~/.azureml/settings.json')):
119 | os.unlink(path.expanduser('~/.azureml/settings.json'))
120 |
121 |
122 | def test_create_no_workspace_id(self):
123 | # Arrange
124 |
125 | # Act
126 | with self.assertRaises(TypeError):
127 | workspace = Workspace(
128 | workspace_id='',
129 | authorization_token=settings.workspace.token,
130 | )
131 |
132 | # Assert
133 |
134 | def test_create_no_workspace_token(self):
135 | # Arrange
136 |
137 | # Act
138 | with self.assertRaises(TypeError):
139 | workspace = Workspace(
140 | workspace_id=settings.workspace.id,
141 | authorization_token='',
142 | )
143 |
144 | # Assert
145 |
146 | def test_create_no_endpoint(self):
147 | # Arrange
148 |
149 | # Act
150 | with self.assertRaises(TypeError):
151 | workspace = Workspace(
152 | workspace_id=settings.workspace.id,
153 | authorization_token=settings.workspace.token,
154 | endpoint=None
155 | )
156 |
157 | # Assert
158 |
159 |
160 | class ExperimentsTests(unittest.TestCase):
161 | def setUp(self):
162 | self.workspace = Workspace(
163 | settings.workspace.id,
164 | settings.workspace.token,
165 | settings.workspace.endpoint
166 | )
167 |
168 | def test_iter(self):
169 | # Arrange
170 |
171 | # Act
172 | all = []
173 | for experiment in self.workspace.experiments:
174 | all.append(experiment)
175 | print(experiment.experiment_id)
176 | print(experiment.description.encode('ascii', 'ignore'))
177 | print('')
178 |
179 | # Assert
180 | self.assertGreater(len(all), 0)
181 |
182 | def test_iter_example_experiments(self):
183 | # Arrange
184 |
185 | # Act
186 | all = []
187 | for experiment in self.workspace.example_experiments:
188 | all.append(experiment)
189 | print(experiment.experiment_id)
190 | print(experiment.description.encode('ascii', 'ignore'))
191 | print('')
192 | self.assertTrue(experiment.is_example)
193 |
194 | # Assert
195 | self.assertGreater(len(all), 0)
196 | self.assertEqual(1, len([e for e in all if e.description == EXAMPLE_EXPERIMENT_DESC]))
197 |
198 | def test_iter_user_experiments(self):
199 | # Arrange
200 |
201 | # Act
202 | all = []
203 | for experiment in self.workspace.user_experiments:
204 | all.append(experiment)
205 | print(experiment.experiment_id)
206 | print(experiment.description.encode('ascii', 'ignore'))
207 | print('')
208 | self.assertFalse(experiment.is_example)
209 |
210 | # Assert
211 | self.assertGreater(len(all), 0)
212 | self.assertEqual(0, len([e for e in all if e.description == EXAMPLE_EXPERIMENT_DESC]))
213 |
214 | def test_len(self):
215 | # Arrange
216 |
217 | # Act
218 | result = len(self.workspace.experiments)
219 |
220 | # Assert
221 | self.assertGreater(result, 0)
222 |
223 | def test_getitem_by_index(self):
224 | # Arrange
225 |
226 | # Act
227 | result = self.workspace.experiments[0]
228 |
229 | # Assert
230 | self.assertIsNotNone(result)
231 |
232 | def test_getitem_by_index_long(self):
233 | if sys.version_info >= (3,):
234 | return
235 |
236 | # Arrange
237 |
238 | # Act
239 | index = long(0) # can't use 0L as that breaks 3.x parsing
240 | result = self.workspace.experiments[index]
241 |
242 | # Assert
243 | self.assertIsNotNone(result)
244 |
245 | def test_getitem_by_index_out_of_range(self):
246 | # Arrange
247 |
248 | # Act
249 | with self.assertRaises(IndexError):
250 | result = self.workspace.experiments[32700]
251 |
252 | # Assert
253 |
254 | def test_getitem_by_id(self):
255 | # Arrange
256 |
257 | # Act
258 | id = settings.intermediateDataset.experiment_id
259 | result = self.workspace.experiments[id]
260 |
261 | # Assert
262 | self.assertIsNotNone(result)
263 | self.assertEqual(result.experiment_id, id)
264 |
265 | def test_getitem_by_id_does_not_exist(self):
266 | # Arrange
267 |
268 | # Act
269 | with self.assertRaises(IndexError):
270 | result = self.workspace.experiments['Does Not Exist']
271 |
272 | # Assert
273 |
274 | def test_repr(self):
275 | # Arrange
276 |
277 | # Act
278 | result = repr(self.workspace.example_experiments)
279 |
280 | # Assert
281 | self.assertIn(EXAMPLE_EXPERIMENT_DESC, result)
282 |
283 |
284 | class ExperimentTests(unittest.TestCase):
285 | def setUp(self):
286 | self.workspace = Workspace(
287 | settings.workspace.id,
288 | settings.workspace.token,
289 | settings.workspace.endpoint
290 | )
291 |
292 | def assertArrayEqual(self, a, b):
293 | if sys.version_info < (3,):
294 | self.assertItemsEqual(a, b)
295 | else:
296 | self.assertCountEqual(a, b)
297 |
298 | def test_metadata(self):
299 | # Arrange
300 | experiment = self.workspace.experiments[
301 | settings.intermediateDataset.experiment_id]
302 |
303 | # Act
304 | print('status.status_code: {0}'.format(experiment.status.status_code))
305 | print('status.status_detail: {0}'.format(experiment.status.status_detail))
306 | print('status.creation_time: {0}'.format(experiment.status.creation_time))
307 | print('description: {0}'.format(experiment.description.encode('ascii','ignore')))
308 | print('creator: {0}'.format(experiment.creator))
309 | print('experiment_id: {0}'.format(experiment.experiment_id))
310 | print('job_id: {0}'.format(experiment.job_id))
311 | print('version_id: {0}'.format(experiment.version_id))
312 | print('etag: {0}'.format(experiment.etag))
313 | print('run_id: {0}'.format(experiment.run_id))
314 | print('is_archived: {0}'.format(experiment.is_archived))
315 | print('is_example: {0}'.format(experiment.is_example))
316 |
317 | # Assert
318 |
319 | def test_repr(self):
320 | # Arrange
321 | experiment = self.workspace.experiments[
322 | settings.intermediateDataset.experiment_id]
323 |
324 | # Act
325 | result = repr(experiment)
326 |
327 | # Assert
328 | expected = u'{0}\t{1}'.format(experiment.experiment_id, experiment.description)
329 | if sys.version_info < (3,):
330 | self.assertEqual(type(result), bytes)
331 | self.assertEqual(result, expected.encode('ascii', 'ignore'))
332 | else:
333 | self.assertEqual(type(result), str)
334 | self.assertEqual(result, expected)
335 |
336 | def test_get_intermediate_dataset(self):
337 | # Arrange
338 | experiment = self.workspace.experiments[
339 | settings.intermediateDataset.experiment_id]
340 |
341 | # Act
342 | result = experiment.get_intermediate_dataset(
343 | settings.intermediateDataset.node_id,
344 | settings.intermediateDataset.port_name,
345 | settings.intermediateDataset.data_type_id
346 | )
347 |
348 | # Assert
349 | self.assertIsNotNone(result)
350 | self.assertEqual(result.workspace, self.workspace)
351 | self.assertEqual(result.experiment, experiment)
352 | self.assertEqual(result.node_id, settings.intermediateDataset.node_id)
353 | self.assertEqual(result.port_name, settings.intermediateDataset.port_name)
354 | self.assertEqual(result.data_type_id, settings.intermediateDataset.data_type_id)
355 |
356 |
357 | class IntermediateDatasetTests(unittest.TestCase):
358 | def setUp(self):
359 | self.workspace = Workspace(
360 | settings.workspace.id,
361 | settings.workspace.token,
362 | settings.workspace.endpoint
363 | )
364 |
365 | self.experiment = self.workspace.experiments[
366 | settings.intermediateDataset.experiment_id]
367 |
368 | self.dataset = self.experiment.get_intermediate_dataset(
369 | settings.intermediateDataset.node_id,
370 | settings.intermediateDataset.port_name,
371 | settings.intermediateDataset.data_type_id
372 | )
373 |
374 | def test_to_dataframe(self):
375 | # Arrange
376 |
377 | # Act
378 | result = self.dataset.to_dataframe()
379 |
380 | # Assert
381 | self.assertGreater(len(result.columns), 0)
382 | self.assertGreater(len(result.values[0]), 0)
383 |
384 | def test_to_dataframe_unsupported_data_type_id(self):
385 | # Arrange
386 | dataset = self.experiment.get_intermediate_dataset(
387 | settings.intermediateDataset.node_id,
388 | settings.intermediateDataset.port_name,
389 | 'Unsupported'
390 | )
391 |
392 | # Act
393 | result = hasattr(dataset, 'to_dataframe')
394 |
395 | # Assert
396 | self.assertFalse(result)
397 |
398 | def test_open(self):
399 | # Arrange
400 |
401 | # Act
402 | result = self.dataset.open()
403 |
404 | # Assert
405 | self.assertIsNotNone(result)
406 | raw_data = result.read()
407 | self.assertGreater(len(raw_data), 0)
408 |
409 | def test_read_as_binary(self):
410 | # Arrange
411 |
412 | # Act
413 | result = self.dataset.read_as_binary()
414 |
415 | # Assert
416 | self.assertGreater(len(result), 0)
417 |
418 | def test_read_as_text(self):
419 | # Arrange
420 |
421 | # Act
422 | result = self.dataset.read_as_text()
423 |
424 | # Assert
425 | self.assertGreater(len(result), 0)
426 |
427 |
428 | class DatasetsTests(unittest.TestCase):
429 | def setUp(self):
430 | self.workspace = Workspace(
431 | settings.workspace.id,
432 | settings.workspace.token,
433 | settings.workspace.endpoint
434 | )
435 |
436 | def test_len(self):
437 | # Arrange
438 |
439 | # Act
440 | result = len(self.workspace.datasets)
441 |
442 | # Assert
443 | self.assertGreater(result, 0)
444 |
445 | def test_getitem_by_index(self):
446 | # Arrange
447 |
448 | # Act
449 | result = self.workspace.datasets[0]
450 |
451 | # Assert
452 | self.assertIsNotNone(result)
453 |
454 | def test_getitem_by_index_long(self):
455 | if sys.version_info >= (3,):
456 | return
457 |
458 | # Arrange
459 |
460 | # Act
461 | index = long(0) # can't use 0L as that breaks 3.x parsing
462 | result = self.workspace.datasets[index]
463 |
464 | # Assert
465 | self.assertIsNotNone(result)
466 |
467 | def test_getitem_by_index_out_of_range(self):
468 | # Arrange
469 |
470 | # Act
471 | with self.assertRaises(IndexError):
472 | result = self.workspace.datasets[32700]
473 |
474 | # Assert
475 |
476 | def test_getitem_by_name(self):
477 | # Arrange
478 |
479 | # Act
480 | result = self.workspace.datasets[EXAMPLE_DATASET_NAME]
481 |
482 | # Assert
483 | self.assertIsNotNone(result)
484 | self.assertEqual(result.name, EXAMPLE_DATASET_NAME)
485 |
486 | def test_getitem_by_name_wrong_case(self):
487 | # Arrange
488 |
489 | # Act
490 | with self.assertRaises(IndexError):
491 | result = self.workspace.datasets[EXAMPLE_DATASET_NAME.upper()]
492 |
493 | # Assert
494 |
495 | def test_getitem_by_name_does_not_exist(self):
496 | # Arrange
497 |
498 | # Act
499 | with self.assertRaises(IndexError):
500 | result = self.workspace.datasets['Does Not Exist']
501 |
502 | # Assert
503 |
504 | def test_iter(self):
505 | # Arrange
506 |
507 | # Act
508 | all = []
509 | for dataset in self.workspace.datasets:
510 | all.append(dataset)
511 | print(dataset.name)
512 |
513 | # Assert
514 | self.assertGreater(len(all), 0)
515 |
516 | def test_iter_example_datasets(self):
517 | # Arrange
518 |
519 | # Act
520 | all = []
521 | for dataset in self.workspace.example_datasets:
522 | all.append(dataset)
523 | print(dataset.dataset_id)
524 | print(dataset.name)
525 | print(dataset.data_type_id)
526 | print('')
527 | self.assertTrue(dataset.is_example)
528 |
529 | # Assert
530 | self.assertGreater(len(all), 0)
531 |         self.assertEqual(1, len([a for a in all if a.name == EXAMPLE_DATASET_NAME]))
532 |
533 | def test_iter_user_datasets(self):
534 | # Arrange
535 |
536 | # Act
537 | all = []
538 | for dataset in self.workspace.user_datasets:
539 | all.append(dataset)
540 | print(dataset.dataset_id)
541 | print(dataset.name)
542 | print(dataset.data_type_id)
543 | print('')
544 | self.assertFalse(dataset.is_example)
545 |
546 | # Assert
547 | self.assertGreater(len(all), 0)
548 |         self.assertEqual(0, len([a for a in all if a.name == EXAMPLE_DATASET_NAME]))
549 |
550 | def test_repr(self):
551 | # Arrange
552 |
553 | # Act
554 | result = repr(self.workspace.example_datasets)
555 |
556 | # Assert
557 | self.assertIn('{0}\n'.format(EXAMPLE_DATASET_NAME), result)
558 |
559 |
560 | class UploadTests(unittest.TestCase):
561 | def setUp(self):
562 | self.workspace = Workspace(
563 | settings.workspace.id,
564 | settings.workspace.token,
565 | settings.workspace.endpoint
566 | )
567 |
568 | self.original_data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
569 | self.original_dataframe = pd.DataFrame(self.original_data)
570 | self.original_name = 'unittestcsvwh' + id_generator()
571 | self.original_description = 'safe to be deleted - ' + self.original_name
572 |
573 | self.updated_data = [{'a': 101, 'b': 102}, {'a': 105, 'b': 110, 'c': 120}]
574 | self.updated_dataframe = pd.DataFrame(self.updated_data)
575 | self.updated_name = 'unittestcsvwhupdate' + id_generator()
576 | self.updated_description = 'updated'
577 |
578 |
579 | def test_add_from_dataframe(self):
580 | # Arrange
581 |
582 | # Act
583 | result = self.workspace.datasets.add_from_dataframe(
584 | self.original_dataframe,
585 | DataTypeIds.GenericCSV,
586 | self.original_name,
587 | self.original_description,
588 | )
589 |
590 | # Assert
591 | self.assertIsNotNone(result)
592 | self.assertEqual(result.name, self.original_name)
593 | self.assertEqual(result.description, self.original_description)
594 | self.assertEqual(result.data_type_id, DataTypeIds.GenericCSV)
595 | self.assertEqual(result.owner, 'Python SDK')
596 | self.assertIsNotNone(self.workspace.datasets[self.original_name])
597 |
598 | def test_add_from_dataframe_conflict(self):
599 | # Arrange
600 | self.workspace.datasets.add_from_dataframe(
601 | self.original_dataframe,
602 | DataTypeIds.GenericCSV,
603 | self.original_name,
604 | self.original_description,
605 | )
606 |
607 | # Act
608 | with self.assertRaises(AzureMLConflictHttpError):
609 | result = self.workspace.datasets.add_from_dataframe(
610 | self.original_dataframe,
611 | DataTypeIds.GenericCSV,
612 | self.original_name,
613 | self.original_description,
614 | )
615 |
616 | # Assert
617 |
618 | def test_update_from_dataframe(self):
619 | # Arrange
620 | dataset = self.workspace.datasets.add_from_dataframe(
621 | self.original_dataframe,
622 | DataTypeIds.GenericCSV,
623 | self.original_name,
624 | self.original_description,
625 | )
626 |
627 | # Act
628 | result = dataset.update_from_dataframe(self.updated_dataframe)
629 |
630 | # Assert
631 | self.assertIsNone(result)
632 | actual_dataframe = dataset.to_dataframe()
633 | self.assertEqual(dataset.name, self.original_name)
634 | self.assertEqual(dataset.description, self.original_description)
635 | self.assertEqual(dataset.data_type_id, DataTypeIds.GenericCSV)
636 | assert_frame_equal(actual_dataframe, self.updated_dataframe)
637 |
638 | def test_update_from_dataframe_with_type_id_name_description(self):
639 | # Arrange
640 | dataset = self.workspace.datasets.add_from_dataframe(
641 | self.original_dataframe,
642 | DataTypeIds.GenericCSV,
643 | self.original_name,
644 | self.original_description,
645 | )
646 |
647 | # Act
648 | result = dataset.update_from_dataframe(
649 | self.updated_dataframe,
650 | DataTypeIds.GenericTSV,
651 | self.updated_name,
652 | self.updated_description)
653 |
654 | # Assert
655 | self.assertIsNone(result)
656 | actual_dataframe = dataset.to_dataframe()
657 | self.assertEqual(dataset.name, self.updated_name)
658 | self.assertEqual(dataset.description, self.updated_description)
659 | self.assertEqual(dataset.data_type_id, DataTypeIds.GenericTSV)
660 | assert_frame_equal(actual_dataframe, self.updated_dataframe)
661 |
662 | def test_add_from_dataframe_invalid_name(self):
663 | # Arrange
664 | invalid_name = 'unittestcsvwh:' + id_generator()
665 |
666 | # Act
667 | try:
668 | result = self.workspace.datasets.add_from_dataframe(
669 | self.original_dataframe,
670 | DataTypeIds.GenericCSV,
671 | invalid_name,
672 | self.original_description,
673 | )
674 | self.assertTrue(False, 'Failed to raise AzureMLHttpError.')
675 | except AzureMLHttpError as error:
676 | self.assertIn('forbidden characters', str(error))
677 | self.assertEqual(error.status_code, 400)
678 |
679 | # Assert
680 |
681 | def test_add_from_raw_data(self):
682 | # Arrange
683 | original_raw_data = _frame_to_raw_data(self.original_dataframe, ',', True)
684 |
685 | # Act
686 | result = self.workspace.datasets.add_from_raw_data(
687 | original_raw_data,
688 | DataTypeIds.GenericCSV,
689 | self.original_name,
690 | self.original_description,
691 | )
692 |
693 | # Assert
694 | self.assertIsNotNone(result)
695 | self.assertIsNotNone(self.workspace.datasets[self.original_name])
696 | self.assertEqual(result.name, self.original_name)
697 | self.assertEqual(result.description, self.original_description)
698 |
699 | def test_add_from_raw_data_chunked(self):
700 | original_name = 'unittestcsvwh' + id_generator()
701 |
702 | # Arrange
703 |         original_raw_data = bytes(bytearray(random.randint(0, 255) for x in range(0x800000)))
704 |
705 | # Act
706 | result = self.workspace.datasets.add_from_raw_data(
707 | original_raw_data,
708 | DataTypeIds.GenericCSV,
709 | original_name,
710 | 'test description',
711 | )
712 |
713 | # Assert
714 | self.assertIsNotNone(result)
715 | self.assertIsNotNone(self.workspace.datasets[original_name])
716 | self.assertEqual(result.name, original_name)
717 |
718 | new_data = self.workspace.datasets[original_name].read_as_binary()
719 | self.assertEqual(original_raw_data, new_data)
720 |
721 |
722 | def test_update_from_raw_data(self):
723 | # Arrange
724 | dataset = self.workspace.datasets.add_from_dataframe(
725 | self.original_dataframe,
726 | DataTypeIds.GenericCSV,
727 | self.original_name,
728 | self.original_description,
729 | )
730 |
731 | updated_raw_data = _frame_to_raw_data(self.updated_dataframe, ',', True)
732 |
733 | # Act
734 | result = dataset.update_from_raw_data(updated_raw_data)
735 |
736 | # Assert
737 | self.assertIsNone(result)
738 | actual_dataframe = dataset.to_dataframe()
739 | self.assertEqual(dataset.name, self.original_name)
740 | self.assertEqual(dataset.description, self.original_description)
741 | self.assertEqual(dataset.data_type_id, DataTypeIds.GenericCSV)
742 | assert_frame_equal(actual_dataframe, self.updated_dataframe)
743 |
744 | def test_update_from_raw_data_with_data_type_id_name_description(self):
745 | # Arrange
746 | dataset = self.workspace.datasets.add_from_dataframe(
747 | self.original_dataframe,
748 | DataTypeIds.GenericCSV,
749 | self.original_name,
750 | self.original_description,
751 | )
752 |
753 | updated_raw_data = _frame_to_raw_data(self.updated_dataframe, '\t', True)
754 |
755 | # Act
756 | result = dataset.update_from_raw_data(
757 | updated_raw_data,
758 | DataTypeIds.GenericTSV,
759 | self.updated_name,
760 | self.updated_description,
761 | )
762 |
763 | # Assert
764 | self.assertIsNone(result)
765 | actual_dataframe = dataset.to_dataframe()
766 | self.assertEqual(dataset.name, self.updated_name)
767 | self.assertEqual(dataset.description, self.updated_description)
768 | self.assertEqual(dataset.data_type_id, DataTypeIds.GenericTSV)
769 | assert_frame_equal(actual_dataframe, self.updated_dataframe)
770 |
771 | def test_update_from_dataframe_example_dataset(self):
772 | # Arrange
773 | dataset = self.workspace.example_datasets[0]
774 |
775 | # Act
776 | result = hasattr(dataset, 'update_from_dataframe')
777 |
778 | # Assert
779 | self.assertFalse(result)
780 |
781 | def test_update_from_raw_data_example_dataset(self):
782 | # Arrange
783 | dataset = self.workspace.example_datasets[0]
784 |
785 | # Act
786 | result = hasattr(dataset, 'update_from_raw_data')
787 |
788 | # Assert
789 | self.assertFalse(result)
790 |
791 |
792 | class DatasetTests(unittest.TestCase):
793 | def setUp(self):
794 | self.workspace = Workspace(
795 | settings.workspace.id,
796 | settings.workspace.token,
797 | settings.workspace.endpoint
798 | )
799 |
800 | def assertArrayEqual(self, a, b):
801 | if sys.version_info < (3,):
802 | self.assertItemsEqual(a, b)
803 | else:
804 | self.assertCountEqual(a, b)
805 |
806 | def test_metadata(self):
807 | # Arrange
808 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME]
809 |
810 | # Act
811 | print('visualize_end_point.base_uri: {0}'.format(dataset.visualize_end_point.base_uri))
812 | print('visualize_end_point.size: {0}'.format(dataset.visualize_end_point.size))
813 | print('visualize_end_point.endpoint_type: {0}'.format(dataset.visualize_end_point.endpoint_type))
814 | print('visualize_end_point.credential_container: {0}'.format(dataset.visualize_end_point.credential_container))
815 | print('visualize_end_point.access_credential: {0}'.format(dataset.visualize_end_point.access_credential))
816 | print('visualize_end_point.location: {0}'.format(dataset.visualize_end_point.location))
817 | print('visualize_end_point.file_type: {0}'.format(dataset.visualize_end_point.file_type))
818 | print('visualize_end_point.is_auxiliary: {0}'.format(dataset.visualize_end_point.is_auxiliary))
819 | print('visualize_end_point.name: {0}'.format(dataset.visualize_end_point.name))
820 | print('schema_end_point.base_uri: {0}'.format(dataset.schema_end_point.base_uri))
821 | print('schema_end_point.size: {0}'.format(dataset.schema_end_point.size))
822 | print('schema_end_point.endpoint_type: {0}'.format(dataset.schema_end_point.endpoint_type))
823 | print('schema_end_point.credential_container: {0}'.format(dataset.schema_end_point.credential_container))
824 | print('schema_end_point.access_credential: {0}'.format(dataset.schema_end_point.access_credential))
825 | print('schema_end_point.location: {0}'.format(dataset.schema_end_point.location))
826 | print('schema_end_point.file_type: {0}'.format(dataset.schema_end_point.file_type))
827 | print('schema_end_point.is_auxiliary: {0}'.format(dataset.schema_end_point.is_auxiliary))
828 | print('schema_end_point.name: {0}'.format(dataset.schema_end_point.name))
829 | print('schema_status: {0}'.format(dataset.schema_status))
830 | print('dataset_id: {0}'.format(dataset.dataset_id))
831 | print('data_type_id: {0}'.format(dataset.data_type_id))
832 | print('name: {0}'.format(dataset.name))
833 | print('description: {0}'.format(dataset.description))
834 | print('family_id: {0}'.format(dataset.family_id))
835 | print('resource_upload_id: {0}'.format(dataset.resource_upload_id))
836 | print('source_origin: {0}'.format(dataset.source_origin))
837 | print('size: {0}'.format(dataset.size))
838 | print('created_date: {0}'.format(dataset.created_date))
839 | print('owner: {0}'.format(dataset.owner))
840 | print('experiment_id: {0}'.format(dataset.experiment_id))
841 | print('client_version: {0}'.format(dataset.client_version))
842 | print('promoted_from: {0}'.format(dataset.promoted_from))
843 | print('uploaded_from_filename: {0}'.format(dataset.uploaded_from_filename))
844 | print('service_version: {0}'.format(dataset.service_version))
845 | print('is_latest: {0}'.format(dataset.is_latest))
846 | print('category: {0}'.format(dataset.category))
847 | print('download_location.base_uri: {0}'.format(dataset.download_location.base_uri))
848 | print('download_location.size: {0}'.format(dataset.download_location.size))
849 | print('download_location.endpoint_type: {0}'.format(dataset.download_location.endpoint_type))
850 | print('download_location.credential_container: {0}'.format(dataset.download_location.credential_container))
851 | print('download_location.access_credential: {0}'.format(dataset.download_location.access_credential))
852 | print('download_location.location: {0}'.format(dataset.download_location.location))
853 | print('download_location.file_type: {0}'.format(dataset.download_location.file_type))
854 | print('download_location.is_auxiliary: {0}'.format(dataset.download_location.is_auxiliary))
855 | print('download_location.name: {0}'.format(dataset.download_location.name))
856 | print('is_deprecated: {0}'.format(dataset.is_deprecated))
857 | print('culture: {0}'.format(dataset.culture))
858 | print('batch: {0}'.format(dataset.batch))
859 | print('created_date_ticks: {0}'.format(dataset.created_date_ticks))
860 |
861 | # Assert
862 |
863 | def test_repr(self):
864 | # Arrange
865 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME]
866 |
867 | # Act
868 | result = repr(dataset)
869 |
870 | # Assert
871 | self.assertEqual(dataset.name, result)
872 |
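# to_dataframe() downloads the dataset contents and returns them as a pandas
# DataFrame; it is only present when the dataset's data_type_id is a supported
# tabular format (see test_to_dataframe_unsupported_data_type_id below).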
873 | def test_to_dataframe(self):
874 | # Arrange
875 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME]
876 |
877 | # Act
878 | result = dataset.to_dataframe()
879 |
880 | # Assert
881 | self.assertArrayEqual(
882 | result.columns,
883 | [u'airport_id', u'city', u'state', u'name'])
884 | self.assertArrayEqual(
885 | result.values[0],
886 | [10165, 'Adak Island', 'AK', 'Adak'])
887 | self.assertArrayEqual(
888 | result.values[-1],
889 | [14543, 'Rock Springs', 'WY', 'Rock Springs Sweetwater County'])
890 |
891 | def test_to_dataframe_unsupported_data_type_id(self):
892 | # Arrange
893 | dataset = self.workspace.datasets[EXAMPLE_UNSUPPORTED_DATASET_NAME]
894 |
895 | # Act
896 | result = hasattr(dataset, 'to_dataframe')
897 |
898 | # Assert
899 | self.assertFalse(result)
900 |
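# open() returns a file-like object over the raw dataset bytes, while
# read_as_binary() and read_as_text() return the full contents as bytes and
# text respectively.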
901 | def test_open(self):
902 | # Arrange
903 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME]
904 |
905 | # Act
906 | result = dataset.open()
907 |
908 | # Assert
909 | self.assertIsNotNone(result)
910 | raw_data = result.read()
911 | expected = b'airport_id,city,state,name\r\n10165,Adak Island, AK, Adak'
912 | self.assertEqual(raw_data[:len(expected)], expected)
913 |
914 | def test_read_as_binary(self):
915 | # Arrange
916 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME]
917 |
918 | # Act
919 | result = dataset.read_as_binary()
920 |
921 | # Assert
922 | expected = b'airport_id,city,state,name\r\n10165,Adak Island, AK, Adak'
923 | self.assertEqual(result[:len(expected)], expected)
924 |
925 | def test_read_as_text(self):
926 | # Arrange
927 | dataset = self.workspace.datasets[EXAMPLE_DATASET_NAME]
928 |
929 | # Act
930 | result = dataset.read_as_text()
931 |
932 | # Assert
933 | lines = result.splitlines()
934 | self.assertEqual(lines[0], 'airport_id,city,state,name')
935 | self.assertEqual(lines[1], '10165,Adak Island, AK, Adak')
936 | self.assertEqual(lines[-1], '14543,Rock Springs, WY, Rock Springs Sweetwater County')
937 |
938 |
939 | class SerializationTests(unittest.TestCase):
940 | def assertArrayEqual(self, a, b):
941 | if sys.version_info < (3,):
942 | self.assertItemsEqual(a, b)
943 | else:
944 | self.assertCountEqual(a, b)
945 |
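# serialize_dataframe(writer, data_type_id, dataframe) writes the frame to a
# binary stream in the format identified by data_type_id; the tests below
# cover CSV and TSV (with and without headers) and plain text.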
946 | def test_serialize_to_csv(self):
947 | # Arrange
948 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
949 | dataframe = pd.DataFrame(data)
950 |
951 | # Act
952 | writer = BytesIO()
953 | serialize_dataframe(writer, DataTypeIds.GenericCSV, dataframe)
954 | result = writer.getvalue()
955 |
956 | # Assert
957 | self.assertGreater(len(result), 0)
958 | self.assertEqual(result, b'a,b,c\n1.0,2.0,\n5.1,10.1,20.1\n')
959 |
960 | def test_serialize_to_csv_no_header(self):
961 | # Arrange
962 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
963 | dataframe = pd.DataFrame(data)
964 |
965 | # Act
966 | writer = BytesIO()
967 | serialize_dataframe(writer, DataTypeIds.GenericCSVNoHeader, dataframe)
968 | result = writer.getvalue()
969 |
970 | # Assert
971 | self.assertGreater(len(result), 0)
972 | self.assertEqual(result, b'1.0,2.0,\n5.1,10.1,20.1\n')
973 |
974 | def test_serialize_to_tsv(self):
975 | # Arrange
976 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
977 | dataframe = pd.DataFrame(data)
978 |
979 | # Act
980 | writer = BytesIO()
981 | serialize_dataframe(writer, DataTypeIds.GenericTSV, dataframe)
982 | result = writer.getvalue()
983 |
984 | # Assert
985 | self.assertGreater(len(result), 0)
986 | self.assertEqual(result, b'a\tb\tc\n1.0\t2.0\t\n5.1\t10.1\t20.1\n')
987 |
988 | def test_serialize_to_tsv_no_header(self):
989 | # Arrange
990 | data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
991 | dataframe = pd.DataFrame(data)
992 |
993 | # Act
994 | writer = BytesIO()
995 | serialize_dataframe(writer, DataTypeIds.GenericTSVNoHeader, dataframe)
996 | result = writer.getvalue()
997 |
998 | # Assert
999 | self.assertGreater(len(result), 0)
1000 | self.assertEqual(result, b'1.0\t2.0\t\n5.1\t10.1\t20.1\n')
1001 |
1002 | def test_serialize_to_plain_text(self):
1003 | # Arrange
1004 | data = ['This is the first', 'This is second line']
1005 | dataframe = pd.DataFrame(data)
1006 |
1007 | # Act
1008 | writer = BytesIO()
1009 | serialize_dataframe(writer, DataTypeIds.PlainText, dataframe)
1010 | result = writer.getvalue()
1011 |
1012 | # Assert
1013 | self.assertGreater(len(result), 0)
1014 | self.assertEqual(result, b'This is the first\nThis is second line\n')
1015 |
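# deserialize_dataframe(reader, data_type_id) reads a binary stream back into
# a pandas DataFrame; the tests below cover plain text and CSV, including
# inputs with a UTF-8 byte order mark and extra spaces after delimiters.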
1016 | def test_deserialize_from_plain_text_bom(self):
1017 | # Arrange
1018 | data = b'\xef\xbb\xbfJohn enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.\r\nMicrosoft announced upgrades to their line of products for information workers. The announcement was made at a partner conference at Boston.'
1019 |
1020 | # Act
1021 | reader = BytesIO(data)
1022 | result = deserialize_dataframe(reader, DataTypeIds.PlainText)
1023 |
1024 | # Assert
1025 | self.assertIsNotNone(result)
1026 | expected = [
1027 | {0: 'John enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.'},
1028 | {0: 'Microsoft announced upgrades to their line of products for information workers. The announcement was made at a partner conference at Boston.'},
1029 | ]
1030 | assert_frame_equal(pd.DataFrame(expected), result)
1031 |
1032 | def test_deserialize_from_csv(self):
1033 | # Arrange
1034 | data = b'a,b,c\n1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1035 |
1036 | # Act
1037 | reader = BytesIO(data)
1038 | result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)
1039 |
1040 | # Assert
1041 | self.assertIsNotNone(result)
1042 | expected = [
1043 | {'a': 1.0, 'b': 2.0},
1044 | {'a': 5.1, 'b': 10.1, 'c': 20.1},
1045 | {'a': 50.2, 'c': 50.3},
1046 | ]
1047 | assert_frame_equal(pd.DataFrame(expected), result)
1048 |
1049 | def test_deserialize_from_csv_bom(self):
1050 | # Arrange
1051 | data = b'\xef\xbb\xbfa,b,c\n1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1052 |
1053 | # Act
1054 | reader = BytesIO(data)
1055 | result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)
1056 |
1057 | # Assert
1058 | self.assertIsNotNone(result)
1059 | expected = [
1060 | {'a': 1.0, 'b': 2.0},
1061 | {'a': 5.1, 'b': 10.1, 'c': 20.1},
1062 | {'a': 50.2, 'c': 50.3},
1063 | ]
1064 | assert_frame_equal(pd.DataFrame(expected), result)
1065 |
1066 | def test_deserialize_from_csv_spaces(self):
1067 | # Arrange
1068 | data = b'a, b, c\n1.0, two, nan\n5.1, "ten point one", 20.1\n50.2, , 50.3\n'
1069 |
1070 | # Act
1071 | reader = BytesIO(data)
1072 | result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)
1073 |
1074 | # Assert
1075 | self.assertIsNotNone(result)
1076 | expected = [
1077 | {'a': 1.0, 'b': 'two'},
1078 | {'a': 5.1, 'b': 'ten point one', 'c': 20.1},
1079 | {'a': 50.2, 'c': 50.3},
1080 | ]
1081 | assert_frame_equal(pd.DataFrame(expected), result)
1082 |
1083 | def test_deserialize_from_csv_no_header(self):
1084 | # Arrange
1085 | data = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1086 |
1087 | # Act
1088 | reader = BytesIO(data)
1089 | result = deserialize_dataframe(reader, DataTypeIds.GenericCSVNoHeader)
1090 |
1091 | # Assert
1092 | self.assertIsNotNone(result)
1093 | expected = [
1094 | {0: 1.0, 1: 2.0},
1095 | {0: 5.1, 1: 10.1, 2: 20.1},
1096 | {0: 50.2, 2: 50.3},
1097 | ]
1098 | assert_frame_equal(pd.DataFrame(expected), result)
1099 |
1100 | @unittest.skip('ARFF is not supported yet.')
1101 | def test_deserialize_from_arff(self):
1102 | # Arrange
1103 | data = b"""@RELATION Unnamed
1104 |
1105 | @ATTRIBUTE Class NUMERIC
1106 | @ATTRIBUTE age NUMERIC
1107 | @ATTRIBUTE menopause NUMERIC
1108 | @ATTRIBUTE tumor-size NUMERIC
1109 |
1110 | @DATA
1111 | 0,5,1,1
1112 | 0,5,4,4
1113 | 1,4,8,8
1114 |
1115 | """
1116 |
1117 | # Act
1118 | reader = BytesIO(data)
1119 | result = deserialize_dataframe(reader, DataTypeIds.ARFF)
1120 | print(result)
1121 |
1122 | # Assert
1123 | self.assertIsNotNone(result)
1124 | expected = [
1125 | {'Class': 0., 'age': 5., 'menopause': 1., 'tumor-size': 1.},
1126 | {'Class': 0., 'age': 5., 'menopause': 4., 'tumor-size': 4.},
1127 | {'Class': 1., 'age': 4., 'menopause': 8., 'tumor-size': 8.},
1128 | ]
1129 | assert_frame_equal(pd.DataFrame(expected), result)
1130 |
1131 | def test_deserialize_from_unsupported_data_type_id(self):
1132 | # Arrange
1133 | data = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'
1134 |
1135 | # Act
1136 | reader = BytesIO(data)
1137 | with self.assertRaises(UnsupportedDatasetTypeError):
1138 | deserialize_dataframe(reader, 'Unsupported')
1139 |
1140 | # Assert (the expected failure is verified by assertRaises above)
1141 |
1142 |
1143 | def _frame_to_raw_data(dataframe, sep, header):
1144 | return dataframe.to_csv(sep=sep, header=header, index=False, encoding='utf-8')
1145 |
1146 |
1147 | if __name__ == '__main__':
1148 | unittest.main()
1149 |
--------------------------------------------------------------------------------