├── .ci ├── azure-pipelines-v2.yml ├── azure-pipelines.yml ├── steps │ ├── ai-architecture-template.yml │ └── papermill.yml └── vars │ └── deployment_params.yml ├── .gitignore ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── environment.yml ├── notebooks ├── 00_AMLConfiguration.ipynb ├── 01_DataPrep.ipynb ├── 02_TrainOnLocal.ipynb ├── 03_DevelopScoringScript.ipynb ├── 04_CreateImage.ipynb ├── 05_DeployOnAKS.ipynb ├── 06_SpeedTestWebApp.ipynb ├── 07_RealTimeScoring.ipynb ├── 08_TearDown.ipynb ├── Makefile ├── __init__.py └── dev_env_template ├── project_sample.yml ├── pytest.ini ├── sample_workspace_conf.yml └── tests ├── __init__.py └── test_notebooks.py /.ci/azure-pipelines-v2.yml: -------------------------------------------------------------------------------- 1 | # ML Realtime Scoring Pipeline 2 | # 3 | # A Github Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub" 4 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml 5 | # 6 | # An Agent_Name Variable must be created in the Azure DevOps UI. 7 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables 8 | # 9 | # This must point to an Agent Pool, with a Self-Hosted Linux VM with Docker. 
10 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops 11 | 12 | resources: 13 | repositories: 14 | - repository: aitemplates 15 | type: github 16 | name: microsoft/AI 17 | endpoint: AIArchitecturesAndPractices-GitHub 18 | 19 | trigger: 20 | batch: true 21 | branches: 22 | include: 23 | - master 24 | 25 | pr: 26 | autoCancel: true 27 | branches: 28 | include: 29 | - master 30 | 31 | variables: 32 | - template: ./vars/deployment_params.yml 33 | 34 | stages: 35 | - template: .ci/stages/deploy_notebooks_stages_v5.yml@aitemplates 36 | parameters: 37 | Agent: $(Agent_Name) 38 | jobDisplayName: az-ml-realtime-score 39 | TridentWorkloadTypeShort: ${{ variables.TridentWorkloadTypeShort }} 40 | DeployLocation: ${{ variables.DeployLocation }} 41 | ProjectLocation: ${{ variables.ProjectLocation }} 42 | conda: ${{ variables.conda }} 43 | post_cleanup: false 44 | 45 | flighting_release: false 46 | flighting_preview: false 47 | flighting_master: false 48 | -------------------------------------------------------------------------------- /.ci/azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # AI Architecture Template TODO: update title 2 | # 3 | # A Github Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub" 4 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml 5 | # 6 | # An Agent_Name Variable must be created in the Azure DevOps UI. 7 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables 8 | # 9 | # This must point to an Agent Pool, with a Self-Hosted Linux VM with Docker. 
10 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops 11 | 12 | resources: 13 | repositories: 14 | - repository: aitemplates 15 | type: github 16 | name: microsoft/AI 17 | endpoint: AIArchitecturesAndPractices-GitHub 18 | 19 | schedules: 20 | - cron: "*/10 * * * *" 21 | displayName: Build every 10 minutes 22 | always: true 23 | branches: 24 | include: 25 | - master 26 | # MLAKSDeploy Pipeline 27 | 28 | 29 | trigger: 30 | batch: true 31 | branches: 32 | include: 33 | - master 34 | 35 | pr: 36 | autoCancel: true 37 | branches: 38 | include: 39 | - master 40 | 41 | stages: 42 | - template: .ci/stages/deploy_notebooks_stages_v2.yml@aitemplates 43 | parameters: 44 | Agent: $(Agent_Name) 45 | jobDisplayName: ai-architecture-template #TODO: Update with project name 46 | DefaultWorkingDirectory: $(System.DefaultWorkingDirectory) 47 | workload_vars: ../vars/ai-architecture-template.yml #TODO: Update with project name 48 | flighting_release: false 49 | flighting_preview: false 50 | flighting_master: false 51 | -------------------------------------------------------------------------------- /.ci/steps/ai-architecture-template.yml: -------------------------------------------------------------------------------- 1 | # AI Architecture Template TODO: update title 2 | # 3 | # A Github Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub" 4 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml 5 | # 6 | # An Agent_Name Variable must be created in the Azure DevOps UI. 7 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables 8 | # 9 | # This must point to an Agent Pool, with a Self-Hosted Linux VM with Docker. 
10 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops 11 | 12 | parameters: 13 | azureSubscription: '' 14 | azure_subscription: '' 15 | location: '' 16 | azureresourcegroup: '' 17 | workspacename: '' 18 | azureregion: westus2 19 | aksimagename: '' 20 | aks_name: '' 21 | aks_service_name: myimage 22 | conda: '' 23 | doCleanup: true 24 | python_path: '' 25 | flighting_release: false 26 | flighting_preview: false 27 | flighting_master: false 28 | 29 | steps: 30 | - template: config_conda.yml 31 | parameters: 32 | conda_location: . 33 | azureSubscription: ${{parameters.azureSubscription}} 34 | conda: ai-architecture-template 35 | flighting_release: ${{parameters.flighting_release}} 36 | flighting_preview: ${{parameters.flighting_preview}} 37 | flighting_master: ${{parameters.flighting_master}} 38 | 39 | - template: azpapermill.yml 40 | parameters: 41 | notebook: 00_AMLConfiguration.ipynb 42 | location: ${{parameters.location}} 43 | azureSubscription: ${{parameters.azureSubscription}} 44 | conda: ai-architecture-template 45 | azure_subscription: ${{parameters.azure_subscription}} 46 | azureresourcegroup: ${{parameters.azureresourcegroup}} 47 | workspacename: "aiarchtemplate" 48 | azureregion: ${{parameters.azureregion}} 49 | aksimagename: ${{parameters.aksimagename}} 50 | 51 | # Insert more notebook steps here 52 | 53 | - template: pytest_steps.yml 54 | parameters: 55 | location: ${{parameters.location}} 56 | azureSubscription: ${{parameters.azureSubscription}} 57 | conda: ai-architecture-template 58 | 59 | - template: cleanuptask.yml 60 | parameters: 61 | azureSubscription: ${{parameters.azureSubscription}} 62 | conda: ${{parameters.conda}} 63 | azureresourcegroup: ${{parameters.azureresourcegroup}} 64 | doCleanup: ${{parameters.doCleanup}} -------------------------------------------------------------------------------- /.ci/steps/papermill.yml: -------------------------------------------------------------------------------- 1 
| 2 | 3 | parameters: 4 | notebook: 01_DataPrep.ipynb # defaults for any parameters that aren't specified 5 | location: "{{cookiecutter.project_name}}" 6 | 7 | 8 | steps: 9 | - bash: | 10 | source /usr/share/miniconda/etc/profile.d/conda.sh 11 | conda activate MLAKSDeployAML 12 | export PYTHONPATH=$(pwd)/{{cookiecutter.project_name}}:${PYTHONPATH} 13 | cd ${{parameters.location}} 14 | echo Execute ${{parameters.notebook}} 15 | papermill ${{parameters.notebook}} output.ipynb \ 16 | --log-output \ 17 | --no-progress-bar \ 18 | -k python3 19 | sleep 30 20 | displayName: '${{parameters.notebook}}' -------------------------------------------------------------------------------- /.ci/vars/deployment_params.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | TridentWorkloadTypeShort: azmlrts 3 | DeployLocation: westus 4 | ProjectLocation: "." 5 | conda: az-ml-realtime-score 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Project Configuration Files 3 | workspace_conf.yml 4 | *.output_ipynb 5 | .azureml 6 | pylint-results.xml 7 | project.yml 8 | .idea 9 | score.py 10 | 11 | #AML 12 | aml_config/ 13 | scripts/aml_config/ 14 | assets/ 15 | scripts/assets/ 16 | .amlignore 17 | scripts/.amlignore 18 | scripts/__pycache__/ 19 | 20 | # Environments 21 | .env 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | pip-wheel-metadata/ 46 | share/python-wheels/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | MANIFEST 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a 
template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .nox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *.cover 72 | *.py,cover 73 | .hypothesis/ 74 | .pytest_cache/ 75 | 76 | # Translations 77 | *.mo 78 | *.pot 79 | 80 | # Django stuff: 81 | *.log 82 | local_settings.py 83 | db.sqlite3 84 | db.sqlite3-journal 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | .python-version 108 | 109 | # pipenv 110 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 111 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 112 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 113 | # install all needed dependencies. 114 | #Pipfile.lock 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | /project.yml 153 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python module names) to load, 30 | # usually to register additional checkers. 
31 | load-plugins=pylint_junit 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 
63 | disable=global-variable-undefined, 64 | global-statement, 65 | too-many-arguments, 66 | too-many-function-args, 67 | pointless-statement, 68 | missing-module-docstring, 69 | trailing-whitespace, 70 | fixme, 71 | print-statement, 72 | parameter-unpacking, 73 | unpacking-in-except, 74 | old-raise-syntax, 75 | backtick, 76 | long-suffix, 77 | old-ne-operator, 78 | old-octal-literal, 79 | import-star-module-level, 80 | non-ascii-bytes-literal, 81 | raw-checker-failed, 82 | bad-inline-option, 83 | locally-disabled, 84 | file-ignored, 85 | suppressed-message, 86 | useless-suppression, 87 | deprecated-pragma, 88 | use-symbolic-message-instead, 89 | apply-builtin, 90 | basestring-builtin, 91 | buffer-builtin, 92 | cmp-builtin, 93 | coerce-builtin, 94 | execfile-builtin, 95 | file-builtin, 96 | long-builtin, 97 | raw_input-builtin, 98 | reduce-builtin, 99 | standarderror-builtin, 100 | unicode-builtin, 101 | xrange-builtin, 102 | coerce-method, 103 | delslice-method, 104 | getslice-method, 105 | setslice-method, 106 | no-absolute-import, 107 | old-division, 108 | dict-iter-method, 109 | dict-view-method, 110 | next-method-called, 111 | metaclass-assignment, 112 | indexing-exception, 113 | raising-string, 114 | reload-builtin, 115 | oct-method, 116 | hex-method, 117 | nonzero-method, 118 | cmp-method, 119 | input-builtin, 120 | round-builtin, 121 | intern-builtin, 122 | unichr-builtin, 123 | map-builtin-not-iterating, 124 | zip-builtin-not-iterating, 125 | range-builtin-not-iterating, 126 | filter-builtin-not-iterating, 127 | using-cmp-argument, 128 | eq-without-hash, 129 | div-method, 130 | idiv-method, 131 | rdiv-method, 132 | exception-message-attribute, 133 | invalid-str-codec, 134 | sys-max-int, 135 | bad-python3-import, 136 | deprecated-string-function, 137 | deprecated-str-translate-call, 138 | deprecated-itertools-function, 139 | deprecated-types-field, 140 | next-method-defined, 141 | dict-items-not-iterating, 142 | dict-keys-not-iterating, 143 | 
dict-values-not-iterating, 144 | deprecated-operator-function, 145 | deprecated-urllib-function, 146 | xreadlines-attribute, 147 | deprecated-sys-function, 148 | exception-escape, 149 | comprehension-escape 150 | 151 | # Enable the message, report, category or checker with the given id(s). You can 152 | # either give multiple identifier separated by comma (,) or put this option 153 | # multiple time (only on the command line, not in the configuration file where 154 | # it should appear only once). See also the "--disable" option for examples. 155 | enable=c-extension-no-member 156 | 157 | 158 | [REPORTS] 159 | 160 | # Python expression which should return a score less than or equal to 10. You 161 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 162 | # which contain the number of messages in each category, as well as 'statement' 163 | # which is the total number of statements analyzed. This score is used by the 164 | # global evaluation report (RP0004). 165 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 166 | 167 | # Template used to display messages. This is a python new-style format string 168 | # used to format the message information. See doc for all details. 169 | #msg-template= 170 | 171 | # Set the output format. Available formats are text, parseable, colorized, json 172 | # and msvs (visual studio). You can also give a reporter class, e.g. 173 | # mypackage.mymodule.MyReporterClass. 174 | output-format=text 175 | 176 | # Tells whether to display a full report or only the messages. 177 | reports=no 178 | 179 | # Activate the evaluation score. 180 | score=yes 181 | 182 | 183 | [REFACTORING] 184 | 185 | # Maximum number of nested blocks for function / method body 186 | max-nested-blocks=5 187 | 188 | # Complete name of functions that never returns. 
When checking for 189 | # inconsistent-return-statements if a never returning function is called then 190 | # it will be considered as an explicit return statement and no message will be 191 | # printed. 192 | never-returning-functions=sys.exit 193 | 194 | 195 | [BASIC] 196 | 197 | # Naming style matching correct argument names. 198 | argument-naming-style=snake_case 199 | 200 | # Regular expression matching correct argument names. Overrides argument- 201 | # naming-style. 202 | #argument-rgx= 203 | 204 | # Naming style matching correct attribute names. 205 | attr-naming-style=snake_case 206 | 207 | # Regular expression matching correct attribute names. Overrides attr-naming- 208 | # style. 209 | #attr-rgx= 210 | 211 | # Bad variable names which should always be refused, separated by a comma. 212 | bad-names=foo, 213 | bar, 214 | baz, 215 | toto, 216 | tutu, 217 | tata 218 | 219 | # Naming style matching correct class attribute names. 220 | class-attribute-naming-style=any 221 | 222 | # Regular expression matching correct class attribute names. Overrides class- 223 | # attribute-naming-style. 224 | #class-attribute-rgx= 225 | 226 | # Naming style matching correct class names. 227 | class-naming-style=PascalCase 228 | 229 | # Regular expression matching correct class names. Overrides class-naming- 230 | # style. 231 | #class-rgx= 232 | 233 | # Naming style matching correct constant names. 234 | const-naming-style=snake_case 235 | 236 | # Regular expression matching correct constant names. Overrides const-naming- 237 | # style. 238 | #const-rgx= 239 | 240 | # Minimum line length for functions/classes that require docstrings, shorter 241 | # ones are exempt. 242 | docstring-min-length=-1 243 | 244 | # Naming style matching correct function names. 245 | function-naming-style=snake_case 246 | 247 | # Regular expression matching correct function names. Overrides function- 248 | # naming-style. 
249 | #function-rgx= 250 | 251 | # Good variable names which should always be accepted, separated by a comma. 252 | good-names=i, 253 | j, 254 | k, 255 | ex, 256 | Run, 257 | _ 258 | 259 | # Include a hint for the correct naming format with invalid-name. 260 | include-naming-hint=no 261 | 262 | # Naming style matching correct inline iteration names. 263 | inlinevar-naming-style=any 264 | 265 | # Regular expression matching correct inline iteration names. Overrides 266 | # inlinevar-naming-style. 267 | #inlinevar-rgx= 268 | 269 | # Naming style matching correct method names. 270 | method-naming-style=snake_case 271 | 272 | # Regular expression matching correct method names. Overrides method-naming- 273 | # style. 274 | #method-rgx= 275 | 276 | # Naming style matching correct module names. 277 | module-naming-style=any 278 | 279 | # Regular expression matching correct module names. Overrides module-naming- 280 | # style. 281 | #module-rgx= 282 | 283 | # Colon-delimited sets of names that determine each other's naming style when 284 | # the name regexes allow several styles. 285 | name-group= 286 | 287 | # Regular expression which should only match function or class names that do 288 | # not require a docstring. 289 | no-docstring-rgx=^_ 290 | 291 | # List of decorators that produce properties, such as abc.abstractproperty. Add 292 | # to this list to register other decorators that produce valid properties. 293 | # These decorators are taken in consideration only for invalid-name. 294 | property-classes=abc.abstractproperty 295 | 296 | # Naming style matching correct variable names. 297 | variable-naming-style=snake_case 298 | 299 | # Regular expression matching correct variable names. Overrides variable- 300 | # naming-style. 301 | #variable-rgx= 302 | 303 | 304 | [FORMAT] 305 | 306 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
307 | expected-line-ending-format= 308 | 309 | # Regexp for a line that is allowed to be longer than the limit. 310 | ignore-long-lines=^\s*(# )??$|^\s*get_ipython\S+ 311 | 312 | # Number of spaces of indent required inside a hanging or continued line. 313 | indent-after-paren=4 314 | 315 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 316 | # tab). 317 | indent-string=' ' 318 | 319 | # Maximum number of characters on a single line. 320 | max-line-length=120 321 | 322 | # Maximum number of lines in a module. 323 | max-module-lines=1000 324 | 325 | # List of optional constructs for which whitespace checking is disabled. `dict- 326 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 327 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 328 | # `empty-line` allows space-only lines. 329 | no-space-check=trailing-comma, 330 | dict-separator 331 | 332 | # Allow the body of a class to be on the same line as the declaration if body 333 | # contains single statement. 334 | single-line-class-stmt=no 335 | 336 | # Allow the body of an if to be on the same line as the test if there is no 337 | # else. 338 | single-line-if-stmt=no 339 | 340 | 341 | [LOGGING] 342 | 343 | # Format style used to check logging format string. `old` means using % 344 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. 345 | logging-format-style=old 346 | 347 | # Logging modules to check that the string format arguments are in logging 348 | # function parameter format. 349 | logging-modules=logging 350 | 351 | 352 | [MISCELLANEOUS] 353 | 354 | # List of note tags to take in consideration, separated by a comma. 355 | notes=FIXME, 356 | XXX, 357 | TODO 358 | 359 | 360 | [SIMILARITIES] 361 | 362 | # Ignore comments when computing similarities. 363 | ignore-comments=yes 364 | 365 | # Ignore docstrings when computing similarities. 
366 | ignore-docstrings=yes 367 | 368 | # Ignore imports when computing similarities. 369 | ignore-imports=no 370 | 371 | # Minimum lines number of a similarity. 372 | min-similarity-lines=4 373 | 374 | 375 | [SPELLING] 376 | 377 | # Limits count of emitted suggestions for spelling mistakes. 378 | max-spelling-suggestions=4 379 | 380 | # Spelling dictionary name. Available dictionaries: none. To make it work, 381 | # install the python-enchant package. 382 | spelling-dict= 383 | 384 | # List of comma separated words that should not be checked. 385 | spelling-ignore-words= 386 | 387 | # A path to a file that contains the private dictionary; one word per line. 388 | spelling-private-dict-file= 389 | 390 | # Tells whether to store unknown words to the private dictionary (see the 391 | # --spelling-private-dict-file option) instead of raising a message. 392 | spelling-store-unknown-words=no 393 | 394 | 395 | [STRING] 396 | 397 | # This flag controls whether the implicit-str-concat-in-sequence should 398 | # generate a warning on implicit string concatenation in sequences defined over 399 | # several lines. 400 | check-str-concat-over-line-jumps=no 401 | 402 | 403 | [TYPECHECK] 404 | 405 | # List of decorators that produce context managers, such as 406 | # contextlib.contextmanager. Add to this list to register other decorators that 407 | # produce valid context managers. 408 | contextmanager-decorators=contextlib.contextmanager 409 | 410 | # List of members which are set dynamically and missed by pylint inference 411 | # system, and so shouldn't trigger E1101 when accessed. Python regular 412 | # expressions are accepted. 413 | generated-members= 414 | 415 | # Tells whether missing members accessed in mixin class should be ignored. A 416 | # mixin class is detected if its name ends with "mixin" (case insensitive). 417 | ignore-mixin-members=yes 418 | 419 | # Tells whether to warn about missing members when the owner of the attribute 420 | # is inferred to be None. 
421 | ignore-none=yes 422 | 423 | # This flag controls whether pylint should warn about no-member and similar 424 | # checks whenever an opaque object is returned when inferring. The inference 425 | # can return multiple potential results while evaluating a Python object, but 426 | # some branches might not be evaluated, which results in partial inference. In 427 | # that case, it might be useful to still emit no-member and other checks for 428 | # the rest of the inferred objects. 429 | ignore-on-opaque-inference=yes 430 | 431 | # List of class names for which member attributes should not be checked (useful 432 | # for classes with dynamically set attributes). This supports the use of 433 | # qualified names. 434 | ignored-classes=optparse.Values,thread._local,_thread._local 435 | 436 | # List of module names for which member attributes should not be checked 437 | # (useful for modules/projects where namespaces are manipulated during runtime 438 | # and thus existing member attributes cannot be deduced by static analysis). It 439 | # supports qualified module names, as well as Unix pattern matching. 440 | ignored-modules= 441 | 442 | # Show a hint with possible names when a member name was not found. The aspect 443 | # of finding the hint is based on edit distance. 444 | missing-member-hint=yes 445 | 446 | # The minimum edit distance a name should have in order to be considered a 447 | # similar match for a missing member name. 448 | missing-member-hint-distance=1 449 | 450 | # The total number of similar names that should be taken in consideration when 451 | # showing a hint for a missing member. 452 | missing-member-max-choices=1 453 | 454 | # List of decorators that change the signature of a decorated function. 455 | signature-mutators= 456 | 457 | 458 | [VARIABLES] 459 | 460 | # List of additional names supposed to be defined in builtins. Remember that 461 | # you should avoid defining new builtins when possible. 
462 | additional-builtins=get_ipython 463 | 464 | # Tells whether unused global variables should be treated as a violation. 465 | allow-global-unused-variables=yes 466 | 467 | # List of strings which can identify a callback function by name. A callback 468 | # name must start or end with one of those strings. 469 | callbacks=cb_, 470 | _cb 471 | 472 | # A regular expression matching the name of dummy variables (i.e. expected to 473 | # not be used). 474 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 475 | 476 | # Argument names that match this expression will be ignored. Default to name 477 | # with leading underscore. 478 | ignored-argument-names=_.*|^ignored_|^unused_ 479 | 480 | # Tells whether we should check for unused import in __init__ files. 481 | init-import=no 482 | 483 | # List of qualified module names which can have objects that can redefine 484 | # builtins. 485 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 486 | 487 | 488 | [CLASSES] 489 | 490 | # List of method names used to declare (i.e. assign) instance attributes. 491 | defining-attr-methods=__init__, 492 | __new__, 493 | setUp, 494 | __post_init__ 495 | 496 | # List of member names, which should be excluded from the protected access 497 | # warning. 498 | exclude-protected=_asdict, 499 | _fields, 500 | _replace, 501 | _source, 502 | _make 503 | 504 | # List of valid names for the first argument in a class method. 505 | valid-classmethod-first-arg=cls 506 | 507 | # List of valid names for the first argument in a metaclass class method. 508 | valid-metaclass-classmethod-first-arg=cls 509 | 510 | 511 | [DESIGN] 512 | 513 | # Maximum number of arguments for function / method. 514 | max-args=5 515 | 516 | # Maximum number of attributes for a class (see R0902). 517 | max-attributes=7 518 | 519 | # Maximum number of boolean expressions in an if statement (see R0916). 
520 | max-bool-expr=5 521 | 522 | # Maximum number of branch for function / method body. 523 | max-branches=12 524 | 525 | # Maximum number of locals for function / method body. 526 | max-locals=15 527 | 528 | # Maximum number of parents for a class (see R0901). 529 | max-parents=7 530 | 531 | # Maximum number of public methods for a class (see R0904). 532 | max-public-methods=20 533 | 534 | # Maximum number of return / yield for function / method body. 535 | max-returns=6 536 | 537 | # Maximum number of statements in function / method body. 538 | max-statements=50 539 | 540 | # Minimum number of public methods for a class (see R0903). 541 | min-public-methods=2 542 | 543 | 544 | [IMPORTS] 545 | 546 | # List of modules that can be imported at any level, not just the top level 547 | # one. 548 | allow-any-import-level= 549 | 550 | # Allow wildcard imports from modules that define __all__. 551 | allow-wildcard-with-all=no 552 | 553 | # Analyse import fallback blocks. This can be used to support both Python 2 and 554 | # 3 compatible code, which means that the block might have code that exists 555 | # only in one or another interpreter, leading to false positives when analysed. 556 | analyse-fallback-blocks=no 557 | 558 | # Deprecated modules which should not be used, separated by a comma. 559 | deprecated-modules=optparse,tkinter.tix 560 | 561 | # Create a graph of external dependencies in the given file (report RP0402 must 562 | # not be disabled). 563 | ext-import-graph= 564 | 565 | # Create a graph of every (i.e. internal and external) dependencies in the 566 | # given file (report RP0402 must not be disabled). 567 | import-graph= 568 | 569 | # Create a graph of internal dependencies in the given file (report RP0402 must 570 | # not be disabled). 571 | int-import-graph= 572 | 573 | # Force import order to recognize a module as part of the standard 574 | # compatibility libraries. 
575 | known-standard-library= 576 | 577 | # Force import order to recognize a module as part of a third party library. 578 | known-third-party=enchant 579 | 580 | # Couples of modules and preferred modules, separated by a comma. 581 | preferred-modules= 582 | 583 | 584 | [EXCEPTIONS] 585 | 586 | # Exceptions that will emit a warning when being caught. Defaults to 587 | # "BaseException, Exception". 588 | overgeneral-exceptions=BaseException, 589 | Exception 590 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_apis/build/status/AGCE%20AI/Happy%20Path%20Builds/AI%20ML%20RTS?branchName=master)](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_build/latest?definitionId=118&branchName=master) 2 | ### Authors: Fidan Boylu Uz, Yan Zhang, Mario Bourgoin 3 | ### Acknowledgements: Mathew Salvaris 4 | 5 | # Deploying Python models for real-time scoring using Azure Machine Learning 6 | 7 | In this repository there are a number of tutorials in Jupyter notebooks that have step-by-step instructions on (1) how to train a machine learning model using Python; (2) how to deploy a trained machine learning model through Azure Machine Learning (AzureML). The tutorials cover how to deploy models on the following deployment targets: 8 | 9 | ## Overview 10 | This scenario shows how to deploy a Frequently Asked Questions (FAQ) matching model as a web service to provide predictions for user questions. For this scenario, “Input Data” in the [architecture diagram](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/realtime-scoring-python) refers to text strings containing the user questions to match with a list of FAQs. 
The scenario is designed for the Scikit-Learn machine learning library for Python but can be generalized to any scenario that uses Python models to make real-time predictions. 11 | 12 | ## Design 13 | 14 | The scenario uses a subset of Stack Overflow question data which includes original questions tagged as JavaScript, their duplicate questions, and their answers. It trains a Scikit-Learn pipeline to predict the match probability of a duplicate question with each of the original questions. These predictions are made in real time using a REST API endpoint. 15 | The application flow for this architecture is as follows: 16 | 1. The client sends an HTTP POST request with the encoded question data. 17 | 2. The webservice extracts the question from the request. 18 | 3. The question is then sent to the Scikit-learn pipeline model for featurization and scoring. 19 | 4. The matching FAQ questions with their scores are then piped into a JSON object and returned to the client. 20 | 21 | An example app that consumes the results is included with the scenario. 22 | 23 | ## Prerequisites 24 | 1. Linux (Ubuntu). 25 | 1. [Anaconda Python](https://www.anaconda.com/download) 26 | 1. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed. 27 | 1. [Azure account](https://azure.microsoft.com). 28 | 29 | 30 | --- 31 | **NOTE** 32 | You will need to be able to run docker commands without sudo to run this tutorial. Use the following commands to do this. 33 | 34 | ```bash 35 | sudo usermod -aG docker $USER 36 | newgrp docker 37 | ``` 38 | --- 39 | 40 | The tutorial was developed on an [Azure Ubuntu 41 | DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro), 42 | which addresses the first three prerequisites. 43 | 44 | ## Setup 45 | 46 | To set up your environment to run these notebooks, please follow these steps. They set up the notebooks to use Azure seamlessly. 47 | 48 | 1. Create a _Linux_ _Ubuntu_ VM. 
49 | 1. Log in to your VM. We recommend that you use a graphical client 50 | such as 51 | [X2Go](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#x2go) 52 | to access your VM. The remaining steps are to be done on the VM. 53 | 1. Open a terminal emulator. 54 | 1. Clone, fork, or download the zip file for this repository: 55 | ``` 56 | git clone https://github.com/Microsoft/az-ml-realtime-score.git 57 | ``` 58 | 1. Enter the local repository: 59 | ``` 60 | cd az-ml-realtime-score 61 | ``` 62 | 1. Copy `sample_workspace_conf.yml` to a new file, `workspace_conf.yml`, and fill in each field. This will keep secrets out of the source code, and this file will be ignored by git. 63 | 1. Create the Python az-ml-realtime-score virtual environment using the environment.yml: 64 | ``` 65 | conda env create -f environment.yml 66 | ``` 67 | 1. Activate the virtual environment: 68 | ``` 69 | source activate az-ml-realtime-score 70 | ``` 71 | The remaining steps should be done in this virtual environment. 72 | 1. Login to Azure: 73 | ``` 74 | az login 75 | ``` 76 | You can verify that you are logged in to your subscription by executing 77 | the command: 78 | ``` 79 | az account show -o table 80 | ``` 81 | 1. Start the Jupyter notebook server: 82 | ``` 83 | jupyter notebook 84 | ``` 85 | 86 | # Contributing 87 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 88 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 89 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 90 | 91 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 92 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 93 | provided by the bot. You will only need to do this once across all repositories using our CLA. 
94 | 95 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 96 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 97 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 98 | 99 | 100 | # Related projects 101 | 102 | [Microsoft AI Github](https://github.com/microsoft/ai) Find other Best Practice projects, and Azure AI Designed patterns in our central repository. 103 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: az-ml-realtime-score 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6.2 6 | - pip 7 | - jupyter 8 | - pytest 9 | - pytest-cov 10 | - pylint 11 | - pandas 12 | - pip: 13 | - papermill 14 | - azureml-core==1.0.85.2 15 | - pylint-junit 16 | - pytest-nunit 17 | - nbconvert 18 | - junit-xml 19 | - nbformat 20 | - Microsoft-AI-Azure-Utility-Samples 21 | - python-dotenv 22 | -------------------------------------------------------------------------------- /notebooks/00_AMLConfiguration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ai-architecture-template - 00_AMLConfiguration.ipynb\n", 8 | "TODO: Update with new repo name\n", 9 | "\n", 10 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 11 | "\n", 12 | "Licensed under the MIT License.\n", 13 | "\n", 14 | "# Installation and configuration\n", 15 | "This notebook configures the notebooks in this tutorial to connect to an Azure Machine Learning (AML) Workspace. \n", 16 | "You can use an existing workspace or create a new one.\n", 17 | "\n", 18 | "## Prerequisites\n", 19 | "\n", 20 | "If you have already completed the prerequisites and selected the correct Kernel for this notebook, the AML Python SDK \n", 21 | "is already installed. Let's load the imports and check the AML SDK version.\n", 22 | "\n", 23 | "## Set up your Azure Machine Learning workspace\n", 24 | "## Load Configurations from file\n", 25 | "\n", 26 | "Configurations are loaded by default from a file `project.yml`, to prevent accidental commits of Azure secrets into \n", 27 | "source control. This file name is included in the `.gitignore` to also prevent accidental commits. 
A template file \n", 28 | "is included that should be copied, and each parameter filled in.\n", 29 | "\n", 30 | "If the file is not present, a UI prompt will pop up to insert configurations, and save to the file.\n", 31 | "\n", 32 | "## Create the workspace\n", 33 | "This cell will also create an AML workspace for you in a subscription, provided you have the correct permissions.\n", 34 | "\n", 35 | "This will fail when:\n", 36 | "1. You do not have permission to create a workspace in the resource group\n", 37 | "1. You do not have permission to create a resource group if it's non-existing.\n", 38 | "1. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this \n", 39 | "subscription\n", 40 | "\n", 41 | "If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to \n", 42 | "provision the required resources. If this cell succeeds, you're done configuring AML!\n", 43 | "\n", 44 | "After creation we will check the details of the workspace." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "pycharm": { 52 | "name": "#%%\n" 53 | } 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from azure_utils.machine_learning.utils import get_or_create_workspace_from_file\n", 58 | "\n", 59 | "ws = get_or_create_workspace_from_file()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "We can now move on to the [Data Preparation](01_DataPrep.ipynb) notebook to train our model using Azure Machine \n", 67 | "Learning." 
68 | ] 69 | } 70 | ], 71 | "metadata": { 72 | "celltoolbar": "Tags", 73 | "kernelspec": { 74 | "display_name": "az-ml-realtime-score", 75 | "language": "python", 76 | "name": "az-ml-realtime-score" 77 | }, 78 | "pycharm": { 79 | "stem_cell": { 80 | "cell_type": "raw", 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "source": [] 85 | } 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /notebooks/01_DataPrep.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Data Preparation" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "In this notebook, we use a subset of [Stack Exchange network](https://archive.org/details/stackexchange) question data \n", 24 | "which includes original questions tagged as 'JavaScript', their duplicate questions and their answers. Here, we \n", 25 | "provide the steps to prepare the data to use in model development for training a model that will match a new \n", 26 | "question with an existing original question. 
" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import os\n", 36 | "\n", 37 | "import pandas as pd\n", 38 | "from azure_utils.utilities import read_csv_gz, clean_text, round_sample_strat, random_merge\n", 39 | "\n", 40 | "from notebooks import directory" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Below, we define some parameters that will be used in the data cleaning as well as train and test set preparation." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# The size of the test set\n", 57 | "test_size = 0.21\n", 58 | "# The minimum length of clean text\n", 59 | "min_text = 150\n", 60 | "# The minimum number of duplicates per question\n", 61 | "min_dupes = 12\n", 62 | "# The maximum number of duplicate matches\n", 63 | "match = 20\n", 64 | "# The output files path\n", 65 | "outputs_path = directory + \"/data_folder\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Data cleaning" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Next, we download the questions, duplicate questions and answers and load the datasets into pandas dataframes using \n", 80 | "the helper functions." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# URLs to original questions, duplicate questions, and answers.\n", 90 | "data_url = \"https://bostondata.blob.core.windows.net/stackoverflow/{}\"\n", 91 | "questions_url = data_url.format(\"orig-q.tsv.gz\")\n", 92 | "dupes_url = data_url.format(\"dup-q.tsv.gz\")\n", 93 | "answers_url = data_url.format(\"ans.tsv.gz\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Load datasets.\n", 103 | "questions = read_csv_gz(questions_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n", 104 | "dupes = read_csv_gz(dupes_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n", 105 | "answers = read_csv_gz(answers_url, names=('Id', 'Text0'))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Let's now check the dataframes. Notice that questions and duplicates have \"AnswerID\" column that would help match \n", 113 | "ith the index of answers dataframe." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/html": [ 124 | "
\n", 125 | "\n", 138 | "\n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
AnswerIdText0CreationDate
Id
220231220233Accessing the web page's HTTP Headers in JavaS...2008-10-20 22:54:38.767
391979810461Get client IP using just JavaScript?. <p>I nee...2008-12-24 18:22:30.780
109086109091Stop setInterval call in JavaScript. <p>I am u...2008-09-20 19:29:55.377
4615546181Validate email address in JavaScript?. <p>How ...2008-09-05 16:10:11.093
121499121708When onblur occurs, how can I find out which e...2008-09-23 14:48:43.483
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " AnswerId Text0 \\\n", 190 | "Id \n", 191 | "220231 220233 Accessing the web page's HTTP Headers in JavaS... \n", 192 | "391979 810461 Get client IP using just JavaScript?.

I nee... \n", 193 | "109086 109091 Stop setInterval call in JavaScript.

I am u... \n", 194 | "46155 46181 Validate email address in JavaScript?.

How ... \n", 195 | "121499 121708 When onblur occurs, how can I find out which e... \n", 196 | "\n", 197 | " CreationDate \n", 198 | "Id \n", 199 | "220231 2008-10-20 22:54:38.767 \n", 200 | "391979 2008-12-24 18:22:30.780 \n", 201 | "109086 2008-09-20 19:29:55.377 \n", 202 | "46155 2008-09-05 16:10:11.093 \n", 203 | "121499 2008-09-23 14:48:43.483 " 204 | ] 205 | }, 206 | "execution_count": 5, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "questions.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "

\n", 224 | "\n", 237 | "\n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
AnswerIdText0CreationDate
Id
665430665404Disable \"Back\" & \"Refresh\" Button in Browser. ...2009-03-20 09:13:31.800
114525336868The difference between the two functions? (\"fu...2008-09-22 12:24:06.583
1347093147765ASP.NET Page_Unload to stop user from leaving ...2009-08-28 13:46:51.217
120825226633883See if a variable is an array using JavaScript...2009-07-30 17:57:42.363
177867122704How do I copy the data of an element with jque...2008-10-07 10:23:40.017
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " AnswerId Text0 \\\n", 289 | "Id \n", 290 | "665430 665404 Disable \"Back\" & \"Refresh\" Button in Browser. ... \n", 291 | "114525 336868 The difference between the two functions? (\"fu... \n", 292 | "1347093 147765 ASP.NET Page_Unload to stop user from leaving ... \n", 293 | "1208252 26633883 See if a variable is an array using JavaScript... \n", 294 | "177867 122704 How do I copy the data of an element with jque... \n", 295 | "\n", 296 | " CreationDate \n", 297 | "Id \n", 298 | "665430 2009-03-20 09:13:31.800 \n", 299 | "114525 2008-09-22 12:24:06.583 \n", 300 | "1347093 2009-08-28 13:46:51.217 \n", 301 | "1208252 2009-07-30 17:57:42.363 \n", 302 | "177867 2008-10-07 10:23:40.017 " 303 | ] 304 | }, 305 | "execution_count": 6, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "dupes.head()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 7, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/html": [ 322 | "
\n", 323 | "\n", 336 | "\n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
Text0
Id
119473<p>Try <a href=\"http://johannburkard.de/blog/p...
324533<p>Adapted from <a href=\"http://www.javascript...
108232<p>That is known as a textbox watermark, and i...
194399<p><strong>Obfuscation:</strong></p> <p>Try <a...
80127<p>In JavaScript, \"this\" always refers to the ...
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " Text0\n", 374 | "Id \n", 375 | "119473

Try Adapted from That is known as a textbox watermark, and i...\n", 378 | "194399

Obfuscation:

Try In JavaScript, \"this\" always refers to the ..." 380 | ] 381 | }, 382 | "execution_count": 7, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "answers.head()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "Let's check the first original question's text." 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 8, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "Accessing the web page's HTTP Headers in JavaScript.

How do I access a page's HTTP response headers via JavaScript?

Related to this question, which was modified to ask about accessing two specific HTTP headers.

Related:
How do I access the HTTP request header fields via JavaScript?

\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "print(questions.iloc[0, 1])" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "Let's now check the duplicates for that question." 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 9, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "name": "stdout", 429 | "output_type": "stream", 430 | "text": [ 431 | " AnswerId Text0 \\\n", 432 | "Id \n", 433 | "3177208 220233 Monitoring http request header on a page. ... \n", 438 | "17466305 220233 How to read HTTP header values from JavaScript... \n", 439 | "26647511 220233 Is there a JS API to get information about hea... \n", 440 | "35604233 220233 How to read http request headers with javascri... \n", 441 | "\n", 442 | " CreationDate \n", 443 | "Id \n", 444 | "3177208 2010-07-05 04:20:19.663 \n", 445 | "12258705 2012-09-04 07:31:07.973 \n", 446 | "12256134 2012-09-04 02:43:08.860 \n", 447 | "15135883 2013-02-28 12:44:38.393 \n", 448 | "14673437 2013-02-03 14:19:00.697 \n", 449 | "17466305 2013-07-04 09:08:32.240 \n", 450 | "26647511 2014-10-30 07:43:01.117 \n", 451 | "35604233 2016-02-24 14:00:49.247 \n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "print(dupes[dupes.AnswerId == questions.iloc[0, 0]])" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "Below is the answer to the original question." 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 10, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "name": "stdout", 473 | "output_type": "stream", 474 | "text": [ 475 | "

Unfortunately, there isn't an API to give you the HTTP response headers for your initial page request. That was the original question posted here. It has been repeatedly asked, too, because some people would like to get the actual response headers of the original page request without issuing another one.


For AJAX Requests:

If an HTTP request is made over AJAX, it is possible to get the response headers with the getAllResponseHeaders() method. It's part of the XMLHttpRequest API. To see how this can be applied, check out the fetchSimilarHeaders() function below. Note that this is a work-around to the problem that won't be reliable for some applications.

myXMLHttpRequest.getAllResponseHeaders(); 

This will not give you information about the original page request's HTTP response headers, but it could be used to make educated guesses about what those headers were. More on that is described next.


Getting header values from the Initial Page Request:

This question was first asked several years ago, asking specifically about how to get at the original HTTP response headers for the current page (i.e. the same page inside of which the javascript was running). This is quite a different question than simply getting the response headers for any HTTP request. For the initial page request, the headers aren't readily available to javascript. Whether the header values you need will be reliably and sufficiently consistent if you request the same page again via AJAX will depend on your particular application.

The following are a few suggestions for getting around that problem.


1. Requests on Resources which are largely static

If the response is largely static and the headers are not expected to change much between requests, you could make an AJAX request for the same page you're currently on and assume that they're they are the same values which were part of the page's HTTP response. This could allow you to access the headers you need using the nice XMLHttpRequest API described above.

function fetchSimilarHeaders (callback) { var request = new XMLHttpRequest(); request.onreadystatechange = function () { if (request.readyState === 4) { // // The following headers may often be similar // to those of the original page request... // if (callback && typeof callback === 'function') { callback(request.getAllResponseHeaders()); } } }; // // Re-request the same page (document.location) // We hope to get the same or similar response headers to those which // came with the current page, but we have no guarantee. // Since we are only after the headers, a HEAD request may be sufficient. // request.open('HEAD', document.location, true); request.send(null); } 

This approach will be problematic if you truly have to rely on the values being consistent between requests, since you can't fully guarantee that they are the same. It's going to depend on your specific application and\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "print(answers.at[questions.iloc[0, 0], 'Text0'])" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "Next, we use the helper functions to clean questions, duplicates and answers from unwanted text such as code, html \n", 488 | "tags and links. Notice that we add a new column 'Text' to each dataframe for clean text in lowercase." 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 11, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# Clean up all text, and keep only data with some clean text.\n", 498 | "for df in (questions, dupes, answers):\n", 499 | " df[\"Text\"] = df.Text0.apply(clean_text).str.lower()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 12, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "questions = questions[questions.Text.str.len() > 0]\n", 509 | "answers = answers[answers.Text.str.len() > 0]\n", 510 | "dupes = dupes[dupes.Text.str.len() > 0]" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "Let's compare the first original question and cleaned version as an example." 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 13, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "Accessing the web page's HTTP Headers in JavaScript.

How do I access a page's HTTP response headers via JavaScript?

Related to this question, which was modified to ask about accessing two specific HTTP headers.

Related:
How do I access the HTTP request header fields via JavaScript?

\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "# Original question.\n", 535 | "print(questions.iloc[0, 1])" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 14, 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "name": "stdout", 545 | "output_type": "stream", 546 | "text": [ 547 | "accessing the web page's http headers in javascript. how do i access a page's http response headers via javascript? related to this question, which was modified to ask about accessing two specific http headers. related: how do i access the http request header fields via javascript? \n" 548 | ] 549 | } 550 | ], 551 | "source": [ 552 | "# After cleaning.\n", 553 | "print(questions.iloc[0, 3])" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "It turns out that some duplicate questions were also in original questions. Also, some original questions and some \n", 561 | "duplicate questions were duplicated in the datasets. In the following, we remove them from the dataframes." 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 15, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "# First, remove dupes that are questions, then remove duplicated questions and dupes.\n", 571 | "dupes = dupes[~dupes.index.isin(questions.index)]\n", 572 | "questions = questions[~questions.index.duplicated(keep='first')]\n", 573 | "dupes = dupes[~dupes.index.duplicated(keep='first')]" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "We also make sure we keep questions with answers and duplicates." 
581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 16, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "# Keep only questions with answers and dupes, answers to questions, and dupes of questions.\n", 590 | "questions = questions[\n", 591 | " questions.AnswerId.isin(answers.index) & questions.AnswerId.isin(dupes.AnswerId)\n", 592 | "]\n", 593 | "answers = answers[answers.index.isin(questions.AnswerId)]\n", 594 | "dupes = dupes[dupes.AnswerId.isin(questions.AnswerId)]" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 17, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "# Verify data integrity.\n", 604 | "assert questions.AnswerId.isin(answers.index).all()\n", 605 | "assert answers.index.isin(questions.AnswerId).all()\n", 606 | "assert questions.AnswerId.isin(dupes.AnswerId).all()\n", 607 | "assert dupes.AnswerId.isin(questions.AnswerId).all()" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": {}, 613 | "source": [ 614 | "Below are some statistics on the data. Notice that some questions have very low number of duplicates while others may \n", 615 | "have a large number. 
" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 18, 621 | "metadata": {}, 622 | "outputs": [ 623 | { 624 | "name": "stdout", 625 | "output_type": "stream", 626 | "text": [ 627 | "Text statistics:\n", 628 | " count mean std min 25% 50% 75% max\n", 629 | "questions 1714.0 415.827305 319.857854 56.0 225.0 334.0 509.0 3982.0\n", 630 | "answers 1714.0 616.274212 673.060199 1.0 178.0 375.0 757.0 3982.0\n", 631 | "dupes 16139.0 441.303612 363.638297 25.0 247.0 357.0 519.0 3989.0\n", 632 | "\n", 633 | "Duplication statistics:\n", 634 | " count mean std min 25% 50% 75% max\n", 635 | "duplications 1714.0 9.415986 41.638847 1.0 3.0 4.0 7.0 1369.0\n", 636 | "\n", 637 | "Largest class: 8.48%\n" 638 | ] 639 | } 640 | ], 641 | "source": [ 642 | "# Report on the data.\n", 643 | "print(\"Text statistics:\")\n", 644 | "print(\n", 645 | " pd.DataFrame(\n", 646 | " [\n", 647 | " questions.Text.str.len().describe().rename(\"questions\"),\n", 648 | " answers.Text.str.len().describe().rename(\"answers\"),\n", 649 | " dupes.Text.str.len().describe().rename(\"dupes\"),\n", 650 | " ]\n", 651 | " )\n", 652 | ")\n", 653 | "print(\"\\nDuplication statistics:\")\n", 654 | "print(pd.DataFrame([dupes.AnswerId.value_counts().describe().rename(\"duplications\")]))\n", 655 | "print(\n", 656 | " \"\\nLargest class: {:.2%}\".format(\n", 657 | " dupes.AnswerId.value_counts().max() / dupes.shape[0]\n", 658 | " )\n", 659 | ")" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "Now, we reset all indexes to use them as columns in the rest of the steps." 
667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 19, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "# Reset each dataframe's index.\n", 676 | "questions.reset_index(inplace=True)\n", 677 | "answers.reset_index(inplace=True)\n", 678 | "dupes.reset_index(inplace=True)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "We filter the questions and duplicates to have at least min_text number of characters." 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 20, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "# Apply the minimum text length to questions and dupes.\n", 695 | "questions = questions[questions.Text.str.len() >= min_text]\n", 696 | "dupes = dupes[dupes.Text.str.len() >= min_text]" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 21, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "# Keep only questions with dupes, and dupes of questions.\n", 706 | "label_column = \"AnswerId\"\n", 707 | "questions = questions[questions[label_column].isin(dupes[label_column])]\n", 708 | "dupes = dupes[dupes[label_column].isin(questions[label_column])]" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "Here, we remove questions and their duplicates that are less than min_dupes parameter." 
716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 22, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "# Restrict the questions to those with a minimum number of dupes.\n", 725 | "answerid_count = dupes.groupby(label_column)[label_column].count()\n", 726 | "answerid_min = answerid_count.index[answerid_count >= min_dupes]\n", 727 | "questions = questions[questions[label_column].isin(answerid_min)]\n", 728 | "dupes = dupes[dupes[label_column].isin(answerid_min)]" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 23, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | " # Verify data integrity.\n", 738 | "assert questions[label_column].isin(dupes[label_column]).all()\n", 739 | "assert dupes[label_column].isin(questions[label_column]).all()" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "Here are some statistics on the resulting dataset." 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 24, 752 | "metadata": {}, 753 | "outputs": [ 754 | { 755 | "name": "stdout", 756 | "output_type": "stream", 757 | "text": [ 758 | "Restrictions: min_text=150, min_dupes=12\n", 759 | "Restricted text statistics:\n", 760 | " count mean std min 25% 50% 75% max\n", 761 | "questions 182.0 413.450549 218.028193 153.0 264.25 338.5 510.5 1475.0\n", 762 | "dupes 8260.0 479.882324 398.791447 150.0 270.00 380.0 553.0 3989.0\n", 763 | "\n", 764 | "Restricted duplication statistics:\n", 765 | " count mean std min 25% 50% 75% max\n", 766 | "duplications 182.0 45.384615 117.074823 12.0 15.0 20.0 33.0 1328.0\n", 767 | "\n", 768 | "Restricted largest class: 16.08%\n" 769 | ] 770 | } 771 | ], 772 | "source": [ 773 | "# Report on the data.\n", 774 | "print(\"Restrictions: min_text={}, min_dupes={}\".format(min_text, min_dupes))\n", 775 | "print(\"Restricted text statistics:\")\n", 776 | "print(\n", 777 | " pd.DataFrame(\n", 778 | " 
[\n", 779 | " questions.Text.str.len().describe().rename(\"questions\"),\n", 780 | " dupes.Text.str.len().describe().rename(\"dupes\"),\n", 781 | " ]\n", 782 | " )\n", 783 | ")\n", 784 | "print(\"\\nRestricted duplication statistics:\")\n", 785 | "print(\n", 786 | " pd.DataFrame([dupes[label_column].value_counts().describe().rename(\"duplications\")])\n", 787 | ")\n", 788 | "print(\n", 789 | " \"\\nRestricted largest class: {:.2%}\".format(\n", 790 | " dupes[label_column].value_counts().max() / dupes.shape[0]\n", 791 | " )\n", 792 | ")" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "## Prepare train and test sets" 800 | ] 801 | }, 802 | { 803 | "cell_type": "markdown", 804 | "metadata": {}, 805 | "source": [ 806 | "In this part, we prepare train and test sets. For training a binary classification model, we will need to construct \n", 807 | "match and non-match pairs from duplicates and their questions. Finding matching pairs can be accomplished by joining \n", 808 | "each duplicate with its question. However, non-match examples need to be constructed randomly. " 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "As a first step, to make sure we train and test the performance of the model on each question, we will need to have \n", 816 | "examples of match and non-match pairs for each question both in train and test sets. In order to achieve that, \n", 817 | "we split the duplicates in a stratified manner into train and test sets making sure at least 1 or more duplicates per \n", 818 | "question is in the test set depending on test_size parameter and number of duplicates per each question." 
819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 25, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [ 827 | "# Split dupes into train and test ensuring at least one of each label class is in test.\n", 828 | "dupes_test = round_sample_strat(dupes, dupes[label_column], frac=test_size)\n", 829 | "dupes_train = dupes[~dupes.Id.isin(dupes_test.Id)]" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": 26, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "assert dupes_test[label_column].unique().shape[0] == dupes[label_column].unique().shape[0]" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 27, 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "# The relevant columns for text pairs data.\n", 848 | "balanced_pairs_columns = ['Id_x', 'AnswerId_x', 'Text_x', 'Id_y', 'Text_y', 'AnswerId_y', 'Label', 'n']" 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "Next, we pair each training duplicate in train set with its matching question and N-1 random questions using the \n", 856 | "helper function." 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": 28, 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "# Use AnswerId to pair each training dupe with its matching question and also with N-1 questions not its match.\n", 866 | "balanced_pairs_train = random_merge(dupes_train, questions, N=match)" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": {}, 872 | "source": [ 873 | "Labeling is done such that matching pairs are labeled as 1 and non-match pairs are labeled as 0." 
874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": 29, 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "# Label records by matching AnswerIds.\n", 883 | "balanced_pairs_train[\"Label\"] = (\n", 884 | " balanced_pairs_train.AnswerId_x == balanced_pairs_train.AnswerId_y\n", 885 | ").astype(int)" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 30, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "# Keep only the relevant data.\n", 895 | "balanced_pairs_train = balanced_pairs_train[balanced_pairs_columns]" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": 31, 901 | "metadata": {}, 902 | "outputs": [ 903 | { 904 | "data": { 905 | "text/html": [ 906 | "
\n", 907 | "\n", 920 | "\n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | "
Id_xAnswerId_xText_xId_yText_yAnswerId_yLabeln
0177867122704how do i copy the data of an element with jque...122102what is the most efficient way to clone an obj...12270410
1565430122704(deep) copying an array using jquery. possibl...122102what is the most efficient way to clone an obj...12270410
23474697122704how to clone js object?. possible duplicate: ...122102what is the most efficient way to clone an obj...12270410
310801878122704how can i copy a variable without pointing to ...122102what is the most efficient way to clone an obj...12270410
49610918122704how do i get a new reference to an object. po...122102what is the most efficient way to clone an obj...12270410
\n", 992 | "
" 993 | ], 994 | "text/plain": [ 995 | " Id_x AnswerId_x Text_x \\\n", 996 | "0 177867 122704 how do i copy the data of an element with jque... \n", 997 | "1 565430 122704 (deep) copying an array using jquery. possibl... \n", 998 | "2 3474697 122704 how to clone js object?. possible duplicate: ... \n", 999 | "3 10801878 122704 how can i copy a variable without pointing to ... \n", 1000 | "4 9610918 122704 how do i get a new reference to an object. po... \n", 1001 | "\n", 1002 | " Id_y Text_y AnswerId_y \\\n", 1003 | "0 122102 what is the most efficient way to clone an obj... 122704 \n", 1004 | "1 122102 what is the most efficient way to clone an obj... 122704 \n", 1005 | "2 122102 what is the most efficient way to clone an obj... 122704 \n", 1006 | "3 122102 what is the most efficient way to clone an obj... 122704 \n", 1007 | "4 122102 what is the most efficient way to clone an obj... 122704 \n", 1008 | "\n", 1009 | " Label n \n", 1010 | "0 1 0 \n", 1011 | "1 1 0 \n", 1012 | "2 1 0 \n", 1013 | "3 1 0 \n", 1014 | "4 1 0 " 1015 | ] 1016 | }, 1017 | "execution_count": 31, 1018 | "metadata": {}, 1019 | "output_type": "execute_result" 1020 | } 1021 | ], 1022 | "source": [ 1023 | "balanced_pairs_train.head()" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": 32, 1029 | "metadata": {}, 1030 | "outputs": [], 1031 | "source": [ 1032 | "# Sort the data by dupe ID and Label.\n", 1033 | "balanced_pairs_train.sort_values(by=['Id_x', 'Label'], ascending=[True, False], inplace=True)" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "markdown", 1038 | "metadata": {}, 1039 | "source": [ 1040 | "In testing set, we match each duplicate with all the original questions and label them same way as training set." 
1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": 33, 1046 | "metadata": {}, 1047 | "outputs": [], 1048 | "source": [ 1049 | "# Use AnswerId to pair each testing dupe with all questions.\n", 1050 | "balanced_pairs_test = random_merge(dupes_test, questions, N=questions.shape[0])" 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "code", 1055 | "execution_count": 34, 1056 | "metadata": {}, 1057 | "outputs": [], 1058 | "source": [ 1059 | "# Label records by matching AnswerIds.\n", 1060 | "balanced_pairs_test[\"Label\"] = (\n", 1061 | " balanced_pairs_test.AnswerId_x == balanced_pairs_test.AnswerId_y\n", 1062 | ").astype(int)" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 35, 1068 | "metadata": {}, 1069 | "outputs": [], 1070 | "source": [ 1071 | "# Keep only the relevant data.\n", 1072 | "balanced_pairs_test = balanced_pairs_test[balanced_pairs_columns]" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 36, 1078 | "metadata": {}, 1079 | "outputs": [ 1080 | { 1081 | "data": { 1082 | "text/html": [ 1083 | "
\n", 1084 | "\n", 1097 | "\n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | "
Id_xAnswerId_xText_xId_yText_yAnswerId_yLabeln
0180459536700getting the length of a 'named' array?. i'm no...5223length of a javascript object (that is, associ...670010
187022196700how to get javascript hash table count?. poss...5223length of a javascript object (that is, associ...670010
21448533627943calculate distance between two geolocs in java...27928calculate distance between two latitude-longit...2794310
32138358227943android java calculate distance many coordinat...27928calculate distance between two latitude-longit...2794310
42714079631047how can i check if append element already exis...31044is there an \"exists\" function for jquery?. how...3104710
\n", 1169 | "
" 1170 | ], 1171 | "text/plain": [ 1172 | " Id_x AnswerId_x Text_x \\\n", 1173 | "0 18045953 6700 getting the length of a 'named' array?. i'm no... \n", 1174 | "1 8702219 6700 how to get javascript hash table count?. poss... \n", 1175 | "2 14485336 27943 calculate distance between two geolocs in java... \n", 1176 | "3 21383582 27943 android java calculate distance many coordinat... \n", 1177 | "4 27140796 31047 how can i check if append element already exis... \n", 1178 | "\n", 1179 | " Id_y Text_y AnswerId_y \\\n", 1180 | "0 5223 length of a javascript object (that is, associ... 6700 \n", 1181 | "1 5223 length of a javascript object (that is, associ... 6700 \n", 1182 | "2 27928 calculate distance between two latitude-longit... 27943 \n", 1183 | "3 27928 calculate distance between two latitude-longit... 27943 \n", 1184 | "4 31044 is there an \"exists\" function for jquery?. how... 31047 \n", 1185 | "\n", 1186 | " Label n \n", 1187 | "0 1 0 \n", 1188 | "1 1 0 \n", 1189 | "2 1 0 \n", 1190 | "3 1 0 \n", 1191 | "4 1 0 " 1192 | ] 1193 | }, 1194 | "execution_count": 36, 1195 | "metadata": {}, 1196 | "output_type": "execute_result" 1197 | } 1198 | ], 1199 | "source": [ 1200 | "balanced_pairs_test.head()" 1201 | ] 1202 | }, 1203 | { 1204 | "cell_type": "code", 1205 | "execution_count": 37, 1206 | "metadata": {}, 1207 | "outputs": [], 1208 | "source": [ 1209 | "# Sort the data by dupe ID and Label.\n", 1210 | "balanced_pairs_test.sort_values(\n", 1211 | " by=[\"Id_x\", \"Label\"], ascending=[True, False], inplace=True\n", 1212 | ")" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "markdown", 1217 | "metadata": {}, 1218 | "source": [ 1219 | "Finally, we report the final train and test sets and save as text files to be used by modeling." 
1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": 38, 1225 | "metadata": {}, 1226 | "outputs": [ 1227 | { 1228 | "name": "stdout", 1229 | "output_type": "stream", 1230 | "text": [ 1231 | "balanced_pairs_train: 132,500 rows with 5.00% matches\n", 1232 | "balanced_pairs_test: 297,570 rows with 0.55% matches\n" 1233 | ] 1234 | } 1235 | ], 1236 | "source": [ 1237 | "# Report on the datasets.\n", 1238 | "print(\n", 1239 | " \"balanced_pairs_train: {:,} rows with {:.2%} matches\".format(\n", 1240 | " balanced_pairs_train.shape[0], balanced_pairs_train.Label.mean()\n", 1241 | " )\n", 1242 | ")\n", 1243 | "print(\n", 1244 | " \"balanced_pairs_test: {:,} rows with {:.2%} matches\".format(\n", 1245 | " balanced_pairs_test.shape[0], balanced_pairs_test.Label.mean()\n", 1246 | " )\n", 1247 | ")" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "code", 1252 | "execution_count": 39, 1253 | "metadata": {}, 1254 | "outputs": [ 1255 | { 1256 | "name": "stdout", 1257 | "output_type": "stream", 1258 | "text": [ 1259 | "Writing 132,500 to C:\\Users\\dciborow\\Source\\Repos\\az-ml-realtime-score\\notebooks/data_folder\\balanced_pairs_train.tsv\n", 1260 | "Writing 297,570 to C:\\Users\\dciborow\\Source\\Repos\\az-ml-realtime-score\\notebooks/data_folder\\balanced_pairs_test.tsv\n", 1261 | "Writing 182 to C:\\Users\\dciborow\\Source\\Repos\\az-ml-realtime-score\\notebooks/data_folder\\questions.tsv\n", 1262 | "Writing 1,635 to C:\\Users\\dciborow\\Source\\Repos\\az-ml-realtime-score\\notebooks/data_folder\\dupes_test.tsv\n" 1263 | ] 1264 | } 1265 | ], 1266 | "source": [ 1267 | "os.makedirs(outputs_path, exist_ok=True)\n", 1268 | "\n", 1269 | "# Save the data.\n", 1270 | "balanced_pairs_train_path = os.path.join(outputs_path, \"balanced_pairs_train.tsv\")\n", 1271 | "print(\n", 1272 | " \"Writing {:,} to {}\".format(\n", 1273 | " balanced_pairs_train.shape[0], balanced_pairs_train_path\n", 1274 | " )\n", 1275 | ")\n", 1276 | "balanced_pairs_train.to_csv(\n", 
1277 | " balanced_pairs_train_path, sep=\"\\t\", header=True, index=False\n", 1278 | ")\n", 1279 | "\n", 1280 | "balanced_pairs_test_path = os.path.join(outputs_path, \"balanced_pairs_test.tsv\")\n", 1281 | "print(\n", 1282 | " \"Writing {:,} to {}\".format(balanced_pairs_test.shape[0], balanced_pairs_test_path)\n", 1283 | ")\n", 1284 | "balanced_pairs_test.to_csv(balanced_pairs_test_path, sep=\"\\t\", header=True, index=False)\n", 1285 | "\n", 1286 | "# Save original questions to be used for scoring later.\n", 1287 | "questions_path = os.path.join(outputs_path, \"questions.tsv\")\n", 1288 | "print(\"Writing {:,} to {}\".format(questions.shape[0], questions_path))\n", 1289 | "questions.to_csv(questions_path, sep=\"\\t\", header=True, index=False)\n", 1290 | "\n", 1291 | "# Save the test duplicate questions to be used with the scoring function.\n", 1292 | "dupes_test_path = os.path.join(outputs_path, \"dupes_test.tsv\")\n", 1293 | "print(\"Writing {:,} to {}\".format(dupes_test.shape[0], dupes_test_path))\n", 1294 | "dupes_test.to_csv(dupes_test_path, sep=\"\\t\", header=True, index=False)" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "markdown", 1299 | "metadata": {}, 1300 | "source": [ 1301 | "We can now move on to [train on local](02_TrainOnLocal.ipynb) notebook to train our model using Azure Machine \n", 1302 | "Learning." 
1303 | ] 1304 | } 1305 | ], 1306 | "metadata": { 1307 | "kernelspec": { 1308 | "display_name": "az-ml-realtime-score", 1309 | "language": "python", 1310 | "name": "az-ml-realtime-score" 1311 | }, 1312 | "language_info": { 1313 | "codemirror_mode": { 1314 | "name": "ipython", 1315 | "version": 3 1316 | }, 1317 | "file_extension": ".py", 1318 | "mimetype": "text/x-python", 1319 | "name": "python", 1320 | "nbconvert_exporter": "python", 1321 | "pygments_lexer": "ipython3", 1322 | "version": "3.6.2" 1323 | }, 1324 | "pycharm": { 1325 | "stem_cell": { 1326 | "cell_type": "raw", 1327 | "source": [], 1328 | "metadata": { 1329 | "collapsed": false 1330 | } 1331 | } 1332 | } 1333 | }, 1334 | "nbformat": 4, 1335 | "nbformat_minor": 2 1336 | } -------------------------------------------------------------------------------- /notebooks/02_TrainOnLocal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 7 | "\n", 8 | "Licensed under the MIT License.\n", 9 | "\n", 10 | "# Train Locally\n", 11 | "In this notebook, you will perform the following using Azure Machine Learning.\n", 12 | "* Load workspace.\n", 13 | "* Configure & execute a local run in a user-managed Python environment.\n", 14 | "* Configure & execute a local run in a system-managed Python environment.\n", 15 | "* Configure & execute a local run in a Docker environment.\n", 16 | "* Register model for operationalization." 
17 | ], 18 | "metadata": { 19 | "collapsed": false 20 | } 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 19, 25 | "outputs": [], 26 | "source": [ 27 | "import os\n", 28 | "\n", 29 | "from azure_utils.machine_learning.utils import get_workspace_from_config\n", 30 | "from azureml.core import Experiment\n", 31 | "from azureml.core import ScriptRunConfig\n", 32 | "from azureml.core.runconfig import RunConfiguration\n", 33 | "\n", 34 | "from notebooks import directory" 35 | ], 36 | "metadata": { 37 | "collapsed": false, 38 | "pycharm": { 39 | "name": "#%%\n", 40 | "is_executing": false 41 | } 42 | } 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "source": [ 47 | "## Initialize Model Hyperparameters\n", 48 | "\n", 49 | "This notebook uses a training script that uses \n", 50 | "[lightgbm](https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api). \n", 51 | "Here we set the number of estimators. " 52 | ], 53 | "metadata": { 54 | "collapsed": false 55 | } 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 20, 60 | "outputs": [], 61 | "source": [ 62 | "num_estimators = \"10\"" 63 | ], 64 | "metadata": { 65 | "collapsed": false, 66 | "pycharm": { 67 | "name": "#%%\n", 68 | "is_executing": false 69 | } 70 | } 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "source": [ 75 | "## Initialize Workspace\n", 76 | "\n", 77 | "Initialize a workspace object from persisted configuration file." 
78 | ], 79 | "metadata": { 80 | "collapsed": false 81 | } 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 21, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "text": [ 90 | "dciborowlapws\n", 91 | "dciborow-lap-test\n", 92 | "westus\n" 93 | ], 94 | "output_type": "stream" 95 | } 96 | ], 97 | "source": [ 98 | "ws = get_workspace_from_config()\n", 99 | "print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")" 100 | ], 101 | "metadata": { 102 | "collapsed": false, 103 | "pycharm": { 104 | "name": "#%%\n", 105 | "is_executing": false 106 | } 107 | } 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "## Create An Experiment\n", 113 | "**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics \n", 114 | "and output artifacts from your experiments." 115 | ], 116 | "metadata": { 117 | "collapsed": false 118 | } 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 22, 123 | "outputs": [], 124 | "source": [ 125 | "experiment_name = \"mlaks-train-on-local\"\n", 126 | "exp = Experiment(workspace=ws, name=experiment_name)" 127 | ], 128 | "metadata": { 129 | "collapsed": false, 130 | "pycharm": { 131 | "name": "#%%\n", 132 | "is_executing": false 133 | } 134 | } 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "source": [ 139 | "## Configure & Run\n", 140 | "\n", 141 | "In this section, we show three different ways of locally training your model through Azure ML SDK for demonstration \n", 142 | "purposes. Only one of these runs is sufficient to register the model.\n", 143 | "\n", 144 | "\n", 145 | "### User-managed environment\n", 146 | "Below, we use a user-managed run, which means you are responsible to ensure all the necessary packages that are \n", 147 | "available in the Python environment you choose to run the script. We will use the environment created for this \n", 148 | "tutorial which has Azure ML SDK and other dependencies installed." 
149 | ], 150 | "metadata": { 151 | "collapsed": false 152 | } 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 23, 157 | "outputs": [], 158 | "source": [ 159 | "# Editing a run configuration property on-fly.\n", 160 | "run_config_user_managed = RunConfiguration()\n", 161 | "\n", 162 | "run_config_user_managed.environment.python.user_managed_dependencies = True\n", 163 | "\n", 164 | "# Choose the specific Python environment of this tutorial by pointing to the Python path\n", 165 | "run_config_user_managed.environment.python.interpreter_path = \"/anaconda/envs/az-ml-realtime-score/bin/python\"\n" 166 | ], 167 | "metadata": { 168 | "collapsed": false, 169 | "pycharm": { 170 | "name": "#%%\n", 171 | "is_executing": false 172 | } 173 | } 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "source": [ 178 | "#### Submit script to run in the user-managed environment\n", 179 | "Note that the whole `scripts` folder is submitted for execution, including the `item_selector.py` and `label_rank.py` \n", 180 | "files. The model will be written to `outputs` directory which is a special directory such that all content in this \n", 181 | "directory is automatically uploaded to your workspace. 
" 182 | ], 183 | "metadata": { 184 | "collapsed": false 185 | } 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 24, 190 | "outputs": [], 191 | "source": [ 192 | "if not os.path.isdir(\"script\"):\n", 193 | " os.mkdir(\"script\")" 194 | ], 195 | "metadata": { 196 | "collapsed": false, 197 | "pycharm": { 198 | "name": "#%%\n", 199 | "is_executing": false 200 | } 201 | } 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 25, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "text": [ 210 | "Overwriting script/create_model.py\n" 211 | ], 212 | "output_type": "stream" 213 | } 214 | ], 215 | "source": [ 216 | "%%writefile script/create_model.py\n", 217 | "from azure_utils.machine_learning import create_model\n", 218 | "\n", 219 | "if __name__ == '__main__':\n", 220 | " create_model.main()\n" 221 | ], 222 | "metadata": { 223 | "collapsed": false, 224 | "pycharm": { 225 | "name": "#%%\n", 226 | "is_executing": false 227 | } 228 | } 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 26, 233 | "outputs": [], 234 | "source": [ 235 | "script = \"create_model.py\"\n", 236 | "args = [\n", 237 | " \"--inputs\",\n", 238 | " os.path.abspath(directory + \"/data_folder\"),\n", 239 | " \"--outputs\",\n", 240 | " \"outputs\",\n", 241 | " \"--estimators\",\n", 242 | " num_estimators,\n", 243 | " \"--match\",\n", 244 | " \"5\",\n", 245 | "]" 246 | ], 247 | "metadata": { 248 | "collapsed": false, 249 | "pycharm": { 250 | "name": "#%%\n", 251 | "is_executing": false 252 | } 253 | } 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 27, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "text": [ 262 | "RunId: mlaks-train-on-local_1582779167_afeaba3b\n", 263 | "Web View: https://ml.azure.com/experiments/mlaks-train-on-local/runs/mlaks-train-on-local_1582779167_afeaba3b?wsid=/subscriptions/0ca618d2-22a8-413a-96d0-0f1b531129c3/resourcegroups/dciborow-lap-test/workspaces/dciborowlapws\n", 264 | "\n", 265 | 
"Streaming azureml-logs/60_control_log.txt\n", 266 | "=========================================\n", 267 | "\n", 268 | "Streaming log file azureml-logs/60_control_log.txt\n", 269 | "Running: ['cmd.exe', '/c', 'C:\\\\Users\\\\dciborow\\\\AppData\\\\Local\\\\Temp\\\\azureml_runs\\\\mlaks-train-on-local_1582779167_afeaba3b\\\\azureml-environment-setup/conda_env_checker.bat']\n", 270 | "\n", 271 | "Streaming azureml-logs/70_driver_log.txt\n", 272 | "========================================\n", 273 | "\n", 274 | "Starting the daemon thread to refresh tokens in background for process with pid = 13488\n", 275 | "Entering Run History Context Manager.\n", 276 | "Preparing to call script [ create_model.py ] with arguments: ['--inputs', 'C:\\\\Users\\\\dciborow\\\\Source\\\\Repos\\\\az-ml-realtime-score\\\\notebooks\\\\data_folder', '--outputs', 'outputs', '--estimators', '10', '--match', '5']\n", 277 | "After variable expansion, calling script [ create_model.py ] with arguments: ['--inputs', 'C:\\\\Users\\\\dciborow\\\\Source\\\\Repos\\\\az-ml-realtime-score\\\\notebooks\\\\data_folder', '--outputs', 'outputs', '--estimators', '10', '--match', '5']\n", 278 | "\n", 279 | "C:\\Users\\dciborow\\.conda\\envs\\az-ml-realtime-score\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. 
If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", 280 | " warnings.warn(msg, category=FutureWarning)\n", 281 | "Reading C:\\Users\\dciborow\\Source\\Repos\\az-ml-realtime-score\\notebooks\\data_folder\\balanced_pairs_train.tsv\n", 282 | "train: 33,125 rows with 20.00% matches\n", 283 | "Training...\n", 284 | "outputs\\model.pkl size: 1.46 MB\n", 285 | "Reading C:\\Users\\dciborow\\Source\\Repos\\az-ml-realtime-score\\notebooks\\data_folder\\balanced_pairs_test.tsv\n", 286 | "test: 297,570 rows with 0.55% matches\n", 287 | "Testing...\n", 288 | "Accuracy @1 = 0.00%\n", 289 | "Accuracy @2 = 24.04%\n", 290 | "Accuracy @3 = 34.74%\n", 291 | "Mean Rank 29.5104\n", 292 | "\n", 293 | "\n", 294 | "The experiment completed successfully. Finalizing run...\n", 295 | "Logging experiment finalizing status in history service.\n", 296 | "Starting the daemon thread to refresh tokens in background for process with pid = 13488\n", 297 | "Cleaning up all outstanding Run operations, waiting 300.0 seconds\n", 298 | "2 items cleaning up...\n", 299 | "Cleanup took 0.005007028579711914 seconds\n", 300 | "\n", 301 | "Execution Summary\n", 302 | "=================\n", 303 | "RunId: mlaks-train-on-local_1582779167_afeaba3b\n", 304 | "Web View: https://ml.azure.com/experiments/mlaks-train-on-local/runs/mlaks-train-on-local_1582779167_afeaba3b?wsid=/subscriptions/0ca618d2-22a8-413a-96d0-0f1b531129c3/resourcegroups/dciborow-lap-test/workspaces/dciborowlapws\n", 305 | "\n" 306 | ], 307 | "output_type": "stream" 308 | }, 309 | { 310 | "data": { 311 | "text/plain": "{'Accuracy @1': 0.0,\n 'Accuracy @2': 0.24036697247706423,\n 'Accuracy @3': 0.3474006116207951,\n 'Mean Rank': 29.51039755351682}" 312 | }, 313 | "metadata": {}, 314 | "output_type": "execute_result", 315 | "execution_count": 27 316 | } 317 | ], 318 | "source": [ 319 | "src = ScriptRunConfig(\n", 320 | " source_directory=\"./script\",\n", 321 | " 
script=script,\n", 322 | " arguments=args,\n", 323 | " run_config=run_config_user_managed,\n", 324 | ")\n", 325 | "\n", 326 | "run = exp.submit(src) \n", 327 | "run.wait_for_completion(show_output=True)\n", 328 | "run.get_file_names()\n", 329 | "run.get_metrics()" 330 | ], 331 | "metadata": { 332 | "collapsed": false, 333 | "pycharm": { 334 | "name": "#%%\n", 335 | "is_executing": false 336 | } 337 | } 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "source": [ 342 | "## Register Model" 343 | ], 344 | "metadata": { 345 | "collapsed": false 346 | } 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 28, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": "{'Accuracy @1': 0.0,\n 'Accuracy @2': 0.24036697247706423,\n 'Accuracy @3': 0.3474006116207951,\n 'Mean Rank': 29.51039755351682}" 355 | }, 356 | "metadata": {}, 357 | "output_type": "execute_result", 358 | "execution_count": 28 359 | } 360 | ], 361 | "source": [ 362 | "run.get_metrics()" 363 | ], 364 | "metadata": { 365 | "collapsed": false, 366 | "pycharm": { 367 | "name": "#%%\n", 368 | "is_executing": false 369 | } 370 | } 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "source": [ 375 | "## Register Model\n", 376 | "\n", 377 | "We now register the model with the workspace so that we can later deploy the model." 
378 | ], 379 | "metadata": { 380 | "collapsed": false 381 | } 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 29, 386 | "outputs": [], 387 | "source": [ 388 | "# supply a model name, and the full path to the serialized model file.\n", 389 | "model = run.register_model(model_name=\"question_match_model\", model_path=\"./outputs/model.pkl\")" 390 | ], 391 | "metadata": { 392 | "collapsed": false, 393 | "pycharm": { 394 | "name": "#%%\n", 395 | "is_executing": false 396 | } 397 | } 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 30, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "text": [ 406 | "question_match_model\n", 407 | "1\n", 408 | "aml://asset/b990ad8f9702428c9e978a0cd470ed8f\n" 409 | ], 410 | "output_type": "stream" 411 | } 412 | ], 413 | "source": [ 414 | "print(model.name, model.version, model.url, sep=\"\\n\")" 415 | ], 416 | "metadata": { 417 | "collapsed": false, 418 | "pycharm": { 419 | "name": "#%%\n", 420 | "is_executing": false 421 | } 422 | } 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "source": [ 427 | "We can now move on to [Develop Scoring Script](03_DevelopScoringScript.ipynb) notebook to develop the scoring script\n", 428 | "using Azure Machine Learning."
429 | ], 430 | "metadata": { 431 | "collapsed": false 432 | } 433 | } 434 | ], 435 | "metadata": { 436 | "authors": [ 437 | { 438 | "name": "roastala" 439 | } 440 | ], 441 | "kernelspec": { 442 | "display_name": "az-ml-realtime-score", 443 | "language": "python", 444 | "name": "az-ml-realtime-score" 445 | }, 446 | "language_info": { 447 | "codemirror_mode": { 448 | "name": "ipython", 449 | "version": 3 450 | }, 451 | "file_extension": ".py", 452 | "mimetype": "text/x-python", 453 | "name": "python", 454 | "nbconvert_exporter": "python", 455 | "pygments_lexer": "ipython3", 456 | "version": "3.6.2" 457 | }, 458 | "pycharm": { 459 | "stem_cell": { 460 | "cell_type": "raw", 461 | "source": [], 462 | "metadata": { 463 | "collapsed": false 464 | } 465 | } 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 2 470 | } -------------------------------------------------------------------------------- /notebooks/03_DevelopScoringScript.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License.\n", 10 | "\n", 11 | "# Develop Scoring Script\n", 12 | "\n", 13 | "In this notebook, we will develop the scoring script and test it locally. We will use the scoring script to create the \n", 14 | "web service that will call the model for scoring." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import sys\n", 24 | "\n", 25 | "from azure_utils.machine_learning.utils import get_workspace_from_config\n", 26 | "from azureml.core.model import Model" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "sys.path.append('./scripts/')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Let's load the workspace." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "ws = get_workspace_from_config()\n", 52 | "print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Let's retrieve the model registered earlier and download it." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "model_name = 'question_match_model'\n", 69 | "\n", 70 | "model = Model(ws, name=model_name)\n", 71 | "print(model.name, model.version, model.url, sep=\"\\n\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "model.download(target_dir=\".\", exist_ok=True)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Create Scoring Script" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "We use the writefile magic to write the contents of the below cell to `score.py` which includes the `init` and `run` \n", 95 | "functions required by AML.\n", 96 | "- The init() function typically loads the model into a global object.\n", 97 | "- The run(input_data) function uses the model to 
predict a value based on the input_data." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "%%writefile score.py\n", 107 | "\n", 108 | "import json\n", 109 | "import logging\n", 110 | "import timeit as t\n", 111 | "from azure_utils.machine_learning.duplicate_model import DuplicateModel\n", 112 | "\n", 113 | "def init():\n", 114 | " logger = logging.getLogger(\"scoring_script\")\n", 115 | " global model\n", 116 | " model_path = \"model.pkl\"\n", 117 | " questions_path = \"./data_folder/questions.tsv\"\n", 118 | " start = t.default_timer()\n", 119 | " model = DuplicateModel(model_path, questions_path)\n", 120 | " end = t.default_timer()\n", 121 | " load_time_msg = \"Model loading time: {0} ms\".format(\n", 122 | " round((end - start) * 1000, 2))\n", 123 | " logger.info(load_time_msg)\n", 124 | "\n", 125 | "\n", 126 | "def run(body):\n", 127 | " logger = logging.getLogger(\"scoring_script\")\n", 128 | " json_load_text = json.loads(body)\n", 129 | " text_to_score = json_load_text[\"input\"]\n", 130 | " start = t.default_timer()\n", 131 | " resp = model.score(text_to_score)\n", 132 | " end = t.default_timer()\n", 133 | " logger.info(\"Prediction took {0} ms\".format(round((end - start) * 1000,\n", 134 | " 2)))\n", 135 | " return json.dumps(resp)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "pycharm": { 142 | "name": "#%% md\n" 143 | } 144 | }, 145 | "source": [ 146 | "We can now move on to [Create Image](04_CreateImage.ipynb) notebook to train our model using \n", 147 | "Azure Machine Learning.\n" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "az-ml-realtime-score", 154 | "language": "python", 155 | "name": "az-ml-realtime-score" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": 
"text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.6.2" 168 | }, 169 | "pycharm": { 170 | "stem_cell": { 171 | "cell_type": "raw", 172 | "source": [], 173 | "metadata": { 174 | "collapsed": false 175 | } 176 | } 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 2 181 | } -------------------------------------------------------------------------------- /notebooks/04_CreateImage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License.\n", 10 | "\n", 11 | "# Create Image\n", 12 | "In this notebook, we show the following steps for deploying a web service using AzureML:\n", 13 | "- Create an image\n", 14 | "- Test image locally" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from azure_utils.machine_learning.realtime.image import get_or_create_image, lightgbm_test_image_locally, get_model\n", 24 | "from notebooks import directory" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "pycharm": { 31 | "name": "#%% md\n" 32 | } 33 | }, 34 | "source": [ 35 | "AML will use the following information to create an image, provision a cluster and deploy a service. Replace the \n", 36 | "values in the following cell with your information." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Create an image\n", 44 | "We will now modify the `score.py` created in the previous notebook for the `init()` function to use the model we \n", 45 | "registered to the workspace earlier." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "%%writefile score.py\n", 55 | "\n", 56 | "import sys\n", 57 | "import json\n", 58 | "import logging\n", 59 | "import timeit as t\n", 60 | "from azureml.core.model import Model\n", 61 | "from azureml.contrib.services.aml_request import rawhttp\n", 62 | "from azureml.contrib.services.aml_response import AMLResponse\n", 63 | "from azure_utils.machine_learning.duplicate_model import DuplicateModel\n", 64 | "\n", 65 | "sys.path.append('./scripts/')\n", 66 | "\n", 67 | "\n", 68 | "def init():\n", 69 | " logger = logging.getLogger(\"scoring_script\")\n", 70 | " global model\n", 71 | " model_name = 'question_match_model'\n", 72 | " model_path = Model.get_model_path(model_name)\n", 73 | " questions_path = './notebooks/data_folder/questions.tsv'\n", 74 | " start = t.default_timer()\n", 75 | " model = DuplicateModel(model_path, questions_path)\n", 76 | " end = t.default_timer()\n", 77 | " load_time_msg = \"Model loading time: {0} ms\".format(\n", 78 | " round((end - start) * 1000, 2))\n", 79 | " logger.info(load_time_msg)\n", 80 | "\n", 81 | "\n", 82 | "@rawhttp\n", 83 | "def run(request):\n", 84 | " \"\"\"\n", 85 | " Function runs on each request\n", 86 | " \"\"\"\n", 87 | " body = request.data\n", 88 | " if request.method == 'POST':\n", 89 | " logger = logging.getLogger(\"scoring_script\")\n", 90 | " json_load_text = json.loads(body)\n", 91 | " text_to_score = json_load_text['input']\n", 92 | " start = t.default_timer()\n", 93 | " resp = model.score(text_to_score)\n", 94 | " end = t.default_timer()\n", 95 | " logger.info(\"Prediction took {0} ms\".format(\n", 96 | " round((end - start) * 1000, 2)))\n", 97 | " return json.dumps(resp)\n", 98 | " if request.method == 'GET':\n", 99 | " resp_body = {\n", 100 | " \"azEnvironment\": \"Azure\",\n", 101 | " \"location\": \"westus2\",\n", 102 | " \"osType\": \"Ubuntu 16.04\",\n", 103 | " 
\"resourceGroupName\": \"\",\n", 104 | " \"resourceId\": \"\",\n", 105 | " \"sku\": \"\",\n", 106 | " \"subscriptionId\": \"\",\n", 107 | " \"uniqueId\": \"PythonMLRST\",\n", 108 | " \"vmSize\": \"\",\n", 109 | " \"zone\": \"\",\n", 110 | " \"isServer\": False,\n", 111 | " \"version\": \"\"\n", 112 | " }\n", 113 | " return resp_body\n", 114 | " return AMLResponse(\"bad request\", 500)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Let's specify the conda and pip dependencies for the image." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "dependencies = [\"./notebooks/data_folder/questions.tsv\"]\n", 131 | "models = [get_model(model_name='question_match_model')]\n", 132 | "\n", 133 | "image = get_or_create_image(models=models, dependencies=dependencies)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Test image locally" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Now, let's use one of the duplicate questions to test our image." 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "lightgbm_test_image_locally(image, directory)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "pycharm": { 163 | "name": "#%% md\n" 164 | } 165 | }, 166 | "source": [ 167 | "## Conclusion\n", 168 | "\n", 169 | "We can now move on to [Deploy on Azure Kubernetes Service](05_DeployOnAKS.ipynb) notebook. 
" 170 | ] 171 | } 172 | ], 173 | "metadata": { 174 | "authors": [ 175 | { 176 | "name": "raymondl" 177 | } 178 | ], 179 | "kernelspec": { 180 | "display_name": "az-ml-realtime-score", 181 | "language": "python", 182 | "name": "az-ml-realtime-score" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.6.2" 195 | }, 196 | "pycharm": { 197 | "stem_cell": { 198 | "cell_type": "raw", 199 | "source": [], 200 | "metadata": { 201 | "collapsed": false 202 | } 203 | } 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 2 208 | } -------------------------------------------------------------------------------- /notebooks/05_DeployOnAKS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 7 | "\n", 8 | "Licensed under the MIT License." 
9 | ], 10 | "metadata": { 11 | "collapsed": false 12 | } 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "source": [ 17 | "# Deploying a web service to Azure Kubernetes Service (AKS)\n", 18 | "In this notebook, we show the following steps for deploying a web service using AzureML:\n", 19 | "- Provision an AKS cluster (one time action)\n", 20 | "- Deploy the service\n", 21 | "- Test the web service\n", 22 | "- Scale up the service" 23 | ], 24 | "metadata": { 25 | "collapsed": false 26 | } 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "outputs": [], 32 | "source": [ 33 | "from azure_utils.machine_learning.realtime.kubernetes import get_or_create_aks_service" 34 | ], 35 | "metadata": { 36 | "collapsed": false, 37 | "pycharm": { 38 | "name": "#%%\n" 39 | } 40 | } 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "AML will use the following information to create an image, provision a cluster and deploy a service. Replace the \n", 46 | "values in the following cell with your information." 47 | ], 48 | "metadata": { 49 | "collapsed": false 50 | } 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "outputs": [], 56 | "source": [ 57 | "aks_service = get_or_create_aks_service()\n" 58 | ], 59 | "metadata": { 60 | "collapsed": false, 61 | "pycharm": { 62 | "name": "#%%\n" 63 | } 64 | } 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "source": [ 69 | "Next, we will test the [throughput of the web service](06_SpeedTestWebApp.ipynb)." 
70 | ], 71 | "metadata": { 72 | "collapsed": false 73 | } 74 | } 75 | ], 76 | "metadata": { 77 | "authors": [ 78 | { 79 | "name": "raymondl" 80 | } 81 | ], 82 | "celltoolbar": "Tags", 83 | "kernelspec": { 84 | "display_name": "az-ml-realtime-score", 85 | "language": "python", 86 | "name": "az-ml-realtime-score" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": { 90 | "name": "ipython", 91 | "version": 3 92 | }, 93 | "file_extension": ".py", 94 | "mimetype": "text/x-python", 95 | "name": "python", 96 | "nbconvert_exporter": "python", 97 | "pygments_lexer": "ipython3", 98 | "version": "3.6.2" 99 | }, 100 | "pycharm": { 101 | "stem_cell": { 102 | "cell_type": "raw", 103 | "source": [], 104 | "metadata": { 105 | "collapsed": false 106 | } 107 | } 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } -------------------------------------------------------------------------------- /notebooks/06_SpeedTestWebApp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load Test deployed web application" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook pulls some test questions and scores them against the deployed web application. We submit requests asynchronously \n", 15 | "which should reduce the contribution of latency."
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from urllib.parse import urlparse\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from azure_utils.configuration.project_configuration import ProjectConfiguration\n", 28 | "from azure_utils.machine_learning.utils import get_workspace_from_config\n", 29 | "from azureml.core.webservice import AksWebservice" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "ws = get_workspace_from_config()\n", 39 | "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep=\"\\n\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Let's retrieve the web service." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "project_configuration = ProjectConfiguration(\"project.yml\")\n", 56 | "aks_service_name = project_configuration.get_settings('aks_service_name')\n", 57 | "aks_service = AksWebservice(ws, name=aks_service_name)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "We will test our service concurrently but only have 4 concurrent requests at any time. We have only deployed one pod \n", 65 | "on one node and increasing the number of concurrent calls does not really increase throughput. Feel free to try \n", 66 | "different values and see how the service responds." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "tags": [ 74 | "parameters" 75 | ] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "concurrent_requests = 4 # Number of requests at a time" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Get the scoring URL and API key of the service." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "scoring_url = aks_service.scoring_uri\n", 96 | "api_key = aks_service.get_keys()[0]" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Below we are going to use [Locust](https://locust.io/) to load test our deployed model. First we need to write the \n", 104 | "locustfile." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "%%writefile locustfile.py\n", 114 | "from locust import HttpLocust, TaskSet, task\n", 115 | "import os\n", 116 | "import pandas as pd\n", 117 | "from utilities import text_to_json\n", 118 | "from itertools import cycle\n", 119 | "\n", 120 | "_NUMBER_OF_REQUESTS = os.getenv('NUMBER_OF_REQUESTS', 100)\n", 121 | "dupes_test_path = './data_folder/dupes_test.tsv'\n", 122 | "dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n", 123 | "dupes_to_score = dupes_test.iloc[:_NUMBER_OF_REQUESTS, 4]\n", 124 | "_SCORE_PATH = os.getenv('SCORE_PATH', \"/score\")\n", 125 | "_API_KEY = os.getenv('API_KEY')\n", 126 | "\n", 127 | "\n", 128 | "class UserBehavior(TaskSet):\n", 129 | " def on_start(self):\n", 130 | " print('Running setup')\n", 131 | " self._text_generator = cycle(dupes_to_score.apply(text_to_json))\n", 132 | " self._headers = {\n", 133 | " \"content-type\": \"application/json\",\n", 134 | " 'Authorization': ('Bearer {}'.format(_API_KEY))\n", 135 | " }\n", 136 | "\n", 137 | " @task\n", 138 | " def score(self):\n", 139 | " self.client.post(_SCORE_PATH,\n", 140 | " data=next(self._text_generator),\n", 141 | " headers=self._headers)\n", 142 | "\n", 143 | "\n", 144 | "class WebsiteUser(HttpLocust):\n", 145 | " task_set = UserBehavior\n", 146 | " # min and max time to wait before repeating task\n", 147 | " min_wait = 10\n", 148 | " max_wait = 200" 
149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "Below we define the locust command we want to run. We are going to run at a hatch rate of 10 and the whole test will \n", 156 | "last 1 minute. Feel free to adjust the parameters below and see how the results differ. The results of the test will \n", 157 | "be saved to two csv files **modeltest_requests.csv** and **modeltest_distribution.csv**" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "parsed_url = urlparse(scoring_url)\n", 167 | "cmd = \"locust -H {host} --no-web -c {users} -r {rate} -t {duration} --csv=modeltest --only-summary\".format(\n", 168 | "    host=\"{url.scheme}://{url.netloc}\".format(url=parsed_url),\n", 169 | "    users=concurrent_requests,  # concurrent users\n", 170 | "    rate=10,  # hatch rate (users / second)\n", 171 | "    duration='1m',  # test duration\n", 172 | ")" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "scrolled": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "! API_KEY={api_key} SCORE_PATH={parsed_url.path} PYTHONPATH={os.path.abspath('../')} {cmd}" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Here are the summary results of our test and below that the distribution information of those tests. 
" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "pd.read_csv(\"modeltest_requests.csv\")" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "pd.read_csv(\"modeltest_distribution.csv\")" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "To tear down the cluster and all related resources go to the [tear down the cluster](07_TearDown.ipynb) notebook." 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "jupytext": { 221 | "formats": "ipynb" 222 | }, 223 | "kernelspec": { 224 | "display_name": "az-ml-realtime-score", 225 | "language": "python", 226 | "name": "az-ml-realtime-score" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.2" 239 | }, 240 | "pycharm": { 241 | "stem_cell": { 242 | "cell_type": "raw", 243 | "source": [], 244 | "metadata": { 245 | "collapsed": false 246 | } 247 | } 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 2 252 | } -------------------------------------------------------------------------------- /notebooks/07_RealTimeScoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 9 | "\n", 10 | "Licensed under the MIT License." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "extensions": { 17 | "jupyter_dashboards": { 18 | "version": 1, 19 | "views": { 20 | "grid_default": {}, 21 | "report_default": { 22 | "hidden": false 23 | } 24 | } 25 | } 26 | } 27 | }, 28 | "source": [ 29 | "# Explore Duplicate Question Matches\n", 30 | "Use this dashboard to explore the relationship between duplicate and original questions." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "extensions": { 37 | "jupyter_dashboards": { 38 | "version": 1, 39 | "views": { 40 | "grid_default": {}, 41 | "report_default": { 42 | "hidden": true 43 | } 44 | } 45 | } 46 | } 47 | }, 48 | "source": [ 49 | "## Setup\n", 50 | "This section loads needed packages, and defines useful functions." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "extensions": { 58 | "jupyter_dashboards": { 59 | "version": 1, 60 | "views": { 61 | "grid_default": {}, 62 | "report_default": { 63 | "hidden": true 64 | } 65 | } 66 | } 67 | } 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "from __future__ import print_function\n", 72 | "\n", 73 | "import math\n", 74 | "\n", 75 | "import ipywidgets as widgets\n", 76 | "import pandas as pd\n", 77 | "import requests\n", 78 | "from azure_utils.configuration.project_configuration import ProjectConfiguration\n", 79 | "from azure_utils.machine_learning.utils import get_workspace_from_config\n", 80 | "from azure_utils.utilities import read_questions, text_to_json\n", 81 | "from azureml.core.webservice import AksWebservice\n", 82 | "\n", 83 | "from notebooks import directory" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "ws = get_workspace_from_config()\n", 93 | "print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | 
"metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "project_configuration = ProjectConfiguration(\"project.yml\")\n", 103 | "aks_service_name = project_configuration.get_settings('aks_service_name')\n", 104 | "aks_service = AksWebservice(ws, name=aks_service_name)\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Load the duplicate questions scoring app's URL." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "scoring_url = aks_service.scoring_uri\n", 121 | "api_key = aks_service.get_keys()[0]" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "extensions": { 128 | "jupyter_dashboards": { 129 | "version": 1, 130 | "views": { 131 | "grid_default": {}, 132 | "report_default": { 133 | "hidden": true 134 | } 135 | } 136 | } 137 | } 138 | }, 139 | "source": [ 140 | "A constructor function for ID-text contents. Constructs buttons and text areas for each text ID and text passage.\n", 141 | "* Each buttons's description is set to a text's ID, and its click action is set to the handler.\n", 142 | "* Each text area's content is set to a text.\n", 143 | "* A dictionary is created to map IDs to text areas." 
144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "extensions": { 151 | "jupyter_dashboards": { 152 | "version": 1, 153 | "views": { 154 | "grid_default": {}, 155 | "report_default": { 156 | "hidden": true 157 | } 158 | } 159 | } 160 | } 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "def buttons_and_texts(data,\n", 165 | " text_id,\n", 166 | " answerid,\n", 167 | " text,\n", 168 | " handle_click,\n", 169 | " layout=widgets.Layout(width=\"100%\"),\n", 170 | " num=15):\n", 171 | " \"\"\"Construct buttons, text areas, and a mapping from IDs to text areas.\"\"\"\n", 172 | " items = []\n", 173 | " text_map = {}\n", 174 | " for i in range(min(num, len(data))):\n", 175 | " button = widgets.Button(description=data.iloc[i][text_id])\n", 176 | " button.answerid = data.iloc[i][answerid] if answerid in data else None\n", 177 | " button.open = False\n", 178 | " button.on_click(handle_click)\n", 179 | " items.append(button)\n", 180 | " text_area = widgets.Textarea(data.iloc[i][text],\n", 181 | " placeholder=data.iloc[i][id],\n", 182 | " layout=layout)\n", 183 | " items.append(text_area)\n", 184 | " text_map[data.iloc[i][id]] = text_area\n", 185 | " return items, text_map" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "extensions": { 192 | "jupyter_dashboards": { 193 | "version": 1, 194 | "views": { 195 | "grid_default": {}, 196 | "report_default": { 197 | "hidden": true 198 | } 199 | } 200 | } 201 | } 202 | }, 203 | "source": [ 204 | "A constructor function for the duplicates and questions explorer widget. This builds a box containing duplicates and \n", 205 | "question tabs, each in turn containing boxes that contain the buttons and text areas." 
206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "extensions": { 213 | "jupyter_dashboards": { 214 | "version": 1, 215 | "views": { 216 | "grid_default": {}, 217 | "report_default": { 218 | "hidden": true 219 | } 220 | } 221 | } 222 | } 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "def duplicates_questions_widget( layout=widgets.Layout(width=\"100%\")):\n", 227 | " \"\"\"Construct a duplicates and questions exploration widget.\"\"\"\n", 228 | " # Construct the duplicates Tab of buttons and text areas.\n", 229 | " duplicates_items, duplicates_map_inner = buttons_and_texts(\n", 230 | " duplicates,\n", 231 | " duplicates_id,\n", 232 | " duplicates_answerid,\n", 233 | " duplicates_click\n", 234 | " )\n", 235 | " duplicates_tab = widgets.Tab(\n", 236 | " [widgets.VBox(duplicates_items, layout=layout)],\n", 237 | " layout=widgets.Layout(width=\"100%\", height=\"500px\", overflow_y=\"auto\"),\n", 238 | " )\n", 239 | " duplicates_tab.set_title(0, duplicates_title)\n", 240 | " # Construct the questions Tab of buttons and text areas.\n", 241 | " questions_items, questions_map_inner = buttons_and_texts(\n", 242 | " questions,\n", 243 | " questions_id,\n", 244 | " questions_answerid,\n", 245 | " questions_text,\n", 246 | " questions_click\n", 247 | " )\n", 248 | " questions_tab = widgets.Tab(\n", 249 | " [widgets.VBox(questions_items, layout=layout)],\n", 250 | " layout=widgets.Layout(width=\"100%\", height=\"500px\", overflow_y=\"auto\"),\n", 251 | " )\n", 252 | " questions_tab.set_title(0, questions_title)\n", 253 | " # Put both tabs in an HBox.\n", 254 | " duplicates_questions_inner = widgets.HBox([duplicates_tab, questions_tab], layout=layout)\n", 255 | " return duplicates_map_inner, questions_map_inner, duplicates_questions_inner" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": { 261 | "extensions": { 262 | "jupyter_dashboards": { 263 | "version": 1, 264 | "views": { 265 | 
"grid_default": {}, 266 | "report_default": { 267 | "hidden": true 268 | } 269 | } 270 | } 271 | } 272 | }, 273 | "source": [ 274 | "A handler function for a question passage button press. If the passage's text window is open, it is collapsed. \n", 275 | "Otherwise, it is opened." 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "extensions": { 283 | "jupyter_dashboards": { 284 | "version": 1, 285 | "views": { 286 | "grid_default": {}, 287 | "report_default": { 288 | "hidden": true 289 | } 290 | } 291 | } 292 | } 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "def questions_click(button):\n", 297 | " \"\"\"Respond to a click on a question button.\"\"\"\n", 298 | " global questions_map\n", 299 | " if button.open:\n", 300 | " questions_map[button.description].rows = None\n", 301 | " button.open = False\n", 302 | " else:\n", 303 | " questions_map[button.description].rows = 10\n", 304 | " button.open = True" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": { 310 | "extensions": { 311 | "jupyter_dashboards": { 312 | "version": 1, 313 | "views": { 314 | "grid_default": {}, 315 | "report_default": { 316 | "hidden": true 317 | } 318 | } 319 | } 320 | } 321 | }, 322 | "source": [ 323 | "A handler function for a duplicate question button press. If the duplicate is not selected, select it and update \n", 324 | "the questions tab with its top 15 question passages ordered by match score. Otherwise, if the duplicate's text window \n", 325 | "is open, it is collapsed, else it is opened." 
326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "extensions": { 333 | "jupyter_dashboards": { 334 | "version": 1, 335 | "views": { 336 | "grid_default": {}, 337 | "report_default": { 338 | "hidden": true 339 | } 340 | } 341 | } 342 | } 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "def duplicates_click(button):\n", 347 | " \"\"\"Respond to a click on a duplicate button.\"\"\"\n", 348 | " global duplicates_map\n", 349 | " if select_duplicate(button):\n", 350 | " duplicates_map[button.description].rows = 10\n", 351 | " button.open = True\n", 352 | " else:\n", 353 | " if button.open:\n", 354 | " duplicates_map[button.description].rows = None\n", 355 | " button.open = False\n", 356 | " else:\n", 357 | " duplicates_map[button.description].rows = 10\n", 358 | " button.open = True\n", 359 | "\n", 360 | "\n", 361 | "def select_duplicate(button):\n", 362 | " \"\"\"Update the displayed questions to correspond to the button's duplicate\n", 363 | " selections. 
Returns whether or not the selected duplicate changed.\n", 364 | " \"\"\"\n", 365 | " global selected_button, questions_map, duplicates_questions\n", 366 | " if \"selected_button\" not in globals() or button != selected_button:\n", 367 | " if \"selected_button\" in globals():\n", 368 | " selected_button.style.button_color = None\n", 369 | " selected_button.style.font_weight = \"\"\n", 370 | " selected_button = button\n", 371 | " selected_button.style.button_color = \"yellow\"\n", 372 | " selected_button.style.font_weight = \"bold\"\n", 373 | " duplicates_text = duplicates_map[selected_button.description].value\n", 374 | " questions_scores = score_text(duplicates_text)\n", 375 | " ordered_questions = questions.loc[questions_scores[questions_id]]\n", 376 | " questions_items, questions_map = buttons_and_texts(\n", 377 | " ordered_questions,\n", 378 | " questions_id,\n", 379 | " questions_answerid,\n", 380 | " questions_text,\n", 381 | " questions_click\n", 382 | " )\n", 383 | " if questions_button_color is True and selected_button.answerid is not None:\n", 384 | " set_button_color(questions_items[::2], selected_button.answerid)\n", 385 | " if questions_button_score is True:\n", 386 | " questions_items = [\n", 387 | " item for button, text_area in zip(*[iter(questions_items)] * 2)\n", 388 | " for item in (add_button_prob(button, questions_scores),\n", 389 | " text_area)\n", 390 | " ]\n", 391 | " duplicates_questions.children[1].children[0].children = questions_items\n", 392 | " duplicates_questions.children[1].set_title(0,\n", 393 | " selected_button.description)\n", 394 | " return True\n", 395 | " return False\n", 396 | "\n", 397 | "\n", 398 | "def add_button_prob(button, questions_scores):\n", 399 | " \"\"\"Return an HBox containing button and its probability.\"\"\"\n", 400 | " button_id = button.description\n", 401 | " prob = widgets.Label(score_label + \": \" + str(\n", 402 | " int(\n", 403 | " math.ceil(score_scale *\n", 404 | " 
questions_scores.loc[button_id][questions_probability]))))\n", 405 | " return widgets.HBox([button, prob])\n", 406 | "\n", 407 | "\n", 408 | "def set_button_color(button, answerid):\n", 409 | " \"\"\"Set each button's color according to its label.\"\"\"\n", 410 | " for a_button in button:\n", 411 | " a_button.style.button_color = (\"lightgreen\" if a_button.answerid == answerid else None)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Functions for interacting with the web service." 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "def score_text(text):\n", 428 | " \"\"\"Return a data frame with the original question scores for the text.\"\"\"\n", 429 | " headers = {\n", 430 | " \"content-type\": \"application/json\",\n", 431 | " \"Authorization\": (\"Bearer \" + api_key),\n", 432 | " }\n", 433 | " jsontext = text_to_json(text)\n", 434 | " result = requests.post(scoring_url, data=jsontext, headers=headers)\n", 435 | " scores = result.json()\n", 436 | " scores_df = pd.DataFrame(\n", 437 | " scores,\n", 438 | " columns=[questions_id, questions_answerid, questions_probability])\n", 439 | " scores_df[questions_id] = scores_df[questions_id].astype(str)\n", 440 | " scores_df[questions_answerid] = scores_df[questions_answerid].astype(str)\n", 441 | " scores_df = scores_df.set_index(questions_id, drop=False)\n", 442 | " return scores_df" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": { 448 | "extensions": { 449 | "jupyter_dashboards": { 450 | "version": 1, 451 | "views": { 452 | "grid_default": {}, 453 | "report_default": { 454 | "hidden": true 455 | } 456 | } 457 | } 458 | } 459 | }, 460 | "source": [ 461 | "Control the appearance of cell output boxes." 
462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "extensions": { 469 | "jupyter_dashboards": { 470 | "version": 1, 471 | "views": { 472 | "grid_default": {}, 473 | "report_default": { 474 | "hidden": true 475 | } 476 | } 477 | } 478 | } 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "%%html\n", 483 | "" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": { 498 | "extensions": { 499 | "jupyter_dashboards": { 500 | "version": 1, 501 | "views": { 502 | "grid_default": {}, 503 | "report_default": { 504 | "hidden": true 505 | } 506 | } 507 | } 508 | } 509 | }, 510 | "source": [ 511 | "## Load data\n", 512 | "\n", 513 | "Load the pre-formatted text of questions." 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "extensions": { 521 | "jupyter_dashboards": { 522 | "version": 1, 523 | "views": { 524 | "grid_default": {}, 525 | "report_default": { 526 | "hidden": true 527 | } 528 | } 529 | } 530 | } 531 | }, 532 | "outputs": [], 533 | "source": [ 534 | "questions_title = 'Questions'\n", 535 | "questions_id = 'Id'\n", 536 | "questions_answerid = 'AnswerId'\n", 537 | "questions_text = 'Text'\n", 538 | "questions_probability = 'Probability'\n", 539 | "questions_path = directory + '/data_folder/questions.tsv'\n", 540 | "questions = read_questions(questions_path, questions_id, questions_answerid)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": { 546 | "extensions": { 547 | "jupyter_dashboards": { 548 | "version": 1, 549 | "views": { 550 | "grid_default": {}, 551 | "report_default": { 552 | "hidden": true 553 | } 554 | } 555 | } 556 | } 557 | }, 558 | "source": [ 559 | "Load the pre-formatted text of duplicates." 
560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": { 566 | "extensions": { 567 | "jupyter_dashboards": { 568 | "version": 1, 569 | "views": { 570 | "grid_default": {}, 571 | "report_default": { 572 | "hidden": true 573 | } 574 | } 575 | } 576 | } 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "duplicates_title = 'Duplicates'\n", 581 | "duplicates_id = 'Id'\n", 582 | "duplicates_answerid = 'AnswerId'\n", 583 | "duplicates_path = directory + '/data_folder/dupes_test.tsv'\n", 584 | "duplicates = read_questions(duplicates_path, duplicates_id, duplicates_answerid)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "extensions": { 591 | "jupyter_dashboards": { 592 | "version": 1, 593 | "views": { 594 | "grid_default": {}, 595 | "report_default": { 596 | "hidden": false 597 | } 598 | } 599 | } 600 | } 601 | }, 602 | "source": [ 603 | "## Explore original questions matched up with duplicate questions\n", 604 | "\n", 605 | "Define other variables and settings used in creating the interface." 
606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": { 612 | "extensions": { 613 | "jupyter_dashboards": { 614 | "version": 1, 615 | "views": { 616 | "grid_default": {}, 617 | "report_default": { 618 | "hidden": true 619 | } 620 | } 621 | } 622 | } 623 | }, 624 | "outputs": [], 625 | "source": [ 626 | "questions_display = 15\n", 627 | "questions_button_color = True\n", 628 | "questions_button_score = True\n", 629 | "score_label = 'Score'\n", 630 | "score_scale = 100" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": { 636 | "extensions": { 637 | "jupyter_dashboards": { 638 | "version": 1, 639 | "views": { 640 | "grid_default": {}, 641 | "report_default": { 642 | "hidden": true 643 | } 644 | } 645 | } 646 | } 647 | }, 648 | "source": [ 649 | "This builds the exploration widget as a box containing duplicates and question tabs, each in turn containing boxes \n", 650 | "that have for each ID-text pair a button and a text area." 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "extensions": { 658 | "jupyter_dashboards": { 659 | "version": 1, 660 | "views": { 661 | "grid_default": {}, 662 | "report_default": { 663 | "hidden": false 664 | } 665 | } 666 | } 667 | } 668 | }, 669 | "outputs": [], 670 | "source": [ 671 | "duplicates_map, questions_map, duplicates_questions = duplicates_questions_widget(duplicates, questions)\n", 672 | "duplicates_questions" 673 | ] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | "To tear down the cluster and related resources go to the [last notebook](08_TearDown.ipynb)." 
680 | ] 681 | } 682 | ], 683 | "metadata": { 684 | "extensions": { 685 | "jupyter_dashboards": { 686 | "activeView": "report_default", 687 | "version": 1, 688 | "views": { 689 | "grid_default": { 690 | "name": "grid", 691 | "type": "grid" 692 | }, 693 | "report_default": { 694 | "name": "report", 695 | "type": "report" 696 | } 697 | } 698 | } 699 | }, 700 | "kernelspec": { 701 | "display_name": "az-ml-realtime-score", 702 | "language": "python", 703 | "name": "az-ml-realtime-score" 704 | }, 705 | "language_info": { 706 | "codemirror_mode": { 707 | "name": "ipython", 708 | "version": 3 709 | }, 710 | "file_extension": ".py", 711 | "mimetype": "text/x-python", 712 | "name": "python", 713 | "nbconvert_exporter": "python", 714 | "pygments_lexer": "ipython3", 715 | "version": "3.6.2" 716 | }, 717 | "pycharm": { 718 | "stem_cell": { 719 | "cell_type": "raw", 720 | "source": [], 721 | "metadata": { 722 | "collapsed": false 723 | } 724 | } 725 | } 726 | }, 727 | "nbformat": 4, 728 | "nbformat_minor": 2 729 | } -------------------------------------------------------------------------------- /notebooks/08_TearDown.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Tear it all down\n", 17 | "Use this notebook to clean up the web service, image, model and the AKS cluster created by the tutorial." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from azure_utils.configuration.project_configuration import ProjectConfiguration\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "project_configuration = ProjectConfiguration(\"project.yml\")\n", 36 | "resource_group = project_configuration.get_settings('resource_group')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "pycharm": { 44 | "name": "#%%\n" 45 | } 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "!az group delete --yes --name $resource_group" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "source": [ 55 | "You have completed the sample." 56 | ], 57 | "metadata": { 58 | "collapsed": false, 59 | "pycharm": { 60 | "name": "#%% md\n" 61 | } 62 | } 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "az-ml-realtime-score", 68 | "language": "python", 69 | "name": "az-ml-realtime-score" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.6.2" 82 | }, 83 | "pycharm": { 84 | "stem_cell": { 85 | "cell_type": "raw", 86 | "source": [], 87 | "metadata": { 88 | "collapsed": false 89 | } 90 | } 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } -------------------------------------------------------------------------------- /notebooks/Makefile: -------------------------------------------------------------------------------- 1 | .ONESHELL: 2 | SHELL=/bin/bash 3 | 4 | define PROJECT_HELP_MSG 5 | Makefile for testing notebooks 6 | Make sure you have edited the dev_env_template files and renamed it to 
.dev_env 7 | All the variables loaded in this makefile must come from the .dev_env file 8 | 9 | Usage: 10 | make test run all notebooks 11 | make clean delete env and remove files 12 | endef 13 | export PROJECT_HELP_MSG 14 | env_location=.dev_env 15 | PWD:=$(shell pwd) 16 | include ${env_location} 17 | 18 | 19 | help: 20 | echo "$$PROJECT_HELP_MSG" | less 21 | 22 | 23 | test: setup test-notebook1 test-notebook2 test-notebook3 test-notebook4 test-notebook5 test-notebook6 test-notebook7 \ 24 | test-notebook8 test-notebook-iot1 test-notebook9 test-notebook-iot2 25 | @echo All Notebooks Passed 26 | 27 | setup: 28 | conda env create -f environment.yml 29 | ifndef TENANT_ID 30 | @echo starting interactive login 31 | az login -o table 32 | az account set --subscription ${SUBSCRIPTION_ID} 33 | else 34 | @echo using service principal login 35 | az login -t ${TENANT_ID} --service-principal -u ${SP_USERNAME} --password ${SP_PASSWORD} 36 | endif 37 | 38 | 39 | test-notebook1: 40 | source activate MLAKSDeployAML 41 | @echo Testing 00_AMLConfiguration.ipynb 42 | papermill 00_AMLConfiguration.ipynb test.ipynb \ 43 | --log-output \ 44 | --no-progress-bar \ 45 | -k python3 \ 46 | -p subscription_id ${SUBSCRIPTION_ID} \ 47 | -p resource_group ${RESOURCE_GROUP} \ 48 | -p workspace_name ${WORKSPACE_NAME} \ 49 | -p workspace_region ${WORKSPACE_REGION} \ 50 | -p image_name ${IMAGE_NAME} \ 51 | 52 | test-notebook2: 53 | source activate MLAKSDeployAML 54 | @echo Testing 01_DataPrep.ipynb 55 | papermill 01_DataPrep.ipynb test.ipynb \ 56 | --log-output \ 57 | --no-progress-bar \ 58 | -k python3 59 | 60 | test-notebook3: 61 | source activate MLAKSDeployAML 62 | @echo Testing 02_TrainOnLocal.ipynb 63 | papermill 02_TrainOnLocal.ipynb test.ipynb \ 64 | --log-output \ 65 | --no-progress-bar \ 66 | -k python3 67 | 68 | test-notebook4: 69 | source activate MLAKSDeployAML 70 | @echo Testing 03_DevelopScoringScript.ipynb 71 | papermill 03_DevelopScoringScript.ipynb test.ipynb \ 72 | --log-output \ 
73 | --no-progress-bar \ 74 | -k python3 75 | sleep 1m 76 | 77 | test-notebook5: 78 | source activate MLAKSDeployAML 79 | @echo Testing 04_CreateImage.ipynb 80 | papermill 04_CreateImage.ipynb test.ipynb \ 81 | --log-output \ 82 | --no-progress-bar \ 83 | -k python3 84 | sleep 30 85 | 86 | test-notebook6: 87 | source activate MLAKSDeployAML 88 | @echo Testing 05_DeployOnAKS.ipynb 89 | papermill aks/05_DeployOnAKS.ipynb test.ipynb \ 90 | --log-output \ 91 | --no-progress-bar \ 92 | -k python3 \ 93 | -p aks_name ${AKS_NAME} \ 94 | -p aks_location ${WORKSPACE_REGION} \ 95 | -p aks_service_name ${AKS_SERVICE_NAME} 96 | 97 | test-notebook7: 98 | source activate MLAKSDeployAML 99 | @echo Testing 06_SpeedTestWebApp.ipynb 100 | papermill aks/06_SpeedTestWebApp.ipynb test.ipynb \ 101 | --log-output \ 102 | --no-progress-bar \ 103 | -k python3 104 | 105 | test-notebook8: 106 | source activate MLAKSDeployAML 107 | @echo Testing 07_RealTimeScoring.ipynb 108 | papermill aks/07_RealTimeScoring.ipynb test.ipynb \ 109 | --log-output \ 110 | --no-progress-bar \ 111 | -k python3 112 | 113 | 114 | test-notebook-iot1: 115 | source activate MLAKSDeployAML 116 | @echo Testing 05_DeployOnIOTedge.ipynb 117 | export PYTHONPATH=${PWD}:${PYTHONPATH} 118 | cd iotedge 119 | mkdir ./data_folder 120 | cp ../data_folder/dupes_test.tsv ./data_folder 121 | papermill 05_DeployOnIOTedge.ipynb test.ipynb \ 122 | --log-output \ 123 | --no-progress-bar \ 124 | -k python3 \ 125 | -p iot_hub_name fstlstnameiothub \ 126 | -p device_id mydevice \ 127 | -p module_name mymodule 128 | 129 | test-notebook9: 130 | source activate MLAKSDeployAML 131 | @echo Testing 08_TearDown.ipynb 132 | papermill aks/08_TearDown.ipynb test.ipynb \ 133 | --log-output \ 134 | --no-progress-bar \ 135 | -k python3 136 | 137 | test-notebook-iot2: 138 | source activate MLAKSDeployAML 139 | @echo Testing 06_TearDown.ipynb 140 | export PYTHONPATH=${PWD}:${PYTHONPATH} 141 | papermill iotedge/06_TearDown.ipynb test.ipynb \ 142 | 
--log-output \ 143 | --no-progress-bar \ 144 | -k python3 145 | 146 | 147 | test-cookiecutter-aks: 148 | cookiecutter --no-input https://github.com/Microsoft/MLAKSDeployAML.git --checkout yzhang \ 149 | subscription_id="${SUBSCRIPTION_ID}" \ 150 | workspace_region=${WORKSPACE_REGION} \ 151 | deployment_type="aks" 152 | 153 | test-cookiecutter-iot: 154 | cookiecutter --no-input https://github.com/Microsoft/MLAKSDeployAML.git --checkout yzhang \ 155 | subscription_id=${SUBSCRIPTION_ID} \ 156 | workspace_region=${WORKSPACE_REGION} \ 157 | deployment_type="iotedge" 158 | 159 | remove-notebook: 160 | rm -f test.ipynb 161 | 162 | clean: remove-notebook 163 | conda remove --name MLAKSDeployAML -y --all 164 | rm -rf aml_config 165 | rm -rf __pycache__ 166 | rm -rf .ipynb_checkpoints 167 | rm -rf data_folder 168 | rm -rf azureml-models 169 | rm -rf score.py lgbmenv.yml model.pkl 170 | rm -rf iotedge/deployment.json iotedge/deviceconfig.sh 171 | rm -rf iotedge/data_folder 172 | 173 | notebook: 174 | source activate MLAKSDeployAML 175 | jupyter notebook --port 9999 --ip 0.0.0.0 --no-browser 176 | 177 | install-jupytext: 178 | source activate MLAKSDeployAML 179 | conda install -c conda-forge jupytext 180 | 181 | convert-to-py: 182 | jupytext --set-formats ipynb,py_scripts//py --sync *.ipynb 183 | 184 | sync: 185 | jupytext --sync *.ipynb 186 | 187 | convert-to-ipynb: 188 | jupytext --set-formats ipynb *.ipynb 189 | 190 | remove-py: 191 | rm -r py_scripts 192 | 193 | .PHONY: help test setup clean remove-notebook test-notebook1 test-notebook2 test-notebook3 test-notebook4 \ 194 | test-notebook5 test-notebook6 test-notebook7 test-notebook8 test-notebook-iot1 test-notebook9 test-notebook-iot2 195 | -------------------------------------------------------------------------------- /notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | az-ml-realtime-score - __init__.py 3 | 4 | Copyright (c) Microsoft Corporation. 
All rights reserved. 5 | Licensed under the MIT License. 6 | """ 7 | import os 8 | 9 | directory = os.path.dirname(os.path.realpath(__file__)) 10 | -------------------------------------------------------------------------------- /notebooks/dev_env_template: -------------------------------------------------------------------------------- 1 | # Fill in the fields below and rename to .dev_env 2 | # TENANT_ID, SP_USERNAME and SP_PASSWORD are optional. If not supplied Azure cli will default to interactive login 3 | TENANT_ID= 4 | SP_USERNAME= 5 | SP_PASSWORD= 6 | SUBSCRIPTION_ID= 7 | RESOURCE_GROUP="deployrg" 8 | WORKSPACE_NAME="workspace" 9 | WORKSPACE_REGION="eastus" 10 | IMAGE_NAME="deployimg" 11 | AKS_NAME="deployaks" 12 | AKS_SERVICE_NAME="deployservice" -------------------------------------------------------------------------------- /project_sample.yml: -------------------------------------------------------------------------------- 1 | project_name: AI Default Project 2 | settings: 3 | - subscription_id: 4 | - description: Azure Subscription Id 5 | - value: <> 6 | - resource_group: 7 | - description: Azure Resource Group Name 8 | - value: <> 9 | - workspace_name: 10 | - description: Azure ML Workspace Name 11 | - value: <> 12 | - workspace_region: 13 | - description: Azure ML Workspace Region 14 | - value: <> 15 | - image_name: 16 | - description: Docker Container Image Name 17 | - value: <> 18 | - aks_service_name: 19 | - description: AKS Service Name 20 | - value: <> 21 | - aks_name: 22 | - description: AKS Cluster Name 23 | - value: <> 24 | - aks_location: 25 | - description: AKS Azure Region 26 | - value: <> -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | junit_family=xunit1 -------------------------------------------------------------------------------- /sample_workspace_conf.yml: 
-------------------------------------------------------------------------------- 1 | subscription_id: "<>" 2 | resource_group: "<>" 3 | workspace_name: "<>" 4 | workspace_region: "<>" 5 | 6 | image_name: "<>" 7 | aks_service_name: "<>" 8 | aks_name: "<>" 9 | aks_location: "<>" 10 | 11 | storage_conn_string: "<>" 12 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ai-architecture-template - __init__.py 3 | 4 | Copyright (c) Microsoft Corporation. All rights reserved. 5 | Licensed under the MIT License. 6 | """ -------------------------------------------------------------------------------- /tests/test_notebooks.py: -------------------------------------------------------------------------------- 1 | """ 2 | ai-architecture-template - test_notebooks.py 3 | 4 | Copyright (c) Microsoft Corporation. All rights reserved. 5 | Licensed under the MIT License. 6 | """ 7 | import pytest 8 | 9 | from azure_utils.dev_ops.testing_utilities import run_notebook 10 | 11 | from notebooks import directory 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "notebook", 16 | [ 17 | directory + "/00_AMLConfiguration.ipynb", 18 | directory + "/01_DataPrep.ipynb", 19 | directory + "/02_TrainOnLocal.ipynb", 20 | directory + "/03_DevelopScoringScript.ipynb", 21 | directory + "/04_CreateImage.ipynb", 22 | directory + "/05_DeployOnAKS.ipynb" 23 | ] 24 | ) 25 | def test_notebook(notebook, add_nunit_attachment): 26 | """ Test Notebooks and Save Output to Text Files""" 27 | run_notebook(notebook, add_nunit_attachment, kernel_name="az-ml-realtime-score", root=directory) 28 | --------------------------------------------------------------------------------