├── .ci ├── azure-pipelines-v2.yml ├── azure-pipelines.yml ├── steps │ ├── ai-architecture-template.yml │ └── papermill.yml └── vars │ └── deployment_params.yml ├── .gitignore ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── environment.yml ├── notebooks ├── 00_AMLConfiguration.ipynb ├── 01_DataPrep.ipynb ├── 02_TrainOnLocal.ipynb ├── 03_DevelopScoringScript.ipynb ├── 04_CreateImage.ipynb ├── 05_DeployOnAKS.ipynb ├── 06_SpeedTestWebApp.ipynb ├── 07_RealTimeScoring.ipynb ├── 08_TearDown.ipynb ├── Makefile ├── __init__.py └── dev_env_template ├── project_sample.yml ├── pytest.ini ├── sample_workspace_conf.yml └── tests ├── __init__.py └── test_notebooks.py /.ci/azure-pipelines-v2.yml: -------------------------------------------------------------------------------- 1 | # ML Realtime Scoring Pipeline 2 | # 3 | # A Github Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub" 4 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml 5 | # 6 | # An Agent_Name Variable must be created in the Azure DevOps UI. 7 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables 8 | # 9 | # This must point to an Agent Pool, with a Self-Hosted Linux VM with Docker installed. 
10 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops 11 | 12 | resources: 13 | repositories: 14 | - repository: aitemplates 15 | type: github 16 | name: microsoft/AI 17 | endpoint: AIArchitecturesAndPractices-GitHub 18 | 19 | trigger: 20 | batch: true 21 | branches: 22 | include: 23 | - master 24 | 25 | pr: 26 | autoCancel: true 27 | branches: 28 | include: 29 | - master 30 | 31 | variables: 32 | - template: ./vars/deployment_params.yml 33 | 34 | stages: 35 | - template: .ci/stages/deploy_notebooks_stages_v5.yml@aitemplates 36 | parameters: 37 | Agent: $(Agent_Name) 38 | jobDisplayName: az-ml-realtime-score 39 | TridentWorkloadTypeShort: ${{ variables.TridentWorkloadTypeShort }} 40 | DeployLocation: ${{ variables.DeployLocation }} 41 | ProjectLocation: ${{ variables.ProjectLocation }} 42 | conda: ${{ variables.conda }} 43 | post_cleanup: false 44 | 45 | flighting_release: false 46 | flighting_preview: false 47 | flighting_master: false 48 | -------------------------------------------------------------------------------- /.ci/azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # AI Architecture Template TODO: update title 2 | # 3 | # A Github Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub" 4 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml 5 | # 6 | # An Agent_Name Variable must be created in the Azure DevOps UI. 7 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables 8 | # 9 | # This must point to an Agent Pool, with a Self-Hosted Linux VM with Docker installed. 
10 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops 11 | 12 | resources: 13 | repositories: 14 | - repository: aitemplates 15 | type: github 16 | name: microsoft/AI 17 | endpoint: AIArchitecturesAndPractices-GitHub 18 | 19 | schedules: 20 | - cron: "*/10 * * * *" 21 | displayName: Build every 10 minutes 22 | always: true 23 | branches: 24 | include: 25 | - master 26 | # MLAKSDeploy Pipeline 27 | 28 | 29 | trigger: 30 | batch: true 31 | branches: 32 | include: 33 | - master 34 | 35 | pr: 36 | autoCancel: true 37 | branches: 38 | include: 39 | - master 40 | 41 | stages: 42 | - template: .ci/stages/deploy_notebooks_stages_v2.yml@aitemplates 43 | parameters: 44 | Agent: $(Agent_Name) 45 | jobDisplayName: ai-architecture-template #TODO: Update with project name 46 | DefaultWorkingDirectory: $(System.DefaultWorkingDirectory) 47 | workload_vars: ../vars/ai-architecture-template.yml #TODO: Update with project name 48 | flighting_release: false 49 | flighting_preview: false 50 | flighting_master: false 51 | -------------------------------------------------------------------------------- /.ci/steps/ai-architecture-template.yml: -------------------------------------------------------------------------------- 1 | # AI Architecture Template TODO: update title 2 | # 3 | # A Github Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub" 4 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml 5 | # 6 | # An Agent_Name Variable must be created in the Azure DevOps UI. 7 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables 8 | # 9 | # This must point to an Agent Pool, with a Self-Hosted Linux VM with a Docker. 
10 | # https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops 11 | 12 | parameters: 13 | azureSubscription: '' 14 | azure_subscription: '' 15 | location: '' 16 | azureresourcegroup: '' 17 | workspacename: '' 18 | azureregion: westus2 19 | aksimagename: '' 20 | aks_name: '' 21 | aks_service_name: myimage 22 | conda: '' 23 | doCleanup: true 24 | python_path: '' 25 | flighting_release: false 26 | flighting_preview: false 27 | flighting_master: false 28 | 29 | steps: 30 | - template: config_conda.yml 31 | parameters: 32 | conda_location: . 33 | azureSubscription: ${{parameters.azureSubscription}} 34 | conda: ai-architecture-template 35 | flighting_release: ${{parameters.flighting_release}} 36 | flighting_preview: ${{parameters.flighting_preview}} 37 | flighting_master: ${{parameters.flighting_master}} 38 | 39 | - template: azpapermill.yml 40 | parameters: 41 | notebook: 00_AMLConfiguration.ipynb 42 | location: ${{parameters.location}} 43 | azureSubscription: ${{parameters.azureSubscription}} 44 | conda: ai-architecture-template 45 | azure_subscription: ${{parameters.azure_subscription}} 46 | azureresourcegroup: ${{parameters.azureresourcegroup}} 47 | workspacename: "aiarchtemplate" 48 | azureregion: ${{parameters.azureregion}} 49 | aksimagename: ${{parameters.aksimagename}} 50 | 51 | # Insert more notebook steps here 52 | 53 | - template: pytest_steps.yml 54 | parameters: 55 | location: ${{parameters.location}} 56 | azureSubscription: ${{parameters.azureSubscription}} 57 | conda: ai-architecture-template 58 | 59 | - template: cleanuptask.yml 60 | parameters: 61 | azureSubscription: ${{parameters.azureSubscription}} 62 | conda: ${{parameters.conda}} 63 | azureresourcegroup: ${{parameters.azureresourcegroup}} 64 | doCleanup: ${{parameters.doCleanup}} -------------------------------------------------------------------------------- /.ci/steps/papermill.yml: -------------------------------------------------------------------------------- 1 
| 2 | 3 | parameters: 4 | notebook: 01_DataPrep.ipynb # defaults for any parameters that aren't specified 5 | location: "{{cookiecutter.project_name}}" 6 | 7 | 8 | steps: 9 | - bash: | 10 | source /usr/share/miniconda/etc/profile.d/conda.sh 11 | conda activate MLAKSDeployAML 12 | export PYTHONPATH=$(pwd)/{{cookiecutter.project_name}}:${PYTHONPATH} 13 | cd ${{parameters.location}} 14 | echo Execute ${{parameters.notebook}} 15 | papermill ${{parameters.notebook}} output.ipynb \ 16 | --log-output \ 17 | --no-progress-bar \ 18 | -k python3 19 | sleep 30 20 | displayName: '${{parameters.notebook}}' -------------------------------------------------------------------------------- /.ci/vars/deployment_params.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | TridentWorkloadTypeShort: azmlrts 3 | DeployLocation: westus 4 | ProjectLocation: "." 5 | conda: az-ml-realtime-score 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Project Configuration Files 3 | workspace_conf.yml 4 | *.output_ipynb 5 | .azureml 6 | pylint-results.xml 7 | project.yml 8 | .idea 9 | score.py 10 | 11 | #AML 12 | aml_config/ 13 | scripts/aml_config/ 14 | assets/ 15 | scripts/assets/ 16 | .amlignore 17 | scripts/.amlignore 18 | scripts/__pycache__/ 19 | 20 | # Environments 21 | .env 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | pip-wheel-metadata/ 46 | share/python-wheels/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | MANIFEST 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a 
template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .nox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *.cover 72 | *.py,cover 73 | .hypothesis/ 74 | .pytest_cache/ 75 | 76 | # Translations 77 | *.mo 78 | *.pot 79 | 80 | # Django stuff: 81 | *.log 82 | local_settings.py 83 | db.sqlite3 84 | db.sqlite3-journal 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | .python-version 108 | 109 | # pipenv 110 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 111 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 112 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 113 | # install all needed dependencies. 114 | #Pipfile.lock 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | /project.yml 153 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python module names) to load, 30 | # usually to register additional checkers. 
31 | load-plugins=pylint_junit 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 
63 | disable=global-variable-undefined, 64 | global-statement, 65 | too-many-arguments, 66 | too-many-function-args, 67 | pointless-statement, 68 | missing-module-docstring, 69 | trailing-whitespace, 70 | fixme, 71 | print-statement, 72 | parameter-unpacking, 73 | unpacking-in-except, 74 | old-raise-syntax, 75 | backtick, 76 | long-suffix, 77 | old-ne-operator, 78 | old-octal-literal, 79 | import-star-module-level, 80 | non-ascii-bytes-literal, 81 | raw-checker-failed, 82 | bad-inline-option, 83 | locally-disabled, 84 | file-ignored, 85 | suppressed-message, 86 | useless-suppression, 87 | deprecated-pragma, 88 | use-symbolic-message-instead, 89 | apply-builtin, 90 | basestring-builtin, 91 | buffer-builtin, 92 | cmp-builtin, 93 | coerce-builtin, 94 | execfile-builtin, 95 | file-builtin, 96 | long-builtin, 97 | raw_input-builtin, 98 | reduce-builtin, 99 | standarderror-builtin, 100 | unicode-builtin, 101 | xrange-builtin, 102 | coerce-method, 103 | delslice-method, 104 | getslice-method, 105 | setslice-method, 106 | no-absolute-import, 107 | old-division, 108 | dict-iter-method, 109 | dict-view-method, 110 | next-method-called, 111 | metaclass-assignment, 112 | indexing-exception, 113 | raising-string, 114 | reload-builtin, 115 | oct-method, 116 | hex-method, 117 | nonzero-method, 118 | cmp-method, 119 | input-builtin, 120 | round-builtin, 121 | intern-builtin, 122 | unichr-builtin, 123 | map-builtin-not-iterating, 124 | zip-builtin-not-iterating, 125 | range-builtin-not-iterating, 126 | filter-builtin-not-iterating, 127 | using-cmp-argument, 128 | eq-without-hash, 129 | div-method, 130 | idiv-method, 131 | rdiv-method, 132 | exception-message-attribute, 133 | invalid-str-codec, 134 | sys-max-int, 135 | bad-python3-import, 136 | deprecated-string-function, 137 | deprecated-str-translate-call, 138 | deprecated-itertools-function, 139 | deprecated-types-field, 140 | next-method-defined, 141 | dict-items-not-iterating, 142 | dict-keys-not-iterating, 143 | 
dict-values-not-iterating, 144 | deprecated-operator-function, 145 | deprecated-urllib-function, 146 | xreadlines-attribute, 147 | deprecated-sys-function, 148 | exception-escape, 149 | comprehension-escape 150 | 151 | # Enable the message, report, category or checker with the given id(s). You can 152 | # either give multiple identifier separated by comma (,) or put this option 153 | # multiple time (only on the command line, not in the configuration file where 154 | # it should appear only once). See also the "--disable" option for examples. 155 | enable=c-extension-no-member 156 | 157 | 158 | [REPORTS] 159 | 160 | # Python expression which should return a score less than or equal to 10. You 161 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 162 | # which contain the number of messages in each category, as well as 'statement' 163 | # which is the total number of statements analyzed. This score is used by the 164 | # global evaluation report (RP0004). 165 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 166 | 167 | # Template used to display messages. This is a python new-style format string 168 | # used to format the message information. See doc for all details. 169 | #msg-template= 170 | 171 | # Set the output format. Available formats are text, parseable, colorized, json 172 | # and msvs (visual studio). You can also give a reporter class, e.g. 173 | # mypackage.mymodule.MyReporterClass. 174 | output-format=text 175 | 176 | # Tells whether to display a full report or only the messages. 177 | reports=no 178 | 179 | # Activate the evaluation score. 180 | score=yes 181 | 182 | 183 | [REFACTORING] 184 | 185 | # Maximum number of nested blocks for function / method body 186 | max-nested-blocks=5 187 | 188 | # Complete name of functions that never returns. 
When checking for 189 | # inconsistent-return-statements if a never returning function is called then 190 | # it will be considered as an explicit return statement and no message will be 191 | # printed. 192 | never-returning-functions=sys.exit 193 | 194 | 195 | [BASIC] 196 | 197 | # Naming style matching correct argument names. 198 | argument-naming-style=snake_case 199 | 200 | # Regular expression matching correct argument names. Overrides argument- 201 | # naming-style. 202 | #argument-rgx= 203 | 204 | # Naming style matching correct attribute names. 205 | attr-naming-style=snake_case 206 | 207 | # Regular expression matching correct attribute names. Overrides attr-naming- 208 | # style. 209 | #attr-rgx= 210 | 211 | # Bad variable names which should always be refused, separated by a comma. 212 | bad-names=foo, 213 | bar, 214 | baz, 215 | toto, 216 | tutu, 217 | tata 218 | 219 | # Naming style matching correct class attribute names. 220 | class-attribute-naming-style=any 221 | 222 | # Regular expression matching correct class attribute names. Overrides class- 223 | # attribute-naming-style. 224 | #class-attribute-rgx= 225 | 226 | # Naming style matching correct class names. 227 | class-naming-style=PascalCase 228 | 229 | # Regular expression matching correct class names. Overrides class-naming- 230 | # style. 231 | #class-rgx= 232 | 233 | # Naming style matching correct constant names. 234 | const-naming-style=snake_case 235 | 236 | # Regular expression matching correct constant names. Overrides const-naming- 237 | # style. 238 | #const-rgx= 239 | 240 | # Minimum line length for functions/classes that require docstrings, shorter 241 | # ones are exempt. 242 | docstring-min-length=-1 243 | 244 | # Naming style matching correct function names. 245 | function-naming-style=snake_case 246 | 247 | # Regular expression matching correct function names. Overrides function- 248 | # naming-style. 
249 | #function-rgx= 250 | 251 | # Good variable names which should always be accepted, separated by a comma. 252 | good-names=i, 253 | j, 254 | k, 255 | ex, 256 | Run, 257 | _ 258 | 259 | # Include a hint for the correct naming format with invalid-name. 260 | include-naming-hint=no 261 | 262 | # Naming style matching correct inline iteration names. 263 | inlinevar-naming-style=any 264 | 265 | # Regular expression matching correct inline iteration names. Overrides 266 | # inlinevar-naming-style. 267 | #inlinevar-rgx= 268 | 269 | # Naming style matching correct method names. 270 | method-naming-style=snake_case 271 | 272 | # Regular expression matching correct method names. Overrides method-naming- 273 | # style. 274 | #method-rgx= 275 | 276 | # Naming style matching correct module names. 277 | module-naming-style=any 278 | 279 | # Regular expression matching correct module names. Overrides module-naming- 280 | # style. 281 | #module-rgx= 282 | 283 | # Colon-delimited sets of names that determine each other's naming style when 284 | # the name regexes allow several styles. 285 | name-group= 286 | 287 | # Regular expression which should only match function or class names that do 288 | # not require a docstring. 289 | no-docstring-rgx=^_ 290 | 291 | # List of decorators that produce properties, such as abc.abstractproperty. Add 292 | # to this list to register other decorators that produce valid properties. 293 | # These decorators are taken in consideration only for invalid-name. 294 | property-classes=abc.abstractproperty 295 | 296 | # Naming style matching correct variable names. 297 | variable-naming-style=snake_case 298 | 299 | # Regular expression matching correct variable names. Overrides variable- 300 | # naming-style. 301 | #variable-rgx= 302 | 303 | 304 | [FORMAT] 305 | 306 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
307 | expected-line-ending-format= 308 | 309 | # Regexp for a line that is allowed to be longer than the limit. 310 | ignore-long-lines=^\s*(# )??$|^\s*get_ipython\S+ 311 | 312 | # Number of spaces of indent required inside a hanging or continued line. 313 | indent-after-paren=4 314 | 315 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 316 | # tab). 317 | indent-string=' ' 318 | 319 | # Maximum number of characters on a single line. 320 | max-line-length=120 321 | 322 | # Maximum number of lines in a module. 323 | max-module-lines=1000 324 | 325 | # List of optional constructs for which whitespace checking is disabled. `dict- 326 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 327 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 328 | # `empty-line` allows space-only lines. 329 | no-space-check=trailing-comma, 330 | dict-separator 331 | 332 | # Allow the body of a class to be on the same line as the declaration if body 333 | # contains single statement. 334 | single-line-class-stmt=no 335 | 336 | # Allow the body of an if to be on the same line as the test if there is no 337 | # else. 338 | single-line-if-stmt=no 339 | 340 | 341 | [LOGGING] 342 | 343 | # Format style used to check logging format string. `old` means using % 344 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. 345 | logging-format-style=old 346 | 347 | # Logging modules to check that the string format arguments are in logging 348 | # function parameter format. 349 | logging-modules=logging 350 | 351 | 352 | [MISCELLANEOUS] 353 | 354 | # List of note tags to take in consideration, separated by a comma. 355 | notes=FIXME, 356 | XXX, 357 | TODO 358 | 359 | 360 | [SIMILARITIES] 361 | 362 | # Ignore comments when computing similarities. 363 | ignore-comments=yes 364 | 365 | # Ignore docstrings when computing similarities. 
366 | ignore-docstrings=yes 367 | 368 | # Ignore imports when computing similarities. 369 | ignore-imports=no 370 | 371 | # Minimum lines number of a similarity. 372 | min-similarity-lines=4 373 | 374 | 375 | [SPELLING] 376 | 377 | # Limits count of emitted suggestions for spelling mistakes. 378 | max-spelling-suggestions=4 379 | 380 | # Spelling dictionary name. Available dictionaries: none. To make it work, 381 | # install the python-enchant package. 382 | spelling-dict= 383 | 384 | # List of comma separated words that should not be checked. 385 | spelling-ignore-words= 386 | 387 | # A path to a file that contains the private dictionary; one word per line. 388 | spelling-private-dict-file= 389 | 390 | # Tells whether to store unknown words to the private dictionary (see the 391 | # --spelling-private-dict-file option) instead of raising a message. 392 | spelling-store-unknown-words=no 393 | 394 | 395 | [STRING] 396 | 397 | # This flag controls whether the implicit-str-concat-in-sequence should 398 | # generate a warning on implicit string concatenation in sequences defined over 399 | # several lines. 400 | check-str-concat-over-line-jumps=no 401 | 402 | 403 | [TYPECHECK] 404 | 405 | # List of decorators that produce context managers, such as 406 | # contextlib.contextmanager. Add to this list to register other decorators that 407 | # produce valid context managers. 408 | contextmanager-decorators=contextlib.contextmanager 409 | 410 | # List of members which are set dynamically and missed by pylint inference 411 | # system, and so shouldn't trigger E1101 when accessed. Python regular 412 | # expressions are accepted. 413 | generated-members= 414 | 415 | # Tells whether missing members accessed in mixin class should be ignored. A 416 | # mixin class is detected if its name ends with "mixin" (case insensitive). 417 | ignore-mixin-members=yes 418 | 419 | # Tells whether to warn about missing members when the owner of the attribute 420 | # is inferred to be None. 
421 | ignore-none=yes 422 | 423 | # This flag controls whether pylint should warn about no-member and similar 424 | # checks whenever an opaque object is returned when inferring. The inference 425 | # can return multiple potential results while evaluating a Python object, but 426 | # some branches might not be evaluated, which results in partial inference. In 427 | # that case, it might be useful to still emit no-member and other checks for 428 | # the rest of the inferred objects. 429 | ignore-on-opaque-inference=yes 430 | 431 | # List of class names for which member attributes should not be checked (useful 432 | # for classes with dynamically set attributes). This supports the use of 433 | # qualified names. 434 | ignored-classes=optparse.Values,thread._local,_thread._local 435 | 436 | # List of module names for which member attributes should not be checked 437 | # (useful for modules/projects where namespaces are manipulated during runtime 438 | # and thus existing member attributes cannot be deduced by static analysis). It 439 | # supports qualified module names, as well as Unix pattern matching. 440 | ignored-modules= 441 | 442 | # Show a hint with possible names when a member name was not found. The aspect 443 | # of finding the hint is based on edit distance. 444 | missing-member-hint=yes 445 | 446 | # The minimum edit distance a name should have in order to be considered a 447 | # similar match for a missing member name. 448 | missing-member-hint-distance=1 449 | 450 | # The total number of similar names that should be taken in consideration when 451 | # showing a hint for a missing member. 452 | missing-member-max-choices=1 453 | 454 | # List of decorators that change the signature of a decorated function. 455 | signature-mutators= 456 | 457 | 458 | [VARIABLES] 459 | 460 | # List of additional names supposed to be defined in builtins. Remember that 461 | # you should avoid defining new builtins when possible. 
462 | additional-builtins=get_ipython 463 | 464 | # Tells whether unused global variables should be treated as a violation. 465 | allow-global-unused-variables=yes 466 | 467 | # List of strings which can identify a callback function by name. A callback 468 | # name must start or end with one of those strings. 469 | callbacks=cb_, 470 | _cb 471 | 472 | # A regular expression matching the name of dummy variables (i.e. expected to 473 | # not be used). 474 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 475 | 476 | # Argument names that match this expression will be ignored. Default to name 477 | # with leading underscore. 478 | ignored-argument-names=_.*|^ignored_|^unused_ 479 | 480 | # Tells whether we should check for unused import in __init__ files. 481 | init-import=no 482 | 483 | # List of qualified module names which can have objects that can redefine 484 | # builtins. 485 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 486 | 487 | 488 | [CLASSES] 489 | 490 | # List of method names used to declare (i.e. assign) instance attributes. 491 | defining-attr-methods=__init__, 492 | __new__, 493 | setUp, 494 | __post_init__ 495 | 496 | # List of member names, which should be excluded from the protected access 497 | # warning. 498 | exclude-protected=_asdict, 499 | _fields, 500 | _replace, 501 | _source, 502 | _make 503 | 504 | # List of valid names for the first argument in a class method. 505 | valid-classmethod-first-arg=cls 506 | 507 | # List of valid names for the first argument in a metaclass class method. 508 | valid-metaclass-classmethod-first-arg=cls 509 | 510 | 511 | [DESIGN] 512 | 513 | # Maximum number of arguments for function / method. 514 | max-args=5 515 | 516 | # Maximum number of attributes for a class (see R0902). 517 | max-attributes=7 518 | 519 | # Maximum number of boolean expressions in an if statement (see R0916). 
520 | max-bool-expr=5 521 | 522 | # Maximum number of branch for function / method body. 523 | max-branches=12 524 | 525 | # Maximum number of locals for function / method body. 526 | max-locals=15 527 | 528 | # Maximum number of parents for a class (see R0901). 529 | max-parents=7 530 | 531 | # Maximum number of public methods for a class (see R0904). 532 | max-public-methods=20 533 | 534 | # Maximum number of return / yield for function / method body. 535 | max-returns=6 536 | 537 | # Maximum number of statements in function / method body. 538 | max-statements=50 539 | 540 | # Minimum number of public methods for a class (see R0903). 541 | min-public-methods=2 542 | 543 | 544 | [IMPORTS] 545 | 546 | # List of modules that can be imported at any level, not just the top level 547 | # one. 548 | allow-any-import-level= 549 | 550 | # Allow wildcard imports from modules that define __all__. 551 | allow-wildcard-with-all=no 552 | 553 | # Analyse import fallback blocks. This can be used to support both Python 2 and 554 | # 3 compatible code, which means that the block might have code that exists 555 | # only in one or another interpreter, leading to false positives when analysed. 556 | analyse-fallback-blocks=no 557 | 558 | # Deprecated modules which should not be used, separated by a comma. 559 | deprecated-modules=optparse,tkinter.tix 560 | 561 | # Create a graph of external dependencies in the given file (report RP0402 must 562 | # not be disabled). 563 | ext-import-graph= 564 | 565 | # Create a graph of every (i.e. internal and external) dependencies in the 566 | # given file (report RP0402 must not be disabled). 567 | import-graph= 568 | 569 | # Create a graph of internal dependencies in the given file (report RP0402 must 570 | # not be disabled). 571 | int-import-graph= 572 | 573 | # Force import order to recognize a module as part of the standard 574 | # compatibility libraries. 
575 | known-standard-library= 576 | 577 | # Force import order to recognize a module as part of a third party library. 578 | known-third-party=enchant 579 | 580 | # Couples of modules and preferred modules, separated by a comma. 581 | preferred-modules= 582 | 583 | 584 | [EXCEPTIONS] 585 | 586 | # Exceptions that will emit a warning when being caught. Defaults to 587 | # "BaseException, Exception". 588 | overgeneral-exceptions=BaseException, 589 | Exception 590 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_build/latest?definitionId=118&branchName=master) 2 | ### Authors: Fidan Boylu Uz, Yan Zhang, Mario Bourgoin 3 | ### Acknowledgements: Mathew Salvaris 4 | 5 | # Deploying Python models for real-time scoring using Azure Machine Learning 6 | 7 | In this repository there are a number of tutorials in Jupyter notebooks that have step-by-step instructions on (1) how to train a machine learning model using Python; (2) how to deploy a trained machine learning model through Azure Machine Learning (AzureML). The tutorials cover how to deploy models on the following deployment targets: 8 | 9 | ## Overview 10 | This scenario shows how to deploy a Frequently Asked Questions (FAQ) matching model as a web service to provide predictions for user questions. For this scenario, “Input Data” in the [architecture diagram](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/realtime-scoring-python) refers to text strings containing the user questions to match with a list of FAQs. The scenario is designed for the Scikit-Learn machine learning library for Python but can be generalized to any scenario that uses Python models to make real-time predictions. 
11 | 12 | ## Design 13 | 14 | The scenario uses a subset of Stack Overflow question data which includes original questions tagged as JavaScript, their duplicate questions, and their answers. It trains a Scikit-Learn pipeline to predict the match probability of a duplicate question with each of the original questions. These predictions are made in real time using a REST API endpoint. 15 | The application flow for this architecture is as follows: 16 | 1. The client sends a HTTP POST request with the encoded question data. 17 | 2. The webservice extracts the question from the request 18 | 3. The question is then sent to the Scikit-learn pipeline model for featurization and scoring. 19 | 4. The matching FAQ questions with their scores are then piped into a JSON object and returned to the client. 20 | 21 | An example app that consumes the results is included with the scenario. 22 | 23 | ## Prerequisites 24 | 1. Linux (Ubuntu). 25 | 1. [Anaconda Python](https://www.anaconda.com/download) 26 | 1. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed. 27 | 1. [Azure account](https://azure.microsoft.com). 28 | 29 | 30 | --- 31 | **NOTE** 32 | You will need to be able to run docker commands without sudo to run this tutorial. Use the following commands to do this. 33 | 34 | ```bash 35 | sudo usermod -aG docker $USER 36 | newgrp docker 37 | ``` 38 | --- 39 | 40 | The tutorial was developed on an [Azure Ubuntu 41 | DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro), 42 | which addresses the first three prerequisites. 43 | 44 | ## Setup 45 | 46 | To set up your environment to run these notebooks, please follow these steps. They setup the notebooks to use Azure seamlessly. 47 | 48 | 1. Create a _Linux_ _Ubuntu_ VM. 49 | 1. Log in to your VM. 
We recommend that you use a graphical client 50 | such as 51 | [X2Go](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#x2go) 52 | to access your VM. The remaining steps are to be done on the VM. 53 | 1. Open a terminal emulator. 54 | 1. Clone, fork, or download the zip file for this repository: 55 | ``` 56 | git clone https://github.com/Microsoft/az-ml-realtime-score.git 57 | ``` 58 | 1. Enter the local repository: 59 | ``` 60 | cd az-ml-realtime-score 61 | ``` 62 | 1. Copy `sample_workspace_conf.yml` to a new file, `workspace_conf.yml`, and fill in each field. This will keep secrets out of the source code, and this file will be ignored by git. 63 | 1. Create the Python az-ml-realtime-score virtual environment using the environment.yml: 64 | ``` 65 | conda env create -f environment.yml 66 | ``` 67 | 1. Activate the virtual environment: 68 | ``` 69 | source activate az-ml-realtime-score 70 | ``` 71 | The remaining steps should be done in this virtual environment. 72 | 1. Login to Azure: 73 | ``` 74 | az login 75 | ``` 76 | You can verify that you are logged in to your subscription by executing 77 | the command: 78 | ``` 79 | az account show -o table 80 | ``` 81 | 1. Start the Jupyter notebook server: 82 | ``` 83 | jupyter notebook 84 | ``` 85 | 86 | # Contributing 87 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 88 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 89 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 90 | 91 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 92 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 93 | provided by the bot. You will only need to do this once across all repositories using our CLA. 
94 | 95 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 96 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 97 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 98 | 99 | 100 | # Related projects 101 | 102 | [Microsoft AI Github](https://github.com/microsoft/ai) Find other Best Practice projects, and Azure AI Designed patterns in our central repository. 103 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: az-ml-realtime-score 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6.2 6 | - pip 7 | - jupyter 8 | - pytest 9 | - pytest-cov 10 | - pylint 11 | - pandas 12 | - pip: 13 | - papermill 14 | - azureml-core==1.0.85.2 15 | - pylint-junit 16 | - pytest-nunit 17 | - nbconvert 18 | - junit-xml 19 | - nbformat 20 | - Microsoft-AI-Azure-Utility-Samples 21 | - python-dotenv 22 | -------------------------------------------------------------------------------- /notebooks/00_AMLConfiguration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ai-architecture-template - 00_AMLConfiguration.ipynb\n", 8 | "TODO: Update with new repo name\n", 9 | "\n", 10 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 11 | "\n", 12 | "Licensed under the MIT License.\n", 13 | "\n", 14 | "# Installation and configuration\n", 15 | "This notebook configures the notebooks in this tutorial to connect to an Azure Machine Learning (AML) Workspace. \n", 16 | "You can use an existing workspace or create a new one.\n", 17 | "\n", 18 | "## Prerequisites\n", 19 | "\n", 20 | "If you have already completed the prerequisites and selected the correct Kernel for this notebook, the AML Python SDK \n", 21 | "is already installed. Let's load the imports and check the AML SDK version.\n", 22 | "\n", 23 | "## Set up your Azure Machine Learning workspace\n", 24 | "## Load Configurations from file\n", 25 | "\n", 26 | "Configurations are loaded by default from a file `project.yml`, to prevent accidental commits of Azure secrets into \n", 27 | "source control. This file name is included in the `.gitignore` to also prevent accidental commits. 
A template file \n", 28 | "is included that should be copied, and each parameter filled in.\n", 29 | "\n", 30 | "If the file is not present, a UI prompt will pop up to insert configurations, and save them to the file.\n", 31 | "\n", 32 | "## Create the workspace\n", 33 | "This cell will also create an AML workspace for you in a subscription, provided you have the correct permissions.\n", 34 | "\n", 35 | "This will fail when:\n", 36 | "1. You do not have permission to create a workspace in the resource group\n", 37 | "1. You do not have permission to create a resource group if it's non-existing.\n", 38 | "1. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this \n", 39 | "subscription\n", 40 | "\n", 41 | "If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to \n", 42 | "provision the required resources. If this cell succeeds, you're done configuring AML!\n", 43 | "\n", 44 | "After creation we will check the details of the workspace." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "pycharm": { 52 | "name": "#%%\n" 53 | } 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from azure_utils.machine_learning.utils import get_or_create_workspace_from_file\n", 58 | "\n", 59 | "ws = get_or_create_workspace_from_file()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "We can now move on to [Data Preparation](01_DataPrep.ipynb) notebook to train our model using Azure Machine \n", 67 | "Learning." 
68 | ] 69 | } 70 | ], 71 | "metadata": { 72 | "celltoolbar": "Tags", 73 | "kernelspec": { 74 | "display_name": "az-ml-realtime-score", 75 | "language": "python", 76 | "name": "az-ml-realtime-score" 77 | }, 78 | "pycharm": { 79 | "stem_cell": { 80 | "cell_type": "raw", 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "source": [] 85 | } 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /notebooks/01_DataPrep.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Data Preparation" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "In this notebook, we use a subset of [Stack Exchange network](https://archive.org/details/stackexchange) question data \n", 24 | "which includes original questions tagged as 'JavaScript', their duplicate questions and their answers. Here, we \n", 25 | "provide the steps to prepare the data to use in model development for training a model that will match a new \n", 26 | "question with an existing original question. 
" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import os\n", 36 | "\n", 37 | "import pandas as pd\n", 38 | "from azure_utils.utilities import read_csv_gz, clean_text, round_sample_strat, random_merge\n", 39 | "\n", 40 | "from notebooks import directory" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Below, we define some parameters that will be used in the data cleaning as well as train and test set preparation." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# The size of the test set\n", 57 | "test_size = 0.21\n", 58 | "# The minimum length of clean text\n", 59 | "min_text = 150\n", 60 | "# The minimum number of duplicates per question\n", 61 | "min_dupes = 12\n", 62 | "# The maximum number of duplicate matches\n", 63 | "match = 20\n", 64 | "# The output files path\n", 65 | "outputs_path = directory + \"/data_folder\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Data cleaning" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Next, we download the questions, duplicate questions and answers and load the datasets into pandas dataframes using \n", 80 | "the helper functions." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# URLs to original questions, duplicate questions, and answers.\n", 90 | "data_url = \"https://bostondata.blob.core.windows.net/stackoverflow/{}\"\n", 91 | "questions_url = data_url.format(\"orig-q.tsv.gz\")\n", 92 | "dupes_url = data_url.format(\"dup-q.tsv.gz\")\n", 93 | "answers_url = data_url.format(\"ans.tsv.gz\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Load datasets.\n", 103 | "questions = read_csv_gz(questions_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n", 104 | "dupes = read_csv_gz(dupes_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n", 105 | "answers = read_csv_gz(answers_url, names=('Id', 'Text0'))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Let's now check the dataframes. Notice that questions and duplicates have \"AnswerID\" column that would help match \n", 113 | "ith the index of answers dataframe." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/html": [ 124 | "
\n", 142 | " | AnswerId | \n", 143 | "Text0 | \n", 144 | "CreationDate | \n", 145 | "
---|---|---|---|
Id | \n", 148 | "\n", 149 | " | \n", 150 | " | \n", 151 | " |
220231 | \n", 156 | "220233 | \n", 157 | "Accessing the web page's HTTP Headers in JavaS... | \n", 158 | "2008-10-20 22:54:38.767 | \n", 159 | "
391979 | \n", 162 | "810461 | \n", 163 | "Get client IP using just JavaScript?. <p>I nee... | \n", 164 | "2008-12-24 18:22:30.780 | \n", 165 | "
109086 | \n", 168 | "109091 | \n", 169 | "Stop setInterval call in JavaScript. <p>I am u... | \n", 170 | "2008-09-20 19:29:55.377 | \n", 171 | "
46155 | \n", 174 | "46181 | \n", 175 | "Validate email address in JavaScript?. <p>How ... | \n", 176 | "2008-09-05 16:10:11.093 | \n", 177 | "
121499 | \n", 180 | "121708 | \n", 181 | "When onblur occurs, how can I find out which e... | \n", 182 | "2008-09-23 14:48:43.483 | \n", 183 | "
I nee... \n", 193 | "109086 109091 Stop setInterval call in JavaScript.
I am u... \n", 194 | "46155 46181 Validate email address in JavaScript?.
How ... \n", 195 | "121499 121708 When onblur occurs, how can I find out which e... \n", 196 | "\n", 197 | " CreationDate \n", 198 | "Id \n", 199 | "220231 2008-10-20 22:54:38.767 \n", 200 | "391979 2008-12-24 18:22:30.780 \n", 201 | "109086 2008-09-20 19:29:55.377 \n", 202 | "46155 2008-09-05 16:10:11.093 \n", 203 | "121499 2008-09-23 14:48:43.483 " 204 | ] 205 | }, 206 | "execution_count": 5, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "questions.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "
\n", 241 | " | AnswerId | \n", 242 | "Text0 | \n", 243 | "CreationDate | \n", 244 | "
---|---|---|---|
Id | \n", 247 | "\n", 248 | " | \n", 249 | " | \n", 250 | " |
665430 | \n", 255 | "665404 | \n", 256 | "Disable \"Back\" & \"Refresh\" Button in Browser. ... | \n", 257 | "2009-03-20 09:13:31.800 | \n", 258 | "
114525 | \n", 261 | "336868 | \n", 262 | "The difference between the two functions? (\"fu... | \n", 263 | "2008-09-22 12:24:06.583 | \n", 264 | "
1347093 | \n", 267 | "147765 | \n", 268 | "ASP.NET Page_Unload to stop user from leaving ... | \n", 269 | "2009-08-28 13:46:51.217 | \n", 270 | "
1208252 | \n", 273 | "26633883 | \n", 274 | "See if a variable is an array using JavaScript... | \n", 275 | "2009-07-30 17:57:42.363 | \n", 276 | "
177867 | \n", 279 | "122704 | \n", 280 | "How do I copy the data of an element with jque... | \n", 281 | "2008-10-07 10:23:40.017 | \n", 282 | "
\n", 340 | " | Text0 | \n", 341 | "
---|---|
Id | \n", 344 | "\n", 345 | " |
119473 | \n", 350 | "<p>Try <a href=\"http://johannburkard.de/blog/p... | \n", 351 | "
324533 | \n", 354 | "<p>Adapted from <a href=\"http://www.javascript... | \n", 355 | "
108232 | \n", 358 | "<p>That is known as a textbox watermark, and i... | \n", 359 | "
194399 | \n", 362 | "<p><strong>Obfuscation:</strong></p> <p>Try <a... | \n", 363 | "
80127 | \n", 366 | "<p>In JavaScript, \"this\" always refers to the ... | \n", 367 | "
Try Adapted from That is known as a textbox watermark, and i...\n",
378 | "194399 Obfuscation: Try How do I access a page's HTTP response headers via JavaScript? Related to this question, which was modified to ask about accessing two specific HTTP headers. Related: Unfortunately, there isn't an API to give you the HTTP response headers for your initial page request. That was the original question posted here. It has been repeatedly asked, too, because some people would like to get the actual response headers of the original page request without issuing another one. If an HTTP request is made over AJAX, it is possible to get the response headers with the The API was specified in the following candidate recommendation for XMLHttpRequest: XMLHttpRequest - W3C Candidate Recommendation 3 August 2010 Specifically, the The MDN documentation is good, too: developer.mozilla.org: This will not give you information about the original page request's HTTP response headers, but it could be used to make educated guesses about what those headers were. More on that is described next. This question was first asked several years ago, asking specifically about how to get at the original HTTP response headers for the current page (i.e. the same page inside of which the javascript was running). This is quite a different question than simply getting the response headers for any HTTP request. For the initial page request, the headers aren't readily available to javascript. Whether the header values you need will be reliably and sufficiently consistent if you request the same page again via AJAX will depend on your particular application. The following are a few suggestions for getting around that problem. If the response is largely static and the headers are not expected to change much between requests, you could make an AJAX request for the same page you're currently on and assume that they're they are the same values which were part of the page's HTTP response. 
This could allow you to access the headers you need using the nice XMLHttpRequest API described above. This approach will be problematic if you truly have to rely on the values being consistent between requests, since you can't fully guarantee that they are the same. It's going to depend on your specific application and\n"
476 | ]
477 | }
478 | ],
479 | "source": [
480 | "print(answers.at[questions.iloc[0, 0], 'Text0'])"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {},
486 | "source": [
487 | "Next, we use the helper functions to clean questions, duplicates and answers from unwanted text such as code, html \n",
488 | "tags and links. Notice that we add a new column 'Text' to each dataframe for clean text in lowercase."
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 11,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "# Clean up all text, and keep only data with some clean text.\n",
498 | "for df in (questions, dupes, answers):\n",
499 | " df[\"Text\"] = df.Text0.apply(clean_text).str.lower()"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 12,
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "questions = questions[questions.Text.str.len() > 0]\n",
509 | "answers = answers[answers.Text.str.len() > 0]\n",
510 | "dupes = dupes[dupes.Text.str.len() > 0]"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "Let's compare the first original question and cleaned version as an example."
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 13,
523 | "metadata": {},
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "Accessing the web page's HTTP Headers in JavaScript. How do I access a page's HTTP response headers via JavaScript? Related to this question, which was modified to ask about accessing two specific HTTP headers. Related:
\n"
408 | ]
409 | }
410 | ],
411 | "source": [
412 | "print(questions.iloc[0, 1])"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "Let's now check the duplicates for that question."
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 9,
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "name": "stdout",
429 | "output_type": "stream",
430 | "text": [
431 | " AnswerId Text0 \\\n",
432 | "Id \n",
433 | "3177208 220233 Monitoring http request header on a page.
How do I access the HTTP request header fields via JavaScript?
For AJAX Requests:getAllResponseHeaders()
method. It's part of the XMLHttpRequest API. To see how this can be applied, check out the fetchSimilarHeaders()
function below. Note that this is a work-around to the problem that won't be reliable for some applications.myXMLHttpRequest.getAllResponseHeaders();
getAllResponseHeaders()
method was specified in the following section: w3.org: XMLHttpRequest
: the getallresponseheaders()
method XMLHttpRequest
.
Getting header values from the Initial Page Request:
1. Requests on Resources which are largely staticfunction fetchSimilarHeaders (callback) { var request = new XMLHttpRequest(); request.onreadystatechange = function () { if (request.readyState === 4) { // // The following headers may often be similar // to those of the original page request... // if (callback && typeof callback === 'function') { callback(request.getAllResponseHeaders()); } } }; // // Re-request the same page (document.location) // We hope to get the same or similar response headers to those which // came with the current page, but we have no guarantee. // Since we are only after the headers, a HEAD request may be sufficient. // request.open('HEAD', document.location, true); request.send(null); }
\n"
530 | ]
531 | }
532 | ],
533 | "source": [
534 | "# Original question.\n",
535 | "print(questions.iloc[0, 1])"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": 14,
541 | "metadata": {},
542 | "outputs": [
543 | {
544 | "name": "stdout",
545 | "output_type": "stream",
546 | "text": [
547 | "accessing the web page's http headers in javascript. how do i access a page's http response headers via javascript? related to this question, which was modified to ask about accessing two specific http headers. related: how do i access the http request header fields via javascript? \n"
548 | ]
549 | }
550 | ],
551 | "source": [
552 | "# After cleaning.\n",
553 | "print(questions.iloc[0, 3])"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {},
559 | "source": [
560 | "It turns out that some duplicate questions were also in original questions. Also, some original questions and some \n",
561 | "duplicate questions were duplicated in the datasets. In the following, we remove them from the dataframes."
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 15,
567 | "metadata": {},
568 | "outputs": [],
569 | "source": [
570 | "# First, remove dupes that are questions, then remove duplicated questions and dupes.\n",
571 | "dupes = dupes[~dupes.index.isin(questions.index)]\n",
572 | "questions = questions[~questions.index.duplicated(keep='first')]\n",
573 | "dupes = dupes[~dupes.index.duplicated(keep='first')]"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {},
579 | "source": [
580 | "We also make sure we keep questions with answers and duplicates."
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 16,
586 | "metadata": {},
587 | "outputs": [],
588 | "source": [
589 | "# Keep only questions with answers and dupes, answers to questions, and dupes of questions.\n",
590 | "questions = questions[\n",
591 | " questions.AnswerId.isin(answers.index) & questions.AnswerId.isin(dupes.AnswerId)\n",
592 | "]\n",
593 | "answers = answers[answers.index.isin(questions.AnswerId)]\n",
594 | "dupes = dupes[dupes.AnswerId.isin(questions.AnswerId)]"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 17,
600 | "metadata": {},
601 | "outputs": [],
602 | "source": [
603 | "# Verify data integrity.\n",
604 | "assert questions.AnswerId.isin(answers.index).all()\n",
605 | "assert answers.index.isin(questions.AnswerId).all()\n",
606 | "assert questions.AnswerId.isin(dupes.AnswerId).all()\n",
607 | "assert dupes.AnswerId.isin(questions.AnswerId).all()"
608 | ]
609 | },
610 | {
611 | "cell_type": "markdown",
612 | "metadata": {},
613 | "source": [
614 | "Below are some statistics on the data. Notice that some questions have very low number of duplicates while others may \n",
615 | "have a large number. "
616 | ]
617 | },
618 | {
619 | "cell_type": "code",
620 | "execution_count": 18,
621 | "metadata": {},
622 | "outputs": [
623 | {
624 | "name": "stdout",
625 | "output_type": "stream",
626 | "text": [
627 | "Text statistics:\n",
628 | " count mean std min 25% 50% 75% max\n",
629 | "questions 1714.0 415.827305 319.857854 56.0 225.0 334.0 509.0 3982.0\n",
630 | "answers 1714.0 616.274212 673.060199 1.0 178.0 375.0 757.0 3982.0\n",
631 | "dupes 16139.0 441.303612 363.638297 25.0 247.0 357.0 519.0 3989.0\n",
632 | "\n",
633 | "Duplication statistics:\n",
634 | " count mean std min 25% 50% 75% max\n",
635 | "duplications 1714.0 9.415986 41.638847 1.0 3.0 4.0 7.0 1369.0\n",
636 | "\n",
637 | "Largest class: 8.48%\n"
638 | ]
639 | }
640 | ],
641 | "source": [
642 | "# Report on the data.\n",
643 | "print(\"Text statistics:\")\n",
644 | "print(\n",
645 | " pd.DataFrame(\n",
646 | " [\n",
647 | " questions.Text.str.len().describe().rename(\"questions\"),\n",
648 | " answers.Text.str.len().describe().rename(\"answers\"),\n",
649 | " dupes.Text.str.len().describe().rename(\"dupes\"),\n",
650 | " ]\n",
651 | " )\n",
652 | ")\n",
653 | "print(\"\\nDuplication statistics:\")\n",
654 | "print(pd.DataFrame([dupes.AnswerId.value_counts().describe().rename(\"duplications\")]))\n",
655 | "print(\n",
656 | " \"\\nLargest class: {:.2%}\".format(\n",
657 | " dupes.AnswerId.value_counts().max() / dupes.shape[0]\n",
658 | " )\n",
659 | ")"
660 | ]
661 | },
662 | {
663 | "cell_type": "markdown",
664 | "metadata": {},
665 | "source": [
666 | "Now, we reset all indexes to use them as columns in the rest of the steps."
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 19,
672 | "metadata": {},
673 | "outputs": [],
674 | "source": [
675 | "# Reset each dataframe's index.\n",
676 | "questions.reset_index(inplace=True)\n",
677 | "answers.reset_index(inplace=True)\n",
678 | "dupes.reset_index(inplace=True)"
679 | ]
680 | },
681 | {
682 | "cell_type": "markdown",
683 | "metadata": {},
684 | "source": [
685 | "We filter the questions and duplicates to have at least min_text number of characters."
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": 20,
691 | "metadata": {},
692 | "outputs": [],
693 | "source": [
694 | "# Apply the minimum text length to questions and dupes.\n",
695 | "questions = questions[questions.Text.str.len() >= min_text]\n",
696 | "dupes = dupes[dupes.Text.str.len() >= min_text]"
697 | ]
698 | },
699 | {
700 | "cell_type": "code",
701 | "execution_count": 21,
702 | "metadata": {},
703 | "outputs": [],
704 | "source": [
705 | "# Keep only questions with dupes, and dupes of questions.\n",
706 | "label_column = \"AnswerId\"\n",
707 | "questions = questions[questions[label_column].isin(dupes[label_column])]\n",
708 | "dupes = dupes[dupes[label_column].isin(questions[label_column])]"
709 | ]
710 | },
711 | {
712 | "cell_type": "markdown",
713 | "metadata": {},
714 | "source": [
715 | "Here, we remove questions and their duplicates that are less than min_dupes parameter."
716 | ]
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": 22,
721 | "metadata": {},
722 | "outputs": [],
723 | "source": [
724 | "# Restrict the questions to those with a minimum number of dupes.\n",
725 | "answerid_count = dupes.groupby(label_column)[label_column].count()\n",
726 | "answerid_min = answerid_count.index[answerid_count >= min_dupes]\n",
727 | "questions = questions[questions[label_column].isin(answerid_min)]\n",
728 | "dupes = dupes[dupes[label_column].isin(answerid_min)]"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": 23,
734 | "metadata": {},
735 | "outputs": [],
736 | "source": [
737 | " # Verify data integrity.\n",
738 | "assert questions[label_column].isin(dupes[label_column]).all()\n",
739 | "assert dupes[label_column].isin(questions[label_column]).all()"
740 | ]
741 | },
742 | {
743 | "cell_type": "markdown",
744 | "metadata": {},
745 | "source": [
746 | "Here are some statistics on the resulting dataset."
747 | ]
748 | },
749 | {
750 | "cell_type": "code",
751 | "execution_count": 24,
752 | "metadata": {},
753 | "outputs": [
754 | {
755 | "name": "stdout",
756 | "output_type": "stream",
757 | "text": [
758 | "Restrictions: min_text=150, min_dupes=12\n",
759 | "Restricted text statistics:\n",
760 | " count mean std min 25% 50% 75% max\n",
761 | "questions 182.0 413.450549 218.028193 153.0 264.25 338.5 510.5 1475.0\n",
762 | "dupes 8260.0 479.882324 398.791447 150.0 270.00 380.0 553.0 3989.0\n",
763 | "\n",
764 | "Restricted duplication statistics:\n",
765 | " count mean std min 25% 50% 75% max\n",
766 | "duplications 182.0 45.384615 117.074823 12.0 15.0 20.0 33.0 1328.0\n",
767 | "\n",
768 | "Restricted largest class: 16.08%\n"
769 | ]
770 | }
771 | ],
772 | "source": [
773 | "# Report on the data.\n",
774 | "print(\"Restrictions: min_text={}, min_dupes={}\".format(min_text, min_dupes))\n",
775 | "print(\"Restricted text statistics:\")\n",
776 | "print(\n",
777 | " pd.DataFrame(\n",
778 | " [\n",
779 | " questions.Text.str.len().describe().rename(\"questions\"),\n",
780 | " dupes.Text.str.len().describe().rename(\"dupes\"),\n",
781 | " ]\n",
782 | " )\n",
783 | ")\n",
784 | "print(\"\\nRestricted duplication statistics:\")\n",
785 | "print(\n",
786 | " pd.DataFrame([dupes[label_column].value_counts().describe().rename(\"duplications\")])\n",
787 | ")\n",
788 | "print(\n",
789 | " \"\\nRestricted largest class: {:.2%}\".format(\n",
790 | " dupes[label_column].value_counts().max() / dupes.shape[0]\n",
791 | " )\n",
792 | ")"
793 | ]
794 | },
795 | {
796 | "cell_type": "markdown",
797 | "metadata": {},
798 | "source": [
799 | "## Prepare train and test sets"
800 | ]
801 | },
802 | {
803 | "cell_type": "markdown",
804 | "metadata": {},
805 | "source": [
806 | "In this part, we prepare train and test sets. For training a binary classification model, we will need to construct \n",
807 | "match and non-match pairs from duplicates and their questions. Finding matching pairs can be accomplished by joining \n",
808 | "each duplicate with its question. However, non-match examples need to be constructed randomly. "
809 | ]
810 | },
811 | {
812 | "cell_type": "markdown",
813 | "metadata": {},
814 | "source": [
815 | "As a first step, to make sure we train and test the performance of the model on each question, we will need to have \n",
816 | "examples of match and non-match pairs for each question both in train and test sets. In order to achieve that, \n",
817 | "we split the duplicates in a stratified manner into train and test sets making sure at least 1 or more duplicates per \n",
818 | "question is in the test set depending on test_size parameter and number of duplicates per each question."
819 | ]
820 | },
821 | {
822 | "cell_type": "code",
823 | "execution_count": 25,
824 | "metadata": {},
825 | "outputs": [],
826 | "source": [
827 | "# Split dupes into train and test ensuring at least one of each label class is in test.\n",
828 | "dupes_test = round_sample_strat(dupes, dupes[label_column], frac=test_size)\n",
829 | "dupes_train = dupes[~dupes.Id.isin(dupes_test.Id)]"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": 26,
835 | "metadata": {},
836 | "outputs": [],
837 | "source": [
838 | "assert dupes_test[label_column].unique().shape[0] == dupes[label_column].unique().shape[0]"
839 | ]
840 | },
841 | {
842 | "cell_type": "code",
843 | "execution_count": 27,
844 | "metadata": {},
845 | "outputs": [],
846 | "source": [
847 | "# The relevant columns for text pairs data.\n",
848 | "balanced_pairs_columns = ['Id_x', 'AnswerId_x', 'Text_x', 'Id_y', 'Text_y', 'AnswerId_y', 'Label', 'n']"
849 | ]
850 | },
851 | {
852 | "cell_type": "markdown",
853 | "metadata": {},
854 | "source": [
855 | "Next, we pair each training duplicate in train set with its matching question and N-1 random questions using the \n",
856 | "helper function."
857 | ]
858 | },
859 | {
860 | "cell_type": "code",
861 | "execution_count": 28,
862 | "metadata": {},
863 | "outputs": [],
864 | "source": [
865 | "# Use AnswerId to pair each training dupe with its matching question and also with N-1 questions not its match.\n",
866 | "balanced_pairs_train = random_merge(dupes_train, questions, N=match)"
867 | ]
868 | },
869 | {
870 | "cell_type": "markdown",
871 | "metadata": {},
872 | "source": [
873 | "Labeling is done such that matching pairs are labeled as 1 and non-match pairs are labeled as 0."
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "execution_count": 29,
879 | "metadata": {},
880 | "outputs": [],
881 | "source": [
882 | "# Label records by matching AnswerIds.\n",
883 | "balanced_pairs_train[\"Label\"] = (\n",
884 | " balanced_pairs_train.AnswerId_x == balanced_pairs_train.AnswerId_y\n",
885 | ").astype(int)"
886 | ]
887 | },
888 | {
889 | "cell_type": "code",
890 | "execution_count": 30,
891 | "metadata": {},
892 | "outputs": [],
893 | "source": [
894 | "# Keep only the relevant data.\n",
895 | "balanced_pairs_train = balanced_pairs_train[balanced_pairs_columns]"
896 | ]
897 | },
898 | {
899 | "cell_type": "code",
900 | "execution_count": 31,
901 | "metadata": {},
902 | "outputs": [
903 | {
904 | "data": {
905 | "text/html": [
906 | "
How do I access the HTTP request header fields via JavaScript?\n",
921 | " \n",
922 | "
\n",
992 | "\n",
923 | " \n",
933 | " \n",
934 | " \n",
935 | " \n",
924 | " Id_x \n",
925 | " AnswerId_x \n",
926 | " Text_x \n",
927 | " Id_y \n",
928 | " Text_y \n",
929 | " AnswerId_y \n",
930 | " Label \n",
931 | " n \n",
932 | " \n",
936 | " \n",
946 | " 0 \n",
937 | " 177867 \n",
938 | " 122704 \n",
939 | " how do i copy the data of an element with jque... \n",
940 | " 122102 \n",
941 | " what is the most efficient way to clone an obj... \n",
942 | " 122704 \n",
943 | " 1 \n",
944 | " 0 \n",
945 | " \n",
947 | " \n",
957 | " 1 \n",
948 | " 565430 \n",
949 | " 122704 \n",
950 | " (deep) copying an array using jquery. possibl... \n",
951 | " 122102 \n",
952 | " what is the most efficient way to clone an obj... \n",
953 | " 122704 \n",
954 | " 1 \n",
955 | " 0 \n",
956 | " \n",
958 | " \n",
968 | " 2 \n",
959 | " 3474697 \n",
960 | " 122704 \n",
961 | " how to clone js object?. possible duplicate: ... \n",
962 | " 122102 \n",
963 | " what is the most efficient way to clone an obj... \n",
964 | " 122704 \n",
965 | " 1 \n",
966 | " 0 \n",
967 | " \n",
969 | " \n",
979 | " 3 \n",
970 | " 10801878 \n",
971 | " 122704 \n",
972 | " how can i copy a variable without pointing to ... \n",
973 | " 122102 \n",
974 | " what is the most efficient way to clone an obj... \n",
975 | " 122704 \n",
976 | " 1 \n",
977 | " 0 \n",
978 | " \n",
980 | " \n",
990 | " \n",
991 | "4 \n",
981 | " 9610918 \n",
982 | " 122704 \n",
983 | " how do i get a new reference to an object. po... \n",
984 | " 122102 \n",
985 | " what is the most efficient way to clone an obj... \n",
986 | " 122704 \n",
987 | " 1 \n",
988 | " 0 \n",
989 | " \n",
1098 | " \n",
1099 | "
\n",
1169 | "\n",
1100 | " \n",
1110 | " \n",
1111 | " \n",
1112 | " \n",
1101 | " Id_x \n",
1102 | " AnswerId_x \n",
1103 | " Text_x \n",
1104 | " Id_y \n",
1105 | " Text_y \n",
1106 | " AnswerId_y \n",
1107 | " Label \n",
1108 | " n \n",
1109 | " \n",
1113 | " \n",
1123 | " 0 \n",
1114 | " 18045953 \n",
1115 | " 6700 \n",
1116 | " getting the length of a 'named' array?. i'm no... \n",
1117 | " 5223 \n",
1118 | " length of a javascript object (that is, associ... \n",
1119 | " 6700 \n",
1120 | " 1 \n",
1121 | " 0 \n",
1122 | " \n",
1124 | " \n",
1134 | " 1 \n",
1125 | " 8702219 \n",
1126 | " 6700 \n",
1127 | " how to get javascript hash table count?. poss... \n",
1128 | " 5223 \n",
1129 | " length of a javascript object (that is, associ... \n",
1130 | " 6700 \n",
1131 | " 1 \n",
1132 | " 0 \n",
1133 | " \n",
1135 | " \n",
1145 | " 2 \n",
1136 | " 14485336 \n",
1137 | " 27943 \n",
1138 | " calculate distance between two geolocs in java... \n",
1139 | " 27928 \n",
1140 | " calculate distance between two latitude-longit... \n",
1141 | " 27943 \n",
1142 | " 1 \n",
1143 | " 0 \n",
1144 | " \n",
1146 | " \n",
1156 | " 3 \n",
1147 | " 21383582 \n",
1148 | " 27943 \n",
1149 | " android java calculate distance many coordinat... \n",
1150 | " 27928 \n",
1151 | " calculate distance between two latitude-longit... \n",
1152 | " 27943 \n",
1153 | " 1 \n",
1154 | " 0 \n",
1155 | " \n",
1157 | " \n",
1167 | " \n",
1168 | "4 \n",
1158 | " 27140796 \n",
1159 | " 31047 \n",
1160 | " how can i check if append element already exis... \n",
1161 | " 31044 \n",
1162 | " is there an \"exists\" function for jquery?. how... \n",
1163 | " 31047 \n",
1164 | " 1 \n",
1165 | " 0 \n",
1166 | "