├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── examples
│   ├── CIFAR10_Classification_transfer_Learning.ipynb
│   └── CIFAR10_classification.ipynb
└── microsoftvision
    ├── __init__.py
    ├── models
    │   ├── __init__.py
    │   ├── resnext.py
    │   └── utils.py
    └── version.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Microsoft Vision
2 |
3 |
4 | ## Installation
5 | ``pip install microsoftvision``
6 |
7 |
8 | ## Usage
9 | Input images should be in BGR format of shape (3 x H x W), where H and W are expected to be at least 224.
10 | The images have to be loaded into a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
11 |
12 | Example script:
13 | ```
14 | import microsoftvision
15 | import torch
16 |
17 | # This will load the pretrained model
18 | model = microsoftvision.models.resnet50(pretrained=True)
19 |
20 | # Load the model to CPU memory; the interface is the same as torchvision's
21 | model = microsoftvision.models.resnet50(pretrained=True, map_location=torch.device('cpu'))
22 | ```
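
For feature extraction without fine-tuning, the model can be set to evaluation mode and moved to the GPU, as done in the example notebooks. A minimal sketch:
```
import microsoftvision
import torch

model = microsoftvision.models.resnet50(pretrained=True)
model.eval()  # extract features only, no fine-tuning
if torch.cuda.is_available():
    model = model.cuda()
```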
23 |
24 | Example of creating image embeddings:
25 | ```
26 | import microsoftvision
27 | from torchvision import transforms
28 | import torch
29 | import cv2
30 |
31 | def get_image():
32 | img = cv2.imread('example.jpg', cv2.IMREAD_COLOR)
33 | preprocess = transforms.Compose([
34 | transforms.ToPILImage(),
35 | transforms.Resize(224),
36 | transforms.CenterCrop(224),
37 | transforms.ToTensor(),
38 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
39 | ])
40 |     return preprocess(img).unsqueeze(0) # unsqueeze adds the batch dimension when passing a single image
41 |
42 | model = microsoftvision.models.resnet50(pretrained=True)
43 | features = model(get_image())
44 | print(features.shape)
45 | ```
46 | This should output:
47 | ```
48 | ...
49 | torch.Size([1, 2048])
50 | ```
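
If your images are loaded as RGB (for example with PIL) rather than with OpenCV, the channel order has to be flipped before they are fed to the model. The sketch below mirrors the `Preprocess` class used in the example notebooks; `example.jpg` is a placeholder file name, as in the example above:
```
import microsoftvision
from torchvision import transforms
from PIL import Image

# The normalization statistics are listed in reverse so that, after the RGB -> BGR flip below,
# the channels end up normalized with the mean/std values stated at the top of this section.
preprocess = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),  # scales pixel values to [0, 1]
    transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229]),
])

def get_bgr_input(path):
    img = Image.open(path).convert('RGB')  # PIL loads images as RGB
    return preprocess(img)[[2, 1, 0], :, :].unsqueeze(0)  # flip to BGR, add batch dimension

model = microsoftvision.models.resnet50(pretrained=True)
embedding = model(get_bgr_input('example.jpg'))  # expected shape: torch.Size([1, 2048])
```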
51 |
52 |
53 |
54 | ## Contributing
55 |
56 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
57 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
58 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
59 |
60 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
61 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
62 | provided by the bot. You will only need to do this once across all repos using our CLA.
63 |
64 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
65 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
66 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
67 |
68 | ## Trademarks
69 |
70 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
71 | trademarks or logos is subject to and must follow
72 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
73 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
74 | Any use of third-party trademarks or logos is subject to those third parties' policies.
75 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 |
41 |
--------------------------------------------------------------------------------
/examples/CIFAR10_Classification_transfer_Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "CIFAR10_Classification_transfer_Learning.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | },
17 | "accelerator": "GPU",
18 | "widgets": {
19 | "application/vnd.jupyter.widget-state+json": {
20 | "bd820309094f48b38f473a235ef5a93d": {
21 | "model_module": "@jupyter-widgets/controls",
22 | "model_name": "HBoxModel",
23 | "state": {
24 | "_view_name": "HBoxView",
25 | "_dom_classes": [],
26 | "_model_name": "HBoxModel",
27 | "_view_module": "@jupyter-widgets/controls",
28 | "_model_module_version": "1.5.0",
29 | "_view_count": null,
30 | "_view_module_version": "1.5.0",
31 | "box_style": "",
32 | "layout": "IPY_MODEL_3a7877ed8e174bf38de1d4ca2ee29e4c",
33 | "_model_module": "@jupyter-widgets/controls",
34 | "children": [
35 | "IPY_MODEL_1f0f20c8720c4dcd9917057ff1c7dc3f",
36 | "IPY_MODEL_47e1b2f348aa4a2c9d039706260d5781"
37 | ]
38 | }
39 | },
40 | "3a7877ed8e174bf38de1d4ca2ee29e4c": {
41 | "model_module": "@jupyter-widgets/base",
42 | "model_name": "LayoutModel",
43 | "state": {
44 | "_view_name": "LayoutView",
45 | "grid_template_rows": null,
46 | "right": null,
47 | "justify_content": null,
48 | "_view_module": "@jupyter-widgets/base",
49 | "overflow": null,
50 | "_model_module_version": "1.2.0",
51 | "_view_count": null,
52 | "flex_flow": null,
53 | "width": null,
54 | "min_width": null,
55 | "border": null,
56 | "align_items": null,
57 | "bottom": null,
58 | "_model_module": "@jupyter-widgets/base",
59 | "top": null,
60 | "grid_column": null,
61 | "overflow_y": null,
62 | "overflow_x": null,
63 | "grid_auto_flow": null,
64 | "grid_area": null,
65 | "grid_template_columns": null,
66 | "flex": null,
67 | "_model_name": "LayoutModel",
68 | "justify_items": null,
69 | "grid_row": null,
70 | "max_height": null,
71 | "align_content": null,
72 | "visibility": null,
73 | "align_self": null,
74 | "height": null,
75 | "min_height": null,
76 | "padding": null,
77 | "grid_auto_rows": null,
78 | "grid_gap": null,
79 | "max_width": null,
80 | "order": null,
81 | "_view_module_version": "1.2.0",
82 | "grid_template_areas": null,
83 | "object_position": null,
84 | "object_fit": null,
85 | "grid_auto_columns": null,
86 | "margin": null,
87 | "display": null,
88 | "left": null
89 | }
90 | },
91 | "1f0f20c8720c4dcd9917057ff1c7dc3f": {
92 | "model_module": "@jupyter-widgets/controls",
93 | "model_name": "FloatProgressModel",
94 | "state": {
95 | "_view_name": "ProgressView",
96 | "style": "IPY_MODEL_b2ad45b530124e2f81a4fcb8f60d6893",
97 | "_dom_classes": [],
98 | "description": "",
99 | "_model_name": "FloatProgressModel",
100 | "bar_style": "success",
101 | "max": 170498071,
102 | "_view_module": "@jupyter-widgets/controls",
103 | "_model_module_version": "1.5.0",
104 | "value": 170498071,
105 | "_view_count": null,
106 | "_view_module_version": "1.5.0",
107 | "orientation": "horizontal",
108 | "min": 0,
109 | "description_tooltip": null,
110 | "_model_module": "@jupyter-widgets/controls",
111 | "layout": "IPY_MODEL_11a0f5cf47914b839188653402d94651"
112 | }
113 | },
114 | "47e1b2f348aa4a2c9d039706260d5781": {
115 | "model_module": "@jupyter-widgets/controls",
116 | "model_name": "HTMLModel",
117 | "state": {
118 | "_view_name": "HTMLView",
119 | "style": "IPY_MODEL_36dc7a4e4687495e8430f27749e099a3",
120 | "_dom_classes": [],
121 | "description": "",
122 | "_model_name": "HTMLModel",
123 | "placeholder": "",
124 | "_view_module": "@jupyter-widgets/controls",
125 | "_model_module_version": "1.5.0",
126 | "value": " 170499072/? [00:05<00:00, 29465459.73it/s]",
127 | "_view_count": null,
128 | "_view_module_version": "1.5.0",
129 | "description_tooltip": null,
130 | "_model_module": "@jupyter-widgets/controls",
131 | "layout": "IPY_MODEL_8dcb70e7032941cf9e7f2bcefe775486"
132 | }
133 | },
134 | "b2ad45b530124e2f81a4fcb8f60d6893": {
135 | "model_module": "@jupyter-widgets/controls",
136 | "model_name": "ProgressStyleModel",
137 | "state": {
138 | "_view_name": "StyleView",
139 | "_model_name": "ProgressStyleModel",
140 | "description_width": "initial",
141 | "_view_module": "@jupyter-widgets/base",
142 | "_model_module_version": "1.5.0",
143 | "_view_count": null,
144 | "_view_module_version": "1.2.0",
145 | "bar_color": null,
146 | "_model_module": "@jupyter-widgets/controls"
147 | }
148 | },
149 | "11a0f5cf47914b839188653402d94651": {
150 | "model_module": "@jupyter-widgets/base",
151 | "model_name": "LayoutModel",
152 | "state": {
153 | "_view_name": "LayoutView",
154 | "grid_template_rows": null,
155 | "right": null,
156 | "justify_content": null,
157 | "_view_module": "@jupyter-widgets/base",
158 | "overflow": null,
159 | "_model_module_version": "1.2.0",
160 | "_view_count": null,
161 | "flex_flow": null,
162 | "width": null,
163 | "min_width": null,
164 | "border": null,
165 | "align_items": null,
166 | "bottom": null,
167 | "_model_module": "@jupyter-widgets/base",
168 | "top": null,
169 | "grid_column": null,
170 | "overflow_y": null,
171 | "overflow_x": null,
172 | "grid_auto_flow": null,
173 | "grid_area": null,
174 | "grid_template_columns": null,
175 | "flex": null,
176 | "_model_name": "LayoutModel",
177 | "justify_items": null,
178 | "grid_row": null,
179 | "max_height": null,
180 | "align_content": null,
181 | "visibility": null,
182 | "align_self": null,
183 | "height": null,
184 | "min_height": null,
185 | "padding": null,
186 | "grid_auto_rows": null,
187 | "grid_gap": null,
188 | "max_width": null,
189 | "order": null,
190 | "_view_module_version": "1.2.0",
191 | "grid_template_areas": null,
192 | "object_position": null,
193 | "object_fit": null,
194 | "grid_auto_columns": null,
195 | "margin": null,
196 | "display": null,
197 | "left": null
198 | }
199 | },
200 | "36dc7a4e4687495e8430f27749e099a3": {
201 | "model_module": "@jupyter-widgets/controls",
202 | "model_name": "DescriptionStyleModel",
203 | "state": {
204 | "_view_name": "StyleView",
205 | "_model_name": "DescriptionStyleModel",
206 | "description_width": "",
207 | "_view_module": "@jupyter-widgets/base",
208 | "_model_module_version": "1.5.0",
209 | "_view_count": null,
210 | "_view_module_version": "1.2.0",
211 | "_model_module": "@jupyter-widgets/controls"
212 | }
213 | },
214 | "8dcb70e7032941cf9e7f2bcefe775486": {
215 | "model_module": "@jupyter-widgets/base",
216 | "model_name": "LayoutModel",
217 | "state": {
218 | "_view_name": "LayoutView",
219 | "grid_template_rows": null,
220 | "right": null,
221 | "justify_content": null,
222 | "_view_module": "@jupyter-widgets/base",
223 | "overflow": null,
224 | "_model_module_version": "1.2.0",
225 | "_view_count": null,
226 | "flex_flow": null,
227 | "width": null,
228 | "min_width": null,
229 | "border": null,
230 | "align_items": null,
231 | "bottom": null,
232 | "_model_module": "@jupyter-widgets/base",
233 | "top": null,
234 | "grid_column": null,
235 | "overflow_y": null,
236 | "overflow_x": null,
237 | "grid_auto_flow": null,
238 | "grid_area": null,
239 | "grid_template_columns": null,
240 | "flex": null,
241 | "_model_name": "LayoutModel",
242 | "justify_items": null,
243 | "grid_row": null,
244 | "max_height": null,
245 | "align_content": null,
246 | "visibility": null,
247 | "align_self": null,
248 | "height": null,
249 | "min_height": null,
250 | "padding": null,
251 | "grid_auto_rows": null,
252 | "grid_gap": null,
253 | "max_width": null,
254 | "order": null,
255 | "_view_module_version": "1.2.0",
256 | "grid_template_areas": null,
257 | "object_position": null,
258 | "object_fit": null,
259 | "grid_auto_columns": null,
260 | "margin": null,
261 | "display": null,
262 | "left": null
263 | }
264 | }
265 | }
266 | }
267 | },
268 | "cells": [
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {
272 | "id": "RWSeWEwL0yo5"
273 | },
274 | "source": [
275 | "# **Microsoft Vision classification example**\n",
276 | "\n",
277 |         "This example shows a simple way to use Microsoft Vision with PyTorch for transfer learning and feature extraction from the Microsoft Vision model.\n",
278 |         "\n",
279 |         "It shows how to plug a fully connected neural network on top of the vision model, which provides features extracted from the data."
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {
285 | "id": "Xr1xFVth2niR"
286 | },
287 | "source": [
288 |         "Start with the necessary imports."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "metadata": {
294 | "id": "7IKQz4Rjc2RU"
295 | },
296 | "source": [
297 | "import time\n",
298 | "import torch\n",
299 | "import numpy as np\n",
300 | "import torch.nn as nn\n",
301 | "from torch import Tensor\n",
302 | "import torch.optim as optim\n",
303 | "import torch.nn.functional as F\n",
304 | "from torch.utils.data import DataLoader,TensorDataset\n",
305 | "from torchvision.datasets import CIFAR10\n",
306 | "import torchvision.transforms as transforms\n",
307 | "from progressbar import progressbar"
308 | ],
309 | "execution_count": 1,
310 | "outputs": []
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {
315 | "id": "JbdB_EU_2uSK"
316 | },
317 | "source": [
318 |         "## Install the Microsoft Vision package"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "metadata": {
324 | "colab": {
325 | "base_uri": "https://localhost:8080/"
326 | },
327 | "id": "IIDSsFp7olYB",
328 | "outputId": "126ec78e-56af-4cde-a0ef-eb451ee54115"
329 | },
330 | "source": [
331 | "!pip install microsoftvision\n",
332 | "\n",
333 | "import microsoftvision"
334 | ],
335 | "execution_count": 2,
336 | "outputs": [
337 | {
338 | "output_type": "stream",
339 | "text": [
340 | "Collecting microsoftvision\n",
341 | " Downloading https://files.pythonhosted.org/packages/71/db/65a4aebd1eac4c5920ac5fcf7c964f9834675b129ef82871435ea902b393/microsoftvision-1.0.5-py3-none-any.whl\n",
342 | "Requirement already satisfied: torch>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from microsoftvision) (1.8.0+cu101)\n",
343 | "Collecting azure-identity\n",
344 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2a/35/64e29615e7709c10c4f1d4310a8c13a6770142e9fcb9358fb8fa4d9b1578/azure_identity-1.5.0-py2.py3-none-any.whl (103kB)\n",
345 | "\u001b[K |████████████████████████████████| 112kB 22.0MB/s \n",
346 | "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from microsoftvision) (4.41.1)\n",
347 | "Collecting azure-storage-blob\n",
348 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/09/14/4ca417a9c92b0fb93516575dd7be9b058bf13d531dcc21239b5f8f216a69/azure_storage_blob-12.8.0-py2.py3-none-any.whl (341kB)\n",
349 | "\u001b[K |████████████████████████████████| 348kB 14.4MB/s \n",
350 | "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torch>=1.2.0->microsoftvision) (1.19.5)\n",
351 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=1.2.0->microsoftvision) (3.7.4.3)\n",
352 | "Collecting msal<2.0.0,>=1.6.0\n",
353 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e6/69/83ffc3004a19140a3c5d7151d7f79c280ac1b40a425fe5308b879eefcf25/msal-1.10.0-py2.py3-none-any.whl (60kB)\n",
354 | "\u001b[K |████████████████████████████████| 61kB 6.9MB/s \n",
355 | "\u001b[?25hRequirement already satisfied: six>=1.6 in /usr/local/lib/python3.7/dist-packages (from azure-identity->microsoftvision) (1.15.0)\n",
356 | "Collecting azure-core<2.0.0,>=1.0.0\n",
357 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3d/6d/95ca41cace692206529bddae67ee0a079e97f250951f28a213eea16d5050/azure_core-1.12.0-py2.py3-none-any.whl (130kB)\n",
358 | "\u001b[K |████████████████████████████████| 133kB 60.2MB/s \n",
359 | "\u001b[?25hCollecting cryptography>=2.1.4\n",
360 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f8/1f/acde6ff69864c5e78b56488e3afd93c1ccc8c2651186e2a5f93d93f64859/cryptography-3.4.6-cp36-abi3-manylinux2014_x86_64.whl (3.2MB)\n",
361 | "\u001b[K |████████████████████████████████| 3.2MB 52.4MB/s \n",
362 | "\u001b[?25hCollecting msal-extensions~=0.3.0\n",
363 | " Downloading https://files.pythonhosted.org/packages/49/cb/c833ffa0f97c3098b146ac19bb2266c2d84b2119ffff83fdf001bb59d3ae/msal_extensions-0.3.0-py2.py3-none-any.whl\n",
364 | "Collecting msrest>=0.6.18\n",
365 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e8/cc/6c96bfb3d3cf4c3bdedfa6b46503223f4c2a4fa388377697e0f8082a4fed/msrest-0.6.21-py2.py3-none-any.whl (85kB)\n",
366 | "\u001b[K |████████████████████████████████| 92kB 11.3MB/s \n",
367 | "\u001b[?25hRequirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from msal<2.0.0,>=1.6.0->azure-identity->microsoftvision) (2.23.0)\n",
368 | "Collecting PyJWT[crypto]<3,>=1.0.0\n",
369 | " Downloading https://files.pythonhosted.org/packages/b4/9b/8850f99027ed029af6828199cc87179eaccbbf1f9e6e373e7f0177d32dad/PyJWT-2.0.1-py3-none-any.whl\n",
370 | "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.7/dist-packages (from cryptography>=2.1.4->azure-identity->microsoftvision) (1.14.5)\n",
371 | "Collecting portalocker~=1.0; platform_system != \"Windows\"\n",
372 | " Downloading https://files.pythonhosted.org/packages/3b/e7/ceef002a300a98a208232fab593183249b6964b306ee7dabb29908419cca/portalocker-1.7.1-py2.py3-none-any.whl\n",
373 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from msrest>=0.6.18->azure-storage-blob->microsoftvision) (2020.12.5)\n",
374 | "Requirement already satisfied: requests-oauthlib>=0.5.0 in /usr/local/lib/python3.7/dist-packages (from msrest>=0.6.18->azure-storage-blob->microsoftvision) (1.3.0)\n",
375 | "Collecting isodate>=0.6.0\n",
376 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)\n",
377 | "\u001b[K |████████████████████████████████| 51kB 7.6MB/s \n",
378 | "\u001b[?25hRequirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.0.0->msal<2.0.0,>=1.6.0->azure-identity->microsoftvision) (1.24.3)\n",
379 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.0.0->msal<2.0.0,>=1.6.0->azure-identity->microsoftvision) (3.0.4)\n",
380 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.0.0->msal<2.0.0,>=1.6.0->azure-identity->microsoftvision) (2.10)\n",
381 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.12->cryptography>=2.1.4->azure-identity->microsoftvision) (2.20)\n",
382 | "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.5.0->msrest>=0.6.18->azure-storage-blob->microsoftvision) (3.1.0)\n",
383 | "Installing collected packages: cryptography, PyJWT, msal, azure-core, portalocker, msal-extensions, azure-identity, isodate, msrest, azure-storage-blob, microsoftvision\n",
384 | "Successfully installed PyJWT-2.0.1 azure-core-1.12.0 azure-identity-1.5.0 azure-storage-blob-12.8.0 cryptography-3.4.6 isodate-0.6.0 microsoftvision-1.0.5 msal-1.10.0 msal-extensions-0.3.0 msrest-0.6.21 portalocker-1.7.1\n"
385 | ],
386 | "name": "stdout"
387 | }
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {
393 | "id": "QQS10yGI3R4z"
394 | },
395 | "source": [
396 | "## Preprocess the Input Images\n",
397 | "\n",
398 |         "The Microsoft Vision model uses images in BGR format, hence the swapping of image channels at the end of preprocessing."
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "metadata": {
404 | "id": "vxUUWJGuolfS"
405 | },
406 | "source": [
407 | "class Preprocess:\n",
408 | " def __init__(self):\n",
409 | " self.preprocess = transforms.Compose([\n",
410 | " transforms.Resize(224),\n",
411 | " transforms.CenterCrop(224),\n",
412 | " transforms.ToTensor(),\n",
413 | " transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])])\n",
414 | "\n",
415 | " def __call__(self, x):\n",
416 | " return self.preprocess(x)[[2,1,0],:,:]"
417 | ],
418 | "execution_count": 3,
419 | "outputs": []
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "metadata": {
424 | "id": "eqxNc9dc3bNU"
425 | },
426 | "source": [
427 |         "Import the CIFAR-10 dataset, split into train and test sets. This can be replaced with any dataset without any changes to the rest of the code.\n",
428 |         "\n",
429 |         "The CIFAR10 dataset class can also be replaced with PyTorch's torchvision.datasets.ImageFolder.\n",
430 |         "\n",
431 |         "ImageFolder is a generic data loader where the images are arranged in this way:\n",
432 |         "\n",
433 |         ">root/dog/xxx.png\n",
434 |         "> root/dog/xxy.png\n",
435 |         ">root/dog/[...]/xxz.png\n",
436 |         ">root/cat/123.png\n",
437 |         ">root/cat/nsdf3.png\n",
438 |         ">root/cat/[...]/asd932_.png"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "metadata": {
444 | "colab": {
445 | "base_uri": "https://localhost:8080/",
446 | "height": 117,
447 | "referenced_widgets": [
448 | "bd820309094f48b38f473a235ef5a93d",
449 | "3a7877ed8e174bf38de1d4ca2ee29e4c",
450 | "1f0f20c8720c4dcd9917057ff1c7dc3f",
451 | "47e1b2f348aa4a2c9d039706260d5781",
452 | "b2ad45b530124e2f81a4fcb8f60d6893",
453 | "11a0f5cf47914b839188653402d94651",
454 | "36dc7a4e4687495e8430f27749e099a3",
455 | "8dcb70e7032941cf9e7f2bcefe775486"
456 | ]
457 | },
458 | "id": "KPHTGjJYoljB",
459 | "outputId": "67095cbf-55ad-4534-c5c4-5f52bfbbec77"
460 | },
461 | "source": [
462 | "train_dataset = CIFAR10('path', download=True, train=True, transform=Preprocess())\n",
463 | "test_dataset = CIFAR10('path', download=True, train=False, transform=Preprocess())"
464 | ],
465 | "execution_count": 4,
466 | "outputs": [
467 | {
468 | "output_type": "stream",
469 | "text": [
470 | "Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to path/cifar-10-python.tar.gz\n"
471 | ],
472 | "name": "stdout"
473 | },
474 | {
475 | "output_type": "display_data",
476 | "data": {
477 | "application/vnd.jupyter.widget-view+json": {
478 | "model_id": "bd820309094f48b38f473a235ef5a93d",
479 | "version_minor": 0,
480 | "version_major": 2
481 | },
482 | "text/plain": [
483 | "HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))"
484 | ]
485 | },
486 | "metadata": {
487 | "tags": []
488 | }
489 | },
490 | {
491 | "output_type": "stream",
492 | "text": [
493 | "\n",
494 | "Extracting path/cifar-10-python.tar.gz to path\n",
495 | "Files already downloaded and verified\n"
496 | ],
497 | "name": "stdout"
498 | }
499 | ]
500 | },
501 | {
502 | "cell_type": "markdown",
503 | "metadata": {
504 | "id": "Bc2Nzklg-L9f"
505 | },
506 | "source": [
507 | "## Loading Microsoft Vision pretrained model"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "metadata": {
513 | "colab": {
514 | "base_uri": "https://localhost:8080/"
515 | },
516 | "id": "tAHrdn_ho3ta",
517 | "outputId": "adf7778c-6635-4fb3-b04d-f921212bb975"
518 | },
519 | "source": [
520 | "model = microsoftvision.models.resnet50(pretrained=True)"
521 | ],
522 | "execution_count": 5,
523 | "outputs": [
524 | {
525 | "output_type": "stream",
526 | "text": [
527 | "Loading Microsoft Vision pretrained model\n",
528 | "Downloading model.\n"
529 | ],
530 | "name": "stdout"
531 | },
532 | {
533 | "output_type": "stream",
534 | "text": [
535 | "\r 0%| | 0/23 [00:00, ?MB/s]"
536 | ],
537 | "name": "stderr"
538 | },
539 | {
540 | "output_type": "stream",
541 | "text": [
542 | "Model size: 89 MB\n"
543 | ],
544 | "name": "stdout"
545 | },
546 | {
547 | "output_type": "stream",
548 | "text": [
549 | "100%|██████████| 23/23 [00:36<00:00, 1.61s/MB]"
550 | ],
551 | "name": "stderr"
552 | },
553 | {
554 | "output_type": "stream",
555 | "text": [
556 | "Model saved to MicrosoftVision.ResNet50.tar\n"
557 | ],
558 | "name": "stdout"
559 | },
560 | {
561 | "output_type": "stream",
562 | "text": [
563 | "\n"
564 | ],
565 | "name": "stderr"
566 | }
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "metadata": {
572 | "colab": {
573 | "base_uri": "https://localhost:8080/"
574 | },
575 | "id": "1mYUwS3yo4Oy",
576 | "outputId": "5f46d18b-c0f0-409d-a491-e47b61839f67"
577 | },
578 | "source": [
579 |         "# use the GPU to speed up computation\n",
580 | "model.eval()\n",
581 | "model.cuda()"
582 | ],
583 | "execution_count": 6,
584 | "outputs": [
585 | {
586 | "output_type": "execute_result",
587 | "data": {
588 | "text/plain": [
589 | "ResNet(\n",
590 | " (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n",
591 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
592 | " (relu): ReLU(inplace=True)\n",
593 | " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n",
594 | " (layer1): Sequential(\n",
595 | " (0): Bottleneck(\n",
596 | " (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
597 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
598 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
599 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
600 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
601 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
602 | " (relu): ReLU(inplace=True)\n",
603 | " (downsample): Sequential(\n",
604 | " (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
605 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
606 | " )\n",
607 | " )\n",
608 | " (1): Bottleneck(\n",
609 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
610 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
611 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
612 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
613 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
614 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
615 | " (relu): ReLU(inplace=True)\n",
616 | " )\n",
617 | " (2): Bottleneck(\n",
618 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
619 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
620 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
621 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
622 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
623 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
624 | " (relu): ReLU(inplace=True)\n",
625 | " )\n",
626 | " )\n",
627 | " (layer2): Sequential(\n",
628 | " (0): Bottleneck(\n",
629 | " (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
630 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
631 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
632 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
633 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
634 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
635 | " (relu): ReLU(inplace=True)\n",
636 | " (downsample): Sequential(\n",
637 | " (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
638 | " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
639 | " )\n",
640 | " )\n",
641 | " (1): Bottleneck(\n",
642 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
643 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
644 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
645 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
646 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
647 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
648 | " (relu): ReLU(inplace=True)\n",
649 | " )\n",
650 | " (2): Bottleneck(\n",
651 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
652 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
653 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
654 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
655 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
656 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
657 | " (relu): ReLU(inplace=True)\n",
658 | " )\n",
659 | " (3): Bottleneck(\n",
660 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
661 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
662 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
663 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
664 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
665 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
666 | " (relu): ReLU(inplace=True)\n",
667 | " )\n",
668 | " )\n",
669 | " (layer3): Sequential(\n",
670 | " (0): Bottleneck(\n",
671 | " (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
672 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
673 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
674 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
675 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
676 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
677 | " (relu): ReLU(inplace=True)\n",
678 | " (downsample): Sequential(\n",
679 | " (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
680 | " (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
681 | " )\n",
682 | " )\n",
683 | " (1): Bottleneck(\n",
684 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
685 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
686 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
687 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
688 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
689 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
690 | " (relu): ReLU(inplace=True)\n",
691 | " )\n",
692 | " (2): Bottleneck(\n",
693 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
694 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
695 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
696 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
697 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
698 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
699 | " (relu): ReLU(inplace=True)\n",
700 | " )\n",
701 | " (3): Bottleneck(\n",
702 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
703 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
704 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
705 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
706 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
707 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
708 | " (relu): ReLU(inplace=True)\n",
709 | " )\n",
710 | " (4): Bottleneck(\n",
711 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
712 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
713 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
714 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
715 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
716 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
717 | " (relu): ReLU(inplace=True)\n",
718 | " )\n",
719 | " (5): Bottleneck(\n",
720 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
721 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
722 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
723 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
724 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
725 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
726 | " (relu): ReLU(inplace=True)\n",
727 | " )\n",
728 | " )\n",
729 | " (layer4): Sequential(\n",
730 | " (0): Bottleneck(\n",
731 | " (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
732 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
733 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
734 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
735 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
736 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
737 | " (relu): ReLU(inplace=True)\n",
738 | " (downsample): Sequential(\n",
739 | " (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
740 | " (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
741 | " )\n",
742 | " )\n",
743 | " (1): Bottleneck(\n",
744 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
745 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
746 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
747 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
748 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
749 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
750 | " (relu): ReLU(inplace=True)\n",
751 | " )\n",
752 | " (2): Bottleneck(\n",
753 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
754 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
755 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
756 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
757 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
758 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
759 | " (relu): ReLU(inplace=True)\n",
760 | " )\n",
761 | " )\n",
762 | " (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))\n",
763 | ")"
764 | ]
765 | },
766 | "metadata": {
767 | "tags": []
768 | },
769 | "execution_count": 6
770 | }
771 | ]
772 | },
773 | {
774 | "cell_type": "markdown",
775 | "metadata": {
776 | "id": "CDesqNYx-Xv_"
777 | },
778 | "source": [
779 |         "## Feature Extraction"
780 | ]
781 | },
782 | {
783 | "cell_type": "code",
784 | "metadata": {
785 | "id": "4sE2CvKwo8g6"
786 | },
787 | "source": [
788 | "def get_features(dataset, model):\n",
789 | " all_features = []\n",
790 | " all_labels = []\n",
791 | "\n",
792 | " with torch.no_grad():\n",
793 | " for images, labels in progressbar(DataLoader(dataset, batch_size=128, num_workers=8)):\n",
794 | " images = images.cuda()\n",
795 | " labels = labels.cuda()\n",
796 | " features = model(images)\n",
797 | "\n",
798 | " all_features.append(features)\n",
799 | " all_labels.append(labels)\n",
800 | "\n",
801 | " return torch.cat(all_features).cpu(), torch.cat(all_labels).cpu()"
802 | ],
803 | "execution_count": 8,
804 | "outputs": []
805 | },
806 | {
807 | "cell_type": "code",
808 | "metadata": {
809 | "colab": {
810 | "base_uri": "https://localhost:8080/"
811 | },
812 | "id": "t9_mMcl2o85e",
813 | "outputId": "5f0076c3-b90a-4d3c-9c78-4cfad6f9a963"
814 | },
815 | "source": [
816 | "train_features, train_labels = get_features(train_dataset, model)\n",
817 | "test_features, test_labels = get_features(test_dataset, model)"
818 | ],
819 | "execution_count": 9,
820 | "outputs": [
821 | {
822 | "output_type": "stream",
823 | "text": [
824 | "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:477: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n",
825 | " cpuset_checked))\n",
826 | "100% (391 of 391) |######################| Elapsed Time: 0:02:30 Time: 0:02:30\n",
827 | "100% (79 of 79) |########################| Elapsed Time: 0:00:31 Time: 0:00:31\n"
828 | ],
829 | "name": "stderr"
830 | }
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "metadata": {
836 | "id": "TbyLjapbo9J_"
837 | },
838 | "source": [
839 | "# train features preprocessing\n",
840 | "\n",
841 | "train_labels = train_labels.to(dtype=torch.float)\n",
842 | "train = TensorDataset( train_features, train_labels)\n",
843 | "\n",
844 | "# Create a data loader from the features\n",
845 | "train_loader = DataLoader(train, batch_size= 30,shuffle=True)"
846 | ],
847 | "execution_count": 100,
848 | "outputs": []
849 | },
850 | {
851 | "cell_type": "code",
852 | "metadata": {
853 | "id": "v--KTQqysJW0"
854 | },
855 | "source": [
856 | "# test features preprocessing\n",
857 | "\n",
858 | "test_labels = test_labels.to(dtype=torch.float)\n",
859 | "test = TensorDataset( Tensor(test_features), Tensor(test_labels))\n",
860 | "\n",
861 | "# Create a data loader from the features\n",
862 | "test_loader = DataLoader(test, batch_size= 10)"
863 | ],
864 | "execution_count": 101,
865 | "outputs": []
866 | },
867 | {
868 | "cell_type": "markdown",
869 | "metadata": {
870 | "id": "RKny7yz5-jLn"
871 | },
872 | "source": [
873 | "## Custom Fully Connected Model"
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "metadata": {
879 | "id": "gjitgWUBsNNg"
880 | },
881 | "source": [
882 | "import torch.nn as nn\n",
883 | "import torch.nn.functional as F\n",
884 | "\n",
885 | "class Network(nn.Module):\n",
886 | " def __init__(self):\n",
887 | " super(Network, self).__init__()\n",
888 | " self.fc = nn.Linear(2048, 1024)\n",
889 | " self.dropout_layer = nn.Dropout(p=0.5)\n",
890 | " self.out = nn.Linear(1024, 10)\n",
891 | " self.relu = nn.ReLU()\n",
892 | "\n",
893 | " def forward(self, x):\n",
894 | " x = self.dropout_layer(self.relu(self.fc(x)))\n",
895 | " x = self.out(x)\n",
896 | " return x\n",
897 | "\n",
898 |         "# instantiate our neural network class and move it to the GPU\n",
899 | "network = Network().cuda()"
900 | ],
901 | "execution_count": 137,
902 | "outputs": []
903 | },
904 | {
905 | "cell_type": "code",
906 | "metadata": {
907 | "id": "k6V27takqUnd"
908 | },
909 | "source": [
910 | "\n",
911 | "#initialize Cross Entropy loss function\n",
912 | "criterion = nn.CrossEntropyLoss()\n",
913 | "\n",
914 | "\n",
915 | "# set Adam as optimizer function\n",
916 | "optimizer = optim.Adam(network.parameters(), lr=0.0001) # Optimizer\n",
917 | "\n"
918 | ],
919 | "execution_count": 138,
920 | "outputs": []
921 | },
922 | {
923 | "cell_type": "code",
924 | "metadata": {
925 | "id": "cmVuhSQ_w0Vy"
926 | },
927 | "source": [
928 | "dataset_sizes = {'train':len(train_labels),'test':len(test_labels)}"
929 | ],
930 | "execution_count": 139,
931 | "outputs": []
932 | },
933 | {
934 | "cell_type": "code",
935 | "metadata": {
936 | "id": "eZzCsipHsjB4"
937 | },
938 | "source": [
939 | "device = 'cuda'\n",
940 | "def train_model(model, criterion, optimizer, num_epochs=20):\n",
941 | " since = time.time()\n",
942 | "\n",
943 | " for epoch in range(num_epochs):\n",
944 | " print('Epoch {}/{}'.format(epoch, num_epochs - 1))\n",
945 | " print('-' * 10)\n",
946 | "\n",
947 | " model.train() # Set model to training mode\n",
948 | "\n",
949 | "\n",
950 | " running_loss = 0.0\n",
951 | " running_corrects = 0\n",
952 | "\n",
953 | " # Iterate over data.\n",
954 | " for inputs, labels in progressbar(train_loader):\n",
955 | " inputs, labels = inputs.to(device), labels.to(device)\n",
956 | " labels = labels.to(dtype=torch.long)\n",
957 | "\n",
958 | " # zero the parameter gradients\n",
959 | " optimizer.zero_grad()\n",
960 | "\n",
961 | " outputs = model(inputs)\n",
962 | " _, preds = torch.max(outputs, 1)\n",
963 | " loss = criterion(outputs, labels)\n",
964 | " loss.backward()\n",
965 | " optimizer.step()\n",
966 | "\n",
967 | " # statistics\n",
968 | " running_loss += loss.item() * inputs.size(0)\n",
969 | " running_corrects += torch.sum(preds == labels.data)\n",
970 | "\n",
971 | " epoch_loss = running_loss / dataset_sizes['train']\n",
972 | " epoch_acc = running_corrects.double() / dataset_sizes['train']\n",
973 | "\n",
974 | " print(' Loss: {:.4f} Acc: {:.4f}'.format(\n",
975 | " epoch_loss, epoch_acc))\n",
976 | "\n",
977 | "\n",
978 | "\n",
979 | " time_elapsed = time.time() - since\n",
980 | " print('Training complete in {:.0f}m {:.0f}s'.format(\n",
981 | " time_elapsed // 60, time_elapsed % 60))\n",
982 | " return model"
983 | ],
984 | "execution_count": 140,
985 | "outputs": []
986 | },
987 | {
988 | "cell_type": "code",
989 | "metadata": {
990 | "colab": {
991 | "base_uri": "https://localhost:8080/"
992 | },
993 | "id": "rKNXpfscuQOb",
994 | "outputId": "f596c02c-10e1-4684-9d9c-f84bce57ea85"
995 | },
996 | "source": [
997 | "model_ft = train_model(network, criterion,optimizer,num_epochs=5)"
998 | ],
999 | "execution_count": 141,
1000 | "outputs": [
1001 | {
1002 | "output_type": "stream",
1003 | "text": [
1004 | " 2% (43 of 1667) | | Elapsed Time: 0:00:00 ETA: 0:00:04"
1005 | ],
1006 | "name": "stderr"
1007 | },
1008 | {
1009 | "output_type": "stream",
1010 | "text": [
1011 | "Epoch 0/4\n",
1012 | "----------\n"
1013 | ],
1014 | "name": "stdout"
1015 | },
1016 | {
1017 | "output_type": "stream",
1018 | "text": [
1019 | "100% (1667 of 1667) |####################| Elapsed Time: 0:00:03 Time: 0:00:03\n",
1020 | " 5% (85 of 1667) |# | Elapsed Time: 0:00:00 ETA: 0:00:03"
1021 | ],
1022 | "name": "stderr"
1023 | },
1024 | {
1025 | "output_type": "stream",
1026 | "text": [
1027 | " Loss: 0.2625 Acc: 0.9115\n",
1028 | "Epoch 1/4\n",
1029 | "----------\n"
1030 | ],
1031 | "name": "stdout"
1032 | },
1033 | {
1034 | "output_type": "stream",
1035 | "text": [
1036 | "100% (1667 of 1667) |####################| Elapsed Time: 0:00:03 Time: 0:00:03\n",
1037 | " 5% (85 of 1667) |# | Elapsed Time: 0:00:00 ETA: 0:00:03"
1038 | ],
1039 | "name": "stderr"
1040 | },
1041 | {
1042 | "output_type": "stream",
1043 | "text": [
1044 | " Loss: 0.1985 Acc: 0.9304\n",
1045 | "Epoch 2/4\n",
1046 | "----------\n"
1047 | ],
1048 | "name": "stdout"
1049 | },
1050 | {
1051 | "output_type": "stream",
1052 | "text": [
1053 | "100% (1667 of 1667) |####################| Elapsed Time: 0:00:03 Time: 0:00:03\n",
1054 | " 3% (64 of 1667) | | Elapsed Time: 0:00:00 ETA: 0:00:03"
1055 | ],
1056 | "name": "stderr"
1057 | },
1058 | {
1059 | "output_type": "stream",
1060 | "text": [
1061 | " Loss: 0.1784 Acc: 0.9378\n",
1062 | "Epoch 3/4\n",
1063 | "----------\n"
1064 | ],
1065 | "name": "stdout"
1066 | },
1067 | {
1068 | "output_type": "stream",
1069 | "text": [
1070 | "100% (1667 of 1667) |####################| Elapsed Time: 0:00:03 Time: 0:00:03\n",
1071 | " 3% (64 of 1667) | | Elapsed Time: 0:00:00 ETA: 0:00:03"
1072 | ],
1073 | "name": "stderr"
1074 | },
1075 | {
1076 | "output_type": "stream",
1077 | "text": [
1078 | " Loss: 0.1659 Acc: 0.9414\n",
1079 | "Epoch 4/4\n",
1080 | "----------\n"
1081 | ],
1082 | "name": "stdout"
1083 | },
1084 | {
1085 | "output_type": "stream",
1086 | "text": [
1087 | "100% (1667 of 1667) |####################| Elapsed Time: 0:00:03 Time: 0:00:03\n"
1088 | ],
1089 | "name": "stderr"
1090 | },
1091 | {
1092 | "output_type": "stream",
1093 | "text": [
1094 | " Loss: 0.1543 Acc: 0.9459\n",
1095 | "Training complete in 0m 19s\n"
1096 | ],
1097 | "name": "stdout"
1098 | }
1099 | ]
1100 | },
1101 | {
1102 | "cell_type": "markdown",
1103 | "metadata": {
1104 | "id": "Hc3vPhw0_GqP"
1105 | },
1106 | "source": [
1107 | "## Testing the model with Test Dataset"
1108 | ]
1109 | },
1110 | {
1111 | "cell_type": "code",
1112 | "metadata": {
1113 | "colab": {
1114 | "base_uri": "https://localhost:8080/"
1115 | },
1116 | "id": "pTQJA4pIyMHR",
1117 | "outputId": "a531d77a-b718-4954-ca09-f39909b846f3"
1118 | },
1119 | "source": [
1120 | "correct = 0\n",
1121 | "total = 0\n",
1122 | "with torch.no_grad():\n",
1123 | " for data in progressbar(test_loader):\n",
1124 | " images, labels = data\n",
1125 | " images, labels = images.to(device), labels.to(device)\n",
1126 | " outputs = model_ft(images)\n",
1127 | " _, predicted = torch.max(outputs.data, 1)\n",
1128 | " total += labels.size(0)\n",
1129 | " correct += (predicted == labels).sum().item()\n",
1130 | "\n",
1131 | "print('Accuracy of the network on the test images: %d %%' % (\n",
1132 | " 100 * correct / total))"
1133 | ],
1134 | "execution_count": 142,
1135 | "outputs": [
1136 | {
1137 | "output_type": "stream",
1138 | "text": [
1139 | "100% (1000 of 1000) |####################| Elapsed Time: 0:00:00 Time: 0:00:00\n"
1140 | ],
1141 | "name": "stderr"
1142 | },
1143 | {
1144 | "output_type": "stream",
1145 | "text": [
1146 | "Accuracy of the network on the test images: 93 %\n"
1147 | ],
1148 | "name": "stdout"
1149 | }
1150 | ]
1151 | }
1152 | ]
1153 | }
--------------------------------------------------------------------------------
/examples/CIFAR10_classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Microsoft Vision classification example\n",
8 |     "This example shows a simple way to classify images from any dataset using the pretrained Microsoft Vision model.\n",
9 |     "\n",
10 |     "Using 1 GPU and under 10 minutes of training, we can achieve 92.92% accuracy on the CIFAR-10 dataset using a kNN algorithm.\n",
11 |     "We also show how to plug a linear classification algorithm on top of the frozen features."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 |     "We'll start with the necessary imports. We are using PyTorch as the model backend and sklearn's LogisticRegression for simplicity and reproducibility of our results."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 11,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import torch\n",
28 | "from torch.utils.data import DataLoader\n",
29 | "from torchvision.datasets import CIFAR10\n",
30 | "import torchvision.transforms as transforms\n",
31 | "import numpy as np\n",
32 | "from sklearn.linear_model import LogisticRegression\n",
33 | "from progressbar import progressbar\n",
34 | "from sklearn.neighbors import KNeighborsClassifier"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 |     "And the new element: the Microsoft Vision package. It gives immediate access to a pretrained ResNet50 model in just one line, with an interface very similar to torchvision."
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "import microsoftvision"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "Let's define how we'll preprocess images. Note that the Microsoft Vision model expects images in BGR format, hence the swapping of image channels at the end of the preprocessing pipeline."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 3,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "class Preprocess:\n",
67 | " def __init__(self):\n",
68 | " self.preprocess = transforms.Compose([\n",
69 | " transforms.Resize(224),\n",
70 | " transforms.CenterCrop(224),\n",
71 | " transforms.ToTensor(),\n",
72 | " transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])])\n",
73 | "\n",
74 | " def __call__(self, x):\n",
75 | " return self.preprocess(x)[[2,1,0],:,:]"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "Import the CIFAR-10 dataset, split into train and test sets. This can be replaced with any dataset without changes to the rest of the code."
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 4,
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "name": "stdout",
92 | "output_type": "stream",
93 | "text": [
94 | "Files already downloaded and verified\n",
95 | "Files already downloaded and verified\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "train_dataset = CIFAR10('path', download=True, train=True, transform=Preprocess())\n",
101 | "test_dataset = CIFAR10('path', download=True, train=False, transform=Preprocess())"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "Now we import the Microsoft Vision model with the ResNet50 architecture, specifying that we want the pretrained version (same interface as torchvision)."
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 5,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "Loading Microsoft Vision pretrained model\n",
121 | "Downloading model.\n"
122 | ]
123 | },
124 | {
125 | "name": "stderr",
126 | "output_type": "stream",
127 | "text": [
128 | " 0%| | 0/23 [00:00, ?MB/s]"
129 | ]
130 | },
131 | {
132 | "name": "stdout",
133 | "output_type": "stream",
134 | "text": [
135 | "Model size: 89 MB\n"
136 | ]
137 | },
138 | {
139 | "name": "stderr",
140 | "output_type": "stream",
141 | "text": [
142 | "100%|██████████| 23/23 [00:06<00:00, 6.20MB/s]\n"
143 | ]
144 | },
145 | {
146 | "name": "stdout",
147 | "output_type": "stream",
148 | "text": [
149 | "Model saved to MicrosoftVision.ResNet50.tar\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "model = microsoftvision.models.resnet50(pretrained=True)"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "The Microsoft Vision model is only used to extract image features, without fine-tuning, so we set it to evaluation mode. Let's also move it to the GPU to speed up computation."
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 6,
167 | "metadata": {},
168 | "outputs": [
169 | {
170 | "data": {
171 | "text/plain": [
172 | "ResNet(\n",
173 | " (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n",
174 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
175 | " (relu): ReLU(inplace=True)\n",
176 | " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n",
177 | " (layer1): Sequential(\n",
178 | " (0): Bottleneck(\n",
179 | " (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
180 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
181 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
182 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
183 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
184 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
185 | " (relu): ReLU(inplace=True)\n",
186 | " (downsample): Sequential(\n",
187 | " (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
188 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
189 | " )\n",
190 | " )\n",
191 | " (1): Bottleneck(\n",
192 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
193 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
194 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
195 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
196 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
197 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
198 | " (relu): ReLU(inplace=True)\n",
199 | " )\n",
200 | " (2): Bottleneck(\n",
201 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
202 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
203 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
204 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
205 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
206 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
207 | " (relu): ReLU(inplace=True)\n",
208 | " )\n",
209 | " )\n",
210 | " (layer2): Sequential(\n",
211 | " (0): Bottleneck(\n",
212 | " (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
213 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
214 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
215 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
216 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
217 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
218 | " (relu): ReLU(inplace=True)\n",
219 | " (downsample): Sequential(\n",
220 | " (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
221 | " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
222 | " )\n",
223 | " )\n",
224 | " (1): Bottleneck(\n",
225 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
226 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
227 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
228 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
229 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
230 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
231 | " (relu): ReLU(inplace=True)\n",
232 | " )\n",
233 | " (2): Bottleneck(\n",
234 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
235 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
236 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
237 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
238 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
239 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
240 | " (relu): ReLU(inplace=True)\n",
241 | " )\n",
242 | " (3): Bottleneck(\n",
243 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
244 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
245 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
246 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
247 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
248 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
249 | " (relu): ReLU(inplace=True)\n",
250 | " )\n",
251 | " )\n",
252 | " (layer3): Sequential(\n",
253 | " (0): Bottleneck(\n",
254 | " (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
255 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
256 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
257 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
258 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
259 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
260 | " (relu): ReLU(inplace=True)\n",
261 | " (downsample): Sequential(\n",
262 | " (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
263 | " (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
264 | " )\n",
265 | " )\n",
266 | " (1): Bottleneck(\n",
267 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
268 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
269 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
270 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
271 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
272 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
273 | " (relu): ReLU(inplace=True)\n",
274 | " )\n",
275 | " (2): Bottleneck(\n",
276 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
277 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
278 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
279 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
280 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
281 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
282 | " (relu): ReLU(inplace=True)\n",
283 | " )\n",
284 | " (3): Bottleneck(\n",
285 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
286 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
287 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
288 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
289 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
290 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
291 | " (relu): ReLU(inplace=True)\n",
292 | " )\n",
293 | " (4): Bottleneck(\n",
294 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
295 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
296 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
297 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
298 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
299 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
300 | " (relu): ReLU(inplace=True)\n",
301 | " )\n",
302 | " (5): Bottleneck(\n",
303 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
304 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
305 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
306 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
307 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
308 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
309 | " (relu): ReLU(inplace=True)\n",
310 | " )\n",
311 | " )\n",
312 | " (layer4): Sequential(\n",
313 | " (0): Bottleneck(\n",
314 | " (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
315 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
316 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
317 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
318 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
319 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
320 | " (relu): ReLU(inplace=True)\n",
321 | " (downsample): Sequential(\n",
322 | " (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
323 | " (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
324 | " )\n",
325 | " )\n",
326 | " (1): Bottleneck(\n",
327 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
328 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
329 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
330 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
331 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
332 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
333 | " (relu): ReLU(inplace=True)\n",
334 | " )\n",
335 | " (2): Bottleneck(\n",
336 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
337 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
338 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
339 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
340 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
341 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
342 | " (relu): ReLU(inplace=True)\n",
343 | " )\n",
344 | " )\n",
345 | " (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))\n",
346 | ")"
347 | ]
348 | },
349 | "execution_count": 6,
350 | "metadata": {},
351 | "output_type": "execute_result"
352 | }
353 | ],
354 | "source": [
355 | "model.eval()\n",
356 | "model.cuda()"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 7,
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "def get_features(dataset, model):\n",
366 | " all_features = []\n",
367 | " all_labels = []\n",
368 | "\n",
369 | " with torch.no_grad():\n",
370 | " for images, labels in progressbar(DataLoader(dataset, batch_size=128, num_workers=8)):\n",
371 | " images = images.cuda()\n",
372 | " labels = labels.cuda()\n",
373 | " features = model(images)\n",
374 | "\n",
375 | " all_features.append(features)\n",
376 | " all_labels.append(labels)\n",
377 | "\n",
378 | " return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {},
384 | "source": [
385 | "We extract image features for the entire training and test sets. These features will be used to fit the classifier and to compute the accuracy score."
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 8,
391 | "metadata": {},
392 | "outputs": [
393 | {
394 | "name": "stderr",
395 | "output_type": "stream",
396 | "text": [
397 | "100% (391 of 391) |######################| Elapsed Time: 0:01:40 Time: 0:01:40\n",
398 | "100% (79 of 79) |########################| Elapsed Time: 0:00:19 Time: 0:00:19\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "train_features, train_labels = get_features(train_dataset, model)\n",
404 | "test_features, test_labels = get_features(test_dataset, model)"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "metadata": {},
410 | "source": [
411 | "Fit the classifier on the training set and then measure performance on the test set. The whole operation takes less than 10 minutes on 1 GPU!"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 13,
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "# You can plug in any classifier\n",
421 | "\n",
422 | "#classifier = LogisticRegression(random_state=0, max_iter=1000, verbose=1, n_jobs=16)\n",
423 | "classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=16)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 14,
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "name": "stdout",
433 | "output_type": "stream",
434 | "text": [
435 | "Accuracy: 92.92\n"
436 | ]
437 | }
438 | ],
439 | "source": [
440 | "classifier.fit(train_features, train_labels)\n",
441 | "predictions = classifier.predict(test_features)\n",
442 | "accuracy = np.mean((test_labels == predictions).astype(float)) * 100.\n",
443 | "print(f\"Accuracy: {accuracy}\")"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": null,
449 | "metadata": {},
450 | "outputs": [],
451 | "source": []
452 | }
453 | ],
454 | "metadata": {
455 | "kernelspec": {
456 | "display_name": "Python 3",
457 | "language": "python",
458 | "name": "python3"
459 | },
460 | "language_info": {
461 | "codemirror_mode": {
462 | "name": "ipython",
463 | "version": 3
464 | },
465 | "file_extension": ".py",
466 | "mimetype": "text/x-python",
467 | "name": "python",
468 | "nbconvert_exporter": "python",
469 | "pygments_lexer": "ipython3",
470 | "version": "3.6.10"
471 | }
472 | },
473 | "nbformat": 4,
474 | "nbformat_minor": 5
475 | }
476 |
--------------------------------------------------------------------------------
/microsoftvision/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import *
--------------------------------------------------------------------------------
/microsoftvision/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .resnext import *
2 |
--------------------------------------------------------------------------------
/microsoftvision/models/resnext.py:
--------------------------------------------------------------------------------
1 | '''
2 | This started as a copy of https://github.com/pytorch/vision 'resnet.py' (BSD-3-Clause)
3 | with removed classification layer.
4 | '''
5 |
6 | import torch
7 | import torch.nn as nn
8 | from .utils import load_state_dict_from_url, load_state_dict
9 |
10 |
11 | __all__ = ['resnet50']
12 |
13 |
14 | model_urls = {
15 | 'resnet50': "https://argusvision.blob.core.windows.net/microsoftvision/MicrosoftVision.ResNet50.tar",
16 | }
17 |
18 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
19 | """3x3 convolution with padding"""
20 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
21 | padding=dilation, groups=groups, bias=False, dilation=dilation)
22 |
23 |
24 | def conv1x1(in_planes, out_planes, stride=1):
25 | """1x1 convolution"""
26 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
27 |
28 |
29 | class BasicBlock(nn.Module):
30 | expansion = 1
31 |
32 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
33 | base_width=64, dilation=1, norm_layer=None):
34 | super(BasicBlock, self).__init__()
35 | if norm_layer is None:
36 | norm_layer = nn.BatchNorm2d
37 | if groups != 1 or base_width != 64:
38 | raise ValueError('BasicBlock only supports groups=1 and base_width=64')
39 | if dilation > 1:
40 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
41 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1
42 | self.conv1 = conv3x3(inplanes, planes, stride)
43 | self.bn1 = norm_layer(planes)
44 | self.relu = nn.ReLU(inplace=True)
45 | self.conv2 = conv3x3(planes, planes)
46 | self.bn2 = norm_layer(planes)
47 | self.downsample = downsample
48 | self.stride = stride
49 |
50 | def forward(self, x):
51 | identity = x
52 |
53 | out = self.conv1(x)
54 | out = self.bn1(out)
55 | out = self.relu(out)
56 |
57 | out = self.conv2(out)
58 | out = self.bn2(out)
59 |
60 | if self.downsample is not None:
61 | identity = self.downsample(x)
62 |
63 | out += identity
64 | out = self.relu(out)
65 |
66 | return out
67 |
68 |
69 | class Bottleneck(nn.Module):
70 | # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
71 | # while original implementation places the stride at the first 1x1 convolution(self.conv1)
72 | # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
73 | # This variant is also known as ResNet V1.5 and improves accuracy according to
74 | # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
75 |
76 | expansion = 4
77 |
78 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
79 | base_width=64, dilation=1, norm_layer=None):
80 | super(Bottleneck, self).__init__()
81 | if norm_layer is None:
82 | norm_layer = nn.BatchNorm2d
83 | width = int(planes * (base_width / 64.)) * groups
84 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1
85 | self.conv1 = conv1x1(inplanes, width)
86 | self.bn1 = norm_layer(width)
87 | self.conv2 = conv3x3(width, width, stride, groups, dilation)
88 | self.bn2 = norm_layer(width)
89 | self.conv3 = conv1x1(width, planes * self.expansion)
90 | self.bn3 = norm_layer(planes * self.expansion)
91 | self.relu = nn.ReLU(inplace=True)
92 | self.downsample = downsample
93 | self.stride = stride
94 |
95 | def forward(self, x):
96 | identity = x
97 |
98 | out = self.conv1(x)
99 | out = self.bn1(out)
100 | out = self.relu(out)
101 |
102 | out = self.conv2(out)
103 | out = self.bn2(out)
104 | out = self.relu(out)
105 |
106 | out = self.conv3(out)
107 | out = self.bn3(out)
108 |
109 | if self.downsample is not None:
110 | identity = self.downsample(x)
111 |
112 | out += identity
113 | out = self.relu(out)
114 |
115 | return out
116 |
117 |
118 | class ResNet(nn.Module):
119 |
120 | def __init__(self, block, layers, zero_init_residual=False,
121 | groups=1, width_per_group=64, replace_stride_with_dilation=None,
122 | norm_layer=None):
123 | super(ResNet, self).__init__()
124 | if norm_layer is None:
125 | norm_layer = nn.BatchNorm2d
126 | self._norm_layer = norm_layer
127 |
128 | self.inplanes = 64
129 | self.dilation = 1
130 | if replace_stride_with_dilation is None:
131 | # each element in the tuple indicates if we should replace
132 | # the 2x2 stride with a dilated convolution instead
133 | replace_stride_with_dilation = [False, False, False]
134 | if len(replace_stride_with_dilation) != 3:
135 | raise ValueError("replace_stride_with_dilation should be None "
136 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
137 | self.groups = groups
138 | self.base_width = width_per_group
139 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
140 | bias=False)
141 | self.bn1 = norm_layer(self.inplanes)
142 | self.relu = nn.ReLU(inplace=True)
143 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
144 | self.layer1 = self._make_layer(block, 64, layers[0])
145 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
146 | dilate=replace_stride_with_dilation[0])
147 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
148 | dilate=replace_stride_with_dilation[1])
149 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
150 | dilate=replace_stride_with_dilation[2])
151 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
152 |
153 | for m in self.modules():
154 | if isinstance(m, nn.Conv2d):
155 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
156 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
157 | nn.init.constant_(m.weight, 1)
158 | nn.init.constant_(m.bias, 0)
159 |
160 | # Zero-initialize the last BN in each residual branch,
161 | # so that the residual branch starts with zeros, and each residual block behaves like an identity.
162 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
163 | if zero_init_residual:
164 | for m in self.modules():
165 | if isinstance(m, Bottleneck):
166 | nn.init.constant_(m.bn3.weight, 0)
167 | elif isinstance(m, BasicBlock):
168 | nn.init.constant_(m.bn2.weight, 0)
169 |
170 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
171 | norm_layer = self._norm_layer
172 | downsample = None
173 | previous_dilation = self.dilation
174 | if dilate:
175 | self.dilation *= stride
176 | stride = 1
177 | if stride != 1 or self.inplanes != planes * block.expansion:
178 | downsample = nn.Sequential(
179 | conv1x1(self.inplanes, planes * block.expansion, stride),
180 | norm_layer(planes * block.expansion),
181 | )
182 |
183 | layers = []
184 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
185 | self.base_width, previous_dilation, norm_layer))
186 | self.inplanes = planes * block.expansion
187 | for _ in range(1, blocks):
188 | layers.append(block(self.inplanes, planes, groups=self.groups,
189 | base_width=self.base_width, dilation=self.dilation,
190 | norm_layer=norm_layer))
191 |
192 | return nn.Sequential(*layers)
193 |
194 | def _forward_impl(self, x):
195 | # See note [TorchScript super()]
196 | x = self.conv1(x)
197 | x = self.bn1(x)
198 | x = self.relu(x)
199 | x = self.maxpool(x)
200 |
201 | x = self.layer1(x)
202 | x = self.layer2(x)
203 | x = self.layer3(x)
204 | x = self.layer4(x)
205 |
206 | x = self.avgpool(x)
207 | x = torch.flatten(x, 1)
208 |
209 | return x
210 |
211 | def forward(self, x):
212 | return self._forward_impl(x)
213 |
214 |
215 | def _resnet(arch, block, layers, pretrained, map_location, **kwargs):
216 | model = ResNet(block, layers, **kwargs)
217 | if pretrained:
218 | print("Loading Microsoft Vision pretrained model")
219 | state_dict = load_state_dict_from_url(model_urls[arch], map_location)
220 | load_state_dict(model, state_dict)
221 | else:
222 | print("Using default model initialization")
223 | return model
224 |
225 | def resnet50(pretrained=True, map_location=None):
226 | return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, map_location)
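227 | 
228 | 
229 | if __name__ == "__main__":
230 |     # Illustrative usage sketch (added for documentation; not part of the
231 |     # original module): load the pretrained backbone and run a dummy,
232 |     # BGR-ordered, normalized 224x224 image through it. The network has no
233 |     # classification head, so the output is a 2048-dimensional embedding.
234 |     model = resnet50(pretrained=True, map_location='cpu')
235 |     model.eval()
236 |     with torch.no_grad():
237 |         dummy_batch = torch.randn(1, 3, 224, 224)  # stand-in for a preprocessed image batch
238 |         features = model(dummy_batch)
239 |     print(features.shape)  # expected: torch.Size([1, 2048])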
--------------------------------------------------------------------------------
/microsoftvision/models/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | import tempfile
4 | from urllib.request import urlopen, Request
5 | import shutil
6 | from azure.identity import DeviceCodeCredential
7 | from azure.storage.blob import BlobClient
8 | from tqdm import tqdm
9 |
10 | def download_model_from_blob(url, dst, progress=True):
11 | blob_client = BlobClient.from_blob_url(url)
12 | model_size = int(blob_client.get_blob_properties()['size'])
13 | try:
14 | print(f"Model size: {model_size//(1024*1024)} MB")
15 | with open(dst, "wb") as my_blob:
16 |             segment_size = 4 * 1024 * 1024  # 4 MB chunks
17 | offset = 0
18 | for i in tqdm(range((model_size // segment_size) + 1), unit='MB'):
19 | if offset >= model_size:
20 | break
21 | download_stream = blob_client.download_blob(offset=offset, length=segment_size)
22 | my_blob.write(download_stream.readall())
23 | offset += segment_size
24 |
25 | except:
26 | os.remove(dst)
27 | print("Downloading error")
28 | raise
29 |
30 | print(f"Model saved to {dst}")
31 |
32 |
33 | def load_state_dict_from_url(model_path, map_location=None):
34 | filename = os.path.basename(model_path)
35 |
36 |     # Check whether the model already exists in the current folder
37 | if not os.path.exists(filename):
38 | print("Downloading model.")
39 | download_model_from_blob(model_path, filename)
40 | else:
41 | print("Model already downloaded.")
42 |
43 | return torch.load(filename, map_location=map_location)['state_dict']
44 |
45 | def load_state_dict(model, pretrained_weights):
46 | weights = model.state_dict()
47 |
48 |     # Remove keys that do not exist in the model
49 | for key in pretrained_weights.keys() - weights.keys():
50 | print("Delete unused model state key: %s" % key)
51 | del pretrained_weights[key]
52 |
53 |     # Remove keys whose shapes do not match the model's
54 | for key, pretrained_weight in list(pretrained_weights.items()):
55 | weight = weights[key]
56 | if pretrained_weight.shape != weight.shape:
57 | print("Delete model state key with unmatched shape: %s" % key)
58 | del pretrained_weights[key]
59 |
60 |     # Fill in any keys missing from pretrained_weights with the model's own weights
61 | for key in weights.keys() - pretrained_weights.keys():
62 | print("Missing model state key: %s" % key)
63 | pretrained_weights[key] = weights[key]
64 |
65 |     # Load the merged weights into the model
66 | model.load_state_dict(pretrained_weights)
67 |
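68 | 
69 | if __name__ == "__main__":
70 |     # Illustrative usage sketch (added for documentation; not part of the
71 |     # original module): download the published ResNet50 checkpoint (URL taken
72 |     # from models/resnext.py) into the current folder, or reuse a cached copy,
73 |     # and report how many tensors the state dict contains.
74 |     url = "https://argusvision.blob.core.windows.net/microsoftvision/MicrosoftVision.ResNet50.tar"
75 |     state_dict = load_state_dict_from_url(url, map_location='cpu')
76 |     print(f"Loaded {len(state_dict)} tensors from {os.path.basename(url)}")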
--------------------------------------------------------------------------------
/microsoftvision/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '1.0.5'
2 |
--------------------------------------------------------------------------------