├── .gitignore
├── Figure_1_Modality_Gap
│   ├── features_clasp.npy
│   ├── features_clasp_random.npy
│   ├── features_clip.npy
│   ├── features_clip_random.npy
│   ├── features_convirt.npy
│   ├── features_convirt_random.npy
│   ├── features_videoclip.npy
│   ├── features_videoclip_random.npy
│   ├── repr_clasp.ipynb
│   ├── repr_clip.ipynb
│   ├── repr_convirt.ipynb
│   ├── repr_videoclip.ipynb
│   └── visualize.ipynb
├── Figure_2_Cone_Effect
│   ├── Figure_2a_random_init_random_data
│   │   ├── coco-extract.ipynb
│   │   └── visualize.ipynb
│   ├── Figure_2a_random_init_real_data
│   │   ├── coco-extract.ipynb
│   │   └── visualize.ipynb
│   ├── Figure_2a_real_features
│   │   └── real_features.ipynb
│   ├── Figure_2b_random_MLP_layerwise
│   │   ├── bias_linear_relu.ipynb
│   │   └── no_bias
│   │       └── linear_relu.ipynb
│   └── Figure_2c_scatter_cones_random_init
│       ├── MLP
│       │   └── scatter_cones_linear_relu.ipynb
│       ├── random_data
│       │   ├── coco-extract.ipynb
│       │   ├── visualizePCA.ipynb
│       │   └── visualizeUMAP.ipynb
│       ├── real_data
│       │   ├── coco-extract.ipynb
│       │   ├── visualizePCA.ipynb
│       │   └── visualizeUMAP.ipynb
│       └── real_data_ImageNet_pretrained
│           ├── ImageNet-Pretrained-Cones.png
│           ├── README.md
│           ├── coco-extract.ipynb
│           └── visualizeUMAP.ipynb
├── Figure_3_Contrastive_Learning
│   ├── 3d_sphere.ipynb
│   ├── Appendix_3d_sphere.ipynb
│   ├── get_gap_stats.ipynb
│   ├── mismatched_simulation.ipynb
│   └── plot_optimization_exp.ipynb
├── LICENSE
├── README.md
├── Table_1_Implications_CLIP_Zero_Shot
│   ├── shifting
│   │   └── shift_features.ipynb
│   ├── simulation
│   │   └── simulation.ipynb
│   └── training
│       ├── datasets.py
│       ├── train_clip.py
│       └── utils.py
├── Table_2_Implications_CLIP_Fairness
│   ├── coco-extract.ipynb
│   └── shift_CLIP_FairFace_Bias.ipynb
├── docs
│   └── figures
│       ├── Figure1.png
│       ├── Figure2.jpg
│       ├── Figure2ab.png
│       ├── Figure2c.png
│       ├── Figure3.jpg
│       ├── Tables.png
│       ├── Theorem1.png
│       ├── Theorem2.png
│       └── Theorem_variance.png
├── environment.yml
└── util
    ├── gap_amend_std.ipynb
    └── get_arch.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.pkl
6 | 
7 | */dummy_val/*
8 | */val/*
9 | */features/*
10 | */fairface-img-margin025-trainval.zip.zip
11 | */fairface_label_val.csv
12 | 
13 | 
14 | # C extensions
15 | *.so
16 | 
17 | # Distribution / packaging
18 | .Python
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | wheels/
31 | pip-wheel-metadata/
32 | share/python-wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 | 
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 | 
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 | 
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .nox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | *.py,cover
59 | .hypothesis/
60 | .pytest_cache/
61 | 
62 | # Translations
63 | *.mo
64 | *.pot
65 | 
66 | # Django stuff:
67 | *.log
68 | local_settings.py
69 | db.sqlite3
70 | db.sqlite3-journal
71 | 
72 | # Flask stuff:
73 | instance/
74 | .webassets-cache
75 | 
76 | # Scrapy stuff:
77 | .scrapy
78 | 
79 | # Sphinx documentation
80 | docs/_build/
81 | 
82 | # PyBuilder
83 | target/
84 | 
85 | # Jupyter Notebook
86 | .ipynb_checkpoints
87 | 
88 | # IPython
89 | profile_default/
90 | ipython_config.py
91 | 
92 | # pyenv
93 | .python-version
94 | 
95 | # pipenv
96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
99 | # install all needed dependencies.
100 | #Pipfile.lock
101 | 
102 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
103 | __pypackages__/
104 | 
105 | # Celery stuff
106 | celerybeat-schedule
107 | celerybeat.pid
108 | 
109 | # SageMath parsed files
110 | *.sage.py
111 | 
112 | # Environments
113 | .env
114 | .venv
115 | env/
116 | venv/
117 | ENV/
118 | env.bak/
119 | venv.bak/
120 | 
121 | # Spyder project settings
122 | .spyderproject
123 | .spyproject
124 | 
125 | # Rope project settings
126 | .ropeproject
127 | 
128 | # mkdocs documentation
129 | /site
130 | 
131 | # mypy
132 | .mypy_cache/
133 | .dmypy.json
134 | dmypy.json
135 | 
136 | # Pyre type checker
137 | .pyre/
138 | 
--------------------------------------------------------------------------------
/Figure_1_Modality_Gap/features_clasp.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_clasp.npy
--------------------------------------------------------------------------------
/Figure_1_Modality_Gap/features_clasp_random.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_clasp_random.npy
--------------------------------------------------------------------------------
/Figure_1_Modality_Gap/features_clip.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_clip.npy
--------------------------------------------------------------------------------
/Figure_1_Modality_Gap/features_clip_random.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_clip_random.npy
--------------------------------------------------------------------------------
/Figure_1_Modality_Gap/features_convirt.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_convirt.npy -------------------------------------------------------------------------------- /Figure_1_Modality_Gap/features_convirt_random.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_convirt_random.npy -------------------------------------------------------------------------------- /Figure_1_Modality_Gap/features_videoclip.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_videoclip.npy -------------------------------------------------------------------------------- /Figure_1_Modality_Gap/features_videoclip_random.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_1_Modality_Gap/features_videoclip_random.npy -------------------------------------------------------------------------------- /Figure_1_Modality_Gap/visualize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from matplotlib import pyplot as plt\n", 11 | "plt.rcParams['figure.dpi'] = 300\n", 12 | "plt.rcParams['savefig.dpi'] = 300\n", 13 | "import seaborn as sns\n", 14 | "sns.set_theme()\n", 15 | "sns.set_context(\"talk\")\n", 16 | "\n", 17 | "import sys\n", 18 | "import os\n", 19 | "sys.path.append('ANONYMOUS_ROOTDIR/develop/open-world/')\n", 20 | "from utils import reduce_and_visualize" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "filenames = sorted([filename for filename in os.listdir() if filename.endswith('.npy')])" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "for filename in filenames:\n", 39 | " print(filename)\n", 40 | " image_features, text_features = np.load(filename)\n", 41 | " reduce_and_visualize(image_features, text_features, connection=True)\n", 42 | " input()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [] 51 | } 52 | ], 53 | "metadata": { 54 | "interpreter": { 55 | "hash": "bf49421d02fb18daac2fe024769d7389ca36bccb970e26253e571efb021ca22f" 56 | }, 57 | "kernelspec": { 58 | "display_name": "Python 3.8.12 ('dalle')", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.8.12" 73 | }, 74 | "orig_nbformat": 4 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 2 78 | } 79 | -------------------------------------------------------------------------------- /Figure_2_Cone_Effect/Figure_2a_random_init_random_data/coco-extract.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "YPHN7PJgKOzb" 7 | }, 8 | "source": [ 9 | "# Image Feature Pair Extract - CLIP, ResNet18. \n", 10 | "conda activate clip\n", 11 | "\n", 12 | "\n", 13 | "clip_image_features_list (118287, 512)\n", 14 | "target_image_features_list (118287, 512)\n", 15 | "clip_image_features_list (5000, 512)\n", 16 | "target_image_features_list (5000, 512)\n", 17 | "\n", 18 | "Feature extraction complete in 6m 16s" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "colab": { 26 | "base_uri": "https://localhost:8080/" 27 | }, 28 | "id": "C1hkDT38hSaP", 29 | "outputId": "70a44964-883d-4fd0-b95a-2c7f2b19aca9" 30 | }, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Torch version: 1.7.1\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "import numpy as np\n", 42 | "import torch\n", 43 | "import pickle\n", 44 | "import time\n", 45 | "print(\"Torch version:\", torch.__version__)\n", 46 | "\n", 47 | "assert torch.__version__.split(\".\") >= [\"1\", \"7\", \"1\"], \"PyTorch 1.7.1 or later is required\"\n", 48 | "\n", 49 | "import os\n", 50 | "import matplotlib.pyplot as plt\n", 51 | "from collections import OrderedDict\n", 52 | "import torch\n", 53 | "\n", 54 | "%matplotlib inline\n", 55 | "%config InlineBackend.figure_format = 'retina'" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Load CLIP" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": { 69 | "colab": { 70 | "base_uri": "https://localhost:8080/" 71 | }, 72 | "id": "uLFS29hnhlY4", 73 | "outputId": "11779e1e-8bdd-4167-c18e-d26bdd6b67db" 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']" 80 | ] 81 | }, 82 | "execution_count": 2, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "import clip\n", 89 | "\n", 90 | "clip.available_models()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 3, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# ViT-B-32.json\n", 100 | "# copied from https://github.com/mlfoundations/open_clip/blob/91f6cce16b7bee90b3b5d38ca305b5b3b67cc200/src/training/model_configs/ViT-B-32.json\n", 101 | "model_info = {\n", 102 | " \"embed_dim\": 512,\n", 103 | " \"image_resolution\": 224,\n", 104 | " \"vision_layers\": 12,\n", 105 | " \"vision_width\": 768,\n", 106 | " \"vision_patch_size\": 32,\n", 107 | " \"context_length\": 77,\n", 108 | " \"vocab_size\": 49408,\n", 109 | " \"transformer_width\": 512,\n", 110 | " \"transformer_heads\": 8,\n", 111 | " \"transformer_layers\": 12\n", 112 | "} \n", 113 | "from clip.model import CLIP\n", 114 | "model = CLIP(**model_info)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from torchvision import transforms\n", 124 | "input_size = model_info['image_resolution']\n", 125 | "preprocess = transforms.Compose([\n", 126 | " transforms.Resize(input_size),\n", 127 | " transforms.CenterCrop(input_size),\n", 128 | " transforms.ToTensor(),\n", 129 | " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", 130 | " ])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 
5, 136 | "metadata": { 137 | "colab": { 138 | "base_uri": "https://localhost:8080/" 139 | }, 140 | "id": "IBRVTY9lbGm8", 141 | "outputId": "f06fd2fd-6126-475b-87d0-b10aa3b7da49" 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Model parameters: 151,277,313\n", 149 | "Input resolution: 224\n", 150 | "Context length: 77\n", 151 | "Vocab size: 49408\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "\n", 157 | "model.cuda().eval()\n", 158 | "input_resolution = model.visual.input_resolution\n", 159 | "context_length = model.context_length\n", 160 | "vocab_size = model.vocab_size\n", 161 | "\n", 162 | "print(\"Model parameters:\", f\"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}\")\n", 163 | "print(\"Input resolution:\", input_resolution)\n", 164 | "print(\"Context length:\", context_length)\n", 165 | "print(\"Vocab size:\", vocab_size)\n", 166 | "\n", 167 | "clip_model = model" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 6, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "torchvision.transforms.transforms.Compose" 179 | ] 180 | }, 181 | "execution_count": 6, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "type(preprocess)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "# Load Data" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 7, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "loading annotations into memory...\n", 207 | "Done (t=0.10s)\n", 208 | "creating index...\n", 209 | "index created!\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "import torchvision\n", 215 | "from torch.utils.data import DataLoader\n", 216 | "\n", 217 | "def target_transform(caption_list):\n", 218 | " caption = caption_list[0] # only the first caption\n", 219 | " return clip.tokenize(caption)[0]\n", 220 | "\n", 221 | "# coco_train_dataset = torchvision.datasets.CocoCaptions(\n", 222 | "# root = '/home/ubuntu/data/coco/train2017',\n", 223 | "# annFile = '/home/ubuntu/data/coco/annotations/captions_train2017.json',\n", 224 | "# transform=preprocess,\n", 225 | "# target_transform=target_transform,\n", 226 | "# )\n", 227 | "\n", 228 | "coco_val_dataset = torchvision.datasets.CocoCaptions(\n", 229 | " root = '/home/ubuntu/data/coco/val2017',\n", 230 | " annFile = '/home/ubuntu/data/coco/annotations/captions_val2017.json',\n", 231 | " transform=preprocess,\n", 232 | " target_transform=target_transform,\n", 233 | " )" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 8, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# coco_train_dataloader = DataLoader(coco_train_dataset, batch_size=64, shuffle=False, num_workers=8, pin_memory=True)\n", 243 | "coco_val_dataloader = DataLoader(coco_val_dataset, batch_size=64, shuffle=False, num_workers=8, pin_memory=True)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "# ResNet" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "import torch\n", 260 | "import torch.nn as nn\n", 261 | "import torchvision.models as models\n", 262 | "from torch.autograd import Variable\n", 263 | "\n", 264 | "resnet18 = 
models.resnet18(pretrained=False) # resnet18 = models.resnet18(pretrained=True)\n", 265 | "modules=list(resnet18.children())[:-1]\n", 266 | "resnet18=nn.Sequential(*modules)\n", 267 | "for p in resnet18.parameters():\n", 268 | " p.requires_grad = False\n", 269 | "\n", 270 | "resnet18.cuda().eval()\n", 271 | "target_model = resnet18\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "# Extractor loop\n" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 10, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "clip_image_features_list (5000, 512)\n", 291 | "target_image_features_list (5000, 512)\n", 292 | "\n", 293 | "Feature Extraction completed in 0m 45s\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "since = time.time()\n", 299 | "dataloaders = {\n", 300 | " # 'train': coco_train_dataloader, \n", 301 | " 'val': coco_val_dataloader,\n", 302 | "}\n", 303 | "# Each epoch has a training and validation phase\n", 304 | "for phase in ['val']: # ['train', 'val',]:\n", 305 | "\n", 306 | " clip_model.eval() # Set model to evaluate mode, for extraction\n", 307 | " ##################################\n", 308 | " # Fields to be stored for postprocessing \n", 309 | " ##################################\n", 310 | " clip_image_features_list = []\n", 311 | " clip_text_features_list = []\n", 312 | " target_image_features_list = []\n", 313 | "\n", 314 | " # Iterate over data.\n", 315 | " for inputs, captions in dataloaders[phase]:\n", 316 | " # image_input = inputs.cuda(non_blocking=True)\n", 317 | " # text_input = captions.cuda(non_blocking=True)\n", 318 | "\n", 319 | " batch_size = len(captions)\n", 320 | " image_input = torch.randn((batch_size, 3, 224, 224)).cuda(non_blocking=True)\n", 321 | " text_input = torch.randint(0, 49408, (batch_size, 77)).cuda(non_blocking=True)\n", 322 | "\n", 323 | " \n", 324 | " with torch.set_grad_enabled(False):\n", 325 | " clip_image_features = clip_model.encode_image(image_input).float()\n", 326 | " clip_text_features = clip_model.encode_text(text_input).float()\n", 327 | " target_image_features = target_model(image_input).squeeze() \n", 328 | " ##################################\n", 329 | " # Evaluation book-keeping Field \n", 330 | " ##################################\n", 331 | " clip_image_features_list.append( clip_image_features.cpu().numpy() )\n", 332 | " clip_text_features_list.append( clip_text_features.cpu().numpy() )\n", 333 | " target_image_features_list.append( target_image_features.cpu().numpy() )\n", 334 | "\n", 335 | " ##################################\n", 336 | " # Evaluation book-keeping Field \n", 337 | " ##################################\n", 338 | " clip_image_features_list = np.concatenate( clip_image_features_list, axis=0)\n", 339 | " clip_text_features_list = np.concatenate( clip_text_features_list, axis=0)\n", 340 | " target_image_features_list = np.concatenate( target_image_features_list, axis=0)\n", 341 | " print('clip_image_features_list', clip_image_features_list.shape)\n", 342 | " print('target_image_features_list', target_image_features_list.shape)\n", 343 | "\n", 344 | " dump_result_dict = {\n", 345 | " \"clip_image_features_list\": clip_image_features_list, \n", 346 | " \"clip_text_features_list\" : clip_text_features_list,\n", 347 | " \"target_image_features_list\": target_image_features_list, \n", 348 | " }\n", 349 | " with open(os.path.join('features', 
'feature_dump_{}.pkl'.format(phase) ), \"wb\") as pkl_file:\n", 350 | " pickle.dump(\n", 351 | " dump_result_dict, \n", 352 | " pkl_file, \n", 353 | " )\n", 354 | "\n", 355 | "print()\n", 356 | "\n", 357 | "time_elapsed = time.time() - since\n", 358 | "print('Feature Extraction completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [] 367 | } 368 | ], 369 | "metadata": { 370 | "accelerator": "GPU", 371 | "colab": { 372 | "collapsed_sections": [], 373 | "name": "Interacting with CLIP.ipynb", 374 | "provenance": [] 375 | }, 376 | "kernelspec": { 377 | "display_name": "Python 3", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.9.7" 391 | }, 392 | "widgets": { 393 | "application/vnd.jupyter.widget-state+json": { 394 | "12e23e2819094ee0a079d4eb77cfc4f9": { 395 | "model_module": "@jupyter-widgets/base", 396 | "model_module_version": "1.2.0", 397 | "model_name": "LayoutModel", 398 | "state": { 399 | "_model_module": "@jupyter-widgets/base", 400 | "_model_module_version": "1.2.0", 401 | "_model_name": "LayoutModel", 402 | "_view_count": null, 403 | "_view_module": "@jupyter-widgets/base", 404 | "_view_module_version": "1.2.0", 405 | "_view_name": "LayoutView", 406 | "align_content": null, 407 | "align_items": null, 408 | "align_self": null, 409 | "border": null, 410 | "bottom": null, 411 | "display": null, 412 | "flex": null, 413 | "flex_flow": null, 414 | "grid_area": null, 415 | "grid_auto_columns": null, 416 | "grid_auto_flow": null, 417 | "grid_auto_rows": null, 418 | "grid_column": null, 419 | "grid_gap": null, 420 | "grid_row": null, 421 | "grid_template_areas": null, 422 | "grid_template_columns": null, 423 | "grid_template_rows": null, 424 | "height": null, 425 | "justify_content": null, 426 | "justify_items": null, 427 | "left": null, 428 | "margin": null, 429 | "max_height": null, 430 | "max_width": null, 431 | "min_height": null, 432 | "min_width": null, 433 | "object_fit": null, 434 | "object_position": null, 435 | "order": null, 436 | "overflow": null, 437 | "overflow_x": null, 438 | "overflow_y": null, 439 | "padding": null, 440 | "right": null, 441 | "top": null, 442 | "visibility": null, 443 | "width": null 444 | } 445 | }, 446 | "1369964d45004b5e95a058910b2a33e6": { 447 | "model_module": "@jupyter-widgets/controls", 448 | "model_module_version": "1.5.0", 449 | "model_name": "HBoxModel", 450 | "state": { 451 | "_dom_classes": [], 452 | "_model_module": "@jupyter-widgets/controls", 453 | "_model_module_version": "1.5.0", 454 | "_model_name": "HBoxModel", 455 | "_view_count": null, 456 | "_view_module": "@jupyter-widgets/controls", 457 | "_view_module_version": "1.5.0", 458 | "_view_name": "HBoxView", 459 | "box_style": "", 460 | "children": [ 461 | "IPY_MODEL_7a5f52e56ede4ac3abe37a3ece007dc9", 462 | "IPY_MODEL_ce8b0faa1a1340b5a504d7b3546b3ccb" 463 | ], 464 | "layout": "IPY_MODEL_12e23e2819094ee0a079d4eb77cfc4f9" 465 | } 466 | }, 467 | "161969cae25a49f38aacd1568d3cac6c": { 468 | "model_module": "@jupyter-widgets/base", 469 | "model_module_version": "1.2.0", 470 | "model_name": "LayoutModel", 471 | "state": { 472 | "_model_module": 
"@jupyter-widgets/base", 473 | "_model_module_version": "1.2.0", 474 | "_model_name": "LayoutModel", 475 | "_view_count": null, 476 | "_view_module": "@jupyter-widgets/base", 477 | "_view_module_version": "1.2.0", 478 | "_view_name": "LayoutView", 479 | "align_content": null, 480 | "align_items": null, 481 | "align_self": null, 482 | "border": null, 483 | "bottom": null, 484 | "display": null, 485 | "flex": null, 486 | "flex_flow": null, 487 | "grid_area": null, 488 | "grid_auto_columns": null, 489 | "grid_auto_flow": null, 490 | "grid_auto_rows": null, 491 | "grid_column": null, 492 | "grid_gap": null, 493 | "grid_row": null, 494 | "grid_template_areas": null, 495 | "grid_template_columns": null, 496 | "grid_template_rows": null, 497 | "height": null, 498 | "justify_content": null, 499 | "justify_items": null, 500 | "left": null, 501 | "margin": null, 502 | "max_height": null, 503 | "max_width": null, 504 | "min_height": null, 505 | "min_width": null, 506 | "object_fit": null, 507 | "object_position": null, 508 | "order": null, 509 | "overflow": null, 510 | "overflow_x": null, 511 | "overflow_y": null, 512 | "padding": null, 513 | "right": null, 514 | "top": null, 515 | "visibility": null, 516 | "width": null 517 | } 518 | }, 519 | "4a61c10fc00c4f04bb00b82e942da210": { 520 | "model_module": "@jupyter-widgets/base", 521 | "model_module_version": "1.2.0", 522 | "model_name": "LayoutModel", 523 | "state": { 524 | "_model_module": "@jupyter-widgets/base", 525 | "_model_module_version": "1.2.0", 526 | "_model_name": "LayoutModel", 527 | "_view_count": null, 528 | "_view_module": "@jupyter-widgets/base", 529 | "_view_module_version": "1.2.0", 530 | "_view_name": "LayoutView", 531 | "align_content": null, 532 | "align_items": null, 533 | "align_self": null, 534 | "border": null, 535 | "bottom": null, 536 | "display": null, 537 | "flex": null, 538 | "flex_flow": null, 539 | "grid_area": null, 540 | "grid_auto_columns": null, 541 | "grid_auto_flow": null, 542 | "grid_auto_rows": null, 543 | "grid_column": null, 544 | "grid_gap": null, 545 | "grid_row": null, 546 | "grid_template_areas": null, 547 | "grid_template_columns": null, 548 | "grid_template_rows": null, 549 | "height": null, 550 | "justify_content": null, 551 | "justify_items": null, 552 | "left": null, 553 | "margin": null, 554 | "max_height": null, 555 | "max_width": null, 556 | "min_height": null, 557 | "min_width": null, 558 | "object_fit": null, 559 | "object_position": null, 560 | "order": null, 561 | "overflow": null, 562 | "overflow_x": null, 563 | "overflow_y": null, 564 | "padding": null, 565 | "right": null, 566 | "top": null, 567 | "visibility": null, 568 | "width": null 569 | } 570 | }, 571 | "5e6adc4592124a4581b85f4c1f3bab4d": { 572 | "model_module": "@jupyter-widgets/controls", 573 | "model_module_version": "1.5.0", 574 | "model_name": "ProgressStyleModel", 575 | "state": { 576 | "_model_module": "@jupyter-widgets/controls", 577 | "_model_module_version": "1.5.0", 578 | "_model_name": "ProgressStyleModel", 579 | "_view_count": null, 580 | "_view_module": "@jupyter-widgets/base", 581 | "_view_module_version": "1.2.0", 582 | "_view_name": "StyleView", 583 | "bar_color": null, 584 | "description_width": "initial" 585 | } 586 | }, 587 | "7a5f52e56ede4ac3abe37a3ece007dc9": { 588 | "model_module": "@jupyter-widgets/controls", 589 | "model_module_version": "1.5.0", 590 | "model_name": "FloatProgressModel", 591 | "state": { 592 | "_dom_classes": [], 593 | "_model_module": "@jupyter-widgets/controls", 594 | "_model_module_version": 
"1.5.0", 595 | "_model_name": "FloatProgressModel", 596 | "_view_count": null, 597 | "_view_module": "@jupyter-widgets/controls", 598 | "_view_module_version": "1.5.0", 599 | "_view_name": "ProgressView", 600 | "bar_style": "success", 601 | "description": "", 602 | "description_tooltip": null, 603 | "layout": "IPY_MODEL_4a61c10fc00c4f04bb00b82e942da210", 604 | "max": 169001437, 605 | "min": 0, 606 | "orientation": "horizontal", 607 | "style": "IPY_MODEL_5e6adc4592124a4581b85f4c1f3bab4d", 608 | "value": 169001437 609 | } 610 | }, 611 | "b597cd6f6cd443aba4bf4491ac7f957e": { 612 | "model_module": "@jupyter-widgets/controls", 613 | "model_module_version": "1.5.0", 614 | "model_name": "DescriptionStyleModel", 615 | "state": { 616 | "_model_module": "@jupyter-widgets/controls", 617 | "_model_module_version": "1.5.0", 618 | "_model_name": "DescriptionStyleModel", 619 | "_view_count": null, 620 | "_view_module": "@jupyter-widgets/base", 621 | "_view_module_version": "1.2.0", 622 | "_view_name": "StyleView", 623 | "description_width": "" 624 | } 625 | }, 626 | "ce8b0faa1a1340b5a504d7b3546b3ccb": { 627 | "model_module": "@jupyter-widgets/controls", 628 | "model_module_version": "1.5.0", 629 | "model_name": "HTMLModel", 630 | "state": { 631 | "_dom_classes": [], 632 | "_model_module": "@jupyter-widgets/controls", 633 | "_model_module_version": "1.5.0", 634 | "_model_name": "HTMLModel", 635 | "_view_count": null, 636 | "_view_module": "@jupyter-widgets/controls", 637 | "_view_module_version": "1.5.0", 638 | "_view_name": "HTMLView", 639 | "description": "", 640 | "description_tooltip": null, 641 | "layout": "IPY_MODEL_161969cae25a49f38aacd1568d3cac6c", 642 | "placeholder": "", 643 | "style": "IPY_MODEL_b597cd6f6cd443aba4bf4491ac7f957e", 644 | "value": " 169001984/? [00:06<00:00, 25734958.25it/s]" 645 | } 646 | } 647 | } 648 | } 649 | }, 650 | "nbformat": 4, 651 | "nbformat_minor": 0 652 | } 653 | -------------------------------------------------------------------------------- /Figure_2_Cone_Effect/Figure_2a_random_init_real_data/coco-extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "YPHN7PJgKOzb" 7 | }, 8 | "source": [ 9 | "# Image Feature Pair Extract - CLIP, ResNet18. 
\n", 10 | "conda activate clip\n", 11 | "\n", 12 | "\n", 13 | "clip_image_features_list (118287, 512)\n", 14 | "target_image_features_list (118287, 512)\n", 15 | "clip_image_features_list (5000, 512)\n", 16 | "target_image_features_list (5000, 512)\n", 17 | "\n", 18 | "Feature extraction complete in 6m 16s" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "colab": { 26 | "base_uri": "https://localhost:8080/" 27 | }, 28 | "id": "C1hkDT38hSaP", 29 | "outputId": "70a44964-883d-4fd0-b95a-2c7f2b19aca9" 30 | }, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Torch version: 1.7.1\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "import numpy as np\n", 42 | "import torch\n", 43 | "import pickle\n", 44 | "import time\n", 45 | "print(\"Torch version:\", torch.__version__)\n", 46 | "\n", 47 | "assert torch.__version__.split(\".\") >= [\"1\", \"7\", \"1\"], \"PyTorch 1.7.1 or later is required\"\n", 48 | "\n", 49 | "import os\n", 50 | "import matplotlib.pyplot as plt\n", 51 | "from collections import OrderedDict\n", 52 | "import torch\n", 53 | "\n", 54 | "%matplotlib inline\n", 55 | "%config InlineBackend.figure_format = 'retina'" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Load CLIP" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": { 69 | "colab": { 70 | "base_uri": "https://localhost:8080/" 71 | }, 72 | "id": "uLFS29hnhlY4", 73 | "outputId": "11779e1e-8bdd-4167-c18e-d26bdd6b67db" 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']" 80 | ] 81 | }, 82 | "execution_count": 2, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "import clip\n", 89 | "\n", 90 | "clip.available_models()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 3, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# ViT-B-32.json\n", 100 | "# copied from https://github.com/mlfoundations/open_clip/blob/91f6cce16b7bee90b3b5d38ca305b5b3b67cc200/src/training/model_configs/ViT-B-32.json\n", 101 | "model_info = {\n", 102 | " \"embed_dim\": 512,\n", 103 | " \"image_resolution\": 224,\n", 104 | " \"vision_layers\": 12,\n", 105 | " \"vision_width\": 768,\n", 106 | " \"vision_patch_size\": 32,\n", 107 | " \"context_length\": 77,\n", 108 | " \"vocab_size\": 49408,\n", 109 | " \"transformer_width\": 512,\n", 110 | " \"transformer_heads\": 8,\n", 111 | " \"transformer_layers\": 12\n", 112 | "} \n", 113 | "from clip.model import CLIP\n", 114 | "model = CLIP(**model_info)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from torchvision import transforms\n", 124 | "input_size = model_info['image_resolution']\n", 125 | "preprocess = transforms.Compose([\n", 126 | " transforms.Resize(input_size),\n", 127 | " transforms.CenterCrop(input_size),\n", 128 | " transforms.ToTensor(),\n", 129 | " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", 130 | " ])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": { 137 | "colab": { 138 | "base_uri": "https://localhost:8080/" 139 | }, 140 | "id": "IBRVTY9lbGm8", 141 | "outputId": "f06fd2fd-6126-475b-87d0-b10aa3b7da49" 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | 
"output_type": "stream", 147 | "text": [ 148 | "Model parameters: 151,277,313\n", 149 | "Input resolution: 224\n", 150 | "Context length: 77\n", 151 | "Vocab size: 49408\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "\n", 157 | "model.cuda().eval()\n", 158 | "input_resolution = model.visual.input_resolution\n", 159 | "context_length = model.context_length\n", 160 | "vocab_size = model.vocab_size\n", 161 | "\n", 162 | "print(\"Model parameters:\", f\"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}\")\n", 163 | "print(\"Input resolution:\", input_resolution)\n", 164 | "print(\"Context length:\", context_length)\n", 165 | "print(\"Vocab size:\", vocab_size)\n", 166 | "\n", 167 | "clip_model = model" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 6, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "torchvision.transforms.transforms.Compose" 179 | ] 180 | }, 181 | "execution_count": 6, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "type(preprocess)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "# Load Data" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 7, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "loading annotations into memory...\n", 207 | "Done (t=0.14s)\n", 208 | "creating index...\n", 209 | "index created!\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "import torchvision\n", 215 | "from torch.utils.data import DataLoader\n", 216 | "\n", 217 | "def target_transform(caption_list):\n", 218 | " caption = caption_list[0] # only the first caption\n", 219 | " return clip.tokenize(caption)[0]\n", 220 | "\n", 221 | "# coco_train_dataset = torchvision.datasets.CocoCaptions(\n", 222 | "# root = '/home/ubuntu/data/coco/train2017',\n", 223 | "# annFile = '/home/ubuntu/data/coco/annotations/captions_train2017.json',\n", 224 | "# transform=preprocess,\n", 225 | "# target_transform=target_transform,\n", 226 | "# )\n", 227 | "\n", 228 | "coco_val_dataset = torchvision.datasets.CocoCaptions(\n", 229 | " root = '/home/ubuntu/data/coco/val2017',\n", 230 | " annFile = '/home/ubuntu/data/coco/annotations/captions_val2017.json',\n", 231 | " transform=preprocess,\n", 232 | " target_transform=target_transform,\n", 233 | " )" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 8, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# coco_train_dataloader = DataLoader(coco_train_dataset, batch_size=64, shuffle=False, num_workers=8, pin_memory=True)\n", 243 | "coco_val_dataloader = DataLoader(coco_val_dataset, batch_size=64, shuffle=False, num_workers=8, pin_memory=True)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "# ResNet" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "import torch\n", 260 | "import torch.nn as nn\n", 261 | "import torchvision.models as models\n", 262 | "from torch.autograd import Variable\n", 263 | "\n", 264 | "resnet18 = models.resnet18(pretrained=False) # resnet18 = models.resnet18(pretrained=True)\n", 265 | "modules=list(resnet18.children())[:-1]\n", 266 | "resnet18=nn.Sequential(*modules)\n", 267 | "for p in resnet18.parameters():\n", 268 | " p.requires_grad = False\n", 269 
| "\n", 270 | "resnet18.cuda().eval()\n", 271 | "target_model = resnet18\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "# Extractor loop\n" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 11, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "clip_image_features_list (5000, 512)\n", 291 | "target_image_features_list (5000, 512)\n", 292 | "\n", 293 | "Feature Extraction completed in 0m 33s\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "since = time.time()\n", 299 | "dataloaders = {\n", 300 | " # 'train': coco_train_dataloader, \n", 301 | " 'val': coco_val_dataloader,\n", 302 | "}\n", 303 | "# Each epoch has a training and validation phase\n", 304 | "for phase in ['val']: # ['train', 'val',]:\n", 305 | "\n", 306 | " clip_model.eval() # Set model to evaluate mode, for extraction\n", 307 | " ##################################\n", 308 | " # Fields to be stored for postprocessing \n", 309 | " ##################################\n", 310 | " clip_image_features_list = []\n", 311 | " clip_text_features_list = []\n", 312 | " target_image_features_list = []\n", 313 | "\n", 314 | " # Iterate over data.\n", 315 | " for inputs, captions in dataloaders[phase]:\n", 316 | " image_input = inputs.cuda(non_blocking=True)\n", 317 | " text_input = captions.cuda(non_blocking=True)\n", 318 | " # TODO: add text here\n", 319 | " \n", 320 | " with torch.set_grad_enabled(False):\n", 321 | " clip_image_features = clip_model.encode_image(image_input).float()\n", 322 | " clip_text_features = clip_model.encode_text(text_input).float()\n", 323 | " target_image_features = target_model(image_input).squeeze() \n", 324 | " ##################################\n", 325 | " # Evaluation book-keeping Field \n", 326 | " ##################################\n", 327 | " clip_image_features_list.append( clip_image_features.cpu().numpy() )\n", 328 | " clip_text_features_list.append( clip_text_features.cpu().numpy() )\n", 329 | " target_image_features_list.append( target_image_features.cpu().numpy() )\n", 330 | "\n", 331 | " ##################################\n", 332 | " # Evaluation book-keeping Field \n", 333 | " ##################################\n", 334 | " clip_image_features_list = np.concatenate( clip_image_features_list, axis=0)\n", 335 | " clip_text_features_list = np.concatenate( clip_text_features_list, axis=0)\n", 336 | " target_image_features_list = np.concatenate( target_image_features_list, axis=0)\n", 337 | " print('clip_image_features_list', clip_image_features_list.shape)\n", 338 | " print('target_image_features_list', target_image_features_list.shape)\n", 339 | "\n", 340 | " dump_result_dict = {\n", 341 | " \"clip_image_features_list\": clip_image_features_list, \n", 342 | " \"clip_text_features_list\" : clip_text_features_list,\n", 343 | " \"target_image_features_list\": target_image_features_list, \n", 344 | " }\n", 345 | " with open(os.path.join('features', 'feature_dump_{}.pkl'.format(phase) ), \"wb\") as pkl_file:\n", 346 | " pickle.dump(\n", 347 | " dump_result_dict, \n", 348 | " pkl_file, \n", 349 | " )\n", 350 | "\n", 351 | "print()\n", 352 | "\n", 353 | "time_elapsed = time.time() - since\n", 354 | "print('Feature Extraction completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [] 
363 | } 364 | ], 365 | "metadata": { 366 | "accelerator": "GPU", 367 | "colab": { 368 | "collapsed_sections": [], 369 | "name": "Interacting with CLIP.ipynb", 370 | "provenance": [] 371 | }, 372 | "kernelspec": { 373 | "display_name": "Python 3", 374 | "name": "python3" 375 | }, 376 | "language_info": { 377 | "codemirror_mode": { 378 | "name": "ipython", 379 | "version": 3 380 | }, 381 | "file_extension": ".py", 382 | "mimetype": "text/x-python", 383 | "name": "python", 384 | "nbconvert_exporter": "python", 385 | "pygments_lexer": "ipython3", 386 | "version": "3.9.7" 387 | }, 388 | "widgets": { 389 | "application/vnd.jupyter.widget-state+json": { 390 | "12e23e2819094ee0a079d4eb77cfc4f9": { 391 | "model_module": "@jupyter-widgets/base", 392 | "model_module_version": "1.2.0", 393 | "model_name": "LayoutModel", 394 | "state": { 395 | "_model_module": "@jupyter-widgets/base", 396 | "_model_module_version": "1.2.0", 397 | "_model_name": "LayoutModel", 398 | "_view_count": null, 399 | "_view_module": "@jupyter-widgets/base", 400 | "_view_module_version": "1.2.0", 401 | "_view_name": "LayoutView", 402 | "align_content": null, 403 | "align_items": null, 404 | "align_self": null, 405 | "border": null, 406 | "bottom": null, 407 | "display": null, 408 | "flex": null, 409 | "flex_flow": null, 410 | "grid_area": null, 411 | "grid_auto_columns": null, 412 | "grid_auto_flow": null, 413 | "grid_auto_rows": null, 414 | "grid_column": null, 415 | "grid_gap": null, 416 | "grid_row": null, 417 | "grid_template_areas": null, 418 | "grid_template_columns": null, 419 | "grid_template_rows": null, 420 | "height": null, 421 | "justify_content": null, 422 | "justify_items": null, 423 | "left": null, 424 | "margin": null, 425 | "max_height": null, 426 | "max_width": null, 427 | "min_height": null, 428 | "min_width": null, 429 | "object_fit": null, 430 | "object_position": null, 431 | "order": null, 432 | "overflow": null, 433 | "overflow_x": null, 434 | "overflow_y": null, 435 | "padding": null, 436 | "right": null, 437 | "top": null, 438 | "visibility": null, 439 | "width": null 440 | } 441 | }, 442 | "1369964d45004b5e95a058910b2a33e6": { 443 | "model_module": "@jupyter-widgets/controls", 444 | "model_module_version": "1.5.0", 445 | "model_name": "HBoxModel", 446 | "state": { 447 | "_dom_classes": [], 448 | "_model_module": "@jupyter-widgets/controls", 449 | "_model_module_version": "1.5.0", 450 | "_model_name": "HBoxModel", 451 | "_view_count": null, 452 | "_view_module": "@jupyter-widgets/controls", 453 | "_view_module_version": "1.5.0", 454 | "_view_name": "HBoxView", 455 | "box_style": "", 456 | "children": [ 457 | "IPY_MODEL_7a5f52e56ede4ac3abe37a3ece007dc9", 458 | "IPY_MODEL_ce8b0faa1a1340b5a504d7b3546b3ccb" 459 | ], 460 | "layout": "IPY_MODEL_12e23e2819094ee0a079d4eb77cfc4f9" 461 | } 462 | }, 463 | "161969cae25a49f38aacd1568d3cac6c": { 464 | "model_module": "@jupyter-widgets/base", 465 | "model_module_version": "1.2.0", 466 | "model_name": "LayoutModel", 467 | "state": { 468 | "_model_module": "@jupyter-widgets/base", 469 | "_model_module_version": "1.2.0", 470 | "_model_name": "LayoutModel", 471 | "_view_count": null, 472 | "_view_module": "@jupyter-widgets/base", 473 | "_view_module_version": "1.2.0", 474 | "_view_name": "LayoutView", 475 | "align_content": null, 476 | "align_items": null, 477 | "align_self": null, 478 | "border": null, 479 | "bottom": null, 480 | "display": null, 481 | "flex": null, 482 | "flex_flow": null, 483 | "grid_area": null, 484 | "grid_auto_columns": null, 485 | "grid_auto_flow": 
null, 486 | "grid_auto_rows": null, 487 | "grid_column": null, 488 | "grid_gap": null, 489 | "grid_row": null, 490 | "grid_template_areas": null, 491 | "grid_template_columns": null, 492 | "grid_template_rows": null, 493 | "height": null, 494 | "justify_content": null, 495 | "justify_items": null, 496 | "left": null, 497 | "margin": null, 498 | "max_height": null, 499 | "max_width": null, 500 | "min_height": null, 501 | "min_width": null, 502 | "object_fit": null, 503 | "object_position": null, 504 | "order": null, 505 | "overflow": null, 506 | "overflow_x": null, 507 | "overflow_y": null, 508 | "padding": null, 509 | "right": null, 510 | "top": null, 511 | "visibility": null, 512 | "width": null 513 | } 514 | }, 515 | "4a61c10fc00c4f04bb00b82e942da210": { 516 | "model_module": "@jupyter-widgets/base", 517 | "model_module_version": "1.2.0", 518 | "model_name": "LayoutModel", 519 | "state": { 520 | "_model_module": "@jupyter-widgets/base", 521 | "_model_module_version": "1.2.0", 522 | "_model_name": "LayoutModel", 523 | "_view_count": null, 524 | "_view_module": "@jupyter-widgets/base", 525 | "_view_module_version": "1.2.0", 526 | "_view_name": "LayoutView", 527 | "align_content": null, 528 | "align_items": null, 529 | "align_self": null, 530 | "border": null, 531 | "bottom": null, 532 | "display": null, 533 | "flex": null, 534 | "flex_flow": null, 535 | "grid_area": null, 536 | "grid_auto_columns": null, 537 | "grid_auto_flow": null, 538 | "grid_auto_rows": null, 539 | "grid_column": null, 540 | "grid_gap": null, 541 | "grid_row": null, 542 | "grid_template_areas": null, 543 | "grid_template_columns": null, 544 | "grid_template_rows": null, 545 | "height": null, 546 | "justify_content": null, 547 | "justify_items": null, 548 | "left": null, 549 | "margin": null, 550 | "max_height": null, 551 | "max_width": null, 552 | "min_height": null, 553 | "min_width": null, 554 | "object_fit": null, 555 | "object_position": null, 556 | "order": null, 557 | "overflow": null, 558 | "overflow_x": null, 559 | "overflow_y": null, 560 | "padding": null, 561 | "right": null, 562 | "top": null, 563 | "visibility": null, 564 | "width": null 565 | } 566 | }, 567 | "5e6adc4592124a4581b85f4c1f3bab4d": { 568 | "model_module": "@jupyter-widgets/controls", 569 | "model_module_version": "1.5.0", 570 | "model_name": "ProgressStyleModel", 571 | "state": { 572 | "_model_module": "@jupyter-widgets/controls", 573 | "_model_module_version": "1.5.0", 574 | "_model_name": "ProgressStyleModel", 575 | "_view_count": null, 576 | "_view_module": "@jupyter-widgets/base", 577 | "_view_module_version": "1.2.0", 578 | "_view_name": "StyleView", 579 | "bar_color": null, 580 | "description_width": "initial" 581 | } 582 | }, 583 | "7a5f52e56ede4ac3abe37a3ece007dc9": { 584 | "model_module": "@jupyter-widgets/controls", 585 | "model_module_version": "1.5.0", 586 | "model_name": "FloatProgressModel", 587 | "state": { 588 | "_dom_classes": [], 589 | "_model_module": "@jupyter-widgets/controls", 590 | "_model_module_version": "1.5.0", 591 | "_model_name": "FloatProgressModel", 592 | "_view_count": null, 593 | "_view_module": "@jupyter-widgets/controls", 594 | "_view_module_version": "1.5.0", 595 | "_view_name": "ProgressView", 596 | "bar_style": "success", 597 | "description": "", 598 | "description_tooltip": null, 599 | "layout": "IPY_MODEL_4a61c10fc00c4f04bb00b82e942da210", 600 | "max": 169001437, 601 | "min": 0, 602 | "orientation": "horizontal", 603 | "style": "IPY_MODEL_5e6adc4592124a4581b85f4c1f3bab4d", 604 | "value": 169001437 605 | 
} 606 | }, 607 | "b597cd6f6cd443aba4bf4491ac7f957e": { 608 | "model_module": "@jupyter-widgets/controls", 609 | "model_module_version": "1.5.0", 610 | "model_name": "DescriptionStyleModel", 611 | "state": { 612 | "_model_module": "@jupyter-widgets/controls", 613 | "_model_module_version": "1.5.0", 614 | "_model_name": "DescriptionStyleModel", 615 | "_view_count": null, 616 | "_view_module": "@jupyter-widgets/base", 617 | "_view_module_version": "1.2.0", 618 | "_view_name": "StyleView", 619 | "description_width": "" 620 | } 621 | }, 622 | "ce8b0faa1a1340b5a504d7b3546b3ccb": { 623 | "model_module": "@jupyter-widgets/controls", 624 | "model_module_version": "1.5.0", 625 | "model_name": "HTMLModel", 626 | "state": { 627 | "_dom_classes": [], 628 | "_model_module": "@jupyter-widgets/controls", 629 | "_model_module_version": "1.5.0", 630 | "_model_name": "HTMLModel", 631 | "_view_count": null, 632 | "_view_module": "@jupyter-widgets/controls", 633 | "_view_module_version": "1.5.0", 634 | "_view_name": "HTMLView", 635 | "description": "", 636 | "description_tooltip": null, 637 | "layout": "IPY_MODEL_161969cae25a49f38aacd1568d3cac6c", 638 | "placeholder": "", 639 | "style": "IPY_MODEL_b597cd6f6cd443aba4bf4491ac7f957e", 640 | "value": " 169001984/? [00:06<00:00, 25734958.25it/s]" 641 | } 642 | } 643 | } 644 | } 645 | }, 646 | "nbformat": 4, 647 | "nbformat_minor": 0 648 | } 649 | -------------------------------------------------------------------------------- /Figure_2_Cone_Effect/Figure_2c_scatter_cones_random_init/random_data/visualizePCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Visualize COCO features\n", 8 | "\n", 9 | "1. visualize coco features\n", 10 | "2. identify pca-one; what is its cosine similarity with the residual (should be very high)\n", 11 | "3. move along the direction, plot 1-dim loss landscape. [-2,-1,-0.5,0,0.5,1,2]\n", 12 | " - need to have a fn(scalar,), output loss. 
\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import argparse\n", 22 | "import os\n", 23 | "import random\n", 24 | "import shutil\n", 25 | "import time\n", 26 | "import warnings\n", 27 | "from enum import Enum\n", 28 | "import pickle\n", 29 | "import numpy as np\n", 30 | "from collections import defaultdict\n", 31 | "\n", 32 | "import torch\n", 33 | "import torch.nn as nn\n", 34 | "import torch.optim\n", 35 | "from torch.utils.data import Dataset, DataLoader\n", 36 | "import torch.backends.cudnn as cudnn\n", 37 | "\n", 38 | "import glob \n", 39 | "def my_norm(x):\n", 40 | " return x/np.linalg.norm(x, axis=-1, keepdims=True)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "data_dict_list = list()\n", 50 | "\n", 51 | "for pickle_path in glob.glob('./features*/feature_dump_*.pkl'):\n", 52 | " with open(pickle_path, 'rb') as pkl_file:\n", 53 | " data_dict = pickle.load(pkl_file)\n", 54 | " assert len(data_dict['clip_image_features_list']) == len(data_dict['clip_text_features_list'])\n", 55 | " # assert len(data_dict['clip_image_features_list']) == len(data_dict['target_image_features_list'])\n", 56 | " # print('Number of image-text pairs', len(data_dict['clip_image_features_list']))\n", 57 | " data_dict_list.append(data_dict)\n", 58 | "\n", 59 | "print('Number of experiment files loaded', len(data_dict_list))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# visualize.\n", 69 | "\n", 70 | "from sklearn.decomposition import PCA\n", 71 | "# from sklearn.decomposition import TruncatedSVD as PCA # showns as multiple lines. \n", 72 | "# from sklearn.manifold import TSNE as PCA # \n", 73 | "# import umap\n", 74 | "# from umap import UMAP as PCA\n", 75 | "import pandas as pd\n", 76 | "import matplotlib.pyplot as plt\n", 77 | "%matplotlib inline\n", 78 | "import seaborn as sns\n", 79 | "# sns.set(font_scale=2) # crazy big\n", 80 | "plt.rcParams['figure.dpi'] = 300\n", 81 | "plt.rcParams['savefig.dpi'] = 300\n", 82 | "sns.set_theme()\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Functionality: given a list of exp, plot one modality. 
\n", 92 | "sns.set_context(\"talk\", font_scale=1.5) # paper, notebook, talk, and poster; font_scale=1.5,\n", 93 | "\n", 94 | "def plot_scattered_cones(data_dict_list, modality_str, draw=True):\n", 95 | " assert modality_str in ['clip_image_features_list', 'clip_text_features_list', 'target_image_features_list']\n", 96 | " print('modality_str: ', modality_str)\n", 97 | " # dataset_size = len(data_dict_list[0][modality_str])\n", 98 | " dataset_size = 5000\n", 99 | "\n", 100 | " total_feature_list = list()\n", 101 | " label_list = list()\n", 102 | " for expriment_idx in range(len(data_dict_list)):\n", 103 | " total_feature_list.append(data_dict_list[expriment_idx][modality_str][:dataset_size])\n", 104 | " label_list.extend(['Random-{}'.format(expriment_idx+1)] * dataset_size)\n", 105 | " total_feature_np = np.concatenate(total_feature_list, axis=0) \n", 106 | " total_feature_np = my_norm(total_feature_np) # L2-normalize\n", 107 | " assert len(total_feature_np) == len(data_dict_list) * dataset_size\n", 108 | "\n", 109 | " pca = PCA(n_components=2)\n", 110 | " pca_result = pca.fit_transform(total_feature_np)\n", 111 | "\n", 112 | " df = pd.DataFrame()\n", 113 | " df['pca_one'] = pca_result[:,0]\n", 114 | " df['pca_two'] = pca_result[:,1] \n", 115 | " df['Random Seed'] = label_list\n", 116 | "\n", 117 | " if draw:\n", 118 | " plt.figure(figsize=(20.0,6.18 * 2))\n", 119 | " p1 = sns.scatterplot(\n", 120 | " x=\"pca_one\", y=\"pca_two\",\n", 121 | " hue=\"Random Seed\",\n", 122 | " data=df,\n", 123 | " legend=True,\n", 124 | " )\n", 125 | " plt.xlabel(\"\")\n", 126 | " plt.ylabel(\"\")\n", 127 | " plt.legend(title='Random Seed', loc='upper left', bbox_to_anchor=(1.00, 1.0, ), prop={'size': 18})\n", 128 | " plt.show()\n", 129 | "\n", 130 | " return df\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "df_clip_img = plot_scattered_cones(data_dict_list[:25], 'clip_image_features_list', draw=True)\n", 140 | "df_clip_txt = plot_scattered_cones(data_dict_list[:25], 'clip_text_features_list', draw=True)\n", 141 | "df_resnet = plot_scattered_cones(data_dict_list[:25], 'target_image_features_list', draw=True)\n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "def draw_df(df):\n", 151 | " plt.figure(figsize=(20.0,6.18 * 2))\n", 152 | " df['Seed'] = df['Random Seed'].str.replace('Random-', '', regex=False)\n", 153 | " p1 = sns.scatterplot(\n", 154 | " x=\"pca_one\", y=\"pca_two\",\n", 155 | " hue=\"Seed\",\n", 156 | " data=df,\n", 157 | " legend=True,\n", 158 | " )\n", 159 | " plt.xlabel(\"\")\n", 160 | " plt.ylabel(\"\")\n", 161 | " plt.legend(title='Random Seed', loc='upper left', bbox_to_anchor=(1.00, 1.0, ), ncol=2) # prop={'size': 50}, \n", 162 | " plt.show()\n", 163 | " return\n", 164 | "\n", 165 | "draw_df(df_clip_img)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "interpreter": { 178 | "hash": "09c077faaa20da841f22e0f4d12b4addb73e00d9291bc78d00732f9f39794f23" 179 | }, 180 | "kernelspec": { 181 | "display_name": "Python 3.9.7 ('clip')", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": 
"text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.9.7" 196 | }, 197 | "orig_nbformat": 4 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 2 201 | } 202 | -------------------------------------------------------------------------------- /Figure_2_Cone_Effect/Figure_2c_scatter_cones_random_init/real_data/visualizePCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Visualize COCO features\n", 8 | "\n", 9 | "1. visualize coco features\n", 10 | "2. identify pca-one; what is its cosine similarity with the residual (should be very high)\n", 11 | "3. move along the direction, plot 1-dim loss landscape. [-2,-1,-0.5,0,0.5,1,2]\n", 12 | " - need to have a fn(scalar,), output loss. \n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import argparse\n", 22 | "import os\n", 23 | "import random\n", 24 | "import shutil\n", 25 | "import time\n", 26 | "import warnings\n", 27 | "from enum import Enum\n", 28 | "import pickle\n", 29 | "import numpy as np\n", 30 | "from collections import defaultdict\n", 31 | "\n", 32 | "import torch\n", 33 | "import torch.nn as nn\n", 34 | "import torch.optim\n", 35 | "from torch.utils.data import Dataset, DataLoader\n", 36 | "import torch.backends.cudnn as cudnn\n", 37 | "\n", 38 | "import glob \n", 39 | "def my_norm(x):\n", 40 | " return x/np.linalg.norm(x, axis=-1, keepdims=True)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "data_dict_list = list()\n", 50 | "\n", 51 | "for pickle_path in glob.glob('./features*/feature_dump_*.pkl'):\n", 52 | " with open(pickle_path, 'rb') as pkl_file:\n", 53 | " data_dict = pickle.load(pkl_file)\n", 54 | " assert len(data_dict['clip_image_features_list']) == len(data_dict['clip_text_features_list'])\n", 55 | " # assert len(data_dict['clip_image_features_list']) == len(data_dict['target_image_features_list'])\n", 56 | " # print('Number of image-text pairs', len(data_dict['clip_image_features_list']))\n", 57 | " data_dict_list.append(data_dict)\n", 58 | "\n", 59 | "print('Number of experiment files loaded', len(data_dict_list))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# visualize.\n", 69 | "\n", 70 | "from sklearn.decomposition import PCA\n", 71 | "# from sklearn.decomposition import TruncatedSVD as PCA # showns as multiple lines. \n", 72 | "# from sklearn.manifold import TSNE as PCA # \n", 73 | "# import umap\n", 74 | "# from umap import UMAP as PCA\n", 75 | "import pandas as pd\n", 76 | "import matplotlib.pyplot as plt\n", 77 | "%matplotlib inline\n", 78 | "import seaborn as sns\n", 79 | "# sns.set(font_scale=2) # crazy big\n", 80 | "plt.rcParams['figure.dpi'] = 300\n", 81 | "plt.rcParams['savefig.dpi'] = 300\n", 82 | "sns.set_theme()\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Functionality: given a list of exp, plot one modality. 
\n", 92 | "sns.set_context(\"talk\", font_scale=1.5) # paper, notebook, talk, and poster; font_scale=1.5,\n", 93 | "\n", 94 | "def plot_scattered_cones(data_dict_list, modality_str, draw=True):\n", 95 | " assert modality_str in ['clip_image_features_list', 'clip_text_features_list', 'target_image_features_list']\n", 96 | " print('modality_str: ', modality_str)\n", 97 | " # dataset_size = len(data_dict_list[0][modality_str])\n", 98 | " dataset_size = 5000\n", 99 | "\n", 100 | " total_feature_list = list()\n", 101 | " label_list = list()\n", 102 | " for expriment_idx in range(len(data_dict_list)):\n", 103 | " total_feature_list.append(data_dict_list[expriment_idx][modality_str][:dataset_size])\n", 104 | " label_list.extend(['Random-{}'.format(expriment_idx+1)] * dataset_size)\n", 105 | " total_feature_np = np.concatenate(total_feature_list, axis=0) \n", 106 | " total_feature_np = my_norm(total_feature_np) # L2-normalize\n", 107 | " assert len(total_feature_np) == len(data_dict_list) * dataset_size\n", 108 | "\n", 109 | " pca = PCA(n_components=6)\n", 110 | " pca_result = pca.fit_transform(total_feature_np)\n", 111 | " print('pca.explained_variance_ratio_', pca.explained_variance_ratio_)\n", 112 | " print('pca.singular_values_', pca.singular_values_)\n", 113 | "\n", 114 | " df = pd.DataFrame()\n", 115 | " df['pca_one'] = pca_result[:,0]\n", 116 | " df['pca_two'] = pca_result[:,1] \n", 117 | " df['Random Seed'] = label_list\n", 118 | "\n", 119 | " if draw:\n", 120 | " plt.figure(figsize=(20.0,6.18 * 2))\n", 121 | " p1 = sns.scatterplot(\n", 122 | " x=\"pca_one\", y=\"pca_two\",\n", 123 | " hue=\"Random Seed\",\n", 124 | " data=df,\n", 125 | " legend=True,\n", 126 | " )\n", 127 | " plt.xlabel(\"\")\n", 128 | " plt.ylabel(\"\")\n", 129 | " plt.legend(title='Random Seed', loc='upper left', bbox_to_anchor=(1.00, 1.0, ), prop={'size': 18})\n", 130 | " plt.show()\n", 131 | "\n", 132 | " return df\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "df_clip_img = plot_scattered_cones(data_dict_list[:25], 'clip_image_features_list', draw=True)\n", 142 | "df_clip_txt = plot_scattered_cones(data_dict_list[:25], 'clip_text_features_list', draw=True)\n", 143 | "df_resnet = plot_scattered_cones(data_dict_list[:25], 'target_image_features_list', draw=True)\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "def draw_df(df):\n", 153 | " plt.figure(figsize=(20.0,6.18 * 2))\n", 154 | " df['Seed'] = df['Random Seed'].str.replace('Random-', '', regex=False)\n", 155 | " p1 = sns.scatterplot(\n", 156 | " x=\"pca_one\", y=\"pca_two\",\n", 157 | " hue=\"Seed\",\n", 158 | " data=df,\n", 159 | " legend=True,\n", 160 | " )\n", 161 | " plt.xlabel(\"\")\n", 162 | " plt.ylabel(\"\")\n", 163 | " plt.legend(title='Random Seed', loc='upper left', bbox_to_anchor=(1.00, 1.0, ), ncol=2) # prop={'size': 50}, \n", 164 | " plt.show()\n", 165 | " return\n", 166 | "\n", 167 | "draw_df(df_clip_img)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Plot PCA Singular Values, Explained Variance Ratios. 
\n", 175 | "Kind of anwering Mert's question" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 26, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "modality_str: clip_image_features_list\n", 188 | "pca.explained_variance_ratio_\n", 189 | "0.043, 0.041, 0.039, 0.038, 0.036, 0.035, 0.035, 0.034, 0.033, 0.032, \n", 190 | "pca.singular_values_ [72.44832 70.31703 68.78217 68.24517 66.22955 65.66144 65.02128\n", 191 | " 64.06602 63.149437 62.50923 61.43108 60.71535 60.435135 59.02705\n", 192 | " 58.74808 57.4058 56.325825 56.2117 55.202732 54.309063 53.766792\n", 193 | " 52.040756 51.68926 49.76612 34.14688 33.398888 32.901985 31.960554\n", 194 | " 31.528515 31.300081 30.672626 30.518982 30.29744 29.762638 29.396282\n", 195 | " 28.373528 28.064127 27.74946 27.346584 27.130186 26.959745 26.397924\n", 196 | " 25.524904 25.109116 24.717733 24.531994 24.060846 23.81253 22.803596\n", 197 | " 20.144312]\n", 198 | "modality_str: clip_text_features_list\n", 199 | "pca.explained_variance_ratio_\n", 200 | "0.043, 0.041, 0.039, 0.037, 0.037, 0.035, 0.034, 0.033, 0.033, 0.031, \n", 201 | "pca.singular_values_ [71.93895 70.64999 68.51955 67.25281 66.71326 65.2795 64.50423\n", 202 | " 63.39669 62.925117 61.176167 59.73097 58.7134 58.423645 57.11752\n", 203 | " 56.474472 55.85696 54.98844 54.659405 54.08874 53.35901 51.593594\n", 204 | " 50.34826 49.43106 48.493847 16.067904 15.492056 15.30791 14.992251\n", 205 | " 14.946433 14.73657 14.656306 14.519942 14.41191 14.366245 14.130468\n", 206 | " 14.007584 13.708626 13.655253 13.45591 13.389069 13.198088 13.179104\n", 207 | " 13.093057 12.848161 12.838188 12.79897 12.603904 12.445068 12.337545\n", 208 | " 12.306129]\n", 209 | "modality_str: target_image_features_list\n", 210 | "pca.explained_variance_ratio_\n", 211 | "0.056, 0.055, 0.054, 0.051, 0.050, 0.050, 0.049, 0.046, 0.044, 0.043, \n", 212 | "pca.singular_values_ [57.44344 56.822586 56.4279 54.55056 54.171036 53.912224\n", 213 | " 53.301693 51.85659 50.885063 50.07982 49.386353 49.12857\n", 214 | " 48.405567 47.63106 47.15982 45.581974 45.29316 45.029636\n", 215 | " 44.288643 43.610165 42.718163 41.86789 40.769337 39.61369\n", 216 | " 4.8666005 4.7441974 4.5143256 4.4266877 4.175692 4.155532\n", 217 | " 4.1449823 4.055484 3.8198297 3.783392 3.687432 3.661967\n", 218 | " 3.6238446 3.5420978 3.483381 3.4556499 3.2627327 3.2502015\n", 219 | " 3.1480756 3.124066 3.0445938 2.9486566 2.828199 2.759845\n", 220 | " 2.7152538 2.6587367]\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "# Functionality: given a list of exp, plot one modality. 
\n", 226 | "sns.set_context(\"talk\", font_scale=1.5) # paper, notebook, talk, and poster; font_scale=1.5,\n", 227 | "\n", 228 | "def plot_pca_stats(data_dict_list, modality_str, draw=True):\n", 229 | " assert modality_str in ['clip_image_features_list', 'clip_text_features_list', 'target_image_features_list']\n", 230 | " print('modality_str: ', modality_str)\n", 231 | " # dataset_size = len(data_dict_list[0][modality_str])\n", 232 | " dataset_size = 5000\n", 233 | "\n", 234 | " total_feature_list = list()\n", 235 | " label_list = list()\n", 236 | " for expriment_idx in range(len(data_dict_list)):\n", 237 | " total_feature_list.append(data_dict_list[expriment_idx][modality_str][:dataset_size])\n", 238 | " label_list.extend(['Random-{}'.format(expriment_idx+1)] * dataset_size)\n", 239 | " total_feature_np = np.concatenate(total_feature_list, axis=0) \n", 240 | " total_feature_np = my_norm(total_feature_np) # L2-normalize\n", 241 | " assert len(total_feature_np) == len(data_dict_list) * dataset_size\n", 242 | "\n", 243 | " pca = PCA(n_components=50)\n", 244 | " pca_result = pca.fit_transform(total_feature_np)\n", 245 | " print('pca.explained_variance_ratio_')\n", 246 | " for ratio in pca.explained_variance_ratio_[:10]:\n", 247 | " print('{:.3f},'.format(ratio), end=' ')\n", 248 | " print()\n", 249 | "\n", 250 | "\n", 251 | " print('pca.singular_values_', pca.singular_values_)\n", 252 | " return\n", 253 | "\n", 254 | "\n", 255 | "df_clip_img = plot_pca_stats(data_dict_list[:25], 'clip_image_features_list', draw=True)\n", 256 | "df_clip_txt = plot_pca_stats(data_dict_list[:25], 'clip_text_features_list', draw=True)\n", 257 | "df_resnet = plot_pca_stats(data_dict_list[:25], 'target_image_features_list', draw=True)\n" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [] 266 | } 267 | ], 268 | "metadata": { 269 | "interpreter": { 270 | "hash": "09c077faaa20da841f22e0f4d12b4addb73e00d9291bc78d00732f9f39794f23" 271 | }, 272 | "kernelspec": { 273 | "display_name": "Python 3.9.7 ('clip')", 274 | "language": "python", 275 | "name": "python3" 276 | }, 277 | "language_info": { 278 | "codemirror_mode": { 279 | "name": "ipython", 280 | "version": 3 281 | }, 282 | "file_extension": ".py", 283 | "mimetype": "text/x-python", 284 | "name": "python", 285 | "nbconvert_exporter": "python", 286 | "pygments_lexer": "ipython3", 287 | "version": "3.9.7" 288 | }, 289 | "orig_nbformat": 4 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 2 293 | } 294 | -------------------------------------------------------------------------------- /Figure_2_Cone_Effect/Figure_2c_scatter_cones_random_init/real_data_ImageNet_pretrained/ImageNet-Pretrained-Cones.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Weixin-Liang/Modality-Gap/8e20cb24efa4c5f89aad694f2f65eb43ffc46d10/Figure_2_Cone_Effect/Figure_2c_scatter_cones_random_init/real_data_ImageNet_pretrained/ImageNet-Pretrained-Cones.png -------------------------------------------------------------------------------- /Figure_2_Cone_Effect/Figure_2c_scatter_cones_random_init/real_data_ImageNet_pretrained/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |  5 | -------------------------------------------------------------------------------- /Figure_2_Cone_Effect/Figure_2c_scatter_cones_random_init/real_data_ImageNet_pretrained/coco-extract.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "YPHN7PJgKOzb" 7 | }, 8 | "source": [ 9 | "# If so, will such distinctively different cones remain if randomly initialized models are fully trained?\n", 10 | "\n", 11 | "\n", 12 | "env: conda activate clip\n", 13 | "\n", 14 | "https://github.com/SamsungLabs/pytorch-ensembles" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 24, 20 | "metadata": { 21 | "colab": { 22 | "base_uri": "https://localhost:8080/" 23 | }, 24 | "id": "C1hkDT38hSaP", 25 | "outputId": "70a44964-883d-4fd0-b95a-2c7f2b19aca9" 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "Torch version: 1.7.1\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import torch\n", 39 | "import pickle\n", 40 | "import time\n", 41 | "print(\"Torch version:\", torch.__version__)\n", 42 | "\n", 43 | "assert torch.__version__.split(\".\") >= [\"1\", \"7\", \"1\"], \"PyTorch 1.7.1 or later is required\"\n", 44 | "\n", 45 | "import os\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "from collections import OrderedDict\n", 48 | "import torch\n", 49 | "\n", 50 | "%matplotlib inline\n", 51 | "%config InlineBackend.figure_format = 'retina'" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# Load CLIP" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 25, 64 | "metadata": { 65 | "colab": { 66 | "base_uri": "https://localhost:8080/" 67 | }, 68 | "id": "uLFS29hnhlY4", 69 | "outputId": "11779e1e-8bdd-4167-c18e-d26bdd6b67db" 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']" 76 | ] 77 | }, 78 | "execution_count": 25, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "import clip\n", 85 | "\n", 86 | "clip.available_models()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 26, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# ViT-B-32.json\n", 96 | "# copied from https://github.com/mlfoundations/open_clip/blob/91f6cce16b7bee90b3b5d38ca305b5b3b67cc200/src/training/model_configs/ViT-B-32.json\n", 97 | "model_info = {\n", 98 | " \"embed_dim\": 512,\n", 99 | " \"image_resolution\": 224,\n", 100 | " \"vision_layers\": 12,\n", 101 | " \"vision_width\": 768,\n", 102 | " \"vision_patch_size\": 32,\n", 103 | " \"context_length\": 77,\n", 104 | " \"vocab_size\": 49408,\n", 105 | " \"transformer_width\": 512,\n", 106 | " \"transformer_heads\": 8,\n", 107 | " \"transformer_layers\": 12\n", 108 | "} " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 27, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "from torchvision import transforms\n", 118 | "input_size = model_info['image_resolution']\n", 119 | "preprocess = transforms.Compose([\n", 120 | " transforms.Resize(input_size),\n", 121 | " transforms.CenterCrop(input_size),\n", 122 | " transforms.ToTensor(),\n", 123 | " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", 124 | " ])" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 28, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "torchvision.transforms.transforms.Compose" 136 | ] 137 | }, 138 | "execution_count": 28, 139 | 
"metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "type(preprocess)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# Load Data" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 29, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "loading annotations into memory...\n", 164 | "Done (t=0.04s)\n", 165 | "creating index...\n", 166 | "index created!\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "import torchvision\n", 172 | "from torch.utils.data import DataLoader\n", 173 | "\n", 174 | "def target_transform(caption_list):\n", 175 | " caption = caption_list[0] # only the first caption\n", 176 | " return clip.tokenize(caption)[0]\n", 177 | "\n", 178 | "# coco_train_dataset = torchvision.datasets.CocoCaptions(\n", 179 | "# root = '/home/ubuntu/data/coco/train2017',\n", 180 | "# annFile = '/home/ubuntu/data/coco/annotations/captions_train2017.json',\n", 181 | "# transform=preprocess,\n", 182 | "# target_transform=target_transform,\n", 183 | "# )\n", 184 | "\n", 185 | "coco_val_dataset = torchvision.datasets.CocoCaptions(\n", 186 | " root = '/home/ubuntu/data/coco/val2017',\n", 187 | " annFile = '/home/ubuntu/data/coco/annotations/captions_val2017.json',\n", 188 | " transform=preprocess,\n", 189 | " target_transform=target_transform,\n", 190 | " )" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 30, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# coco_train_dataloader = DataLoader(coco_train_dataset, batch_size=64, shuffle=False, num_workers=8, pin_memory=True)\n", 200 | "coco_val_dataloader = DataLoader(coco_val_dataset, batch_size=64, shuffle=False, num_workers=8, pin_memory=True)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "# ResNet" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 31, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "import torch\n", 217 | "import torch.nn as nn\n", 218 | "import torchvision.models as models\n", 219 | "from torch.autograd import Variable" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 32, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "\n", 229 | "deepens_imagenet = [\n", 230 | " 'ImageNet-ResNet50-052e7f78e4db--1564492444-1.pth.tar', \n", 231 | " 'ImageNet-ResNet50-1132c260ef75--1564493784-1.pth.tar',\n", 232 | " 'ImageNet-ResNet50-2f817072e8da--1564493734-1.pth.tar',\n", 233 | " 'ImageNet-ResNet50-3177c697fbf4--1564495013-1.pth.tar',\n", 234 | " 'ImageNet-ResNet50-628e11f9fd67--1564481099-1.pth.tar',\n", 235 | " 'ImageNet-ResNet50-743e10f26a38--1564493675-1.pth.tar',\n", 236 | " 'ImageNet-ResNet50-7ded66ec9900--1564481097-1.pth.tar',\n", 237 | " 'ImageNet-ResNet50-8fc5076a66c9--1564481079-1.pth.tar',\n", 238 | " 'ImageNet-ResNet50-a58ab8dd26fc--1564492521-1.pth.tar',\n", 239 | " 'ImageNet-ResNet50-a80e40d84db2--1564492573-1.pth.tar',\n", 240 | " 'ImageNet-ResNet50-be11903315ee--1564481101-1.pth.tar',\n", 241 | "]\n", 242 | "\n", 243 | "def load_model_states(model, filename):\n", 244 | " \"\"\"\n", 245 | " Load a previously saved model states.\n", 246 | " https://github.com/SamsungLabs/pytorch-ensembles\n", 247 | " \"\"\"\n", 248 | " with open(filename, 'rb') as f:\n", 249 | " # original saved file with DataParallel\n", 250 | " state_dict = 
torch.load(f)['state_dict']\n", 251 | " # create new OrderedDict that does not contain `module.`\n", 252 | " from collections import OrderedDict\n", 253 | " new_state_dict = OrderedDict()\n", 254 | " for k, v in state_dict.items():\n", 255 | " name = k[7:] # remove `module.`\n", 256 | " new_state_dict[name] = v\n", 257 | " # load params\n", 258 | " model.load_state_dict(new_state_dict)\n", 259 | "\n" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 33, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "from clip.model import CLIP\n", 269 | "def get_random_init_models(checkpoint_tar_name):\n", 270 | "\n", 271 | " resnet18 = models.resnet50(pretrained=False) # actually resnet 50\n", 272 | " load_model_states(resnet18, '../deepens_imagenet/' + checkpoint_tar_name)\n", 273 | "\n", 274 | " modules=list(resnet18.children())[:-1]\n", 275 | " resnet18=nn.Sequential(*modules)\n", 276 | " for p in resnet18.parameters():\n", 277 | " p.requires_grad = False\n", 278 | "\n", 279 | " resnet18.cuda().eval()\n", 280 | " target_model = resnet18\n", 281 | " return target_model\n" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "# Extractor loop\n" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 34, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "target_image_features_list (5000, 2048)\n", 301 | "expriment_idx 0\n", 302 | "Feature Extraction completed in 0m 15s\n", 303 | "target_image_features_list (5000, 2048)\n", 304 | "expriment_idx 1\n", 305 | "Feature Extraction completed in 0m 31s\n", 306 | "target_image_features_list (5000, 2048)\n", 307 | "expriment_idx 2\n", 308 | "Feature Extraction completed in 0m 48s\n", 309 | "target_image_features_list (5000, 2048)\n", 310 | "expriment_idx 3\n", 311 | "Feature Extraction completed in 1m 2s\n", 312 | "target_image_features_list (5000, 2048)\n", 313 | "expriment_idx 4\n", 314 | "Feature Extraction completed in 1m 17s\n", 315 | "target_image_features_list (5000, 2048)\n", 316 | "expriment_idx 5\n", 317 | "Feature Extraction completed in 1m 32s\n", 318 | "target_image_features_list (5000, 2048)\n", 319 | "expriment_idx 6\n", 320 | "Feature Extraction completed in 1m 48s\n", 321 | "target_image_features_list (5000, 2048)\n", 322 | "expriment_idx 7\n", 323 | "Feature Extraction completed in 2m 6s\n", 324 | "target_image_features_list (5000, 2048)\n", 325 | "expriment_idx 8\n", 326 | "Feature Extraction completed in 2m 21s\n", 327 | "target_image_features_list (5000, 2048)\n", 328 | "expriment_idx 9\n", 329 | "Feature Extraction completed in 2m 40s\n", 330 | "target_image_features_list (5000, 2048)\n", 331 | "expriment_idx 10\n", 332 | "Feature Extraction completed in 3m 3s\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "since = time.time()\n", 338 | "dataloaders = {\n", 339 | " # 'train': coco_train_dataloader, \n", 340 | " 'val': coco_val_dataloader,\n", 341 | "}\n", 342 | "\n", 343 | "\n", 344 | "# Each epoch has a training and validation phase\n", 345 | "for expriment_idx in range(len(deepens_imagenet)):\n", 346 | " phase = 'val'\n", 347 | " target_model = get_random_init_models(checkpoint_tar_name=deepens_imagenet[expriment_idx])\n", 348 | "\n", 349 | " ##################################\n", 350 | " # Fields to be stored for postprocessing \n", 351 | " ##################################\n", 352 | "\n", 353 | " target_image_features_list = []\n", 354 | 
"\n", 355 | " # Iterate over data.\n", 356 | " for inputs, captions in dataloaders[phase]:\n", 357 | " image_input = inputs.cuda(non_blocking=True)\n", 358 | " text_input = captions.cuda(non_blocking=True)\n", 359 | " \n", 360 | " with torch.set_grad_enabled(False):\n", 361 | " target_image_features = target_model(image_input).squeeze() \n", 362 | " ##################################\n", 363 | " # Evaluation book-keeping Field \n", 364 | " ##################################\n", 365 | " target_image_features_list.append( target_image_features.cpu().numpy() )\n", 366 | "\n", 367 | " ##################################\n", 368 | " # Evaluation book-keeping Field \n", 369 | " ##################################\n", 370 | " target_image_features_list = np.concatenate( target_image_features_list, axis=0)\n", 371 | " print('target_image_features_list', target_image_features_list.shape)\n", 372 | "\n", 373 | " dump_result_dict = {\n", 374 | " \"target_image_features_list\": target_image_features_list, \n", 375 | " }\n", 376 | " \n", 377 | " feature_dir = 'features200'\n", 378 | " os.makedirs(feature_dir, exist_ok = True) \n", 379 | " with open(os.path.join(feature_dir, 'feature_dump_{}.pkl'.format(expriment_idx) ), \"wb\") as pkl_file:\n", 380 | " pickle.dump(\n", 381 | " dump_result_dict, \n", 382 | " pkl_file, \n", 383 | " )\n", 384 | "\n", 385 | " time_elapsed = time.time() - since\n", 386 | " print('expriment_idx', expriment_idx)\n", 387 | " print('Feature Extraction completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [] 396 | } 397 | ], 398 | "metadata": { 399 | "accelerator": "GPU", 400 | "colab": { 401 | "collapsed_sections": [], 402 | "name": "Interacting with CLIP.ipynb", 403 | "provenance": [] 404 | }, 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "name": "python3" 408 | }, 409 | "language_info": { 410 | "codemirror_mode": { 411 | "name": "ipython", 412 | "version": 3 413 | }, 414 | "file_extension": ".py", 415 | "mimetype": "text/x-python", 416 | "name": "python", 417 | "nbconvert_exporter": "python", 418 | "pygments_lexer": "ipython3", 419 | "version": "3.9.7" 420 | }, 421 | "widgets": { 422 | "application/vnd.jupyter.widget-state+json": { 423 | "12e23e2819094ee0a079d4eb77cfc4f9": { 424 | "model_module": "@jupyter-widgets/base", 425 | "model_module_version": "1.2.0", 426 | "model_name": "LayoutModel", 427 | "state": { 428 | "_model_module": "@jupyter-widgets/base", 429 | "_model_module_version": "1.2.0", 430 | "_model_name": "LayoutModel", 431 | "_view_count": null, 432 | "_view_module": "@jupyter-widgets/base", 433 | "_view_module_version": "1.2.0", 434 | "_view_name": "LayoutView", 435 | "align_content": null, 436 | "align_items": null, 437 | "align_self": null, 438 | "border": null, 439 | "bottom": null, 440 | "display": null, 441 | "flex": null, 442 | "flex_flow": null, 443 | "grid_area": null, 444 | "grid_auto_columns": null, 445 | "grid_auto_flow": null, 446 | "grid_auto_rows": null, 447 | "grid_column": null, 448 | "grid_gap": null, 449 | "grid_row": null, 450 | "grid_template_areas": null, 451 | "grid_template_columns": null, 452 | "grid_template_rows": null, 453 | "height": null, 454 | "justify_content": null, 455 | "justify_items": null, 456 | "left": null, 457 | "margin": null, 458 | "max_height": null, 459 | "max_width": null, 460 | "min_height": null, 461 | "min_width": null, 462 | 
"object_fit": null, 463 | "object_position": null, 464 | "order": null, 465 | "overflow": null, 466 | "overflow_x": null, 467 | "overflow_y": null, 468 | "padding": null, 469 | "right": null, 470 | "top": null, 471 | "visibility": null, 472 | "width": null 473 | } 474 | }, 475 | "1369964d45004b5e95a058910b2a33e6": { 476 | "model_module": "@jupyter-widgets/controls", 477 | "model_module_version": "1.5.0", 478 | "model_name": "HBoxModel", 479 | "state": { 480 | "_dom_classes": [], 481 | "_model_module": "@jupyter-widgets/controls", 482 | "_model_module_version": "1.5.0", 483 | "_model_name": "HBoxModel", 484 | "_view_count": null, 485 | "_view_module": "@jupyter-widgets/controls", 486 | "_view_module_version": "1.5.0", 487 | "_view_name": "HBoxView", 488 | "box_style": "", 489 | "children": [ 490 | "IPY_MODEL_7a5f52e56ede4ac3abe37a3ece007dc9", 491 | "IPY_MODEL_ce8b0faa1a1340b5a504d7b3546b3ccb" 492 | ], 493 | "layout": "IPY_MODEL_12e23e2819094ee0a079d4eb77cfc4f9" 494 | } 495 | }, 496 | "161969cae25a49f38aacd1568d3cac6c": { 497 | "model_module": "@jupyter-widgets/base", 498 | "model_module_version": "1.2.0", 499 | "model_name": "LayoutModel", 500 | "state": { 501 | "_model_module": "@jupyter-widgets/base", 502 | "_model_module_version": "1.2.0", 503 | "_model_name": "LayoutModel", 504 | "_view_count": null, 505 | "_view_module": "@jupyter-widgets/base", 506 | "_view_module_version": "1.2.0", 507 | "_view_name": "LayoutView", 508 | "align_content": null, 509 | "align_items": null, 510 | "align_self": null, 511 | "border": null, 512 | "bottom": null, 513 | "display": null, 514 | "flex": null, 515 | "flex_flow": null, 516 | "grid_area": null, 517 | "grid_auto_columns": null, 518 | "grid_auto_flow": null, 519 | "grid_auto_rows": null, 520 | "grid_column": null, 521 | "grid_gap": null, 522 | "grid_row": null, 523 | "grid_template_areas": null, 524 | "grid_template_columns": null, 525 | "grid_template_rows": null, 526 | "height": null, 527 | "justify_content": null, 528 | "justify_items": null, 529 | "left": null, 530 | "margin": null, 531 | "max_height": null, 532 | "max_width": null, 533 | "min_height": null, 534 | "min_width": null, 535 | "object_fit": null, 536 | "object_position": null, 537 | "order": null, 538 | "overflow": null, 539 | "overflow_x": null, 540 | "overflow_y": null, 541 | "padding": null, 542 | "right": null, 543 | "top": null, 544 | "visibility": null, 545 | "width": null 546 | } 547 | }, 548 | "4a61c10fc00c4f04bb00b82e942da210": { 549 | "model_module": "@jupyter-widgets/base", 550 | "model_module_version": "1.2.0", 551 | "model_name": "LayoutModel", 552 | "state": { 553 | "_model_module": "@jupyter-widgets/base", 554 | "_model_module_version": "1.2.0", 555 | "_model_name": "LayoutModel", 556 | "_view_count": null, 557 | "_view_module": "@jupyter-widgets/base", 558 | "_view_module_version": "1.2.0", 559 | "_view_name": "LayoutView", 560 | "align_content": null, 561 | "align_items": null, 562 | "align_self": null, 563 | "border": null, 564 | "bottom": null, 565 | "display": null, 566 | "flex": null, 567 | "flex_flow": null, 568 | "grid_area": null, 569 | "grid_auto_columns": null, 570 | "grid_auto_flow": null, 571 | "grid_auto_rows": null, 572 | "grid_column": null, 573 | "grid_gap": null, 574 | "grid_row": null, 575 | "grid_template_areas": null, 576 | "grid_template_columns": null, 577 | "grid_template_rows": null, 578 | "height": null, 579 | "justify_content": null, 580 | "justify_items": null, 581 | "left": null, 582 | "margin": null, 583 | "max_height": null, 584 | 
"max_width": null, 585 | "min_height": null, 586 | "min_width": null, 587 | "object_fit": null, 588 | "object_position": null, 589 | "order": null, 590 | "overflow": null, 591 | "overflow_x": null, 592 | "overflow_y": null, 593 | "padding": null, 594 | "right": null, 595 | "top": null, 596 | "visibility": null, 597 | "width": null 598 | } 599 | }, 600 | "5e6adc4592124a4581b85f4c1f3bab4d": { 601 | "model_module": "@jupyter-widgets/controls", 602 | "model_module_version": "1.5.0", 603 | "model_name": "ProgressStyleModel", 604 | "state": { 605 | "_model_module": "@jupyter-widgets/controls", 606 | "_model_module_version": "1.5.0", 607 | "_model_name": "ProgressStyleModel", 608 | "_view_count": null, 609 | "_view_module": "@jupyter-widgets/base", 610 | "_view_module_version": "1.2.0", 611 | "_view_name": "StyleView", 612 | "bar_color": null, 613 | "description_width": "initial" 614 | } 615 | }, 616 | "7a5f52e56ede4ac3abe37a3ece007dc9": { 617 | "model_module": "@jupyter-widgets/controls", 618 | "model_module_version": "1.5.0", 619 | "model_name": "FloatProgressModel", 620 | "state": { 621 | "_dom_classes": [], 622 | "_model_module": "@jupyter-widgets/controls", 623 | "_model_module_version": "1.5.0", 624 | "_model_name": "FloatProgressModel", 625 | "_view_count": null, 626 | "_view_module": "@jupyter-widgets/controls", 627 | "_view_module_version": "1.5.0", 628 | "_view_name": "ProgressView", 629 | "bar_style": "success", 630 | "description": "", 631 | "description_tooltip": null, 632 | "layout": "IPY_MODEL_4a61c10fc00c4f04bb00b82e942da210", 633 | "max": 169001437, 634 | "min": 0, 635 | "orientation": "horizontal", 636 | "style": "IPY_MODEL_5e6adc4592124a4581b85f4c1f3bab4d", 637 | "value": 169001437 638 | } 639 | }, 640 | "b597cd6f6cd443aba4bf4491ac7f957e": { 641 | "model_module": "@jupyter-widgets/controls", 642 | "model_module_version": "1.5.0", 643 | "model_name": "DescriptionStyleModel", 644 | "state": { 645 | "_model_module": "@jupyter-widgets/controls", 646 | "_model_module_version": "1.5.0", 647 | "_model_name": "DescriptionStyleModel", 648 | "_view_count": null, 649 | "_view_module": "@jupyter-widgets/base", 650 | "_view_module_version": "1.2.0", 651 | "_view_name": "StyleView", 652 | "description_width": "" 653 | } 654 | }, 655 | "ce8b0faa1a1340b5a504d7b3546b3ccb": { 656 | "model_module": "@jupyter-widgets/controls", 657 | "model_module_version": "1.5.0", 658 | "model_name": "HTMLModel", 659 | "state": { 660 | "_dom_classes": [], 661 | "_model_module": "@jupyter-widgets/controls", 662 | "_model_module_version": "1.5.0", 663 | "_model_name": "HTMLModel", 664 | "_view_count": null, 665 | "_view_module": "@jupyter-widgets/controls", 666 | "_view_module_version": "1.5.0", 667 | "_view_name": "HTMLView", 668 | "description": "", 669 | "description_tooltip": null, 670 | "layout": "IPY_MODEL_161969cae25a49f38aacd1568d3cac6c", 671 | "placeholder": "", 672 | "style": "IPY_MODEL_b597cd6f6cd443aba4bf4491ac7f957e", 673 | "value": " 169001984/? 
[00:06<00:00, 25734958.25it/s]" 674 | } 675 | } 676 | } 677 | } 678 | }, 679 | "nbformat": 4, 680 | "nbformat_minor": 0 681 | } 682 | -------------------------------------------------------------------------------- /Figure_3_Contrastive_Learning/get_gap_stats.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import argparse\n", 10 | "import os\n", 11 | "import random\n", 12 | "import shutil\n", 13 | "import time\n", 14 | "import warnings\n", 15 | "from enum import Enum\n", 16 | "import pickle\n", 17 | "import numpy as np\n", 18 | "from collections import defaultdict\n", 19 | "\n", 20 | "import torch\n", 21 | "import torch.nn as nn\n", 22 | "import torch.optim\n", 23 | "from torch.utils.data import Dataset, DataLoader\n", 24 | "import torch.backends.cudnn as cudnn\n", 25 | "\n", 26 | "def my_norm(x):\n", 27 | " return x/np.linalg.norm(x, axis=-1, keepdims=True)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "pickle_path = './features/feature_dump_val.pkl'\n", 37 | "with open(pickle_path, 'rb') as pkl_file:\n", 38 | " data_dict = pickle.load(pkl_file)\n", 39 | " assert len(data_dict['clip_image_features_list']) == len(data_dict['clip_text_features_list'])\n", 40 | " # assert len(data_dict['clip_image_features_list']) == len(data_dict['target_image_features_list'])\n", 41 | " print('Number of image-text pairs', len(data_dict['clip_image_features_list']))" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Get the gap\n", 51 | "modality_gap = my_norm(my_norm(data_dict['clip_image_features_list']).mean(axis=0) - my_norm(data_dict['clip_text_features_list']).mean(axis=0))\n", 52 | "# # save as a gap vector\n", 53 | "# with open(os.path.join('modality_gap_vector.pkl' ), \"wb\") as pkl_file:\n", 54 | "# pickle.dump(\n", 55 | "# modality_gap, \n", 56 | "# pkl_file, \n", 57 | "# )" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 20, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "modifying_results\n" 70 | ] 71 | }, 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 93 | " | distance | \n", 94 | "delta | \n", 95 | "
---|---|---|
0 | \n", 100 | "1.412827 | \n", 101 | "-1.00 | \n", 102 | "
1 | \n", 105 | "1.305014 | \n", 106 | "-0.75 | \n", 107 | "
2 | \n", 110 | "1.173114 | \n", 111 | "-0.50 | \n", 112 | "
3 | \n", 115 | "1.013107 | \n", 116 | "-0.25 | \n", 117 | "
4 | \n", 120 | "0.822103 | \n", 121 | "0.00 | \n", 122 | "
5 | \n", 125 | "0.599846 | \n", 126 | "0.25 | \n", 127 | "
6 | \n", 130 | "0.350383 | \n", 131 | "0.50 | \n", 132 | "
7 | \n", 135 | "0.083829 | \n", 136 | "0.75 | \n", 137 | "
\n", 218 | " | distance | \n", 219 | "delta | \n", 220 | "
---|---|---|
0 | \n", 225 | "1.287905 | \n", 226 | "-1.00 | \n", 227 | "
1 | \n", 230 | "1.224927 | \n", 231 | "-0.75 | \n", 232 | "
2 | \n", 235 | "1.134930 | \n", 236 | "-0.50 | \n", 237 | "
3 | \n", 240 | "1.005094 | \n", 241 | "-0.25 | \n", 242 | "
4 | \n", 245 | "0.822103 | \n", 246 | "0.00 | \n", 247 | "
5 | \n", 250 | "0.584231 | \n", 251 | "0.25 | \n", 252 | "
6 | \n", 255 | "0.317848 | \n", 256 | "0.50 | \n", 257 | "
7 | \n", 260 | "0.070293 | \n", 261 | "0.75 | \n", 262 | "
106 |
107 |
116 |
117 |
118 |
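The cell above derives the modality-gap direction from the dumped CLIP features, and the tables report a Euclidean distance between the two modality centroids for a range of delta values. The following is a minimal, hypothetical sketch of such a delta sweep, not the repository's own implementation: the pickle path and the `clip_image_features_list` / `clip_text_features_list` keys are taken from get_gap_stats.ipynb, while the helper name `l2_normalize` and the exact shifting convention (here, the image embeddings are slid along the unit gap vector and re-normalized) are assumptions, so only the delta = 0 row is expected to reproduce the 0.822103 value from the tables.

```python
import pickle
import numpy as np
import pandas as pd

def l2_normalize(x):
    # Row-wise L2 normalization, playing the same role as my_norm() in the notebooks above.
    return x / np.linalg.norm(x, axis=-1, keepdims=True)

# Load the dumped CLIP validation features (path as used in get_gap_stats.ipynb).
with open('./features/feature_dump_val.pkl', 'rb') as f:
    data = pickle.load(f)

img = l2_normalize(np.asarray(data['clip_image_features_list']))
txt = l2_normalize(np.asarray(data['clip_text_features_list']))

# Unit vector pointing from the text centroid toward the image centroid.
gap = img.mean(axis=0) - txt.mean(axis=0)
gap = gap / np.linalg.norm(gap)

rows = []
for delta in [-1.00, -0.75, -0.50, -0.25, 0.00, 0.25, 0.50, 0.75]:
    # Assumed convention: slide the image embeddings along the gap direction
    # by delta (positive delta closes the gap), then re-normalize to the sphere.
    shifted_img = l2_normalize(img - delta * gap)
    distance = np.linalg.norm(shifted_img.mean(axis=0) - txt.mean(axis=0))
    rows.append({'distance': float(distance), 'delta': delta})

print(pd.DataFrame(rows))
```

Re-normalizing after the shift keeps every embedding on the unit hypersphere, which matches the normalized-feature setting used throughout these notebooks; without knowing whether the repository shifts the image side, the text side, or both, the non-zero delta rows will only approximate the tabulated distances.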