├── .gitignore ├── 1. madex ├── madex_example_dna.ipynb ├── madex_example_graph.ipynb ├── madex_example_image.ipynb ├── madex_example_text.ipynb ├── neural_interaction_detection.py ├── sampling_and_inference.py └── utils │ ├── data │ ├── cora │ │ ├── README │ │ ├── cora.cites │ │ └── cora.content │ └── sample_images │ │ ├── bus.jpg │ │ ├── dog.jpg │ │ ├── shark.jpg │ │ └── viaduct.jpg │ ├── dna_utils.py │ ├── general_utils.py │ ├── graph_utils.py │ ├── image_utils.py │ ├── lime │ ├── lime_base.py │ └── lime_text.py │ ├── linear_cross_utils.py │ ├── pretrained │ ├── dna_cnn.pt │ ├── gcn_cora.pt │ └── model_gcn.py │ └── text_utils.py ├── 2. glider ├── data │ └── initial_data_prep │ │ ├── avazu │ │ ├── config.py │ │ └── preprocess.py │ │ ├── criteo │ │ ├── config.py │ │ ├── preprocess.py │ │ └── scale.py │ │ ├── kdd2012 │ │ ├── config.py │ │ ├── preprocess.py │ │ └── scale.py │ │ └── kfold_split │ │ ├── config.py │ │ └── stratifiedKfold.py ├── detect_global_interactions.py ├── make_cross_feature_data.py ├── models │ └── autoint │ │ ├── README.md │ │ ├── model.py │ │ └── train.py ├── train_deepctr.py └── utils │ ├── cross_feature_utils.py │ └── global_interaction_utils.py ├── README.md ├── figures ├── explanation1.png ├── explanation2.png └── overview.png └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .data 2 | .ipynb_checkpoints 3 | __pycache__ 4 | *.npy 5 | *.h5 6 | *.ckpt-* 7 | *.pyc 8 | *.save 9 | *.swp 10 | *.zip 11 | backup* 12 | -------------------------------------------------------------------------------- /1. madex/madex_example_dna.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchtext import datasets, data\n", 10 | "import numpy as np\n", 11 | "import os, sys\n", 12 | "from time import time\n", 13 | "\n", 14 | "sys.path.append(\"../1. 
madex\")\n", 15 | "\n", 16 | "from neural_interaction_detection import *\n", 17 | "from sampling_and_inference import *\n", 18 | "from utils.dna_utils import *\n", 19 | "\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import warnings\n", 23 | "warnings.filterwarnings(\"ignore\")\n", 24 | "\n", 25 | "%load_ext autoreload\n", 26 | "%autoreload 2\n", 27 | "\n", 28 | "device = torch.device(\"cuda:0\")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Load Model" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model = load_dna_model(\"utils/pretrained/dna_cnn.pt\").to(device)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Get DNA Sequence" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "GTAGGTAAGCGCACGTGTTGCACTTCCCTTAATCCA True\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "np.random.seed(42)\n", 69 | "seq_instance = generate_random_dna_sequence_with_CACGTG()\n", 70 | "print(seq_instance, \"CACGTG\" in seq_instance)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Run MADEX" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "100%|██████████| 60/60 [00:02<00:00, 29.46it/s]\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "data_inst = {\"orig\": seq_instance, \"vectorizer\": encode_dna_onehot}\n", 95 | "Xs, Ys = generate_perturbation_dataset_dna(data_inst, model, device, seed=42)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "0.0046 test loss, 16.0 seconds elapsed\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "t0 = time()\n", 113 | "interactions, mlp_loss = detect_interactions(Xs, Ys, weight_samples=False, seed=42, verbose=False, add_linear=False)\n", 114 | "print(\"{} test loss, {} seconds elapsed\".format(round(mlp_loss, 4), round(time() - t0, 1)))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "interaction ranking \n", 127 | "\n", 128 | "1 found CACGTG >> ('C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16')\n", 129 | "2 ('A_21', 'C_25')\n", 130 | "3 ('C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'A_21')\n", 131 | "4 ('C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18')\n", 132 | "5 ('A_21', 'C_25', 'C_26')\n", 133 | "6 ('A_21', 'T_23', 'C_25', 'C_26')\n", 134 | "7 ('A_21', 'T_23', 'C_25', 'C_26', 'T_28')\n", 135 | "8 ('A_2', 'C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18')\n", 136 | "9 ('A_2', 'A_6', 'C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18')\n", 137 | "10 ('A_2', 'A_6', 'C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18', 'C_20')\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "print(\"interaction ranking\", \"\\n\")\n", 143 | "for rank, inter in enumerate(interactions[:10]):\n", 144 | " inter_indices, _ = inter\n", 145 | " inter_verbose = tuple((seq_instance[s], s) for s in inter_indices)\n", 146 | "\n", 147 | " 
inter_nucleotides, _ = zip(*inter_verbose)\n", 148 | " if \"\".join(inter_nucleotides) == \"CACGTG\" and all(np.diff(inter_indices) == 1):\n", 149 | " postfix = \"found CACGTG >>\"\n", 150 | " else:\n", 151 | " postfix = \"\"\n", 152 | " print(rank+1, postfix, tuple(a + \"_\" + str(b) for a,b in inter_verbose))\n", 153 | "\n" 154 | ] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "Python [conda env:torch]", 160 | "language": "python", 161 | "name": "conda-env-torch-py" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.6.2" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 4 178 | } 179 | -------------------------------------------------------------------------------- /1. madex/madex_example_graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchtext import datasets, data\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import numpy as np\n", 12 | "import os, sys\n", 13 | "from time import time\n", 14 | "\n", 15 | "from neural_interaction_detection import *\n", 16 | "from sampling_and_inference import *\n", 17 | "from utils.general_utils import *\n", 18 | "from utils.graph_utils import *\n", 19 | "\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import warnings\n", 23 | "warnings.filterwarnings(\"ignore\")\n", 24 | "\n", 25 | "%load_ext autoreload\n", 26 | "%autoreload 2\n", 27 | "\n", 28 | "device = torch.device(\"cuda:0\")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Load Model" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model_folder = \"utils/pretrained\"\n", 45 | "\n", 46 | "model, n_nodes, n_hops, test_idxs = get_graph_model(model_folder)\n", 47 | "model = model.to(device)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Classify Graph" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "data_folder = \"utils/data/cora\"\n", 64 | "\n", 65 | "node_feats, adj_mat, labels = load_cora(data_folder, device)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "target node classification: 6\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "target_idx = test_idxs[0]\n", 83 | "\n", 84 | "preds = model(node_feats, convert_adj_to_da(adj_mat))\n", 85 | "classification = torch.argmax(preds, 1).cpu().numpy()[target_idx] \n", 86 | "print(\"target node classification:\", classification)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Run MADEX" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "100%|██████████| 6000/6000 [01:40<00:00, 59.72it/s]\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | 
"data_inst = {\"nodes\": node_feats, \"edges\": adj_mat, \"test_idxs\": test_idxs}\n", 111 | "Xs, Ys = generate_perturbation_dataset_graph(data_inst, model, target_idx, n_hops+1, device, seed=42, std_scale=False)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "19.4754 test loss, 94.2 seconds elapsed\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "t0 = time()\n", 129 | "interactions, mlp_loss = detect_interactions(Xs, Ys, weight_samples=True, seed=42, verbose=False)\n", 130 | "print(\"{} test loss, {} seconds elapsed\".format(round(mlp_loss, 4), round(time() - t0, 1)))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Show Main Effects and Interaction Interpretations" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "legend: (hops from target node, node idx). All hops should be within n_hops: 3\n", 150 | "\n", 151 | "target (0, 1808)\n", 152 | "\n", 153 | "main effects\n", 154 | "(2, 722)\n", 155 | "(2, 2465)\n", 156 | "(2, 264)\n", 157 | "(2, 1189)\n", 158 | "(2, 2146)\n", 159 | "\n", 160 | "interactions\n", 161 | "2\n", 162 | "inter 0: ((1, 638), (2, 722))\n", 163 | "4\n", 164 | "inter 1: ((2, 264), (1, 638), (2, 722), (2, 2465))\n", 165 | "5\n", 166 | "inter 2: ((2, 264), (1, 638), (2, 722), (2, 1189), (2, 2465))\n", 167 | "6\n", 168 | "inter 3: ((2, 264), (1, 638), (2, 722), (2, 1189), (2, 2146), (2, 2465))\n", 169 | "9\n", 170 | "inter 4: ((2, 264), (2, 294), (2, 296), (1, 638), (2, 722), (2, 1189), (2, 1327), (2, 2146), (2, 2465))\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "node_to_hop = get_hops_to_target(target_idx, adj_mat, n_hops)\n", 176 | "local_map = data_inst[\"local_idx_map\"]\n", 177 | "\n", 178 | "print(\"legend: (hops from target node, node idx). All hops should be within n_hops:\", n_hops)\n", 179 | "\n", 180 | "print(\"\\ntarget\", (0, target_idx))\n", 181 | "print(\"\\nmain effects\")\n", 182 | "for uni, att in get_lime_attributions(Xs, Ys)[:5]:\n", 183 | " if att > 0:\n", 184 | " print((node_to_hop[local_map[uni]],local_map[uni]))\n", 185 | "print(\"\\ninteractions\")\n", 186 | "for i, inter in enumerate(interactions[:5]):\n", 187 | " print(len(inter[0]))\n", 188 | " print(\"inter {}:\".format(i), tuple((node_to_hop[local_map[n]],local_map[n]) for n in inter[0]))\n" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python [conda env:torch]", 195 | "language": "python", 196 | "name": "conda-env-torch-py" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.6.2" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 4 213 | } 214 | -------------------------------------------------------------------------------- /1. 
madex/madex_example_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchtext import datasets, data\n", 10 | "import numpy as np\n", 11 | "import os, sys\n", 12 | "from time import time\n", 13 | "\n", 14 | "from neural_interaction_detection import *\n", 15 | "from sampling_and_inference import *\n", 16 | "from utils.general_utils import *\n", 17 | "from utils.text_utils import *\n", 18 | "\n", 19 | "import warnings\n", 20 | "warnings.filterwarnings(\"ignore\")\n", 21 | "%load_ext autoreload\n", 22 | "%autoreload 2\n", 23 | "\n", 24 | "device = torch.device(\"cuda:0\")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Load Model" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "Widget Javascript not detected. It may not be installed or enabled properly.\n" 44 | ] 45 | }, 46 | { 47 | "data": { 48 | "application/vnd.jupyter.widget-view+json": { 49 | "model_id": "f1b3bc5eb16f46edb5fcd643307db412" 50 | } 51 | }, 52 | "metadata": {}, 53 | "output_type": "display_data" 54 | }, 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "model = get_bert_model(device)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Classify Sentence" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "positive sentiment\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "sentence = \"this was not a great movie, but a good movie nevertheless\"\n", 89 | "\n", 90 | "out = model(sentence)\n", 91 | "pred = np.argmax(out[0])\n", 92 | "print((\"positive\" if pred== 1 else \"negative\") + \" sentiment\")" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Run MADEX" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stderr", 109 | "output_type": "stream", 110 | "text": [ 111 | "100%|██████████| 12/12 [00:10<00:00, 1.15it/s]\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "data_inst = {\"orig\": sentence}\n", 117 | "Xs, Ys = generate_perturbation_dataset_text(data_inst, model, 1, device, model_id=\"bert\", batch_size=500, seed=42, std_scale=True)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "0.0142 test loss, 29.9 seconds elapsed\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "t0 = time()\n", 135 | "interactions, mlp_loss = detect_interactions(Xs, Ys, detector=\"GradientNID\", add_linear=True, device=device, weight_samples=True, seed=42, verbose=False)\n", 136 | "print(\"{} test loss, {} seconds elapsed\".format(round(mlp_loss, 4), round(time() - t0, 1)))" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Show Main Effects and Interaction Interpretations" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 
6, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "this was not a great movie, but a good movie nevertheless\n", 156 | "\n", 157 | "main effects: ('but', 'a', 'good', 'movie', 'nevertheless')\n", 158 | "\n", 159 | "top-5 interactions\n", 160 | "inter 1: ('not', 'but') 2.7557428\n", 161 | "inter 2: ('but', 'good') 1.9747727\n", 162 | "inter 3: ('not', 'good') 1.8207084\n", 163 | "inter 4: ('great', 'good') 1.3452238\n", 164 | "inter 5: ('not', 'great') 1.2503706\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "print(sentence + \"\\n\")\n", 170 | "\n", 171 | "dom_map = data_inst[\"domain_mapper\"]\n", 172 | "\n", 173 | "lime_atts = get_lime_attributions(Xs, Ys)\n", 174 | "print(\"main effects:\", map_words([i for i, a in lime_atts if a*(pred*2-1) > 0], dom_map))\n", 175 | "\n", 176 | "print(\"\\ntop-5 interactions\")\n", 177 | "for i, inter_tuple in enumerate(interactions[:5]):\n", 178 | " inter, strength = inter_tuple\n", 179 | " word_inter = map_words(inter, dom_map)\n", 180 | " print(\"inter {}:\".format(i+1), word_inter, strength)" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python [conda env:torch]", 187 | "language": "python", 188 | "name": "conda-env-torch-py" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.6.2" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 4 205 | } 206 | -------------------------------------------------------------------------------- /1. 
madex/neural_interaction_detection.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import operator 3 | import numpy as np 4 | import torch 5 | from torch.utils import data 6 | import torch.nn as nn 7 | from utils.general_utils import * 8 | from torch import autograd 9 | 10 | 11 | def preprocess_weights(weights): 12 | w_later = np.abs(weights[-1]) 13 | w_input = np.abs(weights[0]) 14 | 15 | for i in range(len(weights) - 2, 0, -1): 16 | w_later = np.matmul(w_later, np.abs(weights[i])) 17 | 18 | return w_input, w_later 19 | 20 | 21 | def interpret_interactions_from_weights(w_input, w_later, get_main_effects=False): 22 | interaction_strengths = {} 23 | for i in range(w_later.shape[1]): 24 | sorted_hweights = sorted( 25 | enumerate(w_input[i]), key=lambda x: x[1], reverse=True 26 | ) 27 | interaction_candidate = [] 28 | candidate_weights = [] 29 | for j in range(w_input.shape[1]): 30 | bisect.insort(interaction_candidate, sorted_hweights[j][0]) 31 | candidate_weights.append(sorted_hweights[j][1]) 32 | 33 | if not get_main_effects and len(interaction_candidate) == 1: 34 | continue 35 | interaction_tup = tuple(interaction_candidate) 36 | if interaction_tup not in interaction_strengths: 37 | interaction_strengths[interaction_tup] = 0 38 | interaction_strength = (min(candidate_weights)) * (np.sum(w_later[:, i])) 39 | interaction_strengths[interaction_tup] += interaction_strength 40 | 41 | interaction_ranking = sorted( 42 | interaction_strengths.items(), key=operator.itemgetter(1), reverse=True 43 | ) 44 | 45 | return interaction_ranking 46 | 47 | 48 | 49 | def get_higher_order_grad(inter, model, x, device): 50 | x = torch.FloatTensor(x).to(device) 51 | x.requires_grad = True 52 | y = model(x) 53 | for i, v in enumerate(inter): 54 | if i == 0: 55 | grad = autograd.grad(y, x, create_graph=True)[0][v] # first feature 56 | else: 57 | grad = autograd.grad(grad, x, create_graph=True)[0][v] # second feature 58 | 59 | return grad.item()**2 60 | 61 | def get_second_order_grad(model, x, device): 62 | 63 | x = torch.FloatTensor(x).to(device) 64 | 65 | if x.nelement() < 2: 66 | return np.array([]) 67 | 68 | x.requires_grad = True 69 | 70 | y = model(x) 71 | grads = autograd.grad(y, x, create_graph=True)[0].squeeze() 72 | 73 | grad_list = [] 74 | for j, grad in enumerate(grads): 75 | grad2 = autograd.grad(grad, x, retain_graph = True)[0].squeeze() 76 | grad_list.append(grad2) 77 | 78 | grad_matrix = torch.stack(grad_list) 79 | return grad_matrix.cpu().numpy()**2 80 | 81 | 82 | def run_NID(weights): 83 | w_input, w_later = preprocess_weights(weights) 84 | interaction_ranking = interpret_interactions_from_weights(w_input, w_later) 85 | interaction_ranking_pruned = prune_redundant_interactions(interaction_ranking) 86 | 87 | return interaction_ranking_pruned 88 | 89 | def run_gradient_NID(mlp, x, grad_gpu): 90 | interaction_scores = {} 91 | 92 | if grad_gpu == -1: 93 | device = torch.device("cpu") 94 | else: 95 | device = torch.device("cuda:" + str(grad_gpu)) 96 | 97 | mlp = mlp.to(device) 98 | 99 | inter_matrix = get_second_order_grad(mlp, x, device) 100 | 101 | if len(inter_matrix) == 0: 102 | return [] 103 | 104 | inter_scores = [] 105 | 106 | for j in range(inter_matrix.shape[0]): 107 | for i in range(j): 108 | inter_scores.append(((i,j),inter_matrix[i,j])) 109 | 110 | inter_ranking = sorted(inter_scores, key=lambda x: -x[1]) 111 | 112 | return inter_ranking 113 | 114 | 115 | def prune_redundant_interactions(interaction_ranking, max_interactions=100): 
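    # Greedy de-duplication of the interaction ranking: walk candidates from
    # strongest to weakest and skip any candidate that is a strict subset of an
    # interaction that has already been kept, since it adds no features beyond a
    # stronger interaction already in the list. When a kept candidate strictly
    # contains previously tracked sets, those smaller sets are dropped from the
    # tracking list (not from the output) so that later candidates are compared
    # against the larger set. At most max_interactions interactions are returned.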
116 | interaction_ranking_pruned = [] 117 | current_superset_inters = [] 118 | for inter, strength in interaction_ranking: 119 | set_inter = set(inter) 120 | if len(interaction_ranking_pruned) >= max_interactions: 121 | break 122 | subset_inter_skip = False 123 | update_superset_inters = [] 124 | for superset_inter in current_superset_inters: 125 | if set_inter < superset_inter: 126 | subset_inter_skip = True 127 | break 128 | elif not (set_inter > superset_inter): 129 | update_superset_inters.append(superset_inter) 130 | if subset_inter_skip: 131 | continue 132 | current_superset_inters = update_superset_inters 133 | current_superset_inters.append(set_inter) 134 | interaction_ranking_pruned.append((inter, strength)) 135 | 136 | return interaction_ranking_pruned 137 | 138 | 139 | def detect_interactions( 140 | Xs, 141 | Ys, 142 | detector = "NID", 143 | x_instance_representation = None, 144 | arch=[256, 128, 64], 145 | batch_size=100, 146 | device=torch.device("cpu"), 147 | weight_samples=False, 148 | add_linear=False, 149 | l1_const=None, 150 | grad_gpu=-1, 151 | seed=None, 152 | **kwargs 153 | ): 154 | def get_weights(model): 155 | weights = [] 156 | for name, param in model.named_parameters(): 157 | if "interaction_mlp" in name and "weight" in name: 158 | weights.append(param.cpu().detach().numpy()) 159 | return weights 160 | 161 | assert(detector in {"NID", "GradientNID"}) 162 | 163 | if seed is not None: 164 | set_seed(seed) 165 | 166 | if type(Xs) != dict and type(Ys) != dict: 167 | Xs = {"train": Xs} 168 | Ys = {"train": Ys} 169 | 170 | Wd = get_sample_weights(Xs, enable=weight_samples, **kwargs) 171 | 172 | data_loaders = {} 173 | for k in Xs: 174 | feats = force_float(Xs[k]) 175 | targets = force_float(Ys[k]) 176 | sws = force_float(Wd[k]).unsqueeze(1) 177 | dataset = data.TensorDataset(feats, targets, sws) 178 | data_loaders[k] = data.DataLoader(dataset, batch_size) 179 | 180 | if detector == "GradientNID": 181 | act_func = nn.Softplus() 182 | if l1_const == None: 183 | l1_const = 0 184 | else: 185 | act_func = nn.ReLU() 186 | if l1_const == None: 187 | l1_const = 1e-4 188 | 189 | mlp = MLP(feats.shape[1], arch, add_linear=add_linear, act_func=act_func).to(device) 190 | 191 | mlp, mlp_loss = train(mlp, data_loaders, device=device, l1_const=l1_const, **kwargs) 192 | 193 | if detector == "NID": 194 | inters = run_NID(get_weights(mlp)) 195 | elif detector == "GradientNID": 196 | if x_instance_representation is None: 197 | x_instance_representation = np.ones((1,Xs["train"].shape[1])) 198 | inters = run_gradient_NID(mlp, x_instance_representation, grad_gpu) 199 | 200 | return inters, mlp_loss 201 | -------------------------------------------------------------------------------- /1. 
madex/sampling_and_inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from tqdm import tqdm 4 | import copy 5 | from utils.general_utils import * 6 | from utils.text_utils import * 7 | from utils.graph_utils import * 8 | from utils.dna_utils import * 9 | from utils.lime.lime_text import * 10 | 11 | 12 | def generate_binary_perturbations( 13 | num_feat, num_samples=100, init_on=True, perturbed_features=None 14 | ): 15 | if perturbed_features == None: 16 | perturbed_features = {"indices": np.array(range(num_feat))} 17 | num_perturb = len(perturbed_features["indices"]) 18 | 19 | samples_binary = np.ones((num_samples, num_feat), dtype=np.int8) 20 | perturb_binary = np.ones((num_samples, num_perturb), dtype=np.int8) 21 | num_flips = np.random.randint(1, num_perturb + 1, num_samples) 22 | 23 | for r in range(num_samples): 24 | if not (init_on and r == 0): 25 | num_flip = num_flips[r] 26 | perturb_binary[r, 0:num_flip] = np.zeros(num_flip, dtype=np.int8) 27 | np.random.shuffle(perturb_binary[r]) 28 | samples_binary[r, perturbed_features["indices"]] = perturb_binary[r] 29 | 30 | return samples_binary 31 | 32 | 33 | def generate_perturbation_dataset_autoint( 34 | data_inst, 35 | model, 36 | dense_feat_indices, 37 | sparse_feat_indices, 38 | num_samples=6000, 39 | seed=None, 40 | **kwargs 41 | ): 42 | if seed is not None: 43 | set_seed(seed) 44 | 45 | def inv_sigmoid(y): 46 | return np.log(y / (1 - y)) 47 | 48 | num_feats = len(dense_feat_indices) + len(sparse_feat_indices) 49 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 50 | 51 | means_arr = np.array([data_inst["means"][i] for i in dense_feat_indices]) 52 | 53 | perturb_Xv = [] 54 | perturb_Xi = [] 55 | for i in range(num_samples): 56 | raw_dense = data_inst["Xv"][dense_feat_indices] 57 | raw_sparse = data_inst["Xv"][sparse_feat_indices] 58 | binary_dense = samples_binary[i, dense_feat_indices] 59 | binary_sparse = samples_binary[i, sparse_feat_indices] 60 | perturb_raw_dense = raw_dense + binary_dense + means_arr * (1 - binary_dense) 61 | perturb_raw_sparse = binary_sparse * raw_sparse 62 | 63 | perturb_raw = np.zeros(num_feats) 64 | perturb_raw[dense_feat_indices] = perturb_raw_dense 65 | perturb_raw[sparse_feat_indices] = perturb_raw_sparse 66 | 67 | # perturb_raw = np.concatenate([perturb_raw_dense, perturb_raw_sparse]) 68 | perturb_Xv.append(perturb_raw) 69 | perturb_Xi.append(data_inst["Xi"]) 70 | perturb_Xv = np.stack(perturb_Xv) 71 | perturb_Xi = np.stack(perturb_Xi) 72 | 73 | samples_labels = inv_sigmoid(model.predict(perturb_Xi, perturb_Xv)) 74 | 75 | Xs, Ys = proprocess_data(samples_binary.astype(np.int64), samples_labels, **kwargs) 76 | return Xs, Ys 77 | 78 | 79 | def generate_perturbation_dataset_image( 80 | data_inst, 81 | model, 82 | class_idx, 83 | device, 84 | num_samples=6000, 85 | batch_size=100, 86 | seed=None, 87 | **kwargs 88 | ): 89 | # Based on LIME image: https://github.com/marcotcr/lime/blob/master/lime/lime_image.py 90 | 91 | if seed is not None: 92 | set_seed(seed) 93 | 94 | image = data_inst["orig"] 95 | segments = data_inst["segments"] 96 | num_feats = len(np.unique(segments)) 97 | 98 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 99 | 100 | image_means = image.copy() 101 | for i in np.unique(segments): 102 | image_means[segments == i] = ( 103 | np.mean(image[segments == i][:, 0]), 104 | np.mean(image[segments == i][:, 1]), 105 | np.mean(image[segments == i][:, 2]), 
106 | ) 107 | 108 | n_batches = int(np.ceil(num_samples / batch_size)) 109 | 110 | samples_labels = [] 111 | for i in tqdm(range(n_batches)): 112 | 113 | samples_binary_batch = samples_binary[i * batch_size : (i + 1) * batch_size] 114 | 115 | perturbed_imgs = [] 116 | for sample_binary in samples_binary_batch: 117 | temp = copy.deepcopy(image) 118 | zeros = np.where(sample_binary == 0)[0] 119 | mask = np.zeros(segments.shape).astype(bool) 120 | for z in zeros: 121 | mask[segments == z] = True 122 | temp[mask] = image_means[mask] 123 | 124 | perturbed_imgs.append(temp) 125 | 126 | torch_img = ( 127 | torch.FloatTensor(np.array(perturbed_imgs)).to(device).permute(0, 3, 1, 2) 128 | ) 129 | preds = model(torch_img).data.cpu().numpy() 130 | samples_labels.extend(preds) 131 | 132 | samples_labels = np.stack(samples_labels) 133 | 134 | Xs, Ys = proprocess_data(samples_binary, samples_labels[:, class_idx], **kwargs) 135 | 136 | return Xs, Ys 137 | 138 | 139 | def generate_perturbation_dataset_text( 140 | data_inst, 141 | model, 142 | class_idx, 143 | device, 144 | num_samples=6000, 145 | batch_size=100, 146 | seed=None, 147 | model_id=None, 148 | **kwargs 149 | ): 150 | # Based on LIME image: https://github.com/marcotcr/lime/blob/master/lime/lime_text.py 151 | 152 | if seed is not None: 153 | set_seed(seed) 154 | 155 | text = data_inst["orig"] 156 | 157 | indexed_string = IndexedString(text, bow=False) 158 | data_inst["domain_mapper"] = TextDomainMapper(indexed_string) 159 | 160 | num_feats = indexed_string.num_words() 161 | 162 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 163 | 164 | n_batches = int(np.ceil(num_samples / batch_size)) 165 | 166 | samples_labels = [] 167 | for i in tqdm(range(n_batches)): 168 | 169 | samples_binary_batch = samples_binary[i * batch_size : (i + 1) * batch_size] 170 | 171 | perturbed_text = [] 172 | for sample_binary in samples_binary_batch: 173 | 174 | indices2invert = np.argwhere(sample_binary == 0).squeeze() 175 | inv = indexed_string.inverse_removing(indices2invert) 176 | 177 | if model_id == "bert": 178 | ex = inv 179 | else: 180 | ex = data.Example.fromlist([inv], fields=[("text", data_inst["vectorizer"])]) 181 | perturbed_text.append(ex) 182 | 183 | if model_id == "bert": 184 | preds = model(perturbed_text) 185 | else: 186 | dset = data.Dataset(perturbed_text, fields=[("text", data_inst["vectorizer"])]) 187 | test_samples = data.Batch(data=perturbed_text, dataset=dset, device=device) 188 | preds = model(test_samples).data.cpu().numpy() 189 | 190 | samples_labels.append(preds) 191 | 192 | samples_labels = np.concatenate(samples_labels) 193 | 194 | Xs, Ys = proprocess_data(samples_binary, samples_labels[:, class_idx], **kwargs) 195 | 196 | return Xs, Ys 197 | 198 | 199 | def generate_perturbation_dataset_graph( 200 | data_inst, 201 | model, 202 | target_idx, 203 | n_hops, 204 | device, 205 | num_samples=6000, 206 | batch_size=500, 207 | seed=None, 208 | **kwargs 209 | ): 210 | def get_output(x, da): 211 | return model(x, da)[test_idxs].detach().cpu() 212 | 213 | if seed is not None: 214 | set_seed(seed) 215 | 216 | node_feats = data_inst["nodes"] 217 | adj_mat = data_inst["edges"] 218 | test_idxs = data_inst["test_idxs"] 219 | 220 | da_mat = convert_adj_to_da(adj_mat) 221 | 222 | # Collect all nodes within a k-hop neighborhood of the target test index 223 | adj_cum = copy.deepcopy(adj_mat) 224 | for i in range(n_hops - 1): 225 | adj_cum = torch.matmul(adj_cum, adj_mat) 226 | 227 | sum_v = 0 228 | counter = 0 229 | locality_dict = 
dict() 230 | locality_dict_rev = dict() 231 | for i, v in enumerate(adj_cum[target_idx]): 232 | if v != 0: 233 | sum_v += v 234 | locality_dict[i] = counter 235 | locality_dict_rev[counter] = i 236 | counter += 1 237 | local_num_nodes = len(locality_dict) 238 | 239 | data_inst["local_idx_map"] = locality_dict_rev 240 | 241 | samples_binary = generate_binary_perturbations(local_num_nodes, num_samples, True) 242 | 243 | # Get the features associated binary samples 244 | data_new = [] 245 | for i in range(node_feats.shape[0]): 246 | if i in locality_dict: 247 | data_new.append(samples_binary[:, locality_dict[i]]) 248 | else: 249 | data_new.append(np.zeros(num_samples)) 250 | data_new = np.array(data_new).transpose() 251 | 252 | # Get the test predictions associated binary samples 253 | results = [] 254 | for d in tqdm(data_new): 255 | mask = torch.FloatTensor(d).view(-1, 1).expand(node_feats.size()) 256 | masked_features = node_feats * mask.to(device) 257 | output = get_output(masked_features, da_mat).numpy() 258 | results.append(output) 259 | results = np.array(results) 260 | 261 | y_idx = test_idxs.index(target_idx) 262 | classifications = get_output(node_feats, da_mat).max(1)[1] 263 | 264 | samples_labels = results[:, y_idx, classifications[y_idx]] 265 | 266 | # samples_labels = [] 267 | # for ci, c in enumerate(classifications): 268 | # samples_labels.append(results[:, ci, c]) 269 | 270 | Xs, Ys = proprocess_data(samples_binary, samples_labels, **kwargs) 271 | 272 | return Xs, Ys 273 | 274 | 275 | def generate_perturbation_dataset_dna( 276 | data_inst, model, device, num_samples=6000, batch_size=100, seed=None, **kwargs 277 | ): 278 | 279 | if seed is not None: 280 | set_seed(seed) 281 | 282 | seq = data_inst["orig"] 283 | vectorizer = data_inst["vectorizer"] 284 | 285 | indexed_seq = IndexedNucleotides(seq) 286 | 287 | num_feats = indexed_seq.num_nucleotides() 288 | 289 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 290 | 291 | n_batches = int(np.ceil(num_samples / batch_size)) 292 | 293 | samples_labels = [] 294 | for i in tqdm(range(n_batches)): 295 | 296 | samples_binary_batch = samples_binary[i * batch_size : (i + 1) * batch_size] 297 | 298 | perturbed_seqs = [] 299 | for sample_binary in samples_binary_batch: 300 | 301 | indices2invert = np.argwhere(sample_binary == 0).squeeze() 302 | inv = indexed_seq.perturb_nucleotide(indices2invert) 303 | ex = vectorizer(inv) 304 | perturbed_seqs.append(ex) 305 | 306 | test_samples = torch.FloatTensor(perturbed_seqs).permute(0, 2, 1).to(device) 307 | preds = model(test_samples).data.cpu().numpy() 308 | 309 | samples_labels.append(preds) 310 | 311 | samples_labels = np.concatenate(samples_labels).squeeze() 312 | 313 | Xs, Ys = proprocess_data(samples_binary, samples_labels, **kwargs) 314 | 315 | return Xs, Ys 316 | -------------------------------------------------------------------------------- /1. madex/utils/data/cora/README: -------------------------------------------------------------------------------- 1 | This directory contains the a selection of the Cora dataset (www.research.whizbang.com/data). 2 | 3 | The Cora dataset consists of Machine Learning papers. These papers are classified into one of the following seven classes: 4 | Case_Based 5 | Genetic_Algorithms 6 | Neural_Networks 7 | Probabilistic_Methods 8 | Reinforcement_Learning 9 | Rule_Learning 10 | Theory 11 | 12 | The papers were selected in a way such that in the final corpus every paper cites or is cited by atleast one other paper. 
There are 2708 papers in the whole corpus. 13 | 14 | After stemming and removing stopwords we were left with a vocabulary of size 1433 unique words. All words with document frequency less than 10 were removed. 15 | 16 | 17 | THE DIRECTORY CONTAINS TWO FILES: 18 | 19 | The .content file contains descriptions of the papers in the following format: 20 | 21 | <paper_id> <word_attributes>+ <class_label> 22 | 23 | The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last entry in the line contains the class label of the paper. 24 | 25 | The .cites file contains the citation graph of the corpus. Each line describes a link in the following format: 26 | 27 | <ID of cited paper> <ID of citing paper> 28 | 29 | Each line contains two paper IDs. The first entry is the ID of the paper being cited and the second ID stands for the paper which contains the citation. The direction of the link is from right to left. If a line is represented by "paper1 paper2" then the link is "paper2->paper1". -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/bus.jpg -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/dog.jpg -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/shark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/shark.jpg -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/viaduct.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/viaduct.jpg -------------------------------------------------------------------------------- /1. 
madex/utils/dna_utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import h5py as h5 4 | import numpy as np 5 | from utils.general_utils import * 6 | 7 | # from sampling_and_inference import * 8 | 9 | 10 | class Flatten(nn.Module): 11 | def forward(self, input): 12 | return input.view(input.size(0), -1) 13 | 14 | 15 | def createConv1D(n_inp, n_out, hidden_units, kernel_size, seq_len, activation=nn.ReLU): 16 | 17 | layers = [] 18 | layers_size = [n_inp] + hidden_units 19 | for i in range(len(layers_size) - 1): 20 | layers.append(nn.Conv1d(layers_size[i], layers_size[i + 1], kernel_size)) 21 | if activation is not None: 22 | layers.append(activation()) 23 | layers.append(Flatten()) 24 | seq_len = seq_len - (kernel_size - 1) * len(hidden_units) 25 | linear_dim = layers_size[-1] * seq_len 26 | layers.append(nn.Linear(linear_dim, n_out)) 27 | 28 | return nn.Sequential(*layers) 29 | 30 | 31 | class conv1D(nn.Module): 32 | def __init__(self, n_inp, n_out, hidden_units, kernel_size, seq_len, **kwargs): 33 | super(conv1D, self).__init__() 34 | self.conv1D = createConv1D(n_inp, n_out, hidden_units, kernel_size, seq_len) 35 | 36 | def forward(self, x): 37 | return self.conv1D(x) 38 | 39 | 40 | def load_dna_model(path): 41 | model = conv1D(4, 1, [64, 64], 5, 36) 42 | model.load_state_dict(torch.load(path)) 43 | return model 44 | 45 | 46 | def generate_random_dna_sequence_with_CACGTG(length=36, seed=None): 47 | if seed is not None: 48 | set_seed(seed) 49 | 50 | nucleotides = ["A", "C", "G", "T"] 51 | seq = "" 52 | ebox = "CACGTG" 53 | for i in np.random.randint(0, 4, (length)): 54 | seq += nucleotides[i] 55 | i = np.random.randint(0, length - len(ebox)) 56 | seq = seq[:i] + ebox + seq[i + len(ebox) :] 57 | return seq 58 | 59 | 60 | def encode_dna_onehot(seq): 61 | seq_as_list = list(seq) 62 | 63 | for i, c in enumerate(seq_as_list): 64 | if c == "A": 65 | seq_as_list[i] = [1, 0, 0, 0] 66 | elif c == "T": 67 | seq_as_list[i] = [0, 1, 0, 0] 68 | elif c == "C": 69 | seq_as_list[i] = [0, 0, 1, 0] 70 | elif c == "G": 71 | seq_as_list[i] = [0, 0, 0, 1] 72 | else: 73 | seq_as_list[i] = [0, 0, 0, 0] 74 | 75 | return np.array(seq_as_list) 76 | 77 | 78 | class IndexedNucleotides(object): 79 | """String with various indexes.""" 80 | 81 | """Based on LIME official Repo""" 82 | 83 | def __init__(self, raw_string): 84 | """Initializer. 
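        Indexes every position of the raw nucleotide string so that
        individual positions can later be flipped to random alternative
        bases via perturb_nucleotide().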
85 | 86 | Args: 87 | raw_string: string with raw text in it 88 | """ 89 | self.raw = raw_string 90 | self.as_list = list(self.raw) 91 | self.as_np = np.array(self.as_list) 92 | self.string_start = np.arange(len(self.raw)) 93 | vocab = {} 94 | self.inverse_vocab = [] 95 | self.positions = [] 96 | non_vocab = set() 97 | for i, char in enumerate(self.as_np): 98 | if char in non_vocab: 99 | continue 100 | self.inverse_vocab.append(char) 101 | self.positions.append(i) 102 | self.positions = np.array(self.positions) 103 | 104 | def raw_string(self): 105 | """Returns the original raw string""" 106 | return self.raw 107 | 108 | def num_nucleotides(self): 109 | """Returns the number of tokens in the vocabulary for this document.""" 110 | return len(self.inverse_vocab) 111 | 112 | def choose_alt(self, existing): 113 | nucleotides = ["A", "T", "G", "C"] 114 | nucleotides.remove(existing) 115 | return nucleotides[np.random.randint(0, 3)] 116 | 117 | def perturb_nucleotide(self, chars_to_remove): 118 | mask = np.ones(self.as_np.shape[0], dtype="bool") 119 | mask[self.__get_idxs(chars_to_remove)] = False 120 | return "".join( 121 | [ 122 | self.as_list[i] if mask[i] else self.choose_alt(self.as_list[i]) 123 | for i in range(mask.shape[0]) 124 | ] 125 | ) 126 | 127 | def __get_idxs(self, chars): 128 | """Returns indexes to appropriate words.""" 129 | return self.positions[chars] 130 | -------------------------------------------------------------------------------- /1. madex/utils/general_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from utils.lime import lime_base 4 | import sklearn 5 | from sklearn.preprocessing import StandardScaler 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import copy 9 | 10 | 11 | def set_seed(seed=42): 12 | np.random.seed(seed) 13 | torch.manual_seed(seed) 14 | if torch.cuda.is_available(): 15 | torch.cuda.manual_seed(seed) 16 | 17 | 18 | def force_float(X_numpy): 19 | return torch.from_numpy(X_numpy.astype(np.float32)) 20 | 21 | 22 | def proprocess_data(X, Y, valid_size=500, test_size=500, std_scale=False): 23 | 24 | n, p = X.shape 25 | ## Make dataset splits 26 | ntrain, nval, ntest = n - valid_size - test_size, valid_size, test_size 27 | 28 | Xs = { 29 | "train": X[:ntrain], 30 | "val": X[ntrain : ntrain + nval], 31 | "test": X[ntrain + nval : ntrain + nval + ntest], 32 | } 33 | Ys = { 34 | "train": np.expand_dims(Y[:ntrain], axis=1), 35 | "val": np.expand_dims(Y[ntrain : ntrain + nval], axis=1), 36 | "test": np.expand_dims(Y[ntrain + nval : ntrain + nval + ntest], axis=1), 37 | } 38 | 39 | for k in Xs: 40 | if len(Xs[k]) == 0: 41 | assert k != "train" 42 | del Xs[k] 43 | del Ys[k] 44 | 45 | if std_scale: 46 | scaler = StandardScaler() 47 | scaler.fit(Ys["train"]) 48 | for k in Ys: 49 | Ys[k] = scaler.transform(Ys[k]) 50 | Ys["scaler"] = scaler 51 | 52 | return Xs, Ys 53 | 54 | 55 | 56 | class MLP(nn.Module): 57 | def __init__( 58 | self, 59 | num_features, 60 | hidden_units, 61 | add_linear=False, 62 | act_func=nn.ReLU(), 63 | ): 64 | super(MLP, self).__init__() 65 | 66 | self.hidden_units = hidden_units 67 | self.add_linear = add_linear 68 | self.interaction_mlp = create_mlp([num_features] + hidden_units + [1], act_func=act_func) 69 | 70 | self.add_linear = add_linear 71 | 72 | if add_linear: 73 | self.linear = nn.Linear(num_features, 1, bias=False) 74 | 75 | 76 | def forward(self, x): 77 | y = self.interaction_mlp(x) 78 | 79 | if self.add_linear: 80 | y += 
self.linear(x) 81 | return y 82 | 83 | 84 | def create_mlp(layer_sizes, out_bias=True, act_func=nn.ReLU()): 85 | ls = list(layer_sizes) 86 | layers = nn.ModuleList() 87 | for i in range(1, len(ls) - 1): 88 | layers.append(nn.Linear(int(ls[i - 1]), int(ls[i]))) 89 | layers.append(act_func) 90 | layers.append(nn.Linear(int(ls[-2]), int(ls[-1]), bias=out_bias)) 91 | return nn.Sequential(*layers) 92 | 93 | 94 | def train( 95 | net, 96 | data_loaders, 97 | criterion=nn.MSELoss(reduction="none"), 98 | nepochs=100, 99 | verbose=False, 100 | early_stopping=True, 101 | patience=5, 102 | l1_const=1e-4, 103 | l2_const=0, 104 | learning_rate=0.01, 105 | opt_func=optim.Adam, 106 | device=torch.device("cpu"), 107 | **kwargs 108 | ): 109 | optimizer = opt_func(net.parameters(), lr=learning_rate, weight_decay=l2_const) 110 | 111 | def include_sws(loss, sws): 112 | assert loss.shape == sws.shape 113 | return (loss * sws / sws.sum()).sum() 114 | 115 | def evaluate(net, data_loader, criterion, device): 116 | losses = [] 117 | sws = [] 118 | for inputs, targets, sws_batch in data_loader: 119 | inputs = inputs.to(device) 120 | targets = targets.to(device) 121 | loss = criterion(net(inputs), targets).cpu().data 122 | losses.append(loss) 123 | sws.append(sws_batch) 124 | return include_sws(torch.stack(losses), torch.stack(sws)).item() 125 | 126 | best_loss = float("inf") 127 | best_net = None 128 | 129 | if "val" not in data_loaders: 130 | early_stopping = False 131 | 132 | patience_counter = 0 133 | 134 | for epoch in range(nepochs): 135 | if verbose: 136 | print("epoch", epoch) 137 | running_loss = 0.0 138 | run_count = 0 139 | for i, data in enumerate(data_loaders["train"], 0): 140 | inputs, targets, sws = data 141 | inputs = inputs.to(device) 142 | targets = targets.to(device) 143 | sws = sws.to(device) 144 | optimizer.zero_grad() 145 | outputs = net(inputs) 146 | loss = include_sws(criterion(outputs, targets), sws) 147 | 148 | reg_loss = 0 149 | for name, param in net.named_parameters(): 150 | if "interaction_mlp" in name and "weight" in name: 151 | reg_loss += torch.sum(torch.abs(param)) 152 | 153 | (loss + reg_loss * l1_const).backward() 154 | optimizer.step() 155 | running_loss += loss.item() 156 | run_count += 1 157 | 158 | if epoch % 1 == 0: 159 | key = "val" if "val" in data_loaders else "train" 160 | val_loss = evaluate(net, data_loaders[key], criterion, device) 161 | 162 | if verbose: 163 | print( 164 | "[%d, %5d] train loss: %.4f, val loss: %.4f" 165 | % (epoch + 1, nepochs, running_loss / run_count, val_loss) 166 | ) 167 | if early_stopping: 168 | if val_loss < best_loss: 169 | best_loss = val_loss 170 | best_net = copy.deepcopy(net) 171 | patience_counter = 0 172 | else: 173 | patience_counter += 1 174 | if patience_counter > patience: 175 | net = best_net 176 | val_loss = best_loss 177 | if verbose: 178 | print("early stopping!") 179 | break 180 | 181 | prev_loss = running_loss 182 | running_loss = 0.0 183 | 184 | if "test" in data_loaders: 185 | key = "test" 186 | elif "val" in data_loaders: 187 | key = "val" 188 | else: 189 | key = "train" 190 | test_loss = evaluate(net, data_loaders[key], criterion, device) 191 | 192 | if verbose: 193 | print("Finished Training. 
Test loss: ", test_loss) 194 | 195 | return net, test_loss 196 | 197 | 198 | def merge_overlapping_sets( 199 | prediction_scores, 200 | interaction_atts, 201 | overlap_thresh=0.5, 202 | rel_gain_threshold=0, 203 | patience=1, 204 | num_features=None, 205 | ): 206 | def overlap_coef(A, B): 207 | A = set(A) 208 | B = set(B) 209 | return len(A & B) / min(len(A), len(B)) 210 | 211 | def merge_sets(inter_sets): 212 | prev_sets = None 213 | inter_sets = list(inter_sets) 214 | inter_sets_merged = inter_sets 215 | while inter_sets != prev_sets: 216 | prev_sets = list(inter_sets) 217 | for A in inter_sets: 218 | for B in inter_sets_merged: 219 | if A != B: 220 | if overlap_coef(A, B) >= overlap_thresh: 221 | inter_sets_merged.append( 222 | tuple(sorted(set(A) | set(B))) 223 | ) # merge 224 | if A in inter_sets_merged: 225 | inter_sets_merged.remove(A) 226 | if B in inter_sets_merged: 227 | inter_sets_merged.remove(B) 228 | 229 | inter_sets = list(set(inter_sets_merged)) 230 | return inter_sets 231 | 232 | def threshold_inter_sets(interaction_atts, prediction_scores): 233 | scores = prediction_scores 234 | inter_sets = [] 235 | patience_counter = 0 236 | best_score = scores[0] 237 | for i in range(1, len(scores)): 238 | cur_score = scores[i] 239 | rel_gain = (cur_score - best_score) / best_score 240 | inter_sets_temp, _ = zip(*interaction_atts[i - 1]) 241 | if num_features is not None: 242 | if any(len(inter) == num_features for inter in inter_sets_temp): 243 | break 244 | if rel_gain > rel_gain_threshold: 245 | best_score = cur_score 246 | inter_sets = inter_sets_temp 247 | patience_counter = 0 248 | else: 249 | if patience_counter < patience: 250 | patience_counter += 1 251 | else: 252 | break 253 | return inter_sets 254 | 255 | inter_sets = threshold_inter_sets(interaction_atts, prediction_scores) 256 | inter_sets_merged = merge_sets(inter_sets) 257 | 258 | return inter_sets_merged 259 | 260 | 261 | ###################################################### 262 | # The following are based on the official LIME repo 263 | ###################################################### 264 | 265 | 266 | def get_sample_distances(Xs): 267 | all_ones = np.ones((1, Xs["train"].shape[1])) 268 | Dd = {} 269 | for k in Xs: 270 | if k == "scaler": 271 | continue 272 | distances = sklearn.metrics.pairwise_distances( 273 | Xs[k], all_ones, metric="cosine" 274 | ).ravel() 275 | Dd[k] = distances 276 | 277 | return Dd 278 | 279 | 280 | def get_sample_weights(Xs, kernel_width=0.25, enable=True, **kwargs): 281 | def kernel(d): 282 | return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2)) 283 | 284 | if enable: 285 | Dd = get_sample_distances(Xs) 286 | 287 | Wd = {} 288 | for k in Xs: 289 | if k == "scaler": 290 | continue 291 | if enable: 292 | Wd[k] = kernel(Dd[k]) 293 | else: 294 | Wd[k] = np.ones(Xs[k].shape[0]) 295 | 296 | return Wd 297 | 298 | 299 | def get_lime_attributions( 300 | Xs, Ys, max_features=10000, kernel_width=0.25, weight_samples=True, sort=True 301 | ): 302 | def kernel(d): 303 | return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2)) 304 | 305 | distances = get_sample_distances(Xs)["train"] 306 | if not weight_samples: 307 | distances = np.ones_like(distances).squeeze(1) 308 | 309 | lb = lime_base.LimeBase(kernel_fn=kernel) 310 | lime_atts = lb.explain_instance_with_data( 311 | Xs["train"], Ys["train"], distances, 0, max_features 312 | )[0] 313 | if sort: 314 | lime_atts = sorted(lime_atts, key=lambda x: -x[1]) 315 | return lime_atts 316 | 
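
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; nothing in the repo calls this).
# It shows how the surrogate-model pieces in this module fit together on a
# synthetic binary "perturbation" dataset whose response has a main effect on
# feature 0 and an interaction between features 1 and 2. The dataset, layer
# sizes, and epoch count below are assumptions made for this demo, not values
# used elsewhere in the repo. Run it as a module from the "1. madex" directory
# so the utils.* imports at the top of this file resolve.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils import data as torch_data

    set_seed(42)

    # synthetic binary perturbations and a response with a 2-way interaction
    X = np.random.randint(0, 2, size=(2000, 5)).astype(np.float32)
    Y = 2.0 * X[:, 0] + 3.0 * X[:, 1] * X[:, 2] + 0.1 * np.random.randn(2000)

    Xs, Ys = proprocess_data(X, Y, valid_size=200, test_size=200)
    Wd = get_sample_weights(Xs, enable=False)  # uniform sample weights

    loaders = {}
    for split in Xs:
        dataset = torch_data.TensorDataset(
            force_float(Xs[split]),
            force_float(Ys[split]),
            force_float(Wd[split]).unsqueeze(1),
        )
        loaders[split] = torch_data.DataLoader(dataset, batch_size=100)

    # fit the surrogate MLP and report its held-out loss
    surrogate = MLP(X.shape[1], [64, 32], add_linear=True)
    surrogate, test_loss = train(surrogate, loaders, nepochs=30, verbose=False)
    print("surrogate test loss:", round(test_loss, 4))

    # linear (main-effect) attributions from the same perturbation data
    for feat, att in get_lime_attributions(Xs, Ys)[:3]:
        print("feature {} attribution: {:.3f}".format(feat, att))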
-------------------------------------------------------------------------------- /1. madex/utils/graph_utils.py: -------------------------------------------------------------------------------- 1 | from utils.pretrained.model_gcn import * 2 | from collections import defaultdict 3 | import numpy as np 4 | import copy 5 | 6 | 7 | def get_graph_model(model_folder): 8 | 9 | meta = torch.load(model_folder + "/gcn_cora.pt") 10 | 11 | n_hops = meta["n_hops"] 12 | n_nodes = meta["n_nodes"] 13 | test_idxs = meta["test_idxs"] 14 | n_samples = meta["n_samples"] 15 | dim_inp = meta["dim_inp"] 16 | dim_hid = meta["dim_hid"] 17 | dim_out = meta["dim_out"] 18 | 19 | model = create_model(dim_inp, dim_hid, dim_out, n_samples, n_hops) 20 | model.load_state_dict(meta["state_dict"]) 21 | 22 | return model, n_nodes, n_hops, test_idxs 23 | 24 | 25 | def convert_adj_to_da(adj_mat, make_undirected=False): 26 | # Converts adjacency to laplacian matrix 27 | if isinstance(adj_mat, np.ndarray): 28 | adj_mat = torch.from_numpy(adj_mat).float() 29 | if make_undirected: 30 | diag = torch.diag(torch.diag(adj_mat)) 31 | x = adj_mat - diag 32 | adj_mat = x + x.t() + adj_mat 33 | 34 | da_mat = torch.eye(len(adj_mat)).to(adj_mat.device) - adj_mat 35 | return da_mat 36 | 37 | 38 | def load_cora(data_folder, device): 39 | num_nodes = 2708 40 | num_feats = 1433 41 | feat_data = np.zeros((num_nodes, num_feats)) 42 | labels = np.empty((num_nodes, 1), dtype=np.int64) 43 | node_map = {} 44 | label_map = {} 45 | with open(data_folder + "/cora.content") as fp: 46 | for i, line in enumerate(fp): 47 | info = line.strip().split() 48 | feat_data[i, :] = [float(_) for _ in info[1:-1]] 49 | node_map[info[0]] = i 50 | if not info[-1] in label_map: 51 | label_map[info[-1]] = len(label_map) 52 | labels[i] = label_map[info[-1]] 53 | 54 | adj_lists = defaultdict(set) 55 | with open(data_folder + "/cora.cites") as fp: 56 | for i, line in enumerate(fp): 57 | info = line.strip().split() 58 | n1 = node_map[info[0]] 59 | n2 = node_map[info[1]] 60 | adj_lists[n1].add(n2) 61 | adj_lists[n2].add(n1) 62 | 63 | adj_mat = np.zeros((num_nodes, num_nodes)) 64 | for u in adj_lists: 65 | for v in adj_lists[u]: 66 | adj_mat[u, v] = 1 67 | 68 | feat_data = torch.FloatTensor(feat_data).to(device) 69 | adj_mat = torch.FloatTensor(adj_mat).to(device) 70 | return feat_data, adj_mat, labels 71 | 72 | 73 | def get_hops_to_target(target_idx, adj_mat, n_hops): 74 | # Create a map from node to the number of hops from the target test index 75 | node_to_hop = {target_idx: 0} 76 | seen_points = {target_idx} 77 | for j in range(1, n_hops + 2): 78 | adj_cum = copy.deepcopy(adj_mat) 79 | for i in range(j - 1): 80 | adj_cum = torch.matmul(adj_cum, adj_mat) 81 | collect = {i for i, v in enumerate(adj_cum[target_idx]) if v != 0} 82 | ex_collect = collect - seen_points 83 | seen_points |= collect 84 | for e in ex_collect: 85 | node_to_hop[e] = j 86 | 87 | return node_to_hop 88 | -------------------------------------------------------------------------------- /1. 
madex/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | import requests 3 | from PIL import Image 4 | from skimage.segmentation import mark_boundaries 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | from matplotlib.gridspec import GridSpec 9 | matplotlib.rcParams['mathtext.fontset'] = 'cm' 10 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 11 | 12 | 13 | # image pre-processing needed for ResNet 14 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 15 | 16 | preprocess = transforms.Compose( 17 | [ 18 | transforms.Resize((224, 224)), 19 | transforms.ToTensor(), 20 | normalize, 21 | ] 22 | ) 23 | 24 | def get_image_and_labels( 25 | image_path, 26 | device, 27 | labels_url="https://s3.amazonaws.com/outcome-blog/imagenet/labels.json", 28 | ): 29 | """ 30 | Loads image instance and labels 31 | 32 | Args: 33 | image_path: path to image instance 34 | labels_url: url to json labels 35 | 36 | Returns: 37 | image, labels 38 | """ 39 | image = Image.open(image_path) 40 | if image.mode != "RGB": 41 | image = image.convert("RGB") 42 | image_tensor = preprocess(image) 43 | image = ( 44 | image_tensor.cpu().numpy().transpose(1, 2, 0) / image_tensor.abs().max().item() 45 | ) 46 | image_tensor = ( 47 | image_tensor.unsqueeze_(0).to(device) / image_tensor.abs().max().item() 48 | ) 49 | labels = { 50 | int(key): value for (key, value) in requests.get(labels_url).json().items() 51 | } 52 | return image, image_tensor, labels 53 | 54 | 55 | def show_segmented_image(image, segments): 56 | plt.imshow(mark_boundaries(image / 2 + 0.5, segments)) 57 | 58 | 59 | def plot_explanations(img_arrays, figsize = 0.4, spacing = 0.15, savepath=""): 60 | w_spacing = (2/3)*spacing 61 | left = 0 62 | ax_arays = [] 63 | fig = plt.figure() 64 | for img_array in img_arrays: 65 | num_imgs = len(img_array) 66 | right = left + figsize*(num_imgs) + (num_imgs-1)*0.4*w_spacing 67 | ax_arays.append(fig.subplots(1,num_imgs, gridspec_kw=dict(left=left, right=right, wspace=w_spacing))) 68 | left = right + spacing 69 | 70 | for i, ax_array in enumerate(ax_arays): 71 | if hasattr(ax_array, "flat"): 72 | for j, ax in enumerate(ax_array.flat): 73 | img, title = img_arrays[i][j] 74 | ax.imshow(img/2+0.5) 75 | ax.set_title(title, fontsize=55*figsize) 76 | ax.axis("off") 77 | else: 78 | img, title = img_arrays[i][0] 79 | 80 | ax_array.imshow(img/2+0.5) 81 | ax_array.set_title(title, fontsize=55*figsize) 82 | ax_array.axis("off") 83 | 84 | if savepath: 85 | plt.savefig(savepath, bbox_inches="tight") 86 | plt.show() 87 | 88 | 89 | def show_explanations(inter_sets, image, segments, figsize=0.4, spacing=0.15, lime_atts=None, savepath=""): 90 | 91 | def get_interaction_img(inter): 92 | temp = (np.ones(image.shape, image.dtype) -0.5)*1 93 | for n in inter: 94 | temp[segments == n] = image[segments == n].copy() 95 | return temp 96 | 97 | img_arrays = [] 98 | img_arrays.append( [(image, "Original image")] ) 99 | 100 | ## main effects 101 | if lime_atts is not None: 102 | temp = (np.ones(image.shape, image.dtype) -0.5)*1 103 | for n,_ in lime_atts[:5]: 104 | temp[segments == n] = image[segments == n].copy() 105 | img_arrays.append( [(temp, "Main effects")] ) 106 | 107 | inter_img_arrays = [] 108 | for i, inter_set in enumerate(inter_sets): 109 | inter_img_arrays.append( (get_interaction_img(inter_set), "Interaction $\mathcal{I}_" + str(i+1) + "$") ) 110 | 
img_arrays.append(inter_img_arrays) 111 | 112 | plot_explanations(img_arrays, figsize, spacing, savepath) -------------------------------------------------------------------------------- /1. madex/utils/lime/lime_base.py: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Based on the LIME code repo 3 | ################################ 4 | 5 | """ 6 | Contains abstract functionality for learning locally linear sparse model. 7 | """ 8 | from __future__ import print_function 9 | import numpy as np 10 | from sklearn.linear_model import Ridge, lars_path 11 | from sklearn.utils import check_random_state 12 | import copy 13 | from sklearn.metrics import mean_squared_error 14 | 15 | 16 | class LimeBase(object): 17 | """Class for learning a locally linear sparse model from perturbed data""" 18 | 19 | def __init__(self, kernel_fn, verbose=False, random_state=None): 20 | """Init function 21 | 22 | Args: 23 | kernel_fn: function that transforms an array of distances into an 24 | array of proximity values (floats). 25 | verbose: if true, print local prediction values from linear model. 26 | random_state: an integer or numpy.RandomState that will be used to 27 | generate random numbers. If None, the random state will be 28 | initialized using the internal numpy seed. 29 | """ 30 | self.kernel_fn = kernel_fn 31 | self.verbose = verbose 32 | self.random_state = check_random_state(random_state) 33 | 34 | @staticmethod 35 | def generate_lars_path(weighted_data, weighted_labels): 36 | """Generates the lars path for weighted data. 37 | 38 | Args: 39 | weighted_data: data that has been weighted by kernel 40 | weighted_label: labels, weighted by kernel 41 | 42 | Returns: 43 | (alphas, coefs), both are arrays corresponding to the 44 | regularization parameter and coefficients, respectively 45 | """ 46 | x_vector = weighted_data 47 | alphas, _, coefs = lars_path( 48 | x_vector, weighted_labels, method="lasso", verbose=False 49 | ) 50 | return alphas, coefs 51 | 52 | def forward_selection(self, data, labels, weights, num_features): 53 | """Iteratively adds features to the model""" 54 | clf = Ridge(alpha=0, fit_intercept=True, random_state=self.random_state) 55 | used_features = [] 56 | for _ in range(min(num_features, data.shape[1])): 57 | max_ = -100000000 58 | best = 0 59 | for feature in range(data.shape[1]): 60 | if feature in used_features: 61 | continue 62 | clf.fit( 63 | data[:, used_features + [feature]], labels, sample_weight=weights 64 | ) 65 | score = clf.score( 66 | data[:, used_features + [feature]], labels, sample_weight=weights 67 | ) 68 | if score > max_: 69 | best = feature 70 | max_ = score 71 | used_features.append(best) 72 | return np.array(used_features) 73 | 74 | def feature_selection(self, data, labels, weights, num_features, method): 75 | """Selects features for the model. 
see explain_instance_with_data to 76 | understand the parameters.""" 77 | 78 | if method == "none": 79 | return np.array(range(data.shape[1])) 80 | elif method == "forward_selection": 81 | return self.forward_selection(data, labels, weights, num_features) 82 | elif method == "highest_weights": 83 | clf = Ridge(alpha=0.01, fit_intercept=True, random_state=self.random_state) 84 | clf.fit(data, labels, sample_weight=weights) 85 | feature_weights = sorted( 86 | zip(range(data.shape[0]), clf.coef_ * data[0]), 87 | key=lambda x: np.abs(x[1]), 88 | reverse=True, 89 | ) 90 | return np.array([x[0] for x in feature_weights[:num_features]]) 91 | elif method == "lasso_path": 92 | weighted_data = ( 93 | data - np.average(data, axis=0, weights=weights) 94 | ) * np.sqrt(weights[:, np.newaxis]) 95 | weighted_labels = (labels - np.average(labels, weights=weights)) * np.sqrt( 96 | weights 97 | ) 98 | nonzero = range(weighted_data.shape[1]) 99 | _, coefs = self.generate_lars_path(weighted_data, weighted_labels) 100 | for i in range(len(coefs.T) - 1, 0, -1): 101 | nonzero = coefs.T[i].nonzero()[0] 102 | if len(nonzero) <= num_features: 103 | break 104 | used_features = nonzero 105 | return used_features 106 | elif method == "auto": 107 | if num_features <= 6: 108 | n_method = "forward_selection" 109 | else: 110 | n_method = "highest_weights" 111 | return self.feature_selection(data, labels, weights, num_features, n_method) 112 | 113 | def explain_instance_with_data( 114 | self, 115 | neighborhood_data, 116 | neighborhood_labels, 117 | distances, 118 | label, 119 | num_features, 120 | feature_selection="auto", 121 | model_regressor=None, 122 | ): 123 | """Takes perturbed data, labels and distances, returns explanation. 124 | 125 | Args: 126 | neighborhood_data: perturbed data, 2d array. first element is 127 | assumed to be the original data point. 128 | neighborhood_labels: corresponding perturbed labels. should have as 129 | many columns as the number of possible labels. 130 | distances: distances to original data point. 131 | label: label for which we want an explanation 132 | num_features: maximum number of features in explanation 133 | feature_selection: how to select num_features. options are: 134 | 'forward_selection': iteratively add features to the model. 135 | This is costly when num_features is high 136 | 'highest_weights': selects the features that have the highest 137 | product of absolute weight * original data point when 138 | learning with all the features 139 | 'lasso_path': chooses features based on the lasso 140 | regularization path 141 | 'none': uses all features, ignores num_features 142 | 'auto': uses forward_selection if num_features <= 6, and 143 | 'highest_weights' otherwise. 144 | model_regressor: sklearn regressor to use in explanation. 145 | Defaults to Ridge regression if None. Must have 146 | model_regressor.coef_ and 'sample_weight' as a parameter 147 | to model_regressor.fit() 148 | 149 | Returns: 150 | (intercept, exp, score): 151 | intercept is a float. 152 | exp is a sorted list of tuples, where each tuple (x,y) corresponds 153 | to the feature id (x) and the local weight (y). The list is sorted 154 | by decreasing absolute value of y. 
155 | score is the R^2 value of the returned explanation 156 | """ 157 | # data_copy = copy.deepcopy(neighborhood_data) 158 | weights = self.kernel_fn(distances) 159 | 160 | labels_column = neighborhood_labels[:, label] 161 | used_features = self.feature_selection( 162 | neighborhood_data, labels_column, weights, num_features, feature_selection 163 | ) 164 | 165 | if model_regressor is None: 166 | model_regressor = Ridge( 167 | alpha=1, fit_intercept=True, random_state=self.random_state 168 | ) 169 | easy_model = model_regressor 170 | easy_model.fit( 171 | neighborhood_data[:, used_features], labels_column, sample_weight=weights 172 | ) 173 | r_sq = easy_model.score( 174 | neighborhood_data[:, used_features], labels_column, sample_weight=weights 175 | ) 176 | 177 | local_pred = easy_model.predict( 178 | neighborhood_data[0, used_features].reshape(1, -1) 179 | ) 180 | all_pred = easy_model.predict(neighborhood_data[:, used_features]) 181 | 182 | mse = mean_squared_error(labels_column, all_pred, sample_weight=weights) 183 | 184 | if self.verbose: 185 | print("Intercept", easy_model.intercept_) 186 | print("Prediction_local", local_pred) 187 | print("Right:", neighborhood_labels[0, label]) 188 | return ( 189 | sorted( 190 | zip(used_features, easy_model.coef_), 191 | key=lambda x: np.abs(x[1]), 192 | reverse=True, 193 | ), 194 | r_sq, 195 | mse, 196 | local_pred, 197 | neighborhood_data[:, used_features], 198 | labels_column, 199 | all_pred, 200 | weights, 201 | used_features, 202 | easy_model, 203 | ) 204 | -------------------------------------------------------------------------------- /1. madex/utils/lime/lime_text.py: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Based on the LIME code repo 3 | ################################ 4 | 5 | import numpy as np 6 | import re 7 | 8 | 9 | class TextDomainMapper: 10 | """Maps feature ids to words or word-positions""" 11 | 12 | def __init__(self, indexed_string): 13 | """Initializer. 14 | 15 | Args: 16 | indexed_string: lime_text.IndexedString, original string 17 | """ 18 | self.indexed_string = indexed_string 19 | 20 | def map_exp_ids(self, exp, positions=False): 21 | """Maps ids to words or word-position strings. 22 | 23 | Args: 24 | exp: list of tuples [(id, weight), (id,weight)] 25 | positions: if True, also return word positions 26 | 27 | Returns: 28 | list of tuples (word, weight), or (word_positions, weight) if 29 | examples: ('bad', 1) or ('bad_3-6-12', 1) 30 | """ 31 | if positions: 32 | exp = [ 33 | ( 34 | "%s_%s" 35 | % ( 36 | self.indexed_string.word(x[0]), 37 | "-".join(map(str, self.indexed_string.string_position(x[0]))), 38 | ), 39 | x[1], 40 | ) 41 | for x in exp 42 | ] 43 | else: 44 | exp = [(self.indexed_string.word(x[0]), x[1]) for x in exp] 45 | return exp 46 | 47 | def visualize_instance_html( 48 | self, exp, label, div_name, exp_object_name, text=True, opacity=True 49 | ): 50 | """Adds text with highlighted words to visualization. 
51 | 52 | Args: 53 | exp: list of tuples [(id, weight), (id,weight)] 54 | label: label id (integer) 55 | div_name: name of div object to be used for rendering(in js) 56 | exp_object_name: name of js explanation object 57 | text: if False, return empty 58 | opacity: if True, fade colors according to weight 59 | """ 60 | if not text: 61 | return "" 62 | text = ( 63 | self.indexed_string.raw_string() 64 | .encode("utf-8", "xmlcharrefreplace") 65 | .decode("utf-8") 66 | ) 67 | text = re.sub(r"[<>&]", "|", text) 68 | exp = [ 69 | ( 70 | self.indexed_string.word(x[0]), 71 | self.indexed_string.string_position(x[0]), 72 | x[1], 73 | ) 74 | for x in exp 75 | ] 76 | all_occurrences = list( 77 | itertools.chain.from_iterable( 78 | [itertools.product([x[0]], x[1], [x[2]]) for x in exp] 79 | ) 80 | ) 81 | all_occurrences = [(x[0], int(x[1]), x[2]) for x in all_occurrences] 82 | ret = """ 83 | %s.show_raw_text(%s, %d, %s, %s, %s); 84 | """ % ( 85 | exp_object_name, 86 | json.dumps(all_occurrences), 87 | label, 88 | json.dumps(text), 89 | div_name, 90 | json.dumps(opacity), 91 | ) 92 | return ret 93 | 94 | 95 | class IndexedString(object): 96 | """String with various indexes.""" 97 | 98 | def __init__(self, raw_string, split_expression=r"\W+", bow=True): 99 | """Initializer. 100 | 101 | Args: 102 | raw_string: string with raw text in it 103 | split_expression: string will be split by this. 104 | bow: if True, a word is the same everywhere in the text - i.e. we 105 | will index multiple occurrences of the same word. If False, 106 | order matters, so that the same word will have different ids 107 | according to position. 108 | """ 109 | self.raw = raw_string 110 | self.as_list = re.split(r"(%s)|$" % split_expression, self.raw) 111 | self.as_np = np.array(self.as_list) 112 | non_word = re.compile(r"(%s)|$" % split_expression).match 113 | self.string_start = np.hstack( 114 | ([0], np.cumsum([len(x) for x in self.as_np[:-1]])) 115 | ) 116 | vocab = {} 117 | self.inverse_vocab = [] 118 | self.positions = [] 119 | self.bow = bow 120 | non_vocab = set() 121 | for i, word in enumerate(self.as_np): 122 | if word in non_vocab: 123 | continue 124 | if non_word(word): 125 | non_vocab.add(word) 126 | continue 127 | if bow: 128 | if word not in vocab: 129 | vocab[word] = len(vocab) 130 | self.inverse_vocab.append(word) 131 | self.positions.append([]) 132 | idx_word = vocab[word] 133 | self.positions[idx_word].append(i) 134 | else: 135 | self.inverse_vocab.append(word) 136 | self.positions.append(i) 137 | if not bow: 138 | self.positions = np.array(self.positions) 139 | 140 | def raw_string(self): 141 | """Returns the original raw string""" 142 | return self.raw 143 | 144 | def num_words(self): 145 | """Returns the number of tokens in the vocabulary for this document.""" 146 | return len(self.inverse_vocab) 147 | 148 | def word(self, id_): 149 | """Returns the word that corresponds to id_ (int)""" 150 | return self.inverse_vocab[id_] 151 | 152 | def string_position(self, id_): 153 | """Returns a np array with indices to id_ (int) occurrences""" 154 | if self.bow: 155 | return self.string_start[self.positions[id_]] 156 | else: 157 | return self.string_start[[self.positions[id_]]] 158 | 159 | def inverse_removing(self, words_to_remove): 160 | """Returns a string after removing the appropriate words. 161 | 162 | If self.bow is false, replaces word with UNKWORDZ instead of removing 163 | it. 
164 | 165 | Args: 166 | words_to_remove: list of ids (ints) to remove 167 | 168 | Returns: 169 | original raw string with appropriate words removed. 170 | """ 171 | mask = np.ones(self.as_np.shape[0], dtype="bool") 172 | mask[self.__get_idxs(words_to_remove)] = False 173 | if not self.bow: 174 | return "".join( 175 | [ 176 | self.as_list[i] if mask[i] else "UNKWORDZ" 177 | for i in range(mask.shape[0]) 178 | ] 179 | ) 180 | return "".join([self.as_list[v] for v in mask.nonzero()[0]]) 181 | 182 | def __get_idxs(self, words): 183 | """Returns indexes to appropriate words.""" 184 | if self.bow: 185 | return list( 186 | itertools.chain.from_iterable([self.positions[z] for z in words]) 187 | ) 188 | else: 189 | return self.positions[words] 190 | -------------------------------------------------------------------------------- /1. madex/utils/linear_cross_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import copy 4 | from sklearn.linear_model import Ridge, base 5 | from utils.general_utils import * 6 | 7 | 8 | def update_cross_features(Xs_in, interactions): 9 | Xs = copy.deepcopy(Xs_in) 10 | 11 | for k in Xs: 12 | Xk = Xs[k] 13 | new_features = [] 14 | 15 | for inter in interactions: 16 | inter_np = np.array(inter) 17 | new_feature = 1 * np.all(Xk[:, inter_np - 1], axis=1) 18 | new_features.append(new_feature) 19 | 20 | new_dset = np.concatenate([Xk, np.stack(new_features, axis=1)], axis=1) 21 | 22 | Xs[k] = new_dset 23 | return Xs 24 | 25 | 26 | def fit_linear_cross_models( 27 | Xs, 28 | Ys, 29 | interactions, 30 | hierarchy_stepsize=1, 31 | max_steps=1, 32 | hierarchy_patience=0, 33 | stopping=False, 34 | verbose=False, 35 | weight_samples=False, 36 | flat=False, 37 | **kwargs 38 | ): 39 | """ 40 | Trains an MLP and interprets interactions from its weights 41 | 42 | Args: 43 | data_loaders: dict of train, val, and test dataloaders 44 | sample_to_explain: the data instances to get attributions for 45 | interactions: a ranking of interaction sets 46 | hierarchy_stepsize: the stepsize across the ranking 47 | max_steps: the max number of steps on the ranking. 
a max_steps of 1 stops right after getting univariate attributions 48 | hierarchy patience: the patience of when to early stop on the interaction ranking based on validation performance 49 | user_linear: whether to use a linear model rather than a GAM for learning GAM+interactions model 50 | stopping: whether to early stop on the interaction ranking or not' 51 | mode: 'MSE' or 'BCE' for regression or binary classification 52 | experiment: name of experiment 53 | aggregate: aggregates the attributions of overlapping univariates and interaction sets 54 | verbose: set True to get training info 55 | 56 | Returns: 57 | the best GAM models, hierarchical interaction attributions, univariate attributions, prediction pefrformances at each hierarchical step, all trained GAMs 58 | """ 59 | 60 | Wd = get_sample_weights(Xs, enable=weight_samples, **kwargs) 61 | 62 | best_model = None 63 | best_score = None 64 | margin = 0 # initialize to 0 to start, give initial slack for aggregate 65 | patience_counter = 0 66 | active_interaction_list = [] 67 | hierarchical_interaction_attributions = [] 68 | active_interactions = [] 69 | prediction_scores = [] 70 | 71 | break_out = False 72 | 73 | ## Build univariate gam 74 | n_features = Xs["train"].shape[1] # next(iter(data_loaders["train"]))[0].shape[1] 75 | univariates = list(range(n_features)) 76 | 77 | clf = Ridge(alpha=0.01) 78 | clf.fit(Xs["train"], Ys["train"], sample_weight=Wd["train"]) 79 | r_sq = clf.score(Xs["val"], Ys["val"], sample_weight=Wd["val"]) 80 | r_sq_test = clf.score(Xs["test"], Ys["test"], sample_weight=Wd["test"]) 81 | 82 | Xs_base = copy.deepcopy(Xs) 83 | 84 | prediction_score = r_sq 85 | prediction_scores.append(r_sq_test) 86 | 87 | best_score = prediction_score 88 | 89 | univariate_attributions = (univariates, clf.coef_[0]) 90 | 91 | for s in range(1, max_steps): 92 | active_interactions2 = [] 93 | k = hierarchy_stepsize * s 94 | 95 | for v in range(k): 96 | try: 97 | interaction = interactions[v][0] 98 | active_interactions2.append(interaction) 99 | 100 | except: # TODO handle this better later 101 | break_out = True 102 | break 103 | 104 | append, remove_items = True, [] 105 | insertion_idx = len(active_interactions) 106 | for a, ai in enumerate(active_interactions): 107 | if set(interaction) <= set(ai): 108 | append = False 109 | if set(interaction) > set(ai): 110 | remove_items.append(ai) 111 | if insertion_idx == len(active_interactions): 112 | insertion_idx = a 113 | if remove_items: 114 | for r in remove_items: 115 | active_interactions.remove(r) 116 | if append: 117 | active_interactions.insert(insertion_idx, interaction) 118 | 119 | if break_out: 120 | break 121 | 122 | active_interactions_pruned = [np.array(ai) for ai in active_interactions] 123 | active_interactions2 = active_interactions #active_interactions_pruned 124 | 125 | if verbose: 126 | print("\tpruned", active_interactions_pruned) 127 | 128 | if flat: 129 | active_interactions2 = interactions 130 | 131 | Xs_inter = update_cross_features(Xs_base, active_interactions2) 132 | clf = Ridge(alpha=0.01) 133 | clf.fit(Xs_inter["train"], Ys["train"], sample_weight=Wd["train"]) 134 | r_sq = clf.score(Xs_inter["val"], Ys["val"], sample_weight=Wd["val"]) 135 | r_sq_test = clf.score(Xs_inter["test"], Ys["test"], sample_weight=Wd["test"]) 136 | 137 | prediction_score = r_sq 138 | 139 | performance_improvement = prediction_score > best_score 140 | if (not stopping) or ( 141 | stopping 142 | and (performance_improvement or patience_counter < hierarchy_patience) 143 | ): 144 | 
interaction_attributions = [] 145 | for inter_i, inter in enumerate(active_interactions2): 146 | w = clf.coef_[0, inter_i + n_features] 147 | interaction_attributions.append((inter, w)) 148 | hierarchical_interaction_attributions.append(interaction_attributions) 149 | prediction_scores.append(r_sq_test) 150 | 151 | if stopping: 152 | if performance_improvement: 153 | patience_counter = 0 154 | best_score = prediction_score 155 | else: 156 | patience_counter += 1 157 | else: 158 | break 159 | 160 | if flat: 161 | return interaction_attributions, univariate_attributions, prediction_score 162 | 163 | return ( 164 | prediction_scores, 165 | hierarchical_interaction_attributions, 166 | univariate_attributions, 167 | ) 168 | -------------------------------------------------------------------------------- /1. madex/utils/pretrained/dna_cnn.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/pretrained/dna_cnn.pt -------------------------------------------------------------------------------- /1. madex/utils/pretrained/gcn_cora.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/pretrained/gcn_cora.pt -------------------------------------------------------------------------------- /1. madex/utils/pretrained/model_gcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.functional import relu 4 | 5 | 6 | class InductiveGCN(nn.Module): 7 | def __init__(self, dim_inp, dim_hid, dim_out, n_samples, n_hops): 8 | super().__init__() 9 | 10 | self.dim_inp = dim_inp 11 | self.dim_hid = dim_hid 12 | self.dim_out = dim_out 13 | self.n_samples = n_samples 14 | 15 | dim_hiddens = [dim_inp] + [dim_hid] * n_hops 16 | self.layers = [ 17 | nn.Linear(dim_hiddens[i], dim_hiddens[i + 1]) 18 | for i in range(len(dim_hiddens) - 1) 19 | ] 20 | self.final_fc = nn.Linear(dim_hiddens[-1], dim_out) 21 | for layer in self.layers + [self.final_fc]: 22 | nn.init.xavier_normal_(layer.weight) 23 | nn.init.zeros_(layer.bias) 24 | self.layers = nn.ModuleList(self.layers) 25 | 26 | def forward(self, x, adj_mat): 27 | """ 28 | 29 | :param x: (n_nodes, dim_inp) 30 | :param adj_mat: (n_nodes, n_nodes) 31 | :return: (n_nodes, dim_out) 32 | """ 33 | for layer in self.layers: 34 | x = torch.matmul(adj_mat, x) 35 | x = relu(layer(x)) 36 | x = torch.matmul(adj_mat, x) 37 | x = self.final_fc(x) 38 | return x 39 | 40 | 41 | def create_model(dim_inp, dim_hid, dim_out, n_samples, n_hops): 42 | return InductiveGCN(dim_inp, dim_hid, dim_out, n_samples, n_hops) 43 | -------------------------------------------------------------------------------- /1. 
madex/utils/text_utils.py: -------------------------------------------------------------------------------- 1 | ############################################################################################# 2 | # Parts of this code adapted from the Transformers repo 3 | # https://github.com/huggingface/transformers/blob/master/src/transformers/pipelines.py 4 | ############################################################################################# 5 | 6 | import torch 7 | import numpy as np 8 | 9 | from transformers import Pipeline 10 | from typing import Dict, List, Optional, Tuple, Union 11 | from transformers.configuration_utils import PretrainedConfig 12 | from transformers.tokenization_utils import PreTrainedTokenizer 13 | from transformers.modelcard import ModelCard 14 | from transformers.tokenization_auto import AutoTokenizer 15 | from transformers.configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig 16 | 17 | from transformers.modeling_auto import ( 18 | AutoModel, 19 | AutoModelForSequenceClassification, 20 | AutoModelForQuestionAnswering, 21 | AutoModelForTokenClassification, 22 | AutoModelWithLMHead, 23 | ) 24 | 25 | def split_words(word_ids, sort=True): 26 | """ 27 | splits words from word_id representation 28 | """ 29 | word_ids2 = [] 30 | for word in word_ids: 31 | w = [x for x in word[0].split("_") if x] 32 | word_ids2.append(("_".join(w[0:-1]), int(w[-1]))) 33 | 34 | if sort: 35 | word_ids2.sort(key=lambda x: x[1]) 36 | return word_ids2 37 | 38 | 39 | def map_words(inter, domain_mapper): 40 | dom_map = domain_mapper.map_exp_ids 41 | word_inter = split_words(dom_map([(i, None) for i in inter], positions=True)) 42 | return tuple(w for w, _ in word_inter) 43 | 44 | 45 | class TextClassificationPipelineMod(Pipeline): 46 | """ 47 | Text classification pipeline using ModelForTextClassification head. 48 | """ 49 | 50 | def __call__(self, *args, **kwargs): 51 | outputs = super().__call__(*args, **kwargs) 52 | return outputs 53 | 54 | 55 | def pipeline( 56 | task: str, 57 | model: Optional = None, 58 | config: Optional[Union[str, PretrainedConfig]] = None, 59 | tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, 60 | modelcard: Optional[Union[str, ModelCard]] = None, 61 | device = torch.device("cpu"), 62 | **kwargs 63 | ) -> Pipeline: 64 | """ 65 | Utility factory method to build a pipeline. 
66 | Pipeline are made of: 67 | A Tokenizer instance in charge of mapping raw textual input to token 68 | A Model instance 69 | Some (optional) post processing for enhancing model's output 70 | Examples: 71 | pipeline('sentiment-analysis') 72 | """ 73 | # Register all the supported task here 74 | SUPPORTED_TASKS = { 75 | "sentiment-analysis": { 76 | "impl": TextClassificationPipelineMod, 77 | "pt": AutoModelForSequenceClassification,# if is_torch_available() else None, 78 | "default": { 79 | "model": { 80 | "pt": "distilbert-base-uncased-finetuned-sst-2-english", 81 | }, 82 | "config": "distilbert-base-uncased-finetuned-sst-2-english", 83 | "tokenizer": "distilbert-base-uncased", 84 | }, 85 | }, 86 | } 87 | 88 | # Retrieve the task 89 | if task not in SUPPORTED_TASKS: 90 | raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) 91 | 92 | framework = "pt"#get_framework(model) 93 | 94 | targeted_task = SUPPORTED_TASKS[task] 95 | task, model_class = targeted_task["impl"], targeted_task[framework] 96 | 97 | # Use default model/config/tokenizer for the task if no model is provided 98 | if model is None: 99 | models, config, tokenizer = tuple(targeted_task["default"].values()) 100 | model = models[framework] 101 | 102 | # Try to infer tokenizer from model or config name (if provided as str) 103 | if tokenizer is None: 104 | if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: 105 | tokenizer = model 106 | elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: 107 | tokenizer = config 108 | else: 109 | # Impossible to guest what is the right tokenizer here 110 | raise Exception( 111 | "Impossible to guess which tokenizer to use. " 112 | "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer." 113 | ) 114 | 115 | # Try to infer modelcard from model or config name (if provided as str) 116 | if modelcard is None: 117 | # Try to fallback on one of the provided string for model or config (will replace the suffix) 118 | if isinstance(model, str): 119 | modelcard = model 120 | elif isinstance(config, str): 121 | modelcard = config 122 | 123 | # Instantiate tokenizer if needed 124 | if isinstance(tokenizer, str): 125 | tokenizer = AutoTokenizer.from_pretrained(tokenizer) 126 | 127 | # Instantiate config if needed 128 | if isinstance(config, str): 129 | config = AutoConfig.from_pretrained(config) 130 | 131 | # Instantiate modelcard if needed 132 | if isinstance(modelcard, str): 133 | modelcard = ModelCard.from_pretrained(modelcard) 134 | 135 | # Instantiate model if needed 136 | if isinstance(model, str): 137 | # Handle transparent TF/PT model conversion 138 | model_kwargs = {} 139 | if framework == "pt" and model.endswith(".h5"): 140 | model_kwargs["from_tf"] = True 141 | logger.warning( 142 | "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. " 143 | "Trying to load the model with PyTorch." 144 | ) 145 | 146 | model = model_class.from_pretrained(model, config=config, **model_kwargs) 147 | model = model.to(device) 148 | model.device = device 149 | return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs) 150 | 151 | 152 | def get_bert_model(device): 153 | model = pipeline("sentiment-analysis", device=device) 154 | model.device = device 155 | return model 156 | -------------------------------------------------------------------------------- /2. 
glider/data/initial_data_prep/avazu/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./data/autoint/Avazu/" 2 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/avazu/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Email of the author: zjduan@pku.edu.cn 3 | """ 4 | 0.id: ad identifier 5 | 1.click: 0/1 for non-click/click 6 | 2.hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC. 7 | 3.C1 -- anonymized categorical variable 8 | 4.banner_pos 9 | 5.site_id 10 | 6.site_domain 11 | 7.site_category 12 | 8.app_id 13 | 9.app_domain 14 | 10.app_category 15 | 11.device_id 16 | 12.device_ip 17 | 13.device_model 18 | 14.device_type 19 | 15.device_conn_type 20 | 16.C14 21 | 17.C15 22 | 18.C16 23 | 19.C17 24 | 20.C18 25 | 21.C19 26 | 22.C20 27 | 23.C21 28 | """ 29 | import pandas as pd 30 | import config 31 | import math 32 | 33 | train_path = config.DATA_PATH + "train.csv" 34 | f1 = open(train_path, "r") 35 | dic = {} 36 | f_train_value = open(config.DATA_PATH + "train_x.txt", "w") 37 | f_train_index = open(config.DATA_PATH + "train_i.txt", "w") 38 | f_train_label = open(config.DATA_PATH + "train_y.txt", "w") 39 | debug = False 40 | tune = False 41 | Bound = [5] * 24 42 | 43 | label_index = 1 44 | Column = 24 45 | 46 | numr_feat = [] 47 | numerical = [0] * Column 48 | numerical[label_index] = -1 49 | 50 | cate_feat = [] 51 | for i in range(Column): 52 | if numerical[i] == 0: 53 | cate_feat.extend([i]) 54 | 55 | index_cnt = 0 56 | index_others = [0] * Column 57 | Max = [0] * Column 58 | 59 | 60 | for i in numr_feat: 61 | index_others[i] = index_cnt 62 | index_cnt += 1 63 | numerical[i] = 1 64 | for i in cate_feat: 65 | index_others[i] = index_cnt 66 | index_cnt += 1 67 | 68 | for i in range(Column): 69 | dic[i] = dict() 70 | 71 | cnt_line = 0 72 | for line in f1: 73 | cnt_line += 1 74 | if cnt_line == 1: 75 | continue # header 76 | if cnt_line % 1000000 == 0: 77 | print("cnt_line = %d, index_cnt = %d" % (cnt_line, index_cnt)) 78 | if debug == True: 79 | if cnt_line >= 10000: 80 | break 81 | split = line.strip("\n").split(",") 82 | for i in cate_feat: 83 | if split[i] != "": 84 | if split[i] not in dic[i]: 85 | dic[i][split[i]] = [index_others[i], 0] 86 | dic[i][split[i]][1] += 1 87 | if ( 88 | dic[i][split[i]][0] == index_others[i] 89 | and dic[i][split[i]][1] == Bound[i] 90 | ): 91 | dic[i][split[i]][0] = index_cnt 92 | index_cnt += 1 93 | 94 | if tune == False: 95 | label = split[label_index] 96 | if label != "0": 97 | label = "1" 98 | index = [0] * (Column - 1) 99 | value = ["0"] * (Column - 1) 100 | for i in range(Column): 101 | cur = i 102 | if i == label_index: 103 | continue 104 | if i > label_index: 105 | cur = i - 1 106 | if numerical[i] == 1: 107 | index[cur] = index_others[i] 108 | if split[i] != "": 109 | value[cur] = split[i] 110 | # Max[i] = max(int(split[i]), Max[i]) 111 | else: 112 | if split[i] != "": 113 | index[cur] = dic[i][split[i]][0] 114 | value[cur] = "1" 115 | 116 | if split[i] == "": 117 | value[cur] = "0" 118 | 119 | f_train_index.write(" ".join(str(i) for i in index) + "\n") 120 | f_train_value.write(" ".join(value) + "\n") 121 | f_train_label.write(label + "\n") 122 | 123 | f1.close() 124 | f_train_index.close() 125 | f_train_value.close() 126 | f_train_label.close() 127 | print("Finished!") 128 | print("index_cnt = %d" % index_cnt) 129 | # print ("max number 
for numerical features:") 130 | # for i in numr_feat: 131 | # print ("no.:%d max: %d" % (i, Max[i])) 132 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/criteo/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./data/autoint/Criteo/" 2 | SOURCE_DATA = "./data/autoint/Criteo/train.txt" 3 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/criteo/preprocess.py: -------------------------------------------------------------------------------- 1 | import config 2 | 3 | train_path = config.SOURCE_DATA 4 | f1 = open(train_path, "r") 5 | dic = {} 6 | # generate three fold. 7 | # train_x: value 8 | # train_i: index 9 | # train_y: label 10 | f_train_value = open(config.DATA_PATH + "train_x.txt", "w") 11 | f_train_index = open(config.DATA_PATH + "train_i.txt", "w") 12 | f_train_label = open(config.DATA_PATH + "train_y.txt", "w") 13 | 14 | for i in range(39): 15 | dic[i] = {} 16 | 17 | cnt_train = 0 18 | 19 | # for debug 20 | # limits = 10000 21 | index = [1] * 26 22 | for line in f1: 23 | cnt_train += 1 24 | if cnt_train % 100000 == 0: 25 | print("now train cnt : %d\n" % cnt_train) 26 | # if cnt_train > limits: 27 | # break 28 | split = line.strip("\n").split("\t") 29 | # 0-label, 1-13 numerical, 14-39 category 30 | for i in range(13, 39): 31 | # dic_len = len(dic[i]) 32 | if split[i + 1] not in dic[i]: 33 | # [1, 0] 1 is the index for those whose appear times <= 10 0 indicates the appear times 34 | dic[i][split[i + 1]] = [1, 0] 35 | dic[i][split[i + 1]][1] += 1 36 | if dic[i][split[i + 1]][0] == 1 and dic[i][split[i + 1]][1] > 10: 37 | index[i - 13] += 1 38 | dic[i][split[i + 1]][0] = index[i - 13] 39 | f1.close() 40 | print("total entries :%d\n" % (cnt_train - 1)) 41 | 42 | # calculate number of category features of every dimension 43 | kinds = [13] 44 | for i in range(13, 39): 45 | kinds.append(index[i - 13]) 46 | print("number of dimensions : %d" % (len(kinds) - 1)) 47 | print(kinds) 48 | 49 | for i in range(1, len(kinds)): 50 | kinds[i] += kinds[i - 1] 51 | print(kinds) 52 | 53 | # make new data 54 | 55 | f1 = open(train_path, "r") 56 | cnt_train = 0 57 | print("remake training data...\n") 58 | for line in f1: 59 | cnt_train += 1 60 | if cnt_train % 100000 == 0: 61 | print("now train cnt : %d\n" % cnt_train) 62 | # if cnt_train > limits: 63 | # break 64 | entry = ["0"] * 39 65 | index = [None] * 39 66 | split = line.strip("\n").split("\t") 67 | label = str(split[0]) 68 | for i in range(13): 69 | if split[i + 1] != "": 70 | entry[i] = split[i + 1] 71 | index[i] = i + 1 72 | for i in range(13, 39): 73 | if split[i + 1] != "": 74 | entry[i] = "1" 75 | index[i] = dic[i][split[i + 1]][0] 76 | for j in range(26): 77 | index[13 + j] += kinds[j] 78 | index = [str(item) for item in index] 79 | f_train_value.write(" ".join(entry) + "\n") 80 | f_train_index.write(" ".join(index) + "\n") 81 | f_train_label.write(label + "\n") 82 | f1.close() 83 | 84 | 85 | f_train_value.close() 86 | f_train_index.close() 87 | f_train_label.close() 88 | -------------------------------------------------------------------------------- /2. 
glider/data/initial_data_prep/criteo/scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | import config 3 | import numpy as np 4 | 5 | 6 | def scale(x): 7 | if x > 2: 8 | x = int(math.log(float(x)) ** 2) 9 | return x 10 | 11 | 12 | def scale_each_fold(): 13 | for i in range(1, 11): 14 | print("now part %d" % i) 15 | data = np.load(config.DATA_PATH + "part" + str(i) + "/train_x.npy") 16 | part = data[:, 0:13] 17 | for j in range(part.shape[0]): 18 | if j % 100000 == 0: 19 | print(j) 20 | part[j] = list(map(scale, part[j])) 21 | np.save(config.DATA_PATH + "part" + str(i) + "/train_x2.npy", data) 22 | 23 | 24 | if __name__ == "__main__": 25 | scale_each_fold() 26 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kdd2012/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./KDD2012/" 2 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kdd2012/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Email of the author: zjduan@pku.edu.cn 3 | """ 4 | 0. Click: 5 | 1. Impression(numerical) 6 | 2. DisplayURL: (categorical) 7 | 3. AdID:(categorical) 8 | 4. AdvertiserID:(categorical) 9 | 5. Depth:(numerical) 10 | 6. Position:(numerical) 11 | 7. QueryID: (categorical) the key of the data file 'queryid_tokensid.txt'. 12 | 8. KeywordID: (categorical)the key of 'purchasedkeyword_tokensid.txt'. 13 | 9. TitleID: (categorical)the key of 'titleid_tokensid.txt'. 14 | 10. DescriptionID: (categorical)the key of 'descriptionid_tokensid.txt'. 15 | 11. UserID: (categorical)the key of 'userid_profile.txt' 16 | 12. User's Gender: (categorical) 17 | 13. 
User's Age: (categorical) 18 | """ 19 | import math 20 | 21 | train_path = "./KDD2012/training.txt" 22 | f1 = open(train_path, "r") 23 | f2 = open("./KDD2012/userid_profile.txt", "r") 24 | dic = {} 25 | f_train_value = open("./KDD2012/train_x.txt", "w") 26 | f_train_index = open("./KDD2012/train_i.txt", "w") 27 | f_train_label = open("./KDD2012/train_y.txt", "w") 28 | debug = False 29 | tune = False 30 | Column = 12 31 | Field = 13 32 | 33 | numr_feat = [1, 5, 6] 34 | numerical = [0] * Column 35 | cate_feat = [2, 3, 4, 7, 8, 9, 10, 11] 36 | index_cnt = 0 37 | index_others = [0] * (Field + 1) 38 | Max = [0] * 12 39 | numerical[0] = -1 40 | for i in numr_feat: 41 | index_others[i] = index_cnt 42 | index_cnt += 1 43 | numerical[i] = 1 44 | for i in cate_feat: 45 | index_others[i] = index_cnt 46 | index_cnt += 1 47 | 48 | for i in range(Field + 1): 49 | dic[i] = dict() 50 | 51 | ###init user_dic 52 | user_dic = dict() 53 | 54 | cnt_line = 0 55 | for line in f2: 56 | cnt_line += 1 57 | if cnt_line % 1000000 == 0: 58 | print("cnt_line = %d, index_cnt = %d" % (cnt_line, index_cnt)) 59 | # if (debug == True): 60 | # if (cnt_line >= 10000): 61 | # break 62 | split = line.strip("\n").split("\t") 63 | user_dic[split[0]] = [split[1], split[2]] 64 | if split[1] not in dic[12]: 65 | dic[12][split[1]] = [index_cnt, 0] 66 | index_cnt += 1 67 | if split[2] not in dic[13]: 68 | dic[13][split[2]] = [index_cnt, 0] 69 | index_cnt += 1 70 | 71 | cnt_line = 0 72 | for line in f1: 73 | cnt_line += 1 74 | if cnt_line % 1000000 == 0: 75 | print("cnt_line = %d, index_cnt = %d" % (cnt_line, index_cnt)) 76 | if debug == True: 77 | if cnt_line >= 10000: 78 | break 79 | split = line.strip("\n").split("\t") 80 | for i in cate_feat: 81 | if split[i] != "": 82 | if split[i] not in dic[i]: 83 | dic[i][split[i]] = [index_others[i], 0] 84 | dic[i][split[i]][1] += 1 85 | if dic[i][split[i]][0] == index_others[i] and dic[i][split[i]][1] == 10: 86 | dic[i][split[i]][0] = index_cnt 87 | index_cnt += 1 88 | 89 | if tune == False: 90 | label = split[0] 91 | if label != "0": 92 | label = "1" 93 | index = [0] * Field 94 | value = ["0"] * Field 95 | for i in range(1, 12): 96 | if numerical[i] == 1: 97 | index[i - 1] = index_others[i] 98 | if split[i] != "": 99 | value[i - 1] = split[i] 100 | Max[i] = max(int(split[i]), Max[i]) 101 | else: 102 | if split[i] != "": 103 | index[i - 1] = dic[i][split[i]][0] 104 | value[i - 1] = "1" 105 | 106 | if split[i] == "": 107 | value[i - 1] = "0" 108 | if i == 11 and split[i] == "0": 109 | value[i - 1] = "0" 110 | ### gender and age 111 | if split[11] == "" or (split[11] not in user_dic): 112 | index[12 - 1] = index_others[12] 113 | value[12 - 1] = "0" 114 | index[13 - 1] = index_others[13] 115 | value[13 - 1] = "0" 116 | else: 117 | index[12 - 1] = dic[12][user_dic[split[11]][0]][0] 118 | value[12 - 1] = "1" 119 | index[13 - 1] = dic[13][user_dic[split[11]][1]][0] 120 | value[13 - 1] = "1" 121 | 122 | f_train_index.write(" ".join(str(i) for i in index) + "\n") 123 | f_train_value.write(" ".join(value) + "\n") 124 | f_train_label.write(label + "\n") 125 | 126 | f1.close() 127 | f_train_index.close() 128 | f_train_value.close() 129 | f_train_label.close() 130 | print("Finished!") 131 | print("index_cnt = %d" % index_cnt) 132 | print("max number for numerical features:") 133 | for i in numr_feat: 134 | print("no.:%d max: %d" % (i, Max[i])) 135 | -------------------------------------------------------------------------------- /2. 
glider/data/initial_data_prep/kdd2012/scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | import config 3 | import numpy as np 4 | 5 | 6 | def scale(x): 7 | if x > 2: 8 | x = int(math.log(float(x)) ** 2) 9 | return x 10 | 11 | 12 | def scale_each_fold(): 13 | for i in range(1, 11): 14 | print("now part %d" % i) 15 | data = np.load(config.DATA_PATH + "part" + str(i) + "/train_x.npy") 16 | part = data[:, 0:13] 17 | for j in range(part.shape[0]): 18 | if j % 100000 == 0: 19 | print(j) 20 | part[j] = list(map(scale, part[j])) 21 | np.save(config.DATA_PATH + "part" + str(i) + "/train_x2.npy", data) 22 | 23 | 24 | if __name__ == "__main__": 25 | scale_each_fold() 26 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kfold_split/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./data/autoint/Criteo/" 2 | TRAIN_I = DATA_PATH + "train_i.txt" 3 | TRAIN_X = DATA_PATH + "train_x.txt" 4 | TRAIN_Y = DATA_PATH + "train_y.txt" 5 | 6 | NUM_SPLITS = 10 7 | RANDOM_SEED = 2018 8 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kfold_split/stratifiedKfold.py: -------------------------------------------------------------------------------- 1 | # Email of the author: zjduan@pku.edu.cn 2 | import numpy as np 3 | import config 4 | import os 5 | import pandas as pd 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn import preprocessing 8 | 9 | scale = "" 10 | train_x_name = "train_x.npy" 11 | train_y_name = "train_y.npy" 12 | 13 | # numr_feat = [] 14 | Column = 13 15 | 16 | 17 | def _load_data(_nrows=None, debug=False): 18 | 19 | train_x = pd.read_csv( 20 | config.TRAIN_X, header=None, sep=" ", nrows=_nrows, dtype=np.float 21 | ) 22 | train_y = pd.read_csv( 23 | config.TRAIN_Y, header=None, sep=" ", nrows=_nrows, dtype=np.int32 24 | ) 25 | 26 | # for i in range(11): 27 | # print ("argmax feat %d = %d, max = %d" % (i, train_x[i].argmax(), train_x[i].max())) 28 | 29 | train_x = train_x.values 30 | train_y = train_y.values.reshape([-1]) 31 | 32 | # print ("begin to scale") 33 | # if (scale == "minmax"): 34 | # train_x = preprocessing.MinMaxScaler().fit_transform(train_x) 35 | 36 | # if (scale == "std"): 37 | # train_x[:,0:12] = preprocessing.scale(train_x[:,0:12]) 38 | # train_x[:,0:12] += 1 39 | 40 | print("data loading done!") 41 | print("training data : %d" % train_y.shape[0]) 42 | 43 | assert train_x.shape[0] == train_y.shape[0] 44 | 45 | return train_x, train_y 46 | 47 | 48 | def save_x_y(fold_index, train_x, train_y): 49 | _get = lambda x, l: [x[i] for i in l] 50 | for i in range(len(fold_index)): 51 | print("now part %d" % (i + 1)) 52 | part_index = fold_index[i] 53 | Xv_train_, y_train_ = _get(train_x, part_index), _get(train_y, part_index) 54 | save_dir_Xv = config.DATA_PATH + "part" + str(i + 1) + "/" 55 | save_dir_y = config.DATA_PATH + "part" + str(i + 1) + "/" 56 | if os.path.exists(save_dir_Xv) == False: 57 | os.makedirs(save_dir_Xv) 58 | if os.path.exists(save_dir_y) == False: 59 | os.makedirs(save_dir_y) 60 | save_path_Xv = save_dir_Xv + train_x_name 61 | save_path_y = save_dir_y + train_y_name 62 | np.save(save_path_Xv, Xv_train_) 63 | np.save(save_path_y, y_train_) 64 | 65 | 66 | # def save_test(test_x, test_y): 67 | # np.save("../data/test/test_x.npy", test_x) 68 | # np.save("../data/test/test_y.npy", test_y) 69 | 70 | 71 | def 
save_i(fold_index): 72 | _get = lambda x, l: [x[i] for i in l] 73 | train_i = pd.read_csv( 74 | config.TRAIN_I, header=None, sep=" ", nrows=None, dtype=np.int32 75 | ) 76 | train_i = train_i.values 77 | feature_size = train_i.max() + 1 78 | print("feature_size = %d" % feature_size) 79 | feature_size = [feature_size] 80 | feature_size = np.array(feature_size) 81 | np.save(config.DATA_PATH + "feature_size.npy", feature_size) 82 | 83 | # pivot = 40000000 84 | 85 | # test_i = train_i[pivot:] 86 | # train_i = train_i[:pivot] 87 | 88 | # print("test_i size: %d" % len(test_i)) 89 | print("train_i size: %d" % len(train_i)) 90 | 91 | # np.save("../data/test/test_i.npy", test_i) 92 | 93 | for i in range(len(fold_index)): 94 | print("now part %d" % (i + 1)) 95 | part_index = fold_index[i] 96 | Xi_train_ = _get(train_i, part_index) 97 | save_path_Xi = config.DATA_PATH + "part" + str(i + 1) + "/train_i.npy" 98 | np.save(save_path_Xi, Xi_train_) 99 | 100 | 101 | def main(): 102 | 103 | train_x, train_y = _load_data() 104 | print("loading data done!") 105 | 106 | folds = list( 107 | StratifiedKFold( 108 | n_splits=10, shuffle=True, random_state=config.RANDOM_SEED 109 | ).split(train_x, train_y) 110 | ) 111 | 112 | fold_index = [] 113 | for i, (train_id, valid_id) in enumerate(folds): 114 | fold_index.append(valid_id) 115 | 116 | print("fold num: %d" % (len(fold_index))) 117 | 118 | fold_index = np.array(fold_index) 119 | np.save(config.DATA_PATH + "fold_index.npy", fold_index) 120 | 121 | save_x_y(fold_index, train_x, train_y) 122 | print("save train_x_y done!") 123 | 124 | fold_index = np.load(config.DATA_PATH + "fold_index.npy") 125 | save_i(fold_index) 126 | print("save index done!") 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /2. glider/detect_global_interactions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../1. 
madex") 4 | from sampling_and_inference import generate_perturbation_dataset_autoint 5 | from neural_interaction_detection import detect_interactions 6 | import os 7 | import logging 8 | from tqdm import tqdm 9 | import warnings 10 | import pickle 11 | import numpy as np 12 | import argparse 13 | import torch 14 | import torch.optim as optim 15 | from utils.global_interaction_utils import * 16 | import torch.multiprocessing as multiprocessing 17 | 18 | 19 | warnings.simplefilter("ignore") 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--global_size", type=int, default=1000) 23 | parser.add_argument("--num_perturbation", type=int, default=6000) 24 | parser.add_argument( 25 | "--save_path", 26 | type=str, 27 | default="/meladyfs/newyork/mtsang/AutoInt/test_code/Criteo/b3h2_dnn_dropkeep1_400x2_5trials_v2/1/", 28 | ) 29 | parser.add_argument("--data", type=str, help="data name", default="criteo") 30 | parser.add_argument("--save_id", type=str, help="save id", default="testpar2") 31 | parser.add_argument( 32 | "--data_path", type=str, help="root path for all the data", default="data/autoint" 33 | ) 34 | parser.add_argument("--epochs", type=int, help="num epochs", default=100) 35 | parser.add_argument("--es", type=int, help="enable early stopping", default=1) 36 | parser.add_argument("--l1", type=float, help="set l1 reg constant", default=1e-4) 37 | parser.add_argument("--lr", type=float, help="learning rate", default=0.01) 38 | parser.add_argument("--opt", type=str, help="optimizer", default="adam") 39 | parser.add_argument( 40 | "--par_batch_size", 41 | type=int, 42 | help="size of parallel batch (same as num parallel processes)", 43 | default=32, 44 | ) 45 | parser.add_argument("--add_linear", type=int, help="contain main effects in interaction detector via linear regression", default=0) 46 | parser.add_argument("--detector", type=str, help="detector: NID or GradientNID", default="NID") 47 | parser.add_argument("--gpu", type=int, help="gpu number", default=0) 48 | 49 | args = parser.parse_args() 50 | par_batch_size = args.par_batch_size 51 | if args.opt == "adagrad": 52 | opt = optim.Adagrad 53 | elif args.opt == "adam": 54 | opt = optim.Adam 55 | else: 56 | raise ValueError("invalid optimizer") 57 | 58 | # device = torch.device("cuda:" + str(args.gpu)) 59 | 60 | 61 | def par_experiment(idx, perturbations): 62 | feats = perturbations["feats"] 63 | labels = perturbations["targets"] 64 | 65 | # distributes processes across two gpus 66 | device = torch.device("cuda:" + str(idx%2)) 67 | 68 | try: 69 | inters, mlp_loss = detect_interactions( 70 | feats, 71 | labels, 72 | arch=[256, 128, 64], 73 | nepochs=args.epochs, 74 | early_stopping=args.es, 75 | patience=5, 76 | l1_const=args.l1, 77 | learning_rate=args.lr, 78 | opt_func=opt, 79 | add_linear=args.add_linear, 80 | detector=args.detector, 81 | seed=42, 82 | verbose=False, 83 | device=device, 84 | ) 85 | print("mlp loss", mlp_loss) 86 | result = {"inters": inters, "mlp_loss": mlp_loss} 87 | except: 88 | print("error in learning mlp for interaction detection") 89 | result = None 90 | 91 | return idx, result 92 | 93 | 94 | def run(): 95 | multiprocessing.set_start_method("spawn", force=True) 96 | 97 | # this data is shuffled. 
other datasets must be shuffled for global interaction detection 98 | model, data = get_autoint_and_data( 99 | data_path=args.data_path, dataset=args.data, save_path=args.save_path 100 | ) 101 | 102 | dense_feat_indices = [] 103 | sparse_feat_indices = [] 104 | for i in tqdm(range(data["Xi"].shape[1])): 105 | uniq = np.unique(data["Xi"][:, i]) 106 | if len(uniq) == 1 and not args.data == "avazu": 107 | dense_feat_indices.append(i) 108 | else: 109 | sparse_feat_indices.append(i) 110 | 111 | print("dense feature indices", dense_feat_indices) 112 | 113 | save_postfix = "_" + args.save_id if args.save_id else "" 114 | 115 | base_path = "experiments/detected_interactions/" 116 | pkl_path = ( 117 | base_path 118 | + "detected_interactions_" 119 | + args.data.lower() 120 | + save_postfix 121 | + ".pickle" 122 | ) 123 | if os.path.exists(pkl_path): 124 | with open(pkl_path, "rb") as handle: 125 | interaction_results = pickle.load(handle) 126 | print("loaded existing results. starting from index", len(interaction_results)) 127 | else: 128 | if not os.path.exists(base_path): 129 | os.makedirs(base_path) 130 | interaction_results = [] 131 | 132 | indexes = list(range(len(interaction_results), args.global_size)) 133 | num_par_batches = int(np.ceil(len(indexes) / par_batch_size)) 134 | 135 | for b in tqdm(range(num_par_batches)): 136 | index_batch = indexes[b * par_batch_size : (b + 1) * par_batch_size] 137 | perturbation_batch = [] 138 | for idx in index_batch: 139 | 140 | data_inst = { 141 | "Xi": data["Xi"][idx], 142 | "Xv": data["Xv"][idx], 143 | "means": data["means"], 144 | } 145 | feats, targets = generate_perturbation_dataset_autoint( 146 | data_inst, 147 | model, 148 | dense_feat_indices, 149 | sparse_feat_indices, 150 | num_samples=args.num_perturbation, 151 | valid_size=500, 152 | test_size=500, 153 | seed=idx, 154 | ) 155 | perturbation_batch.append({"feats": feats, "targets": targets}) 156 | 157 | with multiprocessing.Pool(processes=par_batch_size) as pool: 158 | results_batch = pool.starmap( 159 | par_experiment, zip(index_batch, perturbation_batch) 160 | ) 161 | 162 | results_batch.sort(key=lambda x: x[0]) 163 | 164 | for _, result in results_batch: 165 | interaction_results.append(result) 166 | 167 | with open(pkl_path, "wb") as handle: 168 | pickle.dump(interaction_results, handle) 169 | 170 | 171 | if __name__ == "__main__": 172 | run() 173 | -------------------------------------------------------------------------------- /2. 
glider/make_cross_feature_data.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from tqdm import tqdm 3 | import numpy as np 4 | from sklearn.preprocessing import LabelEncoder 5 | import warnings 6 | import os 7 | import pandas as pd 8 | import argparse 9 | 10 | warnings.simplefilter("ignore") 11 | 12 | from utils.cross_feature_utils import * 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | "--data_file", 18 | type=str, 19 | help="the path where global interaction results are saved", 20 | default="experiments/detected_interactions_criteo_repr2.pickle", 21 | ) 22 | parser.add_argument("--exp", type=str, help="an experiment id", default="cross2_K20") 23 | parser.add_argument( 24 | "--K", type=int, help="the top-K threshold for global interactions", default=20 25 | ) 26 | parser.add_argument("--data", type=str, help="data name", default="criteo") 27 | parser.add_argument( 28 | "--autoint_save_path", 29 | type=str, 30 | help="folder where cross features for autoint are saved", 31 | default="data/autoint/criteo", 32 | ) 33 | parser.add_argument( 34 | "--deepctr_save_path", 35 | type=str, 36 | help="folder where cross features for deepctr are saved", 37 | default="data/deepctr/criteo", 38 | ) 39 | parser.add_argument( 40 | "--bs", type=int, help="batch size of training data", default=1000000 41 | ) 42 | parser.add_argument("--nbins", type=int, help="num bins", default=100) 43 | parser.add_argument( 44 | "--nprocs", type=int, help="number of parallel processes", default=20 45 | ) 46 | parser.add_argument( 47 | "--thresh", 48 | type=float, 49 | help="min pct of training batch to require cross feature ids to appear", 50 | default=0.0001, 51 | ) 52 | parser.add_argument( 53 | "--top_k", 54 | type=int, 55 | help="k threshold for madex interactions", 56 | default=100, 57 | ) 58 | parser.add_argument( 59 | "--save_base_data", 60 | type=str2bool, 61 | help="y/n: save (baseline) data without cross features for deepctr", 62 | nargs="?", 63 | const=True, 64 | default=True, 65 | ) 66 | parser.add_argument( 67 | "--prune", 68 | type=str2bool, 69 | help="prune interaction subsets", 70 | nargs="?", 71 | const=True, 72 | default=True, 73 | ) 74 | 75 | args = parser.parse_args() 76 | 77 | interactions_file = args.data_file 78 | experiment = args.exp 79 | max_rank = args.K 80 | dataset = (args.data).lower() 81 | data_path_autoint = args.autoint_save_path 82 | data_path_deepctr = args.deepctr_save_path 83 | 84 | training_batch_size = args.bs 85 | num_bins = args.nbins 86 | num_processes = args.nprocs 87 | threshold_pct = args.thresh 88 | deepctr_save_baseline_data = args.save_base_data 89 | prune_interaction_subsets = args.prune 90 | top_k = args.top_k 91 | 92 | 93 | def make_cross_feature_data( 94 | interactions_file, 95 | max_rank, 96 | dataset, 97 | training_batch_size, 98 | data_path, 99 | num_bins, 100 | threshold, 101 | top_k, 102 | prune_subsets, 103 | num_processes, 104 | ): 105 | 106 | print("loading autoint data") 107 | data = load_data_autoint(dataset, data_path) 108 | Xi, Xv, y, lens = merge_data(data) 109 | Xi_batch, Xv_batch, y_batch = get_training_batch(data, size=training_batch_size) 110 | dense_feat_indices, sparse_feat_indices = get_dense_sparse_feat_indices( 111 | Xi_batch, dataset 112 | ) 113 | 114 | # print("dense feature indices", dense_feat_indices) 115 | 116 | if dataset == "avazu": 117 | num_sparse, num_dense = 23, 0 118 | elif dataset == "criteo": 119 | num_sparse, num_dense = 26, 13 120 | else: 121 
| raise ValueError("Invalid dataset") 122 | 123 | assert num_dense == len(dense_feat_indices) 124 | assert num_sparse == len(sparse_feat_indices) 125 | 126 | num_feats = len(sparse_feat_indices) + len(dense_feat_indices) 127 | 128 | print("loading interactions") 129 | inters = load_global_interactions( 130 | interactions_file, num_feats, max_rank, prune_subsets, top_k 131 | ) 132 | 133 | print("discretizing dense features") 134 | sparsified_data = discretize_dense_features( 135 | Xi, 136 | Xv, 137 | Xv_batch, 138 | dense_feat_indices, 139 | sparse_feat_indices, 140 | num_feats, 141 | num_bins, 142 | num_processes=num_processes, 143 | ) 144 | 145 | train_start = sum(lens[0:2]) 146 | sparsified_batch = sparsified_data[train_start : train_start + training_batch_size] 147 | 148 | print("crossing sparse features") 149 | cross_feats = cross_sparse_features( 150 | inters, 151 | sparsified_data, 152 | sparsified_batch, 153 | Xi_batch, 154 | threshold, 155 | num_processes=num_processes, 156 | ) 157 | Xi, Xv, Xi_cross, Xv_cross = get_X_cross( 158 | inters, cross_feats, Xi, Xv, sparse_feat_indices 159 | ) 160 | 161 | return ( 162 | Xi, 163 | Xv, 164 | y, 165 | Xi_cross, 166 | Xv_cross, 167 | lens, 168 | max_rank, 169 | num_feats, 170 | dense_feat_indices, 171 | sparse_feat_indices, 172 | ) 173 | 174 | 175 | def save_cross_feats_autoint(Xi_cross, Xv_cross, lens, experiment, data_path): 176 | print("saving data for autoint") 177 | 178 | cross_name = ["i_cross.npy", "x_cross.npy"] 179 | 180 | prev_len = 0 181 | for i in tqdm(range(1, 11)): 182 | cur_len = prev_len + lens[i - 1] 183 | Xi_seg = Xi_cross[prev_len:cur_len] 184 | Xv_seg = Xv_cross[prev_len:cur_len] 185 | 186 | folder_path = data_path + "/part" + str(i) + "/" + experiment 187 | if not os.path.exists(folder_path): 188 | os.makedirs(folder_path) 189 | 190 | np.save(folder_path + "/" + cross_name[0], Xi_seg) 191 | np.save(folder_path + "/" + cross_name[1], Xv_seg) 192 | 193 | prev_len = cur_len 194 | 195 | feature_size = int(Xi_cross.max() + 1) 196 | # print("feature_size = %d" % feature_size) 197 | 198 | folder_path2 = data_path + "/" + experiment 199 | 200 | if not os.path.exists(folder_path2): 201 | os.makedirs(folder_path2) 202 | np.save(folder_path2 + "/feature_size.npy", np.array([feature_size])) 203 | 204 | 205 | def save_cross_feats_deepctr( 206 | Xi, 207 | Xv, 208 | y, 209 | Xi_cross, 210 | Xv_cross, 211 | dense_feat_indices, 212 | sparse_feat_indices, 213 | lens, 214 | experiment, 215 | data_path, 216 | deepctr_save_baseline_data, 217 | ): 218 | print("saving data for deepctr") 219 | num_dense = len(dense_feat_indices) 220 | num_sparse = len(sparse_feat_indices) 221 | 222 | sparse_features = ["C" + str(i) for i in range(1, num_sparse + 1)] 223 | dense_features = ["I" + str(i) for i in range(1, num_dense + 1)] 224 | cross_features = ["G" + str(i) for i in range(1, max_rank + 1)] 225 | target = ["label"] 226 | 227 | settings = ["cross"] 228 | if deepctr_save_baseline_data: 229 | settings.append("baseline") 230 | 231 | n_unique_dict = dict() 232 | 233 | for setting in settings: 234 | 235 | if setting == "baseline": 236 | sparse_indices = sparse_feat_indices 237 | Xi_sparse_na = np.where( 238 | Xv[:, sparse_indices] == 1, Xi[:, sparse_indices], -1 239 | ) 240 | data_np = np.concatenate( 241 | [np.expand_dims(y, axis=1), Xv[:, dense_feat_indices], Xi_sparse_na], 242 | axis=1, 243 | ) 244 | df = pd.DataFrame( 245 | data_np, columns=target + dense_features + sparse_features 246 | ) 247 | temp_feats = sparse_features 248 | save_path = 
data_path 249 | postfix = "" 250 | else: 251 | sparse_indices = list(range(max_rank)) 252 | Xi_sparse_na = np.where( 253 | Xv_cross[:, sparse_indices] == 1, Xi_cross[:, sparse_indices], -1 254 | ) 255 | data_np = Xi_sparse_na 256 | df = pd.DataFrame(data_np, columns=cross_features) 257 | temp_feats = cross_features 258 | save_path = data_path + "/" + experiment 259 | postfix = "_" + str(max_rank) 260 | 261 | if not os.path.exists(save_path): 262 | os.makedirs(save_path) 263 | 264 | for feat in tqdm(temp_feats): 265 | lbe = LabelEncoder() 266 | df[feat] = lbe.fit_transform( 267 | df[feat] 268 | ) # global label encoding consistent with autoint's data preprocessing 269 | 270 | train = df[sum(lens[0:2]) :] 271 | valid = df[lens[0] : sum(lens[0:2])] 272 | test = df[0 : lens[0]] 273 | 274 | train.to_hdf( 275 | save_path + "/" + setting + postfix + ".h5", 276 | key="train", 277 | format="table", 278 | mode="w", 279 | ) 280 | valid.to_hdf( 281 | save_path + "/" + setting + postfix + ".h5", key="valid", format="table" 282 | ) 283 | test.to_hdf( 284 | save_path + "/" + setting + postfix + ".h5", key="test", format="table" 285 | ) 286 | 287 | for feat in tqdm(temp_feats): 288 | n_unique_dict[feat] = df[feat].nunique() 289 | 290 | n_unique_cross_dict = dict() 291 | for feat in cross_features: 292 | n_unique_cross_dict[feat] = n_unique_dict[feat] 293 | 294 | if deepctr_save_baseline_data: 295 | n_unique_baseline_dict = dict() 296 | for feat in sparse_features: 297 | n_unique_baseline_dict[feat] = n_unique_dict[feat] 298 | with open(data_path + "/n_unique_dict_baseline.pickle", "wb") as handle: 299 | pickle.dump( 300 | n_unique_baseline_dict, handle, protocol=pickle.HIGHEST_PROTOCOL 301 | ) 302 | 303 | with open( 304 | data_path + "/" + experiment + "/n_unique_dict_cross.pickle", "wb" 305 | ) as handle: 306 | pickle.dump(n_unique_cross_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) 307 | 308 | 309 | if __name__ == "__main__": 310 | print("warning: this process may take several hours and significant RAM (>150GB)") 311 | 312 | Xi, Xv, y, Xi_cross, Xv_cross, lens, max_rank, num_feats, dense_feat_indices, sparse_feat_indices = make_cross_feature_data( 313 | interactions_file, 314 | max_rank, 315 | dataset, 316 | training_batch_size, 317 | data_path_autoint, 318 | num_bins, 319 | threshold_pct, 320 | top_k, 321 | prune_interaction_subsets, 322 | num_processes, 323 | ) 324 | 325 | save_cross_feats_autoint(Xi_cross, Xv_cross, lens, experiment, data_path_autoint) 326 | save_cross_feats_deepctr( 327 | Xi, 328 | Xv, 329 | y, 330 | Xi_cross, 331 | Xv_cross, 332 | dense_feat_indices, 333 | sparse_feat_indices, 334 | lens, 335 | experiment, 336 | data_path_deepctr, 337 | deepctr_save_baseline_data, 338 | ) 339 | -------------------------------------------------------------------------------- /2. glider/models/autoint/README.md: -------------------------------------------------------------------------------- 1 | # AutoInt 2 | 3 | This is a TensorFlow implementation of ***AutoInt*** for the CTR prediction task, as described in our paper: 4 | 5 | Weiping Song, Chence Shi, Zhiping Xiao, Zhijian Duan, Yewen Xu, Ming Zhang and Jian Tang. [AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks](https://arxiv.org/pdf/1810.11921.pdf). arXiv preprint arXiv:1810.11921, 2018.
6 | 7 | ## Requirements: 8 | * **Tensorflow 1.4.0-rc1** 9 | * Python 3 10 | * CUDA 8.0+ (For GPU) 11 | 12 | ## Introduction 13 | 14 | AutoInt: An effective and efficient algorithm to 15 | automatically learn high-order feature interactions for (sparse) categorical and numerical features. 16 | 17 |
18 | 19 |
20 | The illustration of AutoInt. We first project all sparse features 21 | (both categorical and numerical features) into the low-dimensional space. Next, we feed embeddings of all fields into stacked multiple interacting layers implemented by self-attentive neural network. The output of the final interacting layer is the low-dimensional representation of learnt combinatorial features, which is further used for estimating the CTR via sigmoid function. 22 | 23 | ## Usage 24 | ### Input Format 25 | AutoInt requires the input data in the following format: 26 | * train_x: matrix with shape *(num_sample, num_field)*. train_x[s][t] is the feature value of feature field t of sample s in the dataset. The default value for categorical feature is 1. 27 | * train_i: matrix with shape *(num_sample, num_field)*. train_i[s][t] is the feature index of feature field t of sample s in the dataset. The maximal value of train_i is the feature size. 28 | * train_y: label of each sample in the dataset. 29 | 30 | If you want to know how to preprocess the data, please refer to `./Dataprocess/Criteo/preprocess.py` 31 | 32 | ### Example 33 | We use four public real-world datasets(Avazu, Criteo, KDD12, MovieLens-1M) in our experiments. Since the first three datasets are super huge, they can not be fit into the memory as a whole. In our implementation, we split the whole dataset into 10 parts and we use the first file as test set and the second file as valid set. We provide the codes for preprocessing these three datasets in `./Dataprocess`. If you want to reuse these codes, you should first run `preprocess.py` to generate `train_x.txt, train_i.txt, train_y.txt` as described in `Input Format`. Then you should run `./Dataprocesss/Kfold_split/StratifiedKfold.py` to split the whole dataset into ten folds. Finally you can run `scale.py` to scale the numerical value(optional). 34 | 35 | To help test the correctness of the code and familarize yourself with the code, we upload the first `10000` samples of `Criteo` dataset in `train_examples.txt`. And we provide the scripts for preprocessing and training.(Please refer to ` sample_preprocess.sh` and `test_code.sh`, you may need to modify the path in `config.py` and `test_code.sh`). 36 | 37 | After you run the `test_code.sh`, you should get a folder named `Criteo` which contains `part*, feature_size.npy, fold_index.npy, train_*.txt`. `feature_size.npy` contains the number of total features which will be used to initialize the model. `train_*.txt` is the whole dataset. If you use other small dataset, say `MovieLens-1M`, you only need to modify the function `_run_` in `train.py`. 38 | 39 | Here's how to run the preprocessing. 40 | ``` 41 | mkdir Criteo 42 | python ./Dataprocess/Criteo/preprocess.py 43 | python ./Dataprocess/Kfold_split/stratifiedKfold.py 44 | python ./Dataprocess/Criteo/scale.py 45 | ``` 46 | 47 | Here's how to run the training. 48 | ``` 49 | python -u train.py \ 50 | --data "Criteo" --blocks 3 --heads 2 --block_shape "[64, 64, 64]" \ 51 | --is_save "True" --save_path "./test_code/Criteo/b3h2_64x64x64/" \ 52 | --field_size 39 --run_times 1 --data_path "./" \ 53 | --epoch 3 --has_residual "True" --has_wide "False" \ 54 | --batch_size 1024 \ 55 | > test_code_single.out & 56 | ``` 57 | 58 | You should see output like this: 59 | 60 | ``` 61 | ... 62 | train logs 63 | ... 64 | start testing!... 
65 | restored from ./test_code/Criteo/b3h2_dnn_dropkeep1_400x2/1/ 66 | test-result = 0.8088, test-logloss = 0.4430 67 | test_auc [0.8088305055534442] 68 | test_log_loss [0.44297631300399626] 69 | avg_auc 0.8088305055534442 70 | avg_log_loss 0.44297631300399626 71 | ``` 72 | 73 | ## Citation 74 | If you find AutoInt useful for your research, please consider citing the following paper: 75 | ``` 76 | @article{weiping2018autoint, 77 | title={AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks}, 78 | author={Weiping, Song and Chence, Shi and Zhiping, Xiao and Zhijian, Duan and Yewen, Xu and Ming, Zhang and Jian, Tang}, 79 | journal={arXiv preprint arXiv:1810.11921}, 80 | year={2018} 81 | } 82 | ``` 83 | 84 | 85 | ## Contact information 86 | If you have questions related to the code, feel free to contact Weiping Song (`songweiping@pku.edu.cn`), Chence Shi (`chenceshi@pku.edu.cn`) and Zhijian Duan (`zjduan@pku.edu.cn`). 87 | 88 | ## License 89 | MIT 90 | 91 | ## Acknowledgement 92 | This implementation gets inspirations from Kyubyong Park's [transformer](https://github.com/Kyubyong/transformer) and Chenglong Chen' [DeepFM](https://github.com/ChenglongChen/tensorflow-DeepFM). 93 | -------------------------------------------------------------------------------- /2. glider/models/autoint/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tensorflow implementation of AutoInt described in: 3 | AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks. 4 | author: Chence Shi 5 | email: chenceshi@pku.edu.cn 6 | """ 7 | 8 | import os 9 | import numpy as np 10 | import tensorflow as tf 11 | from time import time 12 | from sklearn.base import BaseEstimator, TransformerMixin 13 | from sklearn.metrics import roc_auc_score, log_loss 14 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 15 | 16 | 17 | """ 18 | The following two functions are adapted from kyubyong park's implementation of transformer 19 | We slightly modify the code to make it suitable for our work.(add relu, delete key masking and causality mask) 20 | June 2017 by kyubyong park. 21 | kbpark.linguist@gmail.com. 
22 | https://www.github.com/kyubyong/transformer 23 | """ 24 | 25 | 26 | def normalize(inputs, epsilon=1e-8): 27 | """ 28 | Applies layer normalization 29 | Args: 30 | inputs: A tensor with 2 or more dimensions 31 | epsilon: A floating number to prevent Zero Division 32 | Returns: 33 | A tensor with the same shape and data dtype 34 | """ 35 | inputs_shape = inputs.get_shape() 36 | params_shape = inputs_shape[-1:] 37 | 38 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 39 | beta = tf.Variable(tf.zeros(params_shape)) 40 | gamma = tf.Variable(tf.ones(params_shape)) 41 | normalized = (inputs - mean) / ((variance + epsilon) ** (0.5)) 42 | outputs = gamma * normalized + beta 43 | 44 | return outputs 45 | 46 | 47 | def multihead_attention( 48 | queries, 49 | keys, 50 | values, 51 | num_units=None, 52 | num_heads=1, 53 | dropout_keep_prob=1, 54 | is_training=True, 55 | has_residual=True, 56 | ): 57 | 58 | if num_units is None: 59 | num_units = queries.get_shape().as_list[-1] 60 | 61 | # Linear projections 62 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) 63 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) 64 | V = tf.layers.dense(values, num_units, activation=tf.nn.relu) 65 | if has_residual: 66 | V_res = tf.layers.dense(values, num_units, activation=tf.nn.relu) 67 | 68 | # Split and concat 69 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) 70 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) 71 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) 72 | 73 | # Multiplication 74 | weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) 75 | 76 | # Scale 77 | weights = weights / (K_.get_shape().as_list()[-1] ** 0.5) 78 | 79 | # Activation 80 | weights = tf.nn.softmax(weights) 81 | 82 | # Dropouts 83 | weights = tf.layers.dropout( 84 | weights, rate=1 - dropout_keep_prob, training=tf.convert_to_tensor(is_training) 85 | ) 86 | 87 | # Weighted sum 88 | outputs = tf.matmul(weights, V_) 89 | 90 | # Restore shape 91 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) 92 | 93 | # Residual connection 94 | if has_residual: 95 | outputs += V_res 96 | 97 | outputs = tf.nn.relu(outputs) 98 | # Normalize 99 | outputs = normalize(outputs) 100 | 101 | return outputs 102 | 103 | 104 | class AutoInt: 105 | def __init__(self, args, feature_size, run_cnt): 106 | 107 | self.feature_size = ( 108 | feature_size 109 | ) # denote as n, dimension of concatenated features 110 | self.field_size = args.field_size # denote as M, number of total feature fields 111 | self.embedding_size = ( 112 | args.embedding_size 113 | ) # denote as d, size of the feature embedding 114 | self.blocks = args.blocks # number of the blocks 115 | self.heads = args.heads # number of the heads 116 | self.block_shape = args.block_shape 117 | self.output_size = args.block_shape[-1] 118 | self.has_residual = args.has_residual 119 | self.has_wide = args.has_wide # whether to add wide part 120 | self.deep_layers = ( 121 | args.deep_layers 122 | ) # whether to joint train with deep networks as described in paper 123 | 124 | self.batch_norm = args.batch_norm 125 | self.batch_norm_decay = args.batch_norm_decay 126 | self.drop_keep_prob = args.dropout_keep_prob 127 | self.l2_reg = args.l2_reg 128 | self.epoch = args.epoch 129 | self.batch_size = args.batch_size 130 | self.learning_rate = args.learning_rate 131 | self.learning_rate_wide = args.learning_rate_wide 132 | self.optimizer_type = args.optimizer_type 133 | 134 | self.save_path = args.save_path + str(run_cnt) + "/" 135 | 
self.is_save = args.is_save 136 | if args.is_save == True and os.path.exists(self.save_path) == False: 137 | os.makedirs(self.save_path) 138 | 139 | self.verbose = args.verbose 140 | self.random_seed = args.random_seed 141 | self.loss_type = args.loss_type 142 | self.eval_metric = roc_auc_score 143 | self.best_loss = 1.0 144 | self.greater_is_better = args.greater_is_better 145 | self.train_result, self.valid_result = [], [] 146 | self.train_loss, self.valid_loss = [], [] 147 | 148 | self._init_graph() 149 | 150 | def _init_graph(self): 151 | self.graph = tf.Graph() 152 | with self.graph.as_default(): 153 | 154 | tf.set_random_seed(self.random_seed) 155 | 156 | self.feat_index = tf.placeholder( 157 | tf.int32, shape=[None, None], name="feat_index" 158 | ) # None * M # M is num features 159 | self.feat_value = tf.placeholder( 160 | tf.float32, shape=[None, None], name="feat_value" 161 | ) # None * M 162 | self.label = tf.placeholder( 163 | tf.float32, shape=[None, 1], name="label" 164 | ) # None * 1 165 | # In our implementation, the shape of dropout_keep_prob is [3], used in 3 different parts. 166 | self.dropout_keep_prob = tf.placeholder( 167 | tf.float32, shape=[None], name="dropout_keep_prob" 168 | ) 169 | self.train_phase = tf.placeholder(tf.bool, name="train_phase") 170 | 171 | self.weights = self._initialize_weights() 172 | 173 | # model 174 | self.embeddings = tf.nn.embedding_lookup( 175 | self.weights["feature_embeddings"], self.feat_index 176 | ) # None * M * d # num * emb dim 177 | feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1]) 178 | self.embeddings = tf.multiply(self.embeddings, feat_value) # None * M * d 179 | self.embeddings = tf.nn.dropout( 180 | self.embeddings, self.dropout_keep_prob[1] 181 | ) # None * M * d 182 | if self.has_wide: 183 | self.y_first_order = tf.nn.embedding_lookup( 184 | self.weights["feature_bias"], self.feat_index 185 | ) # None * M * 1 186 | self.y_first_order = tf.reduce_sum( 187 | tf.multiply(self.y_first_order, feat_value), 1 188 | ) # None * 1 189 | 190 | # joint training with feedforward nn 191 | if self.deep_layers != None: 192 | self.y_dense = tf.reshape( 193 | self.embeddings, shape=[-1, self.field_size * self.embedding_size] 194 | ) 195 | for i in range(0, len(self.deep_layers)): 196 | self.y_dense = tf.add( 197 | tf.matmul(self.y_dense, self.weights["layer_%d" % i]), 198 | self.weights["bias_%d" % i], 199 | ) # None * layer[i] 200 | if self.batch_norm: 201 | self.y_dense = self.batch_norm_layer( 202 | self.y_dense, 203 | train_phase=self.train_phase, 204 | scope_bn="bn_%d" % i, 205 | ) 206 | self.y_dense = tf.nn.relu(self.y_dense) 207 | self.y_dense = tf.nn.dropout( 208 | self.y_dense, self.dropout_keep_prob[2] 209 | ) 210 | self.y_dense = tf.add( 211 | tf.matmul(self.y_dense, self.weights["prediction_dense"]), 212 | self.weights["prediction_bias_dense"], 213 | name="logits_dense", 214 | ) # None * 1 215 | 216 | # ---------- main part of AutoInt------------------- 217 | self.y_deep = self.embeddings # None * M * d 218 | for i in range(self.blocks): 219 | self.y_deep = multihead_attention( 220 | queries=self.y_deep, 221 | keys=self.y_deep, 222 | values=self.y_deep, 223 | num_units=self.block_shape[i], 224 | num_heads=self.heads, 225 | dropout_keep_prob=self.dropout_keep_prob[0], 226 | is_training=self.train_phase, 227 | has_residual=self.has_residual, 228 | ) 229 | 230 | self.flat = tf.reshape( 231 | self.y_deep, shape=[-1, self.output_size * self.field_size] 232 | ) 233 | 234 | self.out = tf.add( 235 | 
tf.matmul(self.flat, self.weights["prediction"]), 236 | self.weights["prediction_bias"], 237 | name="logits", 238 | ) # None * 1 239 | 240 | if self.has_wide: 241 | self.out += self.y_first_order 242 | 243 | if self.deep_layers != None: 244 | self.out += self.y_dense 245 | 246 | # ---------- Compute the loss ---------- 247 | # loss 248 | if self.loss_type == "logloss": 249 | self.out = tf.nn.sigmoid(self.out, name="pred") 250 | self.loss = tf.losses.log_loss(self.label, self.out) 251 | elif self.loss_type == "mse": 252 | self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out)) 253 | 254 | # l2 regularization on weights 255 | if self.l2_reg > 0: 256 | if self.deep_layers != None: 257 | for i in range(len(self.deep_layers)): 258 | self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)( 259 | self.weights["layer_%d" % i] 260 | ) 261 | 262 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 263 | self.var1 = [ 264 | v for v in tf.trainable_variables() if v.name != "feature_bias:0" 265 | ] 266 | self.var2 = [tf.trainable_variables()[1]] # self.var2 = [feature_bias] 267 | # optimizer 268 | # here we should use two different optimizer for wide and deep model(if we add wide part). 269 | if self.optimizer_type == "adam": 270 | if self.has_wide: 271 | optimizer1 = tf.train.AdamOptimizer( 272 | learning_rate=self.learning_rate, 273 | beta1=0.9, 274 | beta2=0.999, 275 | epsilon=1e-8, 276 | ) 277 | optimizer2 = tf.train.GradientDescentOptimizer( 278 | learning_rate=self.learning_rate_wide 279 | ) 280 | # minimize(self.loss, global_step=self.global_step) 281 | var_list1 = self.var1 282 | var_list2 = self.var2 283 | grads = tf.gradients(self.loss, var_list1 + var_list2) 284 | grads1 = grads[: len(var_list1)] 285 | grads2 = grads[len(var_list1) :] 286 | train_op1 = optimizer1.apply_gradients( 287 | zip(grads1, var_list1), global_step=self.global_step 288 | ) 289 | train_op2 = optimizer2.apply_gradients(zip(grads2, var_list2)) 290 | self.optimizer = tf.group(train_op1, train_op2) 291 | else: 292 | self.optimizer = tf.train.AdamOptimizer( 293 | learning_rate=self.learning_rate, 294 | beta1=0.9, 295 | beta2=0.999, 296 | epsilon=1e-8, 297 | ).minimize(self.loss, global_step=self.global_step) 298 | elif self.optimizer_type == "adagrad": 299 | self.optimizer = tf.train.AdagradOptimizer( 300 | learning_rate=self.learning_rate, initial_accumulator_value=1e-8 301 | ).minimize(self.loss) 302 | elif self.optimizer_type == "gd": 303 | self.optimizer = tf.train.GradientDescentOptimizer( 304 | learning_rate=self.learning_rate 305 | ).minimize(self.loss) 306 | elif self.optimizer_type == "momentum": 307 | self.optimizer = tf.train.MomentumOptimizer( 308 | learning_rate=self.learning_rate, momentum=0.95 309 | ).minimize(self.loss) 310 | 311 | # init 312 | self.saver = tf.train.Saver(max_to_keep=5) 313 | init = tf.global_variables_initializer() 314 | self.sess = self._init_session() 315 | self.sess.run(init) 316 | self.count_param() 317 | 318 | def count_param(self): 319 | k = np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()]) 320 | 321 | # print(tf.trainable_variables()) 322 | print("total parameters :%d" % k) 323 | print("extra parameters : %d" % (k - self.feature_size * self.embedding_size)) 324 | 325 | def _init_session(self): 326 | config = tf.ConfigProto(allow_soft_placement=True) 327 | config.gpu_options.allow_growth = True 328 | return tf.Session(config=config) 329 | 330 | def _initialize_weights(self): 331 | weights = dict() 332 | 333 | # embeddings 334 | 
weights["feature_embeddings"] = tf.Variable( 335 | tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01), 336 | name="feature_embeddings", 337 | ) # feature_size(n) * d 338 | 339 | if self.has_wide: 340 | weights["feature_bias"] = tf.Variable( 341 | tf.random_normal([self.feature_size, 1], 0.0, 0.001), 342 | name="feature_bias", 343 | ) # feature_size(n) * 1 344 | input_size = self.output_size * self.field_size 345 | 346 | # dense layers 347 | if self.deep_layers != None: 348 | num_layer = len(self.deep_layers) 349 | layer0_size = self.field_size * self.embedding_size 350 | glorot = np.sqrt(2.0 / (layer0_size + self.deep_layers[0])) 351 | weights["layer_0"] = tf.Variable( 352 | np.random.normal( 353 | loc=0, scale=glorot, size=(layer0_size, self.deep_layers[0]) 354 | ), 355 | dtype=np.float32, 356 | ) 357 | weights["bias_0"] = tf.Variable( 358 | np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])), 359 | dtype=np.float32, 360 | ) # 1 * layers[0] 361 | for i in range(1, num_layer): 362 | glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i])) 363 | weights["layer_%d" % i] = tf.Variable( 364 | np.random.normal( 365 | loc=0, 366 | scale=glorot, 367 | size=(self.deep_layers[i - 1], self.deep_layers[i]), 368 | ), 369 | dtype=np.float32, 370 | ) # layers[i-1] * layers[i] 371 | weights["bias_%d" % i] = tf.Variable( 372 | np.random.normal( 373 | loc=0, scale=glorot, size=(1, self.deep_layers[i]) 374 | ), 375 | dtype=np.float32, 376 | ) # 1 * layer[i] 377 | glorot = np.sqrt(2.0 / (self.deep_layers[-1] + 1)) 378 | weights["prediction_dense"] = tf.Variable( 379 | np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[-1], 1)), 380 | dtype=np.float32, 381 | name="prediction_dense", 382 | ) 383 | weights["prediction_bias_dense"] = tf.Variable( 384 | np.random.normal(), dtype=np.float32, name="prediction_bias_dense" 385 | ) 386 | 387 | # ---------- prediciton weight ------------------# 388 | glorot = np.sqrt(2.0 / (input_size + 1)) 389 | weights["prediction"] = tf.Variable( 390 | np.random.normal(loc=0, scale=glorot, size=(input_size, 1)), 391 | dtype=np.float32, 392 | name="prediction", 393 | ) 394 | weights["prediction_bias"] = tf.Variable( 395 | np.random.normal(), dtype=np.float32, name="prediction_bias" 396 | ) 397 | 398 | return weights 399 | 400 | def batch_norm_layer(self, x, train_phase, scope_bn): 401 | bn_train = batch_norm( 402 | x, 403 | decay=self.batch_norm_decay, 404 | center=True, 405 | scale=True, 406 | updates_collections=None, 407 | is_training=True, 408 | reuse=None, 409 | trainable=True, 410 | scope=scope_bn, 411 | ) 412 | bn_inference = batch_norm( 413 | x, 414 | decay=self.batch_norm_decay, 415 | center=True, 416 | scale=True, 417 | updates_collections=None, 418 | is_training=False, 419 | reuse=True, 420 | trainable=True, 421 | scope=scope_bn, 422 | ) 423 | z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference) 424 | return z 425 | 426 | def get_batch(self, Xi, Xv, y, batch_size, index): 427 | start = index * batch_size 428 | end = (index + 1) * batch_size 429 | end = end if end < len(y) else len(y) 430 | return Xi[start:end], Xv[start:end], [[y_] for y_ in y[start:end]] 431 | 432 | # shuffle three lists simutaneously 433 | def shuffle_in_unison_scary(self, a, b, c): 434 | rng_state = np.random.get_state() 435 | np.random.shuffle(a) 436 | np.random.set_state(rng_state) 437 | np.random.shuffle(b) 438 | np.random.set_state(rng_state) 439 | np.random.shuffle(c) 440 | 441 | def fit_on_batch(self, Xi, Xv, y): 442 | 
feed_dict = { 443 | self.feat_index: Xi, 444 | self.feat_value: Xv, 445 | self.label: y, 446 | self.dropout_keep_prob: self.drop_keep_prob, 447 | self.train_phase: True, 448 | } 449 | step, loss, opt = self.sess.run( 450 | (self.global_step, self.loss, self.optimizer), feed_dict=feed_dict 451 | ) 452 | return step, loss 453 | 454 | # Since the train data is very large, they can not be fit into the memory at the same time. 455 | # We separate the whole train data into several files and call "fit_once" for each file. 456 | def fit_once( 457 | self, 458 | Xi_train, 459 | Xv_train, 460 | y_train, 461 | epoch, 462 | file_count, 463 | Xi_valid=None, 464 | Xv_valid=None, 465 | y_valid=None, 466 | early_stopping=False, 467 | ): 468 | 469 | has_valid = Xv_valid is not None 470 | last_step = 0 471 | t1 = time() 472 | self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train) 473 | total_batch = int(len(y_train) / self.batch_size) 474 | for i in range(total_batch): 475 | Xi_batch, Xv_batch, y_batch = self.get_batch( 476 | Xi_train, Xv_train, y_train, self.batch_size, i 477 | ) 478 | step, loss = self.fit_on_batch(Xi_batch, Xv_batch, y_batch) 479 | last_step = step 480 | 481 | # evaluate training and validation datasets 482 | train_result, train_loss = self.evaluate(Xi_train, Xv_train, y_train) 483 | self.train_result.append(train_result) 484 | self.train_loss.append(train_loss) 485 | if has_valid: 486 | valid_result, valid_loss = self.evaluate(Xi_valid, Xv_valid, y_valid) 487 | self.valid_result.append(valid_result) 488 | self.valid_loss.append(valid_loss) 489 | if valid_loss < self.best_loss and self.is_save == True: 490 | old_loss = self.best_loss 491 | self.best_loss = valid_loss 492 | self.saver.save( 493 | self.sess, self.save_path + "model.ckpt", global_step=last_step 494 | ) 495 | print( 496 | "[%d-%d] model saved!. 
Valid loss is improved from %.4f to %.4f" 497 | % (epoch, file_count, old_loss, self.best_loss) 498 | ) 499 | 500 | if self.verbose > 0 and ((epoch - 1) * 9 + file_count) % self.verbose == 0: 501 | if has_valid: 502 | print( 503 | "[%d-%d] train-result=%.4f, train-logloss=%.4f, valid-result=%.4f, valid-logloss=%.4f [%.1f s]" 504 | % ( 505 | epoch, 506 | file_count, 507 | train_result, 508 | train_loss, 509 | valid_result, 510 | valid_loss, 511 | time() - t1, 512 | ) 513 | ) 514 | else: 515 | print( 516 | "[%d-%d] train-result=%.4f [%.1f s]" 517 | % (epoch, file_count, train_result, time() - t1) 518 | ) 519 | if has_valid and early_stopping and self.training_termination(self.valid_loss): 520 | return False 521 | else: 522 | return True 523 | 524 | def training_termination(self, valid_result): 525 | if len(valid_result) > 5: 526 | if self.greater_is_better: 527 | if ( 528 | valid_result[-1] < valid_result[-2] 529 | and valid_result[-2] < valid_result[-3] 530 | and valid_result[-3] < valid_result[-4] 531 | and valid_result[-4] < valid_result[-5] 532 | ): 533 | return True 534 | else: 535 | if ( 536 | valid_result[-1] > valid_result[-2] 537 | and valid_result[-2] > valid_result[-3] 538 | and valid_result[-3] > valid_result[-4] 539 | and valid_result[-4] > valid_result[-5] 540 | ): 541 | return True 542 | return False 543 | 544 | def predict(self, Xi, Xv): 545 | """ 546 | :param Xi: list of list of feature indices of each sample in the dataset 547 | :param Xv: list of list of feature values of each sample in the dataset 548 | :return: predicted probability of each sample 549 | """ 550 | # dummy y 551 | dummy_y = [1] * len(Xi) 552 | batch_index = 0 553 | Xi_batch, Xv_batch, y_batch = self.get_batch( 554 | Xi, Xv, dummy_y, self.batch_size, batch_index 555 | ) 556 | y_pred = None 557 | # y_loss = None 558 | while len(Xi_batch) > 0: 559 | num_batch = len(y_batch) 560 | feed_dict = { 561 | self.feat_index: Xi_batch, 562 | self.feat_value: Xv_batch, 563 | self.label: y_batch, 564 | self.dropout_keep_prob: [1.0] * len(self.drop_keep_prob), 565 | self.train_phase: False, 566 | } 567 | batch_out = self.sess.run(self.out, feed_dict=feed_dict) 568 | 569 | if batch_index == 0: 570 | y_pred = np.reshape(batch_out, (num_batch,)) 571 | else: 572 | y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,)))) 573 | 574 | batch_index += 1 575 | Xi_batch, Xv_batch, y_batch = self.get_batch( 576 | Xi, Xv, dummy_y, self.batch_size, batch_index 577 | ) 578 | 579 | return y_pred 580 | 581 | def evaluate(self, Xi, Xv, y): 582 | """ 583 | :param Xi: list of list of feature indices of each sample in the dataset 584 | :param Xv: list of list of feature values of each sample in the dataset 585 | :param y: label of each sample in the dataset 586 | :return: metric of the evaluation 587 | """ 588 | y_pred = self.predict(Xi, Xv) 589 | y_pred = np.clip(y_pred, 1e-6, 1 - 1e-6) 590 | return self.eval_metric(y, y_pred), log_loss(y, y_pred) 591 | 592 | def restore(self, save_path=None): 593 | if save_path == None: 594 | save_path = self.save_path 595 | ckpt = tf.train.get_checkpoint_state(save_path) 596 | if ckpt and ckpt.model_checkpoint_path: 597 | self.saver.restore(self.sess, ckpt.model_checkpoint_path) 598 | if self.verbose > 0: 599 | print("restored from %s" % (save_path)) 600 | -------------------------------------------------------------------------------- /2. glider/models/autoint/train.py: -------------------------------------------------------------------------------- 1 | ## AutoInt's official training code. 
Modifications are only made to accomodate cross feature 2 | 3 | import math 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | from sklearn.metrics import make_scorer 8 | from sklearn.model_selection import StratifiedKFold 9 | from time import time 10 | from model import AutoInt 11 | import argparse 12 | import os 13 | 14 | from os.path import join 15 | 16 | 17 | def str2list(v): 18 | v = v.split(",") 19 | v = [int(_.strip("[]")) for _ in v] 20 | 21 | return v 22 | 23 | 24 | def str2list2(v): 25 | v = v.split(",") 26 | v = [float(_.strip("[]")) for _ in v] 27 | 28 | return v 29 | 30 | 31 | def str2bool(v): 32 | if v.lower() in ["yes", "true", "t", "y", "1"]: 33 | return True 34 | elif v.lower() in ["no", "false", "f", "n", "0"]: 35 | return False 36 | else: 37 | raise argparse.ArgumentTypeError("Unsupported value encountered.") 38 | 39 | 40 | def parse_args(): 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--blocks", type=int, default=3, help="#blocks") 43 | parser.add_argument( 44 | "--block_shape", 45 | type=str2list, 46 | default=[64, 64, 64], 47 | help="output shape of each block", 48 | ) 49 | parser.add_argument("--heads", type=int, default=2, help="#heads") 50 | parser.add_argument("--embedding_size", type=int, default=16) 51 | parser.add_argument("--dropout_keep_prob", type=str2list2, default=[1, 1, 1]) 52 | parser.add_argument("--epoch", type=int, default=3) 53 | parser.add_argument("--batch_size", type=int, default=1024) 54 | parser.add_argument("--learning_rate", type=float, default=0.001) 55 | parser.add_argument("--learning_rate_wide", type=float, default=0.001) 56 | parser.add_argument("--optimizer_type", type=str, default="adam") 57 | parser.add_argument("--l2_reg", type=float, default=0.0) 58 | parser.add_argument("--random_seed", type=int, default=2018) 59 | parser.add_argument("--save_path", type=str, default="./model/") 60 | parser.add_argument("--field_size", type=int, default=0, help="dummy variable") 61 | parser.add_argument("--loss_type", type=str, default="logloss") 62 | parser.add_argument("--verbose", type=int, default=1) 63 | parser.add_argument( 64 | "--run_times", type=int, default=5, help="run multiple times to eliminate error" 65 | ) 66 | parser.add_argument("--is_save", type=str2bool, default=True) 67 | parser.add_argument( 68 | "--greater_is_better", type=str2bool, default=False, help="early stop criterion" 69 | ) 70 | parser.add_argument( 71 | "--has_residual", type=str2bool, default=True, help="add residual or not" 72 | ) 73 | parser.add_argument("--has_wide", type=str2bool, default=False) 74 | parser.add_argument( 75 | "--deep_layers", 76 | type=str2list, 77 | default=[400, 400], 78 | help="config for dnn in joint train", 79 | ) 80 | parser.add_argument("--batch_norm", type=int, default=0) 81 | parser.add_argument("--batch_norm_decay", type=float, default=0.995) 82 | parser.add_argument("--data", type=str, help="data name") 83 | parser.add_argument("--data_path", type=str, default="./", help="root path for all the data") 84 | parser.add_argument("--gpu", type=int, help="which gpu") 85 | parser.add_argument("--exp", type=str, help="experiment", default="cross") 86 | parser.add_argument("--cross_exp", type=str, help="cross exp", default="cross1") 87 | 88 | return parser.parse_args() 89 | 90 | 91 | def include_cross_features(args, Xi, Xv, j): 92 | if args.exp == "cross": 93 | path = join(args.data_path, args.data, "part" + str(j), args.cross_exp) 94 | 95 | Xi_cross = np.load(join(path, "i_cross.npy")) 96 | 
Xv_cross = np.load(join(path, "x_cross.npy")) 97 | Xi = np.concatenate([Xi, Xi_cross], axis=1) 98 | Xv = np.concatenate([Xv, Xv_cross], axis=1) 99 | return Xi, Xv 100 | 101 | 102 | def _run_(args, file_name, run_cnt): 103 | # path_prefix = '../Dataprocess/' + args.data 104 | path_prefix = os.path.join(args.data_path, args.data) 105 | if not os.path.exists(args.save_path): 106 | os.makedirs(args.save_path) 107 | 108 | if args.exp == "cross": 109 | feature_size = np.load(join(path_prefix, args.cross_exp, "feature_size.npy"))[0] 110 | else: 111 | feature_size = np.load(path_prefix + "/feature_size.npy")[0] 112 | 113 | # variables = tf.contrib.framework.get_variables_to_restore() 114 | # print(variables) 115 | # return 116 | 117 | Xi_valid = np.load(path_prefix + "/part2/" + file_name[0]) 118 | Xv_valid = np.load(path_prefix + "/part2/" + file_name[1]) 119 | y_valid = np.load(path_prefix + "/part2/" + file_name[2]) 120 | 121 | Xi_valid, Xv_valid = include_cross_features(args, Xi_valid, Xv_valid, 2) 122 | 123 | args.field_size = Xi_valid.shape[1] 124 | 125 | # test: file1, valid: file2, train: file3-10 126 | model = AutoInt(args=args, feature_size=feature_size, run_cnt=run_cnt) 127 | 128 | is_continue = True 129 | for k in range(model.epoch): 130 | if not is_continue: 131 | print("early stopping at epoch %d" % (k + 1)) 132 | break 133 | file_count = 0 134 | time_epoch = 0 135 | for j in range(3, 11): 136 | if not is_continue: 137 | print("early stopping at epoch %d file %d" % (k + 1, j)) 138 | break 139 | file_count += 1 140 | Xi_train = np.load(path_prefix + "/part" + str(j) + "/" + file_name[0]) 141 | Xv_train = np.load(path_prefix + "/part" + str(j) + "/" + file_name[1]) 142 | y_train = np.load(path_prefix + "/part" + str(j) + "/" + file_name[2]) 143 | 144 | Xi_train, Xv_train = include_cross_features(args, Xi_train, Xv_train, j) 145 | 146 | print("epoch %d, file %d" % (k + 1, j)) 147 | t1 = time() 148 | is_continue = model.fit_once( 149 | Xi_train, 150 | Xv_train, 151 | y_train, 152 | k + 1, 153 | file_count, 154 | Xi_valid, 155 | Xv_valid, 156 | y_valid, 157 | early_stopping=True, 158 | ) 159 | time_epoch += time() - t1 160 | 161 | print("epoch %d, time %d" % (k + 1, time_epoch)) 162 | 163 | print("start testing!...") 164 | Xi_test = np.load(path_prefix + "/part1/" + file_name[0]) 165 | Xv_test = np.load(path_prefix + "/part1/" + file_name[1]) 166 | y_test = np.load(path_prefix + "/part1/" + file_name[2]) 167 | 168 | Xi_test, Xv_test = include_cross_features(args, Xi_test, Xv_test, 1) 169 | 170 | model.restore() 171 | 172 | test_result, test_loss = model.evaluate(Xi_test, Xv_test, y_test) 173 | print("test-result = %.4lf, test-logloss = %.4lf" % (test_result, test_loss)) 174 | return test_result, test_loss 175 | 176 | 177 | if __name__ == "__main__": 178 | args = parse_args() 179 | print(args.__dict__) 180 | print("**************") 181 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 182 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) 183 | 184 | data_path = args.data.split("/") 185 | if any([data_path[-1].startswith(d) for d in ["avazu"]]): 186 | file_name = ["train_i.npy", "train_x.npy", "train_y.npy"] 187 | elif any([data_path[-1].startswith(d) for d in ["criteo"]]): 188 | file_name = ["train_i.npy", "train_x2.npy", "train_y.npy"] 189 | else: 190 | raise ValueError("invalid data arg") 191 | test_auc = [] 192 | test_log = [] 193 | 194 | print("run time : %d" % args.run_times) 195 | for i in range(1, args.run_times + 1): 196 | test_result, test_loss = _run_(args, file_name, i) 197 | 
test_auc.append(test_result) 198 | test_log.append(test_loss) 199 | print("test_auc", test_auc) 200 | print("test_log_loss", test_log) 201 | print("avg_auc", sum(test_auc) / len(test_auc)) 202 | print("avg_log_loss", sum(test_log) / len(test_log)) 203 | -------------------------------------------------------------------------------- /2. glider/train_deepctr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, help="model", default="WDL") 6 | parser.add_argument("--runs", type=int, help="num trials", default=5) 7 | parser.add_argument("--exp", type=str, help="experiment", default="baseline") 8 | parser.add_argument("--ds", type=str, help="dataset", default="criteo") 9 | parser.add_argument("--bs", type=int, help="batchsize", default=1024) 10 | parser.add_argument("--gpu", type=int, default=0) 11 | parser.add_argument("--lr", type=float, help="learning rate", default=0.01) 12 | parser.add_argument("--opt", type=str, help="optimizer", default="adagrad") 13 | parser.add_argument("--epochs", type=int, help="epochs", default=50) 14 | parser.add_argument("--test_id", type=str, help="test_id", default="test1") 15 | parser.add_argument("--emb_dim", type=int, help="size of embedding table", default=16) 16 | parser.add_argument("--patience", type=int, help="patience", default=1) 17 | parser.add_argument("--d_base", type=str, help="base data id", default="baseline") 18 | parser.add_argument("--d_cross", type=str, help="cross data id", default="cross") 19 | parser.add_argument("--d_cross_exp", type=str, help="cross exp", default="cross1") 20 | parser.add_argument("--n_cross", type=int, help="num cross features", default=40) 21 | parser.add_argument( 22 | "--epochs_skip_es", 23 | type=int, 24 | help="num of epochs to skip for checking early stopping", 25 | default=0, 26 | ) 27 | 28 | 29 | args = parser.parse_args() 30 | 31 | model_type = args.model # ["WDL", "DeepFM", "DCN", "xDeepFM"] 32 | num_trials = args.runs 33 | experiment = args.exp # ["baseline", "cross"] 34 | dataset = args.ds # ["criteo", "avazu"] 35 | batch_size = args.bs 36 | learning_rate = args.lr 37 | opt = args.opt 38 | gpu_device = args.gpu 39 | epochs = args.epochs 40 | test_id = args.test_id 41 | emb_dim = args.emb_dim 42 | patience = args.patience 43 | base_data_id = args.d_base 44 | cross_data_id = args.d_cross 45 | cross_experiment = args.d_cross_exp 46 | n_cross = args.n_cross 47 | epochs_skip_es = args.epochs_skip_es 48 | 49 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 50 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_device) 51 | 52 | 53 | import pandas as pd 54 | from sklearn.metrics import log_loss, roc_auc_score 55 | from sklearn.model_selection import train_test_split 56 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 57 | 58 | from deepctr.models import xDeepFM, DeepFM, WDL, DCN 59 | from deepctr.inputs import SparseFeat, DenseFeat, get_fixlen_feature_names 60 | from tensorflow.python.keras.models import save_model, load_model 61 | 62 | from deepctr.layers import custom_objects 63 | import pickle 64 | 65 | import keras 66 | 67 | import numpy as np 68 | from tqdm import tqdm 69 | import math 70 | 71 | from os.path import join 72 | import tensorflow as tf 73 | from tensorflow.python.keras.optimizers import Adam, Adagrad 74 | 75 | from tensorflow.keras.callbacks import EarlyStopping 76 | from keras.backend.tensorflow_backend import set_session 77 | 78 | 79 
| config = tf.ConfigProto() 80 | config.gpu_options.allow_growth = True 81 | config.log_device_placement = ( 82 | True 83 | ) 84 | sess = tf.Session(config=config) 85 | set_session(sess) 86 | 87 | assert model_type in ["WDL", "DeepFM", "DCN", "xDeepFM"] 88 | assert experiment in ["baseline", "cross"] 89 | assert dataset in ["criteo", "avazu"] 90 | 91 | if dataset == "criteo": 92 | src_datapath = "data/deepctr/criteo/" 93 | n_sparse = 26 94 | n_dense = 13 95 | 96 | elif dataset == "avazu": 97 | src_datapath = "data/deepctr/avazu" 98 | n_sparse = 23 99 | n_dense = 0 100 | 101 | 102 | sparse_features = ["C" + str(i) for i in range(1, n_sparse + 1)] 103 | dense_features = ["I" + str(i) for i in range(1, n_dense + 1)] 104 | target = ["label"] 105 | 106 | if experiment == "cross": 107 | cross_features = ["G" + str(i) for i in range(1, n_cross + 1)] 108 | else: 109 | cross_features = [] 110 | 111 | 112 | def get_labels(input_path, batch_size, target="label"): 113 | 114 | labels = {} 115 | for mode in ["valid", "test"]: 116 | label_batches = [] 117 | for data_batch in tqdm(pd.read_hdf(input_path, key=mode, chunksize=batch_size)): 118 | label_batches.append(data_batch[target].values) 119 | labels[mode] = np.concatenate(label_batches) 120 | return labels 121 | 122 | 123 | labels = get_labels(src_datapath + "/" + base_data_id + ".h5", int(1e5)) 124 | 125 | 126 | with open( 127 | join(src_datapath, "n_unique_dict_" + base_data_id + ".pickle"), "rb" 128 | ) as handle: 129 | unique_dict_baseline = pickle.load(handle) 130 | 131 | unique_dict = dict() 132 | for key in sparse_features: 133 | unique_dict[key] = unique_dict_baseline[key] 134 | 135 | if experiment == "cross": 136 | with open( 137 | join( 138 | src_datapath, cross_experiment, "n_unique_dict_" + cross_data_id + ".pickle" 139 | ), 140 | "rb", 141 | ) as handle: 142 | unique_dict_cross = pickle.load(handle) 143 | for key in cross_features: 144 | unique_dict[key] = unique_dict_cross[key] 145 | 146 | 147 | fixlen_feature_columns = [ 148 | SparseFeat(feat, unique_dict[feat]) for feat in sparse_features + cross_features 149 | ] + [DenseFeat(feat, 1) for feat in dense_features] 150 | 151 | dnn_feature_columns = fixlen_feature_columns 152 | linear_feature_columns = fixlen_feature_columns 153 | 154 | fixlen_feature_names = get_fixlen_feature_names( 155 | linear_feature_columns + dnn_feature_columns 156 | ) 157 | 158 | 159 | def get_data_generator( 160 | base_path, 161 | cross_path, 162 | model_inputs, 163 | batch_size, 164 | mode="train", 165 | target="label", 166 | keras=False, 167 | ): 168 | while True: 169 | i = 0 170 | while True: 171 | data_batch_baseline = pd.read_hdf( 172 | base_path, key=mode, start=i * batch_size, stop=(i + 1) * batch_size 173 | ) 174 | if cross_path: 175 | data_batch_cross = pd.read_hdf( 176 | cross_path, 177 | key=mode, 178 | start=i * batch_size, 179 | stop=(i + 1) * batch_size, 180 | ) 181 | i += 1 182 | if data_batch_baseline.shape[0] == 0: 183 | break 184 | data_batch = ( 185 | pd.concat([data_batch_baseline, data_batch_cross], axis=1) 186 | if cross_path 187 | else data_batch_baseline 188 | ) 189 | X = [data_batch[name] for name in model_inputs] 190 | Y = data_batch[target].values 191 | yield (X, Y) 192 | if not keras: 193 | break 194 | 195 | 196 | base_path = join(src_datapath, base_data_id + ".h5") 197 | cross_path = ( 198 | join(src_datapath, cross_experiment, cross_data_id + "_" + str(n_cross) + ".h5") 199 | if experiment == "cross" 200 | else "" 201 | ) 202 | cross_experiment = cross_experiment if experiment == 
"cross" else "baseline" 203 | 204 | 205 | def shuffle_batch(X, y=None, seed=None): 206 | if seed is not None: 207 | np.random.seed(seed) 208 | 209 | indices = np.random.permutation(len(X[0])) 210 | 211 | X_shuff = [] 212 | for i in range(len(X)): 213 | X_shuff.append(X[i].iloc[indices]) 214 | 215 | if y is not None: 216 | y_shuff = y[indices] 217 | return X_shuff, y_shuff 218 | else: 219 | return X_shuff 220 | 221 | 222 | exp_folder = join("experiments", "deepctr", test_id, "checkpoints") 223 | 224 | if not os.path.exists(exp_folder): 225 | os.makedirs(exp_folder) 226 | 227 | 228 | pkl_path = join( 229 | "experiments", 230 | "deepctr", 231 | test_id, 232 | dataset + "_" + model_type + "_" + cross_experiment + ".pkl", 233 | ) 234 | 235 | if os.path.exists(pkl_path): 236 | with open(pkl_path, "rb") as handle: 237 | results_dict = pickle.load(handle) 238 | histories = results_dict["histories"] 239 | histories_val = results_dict["val_loss"] 240 | test_performances = results_dict["test_performances"] 241 | checkpoints = results_dict["checkpoints"] 242 | else: 243 | histories = [] 244 | histories_val = [] 245 | test_performances = [] 246 | checkpoints = [] 247 | 248 | 249 | for i in range(num_trials): 250 | if i < len(histories): 251 | continue 252 | 253 | print("Starting trial", i + 1) 254 | 255 | model_checkpoint_file = join( 256 | "experiments", 257 | "deepctr", 258 | test_id, 259 | "checkpoints", 260 | dataset + "_" + model_type + "_" + cross_experiment + "_trial" + str(i) + ".h5", 261 | ) 262 | 263 | test_generator = get_data_generator( 264 | base_path, cross_path, fixlen_feature_names, batch_size, mode="test", keras=True 265 | ) 266 | 267 | if model_type == "DeepFM": 268 | model = DeepFM( 269 | linear_feature_columns, 270 | dnn_feature_columns, 271 | task="binary", 272 | embedding_size=emb_dim, 273 | use_fm=True, 274 | dnn_hidden_units=[400, 400, 400], 275 | ) 276 | 277 | if model_type == "xDeepFM": 278 | model = xDeepFM( 279 | linear_feature_columns, 280 | dnn_feature_columns, 281 | task="binary", 282 | embedding_size=emb_dim, 283 | dnn_hidden_units=[400, 400], 284 | cin_layer_size=[200, 200, 200], 285 | ) 286 | 287 | if model_type == "WDL": 288 | model = WDL( 289 | linear_feature_columns, 290 | dnn_feature_columns, 291 | task="binary", 292 | embedding_size=emb_dim, 293 | dnn_hidden_units=[1024, 512, 256], 294 | ) 295 | 296 | if model_type == "DCN": 297 | model = DCN( 298 | dnn_feature_columns, 299 | task="binary", 300 | embedding_size=emb_dim, 301 | dnn_hidden_units=[1024, 1024], 302 | cross_num=6, 303 | ) 304 | 305 | if opt == "adagrad": 306 | optimizer = Adagrad 307 | elif opt == "adam": 308 | optimizer = Adam 309 | else: 310 | raise ValueError("Invalid optimizer") 311 | 312 | model.compile( 313 | optimizer(learning_rate), "binary_crossentropy", metrics=["binary_crossentropy"] 314 | ) 315 | 316 | callbacks = [] 317 | 318 | patience_counter = 0 319 | best_valid_loss = float("Inf") 320 | 321 | history_epoch = {} 322 | history_val = {} 323 | for epoch in range(epochs): 324 | breakout = False 325 | history_epoch[epoch] = {} 326 | history_val[epoch] = [] 327 | train_generator = get_data_generator( 328 | base_path, 329 | cross_path, 330 | fixlen_feature_names, 331 | len(labels["valid"]), 332 | mode="train", 333 | ) 334 | for file_count, data_batch in enumerate(train_generator): 335 | print("epoch", epoch, "filecount", file_count) 336 | train_model_input, train_model_labels = data_batch 337 | 338 | X_shuffled, Y_shuffled = shuffle_batch( 339 | train_model_input, train_model_labels 340 | ) # 
using AutoInt's convention 341 | 342 | history = model.fit( 343 | X_shuffled, 344 | Y_shuffled, 345 | batch_size=batch_size, 346 | epochs=1, 347 | verbose=1, 348 | callbacks=callbacks, 349 | ) 350 | 351 | history_epoch[epoch][file_count] = [history.history, history.params] 352 | 353 | if epoch < epochs_skip_es: 354 | continue 355 | 356 | valid_generator = get_data_generator( 357 | base_path, 358 | cross_path, 359 | fixlen_feature_names, 360 | batch_size, 361 | mode="valid", 362 | keras=True, 363 | ) 364 | valid_pred = model.predict_generator( 365 | valid_generator, steps=math.ceil(len(labels["valid"]) / batch_size) 366 | ) 367 | valid_loss = log_loss(labels["valid"], valid_pred, eps=1e-7) 368 | history_val[epoch].append(valid_loss) 369 | 370 | if valid_loss < best_valid_loss: 371 | save_model(model, model_checkpoint_file) 372 | 373 | print( 374 | "[%d-%d] model saved!. Valid loss improved from %.4f to %.4f" 375 | % (epoch, file_count, best_valid_loss, valid_loss) 376 | ) 377 | best_valid_loss = valid_loss 378 | patience_counter = 0 379 | else: 380 | if patience_counter >= patience: 381 | breakout = True 382 | print("Early Stopping!") 383 | break 384 | patience_counter += 1 385 | 386 | if breakout: 387 | break 388 | 389 | best_model = tf.keras.models.load_model(model_checkpoint_file, custom_objects) 390 | 391 | pred_ans = best_model.predict_generator( 392 | test_generator, steps=math.ceil(len(labels["test"]) / batch_size) 393 | ) 394 | 395 | test_logloss = round(log_loss(labels["test"], pred_ans, eps=1e-7), 7) 396 | test_auc = round(roc_auc_score(labels["test"], pred_ans), 7) 397 | print("test LogLoss", test_logloss) 398 | print("test AUC", test_auc) 399 | 400 | histories.append(history_epoch) 401 | test_performances.append({"logloss": test_logloss, "auc": test_auc}) 402 | histories_val.append( 403 | { 404 | "history": history_val, 405 | "best_valid_loss": best_valid_loss, 406 | "patience": patience, 407 | } 408 | ) 409 | checkpoints.append(model_checkpoint_file) 410 | results_dict = { 411 | "histories": histories, 412 | "test_performances": test_performances, 413 | "val_loss": histories_val, 414 | "params": best_model.count_params(), 415 | "checkpoints": checkpoints, 416 | } 417 | 418 | with open(pkl_path, "wb") as handle: 419 | pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) 420 | 421 | print("\n\n") 422 | 423 | aucs = [x["auc"] for x in test_performances] 424 | loglosses = [x["logloss"] for x in test_performances] 425 | 426 | auc_mean = np.mean(aucs) 427 | auc_std = np.std(aucs) 428 | logloss_mean = np.mean(loglosses) 429 | logloss_std = np.std(loglosses) 430 | 431 | print(auc_mean, auc_std, logloss_mean, logloss_std) 432 | -------------------------------------------------------------------------------- /2. 
glider/utils/cross_feature_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from tqdm import tqdm 3 | import numpy as np 4 | from sklearn.preprocessing import KBinsDiscretizer 5 | import multiprocessing as mp 6 | from itertools import repeat 7 | import warnings 8 | 9 | warnings.simplefilter("ignore") 10 | 11 | 12 | def str2bool(v): 13 | if isinstance(v, bool): 14 | return v 15 | if v.lower() in ("yes", "true", "t", "y", "1"): 16 | return True 17 | elif v.lower() in ("no", "false", "f", "n", "0"): 18 | return False 19 | else: 20 | raise argparse.ArgumentTypeError("Boolean value expected.") 21 | 22 | 23 | def load_global_interactions(interactions_file, field_size, max_rank, prune_subsets, top_k): 24 | 25 | with open(interactions_file, "rb") as handle: 26 | interaction_results = pickle.load(handle, encoding="latin1") 27 | 28 | global_interactions = {} 29 | mlp_losses = [] 30 | inters = [] 31 | for result in interaction_results: 32 | if result is None: 33 | continue 34 | for inter in result["inters"][:top_k]: 35 | if len(inter[0]) == field_size: 36 | continue 37 | if inter[0] not in global_interactions: 38 | global_interactions[inter[0]] = 1 39 | else: 40 | global_interactions[inter[0]] += 1 41 | 42 | global_interactions = sorted( 43 | global_interactions.items(), key=lambda x: x[1], reverse=True 44 | ) 45 | 46 | if prune_subsets: 47 | pruned_global_interactions = [] 48 | index = 0 49 | while len(pruned_global_interactions) < max_rank: 50 | inter = global_interactions[index] 51 | if any( 52 | set(inter[0]) < set(new_inter[0]) 53 | for new_inter in pruned_global_interactions 54 | ): 55 | pass 56 | else: 57 | pruned_global_interactions.append(inter) 58 | pruned_global_interactions = [ 59 | t 60 | for t in pruned_global_interactions 61 | if not (set(t[0]) < set(inter[0])) 62 | ] 63 | index += 1 64 | else: 65 | pruned_global_interactions = global_interactions[:max_rank] 66 | 67 | top_K_inters, _ = zip(*pruned_global_interactions) 68 | return top_K_inters 69 | 70 | 71 | def load_data_autoint(dataset, data_path): 72 | 73 | path_prefix = data_path + "/" 74 | 75 | if dataset == "criteo": 76 | file_name = ["train_i.npy", "train_x2.npy", "train_y.npy"] 77 | elif dataset == "avazu": 78 | file_name = ["train_i.npy", "train_x.npy", "train_y.npy"] 79 | else: 80 | raise ValueError("Invalid dataset") 81 | 82 | data = [] 83 | for j in tqdm(range(1, 11)): 84 | folder_path = path_prefix + "/part" + str(j) + "/" 85 | Xi = np.load(folder_path + file_name[0]) 86 | Xv = np.load(folder_path + file_name[1]) 87 | y = np.load(folder_path + file_name[2]) 88 | data.append({"Xi": Xi, "Xv": Xv, "y": y}) 89 | 90 | return data 91 | 92 | 93 | def merge_data(data): 94 | Xi = [] 95 | Xv = [] 96 | y = [] 97 | lens = [] 98 | for d in data: 99 | Xi.append(d["Xi"]) 100 | Xv.append(d["Xv"]) 101 | y.append(d["y"]) 102 | lens.append(len(d["y"])) 103 | Xi = np.concatenate(Xi) 104 | Xv = np.concatenate(Xv) 105 | y = np.concatenate(y) 106 | return Xi, Xv, y, lens 107 | 108 | 109 | def get_training_batch(data, size=1000000): 110 | Xi_batch = data[2]["Xi"][:size] 111 | Xv_batch = data[2]["Xv"][:size] 112 | y_batch = data[2]["y"][:size] 113 | return Xi_batch, Xv_batch, y_batch 114 | 115 | 116 | def get_dense_sparse_feat_indices(Xi_batch, dataset): 117 | 118 | dense_feat_indices = [] 119 | sparse_feat_indices = [] 120 | for i in tqdm(range(Xi_batch.shape[1])): 121 | uniq = np.unique(Xi_batch[:, i]) 122 | if len(uniq) == 1 and "avazu" not in dataset: 123 | 
dense_feat_indices.append(i) 124 | else: 125 | sparse_feat_indices.append(i) 126 | 127 | return dense_feat_indices, sparse_feat_indices 128 | 129 | 130 | def discretize_dense(Xv_feat, Xv_feat_batch, num_bins): 131 | est = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="quantile") 132 | est.fit(Xv_feat_batch) 133 | disc = est.transform(Xv_feat) 134 | cardinality = len(est.bin_edges_[0]) - 1 135 | return disc, est, cardinality 136 | 137 | 138 | def _par_discretize(f_idx, Xv_feat, Xv_feat_batch, num_bins): 139 | # print("start", f_idx) 140 | disc, est, cardinality = discretize_dense(Xv_feat, Xv_feat_batch, num_bins) 141 | return f_idx, disc, est, cardinality 142 | 143 | 144 | def discretize_dense_features( 145 | Xi, 146 | Xv, 147 | Xv_batch, 148 | dense_feat_indices, 149 | sparse_feat_indices, 150 | num_feats, 151 | num_bins, 152 | num_processes=20, 153 | ): 154 | 155 | discretizers = {} 156 | new_Xv_dense = [] 157 | cardinalities = {} 158 | 159 | Xv_feats = [] 160 | Xv_feats_batch = [] 161 | for i in dense_feat_indices: 162 | Xv_feats.append(Xv[:, i].reshape(-1, 1)) 163 | Xv_feats_batch.append(Xv_batch[:, i].reshape(-1, 1)) 164 | 165 | pool = mp.Pool(processes=num_processes) 166 | disc_collect = pool.starmap( 167 | _par_discretize, 168 | zip(dense_feat_indices, Xv_feats, Xv_feats_batch, repeat(num_bins)), 169 | ) 170 | cardinalities = {} 171 | discretizers = {} 172 | disc_summary = [x[1:] for x in sorted(disc_collect, key=lambda x: x[0])] 173 | new_Xv_dense = [] 174 | for i, disc in enumerate(disc_summary): 175 | new_Xv_dense.append(disc[0]) 176 | discretizers[i] = disc[1] 177 | cardinalities[i] = disc[2] 178 | 179 | if dense_feat_indices: 180 | new_Xv_dense = np.concatenate(new_Xv_dense, 1) 181 | den = True 182 | else: 183 | den = False 184 | 185 | pool.close() 186 | 187 | if den: 188 | sparsified_data = np.zeros((Xi.shape[0], num_feats)) 189 | sparsified_data[:, dense_feat_indices] = new_Xv_dense 190 | sparsified_data[:, sparse_feat_indices] = Xi[:, sparse_feat_indices] 191 | else: 192 | sparsified_data = Xi 193 | 194 | return sparsified_data 195 | 196 | 197 | def zero_index_sp_feats(combo_map, feat_combo): 198 | new_i = [] 199 | new_v = [] 200 | for c in feat_combo: 201 | if tuple(c) not in combo_map: 202 | new_i.append(0) 203 | new_v.append(0) 204 | else: 205 | new_i.append(combo_map[tuple(c)]) 206 | new_v.append(1) 207 | 208 | return new_i, new_v 209 | 210 | 211 | def _par_zero_sp(combo_idx, combo_map, feat_combo): 212 | # print(combo_idx, feat_combo.shape) 213 | new_i, new_v = zero_index_sp_feats(combo_map, feat_combo) 214 | return combo_idx, new_i, new_v 215 | 216 | 217 | def cross_sparse_features( 218 | top_K_inters, 219 | sparsified_data, 220 | sparsified_batch, 221 | Xi_batch, 222 | threshold, 223 | num_processes=20, 224 | ): 225 | 226 | # collect combo frequency 227 | inter_feats = [] 228 | inter_combo_maps = {} 229 | 230 | for inter in tqdm(top_K_inters): 231 | 232 | inter_list = list(inter) 233 | inter_counts = {} 234 | for d, data_inst in enumerate(sparsified_batch): 235 | combo = tuple(data_inst[inter_list]) 236 | if combo not in inter_counts: 237 | inter_counts[combo] = 1 238 | else: 239 | inter_counts[combo] += 1 240 | combo_map = {} 241 | for combo in inter_counts: 242 | if inter_counts[combo] <= Xi_batch.shape[0] * threshold: 243 | pass 244 | else: 245 | orig_len = len(combo_map) 246 | combo_map[combo] = orig_len + 1 # shift by 1 (0 value means missing) 247 | inter_combo_maps[inter] = combo_map 248 | 249 | # f = open("b" + str(nbins),"w") 250 | # for cm in 
inter_combo_maps: 251 | # f.write(str(cm) + "\t" + str(len(inter_combo_maps[cm])) + "\n") 252 | # print(len(inter_combo_maps[cm])) 253 | 254 | inters = [] 255 | combo_maps = [] 256 | feat_combos = [] 257 | for inter in tqdm(inter_combo_maps): 258 | inters.append(inter) 259 | combo_maps.append(inter_combo_maps[inter]) 260 | feat_combos.append(sparsified_data[:, list(inter)]) 261 | 262 | pool = mp.Pool(processes=num_processes) 263 | cross_feats = pool.starmap( 264 | _par_zero_sp, zip(list(range(len(combo_maps))), combo_maps, feat_combos) 265 | ) 266 | del feat_combos 267 | pool.close() 268 | cross_feats = sorted(cross_feats, key=lambda x: x[0]) 269 | 270 | ## the serial way of obtaining cross_feats (in case the parallel code doesnt work..) 271 | # cross_feats = [] 272 | # i = 0 273 | # for inter in tqdm(inter_combo_maps): 274 | # cross_feats.append(_par_zero_sp(i, inter_combo_maps[inter], sparsified_data[:,list(inter)])) 275 | # i += 1 276 | 277 | return cross_feats 278 | 279 | 280 | def get_X_cross(inters, cross_feats, Xi, Xv, sparse_feat_indices): 281 | # get cross feats in autoint data format 282 | 283 | # if a feature value is 0 , e.g. missing data, then any interaction with this feature will also be deemed missing with value 0 284 | n = 0 285 | Xv_cross = [] 286 | for inter in tqdm(inters): 287 | mask = np.ones(Xv.shape[0]) 288 | for i, idx in enumerate(inter): 289 | if idx in sparse_feat_indices: 290 | mask = mask * Xv[:, idx] 291 | 292 | Xv_cross.append(cross_feats[n][2] * mask) 293 | n += 1 294 | 295 | cross_start = Xi.max() + 1 296 | 297 | # shift all the cross feature values so they can be packed later into a single embedding matrix (autoint format) 298 | for i in tqdm(range(len(cross_feats))): 299 | cur_cross_feat = np.array(cross_feats[i][1]) 300 | max_val = cur_cross_feat.max() 301 | cross_feats[i] = ( 302 | cur_cross_feat + cross_start 303 | ) # in-place modification to save memory 304 | cross_start = max_val + cross_start + 1 305 | 306 | Xi_cross = cross_feats 307 | 308 | Xi_cross = np.stack(Xi_cross, 1) 309 | Xv_cross = np.stack(Xv_cross, 1) 310 | 311 | return Xi, Xv, Xi_cross, Xv_cross 312 | -------------------------------------------------------------------------------- /2. 
glider/utils/global_interaction_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("models/autoint") 4 | from model import AutoInt 5 | import numpy as np 6 | import os 7 | 8 | 9 | class get_args: 10 | # the original parameter configuration of AutoInt 11 | blocks = 3 12 | block_shape = [64, 64, 64] 13 | heads = 2 14 | embedding_size = 16 15 | dropout_keep_prob = [1, 1, 1] 16 | epoch = 3 17 | batch_size = 1024 18 | learning_rate = 0.001 19 | learning_rate_wide = 0.001 20 | optimizer_type = "adam" 21 | l2_reg = 0.0 22 | random_seed = 2018 # used in the official autoint code 23 | loss_type = "logloss" 24 | verbose = 1 25 | run_times = 1 26 | is_save = False 27 | greater_is_better = False 28 | has_residual = True 29 | has_wide = False 30 | deep_layers = [400, 400] 31 | batch_norm = 0 32 | batch_norm_decay = 0.995 33 | 34 | def __init__(self, save_path, field_size, dataset, data_path): 35 | self.save_path = save_path 36 | self.field_size = field_size 37 | self.data = dataset 38 | self.data_path = data_path 39 | 40 | 41 | def parse_args(dataset, data_path, save_path): 42 | dataset = dataset.lower() 43 | if "avazu" in dataset: 44 | field_size = 23 45 | elif "criteo" in dataset: 46 | field_size = 39 47 | else: 48 | raise ValueError("Invalid dataset") 49 | 50 | return get_args(save_path, field_size, dataset, data_path) 51 | 52 | 53 | def get_data_info(args): 54 | data = args.data.split("/")[-1].lower() 55 | if any([data.startswith(d) for d in ["avazu"]]): 56 | file_name = ["train_i.npy", "train_x.npy", "train_y.npy"] 57 | elif any([data.startswith(d) for d in ["criteo"]]): 58 | file_name = ["train_i.npy", "train_x2.npy", "train_y.npy"] 59 | else: 60 | raise ValueError("invalid data arg") 61 | 62 | path_prefix = os.path.join(args.data_path, args.data) 63 | return file_name, path_prefix 64 | 65 | 66 | def get_autoint_and_data( 67 | dataset="Criteo", 68 | data_path="/workspace/AutoInt", 69 | save_path="/test_code/Criteo/b3h2_dnn_dropkeep1_400x2/1/", 70 | ): 71 | args = parse_args(dataset, data_path, save_path) 72 | 73 | file_name = [] 74 | 75 | file_name, path_prefix = get_data_info(args) 76 | feature_size = np.load(path_prefix + "/feature_size.npy")[0] 77 | 78 | run_cnt = 0 79 | model = AutoInt(args=args, feature_size=feature_size, run_cnt=run_cnt) 80 | 81 | Xi_valid = np.load(path_prefix + "/part2/" + file_name[0]) 82 | Xv_valid = np.load(path_prefix + "/part2/" + file_name[1]) 83 | y_valid = np.load(path_prefix + "/part2/" + file_name[2]) 84 | 85 | feature_indices = list(range(Xi_valid.shape[1])) 86 | means_dict = {} 87 | for i in feature_indices: 88 | means_dict[i] = np.mean(Xv_valid[:, i]) 89 | 90 | model.restore(args.save_path) 91 | return model, {"Xi": Xi_valid, "Xv": Xv_valid, "y": y_valid, "means": means_dict} 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feature Interaction Interpretability via Interaction Detection 2 | 3 | This is the official code repository for the paper, "Feature Interaction Interpretability: A Case for Explaining Ad-Recommendation Systems via Neural Interaction Detection". 4 | 5 | 6 | 7 | Example Explanations 8 | 9 | * Global Interpretations 10 |

11 | 12 | 13 | 14 | * Local Interpretations (of ResNet classifications) 15 | 16 | 17 | 18 | 19 | 
20 | 21 | Michael Tsang, Dehua Cheng, Hanpeng Liu, Xue Feng, Eric Zhou, Yan Liu, [Feature Interaction Interpretability: A Case for Explaining Ad-Recommendation Systems via Neural Interaction Detection](https://openreview.net/forum?id=BkgnhTEtDS), ICLR 2020. 22 | 23 | Neural Interaction Detection:\ 24 | Michael Tsang, Dehua Cheng, Yan Liu, [Detecting Statistical Interactions from Neural Network Weights](https://openreview.net/forum?id=ByOfBggRZ), ICLR 2018. 25 | 26 | 27 | ## Setup 28 | 29 | 30 | In a Linux environment with Python 3.6: 31 | 32 | ```bash 33 | pip install -r requirements.txt 34 | ``` 35 | 36 | CUDA 10 support is required to use GLIDER. 37 | 38 | ## Usage 39 | ### 1. MADEX 40 | 41 | **MADEX (Model-Agnostic Dependency EXplainer)** is a method for interpreting feature interactions from a black-box prediction model per data instance. It contains two versions of Neural Interaction Detection (NID): the original NID and GradientNID. NID is a fast and accurate method to detect arbitrary-order interactions in polynomial time, whereas GradientNID exactly detects interactions from an explainer MLP. The following domains are showcased: DNA, graph, image, and text modeling. 42 | 43 | 
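To make the NID idea above concrete, here is a minimal, hypothetical sketch of how pairwise interaction strengths can be read off a trained MLP's weight matrices, following the ICLR 2018 NID paper cited above. This is an illustration only, not this repository's implementation; the function name, the NumPy-only setup, and the toy weights are assumptions.

```python
import numpy as np

def pairwise_nid_strengths(weights):
    """Score every pairwise interaction of an MLP from its weight matrices.

    weights: [W1, W2, ..., W_out], each of shape (fan_out, fan_in),
             for an MLP with a scalar output.
    Returns {(i, j): strength} over input feature indices i < j.
    """
    W1 = np.abs(weights[0])            # (hidden_1, num_inputs)
    # aggregate how strongly each first-layer hidden unit influences the output
    agg = np.abs(weights[-1])          # start from the output layer
    for W in reversed(weights[1:-1]):
        agg = agg @ np.abs(W)
    z = agg.ravel()                    # one influence score per first-layer unit

    num_inputs = W1.shape[1]
    strengths = {}
    for i in range(num_inputs):
        for j in range(i + 1, num_inputs):
            # a hidden unit can only model an interaction between i and j if it
            # has sizable weights to *both* inputs, hence the elementwise minimum
            strengths[(i, j)] = float(z @ np.minimum(W1[:, i], W1[:, j]))
    return strengths

# toy usage with random weights for a 4-input MLP with two hidden layers
rng = np.random.RandomState(0)
weights = [rng.randn(8, 4), rng.randn(8, 8), rng.randn(1, 8)]
top = sorted(pairwise_nid_strengths(weights).items(), key=lambda kv: -kv[1])
print(top[:3])   # highest-scoring candidate pairs
```

Higher-order candidates are scored analogously, by taking the elementwise minimum over all features in the candidate set before aggregating.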
Show instructions 44 | 45 | 46 | ```bash 47 | cd 1.\ madex/ 48 | ``` 49 | 50 | The following notebooks are available to demo MADEX: 51 | * `madex_example_dna.ipynb` 52 | * `madex_example_graph.ipynb` 53 | * `madex_example_image.ipynb` 54 | * `madex_example_text.ipynb` 55 | 56 |
57 | 58 | ### 2. GLIDER 59 | 60 | **GLIDER (GLobal Interaction Detection and Encoding for Recommendation)** takes MADEX beyond model interpretation on recommendation tasks (or tabular data modeling more broadly). GLIDER detects feature interactions that recur across data instances of a source recommender model, then explicitly encodes those interactions in a target recommender model. This process is a form of automatic feature engineering. 61 | 62 | 
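As a rough illustration of what "explicitly encodes those interactions" means: each detected interaction becomes a single new categorical cross feature whose IDs enumerate the observed combinations of the interacting features' values. The sketch below is a hypothetical simplification (the function name and toy data are made up); the repository's `make_cross_feature_data.py` step additionally handles sparse/dense fields, missing values, and the AutoInt/DeepCTR data formats.

```python
import numpy as np

def encode_cross_feature(X, interaction):
    """Turn a detected interaction (a tuple of column indices) into one
    categorical cross feature: each distinct value combination gets its own ID.

    X: integer-encoded categorical data, shape (n_samples, n_features).
    interaction: e.g. (0, 2) for an interaction between features 0 and 2.
    Returns an (n_samples,) array of cross-feature IDs.
    """
    combos = [tuple(row) for row in X[:, list(interaction)]]
    vocab = {}
    ids = np.empty(len(combos), dtype=np.int64)
    for n, combo in enumerate(combos):
        ids[n] = vocab.setdefault(combo, len(vocab))  # new combo -> new ID
    return ids

# toy usage: a synthetic interaction between columns 0 and 2
X_toy = np.array([[0, 1, 3], [0, 2, 3], [1, 1, 4], [0, 1, 3]])
print(encode_cross_feature(X_toy, (0, 2)))   # [0 0 1 0]
```

The target model can then embed the new ID column like any other sparse feature, which is what the cross feature generation step in the instructions below produces at scale.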
Show instructions 63 | 64 | 65 | ```bash 66 | cd 2.\ glider/ 67 | ``` 68 | 69 | #### A. Data Preparation 70 | 71 | Please follow the instructions in the [AutoInt repo](https://github.com/shichence/AutoInt) to prepare the data splits. 72 | 73 | The same code is also provided in this repo and follows the same series of commands. The Criteo dataset is found [here](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/). Place it in the path `data/autoint/criteo`. 74 | 75 | ```bash 76 | mkdir data/autoint/criteo 77 | python data/initial_data_prep/criteo/preprocess.py 78 | python data/initial_data_prep/kfold_split/stratifiedKfold.py 79 | python data/initial_data_prep/criteo/scale.py 80 | ``` 81 | 82 | #### B. Global Interaction Detection 83 | 84 | First, train a baseline AutoInt model. 85 | 86 | ```bash 87 | python models/autoint/train.py --exp baseline --data data/autoint/criteo --save_path experiments/autoint/criteo/baseline/ --run_times 1 --gpu 0 88 | ``` 89 | 90 | Then, run global interaction detection on this model. 91 | 92 | ```bash 93 | python detect_global_interactions.py --save_path experiments/autoint/criteo/baseline/1/ --data criteo --save_id SAVEID --par_batch_size par_batch_size 94 | ``` 95 | 96 | * `par_batch_size` is the number of data instances to process in parallel. Set it based on the number of CPU processes and the GPU memory available. 97 | * `SAVEID` shows up again later. Use a descriptive identifier. 98 | 99 | 100 | #### C. Cross Feature Generation 101 | 102 | To generate cross features (see the note at the end of these instructions on how missing feature values are handled): 103 | 104 | ```bash 105 | python make_cross_feature_data.py --data_file experiments/detected_interactions_criteo_SAVEID.pickle --exp cross_K40 --K 40 --data criteo --autoint_save_path data/autoint/criteo --deepctr_save_path data/deepctr/criteo --save_base_data true 106 | ``` 107 | 108 | #### D. Train DeepCTR Models 109 | 110 | * Wide&Deep: `WDL` 111 | * DeepFM: `DeepFM` 112 | * Deep&Cross: `DCN` 113 | * xDeepFM: `xDeepFM` 114 | 115 | Baseline: 116 | ```bash 117 | python train_deepctr.py --model WDL --ds criteo --exp baseline --patience 5 --test_id baseline_experiment --gpu 0 118 | ``` 119 | 120 | Baseline + GLIDER (distillation): 121 | ```bash 122 | python train_deepctr.py --model WDL --ds criteo --exp cross --patience 5 --test_id cross_experiment --gpu 0 --d_cross_exp cross_K40 --n_cross 40 123 | ``` 124 | 125 | 126 | #### E. Train AutoInt Models 127 | 128 | 129 | Baseline + GLIDER (enhancement): 130 | ```bash 131 | python models/autoint/train.py --exp cross --data data/autoint/criteo --save_path experiments/autoint/criteo/cross/ --gpu 0 --cross_exp cross_K40 132 | ``` 133 | 134 | 
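A note on missing feature values, visible in the `get_X_cross` routine dumped earlier in this repo: if any interacting sparse feature of a sample has value 0 (i.e., it is missing), the generated cross feature is treated as missing too, because the interacting features' values are multiplied together into a mask that scales the cross feature's value. A toy, hypothetical illustration of that rule (the array names here are made up):

```python
import numpy as np

# toy setup: 3 samples, one detected interaction between two sparse features;
# sample 1 is missing the second feature (its value is 0)
Xv_interacting = np.array([[1.0, 1.0],
                           [1.0, 0.0],
                           [1.0, 1.0]])
cross_value = np.array([1.0, 1.0, 1.0])   # nominal value of the new cross feature

mask = np.ones(Xv_interacting.shape[0])
for col in range(Xv_interacting.shape[1]):
    mask = mask * Xv_interacting[:, col]  # any missing (0) feature zeroes the mask

print(cross_value * mask)   # [1. 0. 1.] -> the cross feature is missing for sample 1
```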
135 | 136 | 137 | ## References 138 | 139 | ``` 140 | @inproceedings{tsang2020feature, 141 | title={Feature Interaction Interpretability: A Case for Explaining Ad-Recommendation Systems via Neural Interaction Detection}, 142 | author={Michael Tsang and Dehua Cheng and Hanpeng Liu and Xue Feng and Eric Zhou and Yan Liu}, 143 | booktitle={International Conference on Learning Representations}, 144 | year={2020}, 145 | url={https://openreview.net/forum?id=BkgnhTEtDS} 146 | } 147 | ``` 148 | 149 | Neural Interaction Detection: 150 | ``` 151 | @article{tsang2017detecting, 152 | title={Detecting Statistical Interactions from Neural Network Weights}, 153 | author={Michael Tsang and Dehua Cheng and Yan Liu}, 154 | journal={arXiv preprint arXiv:1705.04977}, 155 | year={2017} 156 | } 157 | ``` 158 | 159 | 160 | -------------------------------------------------------------------------------- /figures/explanation1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/figures/explanation1.png -------------------------------------------------------------------------------- /figures/explanation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/figures/explanation2.png -------------------------------------------------------------------------------- /figures/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/figures/overview.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | deepctr==0.6.0 2 | future==0.17.1 3 | h5py==2.8.0 4 | jupyterlab==1.2.6 5 | matplotlib==3.1.2 6 | nltk==3.3 7 | numpy==1.17.1 8 | ipywidgets==6.0.0 9 | pandas==0.20.3 10 | protobuf==3.11.2 11 | scikit-image==0.14.0 12 | scikit-learn==0.21.3 13 | scipy==1.3.1 14 | tables==3.5.2 15 | tensorboard==1.14.0 16 | tensorflow-estimator==1.14.0 17 | tensorflow-gpu==1.14.0 18 | torch==1.3.1 19 | torchtext==0.3.1 20 | torchvision==0.4.2 21 | tqdm==4.32.2 22 | transformers==2.4.1 --------------------------------------------------------------------------------