├── .gitignore ├── 1. madex ├── madex_example_dna.ipynb ├── madex_example_graph.ipynb ├── madex_example_image.ipynb ├── madex_example_text.ipynb ├── neural_interaction_detection.py ├── sampling_and_inference.py └── utils │ ├── data │ ├── cora │ │ ├── README │ │ ├── cora.cites │ │ └── cora.content │ └── sample_images │ │ ├── bus.jpg │ │ ├── dog.jpg │ │ ├── shark.jpg │ │ └── viaduct.jpg │ ├── dna_utils.py │ ├── general_utils.py │ ├── graph_utils.py │ ├── image_utils.py │ ├── lime │ ├── lime_base.py │ └── lime_text.py │ ├── linear_cross_utils.py │ ├── pretrained │ ├── dna_cnn.pt │ ├── gcn_cora.pt │ └── model_gcn.py │ └── text_utils.py ├── 2. glider ├── data │ └── initial_data_prep │ │ ├── avazu │ │ ├── config.py │ │ └── preprocess.py │ │ ├── criteo │ │ ├── config.py │ │ ├── preprocess.py │ │ └── scale.py │ │ ├── kdd2012 │ │ ├── config.py │ │ ├── preprocess.py │ │ └── scale.py │ │ └── kfold_split │ │ ├── config.py │ │ └── stratifiedKfold.py ├── detect_global_interactions.py ├── make_cross_feature_data.py ├── models │ └── autoint │ │ ├── README.md │ │ ├── model.py │ │ └── train.py ├── train_deepctr.py └── utils │ ├── cross_feature_utils.py │ └── global_interaction_utils.py ├── README.md ├── figures ├── explanation1.png ├── explanation2.png └── overview.png └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .data 2 | .ipynb_checkpoints 3 | __pycache__ 4 | *.npy 5 | *.h5 6 | *.ckpt-* 7 | *.pyc 8 | *.save 9 | *.swp 10 | *.zip 11 | backup* 12 | -------------------------------------------------------------------------------- /1. madex/madex_example_dna.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchtext import datasets, data\n", 10 | "import numpy as np\n", 11 | "import os, sys\n", 12 | "from time import time\n", 13 | "\n", 14 | "sys.path.append(\"../1. 
madex\")\n", 15 | "\n", 16 | "from neural_interaction_detection import *\n", 17 | "from sampling_and_inference import *\n", 18 | "from utils.dna_utils import *\n", 19 | "\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import warnings\n", 23 | "warnings.filterwarnings(\"ignore\")\n", 24 | "\n", 25 | "%load_ext autoreload\n", 26 | "%autoreload 2\n", 27 | "\n", 28 | "device = torch.device(\"cuda:0\")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Load Model" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model = load_dna_model(\"utils/pretrained/dna_cnn.pt\").to(device)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Get DNA Sequence" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "GTAGGTAAGCGCACGTGTTGCACTTCCCTTAATCCA True\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "np.random.seed(42)\n", 69 | "seq_instance = generate_random_dna_sequence_with_CACGTG()\n", 70 | "print(seq_instance, \"CACGTG\" in seq_instance)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Run MADEX" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "100%|██████████| 60/60 [00:02<00:00, 29.46it/s]\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "data_inst = {\"orig\": seq_instance, \"vectorizer\": encode_dna_onehot}\n", 95 | "Xs, Ys = generate_perturbation_dataset_dna(data_inst, model, device, seed=42)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "0.0046 test loss, 16.0 seconds elapsed\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "t0 = time()\n", 113 | "interactions, mlp_loss = detect_interactions(Xs, Ys, weight_samples=False, seed=42, verbose=False, add_linear=False)\n", 114 | "print(\"{} test loss, {} seconds elapsed\".format(round(mlp_loss, 4), round(time() - t0, 1)))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "interaction ranking \n", 127 | "\n", 128 | "1 found CACGTG >> ('C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16')\n", 129 | "2 ('A_21', 'C_25')\n", 130 | "3 ('C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'A_21')\n", 131 | "4 ('C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18')\n", 132 | "5 ('A_21', 'C_25', 'C_26')\n", 133 | "6 ('A_21', 'T_23', 'C_25', 'C_26')\n", 134 | "7 ('A_21', 'T_23', 'C_25', 'C_26', 'T_28')\n", 135 | "8 ('A_2', 'C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18')\n", 136 | "9 ('A_2', 'A_6', 'C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18')\n", 137 | "10 ('A_2', 'A_6', 'C_11', 'A_12', 'C_13', 'G_14', 'T_15', 'G_16', 'T_18', 'C_20')\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "print(\"interaction ranking\", \"\\n\")\n", 143 | "for rank, inter in enumerate(interactions[:10]):\n", 144 | " inter_indices, _ = inter\n", 145 | " inter_verbose = tuple((seq_instance[s], s) for s in inter_indices)\n", 146 | "\n", 147 | " 
inter_nucleotides, _ = zip(*inter_verbose)\n", 148 | " if \"\".join(inter_nucleotides) == \"CACGTG\" and all(np.diff(inter_indices) == 1):\n", 149 | " postfix = \"found CACGTG >>\"\n", 150 | " else:\n", 151 | " postfix = \"\"\n", 152 | " print(rank+1, postfix, tuple(a + \"_\" + str(b) for a,b in inter_verbose))\n", 153 | "\n" 154 | ] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "Python [conda env:torch]", 160 | "language": "python", 161 | "name": "conda-env-torch-py" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.6.2" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 4 178 | } 179 | -------------------------------------------------------------------------------- /1. madex/madex_example_graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchtext import datasets, data\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import numpy as np\n", 12 | "import os, sys\n", 13 | "from time import time\n", 14 | "\n", 15 | "from neural_interaction_detection import *\n", 16 | "from sampling_and_inference import *\n", 17 | "from utils.general_utils import *\n", 18 | "from utils.graph_utils import *\n", 19 | "\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import warnings\n", 23 | "warnings.filterwarnings(\"ignore\")\n", 24 | "\n", 25 | "%load_ext autoreload\n", 26 | "%autoreload 2\n", 27 | "\n", 28 | "device = torch.device(\"cuda:0\")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Load Model" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model_folder = \"utils/pretrained\"\n", 45 | "\n", 46 | "model, n_nodes, n_hops, test_idxs = get_graph_model(model_folder)\n", 47 | "model = model.to(device)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Classify Graph" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "data_folder = \"utils/data/cora\"\n", 64 | "\n", 65 | "node_feats, adj_mat, labels = load_cora(data_folder, device)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "target node classification: 6\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "target_idx = test_idxs[0]\n", 83 | "\n", 84 | "preds = model(node_feats, convert_adj_to_da(adj_mat))\n", 85 | "classification = torch.argmax(preds, 1).cpu().numpy()[target_idx] \n", 86 | "print(\"target node classification:\", classification)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Run MADEX" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "100%|██████████| 6000/6000 [01:40<00:00, 59.72it/s]\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | 
"data_inst = {\"nodes\": node_feats, \"edges\": adj_mat, \"test_idxs\": test_idxs}\n", 111 | "Xs, Ys = generate_perturbation_dataset_graph(data_inst, model, target_idx, n_hops+1, device, seed=42, std_scale=False)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "19.4754 test loss, 94.2 seconds elapsed\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "t0 = time()\n", 129 | "interactions, mlp_loss = detect_interactions(Xs, Ys, weight_samples=True, seed=42, verbose=False)\n", 130 | "print(\"{} test loss, {} seconds elapsed\".format(round(mlp_loss, 4), round(time() - t0, 1)))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Show Main Effects and Interaction Interpretations" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "legend: (hops from target node, node idx). All hops should be within n_hops: 3\n", 150 | "\n", 151 | "target (0, 1808)\n", 152 | "\n", 153 | "main effects\n", 154 | "(2, 722)\n", 155 | "(2, 2465)\n", 156 | "(2, 264)\n", 157 | "(2, 1189)\n", 158 | "(2, 2146)\n", 159 | "\n", 160 | "interactions\n", 161 | "2\n", 162 | "inter 0: ((1, 638), (2, 722))\n", 163 | "4\n", 164 | "inter 1: ((2, 264), (1, 638), (2, 722), (2, 2465))\n", 165 | "5\n", 166 | "inter 2: ((2, 264), (1, 638), (2, 722), (2, 1189), (2, 2465))\n", 167 | "6\n", 168 | "inter 3: ((2, 264), (1, 638), (2, 722), (2, 1189), (2, 2146), (2, 2465))\n", 169 | "9\n", 170 | "inter 4: ((2, 264), (2, 294), (2, 296), (1, 638), (2, 722), (2, 1189), (2, 1327), (2, 2146), (2, 2465))\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "node_to_hop = get_hops_to_target(target_idx, adj_mat, n_hops)\n", 176 | "local_map = data_inst[\"local_idx_map\"]\n", 177 | "\n", 178 | "print(\"legend: (hops from target node, node idx). All hops should be within n_hops:\", n_hops)\n", 179 | "\n", 180 | "print(\"\\ntarget\", (0, target_idx))\n", 181 | "print(\"\\nmain effects\")\n", 182 | "for uni, att in get_lime_attributions(Xs, Ys)[:5]:\n", 183 | " if att > 0:\n", 184 | " print((node_to_hop[local_map[uni]],local_map[uni]))\n", 185 | "print(\"\\ninteractions\")\n", 186 | "for i, inter in enumerate(interactions[:5]):\n", 187 | " print(len(inter[0]))\n", 188 | " print(\"inter {}:\".format(i), tuple((node_to_hop[local_map[n]],local_map[n]) for n in inter[0]))\n" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python [conda env:torch]", 195 | "language": "python", 196 | "name": "conda-env-torch-py" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.6.2" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 4 213 | } 214 | -------------------------------------------------------------------------------- /1. 
madex/madex_example_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchtext import datasets, data\n", 10 | "import numpy as np\n", 11 | "import os, sys\n", 12 | "from time import time\n", 13 | "\n", 14 | "from neural_interaction_detection import *\n", 15 | "from sampling_and_inference import *\n", 16 | "from utils.general_utils import *\n", 17 | "from utils.text_utils import *\n", 18 | "\n", 19 | "import warnings\n", 20 | "warnings.filterwarnings(\"ignore\")\n", 21 | "%load_ext autoreload\n", 22 | "%autoreload 2\n", 23 | "\n", 24 | "device = torch.device(\"cuda:0\")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Load Model" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "Widget Javascript not detected. It may not be installed or enabled properly.\n" 44 | ] 45 | }, 46 | { 47 | "data": { 48 | "application/vnd.jupyter.widget-view+json": { 49 | "model_id": "f1b3bc5eb16f46edb5fcd643307db412" 50 | } 51 | }, 52 | "metadata": {}, 53 | "output_type": "display_data" 54 | }, 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "model = get_bert_model(device)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Classify Sentence" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "positive sentiment\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "sentence = \"this was not a great movie, but a good movie nevertheless\"\n", 89 | "\n", 90 | "out = model(sentence)\n", 91 | "pred = np.argmax(out[0])\n", 92 | "print((\"positive\" if pred== 1 else \"negative\") + \" sentiment\")" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Run MADEX" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stderr", 109 | "output_type": "stream", 110 | "text": [ 111 | "100%|██████████| 12/12 [00:10<00:00, 1.15it/s]\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "data_inst = {\"orig\": sentence}\n", 117 | "Xs, Ys = generate_perturbation_dataset_text(data_inst, model, 1, device, model_id=\"bert\", batch_size=500, seed=42, std_scale=True)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "0.0142 test loss, 29.9 seconds elapsed\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "t0 = time()\n", 135 | "interactions, mlp_loss = detect_interactions(Xs, Ys, detector=\"GradientNID\", add_linear=True, device=device, weight_samples=True, seed=42, verbose=False)\n", 136 | "print(\"{} test loss, {} seconds elapsed\".format(round(mlp_loss, 4), round(time() - t0, 1)))" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Show Main Effects and Interaction Interpretations" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 
6, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "this was not a great movie, but a good movie nevertheless\n", 156 | "\n", 157 | "main effects: ('but', 'a', 'good', 'movie', 'nevertheless')\n", 158 | "\n", 159 | "top-5 interactions\n", 160 | "inter 1: ('not', 'but') 2.7557428\n", 161 | "inter 2: ('but', 'good') 1.9747727\n", 162 | "inter 3: ('not', 'good') 1.8207084\n", 163 | "inter 4: ('great', 'good') 1.3452238\n", 164 | "inter 5: ('not', 'great') 1.2503706\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "print(sentence + \"\\n\")\n", 170 | "\n", 171 | "dom_map = data_inst[\"domain_mapper\"]\n", 172 | "\n", 173 | "lime_atts = get_lime_attributions(Xs, Ys)\n", 174 | "print(\"main effects:\", map_words([i for i, a in lime_atts if a*(pred*2-1) > 0], dom_map))\n", 175 | "\n", 176 | "print(\"\\ntop-5 interactions\")\n", 177 | "for i, inter_tuple in enumerate(interactions[:5]):\n", 178 | " inter, strength = inter_tuple\n", 179 | " word_inter = map_words(inter, dom_map)\n", 180 | " print(\"inter {}:\".format(i+1), word_inter, strength)" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python [conda env:torch]", 187 | "language": "python", 188 | "name": "conda-env-torch-py" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.6.2" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 4 205 | } 206 | -------------------------------------------------------------------------------- /1. 
madex/neural_interaction_detection.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import operator 3 | import numpy as np 4 | import torch 5 | from torch.utils import data 6 | import torch.nn as nn 7 | from utils.general_utils import * 8 | from torch import autograd 9 | 10 | 11 | def preprocess_weights(weights): 12 | w_later = np.abs(weights[-1]) 13 | w_input = np.abs(weights[0]) 14 | 15 | for i in range(len(weights) - 2, 0, -1): 16 | w_later = np.matmul(w_later, np.abs(weights[i])) 17 | 18 | return w_input, w_later 19 | 20 | 21 | def interpret_interactions_from_weights(w_input, w_later, get_main_effects=False): 22 | interaction_strengths = {} 23 | for i in range(w_later.shape[1]): 24 | sorted_hweights = sorted( 25 | enumerate(w_input[i]), key=lambda x: x[1], reverse=True 26 | ) 27 | interaction_candidate = [] 28 | candidate_weights = [] 29 | for j in range(w_input.shape[1]): 30 | bisect.insort(interaction_candidate, sorted_hweights[j][0]) 31 | candidate_weights.append(sorted_hweights[j][1]) 32 | 33 | if not get_main_effects and len(interaction_candidate) == 1: 34 | continue 35 | interaction_tup = tuple(interaction_candidate) 36 | if interaction_tup not in interaction_strengths: 37 | interaction_strengths[interaction_tup] = 0 38 | interaction_strength = (min(candidate_weights)) * (np.sum(w_later[:, i])) 39 | interaction_strengths[interaction_tup] += interaction_strength 40 | 41 | interaction_ranking = sorted( 42 | interaction_strengths.items(), key=operator.itemgetter(1), reverse=True 43 | ) 44 | 45 | return interaction_ranking 46 | 47 | 48 | 49 | def get_higher_order_grad(inter, model, x, device): 50 | x = torch.FloatTensor(x).to(device) 51 | x.requires_grad = True 52 | y = model(x) 53 | for i, v in enumerate(inter): 54 | if i == 0: 55 | grad = autograd.grad(y, x, create_graph=True)[0][v] # first feature 56 | else: 57 | grad = autograd.grad(grad, x, create_graph=True)[0][v] # second feature 58 | 59 | return grad.item()**2 60 | 61 | def get_second_order_grad(model, x, device): 62 | 63 | x = torch.FloatTensor(x).to(device) 64 | 65 | if x.nelement() < 2: 66 | return np.array([]) 67 | 68 | x.requires_grad = True 69 | 70 | y = model(x) 71 | grads = autograd.grad(y, x, create_graph=True)[0].squeeze() 72 | 73 | grad_list = [] 74 | for j, grad in enumerate(grads): 75 | grad2 = autograd.grad(grad, x, retain_graph = True)[0].squeeze() 76 | grad_list.append(grad2) 77 | 78 | grad_matrix = torch.stack(grad_list) 79 | return grad_matrix.cpu().numpy()**2 80 | 81 | 82 | def run_NID(weights): 83 | w_input, w_later = preprocess_weights(weights) 84 | interaction_ranking = interpret_interactions_from_weights(w_input, w_later) 85 | interaction_ranking_pruned = prune_redundant_interactions(interaction_ranking) 86 | 87 | return interaction_ranking_pruned 88 | 89 | def run_gradient_NID(mlp, x, grad_gpu): 90 | interaction_scores = {} 91 | 92 | if grad_gpu == -1: 93 | device = torch.device("cpu") 94 | else: 95 | device = torch.device("cuda:" + str(grad_gpu)) 96 | 97 | mlp = mlp.to(device) 98 | 99 | inter_matrix = get_second_order_grad(mlp, x, device) 100 | 101 | if len(inter_matrix) == 0: 102 | return [] 103 | 104 | inter_scores = [] 105 | 106 | for j in range(inter_matrix.shape[0]): 107 | for i in range(j): 108 | inter_scores.append(((i,j),inter_matrix[i,j])) 109 | 110 | inter_ranking = sorted(inter_scores, key=lambda x: -x[1]) 111 | 112 | return inter_ranking 113 | 114 | 115 | def prune_redundant_interactions(interaction_ranking, max_interactions=100): 
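    # Greedy de-duplication of the interaction ranking: walk candidates from
    # strongest to weakest and skip any candidate that is a strict subset of an
    # interaction that has already been kept, since it adds no features beyond a
    # stronger interaction already in the list. When a kept candidate strictly
    # contains previously tracked sets, those smaller sets are dropped from the
    # tracking list (not from the output) so that later candidates are compared
    # against the larger set. At most max_interactions interactions are returned.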
116 | interaction_ranking_pruned = [] 117 | current_superset_inters = [] 118 | for inter, strength in interaction_ranking: 119 | set_inter = set(inter) 120 | if len(interaction_ranking_pruned) >= max_interactions: 121 | break 122 | subset_inter_skip = False 123 | update_superset_inters = [] 124 | for superset_inter in current_superset_inters: 125 | if set_inter < superset_inter: 126 | subset_inter_skip = True 127 | break 128 | elif not (set_inter > superset_inter): 129 | update_superset_inters.append(superset_inter) 130 | if subset_inter_skip: 131 | continue 132 | current_superset_inters = update_superset_inters 133 | current_superset_inters.append(set_inter) 134 | interaction_ranking_pruned.append((inter, strength)) 135 | 136 | return interaction_ranking_pruned 137 | 138 | 139 | def detect_interactions( 140 | Xs, 141 | Ys, 142 | detector = "NID", 143 | x_instance_representation = None, 144 | arch=[256, 128, 64], 145 | batch_size=100, 146 | device=torch.device("cpu"), 147 | weight_samples=False, 148 | add_linear=False, 149 | l1_const=None, 150 | grad_gpu=-1, 151 | seed=None, 152 | **kwargs 153 | ): 154 | def get_weights(model): 155 | weights = [] 156 | for name, param in model.named_parameters(): 157 | if "interaction_mlp" in name and "weight" in name: 158 | weights.append(param.cpu().detach().numpy()) 159 | return weights 160 | 161 | assert(detector in {"NID", "GradientNID"}) 162 | 163 | if seed is not None: 164 | set_seed(seed) 165 | 166 | if type(Xs) != dict and type(Ys) != dict: 167 | Xs = {"train": Xs} 168 | Ys = {"train": Ys} 169 | 170 | Wd = get_sample_weights(Xs, enable=weight_samples, **kwargs) 171 | 172 | data_loaders = {} 173 | for k in Xs: 174 | feats = force_float(Xs[k]) 175 | targets = force_float(Ys[k]) 176 | sws = force_float(Wd[k]).unsqueeze(1) 177 | dataset = data.TensorDataset(feats, targets, sws) 178 | data_loaders[k] = data.DataLoader(dataset, batch_size) 179 | 180 | if detector == "GradientNID": 181 | act_func = nn.Softplus() 182 | if l1_const == None: 183 | l1_const = 0 184 | else: 185 | act_func = nn.ReLU() 186 | if l1_const == None: 187 | l1_const = 1e-4 188 | 189 | mlp = MLP(feats.shape[1], arch, add_linear=add_linear, act_func=act_func).to(device) 190 | 191 | mlp, mlp_loss = train(mlp, data_loaders, device=device, l1_const=l1_const, **kwargs) 192 | 193 | if detector == "NID": 194 | inters = run_NID(get_weights(mlp)) 195 | elif detector == "GradientNID": 196 | if x_instance_representation is None: 197 | x_instance_representation = np.ones((1,Xs["train"].shape[1])) 198 | inters = run_gradient_NID(mlp, x_instance_representation, grad_gpu) 199 | 200 | return inters, mlp_loss 201 | -------------------------------------------------------------------------------- /1. 
madex/sampling_and_inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from tqdm import tqdm 4 | import copy 5 | from utils.general_utils import * 6 | from utils.text_utils import * 7 | from utils.graph_utils import * 8 | from utils.dna_utils import * 9 | from utils.lime.lime_text import * 10 | 11 | 12 | def generate_binary_perturbations( 13 | num_feat, num_samples=100, init_on=True, perturbed_features=None 14 | ): 15 | if perturbed_features == None: 16 | perturbed_features = {"indices": np.array(range(num_feat))} 17 | num_perturb = len(perturbed_features["indices"]) 18 | 19 | samples_binary = np.ones((num_samples, num_feat), dtype=np.int8) 20 | perturb_binary = np.ones((num_samples, num_perturb), dtype=np.int8) 21 | num_flips = np.random.randint(1, num_perturb + 1, num_samples) 22 | 23 | for r in range(num_samples): 24 | if not (init_on and r == 0): 25 | num_flip = num_flips[r] 26 | perturb_binary[r, 0:num_flip] = np.zeros(num_flip, dtype=np.int8) 27 | np.random.shuffle(perturb_binary[r]) 28 | samples_binary[r, perturbed_features["indices"]] = perturb_binary[r] 29 | 30 | return samples_binary 31 | 32 | 33 | def generate_perturbation_dataset_autoint( 34 | data_inst, 35 | model, 36 | dense_feat_indices, 37 | sparse_feat_indices, 38 | num_samples=6000, 39 | seed=None, 40 | **kwargs 41 | ): 42 | if seed is not None: 43 | set_seed(seed) 44 | 45 | def inv_sigmoid(y): 46 | return np.log(y / (1 - y)) 47 | 48 | num_feats = len(dense_feat_indices) + len(sparse_feat_indices) 49 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 50 | 51 | means_arr = np.array([data_inst["means"][i] for i in dense_feat_indices]) 52 | 53 | perturb_Xv = [] 54 | perturb_Xi = [] 55 | for i in range(num_samples): 56 | raw_dense = data_inst["Xv"][dense_feat_indices] 57 | raw_sparse = data_inst["Xv"][sparse_feat_indices] 58 | binary_dense = samples_binary[i, dense_feat_indices] 59 | binary_sparse = samples_binary[i, sparse_feat_indices] 60 | perturb_raw_dense = raw_dense + binary_dense + means_arr * (1 - binary_dense) 61 | perturb_raw_sparse = binary_sparse * raw_sparse 62 | 63 | perturb_raw = np.zeros(num_feats) 64 | perturb_raw[dense_feat_indices] = perturb_raw_dense 65 | perturb_raw[sparse_feat_indices] = perturb_raw_sparse 66 | 67 | # perturb_raw = np.concatenate([perturb_raw_dense, perturb_raw_sparse]) 68 | perturb_Xv.append(perturb_raw) 69 | perturb_Xi.append(data_inst["Xi"]) 70 | perturb_Xv = np.stack(perturb_Xv) 71 | perturb_Xi = np.stack(perturb_Xi) 72 | 73 | samples_labels = inv_sigmoid(model.predict(perturb_Xi, perturb_Xv)) 74 | 75 | Xs, Ys = proprocess_data(samples_binary.astype(np.int64), samples_labels, **kwargs) 76 | return Xs, Ys 77 | 78 | 79 | def generate_perturbation_dataset_image( 80 | data_inst, 81 | model, 82 | class_idx, 83 | device, 84 | num_samples=6000, 85 | batch_size=100, 86 | seed=None, 87 | **kwargs 88 | ): 89 | # Based on LIME image: https://github.com/marcotcr/lime/blob/master/lime/lime_image.py 90 | 91 | if seed is not None: 92 | set_seed(seed) 93 | 94 | image = data_inst["orig"] 95 | segments = data_inst["segments"] 96 | num_feats = len(np.unique(segments)) 97 | 98 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 99 | 100 | image_means = image.copy() 101 | for i in np.unique(segments): 102 | image_means[segments == i] = ( 103 | np.mean(image[segments == i][:, 0]), 104 | np.mean(image[segments == i][:, 1]), 105 | np.mean(image[segments == i][:, 2]), 
106 | ) 107 | 108 | n_batches = int(np.ceil(num_samples / batch_size)) 109 | 110 | samples_labels = [] 111 | for i in tqdm(range(n_batches)): 112 | 113 | samples_binary_batch = samples_binary[i * batch_size : (i + 1) * batch_size] 114 | 115 | perturbed_imgs = [] 116 | for sample_binary in samples_binary_batch: 117 | temp = copy.deepcopy(image) 118 | zeros = np.where(sample_binary == 0)[0] 119 | mask = np.zeros(segments.shape).astype(bool) 120 | for z in zeros: 121 | mask[segments == z] = True 122 | temp[mask] = image_means[mask] 123 | 124 | perturbed_imgs.append(temp) 125 | 126 | torch_img = ( 127 | torch.FloatTensor(np.array(perturbed_imgs)).to(device).permute(0, 3, 1, 2) 128 | ) 129 | preds = model(torch_img).data.cpu().numpy() 130 | samples_labels.extend(preds) 131 | 132 | samples_labels = np.stack(samples_labels) 133 | 134 | Xs, Ys = proprocess_data(samples_binary, samples_labels[:, class_idx], **kwargs) 135 | 136 | return Xs, Ys 137 | 138 | 139 | def generate_perturbation_dataset_text( 140 | data_inst, 141 | model, 142 | class_idx, 143 | device, 144 | num_samples=6000, 145 | batch_size=100, 146 | seed=None, 147 | model_id=None, 148 | **kwargs 149 | ): 150 | # Based on LIME image: https://github.com/marcotcr/lime/blob/master/lime/lime_text.py 151 | 152 | if seed is not None: 153 | set_seed(seed) 154 | 155 | text = data_inst["orig"] 156 | 157 | indexed_string = IndexedString(text, bow=False) 158 | data_inst["domain_mapper"] = TextDomainMapper(indexed_string) 159 | 160 | num_feats = indexed_string.num_words() 161 | 162 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 163 | 164 | n_batches = int(np.ceil(num_samples / batch_size)) 165 | 166 | samples_labels = [] 167 | for i in tqdm(range(n_batches)): 168 | 169 | samples_binary_batch = samples_binary[i * batch_size : (i + 1) * batch_size] 170 | 171 | perturbed_text = [] 172 | for sample_binary in samples_binary_batch: 173 | 174 | indices2invert = np.argwhere(sample_binary == 0).squeeze() 175 | inv = indexed_string.inverse_removing(indices2invert) 176 | 177 | if model_id == "bert": 178 | ex = inv 179 | else: 180 | ex = data.Example.fromlist([inv], fields=[("text", data_inst["vectorizer"])]) 181 | perturbed_text.append(ex) 182 | 183 | if model_id == "bert": 184 | preds = model(perturbed_text) 185 | else: 186 | dset = data.Dataset(perturbed_text, fields=[("text", data_inst["vectorizer"])]) 187 | test_samples = data.Batch(data=perturbed_text, dataset=dset, device=device) 188 | preds = model(test_samples).data.cpu().numpy() 189 | 190 | samples_labels.append(preds) 191 | 192 | samples_labels = np.concatenate(samples_labels) 193 | 194 | Xs, Ys = proprocess_data(samples_binary, samples_labels[:, class_idx], **kwargs) 195 | 196 | return Xs, Ys 197 | 198 | 199 | def generate_perturbation_dataset_graph( 200 | data_inst, 201 | model, 202 | target_idx, 203 | n_hops, 204 | device, 205 | num_samples=6000, 206 | batch_size=500, 207 | seed=None, 208 | **kwargs 209 | ): 210 | def get_output(x, da): 211 | return model(x, da)[test_idxs].detach().cpu() 212 | 213 | if seed is not None: 214 | set_seed(seed) 215 | 216 | node_feats = data_inst["nodes"] 217 | adj_mat = data_inst["edges"] 218 | test_idxs = data_inst["test_idxs"] 219 | 220 | da_mat = convert_adj_to_da(adj_mat) 221 | 222 | # Collect all nodes within a k-hop neighborhood of the target test index 223 | adj_cum = copy.deepcopy(adj_mat) 224 | for i in range(n_hops - 1): 225 | adj_cum = torch.matmul(adj_cum, adj_mat) 226 | 227 | sum_v = 0 228 | counter = 0 229 | locality_dict = 
dict() 230 | locality_dict_rev = dict() 231 | for i, v in enumerate(adj_cum[target_idx]): 232 | if v != 0: 233 | sum_v += v 234 | locality_dict[i] = counter 235 | locality_dict_rev[counter] = i 236 | counter += 1 237 | local_num_nodes = len(locality_dict) 238 | 239 | data_inst["local_idx_map"] = locality_dict_rev 240 | 241 | samples_binary = generate_binary_perturbations(local_num_nodes, num_samples, True) 242 | 243 | # Get the features associated binary samples 244 | data_new = [] 245 | for i in range(node_feats.shape[0]): 246 | if i in locality_dict: 247 | data_new.append(samples_binary[:, locality_dict[i]]) 248 | else: 249 | data_new.append(np.zeros(num_samples)) 250 | data_new = np.array(data_new).transpose() 251 | 252 | # Get the test predictions associated binary samples 253 | results = [] 254 | for d in tqdm(data_new): 255 | mask = torch.FloatTensor(d).view(-1, 1).expand(node_feats.size()) 256 | masked_features = node_feats * mask.to(device) 257 | output = get_output(masked_features, da_mat).numpy() 258 | results.append(output) 259 | results = np.array(results) 260 | 261 | y_idx = test_idxs.index(target_idx) 262 | classifications = get_output(node_feats, da_mat).max(1)[1] 263 | 264 | samples_labels = results[:, y_idx, classifications[y_idx]] 265 | 266 | # samples_labels = [] 267 | # for ci, c in enumerate(classifications): 268 | # samples_labels.append(results[:, ci, c]) 269 | 270 | Xs, Ys = proprocess_data(samples_binary, samples_labels, **kwargs) 271 | 272 | return Xs, Ys 273 | 274 | 275 | def generate_perturbation_dataset_dna( 276 | data_inst, model, device, num_samples=6000, batch_size=100, seed=None, **kwargs 277 | ): 278 | 279 | if seed is not None: 280 | set_seed(seed) 281 | 282 | seq = data_inst["orig"] 283 | vectorizer = data_inst["vectorizer"] 284 | 285 | indexed_seq = IndexedNucleotides(seq) 286 | 287 | num_feats = indexed_seq.num_nucleotides() 288 | 289 | samples_binary = generate_binary_perturbations(num_feats, num_samples, True) 290 | 291 | n_batches = int(np.ceil(num_samples / batch_size)) 292 | 293 | samples_labels = [] 294 | for i in tqdm(range(n_batches)): 295 | 296 | samples_binary_batch = samples_binary[i * batch_size : (i + 1) * batch_size] 297 | 298 | perturbed_seqs = [] 299 | for sample_binary in samples_binary_batch: 300 | 301 | indices2invert = np.argwhere(sample_binary == 0).squeeze() 302 | inv = indexed_seq.perturb_nucleotide(indices2invert) 303 | ex = vectorizer(inv) 304 | perturbed_seqs.append(ex) 305 | 306 | test_samples = torch.FloatTensor(perturbed_seqs).permute(0, 2, 1).to(device) 307 | preds = model(test_samples).data.cpu().numpy() 308 | 309 | samples_labels.append(preds) 310 | 311 | samples_labels = np.concatenate(samples_labels).squeeze() 312 | 313 | Xs, Ys = proprocess_data(samples_binary, samples_labels, **kwargs) 314 | 315 | return Xs, Ys 316 | -------------------------------------------------------------------------------- /1. madex/utils/data/cora/README: -------------------------------------------------------------------------------- 1 | This directory contains the a selection of the Cora dataset (www.research.whizbang.com/data). 2 | 3 | The Cora dataset consists of Machine Learning papers. These papers are classified into one of the following seven classes: 4 | Case_Based 5 | Genetic_Algorithms 6 | Neural_Networks 7 | Probabilistic_Methods 8 | Reinforcement_Learning 9 | Rule_Learning 10 | Theory 11 | 12 | The papers were selected in a way such that in the final corpus every paper cites or is cited by atleast one other paper. 
There are 2708 papers in the whole corpus. 13 | 14 | After stemming and removing stopwords we were left with a vocabulary of size 1433 unique words. All words with document frequency less than 10 were removed. 15 | 16 | 17 | THE DIRECTORY CONTAINS TWO FILES: 18 | 19 | The .content file contains descriptions of the papers in the following format: 20 | 21 | <paper_id> <word_attributes>+ <class_label> 22 | 23 | The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last entry in the line contains the class label of the paper. 24 | 25 | The .cites file contains the citation graph of the corpus. Each line describes a link in the following format: 26 | 27 | <ID of cited paper> <ID of citing paper> 28 | 29 | Each line contains two paper IDs. The first entry is the ID of the paper being cited and the second ID stands for the paper which contains the citation. The direction of the link is from right to left. If a line is represented by "paper1 paper2" then the link is "paper2->paper1". -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/bus.jpg -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/dog.jpg -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/shark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/shark.jpg -------------------------------------------------------------------------------- /1. madex/utils/data/sample_images/viaduct.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/data/sample_images/viaduct.jpg -------------------------------------------------------------------------------- /1. 
madex/utils/dna_utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import h5py as h5 4 | import numpy as np 5 | from utils.general_utils import * 6 | 7 | # from sampling_and_inference import * 8 | 9 | 10 | class Flatten(nn.Module): 11 | def forward(self, input): 12 | return input.view(input.size(0), -1) 13 | 14 | 15 | def createConv1D(n_inp, n_out, hidden_units, kernel_size, seq_len, activation=nn.ReLU): 16 | 17 | layers = [] 18 | layers_size = [n_inp] + hidden_units 19 | for i in range(len(layers_size) - 1): 20 | layers.append(nn.Conv1d(layers_size[i], layers_size[i + 1], kernel_size)) 21 | if activation is not None: 22 | layers.append(activation()) 23 | layers.append(Flatten()) 24 | seq_len = seq_len - (kernel_size - 1) * len(hidden_units) 25 | linear_dim = layers_size[-1] * seq_len 26 | layers.append(nn.Linear(linear_dim, n_out)) 27 | 28 | return nn.Sequential(*layers) 29 | 30 | 31 | class conv1D(nn.Module): 32 | def __init__(self, n_inp, n_out, hidden_units, kernel_size, seq_len, **kwargs): 33 | super(conv1D, self).__init__() 34 | self.conv1D = createConv1D(n_inp, n_out, hidden_units, kernel_size, seq_len) 35 | 36 | def forward(self, x): 37 | return self.conv1D(x) 38 | 39 | 40 | def load_dna_model(path): 41 | model = conv1D(4, 1, [64, 64], 5, 36) 42 | model.load_state_dict(torch.load(path)) 43 | return model 44 | 45 | 46 | def generate_random_dna_sequence_with_CACGTG(length=36, seed=None): 47 | if seed is not None: 48 | set_seed(seed) 49 | 50 | nucleotides = ["A", "C", "G", "T"] 51 | seq = "" 52 | ebox = "CACGTG" 53 | for i in np.random.randint(0, 4, (length)): 54 | seq += nucleotides[i] 55 | i = np.random.randint(0, length - len(ebox)) 56 | seq = seq[:i] + ebox + seq[i + len(ebox) :] 57 | return seq 58 | 59 | 60 | def encode_dna_onehot(seq): 61 | seq_as_list = list(seq) 62 | 63 | for i, c in enumerate(seq_as_list): 64 | if c == "A": 65 | seq_as_list[i] = [1, 0, 0, 0] 66 | elif c == "T": 67 | seq_as_list[i] = [0, 1, 0, 0] 68 | elif c == "C": 69 | seq_as_list[i] = [0, 0, 1, 0] 70 | elif c == "G": 71 | seq_as_list[i] = [0, 0, 0, 1] 72 | else: 73 | seq_as_list[i] = [0, 0, 0, 0] 74 | 75 | return np.array(seq_as_list) 76 | 77 | 78 | class IndexedNucleotides(object): 79 | """String with various indexes.""" 80 | 81 | """Based on LIME official Repo""" 82 | 83 | def __init__(self, raw_string): 84 | """Initializer. 
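        Indexes every position of the raw nucleotide string so that
        individual positions can later be flipped to random alternative
        bases via perturb_nucleotide().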
85 | 86 | Args: 87 | raw_string: string with raw text in it 88 | """ 89 | self.raw = raw_string 90 | self.as_list = list(self.raw) 91 | self.as_np = np.array(self.as_list) 92 | self.string_start = np.arange(len(self.raw)) 93 | vocab = {} 94 | self.inverse_vocab = [] 95 | self.positions = [] 96 | non_vocab = set() 97 | for i, char in enumerate(self.as_np): 98 | if char in non_vocab: 99 | continue 100 | self.inverse_vocab.append(char) 101 | self.positions.append(i) 102 | self.positions = np.array(self.positions) 103 | 104 | def raw_string(self): 105 | """Returns the original raw string""" 106 | return self.raw 107 | 108 | def num_nucleotides(self): 109 | """Returns the number of tokens in the vocabulary for this document.""" 110 | return len(self.inverse_vocab) 111 | 112 | def choose_alt(self, existing): 113 | nucleotides = ["A", "T", "G", "C"] 114 | nucleotides.remove(existing) 115 | return nucleotides[np.random.randint(0, 3)] 116 | 117 | def perturb_nucleotide(self, chars_to_remove): 118 | mask = np.ones(self.as_np.shape[0], dtype="bool") 119 | mask[self.__get_idxs(chars_to_remove)] = False 120 | return "".join( 121 | [ 122 | self.as_list[i] if mask[i] else self.choose_alt(self.as_list[i]) 123 | for i in range(mask.shape[0]) 124 | ] 125 | ) 126 | 127 | def __get_idxs(self, chars): 128 | """Returns indexes to appropriate words.""" 129 | return self.positions[chars] 130 | -------------------------------------------------------------------------------- /1. madex/utils/general_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from utils.lime import lime_base 4 | import sklearn 5 | from sklearn.preprocessing import StandardScaler 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import copy 9 | 10 | 11 | def set_seed(seed=42): 12 | np.random.seed(seed) 13 | torch.manual_seed(seed) 14 | if torch.cuda.is_available(): 15 | torch.cuda.manual_seed(seed) 16 | 17 | 18 | def force_float(X_numpy): 19 | return torch.from_numpy(X_numpy.astype(np.float32)) 20 | 21 | 22 | def proprocess_data(X, Y, valid_size=500, test_size=500, std_scale=False): 23 | 24 | n, p = X.shape 25 | ## Make dataset splits 26 | ntrain, nval, ntest = n - valid_size - test_size, valid_size, test_size 27 | 28 | Xs = { 29 | "train": X[:ntrain], 30 | "val": X[ntrain : ntrain + nval], 31 | "test": X[ntrain + nval : ntrain + nval + ntest], 32 | } 33 | Ys = { 34 | "train": np.expand_dims(Y[:ntrain], axis=1), 35 | "val": np.expand_dims(Y[ntrain : ntrain + nval], axis=1), 36 | "test": np.expand_dims(Y[ntrain + nval : ntrain + nval + ntest], axis=1), 37 | } 38 | 39 | for k in Xs: 40 | if len(Xs[k]) == 0: 41 | assert k != "train" 42 | del Xs[k] 43 | del Ys[k] 44 | 45 | if std_scale: 46 | scaler = StandardScaler() 47 | scaler.fit(Ys["train"]) 48 | for k in Ys: 49 | Ys[k] = scaler.transform(Ys[k]) 50 | Ys["scaler"] = scaler 51 | 52 | return Xs, Ys 53 | 54 | 55 | 56 | class MLP(nn.Module): 57 | def __init__( 58 | self, 59 | num_features, 60 | hidden_units, 61 | add_linear=False, 62 | act_func=nn.ReLU(), 63 | ): 64 | super(MLP, self).__init__() 65 | 66 | self.hidden_units = hidden_units 67 | self.add_linear = add_linear 68 | self.interaction_mlp = create_mlp([num_features] + hidden_units + [1], act_func=act_func) 69 | 70 | self.add_linear = add_linear 71 | 72 | if add_linear: 73 | self.linear = nn.Linear(num_features, 1, bias=False) 74 | 75 | 76 | def forward(self, x): 77 | y = self.interaction_mlp(x) 78 | 79 | if self.add_linear: 80 | y += 
self.linear(x) 81 | return y 82 | 83 | 84 | def create_mlp(layer_sizes, out_bias=True, act_func=nn.ReLU()): 85 | ls = list(layer_sizes) 86 | layers = nn.ModuleList() 87 | for i in range(1, len(ls) - 1): 88 | layers.append(nn.Linear(int(ls[i - 1]), int(ls[i]))) 89 | layers.append(act_func) 90 | layers.append(nn.Linear(int(ls[-2]), int(ls[-1]), bias=out_bias)) 91 | return nn.Sequential(*layers) 92 | 93 | 94 | def train( 95 | net, 96 | data_loaders, 97 | criterion=nn.MSELoss(reduction="none"), 98 | nepochs=100, 99 | verbose=False, 100 | early_stopping=True, 101 | patience=5, 102 | l1_const=1e-4, 103 | l2_const=0, 104 | learning_rate=0.01, 105 | opt_func=optim.Adam, 106 | device=torch.device("cpu"), 107 | **kwargs 108 | ): 109 | optimizer = opt_func(net.parameters(), lr=learning_rate, weight_decay=l2_const) 110 | 111 | def include_sws(loss, sws): 112 | assert loss.shape == sws.shape 113 | return (loss * sws / sws.sum()).sum() 114 | 115 | def evaluate(net, data_loader, criterion, device): 116 | losses = [] 117 | sws = [] 118 | for inputs, targets, sws_batch in data_loader: 119 | inputs = inputs.to(device) 120 | targets = targets.to(device) 121 | loss = criterion(net(inputs), targets).cpu().data 122 | losses.append(loss) 123 | sws.append(sws_batch) 124 | return include_sws(torch.stack(losses), torch.stack(sws)).item() 125 | 126 | best_loss = float("inf") 127 | best_net = None 128 | 129 | if "val" not in data_loaders: 130 | early_stopping = False 131 | 132 | patience_counter = 0 133 | 134 | for epoch in range(nepochs): 135 | if verbose: 136 | print("epoch", epoch) 137 | running_loss = 0.0 138 | run_count = 0 139 | for i, data in enumerate(data_loaders["train"], 0): 140 | inputs, targets, sws = data 141 | inputs = inputs.to(device) 142 | targets = targets.to(device) 143 | sws = sws.to(device) 144 | optimizer.zero_grad() 145 | outputs = net(inputs) 146 | loss = include_sws(criterion(outputs, targets), sws) 147 | 148 | reg_loss = 0 149 | for name, param in net.named_parameters(): 150 | if "interaction_mlp" in name and "weight" in name: 151 | reg_loss += torch.sum(torch.abs(param)) 152 | 153 | (loss + reg_loss * l1_const).backward() 154 | optimizer.step() 155 | running_loss += loss.item() 156 | run_count += 1 157 | 158 | if epoch % 1 == 0: 159 | key = "val" if "val" in data_loaders else "train" 160 | val_loss = evaluate(net, data_loaders[key], criterion, device) 161 | 162 | if verbose: 163 | print( 164 | "[%d, %5d] train loss: %.4f, val loss: %.4f" 165 | % (epoch + 1, nepochs, running_loss / run_count, val_loss) 166 | ) 167 | if early_stopping: 168 | if val_loss < best_loss: 169 | best_loss = val_loss 170 | best_net = copy.deepcopy(net) 171 | patience_counter = 0 172 | else: 173 | patience_counter += 1 174 | if patience_counter > patience: 175 | net = best_net 176 | val_loss = best_loss 177 | if verbose: 178 | print("early stopping!") 179 | break 180 | 181 | prev_loss = running_loss 182 | running_loss = 0.0 183 | 184 | if "test" in data_loaders: 185 | key = "test" 186 | elif "val" in data_loaders: 187 | key = "val" 188 | else: 189 | key = "train" 190 | test_loss = evaluate(net, data_loaders[key], criterion, device) 191 | 192 | if verbose: 193 | print("Finished Training. 
Test loss: ", test_loss) 194 | 195 | return net, test_loss 196 | 197 | 198 | def merge_overlapping_sets( 199 | prediction_scores, 200 | interaction_atts, 201 | overlap_thresh=0.5, 202 | rel_gain_threshold=0, 203 | patience=1, 204 | num_features=None, 205 | ): 206 | def overlap_coef(A, B): 207 | A = set(A) 208 | B = set(B) 209 | return len(A & B) / min(len(A), len(B)) 210 | 211 | def merge_sets(inter_sets): 212 | prev_sets = None 213 | inter_sets = list(inter_sets) 214 | inter_sets_merged = inter_sets 215 | while inter_sets != prev_sets: 216 | prev_sets = list(inter_sets) 217 | for A in inter_sets: 218 | for B in inter_sets_merged: 219 | if A != B: 220 | if overlap_coef(A, B) >= overlap_thresh: 221 | inter_sets_merged.append( 222 | tuple(sorted(set(A) | set(B))) 223 | ) # merge 224 | if A in inter_sets_merged: 225 | inter_sets_merged.remove(A) 226 | if B in inter_sets_merged: 227 | inter_sets_merged.remove(B) 228 | 229 | inter_sets = list(set(inter_sets_merged)) 230 | return inter_sets 231 | 232 | def threshold_inter_sets(interaction_atts, prediction_scores): 233 | scores = prediction_scores 234 | inter_sets = [] 235 | patience_counter = 0 236 | best_score = scores[0] 237 | for i in range(1, len(scores)): 238 | cur_score = scores[i] 239 | rel_gain = (cur_score - best_score) / best_score 240 | inter_sets_temp, _ = zip(*interaction_atts[i - 1]) 241 | if num_features is not None: 242 | if any(len(inter) == num_features for inter in inter_sets_temp): 243 | break 244 | if rel_gain > rel_gain_threshold: 245 | best_score = cur_score 246 | inter_sets = inter_sets_temp 247 | patience_counter = 0 248 | else: 249 | if patience_counter < patience: 250 | patience_counter += 1 251 | else: 252 | break 253 | return inter_sets 254 | 255 | inter_sets = threshold_inter_sets(interaction_atts, prediction_scores) 256 | inter_sets_merged = merge_sets(inter_sets) 257 | 258 | return inter_sets_merged 259 | 260 | 261 | ###################################################### 262 | # The following are based on the official LIME repo 263 | ###################################################### 264 | 265 | 266 | def get_sample_distances(Xs): 267 | all_ones = np.ones((1, Xs["train"].shape[1])) 268 | Dd = {} 269 | for k in Xs: 270 | if k == "scaler": 271 | continue 272 | distances = sklearn.metrics.pairwise_distances( 273 | Xs[k], all_ones, metric="cosine" 274 | ).ravel() 275 | Dd[k] = distances 276 | 277 | return Dd 278 | 279 | 280 | def get_sample_weights(Xs, kernel_width=0.25, enable=True, **kwargs): 281 | def kernel(d): 282 | return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2)) 283 | 284 | if enable: 285 | Dd = get_sample_distances(Xs) 286 | 287 | Wd = {} 288 | for k in Xs: 289 | if k == "scaler": 290 | continue 291 | if enable: 292 | Wd[k] = kernel(Dd[k]) 293 | else: 294 | Wd[k] = np.ones(Xs[k].shape[0]) 295 | 296 | return Wd 297 | 298 | 299 | def get_lime_attributions( 300 | Xs, Ys, max_features=10000, kernel_width=0.25, weight_samples=True, sort=True 301 | ): 302 | def kernel(d): 303 | return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2)) 304 | 305 | distances = get_sample_distances(Xs)["train"] 306 | if not weight_samples: 307 | distances = np.ones_like(distances).squeeze(1) 308 | 309 | lb = lime_base.LimeBase(kernel_fn=kernel) 310 | lime_atts = lb.explain_instance_with_data( 311 | Xs["train"], Ys["train"], distances, 0, max_features 312 | )[0] 313 | if sort: 314 | lime_atts = sorted(lime_atts, key=lambda x: -x[1]) 315 | return lime_atts 316 | 
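
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; nothing in the repo calls this).
# It shows how the surrogate-model pieces in this module fit together on a
# synthetic binary "perturbation" dataset whose response has a main effect on
# feature 0 and an interaction between features 1 and 2. The dataset, layer
# sizes, and epoch count below are assumptions made for this demo, not values
# used elsewhere in the repo. Run it as a module from the "1. madex" directory
# so the utils.* imports at the top of this file resolve.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils import data as torch_data

    set_seed(42)

    # synthetic binary perturbations and a response with a 2-way interaction
    X = np.random.randint(0, 2, size=(2000, 5)).astype(np.float32)
    Y = 2.0 * X[:, 0] + 3.0 * X[:, 1] * X[:, 2] + 0.1 * np.random.randn(2000)

    Xs, Ys = proprocess_data(X, Y, valid_size=200, test_size=200)
    Wd = get_sample_weights(Xs, enable=False)  # uniform sample weights

    loaders = {}
    for split in Xs:
        dataset = torch_data.TensorDataset(
            force_float(Xs[split]),
            force_float(Ys[split]),
            force_float(Wd[split]).unsqueeze(1),
        )
        loaders[split] = torch_data.DataLoader(dataset, batch_size=100)

    # fit the surrogate MLP and report its held-out loss
    surrogate = MLP(X.shape[1], [64, 32], add_linear=True)
    surrogate, test_loss = train(surrogate, loaders, nepochs=30, verbose=False)
    print("surrogate test loss:", round(test_loss, 4))

    # linear (main-effect) attributions from the same perturbation data
    for feat, att in get_lime_attributions(Xs, Ys)[:3]:
        print("feature {} attribution: {:.3f}".format(feat, att))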
-------------------------------------------------------------------------------- /1. madex/utils/graph_utils.py: -------------------------------------------------------------------------------- 1 | from utils.pretrained.model_gcn import * 2 | from collections import defaultdict 3 | import numpy as np 4 | import copy 5 | 6 | 7 | def get_graph_model(model_folder): 8 | 9 | meta = torch.load(model_folder + "/gcn_cora.pt") 10 | 11 | n_hops = meta["n_hops"] 12 | n_nodes = meta["n_nodes"] 13 | test_idxs = meta["test_idxs"] 14 | n_samples = meta["n_samples"] 15 | dim_inp = meta["dim_inp"] 16 | dim_hid = meta["dim_hid"] 17 | dim_out = meta["dim_out"] 18 | 19 | model = create_model(dim_inp, dim_hid, dim_out, n_samples, n_hops) 20 | model.load_state_dict(meta["state_dict"]) 21 | 22 | return model, n_nodes, n_hops, test_idxs 23 | 24 | 25 | def convert_adj_to_da(adj_mat, make_undirected=False): 26 | # Converts adjacency to laplacian matrix 27 | if isinstance(adj_mat, np.ndarray): 28 | adj_mat = torch.from_numpy(adj_mat).float() 29 | if make_undirected: 30 | diag = torch.diag(torch.diag(adj_mat)) 31 | x = adj_mat - diag 32 | adj_mat = x + x.t() + adj_mat 33 | 34 | da_mat = torch.eye(len(adj_mat)).to(adj_mat.device) - adj_mat 35 | return da_mat 36 | 37 | 38 | def load_cora(data_folder, device): 39 | num_nodes = 2708 40 | num_feats = 1433 41 | feat_data = np.zeros((num_nodes, num_feats)) 42 | labels = np.empty((num_nodes, 1), dtype=np.int64) 43 | node_map = {} 44 | label_map = {} 45 | with open(data_folder + "/cora.content") as fp: 46 | for i, line in enumerate(fp): 47 | info = line.strip().split() 48 | feat_data[i, :] = [float(_) for _ in info[1:-1]] 49 | node_map[info[0]] = i 50 | if not info[-1] in label_map: 51 | label_map[info[-1]] = len(label_map) 52 | labels[i] = label_map[info[-1]] 53 | 54 | adj_lists = defaultdict(set) 55 | with open(data_folder + "/cora.cites") as fp: 56 | for i, line in enumerate(fp): 57 | info = line.strip().split() 58 | n1 = node_map[info[0]] 59 | n2 = node_map[info[1]] 60 | adj_lists[n1].add(n2) 61 | adj_lists[n2].add(n1) 62 | 63 | adj_mat = np.zeros((num_nodes, num_nodes)) 64 | for u in adj_lists: 65 | for v in adj_lists[u]: 66 | adj_mat[u, v] = 1 67 | 68 | feat_data = torch.FloatTensor(feat_data).to(device) 69 | adj_mat = torch.FloatTensor(adj_mat).to(device) 70 | return feat_data, adj_mat, labels 71 | 72 | 73 | def get_hops_to_target(target_idx, adj_mat, n_hops): 74 | # Create a map from node to the number of hops from the target test index 75 | node_to_hop = {target_idx: 0} 76 | seen_points = {target_idx} 77 | for j in range(1, n_hops + 2): 78 | adj_cum = copy.deepcopy(adj_mat) 79 | for i in range(j - 1): 80 | adj_cum = torch.matmul(adj_cum, adj_mat) 81 | collect = {i for i, v in enumerate(adj_cum[target_idx]) if v != 0} 82 | ex_collect = collect - seen_points 83 | seen_points |= collect 84 | for e in ex_collect: 85 | node_to_hop[e] = j 86 | 87 | return node_to_hop 88 | -------------------------------------------------------------------------------- /1. 
madex/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | import requests 3 | from PIL import Image 4 | from skimage.segmentation import mark_boundaries 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | from matplotlib.gridspec import GridSpec 9 | matplotlib.rcParams['mathtext.fontset'] = 'cm' 10 | matplotlib.rcParams['font.family'] = 'STIXGeneral' 11 | 12 | 13 | # image pre-processing needed for ResNet 14 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 15 | 16 | preprocess = transforms.Compose( 17 | [ 18 | transforms.Resize((224, 224)), 19 | transforms.ToTensor(), 20 | normalize, 21 | ] 22 | ) 23 | 24 | def get_image_and_labels( 25 | image_path, 26 | device, 27 | labels_url="https://s3.amazonaws.com/outcome-blog/imagenet/labels.json", 28 | ): 29 | """ 30 | Loads image instance and labels 31 | 32 | Args: 33 | image_path: path to image instance 34 | labels_url: url to json labels 35 | 36 | Returns: 37 | image, labels 38 | """ 39 | image = Image.open(image_path) 40 | if image.mode != "RGB": 41 | image = image.convert("RGB") 42 | image_tensor = preprocess(image) 43 | image = ( 44 | image_tensor.cpu().numpy().transpose(1, 2, 0) / image_tensor.abs().max().item() 45 | ) 46 | image_tensor = ( 47 | image_tensor.unsqueeze_(0).to(device) / image_tensor.abs().max().item() 48 | ) 49 | labels = { 50 | int(key): value for (key, value) in requests.get(labels_url).json().items() 51 | } 52 | return image, image_tensor, labels 53 | 54 | 55 | def show_segmented_image(image, segments): 56 | plt.imshow(mark_boundaries(image / 2 + 0.5, segments)) 57 | 58 | 59 | def plot_explanations(img_arrays, figsize = 0.4, spacing = 0.15, savepath=""): 60 | w_spacing = (2/3)*spacing 61 | left = 0 62 | ax_arays = [] 63 | fig = plt.figure() 64 | for img_array in img_arrays: 65 | num_imgs = len(img_array) 66 | right = left + figsize*(num_imgs) + (num_imgs-1)*0.4*w_spacing 67 | ax_arays.append(fig.subplots(1,num_imgs, gridspec_kw=dict(left=left, right=right, wspace=w_spacing))) 68 | left = right + spacing 69 | 70 | for i, ax_array in enumerate(ax_arays): 71 | if hasattr(ax_array, "flat"): 72 | for j, ax in enumerate(ax_array.flat): 73 | img, title = img_arrays[i][j] 74 | ax.imshow(img/2+0.5) 75 | ax.set_title(title, fontsize=55*figsize) 76 | ax.axis("off") 77 | else: 78 | img, title = img_arrays[i][0] 79 | 80 | ax_array.imshow(img/2+0.5) 81 | ax_array.set_title(title, fontsize=55*figsize) 82 | ax_array.axis("off") 83 | 84 | if savepath: 85 | plt.savefig(savepath, bbox_inches="tight") 86 | plt.show() 87 | 88 | 89 | def show_explanations(inter_sets, image, segments, figsize=0.4, spacing=0.15, lime_atts=None, savepath=""): 90 | 91 | def get_interaction_img(inter): 92 | temp = (np.ones(image.shape, image.dtype) -0.5)*1 93 | for n in inter: 94 | temp[segments == n] = image[segments == n].copy() 95 | return temp 96 | 97 | img_arrays = [] 98 | img_arrays.append( [(image, "Original image")] ) 99 | 100 | ## main effects 101 | if lime_atts is not None: 102 | temp = (np.ones(image.shape, image.dtype) -0.5)*1 103 | for n,_ in lime_atts[:5]: 104 | temp[segments == n] = image[segments == n].copy() 105 | img_arrays.append( [(temp, "Main effects")] ) 106 | 107 | inter_img_arrays = [] 108 | for i, inter_set in enumerate(inter_sets): 109 | inter_img_arrays.append( (get_interaction_img(inter_set), "Interaction $\mathcal{I}_" + str(i+1) + "$") ) 110 | 
img_arrays.append(inter_img_arrays) 111 | 112 | plot_explanations(img_arrays, figsize, spacing, savepath) -------------------------------------------------------------------------------- /1. madex/utils/lime/lime_base.py: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Based on the LIME code repo 3 | ################################ 4 | 5 | """ 6 | Contains abstract functionality for learning locally linear sparse model. 7 | """ 8 | from __future__ import print_function 9 | import numpy as np 10 | from sklearn.linear_model import Ridge, lars_path 11 | from sklearn.utils import check_random_state 12 | import copy 13 | from sklearn.metrics import mean_squared_error 14 | 15 | 16 | class LimeBase(object): 17 | """Class for learning a locally linear sparse model from perturbed data""" 18 | 19 | def __init__(self, kernel_fn, verbose=False, random_state=None): 20 | """Init function 21 | 22 | Args: 23 | kernel_fn: function that transforms an array of distances into an 24 | array of proximity values (floats). 25 | verbose: if true, print local prediction values from linear model. 26 | random_state: an integer or numpy.RandomState that will be used to 27 | generate random numbers. If None, the random state will be 28 | initialized using the internal numpy seed. 29 | """ 30 | self.kernel_fn = kernel_fn 31 | self.verbose = verbose 32 | self.random_state = check_random_state(random_state) 33 | 34 | @staticmethod 35 | def generate_lars_path(weighted_data, weighted_labels): 36 | """Generates the lars path for weighted data. 37 | 38 | Args: 39 | weighted_data: data that has been weighted by kernel 40 | weighted_label: labels, weighted by kernel 41 | 42 | Returns: 43 | (alphas, coefs), both are arrays corresponding to the 44 | regularization parameter and coefficients, respectively 45 | """ 46 | x_vector = weighted_data 47 | alphas, _, coefs = lars_path( 48 | x_vector, weighted_labels, method="lasso", verbose=False 49 | ) 50 | return alphas, coefs 51 | 52 | def forward_selection(self, data, labels, weights, num_features): 53 | """Iteratively adds features to the model""" 54 | clf = Ridge(alpha=0, fit_intercept=True, random_state=self.random_state) 55 | used_features = [] 56 | for _ in range(min(num_features, data.shape[1])): 57 | max_ = -100000000 58 | best = 0 59 | for feature in range(data.shape[1]): 60 | if feature in used_features: 61 | continue 62 | clf.fit( 63 | data[:, used_features + [feature]], labels, sample_weight=weights 64 | ) 65 | score = clf.score( 66 | data[:, used_features + [feature]], labels, sample_weight=weights 67 | ) 68 | if score > max_: 69 | best = feature 70 | max_ = score 71 | used_features.append(best) 72 | return np.array(used_features) 73 | 74 | def feature_selection(self, data, labels, weights, num_features, method): 75 | """Selects features for the model. 
see explain_instance_with_data to 76 | understand the parameters.""" 77 | 78 | if method == "none": 79 | return np.array(range(data.shape[1])) 80 | elif method == "forward_selection": 81 | return self.forward_selection(data, labels, weights, num_features) 82 | elif method == "highest_weights": 83 | clf = Ridge(alpha=0.01, fit_intercept=True, random_state=self.random_state) 84 | clf.fit(data, labels, sample_weight=weights) 85 | feature_weights = sorted( 86 | zip(range(data.shape[0]), clf.coef_ * data[0]), 87 | key=lambda x: np.abs(x[1]), 88 | reverse=True, 89 | ) 90 | return np.array([x[0] for x in feature_weights[:num_features]]) 91 | elif method == "lasso_path": 92 | weighted_data = ( 93 | data - np.average(data, axis=0, weights=weights) 94 | ) * np.sqrt(weights[:, np.newaxis]) 95 | weighted_labels = (labels - np.average(labels, weights=weights)) * np.sqrt( 96 | weights 97 | ) 98 | nonzero = range(weighted_data.shape[1]) 99 | _, coefs = self.generate_lars_path(weighted_data, weighted_labels) 100 | for i in range(len(coefs.T) - 1, 0, -1): 101 | nonzero = coefs.T[i].nonzero()[0] 102 | if len(nonzero) <= num_features: 103 | break 104 | used_features = nonzero 105 | return used_features 106 | elif method == "auto": 107 | if num_features <= 6: 108 | n_method = "forward_selection" 109 | else: 110 | n_method = "highest_weights" 111 | return self.feature_selection(data, labels, weights, num_features, n_method) 112 | 113 | def explain_instance_with_data( 114 | self, 115 | neighborhood_data, 116 | neighborhood_labels, 117 | distances, 118 | label, 119 | num_features, 120 | feature_selection="auto", 121 | model_regressor=None, 122 | ): 123 | """Takes perturbed data, labels and distances, returns explanation. 124 | 125 | Args: 126 | neighborhood_data: perturbed data, 2d array. first element is 127 | assumed to be the original data point. 128 | neighborhood_labels: corresponding perturbed labels. should have as 129 | many columns as the number of possible labels. 130 | distances: distances to original data point. 131 | label: label for which we want an explanation 132 | num_features: maximum number of features in explanation 133 | feature_selection: how to select num_features. options are: 134 | 'forward_selection': iteratively add features to the model. 135 | This is costly when num_features is high 136 | 'highest_weights': selects the features that have the highest 137 | product of absolute weight * original data point when 138 | learning with all the features 139 | 'lasso_path': chooses features based on the lasso 140 | regularization path 141 | 'none': uses all features, ignores num_features 142 | 'auto': uses forward_selection if num_features <= 6, and 143 | 'highest_weights' otherwise. 144 | model_regressor: sklearn regressor to use in explanation. 145 | Defaults to Ridge regression if None. Must have 146 | model_regressor.coef_ and 'sample_weight' as a parameter 147 | to model_regressor.fit() 148 | 149 | Returns: 150 | (intercept, exp, score): 151 | intercept is a float. 152 | exp is a sorted list of tuples, where each tuple (x,y) corresponds 153 | to the feature id (x) and the local weight (y). The list is sorted 154 | by decreasing absolute value of y. 
155 | score is the R^2 value of the returned explanation 156 | """ 157 | # data_copy = copy.deepcopy(neighborhood_data) 158 | weights = self.kernel_fn(distances) 159 | 160 | labels_column = neighborhood_labels[:, label] 161 | used_features = self.feature_selection( 162 | neighborhood_data, labels_column, weights, num_features, feature_selection 163 | ) 164 | 165 | if model_regressor is None: 166 | model_regressor = Ridge( 167 | alpha=1, fit_intercept=True, random_state=self.random_state 168 | ) 169 | easy_model = model_regressor 170 | easy_model.fit( 171 | neighborhood_data[:, used_features], labels_column, sample_weight=weights 172 | ) 173 | r_sq = easy_model.score( 174 | neighborhood_data[:, used_features], labels_column, sample_weight=weights 175 | ) 176 | 177 | local_pred = easy_model.predict( 178 | neighborhood_data[0, used_features].reshape(1, -1) 179 | ) 180 | all_pred = easy_model.predict(neighborhood_data[:, used_features]) 181 | 182 | mse = mean_squared_error(labels_column, all_pred, sample_weight=weights) 183 | 184 | if self.verbose: 185 | print("Intercept", easy_model.intercept_) 186 | print("Prediction_local", local_pred) 187 | print("Right:", neighborhood_labels[0, label]) 188 | return ( 189 | sorted( 190 | zip(used_features, easy_model.coef_), 191 | key=lambda x: np.abs(x[1]), 192 | reverse=True, 193 | ), 194 | r_sq, 195 | mse, 196 | local_pred, 197 | neighborhood_data[:, used_features], 198 | labels_column, 199 | all_pred, 200 | weights, 201 | used_features, 202 | easy_model, 203 | ) 204 | -------------------------------------------------------------------------------- /1. madex/utils/lime/lime_text.py: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Based on the LIME code repo 3 | ################################ 4 | 5 | import numpy as np 6 | import re 7 | 8 | 9 | class TextDomainMapper: 10 | """Maps feature ids to words or word-positions""" 11 | 12 | def __init__(self, indexed_string): 13 | """Initializer. 14 | 15 | Args: 16 | indexed_string: lime_text.IndexedString, original string 17 | """ 18 | self.indexed_string = indexed_string 19 | 20 | def map_exp_ids(self, exp, positions=False): 21 | """Maps ids to words or word-position strings. 22 | 23 | Args: 24 | exp: list of tuples [(id, weight), (id,weight)] 25 | positions: if True, also return word positions 26 | 27 | Returns: 28 | list of tuples (word, weight), or (word_positions, weight) if 29 | examples: ('bad', 1) or ('bad_3-6-12', 1) 30 | """ 31 | if positions: 32 | exp = [ 33 | ( 34 | "%s_%s" 35 | % ( 36 | self.indexed_string.word(x[0]), 37 | "-".join(map(str, self.indexed_string.string_position(x[0]))), 38 | ), 39 | x[1], 40 | ) 41 | for x in exp 42 | ] 43 | else: 44 | exp = [(self.indexed_string.word(x[0]), x[1]) for x in exp] 45 | return exp 46 | 47 | def visualize_instance_html( 48 | self, exp, label, div_name, exp_object_name, text=True, opacity=True 49 | ): 50 | """Adds text with highlighted words to visualization. 
51 | 52 | Args: 53 | exp: list of tuples [(id, weight), (id,weight)] 54 | label: label id (integer) 55 | div_name: name of div object to be used for rendering(in js) 56 | exp_object_name: name of js explanation object 57 | text: if False, return empty 58 | opacity: if True, fade colors according to weight 59 | """ 60 | if not text: 61 | return "" 62 | text = ( 63 | self.indexed_string.raw_string() 64 | .encode("utf-8", "xmlcharrefreplace") 65 | .decode("utf-8") 66 | ) 67 | text = re.sub(r"[<>&]", "|", text) 68 | exp = [ 69 | ( 70 | self.indexed_string.word(x[0]), 71 | self.indexed_string.string_position(x[0]), 72 | x[1], 73 | ) 74 | for x in exp 75 | ] 76 | all_occurrences = list( 77 | itertools.chain.from_iterable( 78 | [itertools.product([x[0]], x[1], [x[2]]) for x in exp] 79 | ) 80 | ) 81 | all_occurrences = [(x[0], int(x[1]), x[2]) for x in all_occurrences] 82 | ret = """ 83 | %s.show_raw_text(%s, %d, %s, %s, %s); 84 | """ % ( 85 | exp_object_name, 86 | json.dumps(all_occurrences), 87 | label, 88 | json.dumps(text), 89 | div_name, 90 | json.dumps(opacity), 91 | ) 92 | return ret 93 | 94 | 95 | class IndexedString(object): 96 | """String with various indexes.""" 97 | 98 | def __init__(self, raw_string, split_expression=r"\W+", bow=True): 99 | """Initializer. 100 | 101 | Args: 102 | raw_string: string with raw text in it 103 | split_expression: string will be split by this. 104 | bow: if True, a word is the same everywhere in the text - i.e. we 105 | will index multiple occurrences of the same word. If False, 106 | order matters, so that the same word will have different ids 107 | according to position. 108 | """ 109 | self.raw = raw_string 110 | self.as_list = re.split(r"(%s)|$" % split_expression, self.raw) 111 | self.as_np = np.array(self.as_list) 112 | non_word = re.compile(r"(%s)|$" % split_expression).match 113 | self.string_start = np.hstack( 114 | ([0], np.cumsum([len(x) for x in self.as_np[:-1]])) 115 | ) 116 | vocab = {} 117 | self.inverse_vocab = [] 118 | self.positions = [] 119 | self.bow = bow 120 | non_vocab = set() 121 | for i, word in enumerate(self.as_np): 122 | if word in non_vocab: 123 | continue 124 | if non_word(word): 125 | non_vocab.add(word) 126 | continue 127 | if bow: 128 | if word not in vocab: 129 | vocab[word] = len(vocab) 130 | self.inverse_vocab.append(word) 131 | self.positions.append([]) 132 | idx_word = vocab[word] 133 | self.positions[idx_word].append(i) 134 | else: 135 | self.inverse_vocab.append(word) 136 | self.positions.append(i) 137 | if not bow: 138 | self.positions = np.array(self.positions) 139 | 140 | def raw_string(self): 141 | """Returns the original raw string""" 142 | return self.raw 143 | 144 | def num_words(self): 145 | """Returns the number of tokens in the vocabulary for this document.""" 146 | return len(self.inverse_vocab) 147 | 148 | def word(self, id_): 149 | """Returns the word that corresponds to id_ (int)""" 150 | return self.inverse_vocab[id_] 151 | 152 | def string_position(self, id_): 153 | """Returns a np array with indices to id_ (int) occurrences""" 154 | if self.bow: 155 | return self.string_start[self.positions[id_]] 156 | else: 157 | return self.string_start[[self.positions[id_]]] 158 | 159 | def inverse_removing(self, words_to_remove): 160 | """Returns a string after removing the appropriate words. 161 | 162 | If self.bow is false, replaces word with UNKWORDZ instead of removing 163 | it. 
164 | 165 | Args: 166 | words_to_remove: list of ids (ints) to remove 167 | 168 | Returns: 169 | original raw string with appropriate words removed. 170 | """ 171 | mask = np.ones(self.as_np.shape[0], dtype="bool") 172 | mask[self.__get_idxs(words_to_remove)] = False 173 | if not self.bow: 174 | return "".join( 175 | [ 176 | self.as_list[i] if mask[i] else "UNKWORDZ" 177 | for i in range(mask.shape[0]) 178 | ] 179 | ) 180 | return "".join([self.as_list[v] for v in mask.nonzero()[0]]) 181 | 182 | def __get_idxs(self, words): 183 | """Returns indexes to appropriate words.""" 184 | if self.bow: 185 | return list( 186 | itertools.chain.from_iterable([self.positions[z] for z in words]) 187 | ) 188 | else: 189 | return self.positions[words] 190 | -------------------------------------------------------------------------------- /1. madex/utils/linear_cross_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import copy 4 | from sklearn.linear_model import Ridge, base 5 | from utils.general_utils import * 6 | 7 | 8 | def update_cross_features(Xs_in, interactions): 9 | Xs = copy.deepcopy(Xs_in) 10 | 11 | for k in Xs: 12 | Xk = Xs[k] 13 | new_features = [] 14 | 15 | for inter in interactions: 16 | inter_np = np.array(inter) 17 | new_feature = 1 * np.all(Xk[:, inter_np - 1], axis=1) 18 | new_features.append(new_feature) 19 | 20 | new_dset = np.concatenate([Xk, np.stack(new_features, axis=1)], axis=1) 21 | 22 | Xs[k] = new_dset 23 | return Xs 24 | 25 | 26 | def fit_linear_cross_models( 27 | Xs, 28 | Ys, 29 | interactions, 30 | hierarchy_stepsize=1, 31 | max_steps=1, 32 | hierarchy_patience=0, 33 | stopping=False, 34 | verbose=False, 35 | weight_samples=False, 36 | flat=False, 37 | **kwargs 38 | ): 39 | """ 40 | Trains an MLP and interprets interactions from its weights 41 | 42 | Args: 43 | data_loaders: dict of train, val, and test dataloaders 44 | sample_to_explain: the data instances to get attributions for 45 | interactions: a ranking of interaction sets 46 | hierarchy_stepsize: the stepsize across the ranking 47 | max_steps: the max number of steps on the ranking. 
a max_steps of 1 stops right after getting univariate attributions 48 | hierarchy patience: the patience of when to early stop on the interaction ranking based on validation performance 49 | user_linear: whether to use a linear model rather than a GAM for learning GAM+interactions model 50 | stopping: whether to early stop on the interaction ranking or not' 51 | mode: 'MSE' or 'BCE' for regression or binary classification 52 | experiment: name of experiment 53 | aggregate: aggregates the attributions of overlapping univariates and interaction sets 54 | verbose: set True to get training info 55 | 56 | Returns: 57 | the best GAM models, hierarchical interaction attributions, univariate attributions, prediction pefrformances at each hierarchical step, all trained GAMs 58 | """ 59 | 60 | Wd = get_sample_weights(Xs, enable=weight_samples, **kwargs) 61 | 62 | best_model = None 63 | best_score = None 64 | margin = 0 # initialize to 0 to start, give initial slack for aggregate 65 | patience_counter = 0 66 | active_interaction_list = [] 67 | hierarchical_interaction_attributions = [] 68 | active_interactions = [] 69 | prediction_scores = [] 70 | 71 | break_out = False 72 | 73 | ## Build univariate gam 74 | n_features = Xs["train"].shape[1] # next(iter(data_loaders["train"]))[0].shape[1] 75 | univariates = list(range(n_features)) 76 | 77 | clf = Ridge(alpha=0.01) 78 | clf.fit(Xs["train"], Ys["train"], sample_weight=Wd["train"]) 79 | r_sq = clf.score(Xs["val"], Ys["val"], sample_weight=Wd["val"]) 80 | r_sq_test = clf.score(Xs["test"], Ys["test"], sample_weight=Wd["test"]) 81 | 82 | Xs_base = copy.deepcopy(Xs) 83 | 84 | prediction_score = r_sq 85 | prediction_scores.append(r_sq_test) 86 | 87 | best_score = prediction_score 88 | 89 | univariate_attributions = (univariates, clf.coef_[0]) 90 | 91 | for s in range(1, max_steps): 92 | active_interactions2 = [] 93 | k = hierarchy_stepsize * s 94 | 95 | for v in range(k): 96 | try: 97 | interaction = interactions[v][0] 98 | active_interactions2.append(interaction) 99 | 100 | except: # TODO handle this better later 101 | break_out = True 102 | break 103 | 104 | append, remove_items = True, [] 105 | insertion_idx = len(active_interactions) 106 | for a, ai in enumerate(active_interactions): 107 | if set(interaction) <= set(ai): 108 | append = False 109 | if set(interaction) > set(ai): 110 | remove_items.append(ai) 111 | if insertion_idx == len(active_interactions): 112 | insertion_idx = a 113 | if remove_items: 114 | for r in remove_items: 115 | active_interactions.remove(r) 116 | if append: 117 | active_interactions.insert(insertion_idx, interaction) 118 | 119 | if break_out: 120 | break 121 | 122 | active_interactions_pruned = [np.array(ai) for ai in active_interactions] 123 | active_interactions2 = active_interactions #active_interactions_pruned 124 | 125 | if verbose: 126 | print("\tpruned", active_interactions_pruned) 127 | 128 | if flat: 129 | active_interactions2 = interactions 130 | 131 | Xs_inter = update_cross_features(Xs_base, active_interactions2) 132 | clf = Ridge(alpha=0.01) 133 | clf.fit(Xs_inter["train"], Ys["train"], sample_weight=Wd["train"]) 134 | r_sq = clf.score(Xs_inter["val"], Ys["val"], sample_weight=Wd["val"]) 135 | r_sq_test = clf.score(Xs_inter["test"], Ys["test"], sample_weight=Wd["test"]) 136 | 137 | prediction_score = r_sq 138 | 139 | performance_improvement = prediction_score > best_score 140 | if (not stopping) or ( 141 | stopping 142 | and (performance_improvement or patience_counter < hierarchy_patience) 143 | ): 144 | 
interaction_attributions = [] 145 | for inter_i, inter in enumerate(active_interactions2): 146 | w = clf.coef_[0, inter_i + n_features] 147 | interaction_attributions.append((inter, w)) 148 | hierarchical_interaction_attributions.append(interaction_attributions) 149 | prediction_scores.append(r_sq_test) 150 | 151 | if stopping: 152 | if performance_improvement: 153 | patience_counter = 0 154 | best_score = prediction_score 155 | else: 156 | patience_counter += 1 157 | else: 158 | break 159 | 160 | if flat: 161 | return interaction_attributions, univariate_attributions, prediction_score 162 | 163 | return ( 164 | prediction_scores, 165 | hierarchical_interaction_attributions, 166 | univariate_attributions, 167 | ) 168 | -------------------------------------------------------------------------------- /1. madex/utils/pretrained/dna_cnn.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/pretrained/dna_cnn.pt -------------------------------------------------------------------------------- /1. madex/utils/pretrained/gcn_cora.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/1. madex/utils/pretrained/gcn_cora.pt -------------------------------------------------------------------------------- /1. madex/utils/pretrained/model_gcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.functional import relu 4 | 5 | 6 | class InductiveGCN(nn.Module): 7 | def __init__(self, dim_inp, dim_hid, dim_out, n_samples, n_hops): 8 | super().__init__() 9 | 10 | self.dim_inp = dim_inp 11 | self.dim_hid = dim_hid 12 | self.dim_out = dim_out 13 | self.n_samples = n_samples 14 | 15 | dim_hiddens = [dim_inp] + [dim_hid] * n_hops 16 | self.layers = [ 17 | nn.Linear(dim_hiddens[i], dim_hiddens[i + 1]) 18 | for i in range(len(dim_hiddens) - 1) 19 | ] 20 | self.final_fc = nn.Linear(dim_hiddens[-1], dim_out) 21 | for layer in self.layers + [self.final_fc]: 22 | nn.init.xavier_normal_(layer.weight) 23 | nn.init.zeros_(layer.bias) 24 | self.layers = nn.ModuleList(self.layers) 25 | 26 | def forward(self, x, adj_mat): 27 | """ 28 | 29 | :param x: (n_nodes, dim_inp) 30 | :param adj_mat: (n_nodes, n_nodes) 31 | :return: (n_nodes, dim_out) 32 | """ 33 | for layer in self.layers: 34 | x = torch.matmul(adj_mat, x) 35 | x = relu(layer(x)) 36 | x = torch.matmul(adj_mat, x) 37 | x = self.final_fc(x) 38 | return x 39 | 40 | 41 | def create_model(dim_inp, dim_hid, dim_out, n_samples, n_hops): 42 | return InductiveGCN(dim_inp, dim_hid, dim_out, n_samples, n_hops) 43 | -------------------------------------------------------------------------------- /1. 
madex/utils/text_utils.py: -------------------------------------------------------------------------------- 1 | ############################################################################################# 2 | # Parts of this code adapted from the Transformers repo 3 | # https://github.com/huggingface/transformers/blob/master/src/transformers/pipelines.py 4 | ############################################################################################# 5 | 6 | import torch 7 | import numpy as np 8 | 9 | from transformers import Pipeline 10 | from typing import Dict, List, Optional, Tuple, Union 11 | from transformers.configuration_utils import PretrainedConfig 12 | from transformers.tokenization_utils import PreTrainedTokenizer 13 | from transformers.modelcard import ModelCard 14 | from transformers.tokenization_auto import AutoTokenizer 15 | from transformers.configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig 16 | 17 | from transformers.modeling_auto import ( 18 | AutoModel, 19 | AutoModelForSequenceClassification, 20 | AutoModelForQuestionAnswering, 21 | AutoModelForTokenClassification, 22 | AutoModelWithLMHead, 23 | ) 24 | 25 | def split_words(word_ids, sort=True): 26 | """ 27 | splits words from word_id representation 28 | """ 29 | word_ids2 = [] 30 | for word in word_ids: 31 | w = [x for x in word[0].split("_") if x] 32 | word_ids2.append(("_".join(w[0:-1]), int(w[-1]))) 33 | 34 | if sort: 35 | word_ids2.sort(key=lambda x: x[1]) 36 | return word_ids2 37 | 38 | 39 | def map_words(inter, domain_mapper): 40 | dom_map = domain_mapper.map_exp_ids 41 | word_inter = split_words(dom_map([(i, None) for i in inter], positions=True)) 42 | return tuple(w for w, _ in word_inter) 43 | 44 | 45 | class TextClassificationPipelineMod(Pipeline): 46 | """ 47 | Text classification pipeline using ModelForTextClassification head. 48 | """ 49 | 50 | def __call__(self, *args, **kwargs): 51 | outputs = super().__call__(*args, **kwargs) 52 | return outputs 53 | 54 | 55 | def pipeline( 56 | task: str, 57 | model: Optional = None, 58 | config: Optional[Union[str, PretrainedConfig]] = None, 59 | tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, 60 | modelcard: Optional[Union[str, ModelCard]] = None, 61 | device = torch.device("cpu"), 62 | **kwargs 63 | ) -> Pipeline: 64 | """ 65 | Utility factory method to build a pipeline. 
66 | Pipeline are made of: 67 | A Tokenizer instance in charge of mapping raw textual input to token 68 | A Model instance 69 | Some (optional) post processing for enhancing model's output 70 | Examples: 71 | pipeline('sentiment-analysis') 72 | """ 73 | # Register all the supported task here 74 | SUPPORTED_TASKS = { 75 | "sentiment-analysis": { 76 | "impl": TextClassificationPipelineMod, 77 | "pt": AutoModelForSequenceClassification,# if is_torch_available() else None, 78 | "default": { 79 | "model": { 80 | "pt": "distilbert-base-uncased-finetuned-sst-2-english", 81 | }, 82 | "config": "distilbert-base-uncased-finetuned-sst-2-english", 83 | "tokenizer": "distilbert-base-uncased", 84 | }, 85 | }, 86 | } 87 | 88 | # Retrieve the task 89 | if task not in SUPPORTED_TASKS: 90 | raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) 91 | 92 | framework = "pt"#get_framework(model) 93 | 94 | targeted_task = SUPPORTED_TASKS[task] 95 | task, model_class = targeted_task["impl"], targeted_task[framework] 96 | 97 | # Use default model/config/tokenizer for the task if no model is provided 98 | if model is None: 99 | models, config, tokenizer = tuple(targeted_task["default"].values()) 100 | model = models[framework] 101 | 102 | # Try to infer tokenizer from model or config name (if provided as str) 103 | if tokenizer is None: 104 | if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: 105 | tokenizer = model 106 | elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: 107 | tokenizer = config 108 | else: 109 | # Impossible to guest what is the right tokenizer here 110 | raise Exception( 111 | "Impossible to guess which tokenizer to use. " 112 | "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer." 113 | ) 114 | 115 | # Try to infer modelcard from model or config name (if provided as str) 116 | if modelcard is None: 117 | # Try to fallback on one of the provided string for model or config (will replace the suffix) 118 | if isinstance(model, str): 119 | modelcard = model 120 | elif isinstance(config, str): 121 | modelcard = config 122 | 123 | # Instantiate tokenizer if needed 124 | if isinstance(tokenizer, str): 125 | tokenizer = AutoTokenizer.from_pretrained(tokenizer) 126 | 127 | # Instantiate config if needed 128 | if isinstance(config, str): 129 | config = AutoConfig.from_pretrained(config) 130 | 131 | # Instantiate modelcard if needed 132 | if isinstance(modelcard, str): 133 | modelcard = ModelCard.from_pretrained(modelcard) 134 | 135 | # Instantiate model if needed 136 | if isinstance(model, str): 137 | # Handle transparent TF/PT model conversion 138 | model_kwargs = {} 139 | if framework == "pt" and model.endswith(".h5"): 140 | model_kwargs["from_tf"] = True 141 | logger.warning( 142 | "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. " 143 | "Trying to load the model with PyTorch." 144 | ) 145 | 146 | model = model_class.from_pretrained(model, config=config, **model_kwargs) 147 | model = model.to(device) 148 | model.device = device 149 | return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs) 150 | 151 | 152 | def get_bert_model(device): 153 | model = pipeline("sentiment-analysis", device=device) 154 | model.device = device 155 | return model 156 | -------------------------------------------------------------------------------- /2. 
glider/data/initial_data_prep/avazu/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./data/autoint/Avazu/" 2 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/avazu/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Email of the author: zjduan@pku.edu.cn 3 | """ 4 | 0.id: ad identifier 5 | 1.click: 0/1 for non-click/click 6 | 2.hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC. 7 | 3.C1 -- anonymized categorical variable 8 | 4.banner_pos 9 | 5.site_id 10 | 6.site_domain 11 | 7.site_category 12 | 8.app_id 13 | 9.app_domain 14 | 10.app_category 15 | 11.device_id 16 | 12.device_ip 17 | 13.device_model 18 | 14.device_type 19 | 15.device_conn_type 20 | 16.C14 21 | 17.C15 22 | 18.C16 23 | 19.C17 24 | 20.C18 25 | 21.C19 26 | 22.C20 27 | 23.C21 28 | """ 29 | import pandas as pd 30 | import config 31 | import math 32 | 33 | train_path = config.DATA_PATH + "train.csv" 34 | f1 = open(train_path, "r") 35 | dic = {} 36 | f_train_value = open(config.DATA_PATH + "train_x.txt", "w") 37 | f_train_index = open(config.DATA_PATH + "train_i.txt", "w") 38 | f_train_label = open(config.DATA_PATH + "train_y.txt", "w") 39 | debug = False 40 | tune = False 41 | Bound = [5] * 24 42 | 43 | label_index = 1 44 | Column = 24 45 | 46 | numr_feat = [] 47 | numerical = [0] * Column 48 | numerical[label_index] = -1 49 | 50 | cate_feat = [] 51 | for i in range(Column): 52 | if numerical[i] == 0: 53 | cate_feat.extend([i]) 54 | 55 | index_cnt = 0 56 | index_others = [0] * Column 57 | Max = [0] * Column 58 | 59 | 60 | for i in numr_feat: 61 | index_others[i] = index_cnt 62 | index_cnt += 1 63 | numerical[i] = 1 64 | for i in cate_feat: 65 | index_others[i] = index_cnt 66 | index_cnt += 1 67 | 68 | for i in range(Column): 69 | dic[i] = dict() 70 | 71 | cnt_line = 0 72 | for line in f1: 73 | cnt_line += 1 74 | if cnt_line == 1: 75 | continue # header 76 | if cnt_line % 1000000 == 0: 77 | print("cnt_line = %d, index_cnt = %d" % (cnt_line, index_cnt)) 78 | if debug == True: 79 | if cnt_line >= 10000: 80 | break 81 | split = line.strip("\n").split(",") 82 | for i in cate_feat: 83 | if split[i] != "": 84 | if split[i] not in dic[i]: 85 | dic[i][split[i]] = [index_others[i], 0] 86 | dic[i][split[i]][1] += 1 87 | if ( 88 | dic[i][split[i]][0] == index_others[i] 89 | and dic[i][split[i]][1] == Bound[i] 90 | ): 91 | dic[i][split[i]][0] = index_cnt 92 | index_cnt += 1 93 | 94 | if tune == False: 95 | label = split[label_index] 96 | if label != "0": 97 | label = "1" 98 | index = [0] * (Column - 1) 99 | value = ["0"] * (Column - 1) 100 | for i in range(Column): 101 | cur = i 102 | if i == label_index: 103 | continue 104 | if i > label_index: 105 | cur = i - 1 106 | if numerical[i] == 1: 107 | index[cur] = index_others[i] 108 | if split[i] != "": 109 | value[cur] = split[i] 110 | # Max[i] = max(int(split[i]), Max[i]) 111 | else: 112 | if split[i] != "": 113 | index[cur] = dic[i][split[i]][0] 114 | value[cur] = "1" 115 | 116 | if split[i] == "": 117 | value[cur] = "0" 118 | 119 | f_train_index.write(" ".join(str(i) for i in index) + "\n") 120 | f_train_value.write(" ".join(value) + "\n") 121 | f_train_label.write(label + "\n") 122 | 123 | f1.close() 124 | f_train_index.close() 125 | f_train_value.close() 126 | f_train_label.close() 127 | print("Finished!") 128 | print("index_cnt = %d" % index_cnt) 129 | # print ("max number 
for numerical features:") 130 | # for i in numr_feat: 131 | # print ("no.:%d max: %d" % (i, Max[i])) 132 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/criteo/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./data/autoint/Criteo/" 2 | SOURCE_DATA = "./data/autoint/Criteo/train.txt" 3 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/criteo/preprocess.py: -------------------------------------------------------------------------------- 1 | import config 2 | 3 | train_path = config.SOURCE_DATA 4 | f1 = open(train_path, "r") 5 | dic = {} 6 | # generate three fold. 7 | # train_x: value 8 | # train_i: index 9 | # train_y: label 10 | f_train_value = open(config.DATA_PATH + "train_x.txt", "w") 11 | f_train_index = open(config.DATA_PATH + "train_i.txt", "w") 12 | f_train_label = open(config.DATA_PATH + "train_y.txt", "w") 13 | 14 | for i in range(39): 15 | dic[i] = {} 16 | 17 | cnt_train = 0 18 | 19 | # for debug 20 | # limits = 10000 21 | index = [1] * 26 22 | for line in f1: 23 | cnt_train += 1 24 | if cnt_train % 100000 == 0: 25 | print("now train cnt : %d\n" % cnt_train) 26 | # if cnt_train > limits: 27 | # break 28 | split = line.strip("\n").split("\t") 29 | # 0-label, 1-13 numerical, 14-39 category 30 | for i in range(13, 39): 31 | # dic_len = len(dic[i]) 32 | if split[i + 1] not in dic[i]: 33 | # [1, 0] 1 is the index for those whose appear times <= 10 0 indicates the appear times 34 | dic[i][split[i + 1]] = [1, 0] 35 | dic[i][split[i + 1]][1] += 1 36 | if dic[i][split[i + 1]][0] == 1 and dic[i][split[i + 1]][1] > 10: 37 | index[i - 13] += 1 38 | dic[i][split[i + 1]][0] = index[i - 13] 39 | f1.close() 40 | print("total entries :%d\n" % (cnt_train - 1)) 41 | 42 | # calculate number of category features of every dimension 43 | kinds = [13] 44 | for i in range(13, 39): 45 | kinds.append(index[i - 13]) 46 | print("number of dimensions : %d" % (len(kinds) - 1)) 47 | print(kinds) 48 | 49 | for i in range(1, len(kinds)): 50 | kinds[i] += kinds[i - 1] 51 | print(kinds) 52 | 53 | # make new data 54 | 55 | f1 = open(train_path, "r") 56 | cnt_train = 0 57 | print("remake training data...\n") 58 | for line in f1: 59 | cnt_train += 1 60 | if cnt_train % 100000 == 0: 61 | print("now train cnt : %d\n" % cnt_train) 62 | # if cnt_train > limits: 63 | # break 64 | entry = ["0"] * 39 65 | index = [None] * 39 66 | split = line.strip("\n").split("\t") 67 | label = str(split[0]) 68 | for i in range(13): 69 | if split[i + 1] != "": 70 | entry[i] = split[i + 1] 71 | index[i] = i + 1 72 | for i in range(13, 39): 73 | if split[i + 1] != "": 74 | entry[i] = "1" 75 | index[i] = dic[i][split[i + 1]][0] 76 | for j in range(26): 77 | index[13 + j] += kinds[j] 78 | index = [str(item) for item in index] 79 | f_train_value.write(" ".join(entry) + "\n") 80 | f_train_index.write(" ".join(index) + "\n") 81 | f_train_label.write(label + "\n") 82 | f1.close() 83 | 84 | 85 | f_train_value.close() 86 | f_train_index.close() 87 | f_train_label.close() 88 | -------------------------------------------------------------------------------- /2. 
glider/data/initial_data_prep/criteo/scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | import config 3 | import numpy as np 4 | 5 | 6 | def scale(x): 7 | if x > 2: 8 | x = int(math.log(float(x)) ** 2) 9 | return x 10 | 11 | 12 | def scale_each_fold(): 13 | for i in range(1, 11): 14 | print("now part %d" % i) 15 | data = np.load(config.DATA_PATH + "part" + str(i) + "/train_x.npy") 16 | part = data[:, 0:13] 17 | for j in range(part.shape[0]): 18 | if j % 100000 == 0: 19 | print(j) 20 | part[j] = list(map(scale, part[j])) 21 | np.save(config.DATA_PATH + "part" + str(i) + "/train_x2.npy", data) 22 | 23 | 24 | if __name__ == "__main__": 25 | scale_each_fold() 26 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kdd2012/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./KDD2012/" 2 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kdd2012/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Email of the author: zjduan@pku.edu.cn 3 | """ 4 | 0. Click: 5 | 1. Impression(numerical) 6 | 2. DisplayURL: (categorical) 7 | 3. AdID:(categorical) 8 | 4. AdvertiserID:(categorical) 9 | 5. Depth:(numerical) 10 | 6. Position:(numerical) 11 | 7. QueryID: (categorical) the key of the data file 'queryid_tokensid.txt'. 12 | 8. KeywordID: (categorical)the key of 'purchasedkeyword_tokensid.txt'. 13 | 9. TitleID: (categorical)the key of 'titleid_tokensid.txt'. 14 | 10. DescriptionID: (categorical)the key of 'descriptionid_tokensid.txt'. 15 | 11. UserID: (categorical)the key of 'userid_profile.txt' 16 | 12. User's Gender: (categorical) 17 | 13. 
User's Age: (categorical) 18 | """ 19 | import math 20 | 21 | train_path = "./KDD2012/training.txt" 22 | f1 = open(train_path, "r") 23 | f2 = open("./KDD2012/userid_profile.txt", "r") 24 | dic = {} 25 | f_train_value = open("./KDD2012/train_x.txt", "w") 26 | f_train_index = open("./KDD2012/train_i.txt", "w") 27 | f_train_label = open("./KDD2012/train_y.txt", "w") 28 | debug = False 29 | tune = False 30 | Column = 12 31 | Field = 13 32 | 33 | numr_feat = [1, 5, 6] 34 | numerical = [0] * Column 35 | cate_feat = [2, 3, 4, 7, 8, 9, 10, 11] 36 | index_cnt = 0 37 | index_others = [0] * (Field + 1) 38 | Max = [0] * 12 39 | numerical[0] = -1 40 | for i in numr_feat: 41 | index_others[i] = index_cnt 42 | index_cnt += 1 43 | numerical[i] = 1 44 | for i in cate_feat: 45 | index_others[i] = index_cnt 46 | index_cnt += 1 47 | 48 | for i in range(Field + 1): 49 | dic[i] = dict() 50 | 51 | ###init user_dic 52 | user_dic = dict() 53 | 54 | cnt_line = 0 55 | for line in f2: 56 | cnt_line += 1 57 | if cnt_line % 1000000 == 0: 58 | print("cnt_line = %d, index_cnt = %d" % (cnt_line, index_cnt)) 59 | # if (debug == True): 60 | # if (cnt_line >= 10000): 61 | # break 62 | split = line.strip("\n").split("\t") 63 | user_dic[split[0]] = [split[1], split[2]] 64 | if split[1] not in dic[12]: 65 | dic[12][split[1]] = [index_cnt, 0] 66 | index_cnt += 1 67 | if split[2] not in dic[13]: 68 | dic[13][split[2]] = [index_cnt, 0] 69 | index_cnt += 1 70 | 71 | cnt_line = 0 72 | for line in f1: 73 | cnt_line += 1 74 | if cnt_line % 1000000 == 0: 75 | print("cnt_line = %d, index_cnt = %d" % (cnt_line, index_cnt)) 76 | if debug == True: 77 | if cnt_line >= 10000: 78 | break 79 | split = line.strip("\n").split("\t") 80 | for i in cate_feat: 81 | if split[i] != "": 82 | if split[i] not in dic[i]: 83 | dic[i][split[i]] = [index_others[i], 0] 84 | dic[i][split[i]][1] += 1 85 | if dic[i][split[i]][0] == index_others[i] and dic[i][split[i]][1] == 10: 86 | dic[i][split[i]][0] = index_cnt 87 | index_cnt += 1 88 | 89 | if tune == False: 90 | label = split[0] 91 | if label != "0": 92 | label = "1" 93 | index = [0] * Field 94 | value = ["0"] * Field 95 | for i in range(1, 12): 96 | if numerical[i] == 1: 97 | index[i - 1] = index_others[i] 98 | if split[i] != "": 99 | value[i - 1] = split[i] 100 | Max[i] = max(int(split[i]), Max[i]) 101 | else: 102 | if split[i] != "": 103 | index[i - 1] = dic[i][split[i]][0] 104 | value[i - 1] = "1" 105 | 106 | if split[i] == "": 107 | value[i - 1] = "0" 108 | if i == 11 and split[i] == "0": 109 | value[i - 1] = "0" 110 | ### gender and age 111 | if split[11] == "" or (split[11] not in user_dic): 112 | index[12 - 1] = index_others[12] 113 | value[12 - 1] = "0" 114 | index[13 - 1] = index_others[13] 115 | value[13 - 1] = "0" 116 | else: 117 | index[12 - 1] = dic[12][user_dic[split[11]][0]][0] 118 | value[12 - 1] = "1" 119 | index[13 - 1] = dic[13][user_dic[split[11]][1]][0] 120 | value[13 - 1] = "1" 121 | 122 | f_train_index.write(" ".join(str(i) for i in index) + "\n") 123 | f_train_value.write(" ".join(value) + "\n") 124 | f_train_label.write(label + "\n") 125 | 126 | f1.close() 127 | f_train_index.close() 128 | f_train_value.close() 129 | f_train_label.close() 130 | print("Finished!") 131 | print("index_cnt = %d" % index_cnt) 132 | print("max number for numerical features:") 133 | for i in numr_feat: 134 | print("no.:%d max: %d" % (i, Max[i])) 135 | -------------------------------------------------------------------------------- /2. 
glider/data/initial_data_prep/kdd2012/scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | import config 3 | import numpy as np 4 | 5 | 6 | def scale(x): 7 | if x > 2: 8 | x = int(math.log(float(x)) ** 2) 9 | return x 10 | 11 | 12 | def scale_each_fold(): 13 | for i in range(1, 11): 14 | print("now part %d" % i) 15 | data = np.load(config.DATA_PATH + "part" + str(i) + "/train_x.npy") 16 | part = data[:, 0:13] 17 | for j in range(part.shape[0]): 18 | if j % 100000 == 0: 19 | print(j) 20 | part[j] = list(map(scale, part[j])) 21 | np.save(config.DATA_PATH + "part" + str(i) + "/train_x2.npy", data) 22 | 23 | 24 | if __name__ == "__main__": 25 | scale_each_fold() 26 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kfold_split/config.py: -------------------------------------------------------------------------------- 1 | DATA_PATH = "./data/autoint/Criteo/" 2 | TRAIN_I = DATA_PATH + "train_i.txt" 3 | TRAIN_X = DATA_PATH + "train_x.txt" 4 | TRAIN_Y = DATA_PATH + "train_y.txt" 5 | 6 | NUM_SPLITS = 10 7 | RANDOM_SEED = 2018 8 | -------------------------------------------------------------------------------- /2. glider/data/initial_data_prep/kfold_split/stratifiedKfold.py: -------------------------------------------------------------------------------- 1 | # Email of the author: zjduan@pku.edu.cn 2 | import numpy as np 3 | import config 4 | import os 5 | import pandas as pd 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn import preprocessing 8 | 9 | scale = "" 10 | train_x_name = "train_x.npy" 11 | train_y_name = "train_y.npy" 12 | 13 | # numr_feat = [] 14 | Column = 13 15 | 16 | 17 | def _load_data(_nrows=None, debug=False): 18 | 19 | train_x = pd.read_csv( 20 | config.TRAIN_X, header=None, sep=" ", nrows=_nrows, dtype=np.float 21 | ) 22 | train_y = pd.read_csv( 23 | config.TRAIN_Y, header=None, sep=" ", nrows=_nrows, dtype=np.int32 24 | ) 25 | 26 | # for i in range(11): 27 | # print ("argmax feat %d = %d, max = %d" % (i, train_x[i].argmax(), train_x[i].max())) 28 | 29 | train_x = train_x.values 30 | train_y = train_y.values.reshape([-1]) 31 | 32 | # print ("begin to scale") 33 | # if (scale == "minmax"): 34 | # train_x = preprocessing.MinMaxScaler().fit_transform(train_x) 35 | 36 | # if (scale == "std"): 37 | # train_x[:,0:12] = preprocessing.scale(train_x[:,0:12]) 38 | # train_x[:,0:12] += 1 39 | 40 | print("data loading done!") 41 | print("training data : %d" % train_y.shape[0]) 42 | 43 | assert train_x.shape[0] == train_y.shape[0] 44 | 45 | return train_x, train_y 46 | 47 | 48 | def save_x_y(fold_index, train_x, train_y): 49 | _get = lambda x, l: [x[i] for i in l] 50 | for i in range(len(fold_index)): 51 | print("now part %d" % (i + 1)) 52 | part_index = fold_index[i] 53 | Xv_train_, y_train_ = _get(train_x, part_index), _get(train_y, part_index) 54 | save_dir_Xv = config.DATA_PATH + "part" + str(i + 1) + "/" 55 | save_dir_y = config.DATA_PATH + "part" + str(i + 1) + "/" 56 | if os.path.exists(save_dir_Xv) == False: 57 | os.makedirs(save_dir_Xv) 58 | if os.path.exists(save_dir_y) == False: 59 | os.makedirs(save_dir_y) 60 | save_path_Xv = save_dir_Xv + train_x_name 61 | save_path_y = save_dir_y + train_y_name 62 | np.save(save_path_Xv, Xv_train_) 63 | np.save(save_path_y, y_train_) 64 | 65 | 66 | # def save_test(test_x, test_y): 67 | # np.save("../data/test/test_x.npy", test_x) 68 | # np.save("../data/test/test_y.npy", test_y) 69 | 70 | 71 | def 
save_i(fold_index): 72 | _get = lambda x, l: [x[i] for i in l] 73 | train_i = pd.read_csv( 74 | config.TRAIN_I, header=None, sep=" ", nrows=None, dtype=np.int32 75 | ) 76 | train_i = train_i.values 77 | feature_size = train_i.max() + 1 78 | print("feature_size = %d" % feature_size) 79 | feature_size = [feature_size] 80 | feature_size = np.array(feature_size) 81 | np.save(config.DATA_PATH + "feature_size.npy", feature_size) 82 | 83 | # pivot = 40000000 84 | 85 | # test_i = train_i[pivot:] 86 | # train_i = train_i[:pivot] 87 | 88 | # print("test_i size: %d" % len(test_i)) 89 | print("train_i size: %d" % len(train_i)) 90 | 91 | # np.save("../data/test/test_i.npy", test_i) 92 | 93 | for i in range(len(fold_index)): 94 | print("now part %d" % (i + 1)) 95 | part_index = fold_index[i] 96 | Xi_train_ = _get(train_i, part_index) 97 | save_path_Xi = config.DATA_PATH + "part" + str(i + 1) + "/train_i.npy" 98 | np.save(save_path_Xi, Xi_train_) 99 | 100 | 101 | def main(): 102 | 103 | train_x, train_y = _load_data() 104 | print("loading data done!") 105 | 106 | folds = list( 107 | StratifiedKFold( 108 | n_splits=10, shuffle=True, random_state=config.RANDOM_SEED 109 | ).split(train_x, train_y) 110 | ) 111 | 112 | fold_index = [] 113 | for i, (train_id, valid_id) in enumerate(folds): 114 | fold_index.append(valid_id) 115 | 116 | print("fold num: %d" % (len(fold_index))) 117 | 118 | fold_index = np.array(fold_index) 119 | np.save(config.DATA_PATH + "fold_index.npy", fold_index) 120 | 121 | save_x_y(fold_index, train_x, train_y) 122 | print("save train_x_y done!") 123 | 124 | fold_index = np.load(config.DATA_PATH + "fold_index.npy") 125 | save_i(fold_index) 126 | print("save index done!") 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /2. glider/detect_global_interactions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../1. 
madex") 4 | from sampling_and_inference import generate_perturbation_dataset_autoint 5 | from neural_interaction_detection import detect_interactions 6 | import os 7 | import logging 8 | from tqdm import tqdm 9 | import warnings 10 | import pickle 11 | import numpy as np 12 | import argparse 13 | import torch 14 | import torch.optim as optim 15 | from utils.global_interaction_utils import * 16 | import torch.multiprocessing as multiprocessing 17 | 18 | 19 | warnings.simplefilter("ignore") 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--global_size", type=int, default=1000) 23 | parser.add_argument("--num_perturbation", type=int, default=6000) 24 | parser.add_argument( 25 | "--save_path", 26 | type=str, 27 | default="/meladyfs/newyork/mtsang/AutoInt/test_code/Criteo/b3h2_dnn_dropkeep1_400x2_5trials_v2/1/", 28 | ) 29 | parser.add_argument("--data", type=str, help="data name", default="criteo") 30 | parser.add_argument("--save_id", type=str, help="save id", default="testpar2") 31 | parser.add_argument( 32 | "--data_path", type=str, help="root path for all the data", default="data/autoint" 33 | ) 34 | parser.add_argument("--epochs", type=int, help="num epochs", default=100) 35 | parser.add_argument("--es", type=int, help="enable early stopping", default=1) 36 | parser.add_argument("--l1", type=float, help="set l1 reg constant", default=1e-4) 37 | parser.add_argument("--lr", type=float, help="learning rate", default=0.01) 38 | parser.add_argument("--opt", type=str, help="optimizer", default="adam") 39 | parser.add_argument( 40 | "--par_batch_size", 41 | type=int, 42 | help="size of parallel batch (same as num parallel processes)", 43 | default=32, 44 | ) 45 | parser.add_argument("--add_linear", type=int, help="contain main effects in interaction detector via linear regression", default=0) 46 | parser.add_argument("--detector", type=str, help="detector: NID or GradientNID", default="NID") 47 | parser.add_argument("--gpu", type=int, help="gpu number", default=0) 48 | 49 | args = parser.parse_args() 50 | par_batch_size = args.par_batch_size 51 | if args.opt == "adagrad": 52 | opt = optim.Adagrad 53 | elif args.opt == "adam": 54 | opt = optim.Adam 55 | else: 56 | raise ValueError("invalid optimizer") 57 | 58 | # device = torch.device("cuda:" + str(args.gpu)) 59 | 60 | 61 | def par_experiment(idx, perturbations): 62 | feats = perturbations["feats"] 63 | labels = perturbations["targets"] 64 | 65 | # distributes processes across two gpus 66 | device = torch.device("cuda:" + str(idx%2)) 67 | 68 | try: 69 | inters, mlp_loss = detect_interactions( 70 | feats, 71 | labels, 72 | arch=[256, 128, 64], 73 | nepochs=args.epochs, 74 | early_stopping=args.es, 75 | patience=5, 76 | l1_const=args.l1, 77 | learning_rate=args.lr, 78 | opt_func=opt, 79 | add_linear=args.add_linear, 80 | detector=args.detector, 81 | seed=42, 82 | verbose=False, 83 | device=device, 84 | ) 85 | print("mlp loss", mlp_loss) 86 | result = {"inters": inters, "mlp_loss": mlp_loss} 87 | except: 88 | print("error in learning mlp for interaction detection") 89 | result = None 90 | 91 | return idx, result 92 | 93 | 94 | def run(): 95 | multiprocessing.set_start_method("spawn", force=True) 96 | 97 | # this data is shuffled. 
other datasets must be shuffled for global interaction detection 98 | model, data = get_autoint_and_data( 99 | data_path=args.data_path, dataset=args.data, save_path=args.save_path 100 | ) 101 | 102 | dense_feat_indices = [] 103 | sparse_feat_indices = [] 104 | for i in tqdm(range(data["Xi"].shape[1])): 105 | uniq = np.unique(data["Xi"][:, i]) 106 | if len(uniq) == 1 and not args.data == "avazu": 107 | dense_feat_indices.append(i) 108 | else: 109 | sparse_feat_indices.append(i) 110 | 111 | print("dense feature indices", dense_feat_indices) 112 | 113 | save_postfix = "_" + args.save_id if args.save_id else "" 114 | 115 | base_path = "experiments/detected_interactions/" 116 | pkl_path = ( 117 | base_path 118 | + "detected_interactions_" 119 | + args.data.lower() 120 | + save_postfix 121 | + ".pickle" 122 | ) 123 | if os.path.exists(pkl_path): 124 | with open(pkl_path, "rb") as handle: 125 | interaction_results = pickle.load(handle) 126 | print("loaded existing results. starting from index", len(interaction_results)) 127 | else: 128 | if not os.path.exists(base_path): 129 | os.makedirs(base_path) 130 | interaction_results = [] 131 | 132 | indexes = list(range(len(interaction_results), args.global_size)) 133 | num_par_batches = int(np.ceil(len(indexes) / par_batch_size)) 134 | 135 | for b in tqdm(range(num_par_batches)): 136 | index_batch = indexes[b * par_batch_size : (b + 1) * par_batch_size] 137 | perturbation_batch = [] 138 | for idx in index_batch: 139 | 140 | data_inst = { 141 | "Xi": data["Xi"][idx], 142 | "Xv": data["Xv"][idx], 143 | "means": data["means"], 144 | } 145 | feats, targets = generate_perturbation_dataset_autoint( 146 | data_inst, 147 | model, 148 | dense_feat_indices, 149 | sparse_feat_indices, 150 | num_samples=args.num_perturbation, 151 | valid_size=500, 152 | test_size=500, 153 | seed=idx, 154 | ) 155 | perturbation_batch.append({"feats": feats, "targets": targets}) 156 | 157 | with multiprocessing.Pool(processes=par_batch_size) as pool: 158 | results_batch = pool.starmap( 159 | par_experiment, zip(index_batch, perturbation_batch) 160 | ) 161 | 162 | results_batch.sort(key=lambda x: x[0]) 163 | 164 | for _, result in results_batch: 165 | interaction_results.append(result) 166 | 167 | with open(pkl_path, "wb") as handle: 168 | pickle.dump(interaction_results, handle) 169 | 170 | 171 | if __name__ == "__main__": 172 | run() 173 | -------------------------------------------------------------------------------- /2. 
glider/make_cross_feature_data.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from tqdm import tqdm 3 | import numpy as np 4 | from sklearn.preprocessing import LabelEncoder 5 | import warnings 6 | import os 7 | import pandas as pd 8 | import argparse 9 | 10 | warnings.simplefilter("ignore") 11 | 12 | from utils.cross_feature_utils import * 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | "--data_file", 18 | type=str, 19 | help="the path where global interaction results are saved", 20 | default="experiments/detected_interactions_criteo_repr2.pickle", 21 | ) 22 | parser.add_argument("--exp", type=str, help="an experiment id", default="cross2_K20") 23 | parser.add_argument( 24 | "--K", type=int, help="the top-K threshold for global interactions", default=20 25 | ) 26 | parser.add_argument("--data", type=str, help="data name", default="criteo") 27 | parser.add_argument( 28 | "--autoint_save_path", 29 | type=str, 30 | help="folder where cross features for autoint are saved", 31 | default="data/autoint/criteo", 32 | ) 33 | parser.add_argument( 34 | "--deepctr_save_path", 35 | type=str, 36 | help="folder where cross features for deepctr are saved", 37 | default="data/deepctr/criteo", 38 | ) 39 | parser.add_argument( 40 | "--bs", type=int, help="batch size of training data", default=1000000 41 | ) 42 | parser.add_argument("--nbins", type=int, help="num bins", default=100) 43 | parser.add_argument( 44 | "--nprocs", type=int, help="number of parallel processes", default=20 45 | ) 46 | parser.add_argument( 47 | "--thresh", 48 | type=float, 49 | help="min pct of training batch to require cross feature ids to appear", 50 | default=0.0001, 51 | ) 52 | parser.add_argument( 53 | "--top_k", 54 | type=int, 55 | help="k threshold for madex interactions", 56 | default=100, 57 | ) 58 | parser.add_argument( 59 | "--save_base_data", 60 | type=str2bool, 61 | help="y/n: save (baseline) data without cross features for deepctr", 62 | nargs="?", 63 | const=True, 64 | default=True, 65 | ) 66 | parser.add_argument( 67 | "--prune", 68 | type=str2bool, 69 | help="prune interaction subsets", 70 | nargs="?", 71 | const=True, 72 | default=True, 73 | ) 74 | 75 | args = parser.parse_args() 76 | 77 | interactions_file = args.data_file 78 | experiment = args.exp 79 | max_rank = args.K 80 | dataset = (args.data).lower() 81 | data_path_autoint = args.autoint_save_path 82 | data_path_deepctr = args.deepctr_save_path 83 | 84 | training_batch_size = args.bs 85 | num_bins = args.nbins 86 | num_processes = args.nprocs 87 | threshold_pct = args.thresh 88 | deepctr_save_baseline_data = args.save_base_data 89 | prune_interaction_subsets = args.prune 90 | top_k = args.top_k 91 | 92 | 93 | def make_cross_feature_data( 94 | interactions_file, 95 | max_rank, 96 | dataset, 97 | training_batch_size, 98 | data_path, 99 | num_bins, 100 | threshold, 101 | top_k, 102 | prune_subsets, 103 | num_processes, 104 | ): 105 | 106 | print("loading autoint data") 107 | data = load_data_autoint(dataset, data_path) 108 | Xi, Xv, y, lens = merge_data(data) 109 | Xi_batch, Xv_batch, y_batch = get_training_batch(data, size=training_batch_size) 110 | dense_feat_indices, sparse_feat_indices = get_dense_sparse_feat_indices( 111 | Xi_batch, dataset 112 | ) 113 | 114 | # print("dense feature indices", dense_feat_indices) 115 | 116 | if dataset == "avazu": 117 | num_sparse, num_dense = 23, 0 118 | elif dataset == "criteo": 119 | num_sparse, num_dense = 26, 13 120 | else: 121 
| raise ValueError("Invalid dataset") 122 | 123 | assert num_dense == len(dense_feat_indices) 124 | assert num_sparse == len(sparse_feat_indices) 125 | 126 | num_feats = len(sparse_feat_indices) + len(dense_feat_indices) 127 | 128 | print("loading interactions") 129 | inters = load_global_interactions( 130 | interactions_file, num_feats, max_rank, prune_subsets, top_k 131 | ) 132 | 133 | print("discretizing dense features") 134 | sparsified_data = discretize_dense_features( 135 | Xi, 136 | Xv, 137 | Xv_batch, 138 | dense_feat_indices, 139 | sparse_feat_indices, 140 | num_feats, 141 | num_bins, 142 | num_processes=num_processes, 143 | ) 144 | 145 | train_start = sum(lens[0:2]) 146 | sparsified_batch = sparsified_data[train_start : train_start + training_batch_size] 147 | 148 | print("crossing sparse features") 149 | cross_feats = cross_sparse_features( 150 | inters, 151 | sparsified_data, 152 | sparsified_batch, 153 | Xi_batch, 154 | threshold, 155 | num_processes=num_processes, 156 | ) 157 | Xi, Xv, Xi_cross, Xv_cross = get_X_cross( 158 | inters, cross_feats, Xi, Xv, sparse_feat_indices 159 | ) 160 | 161 | return ( 162 | Xi, 163 | Xv, 164 | y, 165 | Xi_cross, 166 | Xv_cross, 167 | lens, 168 | max_rank, 169 | num_feats, 170 | dense_feat_indices, 171 | sparse_feat_indices, 172 | ) 173 | 174 | 175 | def save_cross_feats_autoint(Xi_cross, Xv_cross, lens, experiment, data_path): 176 | print("saving data for autoint") 177 | 178 | cross_name = ["i_cross.npy", "x_cross.npy"] 179 | 180 | prev_len = 0 181 | for i in tqdm(range(1, 11)): 182 | cur_len = prev_len + lens[i - 1] 183 | Xi_seg = Xi_cross[prev_len:cur_len] 184 | Xv_seg = Xv_cross[prev_len:cur_len] 185 | 186 | folder_path = data_path + "/part" + str(i) + "/" + experiment 187 | if not os.path.exists(folder_path): 188 | os.makedirs(folder_path) 189 | 190 | np.save(folder_path + "/" + cross_name[0], Xi_seg) 191 | np.save(folder_path + "/" + cross_name[1], Xv_seg) 192 | 193 | prev_len = cur_len 194 | 195 | feature_size = int(Xi_cross.max() + 1) 196 | # print("feature_size = %d" % feature_size) 197 | 198 | folder_path2 = data_path + "/" + experiment 199 | 200 | if not os.path.exists(folder_path2): 201 | os.makedirs(folder_path2) 202 | np.save(folder_path2 + "/feature_size.npy", np.array([feature_size])) 203 | 204 | 205 | def save_cross_feats_deepctr( 206 | Xi, 207 | Xv, 208 | y, 209 | Xi_cross, 210 | Xv_cross, 211 | dense_feat_indices, 212 | sparse_feat_indices, 213 | lens, 214 | experiment, 215 | data_path, 216 | deepctr_save_baseline_data, 217 | ): 218 | print("saving data for deepctr") 219 | num_dense = len(dense_feat_indices) 220 | num_sparse = len(sparse_feat_indices) 221 | 222 | sparse_features = ["C" + str(i) for i in range(1, num_sparse + 1)] 223 | dense_features = ["I" + str(i) for i in range(1, num_dense + 1)] 224 | cross_features = ["G" + str(i) for i in range(1, max_rank + 1)] 225 | target = ["label"] 226 | 227 | settings = ["cross"] 228 | if deepctr_save_baseline_data: 229 | settings.append("baseline") 230 | 231 | n_unique_dict = dict() 232 | 233 | for setting in settings: 234 | 235 | if setting == "baseline": 236 | sparse_indices = sparse_feat_indices 237 | Xi_sparse_na = np.where( 238 | Xv[:, sparse_indices] == 1, Xi[:, sparse_indices], -1 239 | ) 240 | data_np = np.concatenate( 241 | [np.expand_dims(y, axis=1), Xv[:, dense_feat_indices], Xi_sparse_na], 242 | axis=1, 243 | ) 244 | df = pd.DataFrame( 245 | data_np, columns=target + dense_features + sparse_features 246 | ) 247 | temp_feats = sparse_features 248 | save_path = 
data_path 249 | postfix = "" 250 | else: 251 | sparse_indices = list(range(max_rank)) 252 | Xi_sparse_na = np.where( 253 | Xv_cross[:, sparse_indices] == 1, Xi_cross[:, sparse_indices], -1 254 | ) 255 | data_np = Xi_sparse_na 256 | df = pd.DataFrame(data_np, columns=cross_features) 257 | temp_feats = cross_features 258 | save_path = data_path + "/" + experiment 259 | postfix = "_" + str(max_rank) 260 | 261 | if not os.path.exists(save_path): 262 | os.makedirs(save_path) 263 | 264 | for feat in tqdm(temp_feats): 265 | lbe = LabelEncoder() 266 | df[feat] = lbe.fit_transform( 267 | df[feat] 268 | ) # global label encoding consistent with autoint's data preprocessing 269 | 270 | train = df[sum(lens[0:2]) :] 271 | valid = df[lens[0] : sum(lens[0:2])] 272 | test = df[0 : lens[0]] 273 | 274 | train.to_hdf( 275 | save_path + "/" + setting + postfix + ".h5", 276 | key="train", 277 | format="table", 278 | mode="w", 279 | ) 280 | valid.to_hdf( 281 | save_path + "/" + setting + postfix + ".h5", key="valid", format="table" 282 | ) 283 | test.to_hdf( 284 | save_path + "/" + setting + postfix + ".h5", key="test", format="table" 285 | ) 286 | 287 | for feat in tqdm(temp_feats): 288 | n_unique_dict[feat] = df[feat].nunique() 289 | 290 | n_unique_cross_dict = dict() 291 | for feat in cross_features: 292 | n_unique_cross_dict[feat] = n_unique_dict[feat] 293 | 294 | if deepctr_save_baseline_data: 295 | n_unique_baseline_dict = dict() 296 | for feat in sparse_features: 297 | n_unique_baseline_dict[feat] = n_unique_dict[feat] 298 | with open(data_path + "/n_unique_dict_baseline.pickle", "wb") as handle: 299 | pickle.dump( 300 | n_unique_baseline_dict, handle, protocol=pickle.HIGHEST_PROTOCOL 301 | ) 302 | 303 | with open( 304 | data_path + "/" + experiment + "/n_unique_dict_cross.pickle", "wb" 305 | ) as handle: 306 | pickle.dump(n_unique_cross_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) 307 | 308 | 309 | if __name__ == "__main__": 310 | print("warning: this process may take several hours and significant RAM (>150GB)") 311 | 312 | Xi, Xv, y, Xi_cross, Xv_cross, lens, max_rank, num_feats, dense_feat_indices, sparse_feat_indices = make_cross_feature_data( 313 | interactions_file, 314 | max_rank, 315 | dataset, 316 | training_batch_size, 317 | data_path_autoint, 318 | num_bins, 319 | threshold_pct, 320 | top_k, 321 | prune_interaction_subsets, 322 | num_processes, 323 | ) 324 | 325 | save_cross_feats_autoint(Xi_cross, Xv_cross, lens, experiment, data_path_autoint) 326 | save_cross_feats_deepctr( 327 | Xi, 328 | Xv, 329 | y, 330 | Xi_cross, 331 | Xv_cross, 332 | dense_feat_indices, 333 | sparse_feat_indices, 334 | lens, 335 | experiment, 336 | data_path_deepctr, 337 | deepctr_save_baseline_data, 338 | ) 339 | -------------------------------------------------------------------------------- /2. glider/models/autoint/README.md: -------------------------------------------------------------------------------- 1 | # AutoInt 2 | 3 | This is a TensorFlow implementation of ***AutoInt*** for the CTR prediction task, as described in our paper: 4 | 5 | Weiping Song, Chence Shi, Zhiping Xiao, Zhijian Duan, Yewen Xu, Ming Zhang and Jian Tang. [AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks](https://arxiv.org/pdf/1810.11921.pdf). arXiv preprint arXiv:1810.11921, 2018.
6 | 7 | ## Requirements: 8 | * **Tensorflow 1.4.0-rc1** 9 | * Python 3 10 | * CUDA 8.0+ (For GPU) 11 | 12 | ## Introduction 13 | 14 | AutoInt: An effective and efficient algorithm to 15 | automatically learn high-order feature interactions for (sparse) categorical and numerical features. 16 | 17 |
18 | 19 |
20 | The illustration of AutoInt. We first project all sparse features 21 | (both categorical and numerical features) into the low-dimensional space. Next, we feed embeddings of all fields into stacked multiple interacting layers implemented by self-attentive neural network. The output of the final interacting layer is the low-dimensional representation of learnt combinatorial features, which is further used for estimating the CTR via sigmoid function. 22 | 23 | ## Usage 24 | ### Input Format 25 | AutoInt requires the input data in the following format: 26 | * train_x: matrix with shape *(num_sample, num_field)*. train_x[s][t] is the feature value of feature field t of sample s in the dataset. The default value for categorical feature is 1. 27 | * train_i: matrix with shape *(num_sample, num_field)*. train_i[s][t] is the feature index of feature field t of sample s in the dataset. The maximal value of train_i is the feature size. 28 | * train_y: label of each sample in the dataset. 29 | 30 | If you want to know how to preprocess the data, please refer to `./Dataprocess/Criteo/preprocess.py` 31 | 32 | ### Example 33 | We use four public real-world datasets(Avazu, Criteo, KDD12, MovieLens-1M) in our experiments. Since the first three datasets are super huge, they can not be fit into the memory as a whole. In our implementation, we split the whole dataset into 10 parts and we use the first file as test set and the second file as valid set. We provide the codes for preprocessing these three datasets in `./Dataprocess`. If you want to reuse these codes, you should first run `preprocess.py` to generate `train_x.txt, train_i.txt, train_y.txt` as described in `Input Format`. Then you should run `./Dataprocesss/Kfold_split/StratifiedKfold.py` to split the whole dataset into ten folds. Finally you can run `scale.py` to scale the numerical value(optional). 34 | 35 | To help test the correctness of the code and familarize yourself with the code, we upload the first `10000` samples of `Criteo` dataset in `train_examples.txt`. And we provide the scripts for preprocessing and training.(Please refer to ` sample_preprocess.sh` and `test_code.sh`, you may need to modify the path in `config.py` and `test_code.sh`). 36 | 37 | After you run the `test_code.sh`, you should get a folder named `Criteo` which contains `part*, feature_size.npy, fold_index.npy, train_*.txt`. `feature_size.npy` contains the number of total features which will be used to initialize the model. `train_*.txt` is the whole dataset. If you use other small dataset, say `MovieLens-1M`, you only need to modify the function `_run_` in `train.py`. 38 | 39 | Here's how to run the preprocessing. 40 | ``` 41 | mkdir Criteo 42 | python ./Dataprocess/Criteo/preprocess.py 43 | python ./Dataprocess/Kfold_split/stratifiedKfold.py 44 | python ./Dataprocess/Criteo/scale.py 45 | ``` 46 | 47 | Here's how to run the training. 48 | ``` 49 | python -u train.py \ 50 | --data "Criteo" --blocks 3 --heads 2 --block_shape "[64, 64, 64]" \ 51 | --is_save "True" --save_path "./test_code/Criteo/b3h2_64x64x64/" \ 52 | --field_size 39 --run_times 1 --data_path "./" \ 53 | --epoch 3 --has_residual "True" --has_wide "False" \ 54 | --batch_size 1024 \ 55 | > test_code_single.out & 56 | ``` 57 | 58 | You should see output like this: 59 | 60 | ``` 61 | ... 62 | train logs 63 | ... 64 | start testing!... 
65 | restored from ./test_code/Criteo/b3h2_dnn_dropkeep1_400x2/1/ 66 | test-result = 0.8088, test-logloss = 0.4430 67 | test_auc [0.8088305055534442] 68 | test_log_loss [0.44297631300399626] 69 | avg_auc 0.8088305055534442 70 | avg_log_loss 0.44297631300399626 71 | ``` 72 | 73 | ## Citation 74 | If you find AutoInt useful for your research, please consider citing the following paper: 75 | ``` 76 | @article{weiping2018autoint, 77 | title={AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks}, 78 | author={Weiping, Song and Chence, Shi and Zhiping, Xiao and Zhijian, Duan and Yewen, Xu and Ming, Zhang and Jian, Tang}, 79 | journal={arXiv preprint arXiv:1810.11921}, 80 | year={2018} 81 | } 82 | ``` 83 | 84 | 85 | ## Contact information 86 | If you have questions related to the code, feel free to contact Weiping Song (`songweiping@pku.edu.cn`), Chence Shi (`chenceshi@pku.edu.cn`) and Zhijian Duan (`zjduan@pku.edu.cn`). 87 | 88 | ## License 89 | MIT 90 | 91 | ## Acknowledgement 92 | This implementation gets inspirations from Kyubyong Park's [transformer](https://github.com/Kyubyong/transformer) and Chenglong Chen' [DeepFM](https://github.com/ChenglongChen/tensorflow-DeepFM). 93 | -------------------------------------------------------------------------------- /2. glider/models/autoint/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tensorflow implementation of AutoInt described in: 3 | AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks. 4 | author: Chence Shi 5 | email: chenceshi@pku.edu.cn 6 | """ 7 | 8 | import os 9 | import numpy as np 10 | import tensorflow as tf 11 | from time import time 12 | from sklearn.base import BaseEstimator, TransformerMixin 13 | from sklearn.metrics import roc_auc_score, log_loss 14 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 15 | 16 | 17 | """ 18 | The following two functions are adapted from kyubyong park's implementation of transformer 19 | We slightly modify the code to make it suitable for our work.(add relu, delete key masking and causality mask) 20 | June 2017 by kyubyong park. 21 | kbpark.linguist@gmail.com. 
22 | https://www.github.com/kyubyong/transformer 23 | """ 24 | 25 | 26 | def normalize(inputs, epsilon=1e-8): 27 | """ 28 | Applies layer normalization 29 | Args: 30 | inputs: A tensor with 2 or more dimensions 31 | epsilon: A floating number to prevent Zero Division 32 | Returns: 33 | A tensor with the same shape and data dtype 34 | """ 35 | inputs_shape = inputs.get_shape() 36 | params_shape = inputs_shape[-1:] 37 | 38 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 39 | beta = tf.Variable(tf.zeros(params_shape)) 40 | gamma = tf.Variable(tf.ones(params_shape)) 41 | normalized = (inputs - mean) / ((variance + epsilon) ** (0.5)) 42 | outputs = gamma * normalized + beta 43 | 44 | return outputs 45 | 46 | 47 | def multihead_attention( 48 | queries, 49 | keys, 50 | values, 51 | num_units=None, 52 | num_heads=1, 53 | dropout_keep_prob=1, 54 | is_training=True, 55 | has_residual=True, 56 | ): 57 | 58 | if num_units is None: 59 | num_units = queries.get_shape().as_list[-1] 60 | 61 | # Linear projections 62 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) 63 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) 64 | V = tf.layers.dense(values, num_units, activation=tf.nn.relu) 65 | if has_residual: 66 | V_res = tf.layers.dense(values, num_units, activation=tf.nn.relu) 67 | 68 | # Split and concat 69 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) 70 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) 71 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) 72 | 73 | # Multiplication 74 | weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) 75 | 76 | # Scale 77 | weights = weights / (K_.get_shape().as_list()[-1] ** 0.5) 78 | 79 | # Activation 80 | weights = tf.nn.softmax(weights) 81 | 82 | # Dropouts 83 | weights = tf.layers.dropout( 84 | weights, rate=1 - dropout_keep_prob, training=tf.convert_to_tensor(is_training) 85 | ) 86 | 87 | # Weighted sum 88 | outputs = tf.matmul(weights, V_) 89 | 90 | # Restore shape 91 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) 92 | 93 | # Residual connection 94 | if has_residual: 95 | outputs += V_res 96 | 97 | outputs = tf.nn.relu(outputs) 98 | # Normalize 99 | outputs = normalize(outputs) 100 | 101 | return outputs 102 | 103 | 104 | class AutoInt: 105 | def __init__(self, args, feature_size, run_cnt): 106 | 107 | self.feature_size = ( 108 | feature_size 109 | ) # denote as n, dimension of concatenated features 110 | self.field_size = args.field_size # denote as M, number of total feature fields 111 | self.embedding_size = ( 112 | args.embedding_size 113 | ) # denote as d, size of the feature embedding 114 | self.blocks = args.blocks # number of the blocks 115 | self.heads = args.heads # number of the heads 116 | self.block_shape = args.block_shape 117 | self.output_size = args.block_shape[-1] 118 | self.has_residual = args.has_residual 119 | self.has_wide = args.has_wide # whether to add wide part 120 | self.deep_layers = ( 121 | args.deep_layers 122 | ) # whether to joint train with deep networks as described in paper 123 | 124 | self.batch_norm = args.batch_norm 125 | self.batch_norm_decay = args.batch_norm_decay 126 | self.drop_keep_prob = args.dropout_keep_prob 127 | self.l2_reg = args.l2_reg 128 | self.epoch = args.epoch 129 | self.batch_size = args.batch_size 130 | self.learning_rate = args.learning_rate 131 | self.learning_rate_wide = args.learning_rate_wide 132 | self.optimizer_type = args.optimizer_type 133 | 134 | self.save_path = args.save_path + str(run_cnt) + "/" 135 | 
self.is_save = args.is_save 136 | if args.is_save == True and os.path.exists(self.save_path) == False: 137 | os.makedirs(self.save_path) 138 | 139 | self.verbose = args.verbose 140 | self.random_seed = args.random_seed 141 | self.loss_type = args.loss_type 142 | self.eval_metric = roc_auc_score 143 | self.best_loss = 1.0 144 | self.greater_is_better = args.greater_is_better 145 | self.train_result, self.valid_result = [], [] 146 | self.train_loss, self.valid_loss = [], [] 147 | 148 | self._init_graph() 149 | 150 | def _init_graph(self): 151 | self.graph = tf.Graph() 152 | with self.graph.as_default(): 153 | 154 | tf.set_random_seed(self.random_seed) 155 | 156 | self.feat_index = tf.placeholder( 157 | tf.int32, shape=[None, None], name="feat_index" 158 | ) # None * M # M is num features 159 | self.feat_value = tf.placeholder( 160 | tf.float32, shape=[None, None], name="feat_value" 161 | ) # None * M 162 | self.label = tf.placeholder( 163 | tf.float32, shape=[None, 1], name="label" 164 | ) # None * 1 165 | # In our implementation, the shape of dropout_keep_prob is [3], used in 3 different parts. 166 | self.dropout_keep_prob = tf.placeholder( 167 | tf.float32, shape=[None], name="dropout_keep_prob" 168 | ) 169 | self.train_phase = tf.placeholder(tf.bool, name="train_phase") 170 | 171 | self.weights = self._initialize_weights() 172 | 173 | # model 174 | self.embeddings = tf.nn.embedding_lookup( 175 | self.weights["feature_embeddings"], self.feat_index 176 | ) # None * M * d # num * emb dim 177 | feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1]) 178 | self.embeddings = tf.multiply(self.embeddings, feat_value) # None * M * d 179 | self.embeddings = tf.nn.dropout( 180 | self.embeddings, self.dropout_keep_prob[1] 181 | ) # None * M * d 182 | if self.has_wide: 183 | self.y_first_order = tf.nn.embedding_lookup( 184 | self.weights["feature_bias"], self.feat_index 185 | ) # None * M * 1 186 | self.y_first_order = tf.reduce_sum( 187 | tf.multiply(self.y_first_order, feat_value), 1 188 | ) # None * 1 189 | 190 | # joint training with feedforward nn 191 | if self.deep_layers != None: 192 | self.y_dense = tf.reshape( 193 | self.embeddings, shape=[-1, self.field_size * self.embedding_size] 194 | ) 195 | for i in range(0, len(self.deep_layers)): 196 | self.y_dense = tf.add( 197 | tf.matmul(self.y_dense, self.weights["layer_%d" % i]), 198 | self.weights["bias_%d" % i], 199 | ) # None * layer[i] 200 | if self.batch_norm: 201 | self.y_dense = self.batch_norm_layer( 202 | self.y_dense, 203 | train_phase=self.train_phase, 204 | scope_bn="bn_%d" % i, 205 | ) 206 | self.y_dense = tf.nn.relu(self.y_dense) 207 | self.y_dense = tf.nn.dropout( 208 | self.y_dense, self.dropout_keep_prob[2] 209 | ) 210 | self.y_dense = tf.add( 211 | tf.matmul(self.y_dense, self.weights["prediction_dense"]), 212 | self.weights["prediction_bias_dense"], 213 | name="logits_dense", 214 | ) # None * 1 215 | 216 | # ---------- main part of AutoInt------------------- 217 | self.y_deep = self.embeddings # None * M * d 218 | for i in range(self.blocks): 219 | self.y_deep = multihead_attention( 220 | queries=self.y_deep, 221 | keys=self.y_deep, 222 | values=self.y_deep, 223 | num_units=self.block_shape[i], 224 | num_heads=self.heads, 225 | dropout_keep_prob=self.dropout_keep_prob[0], 226 | is_training=self.train_phase, 227 | has_residual=self.has_residual, 228 | ) 229 | 230 | self.flat = tf.reshape( 231 | self.y_deep, shape=[-1, self.output_size * self.field_size] 232 | ) 233 | 234 | self.out = tf.add( 235 | 
tf.matmul(self.flat, self.weights["prediction"]), 236 | self.weights["prediction_bias"], 237 | name="logits", 238 | ) # None * 1 239 | 240 | if self.has_wide: 241 | self.out += self.y_first_order 242 | 243 | if self.deep_layers != None: 244 | self.out += self.y_dense 245 | 246 | # ---------- Compute the loss ---------- 247 | # loss 248 | if self.loss_type == "logloss": 249 | self.out = tf.nn.sigmoid(self.out, name="pred") 250 | self.loss = tf.losses.log_loss(self.label, self.out) 251 | elif self.loss_type == "mse": 252 | self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out)) 253 | 254 | # l2 regularization on weights 255 | if self.l2_reg > 0: 256 | if self.deep_layers != None: 257 | for i in range(len(self.deep_layers)): 258 | self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)( 259 | self.weights["layer_%d" % i] 260 | ) 261 | 262 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 263 | self.var1 = [ 264 | v for v in tf.trainable_variables() if v.name != "feature_bias:0" 265 | ] 266 | self.var2 = [tf.trainable_variables()[1]] # self.var2 = [feature_bias] 267 | # optimizer 268 | # here we should use two different optimizer for wide and deep model(if we add wide part). 269 | if self.optimizer_type == "adam": 270 | if self.has_wide: 271 | optimizer1 = tf.train.AdamOptimizer( 272 | learning_rate=self.learning_rate, 273 | beta1=0.9, 274 | beta2=0.999, 275 | epsilon=1e-8, 276 | ) 277 | optimizer2 = tf.train.GradientDescentOptimizer( 278 | learning_rate=self.learning_rate_wide 279 | ) 280 | # minimize(self.loss, global_step=self.global_step) 281 | var_list1 = self.var1 282 | var_list2 = self.var2 283 | grads = tf.gradients(self.loss, var_list1 + var_list2) 284 | grads1 = grads[: len(var_list1)] 285 | grads2 = grads[len(var_list1) :] 286 | train_op1 = optimizer1.apply_gradients( 287 | zip(grads1, var_list1), global_step=self.global_step 288 | ) 289 | train_op2 = optimizer2.apply_gradients(zip(grads2, var_list2)) 290 | self.optimizer = tf.group(train_op1, train_op2) 291 | else: 292 | self.optimizer = tf.train.AdamOptimizer( 293 | learning_rate=self.learning_rate, 294 | beta1=0.9, 295 | beta2=0.999, 296 | epsilon=1e-8, 297 | ).minimize(self.loss, global_step=self.global_step) 298 | elif self.optimizer_type == "adagrad": 299 | self.optimizer = tf.train.AdagradOptimizer( 300 | learning_rate=self.learning_rate, initial_accumulator_value=1e-8 301 | ).minimize(self.loss) 302 | elif self.optimizer_type == "gd": 303 | self.optimizer = tf.train.GradientDescentOptimizer( 304 | learning_rate=self.learning_rate 305 | ).minimize(self.loss) 306 | elif self.optimizer_type == "momentum": 307 | self.optimizer = tf.train.MomentumOptimizer( 308 | learning_rate=self.learning_rate, momentum=0.95 309 | ).minimize(self.loss) 310 | 311 | # init 312 | self.saver = tf.train.Saver(max_to_keep=5) 313 | init = tf.global_variables_initializer() 314 | self.sess = self._init_session() 315 | self.sess.run(init) 316 | self.count_param() 317 | 318 | def count_param(self): 319 | k = np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()]) 320 | 321 | # print(tf.trainable_variables()) 322 | print("total parameters :%d" % k) 323 | print("extra parameters : %d" % (k - self.feature_size * self.embedding_size)) 324 | 325 | def _init_session(self): 326 | config = tf.ConfigProto(allow_soft_placement=True) 327 | config.gpu_options.allow_growth = True 328 | return tf.Session(config=config) 329 | 330 | def _initialize_weights(self): 331 | weights = dict() 332 | 333 | # embeddings 334 | 
weights["feature_embeddings"] = tf.Variable( 335 | tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01), 336 | name="feature_embeddings", 337 | ) # feature_size(n) * d 338 | 339 | if self.has_wide: 340 | weights["feature_bias"] = tf.Variable( 341 | tf.random_normal([self.feature_size, 1], 0.0, 0.001), 342 | name="feature_bias", 343 | ) # feature_size(n) * 1 344 | input_size = self.output_size * self.field_size 345 | 346 | # dense layers 347 | if self.deep_layers != None: 348 | num_layer = len(self.deep_layers) 349 | layer0_size = self.field_size * self.embedding_size 350 | glorot = np.sqrt(2.0 / (layer0_size + self.deep_layers[0])) 351 | weights["layer_0"] = tf.Variable( 352 | np.random.normal( 353 | loc=0, scale=glorot, size=(layer0_size, self.deep_layers[0]) 354 | ), 355 | dtype=np.float32, 356 | ) 357 | weights["bias_0"] = tf.Variable( 358 | np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])), 359 | dtype=np.float32, 360 | ) # 1 * layers[0] 361 | for i in range(1, num_layer): 362 | glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i])) 363 | weights["layer_%d" % i] = tf.Variable( 364 | np.random.normal( 365 | loc=0, 366 | scale=glorot, 367 | size=(self.deep_layers[i - 1], self.deep_layers[i]), 368 | ), 369 | dtype=np.float32, 370 | ) # layers[i-1] * layers[i] 371 | weights["bias_%d" % i] = tf.Variable( 372 | np.random.normal( 373 | loc=0, scale=glorot, size=(1, self.deep_layers[i]) 374 | ), 375 | dtype=np.float32, 376 | ) # 1 * layer[i] 377 | glorot = np.sqrt(2.0 / (self.deep_layers[-1] + 1)) 378 | weights["prediction_dense"] = tf.Variable( 379 | np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[-1], 1)), 380 | dtype=np.float32, 381 | name="prediction_dense", 382 | ) 383 | weights["prediction_bias_dense"] = tf.Variable( 384 | np.random.normal(), dtype=np.float32, name="prediction_bias_dense" 385 | ) 386 | 387 | # ---------- prediciton weight ------------------# 388 | glorot = np.sqrt(2.0 / (input_size + 1)) 389 | weights["prediction"] = tf.Variable( 390 | np.random.normal(loc=0, scale=glorot, size=(input_size, 1)), 391 | dtype=np.float32, 392 | name="prediction", 393 | ) 394 | weights["prediction_bias"] = tf.Variable( 395 | np.random.normal(), dtype=np.float32, name="prediction_bias" 396 | ) 397 | 398 | return weights 399 | 400 | def batch_norm_layer(self, x, train_phase, scope_bn): 401 | bn_train = batch_norm( 402 | x, 403 | decay=self.batch_norm_decay, 404 | center=True, 405 | scale=True, 406 | updates_collections=None, 407 | is_training=True, 408 | reuse=None, 409 | trainable=True, 410 | scope=scope_bn, 411 | ) 412 | bn_inference = batch_norm( 413 | x, 414 | decay=self.batch_norm_decay, 415 | center=True, 416 | scale=True, 417 | updates_collections=None, 418 | is_training=False, 419 | reuse=True, 420 | trainable=True, 421 | scope=scope_bn, 422 | ) 423 | z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference) 424 | return z 425 | 426 | def get_batch(self, Xi, Xv, y, batch_size, index): 427 | start = index * batch_size 428 | end = (index + 1) * batch_size 429 | end = end if end < len(y) else len(y) 430 | return Xi[start:end], Xv[start:end], [[y_] for y_ in y[start:end]] 431 | 432 | # shuffle three lists simutaneously 433 | def shuffle_in_unison_scary(self, a, b, c): 434 | rng_state = np.random.get_state() 435 | np.random.shuffle(a) 436 | np.random.set_state(rng_state) 437 | np.random.shuffle(b) 438 | np.random.set_state(rng_state) 439 | np.random.shuffle(c) 440 | 441 | def fit_on_batch(self, Xi, Xv, y): 442 | 
feed_dict = { 443 | self.feat_index: Xi, 444 | self.feat_value: Xv, 445 | self.label: y, 446 | self.dropout_keep_prob: self.drop_keep_prob, 447 | self.train_phase: True, 448 | } 449 | step, loss, opt = self.sess.run( 450 | (self.global_step, self.loss, self.optimizer), feed_dict=feed_dict 451 | ) 452 | return step, loss 453 | 454 | # Since the train data is very large, they can not be fit into the memory at the same time. 455 | # We separate the whole train data into several files and call "fit_once" for each file. 456 | def fit_once( 457 | self, 458 | Xi_train, 459 | Xv_train, 460 | y_train, 461 | epoch, 462 | file_count, 463 | Xi_valid=None, 464 | Xv_valid=None, 465 | y_valid=None, 466 | early_stopping=False, 467 | ): 468 | 469 | has_valid = Xv_valid is not None 470 | last_step = 0 471 | t1 = time() 472 | self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train) 473 | total_batch = int(len(y_train) / self.batch_size) 474 | for i in range(total_batch): 475 | Xi_batch, Xv_batch, y_batch = self.get_batch( 476 | Xi_train, Xv_train, y_train, self.batch_size, i 477 | ) 478 | step, loss = self.fit_on_batch(Xi_batch, Xv_batch, y_batch) 479 | last_step = step 480 | 481 | # evaluate training and validation datasets 482 | train_result, train_loss = self.evaluate(Xi_train, Xv_train, y_train) 483 | self.train_result.append(train_result) 484 | self.train_loss.append(train_loss) 485 | if has_valid: 486 | valid_result, valid_loss = self.evaluate(Xi_valid, Xv_valid, y_valid) 487 | self.valid_result.append(valid_result) 488 | self.valid_loss.append(valid_loss) 489 | if valid_loss < self.best_loss and self.is_save == True: 490 | old_loss = self.best_loss 491 | self.best_loss = valid_loss 492 | self.saver.save( 493 | self.sess, self.save_path + "model.ckpt", global_step=last_step 494 | ) 495 | print( 496 | "[%d-%d] model saved!. 
Valid loss is improved from %.4f to %.4f" 497 | % (epoch, file_count, old_loss, self.best_loss) 498 | ) 499 | 500 | if self.verbose > 0 and ((epoch - 1) * 9 + file_count) % self.verbose == 0: 501 | if has_valid: 502 | print( 503 | "[%d-%d] train-result=%.4f, train-logloss=%.4f, valid-result=%.4f, valid-logloss=%.4f [%.1f s]" 504 | % ( 505 | epoch, 506 | file_count, 507 | train_result, 508 | train_loss, 509 | valid_result, 510 | valid_loss, 511 | time() - t1, 512 | ) 513 | ) 514 | else: 515 | print( 516 | "[%d-%d] train-result=%.4f [%.1f s]" 517 | % (epoch, file_count, train_result, time() - t1) 518 | ) 519 | if has_valid and early_stopping and self.training_termination(self.valid_loss): 520 | return False 521 | else: 522 | return True 523 | 524 | def training_termination(self, valid_result): 525 | if len(valid_result) > 5: 526 | if self.greater_is_better: 527 | if ( 528 | valid_result[-1] < valid_result[-2] 529 | and valid_result[-2] < valid_result[-3] 530 | and valid_result[-3] < valid_result[-4] 531 | and valid_result[-4] < valid_result[-5] 532 | ): 533 | return True 534 | else: 535 | if ( 536 | valid_result[-1] > valid_result[-2] 537 | and valid_result[-2] > valid_result[-3] 538 | and valid_result[-3] > valid_result[-4] 539 | and valid_result[-4] > valid_result[-5] 540 | ): 541 | return True 542 | return False 543 | 544 | def predict(self, Xi, Xv): 545 | """ 546 | :param Xi: list of list of feature indices of each sample in the dataset 547 | :param Xv: list of list of feature values of each sample in the dataset 548 | :return: predicted probability of each sample 549 | """ 550 | # dummy y 551 | dummy_y = [1] * len(Xi) 552 | batch_index = 0 553 | Xi_batch, Xv_batch, y_batch = self.get_batch( 554 | Xi, Xv, dummy_y, self.batch_size, batch_index 555 | ) 556 | y_pred = None 557 | # y_loss = None 558 | while len(Xi_batch) > 0: 559 | num_batch = len(y_batch) 560 | feed_dict = { 561 | self.feat_index: Xi_batch, 562 | self.feat_value: Xv_batch, 563 | self.label: y_batch, 564 | self.dropout_keep_prob: [1.0] * len(self.drop_keep_prob), 565 | self.train_phase: False, 566 | } 567 | batch_out = self.sess.run(self.out, feed_dict=feed_dict) 568 | 569 | if batch_index == 0: 570 | y_pred = np.reshape(batch_out, (num_batch,)) 571 | else: 572 | y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,)))) 573 | 574 | batch_index += 1 575 | Xi_batch, Xv_batch, y_batch = self.get_batch( 576 | Xi, Xv, dummy_y, self.batch_size, batch_index 577 | ) 578 | 579 | return y_pred 580 | 581 | def evaluate(self, Xi, Xv, y): 582 | """ 583 | :param Xi: list of list of feature indices of each sample in the dataset 584 | :param Xv: list of list of feature values of each sample in the dataset 585 | :param y: label of each sample in the dataset 586 | :return: metric of the evaluation 587 | """ 588 | y_pred = self.predict(Xi, Xv) 589 | y_pred = np.clip(y_pred, 1e-6, 1 - 1e-6) 590 | return self.eval_metric(y, y_pred), log_loss(y, y_pred) 591 | 592 | def restore(self, save_path=None): 593 | if save_path == None: 594 | save_path = self.save_path 595 | ckpt = tf.train.get_checkpoint_state(save_path) 596 | if ckpt and ckpt.model_checkpoint_path: 597 | self.saver.restore(self.sess, ckpt.model_checkpoint_path) 598 | if self.verbose > 0: 599 | print("restored from %s" % (save_path)) 600 | -------------------------------------------------------------------------------- /2. glider/models/autoint/train.py: -------------------------------------------------------------------------------- 1 | ## AutoInt's official training code. 
Modifications are only made to accomodate cross feature 2 | 3 | import math 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | from sklearn.metrics import make_scorer 8 | from sklearn.model_selection import StratifiedKFold 9 | from time import time 10 | from model import AutoInt 11 | import argparse 12 | import os 13 | 14 | from os.path import join 15 | 16 | 17 | def str2list(v): 18 | v = v.split(",") 19 | v = [int(_.strip("[]")) for _ in v] 20 | 21 | return v 22 | 23 | 24 | def str2list2(v): 25 | v = v.split(",") 26 | v = [float(_.strip("[]")) for _ in v] 27 | 28 | return v 29 | 30 | 31 | def str2bool(v): 32 | if v.lower() in ["yes", "true", "t", "y", "1"]: 33 | return True 34 | elif v.lower() in ["no", "false", "f", "n", "0"]: 35 | return False 36 | else: 37 | raise argparse.ArgumentTypeError("Unsupported value encountered.") 38 | 39 | 40 | def parse_args(): 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--blocks", type=int, default=3, help="#blocks") 43 | parser.add_argument( 44 | "--block_shape", 45 | type=str2list, 46 | default=[64, 64, 64], 47 | help="output shape of each block", 48 | ) 49 | parser.add_argument("--heads", type=int, default=2, help="#heads") 50 | parser.add_argument("--embedding_size", type=int, default=16) 51 | parser.add_argument("--dropout_keep_prob", type=str2list2, default=[1, 1, 1]) 52 | parser.add_argument("--epoch", type=int, default=3) 53 | parser.add_argument("--batch_size", type=int, default=1024) 54 | parser.add_argument("--learning_rate", type=float, default=0.001) 55 | parser.add_argument("--learning_rate_wide", type=float, default=0.001) 56 | parser.add_argument("--optimizer_type", type=str, default="adam") 57 | parser.add_argument("--l2_reg", type=float, default=0.0) 58 | parser.add_argument("--random_seed", type=int, default=2018) 59 | parser.add_argument("--save_path", type=str, default="./model/") 60 | parser.add_argument("--field_size", type=int, default=0, help="dummy variable") 61 | parser.add_argument("--loss_type", type=str, default="logloss") 62 | parser.add_argument("--verbose", type=int, default=1) 63 | parser.add_argument( 64 | "--run_times", type=int, default=5, help="run multiple times to eliminate error" 65 | ) 66 | parser.add_argument("--is_save", type=str2bool, default=True) 67 | parser.add_argument( 68 | "--greater_is_better", type=str2bool, default=False, help="early stop criterion" 69 | ) 70 | parser.add_argument( 71 | "--has_residual", type=str2bool, default=True, help="add residual or not" 72 | ) 73 | parser.add_argument("--has_wide", type=str2bool, default=False) 74 | parser.add_argument( 75 | "--deep_layers", 76 | type=str2list, 77 | default=[400, 400], 78 | help="config for dnn in joint train", 79 | ) 80 | parser.add_argument("--batch_norm", type=int, default=0) 81 | parser.add_argument("--batch_norm_decay", type=float, default=0.995) 82 | parser.add_argument("--data", type=str, help="data name") 83 | parser.add_argument("--data_path", type=str, default="./", help="root path for all the data") 84 | parser.add_argument("--gpu", type=int, help="which gpu") 85 | parser.add_argument("--exp", type=str, help="experiment", default="cross") 86 | parser.add_argument("--cross_exp", type=str, help="cross exp", default="cross1") 87 | 88 | return parser.parse_args() 89 | 90 | 91 | def include_cross_features(args, Xi, Xv, j): 92 | if args.exp == "cross": 93 | path = join(args.data_path, args.data, "part" + str(j), args.cross_exp) 94 | 95 | Xi_cross = np.load(join(path, "i_cross.npy")) 96 | 
Xv_cross = np.load(join(path, "x_cross.npy")) 97 | Xi = np.concatenate([Xi, Xi_cross], axis=1) 98 | Xv = np.concatenate([Xv, Xv_cross], axis=1) 99 | return Xi, Xv 100 | 101 | 102 | def _run_(args, file_name, run_cnt): 103 | # path_prefix = '../Dataprocess/' + args.data 104 | path_prefix = os.path.join(args.data_path, args.data) 105 | if not os.path.exists(args.save_path): 106 | os.makedirs(args.save_path) 107 | 108 | if args.exp == "cross": 109 | feature_size = np.load(join(path_prefix, args.cross_exp, "feature_size.npy"))[0] 110 | else: 111 | feature_size = np.load(path_prefix + "/feature_size.npy")[0] 112 | 113 | # variables = tf.contrib.framework.get_variables_to_restore() 114 | # print(variables) 115 | # return 116 | 117 | Xi_valid = np.load(path_prefix + "/part2/" + file_name[0]) 118 | Xv_valid = np.load(path_prefix + "/part2/" + file_name[1]) 119 | y_valid = np.load(path_prefix + "/part2/" + file_name[2]) 120 | 121 | Xi_valid, Xv_valid = include_cross_features(args, Xi_valid, Xv_valid, 2) 122 | 123 | args.field_size = Xi_valid.shape[1] 124 | 125 | # test: file1, valid: file2, train: file3-10 126 | model = AutoInt(args=args, feature_size=feature_size, run_cnt=run_cnt) 127 | 128 | is_continue = True 129 | for k in range(model.epoch): 130 | if not is_continue: 131 | print("early stopping at epoch %d" % (k + 1)) 132 | break 133 | file_count = 0 134 | time_epoch = 0 135 | for j in range(3, 11): 136 | if not is_continue: 137 | print("early stopping at epoch %d file %d" % (k + 1, j)) 138 | break 139 | file_count += 1 140 | Xi_train = np.load(path_prefix + "/part" + str(j) + "/" + file_name[0]) 141 | Xv_train = np.load(path_prefix + "/part" + str(j) + "/" + file_name[1]) 142 | y_train = np.load(path_prefix + "/part" + str(j) + "/" + file_name[2]) 143 | 144 | Xi_train, Xv_train = include_cross_features(args, Xi_train, Xv_train, j) 145 | 146 | print("epoch %d, file %d" % (k + 1, j)) 147 | t1 = time() 148 | is_continue = model.fit_once( 149 | Xi_train, 150 | Xv_train, 151 | y_train, 152 | k + 1, 153 | file_count, 154 | Xi_valid, 155 | Xv_valid, 156 | y_valid, 157 | early_stopping=True, 158 | ) 159 | time_epoch += time() - t1 160 | 161 | print("epoch %d, time %d" % (k + 1, time_epoch)) 162 | 163 | print("start testing!...") 164 | Xi_test = np.load(path_prefix + "/part1/" + file_name[0]) 165 | Xv_test = np.load(path_prefix + "/part1/" + file_name[1]) 166 | y_test = np.load(path_prefix + "/part1/" + file_name[2]) 167 | 168 | Xi_test, Xv_test = include_cross_features(args, Xi_test, Xv_test, 1) 169 | 170 | model.restore() 171 | 172 | test_result, test_loss = model.evaluate(Xi_test, Xv_test, y_test) 173 | print("test-result = %.4lf, test-logloss = %.4lf" % (test_result, test_loss)) 174 | return test_result, test_loss 175 | 176 | 177 | if __name__ == "__main__": 178 | args = parse_args() 179 | print(args.__dict__) 180 | print("**************") 181 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 182 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) 183 | 184 | data_path = args.data.split("/") 185 | if any([data_path[-1].startswith(d) for d in ["avazu"]]): 186 | file_name = ["train_i.npy", "train_x.npy", "train_y.npy"] 187 | elif any([data_path[-1].startswith(d) for d in ["criteo"]]): 188 | file_name = ["train_i.npy", "train_x2.npy", "train_y.npy"] 189 | else: 190 | raise ValueError("invalid data arg") 191 | test_auc = [] 192 | test_log = [] 193 | 194 | print("run time : %d" % args.run_times) 195 | for i in range(1, args.run_times + 1): 196 | test_result, test_loss = _run_(args, file_name, i) 197 | 
test_auc.append(test_result) 198 | test_log.append(test_loss) 199 | print("test_auc", test_auc) 200 | print("test_log_loss", test_log) 201 | print("avg_auc", sum(test_auc) / len(test_auc)) 202 | print("avg_log_loss", sum(test_log) / len(test_log)) 203 | -------------------------------------------------------------------------------- /2. glider/train_deepctr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--model", type=str, help="model", default="WDL") 6 | parser.add_argument("--runs", type=int, help="num trials", default=5) 7 | parser.add_argument("--exp", type=str, help="experiment", default="baseline") 8 | parser.add_argument("--ds", type=str, help="dataset", default="criteo") 9 | parser.add_argument("--bs", type=int, help="batchsize", default=1024) 10 | parser.add_argument("--gpu", type=int, default=0) 11 | parser.add_argument("--lr", type=float, help="learning rate", default=0.01) 12 | parser.add_argument("--opt", type=str, help="optimizer", default="adagrad") 13 | parser.add_argument("--epochs", type=int, help="epochs", default=50) 14 | parser.add_argument("--test_id", type=str, help="test_id", default="test1") 15 | parser.add_argument("--emb_dim", type=int, help="size of embedding table", default=16) 16 | parser.add_argument("--patience", type=int, help="patience", default=1) 17 | parser.add_argument("--d_base", type=str, help="base data id", default="baseline") 18 | parser.add_argument("--d_cross", type=str, help="cross data id", default="cross") 19 | parser.add_argument("--d_cross_exp", type=str, help="cross exp", default="cross1") 20 | parser.add_argument("--n_cross", type=int, help="num cross features", default=40) 21 | parser.add_argument( 22 | "--epochs_skip_es", 23 | type=int, 24 | help="num of epochs to skip for checking early stopping", 25 | default=0, 26 | ) 27 | 28 | 29 | args = parser.parse_args() 30 | 31 | model_type = args.model # ["WDL", "DeepFM", "DCN", "xDeepFM"] 32 | num_trials = args.runs 33 | experiment = args.exp # ["baseline", "cross"] 34 | dataset = args.ds # ["criteo", "avazu"] 35 | batch_size = args.bs 36 | learning_rate = args.lr 37 | opt = args.opt 38 | gpu_device = args.gpu 39 | epochs = args.epochs 40 | test_id = args.test_id 41 | emb_dim = args.emb_dim 42 | patience = args.patience 43 | base_data_id = args.d_base 44 | cross_data_id = args.d_cross 45 | cross_experiment = args.d_cross_exp 46 | n_cross = args.n_cross 47 | epochs_skip_es = args.epochs_skip_es 48 | 49 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 50 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_device) 51 | 52 | 53 | import pandas as pd 54 | from sklearn.metrics import log_loss, roc_auc_score 55 | from sklearn.model_selection import train_test_split 56 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 57 | 58 | from deepctr.models import xDeepFM, DeepFM, WDL, DCN 59 | from deepctr.inputs import SparseFeat, DenseFeat, get_fixlen_feature_names 60 | from tensorflow.python.keras.models import save_model, load_model 61 | 62 | from deepctr.layers import custom_objects 63 | import pickle 64 | 65 | import keras 66 | 67 | import numpy as np 68 | from tqdm import tqdm 69 | import math 70 | 71 | from os.path import join 72 | import tensorflow as tf 73 | from tensorflow.python.keras.optimizers import Adam, Adagrad 74 | 75 | from tensorflow.keras.callbacks import EarlyStopping 76 | from keras.backend.tensorflow_backend import set_session 77 | 78 | 79 
| config = tf.ConfigProto() 80 | config.gpu_options.allow_growth = True 81 | config.log_device_placement = ( 82 | True 83 | ) 84 | sess = tf.Session(config=config) 85 | set_session(sess) 86 | 87 | assert model_type in ["WDL", "DeepFM", "DCN", "xDeepFM"] 88 | assert experiment in ["baseline", "cross"] 89 | assert dataset in ["criteo", "avazu"] 90 | 91 | if dataset == "criteo": 92 | src_datapath = "data/deepctr/criteo/" 93 | n_sparse = 26 94 | n_dense = 13 95 | 96 | elif dataset == "avazu": 97 | src_datapath = "data/deepctr/avazu" 98 | n_sparse = 23 99 | n_dense = 0 100 | 101 | 102 | sparse_features = ["C" + str(i) for i in range(1, n_sparse + 1)] 103 | dense_features = ["I" + str(i) for i in range(1, n_dense + 1)] 104 | target = ["label"] 105 | 106 | if experiment == "cross": 107 | cross_features = ["G" + str(i) for i in range(1, n_cross + 1)] 108 | else: 109 | cross_features = [] 110 | 111 | 112 | def get_labels(input_path, batch_size, target="label"): 113 | 114 | labels = {} 115 | for mode in ["valid", "test"]: 116 | label_batches = [] 117 | for data_batch in tqdm(pd.read_hdf(input_path, key=mode, chunksize=batch_size)): 118 | label_batches.append(data_batch[target].values) 119 | labels[mode] = np.concatenate(label_batches) 120 | return labels 121 | 122 | 123 | labels = get_labels(src_datapath + "/" + base_data_id + ".h5", int(1e5)) 124 | 125 | 126 | with open( 127 | join(src_datapath, "n_unique_dict_" + base_data_id + ".pickle"), "rb" 128 | ) as handle: 129 | unique_dict_baseline = pickle.load(handle) 130 | 131 | unique_dict = dict() 132 | for key in sparse_features: 133 | unique_dict[key] = unique_dict_baseline[key] 134 | 135 | if experiment == "cross": 136 | with open( 137 | join( 138 | src_datapath, cross_experiment, "n_unique_dict_" + cross_data_id + ".pickle" 139 | ), 140 | "rb", 141 | ) as handle: 142 | unique_dict_cross = pickle.load(handle) 143 | for key in cross_features: 144 | unique_dict[key] = unique_dict_cross[key] 145 | 146 | 147 | fixlen_feature_columns = [ 148 | SparseFeat(feat, unique_dict[feat]) for feat in sparse_features + cross_features 149 | ] + [DenseFeat(feat, 1) for feat in dense_features] 150 | 151 | dnn_feature_columns = fixlen_feature_columns 152 | linear_feature_columns = fixlen_feature_columns 153 | 154 | fixlen_feature_names = get_fixlen_feature_names( 155 | linear_feature_columns + dnn_feature_columns 156 | ) 157 | 158 | 159 | def get_data_generator( 160 | base_path, 161 | cross_path, 162 | model_inputs, 163 | batch_size, 164 | mode="train", 165 | target="label", 166 | keras=False, 167 | ): 168 | while True: 169 | i = 0 170 | while True: 171 | data_batch_baseline = pd.read_hdf( 172 | base_path, key=mode, start=i * batch_size, stop=(i + 1) * batch_size 173 | ) 174 | if cross_path: 175 | data_batch_cross = pd.read_hdf( 176 | cross_path, 177 | key=mode, 178 | start=i * batch_size, 179 | stop=(i + 1) * batch_size, 180 | ) 181 | i += 1 182 | if data_batch_baseline.shape[0] == 0: 183 | break 184 | data_batch = ( 185 | pd.concat([data_batch_baseline, data_batch_cross], axis=1) 186 | if cross_path 187 | else data_batch_baseline 188 | ) 189 | X = [data_batch[name] for name in model_inputs] 190 | Y = data_batch[target].values 191 | yield (X, Y) 192 | if not keras: 193 | break 194 | 195 | 196 | base_path = join(src_datapath, base_data_id + ".h5") 197 | cross_path = ( 198 | join(src_datapath, cross_experiment, cross_data_id + "_" + str(n_cross) + ".h5") 199 | if experiment == "cross" 200 | else "" 201 | ) 202 | cross_experiment = cross_experiment if experiment == 
"cross" else "baseline" 203 | 204 | 205 | def shuffle_batch(X, y=None, seed=None): 206 | if seed is not None: 207 | np.random.seed(seed) 208 | 209 | indices = np.random.permutation(len(X[0])) 210 | 211 | X_shuff = [] 212 | for i in range(len(X)): 213 | X_shuff.append(X[i].iloc[indices]) 214 | 215 | if y is not None: 216 | y_shuff = y[indices] 217 | return X_shuff, y_shuff 218 | else: 219 | return X_shuff 220 | 221 | 222 | exp_folder = join("experiments", "deepctr", test_id, "checkpoints") 223 | 224 | if not os.path.exists(exp_folder): 225 | os.makedirs(exp_folder) 226 | 227 | 228 | pkl_path = join( 229 | "experiments", 230 | "deepctr", 231 | test_id, 232 | dataset + "_" + model_type + "_" + cross_experiment + ".pkl", 233 | ) 234 | 235 | if os.path.exists(pkl_path): 236 | with open(pkl_path, "rb") as handle: 237 | results_dict = pickle.load(handle) 238 | histories = results_dict["histories"] 239 | histories_val = results_dict["val_loss"] 240 | test_performances = results_dict["test_performances"] 241 | checkpoints = results_dict["checkpoints"] 242 | else: 243 | histories = [] 244 | histories_val = [] 245 | test_performances = [] 246 | checkpoints = [] 247 | 248 | 249 | for i in range(num_trials): 250 | if i < len(histories): 251 | continue 252 | 253 | print("Starting trial", i + 1) 254 | 255 | model_checkpoint_file = join( 256 | "experiments", 257 | "deepctr", 258 | test_id, 259 | "checkpoints", 260 | dataset + "_" + model_type + "_" + cross_experiment + "_trial" + str(i) + ".h5", 261 | ) 262 | 263 | test_generator = get_data_generator( 264 | base_path, cross_path, fixlen_feature_names, batch_size, mode="test", keras=True 265 | ) 266 | 267 | if model_type == "DeepFM": 268 | model = DeepFM( 269 | linear_feature_columns, 270 | dnn_feature_columns, 271 | task="binary", 272 | embedding_size=emb_dim, 273 | use_fm=True, 274 | dnn_hidden_units=[400, 400, 400], 275 | ) 276 | 277 | if model_type == "xDeepFM": 278 | model = xDeepFM( 279 | linear_feature_columns, 280 | dnn_feature_columns, 281 | task="binary", 282 | embedding_size=emb_dim, 283 | dnn_hidden_units=[400, 400], 284 | cin_layer_size=[200, 200, 200], 285 | ) 286 | 287 | if model_type == "WDL": 288 | model = WDL( 289 | linear_feature_columns, 290 | dnn_feature_columns, 291 | task="binary", 292 | embedding_size=emb_dim, 293 | dnn_hidden_units=[1024, 512, 256], 294 | ) 295 | 296 | if model_type == "DCN": 297 | model = DCN( 298 | dnn_feature_columns, 299 | task="binary", 300 | embedding_size=emb_dim, 301 | dnn_hidden_units=[1024, 1024], 302 | cross_num=6, 303 | ) 304 | 305 | if opt == "adagrad": 306 | optimizer = Adagrad 307 | elif opt == "adam": 308 | optimizer = Adam 309 | else: 310 | raise ValueError("Invalid optimizer") 311 | 312 | model.compile( 313 | optimizer(learning_rate), "binary_crossentropy", metrics=["binary_crossentropy"] 314 | ) 315 | 316 | callbacks = [] 317 | 318 | patience_counter = 0 319 | best_valid_loss = float("Inf") 320 | 321 | history_epoch = {} 322 | history_val = {} 323 | for epoch in range(epochs): 324 | breakout = False 325 | history_epoch[epoch] = {} 326 | history_val[epoch] = [] 327 | train_generator = get_data_generator( 328 | base_path, 329 | cross_path, 330 | fixlen_feature_names, 331 | len(labels["valid"]), 332 | mode="train", 333 | ) 334 | for file_count, data_batch in enumerate(train_generator): 335 | print("epoch", epoch, "filecount", file_count) 336 | train_model_input, train_model_labels = data_batch 337 | 338 | X_shuffled, Y_shuffled = shuffle_batch( 339 | train_model_input, train_model_labels 340 | ) # 
using AutoInt's convention 341 | 342 | history = model.fit( 343 | X_shuffled, 344 | Y_shuffled, 345 | batch_size=batch_size, 346 | epochs=1, 347 | verbose=1, 348 | callbacks=callbacks, 349 | ) 350 | 351 | history_epoch[epoch][file_count] = [history.history, history.params] 352 | 353 | if epoch < epochs_skip_es: 354 | continue 355 | 356 | valid_generator = get_data_generator( 357 | base_path, 358 | cross_path, 359 | fixlen_feature_names, 360 | batch_size, 361 | mode="valid", 362 | keras=True, 363 | ) 364 | valid_pred = model.predict_generator( 365 | valid_generator, steps=math.ceil(len(labels["valid"]) / batch_size) 366 | ) 367 | valid_loss = log_loss(labels["valid"], valid_pred, eps=1e-7) 368 | history_val[epoch].append(valid_loss) 369 | 370 | if valid_loss < best_valid_loss: 371 | save_model(model, model_checkpoint_file) 372 | 373 | print( 374 | "[%d-%d] model saved!. Valid loss improved from %.4f to %.4f" 375 | % (epoch, file_count, best_valid_loss, valid_loss) 376 | ) 377 | best_valid_loss = valid_loss 378 | patience_counter = 0 379 | else: 380 | if patience_counter >= patience: 381 | breakout = True 382 | print("Early Stopping!") 383 | break 384 | patience_counter += 1 385 | 386 | if breakout: 387 | break 388 | 389 | best_model = tf.keras.models.load_model(model_checkpoint_file, custom_objects) 390 | 391 | pred_ans = best_model.predict_generator( 392 | test_generator, steps=math.ceil(len(labels["test"]) / batch_size) 393 | ) 394 | 395 | test_logloss = round(log_loss(labels["test"], pred_ans, eps=1e-7), 7) 396 | test_auc = round(roc_auc_score(labels["test"], pred_ans), 7) 397 | print("test LogLoss", test_logloss) 398 | print("test AUC", test_auc) 399 | 400 | histories.append(history_epoch) 401 | test_performances.append({"logloss": test_logloss, "auc": test_auc}) 402 | histories_val.append( 403 | { 404 | "history": history_val, 405 | "best_valid_loss": best_valid_loss, 406 | "patience": patience, 407 | } 408 | ) 409 | checkpoints.append(model_checkpoint_file) 410 | results_dict = { 411 | "histories": histories, 412 | "test_performances": test_performances, 413 | "val_loss": histories_val, 414 | "params": best_model.count_params(), 415 | "checkpoints": checkpoints, 416 | } 417 | 418 | with open(pkl_path, "wb") as handle: 419 | pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) 420 | 421 | print("\n\n") 422 | 423 | aucs = [x["auc"] for x in test_performances] 424 | loglosses = [x["logloss"] for x in test_performances] 425 | 426 | auc_mean = np.mean(aucs) 427 | auc_std = np.std(aucs) 428 | logloss_mean = np.mean(loglosses) 429 | logloss_std = np.std(loglosses) 430 | 431 | print(auc_mean, auc_std, logloss_mean, logloss_std) 432 | -------------------------------------------------------------------------------- /2. 
glider/utils/cross_feature_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from tqdm import tqdm 3 | import numpy as np 4 | from sklearn.preprocessing import KBinsDiscretizer 5 | import multiprocessing as mp 6 | from itertools import repeat 7 | import warnings 8 | 9 | warnings.simplefilter("ignore") 10 | 11 | 12 | def str2bool(v): 13 | if isinstance(v, bool): 14 | return v 15 | if v.lower() in ("yes", "true", "t", "y", "1"): 16 | return True 17 | elif v.lower() in ("no", "false", "f", "n", "0"): 18 | return False 19 | else: 20 | raise argparse.ArgumentTypeError("Boolean value expected.") 21 | 22 | 23 | def load_global_interactions(interactions_file, field_size, max_rank, prune_subsets, top_k): 24 | 25 | with open(interactions_file, "rb") as handle: 26 | interaction_results = pickle.load(handle, encoding="latin1") 27 | 28 | global_interactions = {} 29 | mlp_losses = [] 30 | inters = [] 31 | for result in interaction_results: 32 | if result is None: 33 | continue 34 | for inter in result["inters"][:top_k]: 35 | if len(inter[0]) == field_size: 36 | continue 37 | if inter[0] not in global_interactions: 38 | global_interactions[inter[0]] = 1 39 | else: 40 | global_interactions[inter[0]] += 1 41 | 42 | global_interactions = sorted( 43 | global_interactions.items(), key=lambda x: x[1], reverse=True 44 | ) 45 | 46 | if prune_subsets: 47 | pruned_global_interactions = [] 48 | index = 0 49 | while len(pruned_global_interactions) < max_rank: 50 | inter = global_interactions[index] 51 | if any( 52 | set(inter[0]) < set(new_inter[0]) 53 | for new_inter in pruned_global_interactions 54 | ): 55 | pass 56 | else: 57 | pruned_global_interactions.append(inter) 58 | pruned_global_interactions = [ 59 | t 60 | for t in pruned_global_interactions 61 | if not (set(t[0]) < set(inter[0])) 62 | ] 63 | index += 1 64 | else: 65 | pruned_global_interactions = global_interactions[:max_rank] 66 | 67 | top_K_inters, _ = zip(*pruned_global_interactions) 68 | return top_K_inters 69 | 70 | 71 | def load_data_autoint(dataset, data_path): 72 | 73 | path_prefix = data_path + "/" 74 | 75 | if dataset == "criteo": 76 | file_name = ["train_i.npy", "train_x2.npy", "train_y.npy"] 77 | elif dataset == "avazu": 78 | file_name = ["train_i.npy", "train_x.npy", "train_y.npy"] 79 | else: 80 | raise ValueError("Invalid dataset") 81 | 82 | data = [] 83 | for j in tqdm(range(1, 11)): 84 | folder_path = path_prefix + "/part" + str(j) + "/" 85 | Xi = np.load(folder_path + file_name[0]) 86 | Xv = np.load(folder_path + file_name[1]) 87 | y = np.load(folder_path + file_name[2]) 88 | data.append({"Xi": Xi, "Xv": Xv, "y": y}) 89 | 90 | return data 91 | 92 | 93 | def merge_data(data): 94 | Xi = [] 95 | Xv = [] 96 | y = [] 97 | lens = [] 98 | for d in data: 99 | Xi.append(d["Xi"]) 100 | Xv.append(d["Xv"]) 101 | y.append(d["y"]) 102 | lens.append(len(d["y"])) 103 | Xi = np.concatenate(Xi) 104 | Xv = np.concatenate(Xv) 105 | y = np.concatenate(y) 106 | return Xi, Xv, y, lens 107 | 108 | 109 | def get_training_batch(data, size=1000000): 110 | Xi_batch = data[2]["Xi"][:size] 111 | Xv_batch = data[2]["Xv"][:size] 112 | y_batch = data[2]["y"][:size] 113 | return Xi_batch, Xv_batch, y_batch 114 | 115 | 116 | def get_dense_sparse_feat_indices(Xi_batch, dataset): 117 | 118 | dense_feat_indices = [] 119 | sparse_feat_indices = [] 120 | for i in tqdm(range(Xi_batch.shape[1])): 121 | uniq = np.unique(Xi_batch[:, i]) 122 | if len(uniq) == 1 and "avazu" not in dataset: 123 | 
dense_feat_indices.append(i) 124 | else: 125 | sparse_feat_indices.append(i) 126 | 127 | return dense_feat_indices, sparse_feat_indices 128 | 129 | 130 | def discretize_dense(Xv_feat, Xv_feat_batch, num_bins): 131 | est = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="quantile") 132 | est.fit(Xv_feat_batch) 133 | disc = est.transform(Xv_feat) 134 | cardinality = len(est.bin_edges_[0]) - 1 135 | return disc, est, cardinality 136 | 137 | 138 | def _par_discretize(f_idx, Xv_feat, Xv_feat_batch, num_bins): 139 | # print("start", f_idx) 140 | disc, est, cardinality = discretize_dense(Xv_feat, Xv_feat_batch, num_bins) 141 | return f_idx, disc, est, cardinality 142 | 143 | 144 | def discretize_dense_features( 145 | Xi, 146 | Xv, 147 | Xv_batch, 148 | dense_feat_indices, 149 | sparse_feat_indices, 150 | num_feats, 151 | num_bins, 152 | num_processes=20, 153 | ): 154 | 155 | discretizers = {} 156 | new_Xv_dense = [] 157 | cardinalities = {} 158 | 159 | Xv_feats = [] 160 | Xv_feats_batch = [] 161 | for i in dense_feat_indices: 162 | Xv_feats.append(Xv[:, i].reshape(-1, 1)) 163 | Xv_feats_batch.append(Xv_batch[:, i].reshape(-1, 1)) 164 | 165 | pool = mp.Pool(processes=num_processes) 166 | disc_collect = pool.starmap( 167 | _par_discretize, 168 | zip(dense_feat_indices, Xv_feats, Xv_feats_batch, repeat(num_bins)), 169 | ) 170 | cardinalities = {} 171 | discretizers = {} 172 | disc_summary = [x[1:] for x in sorted(disc_collect, key=lambda x: x[0])] 173 | new_Xv_dense = [] 174 | for i, disc in enumerate(disc_summary): 175 | new_Xv_dense.append(disc[0]) 176 | discretizers[i] = disc[1] 177 | cardinalities[i] = disc[2] 178 | 179 | if dense_feat_indices: 180 | new_Xv_dense = np.concatenate(new_Xv_dense, 1) 181 | den = True 182 | else: 183 | den = False 184 | 185 | pool.close() 186 | 187 | if den: 188 | sparsified_data = np.zeros((Xi.shape[0], num_feats)) 189 | sparsified_data[:, dense_feat_indices] = new_Xv_dense 190 | sparsified_data[:, sparse_feat_indices] = Xi[:, sparse_feat_indices] 191 | else: 192 | sparsified_data = Xi 193 | 194 | return sparsified_data 195 | 196 | 197 | def zero_index_sp_feats(combo_map, feat_combo): 198 | new_i = [] 199 | new_v = [] 200 | for c in feat_combo: 201 | if tuple(c) not in combo_map: 202 | new_i.append(0) 203 | new_v.append(0) 204 | else: 205 | new_i.append(combo_map[tuple(c)]) 206 | new_v.append(1) 207 | 208 | return new_i, new_v 209 | 210 | 211 | def _par_zero_sp(combo_idx, combo_map, feat_combo): 212 | # print(combo_idx, feat_combo.shape) 213 | new_i, new_v = zero_index_sp_feats(combo_map, feat_combo) 214 | return combo_idx, new_i, new_v 215 | 216 | 217 | def cross_sparse_features( 218 | top_K_inters, 219 | sparsified_data, 220 | sparsified_batch, 221 | Xi_batch, 222 | threshold, 223 | num_processes=20, 224 | ): 225 | 226 | # collect combo frequency 227 | inter_feats = [] 228 | inter_combo_maps = {} 229 | 230 | for inter in tqdm(top_K_inters): 231 | 232 | inter_list = list(inter) 233 | inter_counts = {} 234 | for d, data_inst in enumerate(sparsified_batch): 235 | combo = tuple(data_inst[inter_list]) 236 | if combo not in inter_counts: 237 | inter_counts[combo] = 1 238 | else: 239 | inter_counts[combo] += 1 240 | combo_map = {} 241 | for combo in inter_counts: 242 | if inter_counts[combo] <= Xi_batch.shape[0] * threshold: 243 | pass 244 | else: 245 | orig_len = len(combo_map) 246 | combo_map[combo] = orig_len + 1 # shift by 1 (0 value means missing) 247 | inter_combo_maps[inter] = combo_map 248 | 249 | # f = open("b" + str(nbins),"w") 250 | # for cm in 
inter_combo_maps: 251 | # f.write(str(cm) + "\t" + str(len(inter_combo_maps[cm])) + "\n") 252 | # print(len(inter_combo_maps[cm])) 253 | 254 | inters = [] 255 | combo_maps = [] 256 | feat_combos = [] 257 | for inter in tqdm(inter_combo_maps): 258 | inters.append(inter) 259 | combo_maps.append(inter_combo_maps[inter]) 260 | feat_combos.append(sparsified_data[:, list(inter)]) 261 | 262 | pool = mp.Pool(processes=num_processes) 263 | cross_feats = pool.starmap( 264 | _par_zero_sp, zip(list(range(len(combo_maps))), combo_maps, feat_combos) 265 | ) 266 | del feat_combos 267 | pool.close() 268 | cross_feats = sorted(cross_feats, key=lambda x: x[0]) 269 | 270 | ## the serial way of obtaining cross_feats (in case the parallel code doesnt work..) 271 | # cross_feats = [] 272 | # i = 0 273 | # for inter in tqdm(inter_combo_maps): 274 | # cross_feats.append(_par_zero_sp(i, inter_combo_maps[inter], sparsified_data[:,list(inter)])) 275 | # i += 1 276 | 277 | return cross_feats 278 | 279 | 280 | def get_X_cross(inters, cross_feats, Xi, Xv, sparse_feat_indices): 281 | # get cross feats in autoint data format 282 | 283 | # if a feature value is 0 , e.g. missing data, then any interaction with this feature will also be deemed missing with value 0 284 | n = 0 285 | Xv_cross = [] 286 | for inter in tqdm(inters): 287 | mask = np.ones(Xv.shape[0]) 288 | for i, idx in enumerate(inter): 289 | if idx in sparse_feat_indices: 290 | mask = mask * Xv[:, idx] 291 | 292 | Xv_cross.append(cross_feats[n][2] * mask) 293 | n += 1 294 | 295 | cross_start = Xi.max() + 1 296 | 297 | # shift all the cross feature values so they can be packed later into a single embedding matrix (autoint format) 298 | for i in tqdm(range(len(cross_feats))): 299 | cur_cross_feat = np.array(cross_feats[i][1]) 300 | max_val = cur_cross_feat.max() 301 | cross_feats[i] = ( 302 | cur_cross_feat + cross_start 303 | ) # in-place modification to save memory 304 | cross_start = max_val + cross_start + 1 305 | 306 | Xi_cross = cross_feats 307 | 308 | Xi_cross = np.stack(Xi_cross, 1) 309 | Xv_cross = np.stack(Xv_cross, 1) 310 | 311 | return Xi, Xv, Xi_cross, Xv_cross 312 | -------------------------------------------------------------------------------- /2. 
glider/utils/global_interaction_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("models/autoint") 4 | from model import AutoInt 5 | import numpy as np 6 | import os 7 | 8 | 9 | class get_args: 10 | # the original parameter configuration of AutoInt 11 | blocks = 3 12 | block_shape = [64, 64, 64] 13 | heads = 2 14 | embedding_size = 16 15 | dropout_keep_prob = [1, 1, 1] 16 | epoch = 3 17 | batch_size = 1024 18 | learning_rate = 0.001 19 | learning_rate_wide = 0.001 20 | optimizer_type = "adam" 21 | l2_reg = 0.0 22 | random_seed = 2018 # used in the official autoint code 23 | loss_type = "logloss" 24 | verbose = 1 25 | run_times = 1 26 | is_save = False 27 | greater_is_better = False 28 | has_residual = True 29 | has_wide = False 30 | deep_layers = [400, 400] 31 | batch_norm = 0 32 | batch_norm_decay = 0.995 33 | 34 | def __init__(self, save_path, field_size, dataset, data_path): 35 | self.save_path = save_path 36 | self.field_size = field_size 37 | self.data = dataset 38 | self.data_path = data_path 39 | 40 | 41 | def parse_args(dataset, data_path, save_path): 42 | dataset = dataset.lower() 43 | if "avazu" in dataset: 44 | field_size = 23 45 | elif "criteo" in dataset: 46 | field_size = 39 47 | else: 48 | raise ValueError("Invalid dataset") 49 | 50 | return get_args(save_path, field_size, dataset, data_path) 51 | 52 | 53 | def get_data_info(args): 54 | data = args.data.split("/")[-1].lower() 55 | if any([data.startswith(d) for d in ["avazu"]]): 56 | file_name = ["train_i.npy", "train_x.npy", "train_y.npy"] 57 | elif any([data.startswith(d) for d in ["criteo"]]): 58 | file_name = ["train_i.npy", "train_x2.npy", "train_y.npy"] 59 | else: 60 | raise ValueError("invalid data arg") 61 | 62 | path_prefix = os.path.join(args.data_path, args.data) 63 | return file_name, path_prefix 64 | 65 | 66 | def get_autoint_and_data( 67 | dataset="Criteo", 68 | data_path="/workspace/AutoInt", 69 | save_path="/test_code/Criteo/b3h2_dnn_dropkeep1_400x2/1/", 70 | ): 71 | args = parse_args(dataset, data_path, save_path) 72 | 73 | file_name = [] 74 | 75 | file_name, path_prefix = get_data_info(args) 76 | feature_size = np.load(path_prefix + "/feature_size.npy")[0] 77 | 78 | run_cnt = 0 79 | model = AutoInt(args=args, feature_size=feature_size, run_cnt=run_cnt) 80 | 81 | Xi_valid = np.load(path_prefix + "/part2/" + file_name[0]) 82 | Xv_valid = np.load(path_prefix + "/part2/" + file_name[1]) 83 | y_valid = np.load(path_prefix + "/part2/" + file_name[2]) 84 | 85 | feature_indices = list(range(Xi_valid.shape[1])) 86 | means_dict = {} 87 | for i in feature_indices: 88 | means_dict[i] = np.mean(Xv_valid[:, i]) 89 | 90 | model.restore(args.save_path) 91 | return model, {"Xi": Xi_valid, "Xv": Xv_valid, "y": y_valid, "means": means_dict} 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feature Interaction Interpretability via Interaction Detection 2 | 3 | This is the official code repository for the paper, "Feature Interaction Interpretability: A Case for Explaining Ad-Recommendation Systems via Neural Interaction Detection". 4 | 5 | 6 | 7 | Example Explanations 8 | 9 | * Global Interpretations 10 |

11 | 12 | 13 | 14 | * Local Interpretations (of ResNet classifications) 15 | 16 | 17 | 18 | 19 | 
20 | 21 | Michael Tsang, Dehua Cheng, Hanpeng Liu, Xue Feng, Eric Zhou, Yan Liu, [Feature Interaction Interpretability: A Case for Explaining Ad-Recommendation Systems via Neural Interaction Detection](https://openreview.net/forum?id=BkgnhTEtDS), ICLR 2020. 22 | 23 | Neural Interaction Detection:\ 24 | Michael Tsang, Dehua Cheng, Yan Liu, [Detecting Statistical Interactions from Neural Network Weights](https://openreview.net/forum?id=ByOfBggRZ), ICLR 2018. 25 | 26 | 27 | ## Setup 28 | 29 | 30 | In a Linux environment with Python 3.6: 31 | 32 | ```bash 33 | pip install -r requirements.txt 34 | ``` 35 | 36 | CUDA 10 support is required to use GLIDER. 37 | 38 | ## Usage 39 | ### 1. MADEX 40 | 41 | **MADEX (Model-Agnostic Dependency EXplainer)** is a method for interpreting feature interactions from a black-box prediction model per data instance. It contains two versions of Neural Interaction Detection (NID): the original NID and GradientNID. NID is a fast and accurate method to detect arbitrary-order interactions in polynomial time, whereas GradientNID exactly detects interactions from an explainer MLP. The following domains are showcased: DNA, graph, image, and text modeling. 42 | 43 | 
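To make the NID idea above concrete, here is a minimal, hypothetical sketch of how pairwise interaction strengths can be read off a trained MLP's weight matrices, following the ICLR 2018 NID paper cited above. This is an illustration only, not this repository's implementation; the function name, the NumPy-only setup, and the toy weights are assumptions.

```python
import numpy as np

def pairwise_nid_strengths(weights):
    """Score every pairwise interaction of an MLP from its weight matrices.

    weights: [W1, W2, ..., W_out], each of shape (fan_out, fan_in),
             for an MLP with a scalar output.
    Returns {(i, j): strength} over input feature indices i < j.
    """
    W1 = np.abs(weights[0])            # (hidden_1, num_inputs)
    # aggregate how strongly each first-layer hidden unit influences the output
    agg = np.abs(weights[-1])          # start from the output layer
    for W in reversed(weights[1:-1]):
        agg = agg @ np.abs(W)
    z = agg.ravel()                    # one influence score per first-layer unit

    num_inputs = W1.shape[1]
    strengths = {}
    for i in range(num_inputs):
        for j in range(i + 1, num_inputs):
            # a hidden unit can only model an interaction between i and j if it
            # has sizable weights to *both* inputs, hence the elementwise minimum
            strengths[(i, j)] = float(z @ np.minimum(W1[:, i], W1[:, j]))
    return strengths

# toy usage with random weights for a 4-input MLP with two hidden layers
rng = np.random.RandomState(0)
weights = [rng.randn(8, 4), rng.randn(8, 8), rng.randn(1, 8)]
top = sorted(pairwise_nid_strengths(weights).items(), key=lambda kv: -kv[1])
print(top[:3])   # highest-scoring candidate pairs
```

Higher-order candidates are scored analogously, by taking the elementwise minimum over all features in the candidate set before aggregating.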
Show instructions 44 | 45 | 46 | ```bash 47 | cd 1.\ madex/ 48 | ``` 49 | 50 | The following notebooks are available to demo MADEX: 51 | * `madex_example_dna.ipynb` 52 | * `madex_example_graph.ipynb` 53 | * `madex_example_image.ipynb` 54 | * `madex_example_text.ipynb` 55 | 56 |
57 | 58 | ### 2. GLIDER 59 | 60 | **GLIDER (GLobal Interaction Detection and Encoding for Recommendation)** takes MADEX beyond model interpretation on recommendation tasks (or tabular data modeling more broadly). GLIDER detects feature interactions that recur across data instances of a source recommender model, then explicitly encodes those interactions in a target recommender model. This process is a form of automatic feature engineering. 61 | 62 | 
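As a rough illustration of what "explicitly encodes those interactions" means: each detected interaction becomes a single new categorical cross feature whose IDs enumerate the observed combinations of the interacting features' values. The sketch below is a hypothetical simplification (the function name and toy data are made up); the repository's `make_cross_feature_data.py` step additionally handles sparse/dense fields, missing values, and the AutoInt/DeepCTR data formats.

```python
import numpy as np

def encode_cross_feature(X, interaction):
    """Turn a detected interaction (a tuple of column indices) into one
    categorical cross feature: each distinct value combination gets its own ID.

    X: integer-encoded categorical data, shape (n_samples, n_features).
    interaction: e.g. (0, 2) for an interaction between features 0 and 2.
    Returns an (n_samples,) array of cross-feature IDs.
    """
    combos = [tuple(row) for row in X[:, list(interaction)]]
    vocab = {}
    ids = np.empty(len(combos), dtype=np.int64)
    for n, combo in enumerate(combos):
        ids[n] = vocab.setdefault(combo, len(vocab))  # new combo -> new ID
    return ids

# toy usage: a synthetic interaction between columns 0 and 2
X_toy = np.array([[0, 1, 3], [0, 2, 3], [1, 1, 4], [0, 1, 3]])
print(encode_cross_feature(X_toy, (0, 2)))   # [0 0 1 0]
```

The target model can then embed the new ID column like any other sparse feature, which is what the cross feature generation step in the instructions below produces at scale.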
Show instructions 63 | 64 | 65 | ```bash 66 | cd 2.\ glider/ 67 | ``` 68 | 69 | #### A. Data Preparation 70 | 71 | Please follow the instructions in the [AutoInt repo](https://github.com/shichence/AutoInt) to prepare the data splits. 72 | 73 | The same code is also provided in this repo and follows the same series of commands. The Criteo dataset is found [here](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/). Place it in the path `data/autoint/criteo`. 74 | 75 | ```bash 76 | mkdir data/autoint/criteo 77 | python data/initial_data_prep/criteo/preprocess.py 78 | python data/initial_data_prep/kfold_split/stratifiedKfold.py 79 | python data/initial_data_prep/criteo/scale.py 80 | ``` 81 | 82 | #### B. Global Interaction Detection 83 | 84 | First, train a baseline AutoInt model. 85 | 86 | ```bash 87 | python models/autoint/train.py --exp baseline --data data/autoint/criteo --save_path experiments/autoint/criteo/baseline/ --run_times 1 --gpu 0 88 | ``` 89 | 90 | Then, run global interaction detection on this model. 91 | 92 | ```bash 93 | python detect_global_interactions.py --save_path experiments/autoint/criteo/baseline/1/ --data criteo --save_id SAVEID --par_batch_size par_batch_size 94 | ``` 95 | 96 | * `par_batch_size` is the number of data instances to process in parallel. Set it based on the number of CPU processes and the GPU memory available. 97 | * `SAVEID` shows up again later. Use a descriptive identifier. 98 | 99 | 100 | #### C. Cross Feature Generation 101 | 102 | To generate cross features (see the note at the end of these instructions on how missing feature values are handled): 103 | 104 | ```bash 105 | python make_cross_feature_data.py --data_file experiments/detected_interactions_criteo_SAVEID.pickle --exp cross_K40 --K 40 --data criteo --autoint_save_path data/autoint/criteo --deepctr_save_path data/deepctr/criteo --save_base_data true 106 | ``` 107 | 108 | #### D. Train DeepCTR Models 109 | 110 | * Wide&Deep: `WDL` 111 | * DeepFM: `DeepFM` 112 | * Deep&Cross: `DCN` 113 | * xDeepFM: `xDeepFM` 114 | 115 | Baseline: 116 | ```bash 117 | python train_deepctr.py --model WDL --ds criteo --exp baseline --patience 5 --test_id baseline_experiment --gpu 0 118 | ``` 119 | 120 | Baseline + GLIDER (distillation): 121 | ```bash 122 | python train_deepctr.py --model WDL --ds criteo --exp cross --patience 5 --test_id cross_experiment --gpu 0 --d_cross_exp cross_K40 --n_cross 40 123 | ``` 124 | 125 | 126 | #### E. Train AutoInt Models 127 | 128 | 129 | Baseline + GLIDER (enhancement): 130 | ```bash 131 | python models/autoint/train.py --exp cross --data data/autoint/criteo --save_path experiments/autoint/criteo/cross/ --gpu 0 --cross_exp cross_K40 132 | ``` 133 | 134 | 
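A note on missing feature values, visible in the `get_X_cross` routine dumped earlier in this repo: if any interacting sparse feature of a sample has value 0 (i.e., it is missing), the generated cross feature is treated as missing too, because the interacting features' values are multiplied together into a mask that scales the cross feature's value. A toy, hypothetical illustration of that rule (the array names here are made up):

```python
import numpy as np

# toy setup: 3 samples, one detected interaction between two sparse features;
# sample 1 is missing the second feature (its value is 0)
Xv_interacting = np.array([[1.0, 1.0],
                           [1.0, 0.0],
                           [1.0, 1.0]])
cross_value = np.array([1.0, 1.0, 1.0])   # nominal value of the new cross feature

mask = np.ones(Xv_interacting.shape[0])
for col in range(Xv_interacting.shape[1]):
    mask = mask * Xv_interacting[:, col]  # any missing (0) feature zeroes the mask

print(cross_value * mask)   # [1. 0. 1.] -> the cross feature is missing for sample 1
```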
135 | 136 | 137 | ## References 138 | 139 | ``` 140 | @inproceedings{tsang2020feature, 141 | title={Feature Interaction Interpretability: A Case for Explaining Ad-Recommendation Systems via Neural Interaction Detection}, 142 | author={Michael Tsang and Dehua Cheng and Hanpeng Liu and Xue Feng and Eric Zhou and Yan Liu}, 143 | booktitle={International Conference on Learning Representations}, 144 | year={2020}, 145 | url={https://openreview.net/forum?id=BkgnhTEtDS} 146 | } 147 | ``` 148 | 149 | Neural Interaction Detection: 150 | ``` 151 | @article{tsang2017detecting, 152 | title={Detecting Statistical Interactions from Neural Network Weights}, 153 | author={Michael Tsang and Dehua Cheng and Yan Liu}, 154 | journal={arXiv preprint arXiv:1705.04977}, 155 | year={2017} 156 | } 157 | ``` 158 | 159 | 160 | -------------------------------------------------------------------------------- /figures/explanation1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/figures/explanation1.png -------------------------------------------------------------------------------- /figures/explanation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/figures/explanation2.png -------------------------------------------------------------------------------- /figures/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mtsang/interaction_interpretability/02afd5b75b758e179f39c182a27de786b18be416/figures/overview.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | deepctr==0.6.0 2 | future==0.17.1 3 | h5py==2.8.0 4 | jupyterlab==1.2.6 5 | matplotlib==3.1.2 6 | nltk==3.3 7 | numpy==1.17.1 8 | ipywidgets==6.0.0 9 | pandas==0.20.3 10 | protobuf==3.11.2 11 | scikit-image==0.14.0 12 | scikit-learn==0.21.3 13 | scipy==1.3.1 14 | tables==3.5.2 15 | tensorboard==1.14.0 16 | tensorflow-estimator==1.14.0 17 | tensorflow-gpu==1.14.0 18 | torch==1.3.1 19 | torchtext==0.3.1 20 | torchvision==0.4.2 21 | tqdm==4.32.2 22 | transformers==2.4.1 --------------------------------------------------------------------------------