├── .DS_Store ├── CP_NA_Semi-synthetic.ipynb ├── CP_NA_Semi-synthetic_Plots.ipynb ├── CP_NA_Synthetic.ipynb ├── CP_NA_Synthetic_Plots.ipynb ├── LICENSE ├── README.md ├── data ├── .DS_Store └── cqr_datasets │ ├── .DS_Store │ ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb │ ├── CASP.csv │ ├── Concrete_Data.csv │ ├── README.md │ ├── STAR.csv │ ├── Untitled.ipynb │ ├── bike_train.csv │ ├── communities.data │ ├── communities_attributes.csv │ ├── facebook │ ├── Features_Variant_1.csv │ ├── Features_Variant_2.csv │ └── README.md │ ├── meps_19_reg.csv │ ├── meps_20_reg.csv │ └── meps_21_reg.csv ├── datasets.py ├── files.py ├── generation.py ├── imputation.py ├── plots └── .DS_Store ├── prediction.py ├── results └── .DS_Store └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/.DS_Store -------------------------------------------------------------------------------- /CP_NA_Semi-synthetic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import imputation as imp\n", 10 | "import generation as gen\n", 11 | "import prediction\n", 12 | "import utils\n", 13 | "import files\n", 14 | "import os\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "from tqdm.autonotebook import tqdm\n", 18 | "import datasets" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "datasets_name = ['meps_19', 'bio', 'concrete', 'bike']" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "datasets_sizes = {'meps_19': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 37 | " 'meps_20': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 38 | " 'meps_21': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 39 | " 'bio': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 40 | " 'concrete': {'train': 630, 'cal': 200, 'test_pattern': 100},\n", 41 | " 'bike': {'train': 1000, 'cal': 500, 'test_pattern': 100}}" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "n_rep = 100\n", 51 | "alpha = 0.1" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "prob_missing = 0.2" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "imputation = 'iterative_ridge'\n", 70 | "\n", 71 | "methods = ['QR', 'QR_TrainCal', 'CQR', 'CQR_MDA']\n", 72 | "basemodels = ['NNet']\n", 73 | "masks = ['Yes']\n", 74 | "protections = ['No']\n", 75 | "exacts = [False, True]\n", 76 | "\n", 77 | "cores = 1\n", 78 | "\n", 79 | "params_basemodel = {'cores':cores}" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "dataset_base_path = \"./data/cqr_datasets/\"" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "for dataset_name in tqdm(datasets_name):\n", 98 | " \n", 99 | " df, 
target, var_missing = datasets.GetDataset(dataset_name, dataset_base_path)\n", 100 | " \n", 101 | " params_missing = {}\n", 102 | " params_missing['var_missing'] = var_missing\n", 103 | " params_missing['prob_missing'] = prob_missing\n", 104 | " \n", 105 | " d = df.shape[1]-1\n", 106 | " \n", 107 | " if dataset_name == 'concrete':\n", 108 | " nb_sample_pattern = datasets_sizes[dataset_name]['test_pattern']\n", 109 | " params_test = {'iid':{'test_size': 200}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n", 110 | " else:\n", 111 | " nb_sample_pattern = datasets_sizes[dataset_name]['test_pattern']\n", 112 | " params_test = {'iid':{'test_size': 2000}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n", 113 | " params_test = gen.process_test(params_test, d=d, params_missing=params_missing)\n", 114 | " \n", 115 | " max_test_size = np.max(params_test['test_size'])\n", 116 | " \n", 117 | " train_size = datasets_sizes[dataset_name]['train']\n", 118 | " cal_size = datasets_sizes[dataset_name]['cal']\n", 119 | "\n", 120 | " name = files.get_name_data(train_size, cal_size, params_test, \n", 121 | " dataset=dataset_name, params_missing=params_missing, seed=n_rep)\n", 122 | " \n", 123 | " if os.path.isfile('data/'+name+'.xz'):\n", 124 | " print('data found')\n", 125 | " data = files.load_file('data', name, 'xz')\n", 126 | " else:\n", 127 | " print('data not found')\n", 128 | " X, X_missing, M, Y = gen.generate_multiple_real_data_MCAR(df, target, train_size=train_size, \n", 129 | " cal_size=cal_size, params_test=params_test,\n", 130 | " params_missing=params_missing, seed_max=n_rep)\n", 131 | " data = {'X': X, 'X_missing': X_missing, 'M': M,'Y': Y}\n", 132 | " files.write_file('data', name, 'xz', data)\n", 133 | " \n", 134 | " name_imputed = files.get_name_data_imputed(train_size, cal_size, params_test, imputation=imputation,\n", 135 | " dataset=dataset_name, params_missing=params_missing, seed=n_rep)\n", 136 | "\n", 137 | " if os.path.isfile('data/'+name_imputed+'.pkl'):\n", 138 | " print('imputation found')\n", 139 | " X_imp = files.load_file('data', name_imputed, 'pkl')\n", 140 | " else:\n", 141 | " print('imputation not found')\n", 142 | " if imputation == 'complete':\n", 143 | " X_imp = data['X']\n", 144 | " else:\n", 145 | " X_imp = imp.impute(data, imputation)\n", 146 | " files.write_file('data', name_imputed, 'pkl', X_imp)\n", 147 | " data_imputed = {'X': data['X'], 'X_missing': data['X_missing'], 'X_imp': X_imp, 'M': data['M'],'Y': data['Y']}\n", 148 | "\n", 149 | " \n", 150 | " results, methods_ran = prediction.run_experiments(data_imputed, alpha=alpha, methods=methods,\n", 151 | " basemodels=basemodels, params_basemodel=params_basemodel,\n", 152 | " masks=masks, protections=protections, \n", 153 | " exacts=exacts, imputation=imputation)\n", 154 | "\n", 155 | " for method in methods_ran:\n", 156 | " name_dir, name_method = files.get_name_results(method, train_size, cal_size, n_rep, \n", 157 | " dataset=dataset_name, imputation=imputation,\n", 158 | " params_missing=params_missing)\n", 159 | " \n", 160 | " results_method = results[method]\n", 161 | " files.write_file('results/'+name_dir, name_method, 'xz', results_method)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | 
"codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.8.5" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 4 193 | } 194 | -------------------------------------------------------------------------------- /CP_NA_Semi-synthetic_Plots.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import generation as gen\n", 10 | "import utils\n", 11 | "import files\n", 12 | "import os\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "from tqdm.autonotebook import tqdm\n", 16 | "import datasets\n", 17 | "import copy" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import matplotlib.pyplot as plt\n", 27 | "import seaborn as sns\n", 28 | "import matplotlib as mpl\n", 29 | "from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset\n", 30 | "from matplotlib.backends.backend_pgf import FigureCanvasPgf\n", 31 | "mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)\n", 32 | "import matplotlib.lines as mlines" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "size=19\n", 42 | "mpl.rcParams.update({\n", 43 | " \"pgf.texsystem\": \"pdflatex\",\n", 44 | " 'font.family': 'serif',\n", 45 | " 'font.serif': 'Times',\n", 46 | " 'text.usetex': True,\n", 47 | " 'pgf.rcfonts': False,\n", 48 | " 'font.size': size,\n", 49 | " 'axes.labelsize':size,\n", 50 | " 'axes.titlesize':size,\n", 51 | " 'figure.titlesize':size,\n", 52 | " 'xtick.labelsize':size,\n", 53 | " 'ytick.labelsize':size,\n", 54 | " 'legend.fontsize':size,\n", 55 | "})" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "datasets_names = ['meps_19', 'bio', 'concrete', 'bike']\n", 65 | "dataset_base_path = \"./data/cqr_datasets/\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "datasets_sizes_normal = {'meps_19': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 75 | " 'meps_20': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 76 | " 'meps_21': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 77 | " 'bio': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n", 78 | " 'concrete': {'train': 630, 'cal': 200, 'test_pattern': 100},\n", 79 | " 'bike': {'train': 1000, 'cal': 500, 'test_pattern': 100}}\n", 80 | "\n", 81 | "datasets_sizes_small = {'meps_19': {'train': 500, 'cal': 250, 'test_pattern': 100},\n", 82 | " 'meps_20': {'train': 500, 'cal': 250, 'test_pattern': 100},\n", 83 | " 'meps_21': {'train': 500, 'cal': 250, 'test_pattern': 100},\n", 84 | " 'bio': {'train': 500, 'cal': 250, 'test_pattern': 100},\n", 85 | " 'concrete': {'train': 330, 'cal': 100, 'test_pattern': 100},\n", 86 | " 'bike': {'train': 500, 'cal': 250, 'test_pattern': 100}}" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "n_rep = 100\n", 96 | "alpha = 0.1" 97 | ] 98 | }, 99 | { 100 | 
"cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "prob_missing = 0.2" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "imputation = 'iterative_ridge'\n", 115 | "mask = 'Yes'\n", 116 | "protection = 'No'" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "methods = ['CQR', 'CQR_MDA']#QR_TrainCal\n", 126 | "basemodel = 'NNet'\n", 127 | "exacts = [False, True]" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "name_pipelines = []\n", 137 | "for method in methods: \n", 138 | " if method == 'CQR_MDA':\n", 139 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=True)\n", 140 | " if not name_temp in name_pipelines:\n", 141 | " name_pipelines.append(name_temp)\n", 142 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=False)\n", 143 | " if not name_temp in name_pipelines:\n", 144 | " name_pipelines.append(name_temp)\n", 145 | " \n", 146 | "current_pipeline = method+'_'+basemodel" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "if mask == 'Yes':\n", 156 | " dict_methods = {'QR_TrainCal_NNet_Mask': 'QR',\n", 157 | " 'CQR_NNet_Mask': 'CQR', \n", 158 | " 'CQR_MDA_Exact_NNet_Mask': 'CQR-MDA-Exact',\n", 159 | " 'CQR_MDA_Nested_NNet_Mask': 'CQR-MDA-Nested'}\n", 160 | "else:\n", 161 | " dict_methods = {'QR_TrainCal_NNet': 'QR',\n", 162 | " 'CQR_NNet': 'CQR', \n", 163 | " 'CQR_MDA_Exact_NNet': 'CQR-MDA-Exact',\n", 164 | " 'CQR_MDA_Nested_NNet': 'CQR-MDA-Nested'}" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "#sizes = ['small', 'normal']\n", 174 | "sizes = ['normal']\n", 175 | " \n", 176 | "dict_cov = dict.fromkeys(datasets_names)\n", 177 | "dict_len = dict.fromkeys(datasets_names)\n", 178 | "dict_cov_patterns = dict.fromkeys(datasets_names)\n", 179 | "dict_len_patterns = dict.fromkeys(datasets_names)\n", 180 | "dict_cov_worst = dict.fromkeys(datasets_names)\n", 181 | "dict_len_worst = dict.fromkeys(datasets_names)\n", 182 | "dict_cov_best = dict.fromkeys(datasets_names)\n", 183 | "dict_len_best = dict.fromkeys(datasets_names)\n", 184 | "\n", 185 | "for dataset_name in datasets_names:\n", 186 | "\n", 187 | " dict_cov[dataset_name] = dict.fromkeys(sizes)\n", 188 | " dict_len[dataset_name] = dict.fromkeys(sizes)\n", 189 | " dict_cov_patterns[dataset_name] = dict.fromkeys(sizes)\n", 190 | " dict_len_patterns[dataset_name] = dict.fromkeys(sizes)\n", 191 | " dict_cov_worst[dataset_name] = dict.fromkeys(sizes)\n", 192 | " dict_len_worst[dataset_name] = dict.fromkeys(sizes)\n", 193 | " dict_cov_best[dataset_name] = dict.fromkeys(sizes)\n", 194 | " dict_len_best[dataset_name] = dict.fromkeys(sizes)\n", 195 | "\n", 196 | " for size in sizes:\n", 197 | "\n", 198 | " dict_cov[dataset_name][size] = dict.fromkeys(name_pipelines)\n", 199 | " dict_len[dataset_name][size] = dict.fromkeys(name_pipelines)\n", 200 | " dict_cov_patterns[dataset_name][size] = dict.fromkeys(name_pipelines)\n", 201 | " dict_len_patterns[dataset_name][size] = 
dict.fromkeys(name_pipelines)\n", 202 | " dict_cov_worst[dataset_name][size] = dict.fromkeys(name_pipelines)\n", 203 | " dict_len_worst[dataset_name][size] = dict.fromkeys(name_pipelines)\n", 204 | " dict_cov_best[dataset_name][size] = dict.fromkeys(name_pipelines)\n", 205 | " dict_len_best[dataset_name][size] = dict.fromkeys(name_pipelines)\n", 206 | "\n", 207 | " for pipeline in name_pipelines:\n", 208 | " dict_cov_patterns[dataset_name][size][pipeline] = {}\n", 209 | " dict_len_patterns[dataset_name][size][pipeline] = {}" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "for dataset_name in datasets_names:\n", 219 | " \n", 220 | " print(dataset_name)\n", 221 | " \n", 222 | " df, target, var_missing = datasets.GetDataset(dataset_name, dataset_base_path)\n", 223 | "\n", 224 | " d = df.shape[1]-1\n", 225 | "\n", 226 | " params_missing = {}\n", 227 | " params_missing['var_missing'] = var_missing\n", 228 | " params_missing['prob_missing'] = prob_missing\n", 229 | "\n", 230 | " for size in sizes:\n", 231 | " \n", 232 | " if size == 'normal':\n", 233 | " train_size = datasets_sizes_normal[dataset_name]['train']\n", 234 | " cal_size = datasets_sizes_normal[dataset_name]['cal']\n", 235 | " \n", 236 | " if dataset_name == 'concrete':\n", 237 | " nb_sample_pattern = datasets_sizes_normal[dataset_name]['test_pattern']\n", 238 | " params_test = {'iid':{'test_size': 200}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n", 239 | " else:\n", 240 | " nb_sample_pattern = datasets_sizes_normal[dataset_name]['test_pattern']\n", 241 | " params_test = {'iid':{'test_size': 2000}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n", 242 | " params_test = gen.process_test(params_test, d=d, params_missing=params_missing)\n", 243 | "\n", 244 | " max_test_size = np.max(params_test['test_size'])\n", 245 | " \n", 246 | " elif size == 'small':\n", 247 | " train_size = datasets_sizes_small[dataset_name]['train']\n", 248 | " cal_size = datasets_sizes_small[dataset_name]['cal']\n", 249 | " \n", 250 | " if dataset_name == 'concrete':\n", 251 | " nb_sample_pattern = datasets_sizes_small[dataset_name]['test_pattern']\n", 252 | " params_test = {'iid':{'test_size': 200}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n", 253 | " else:\n", 254 | " nb_sample_pattern = datasets_sizes_small[dataset_name]['test_pattern']\n", 255 | " params_test = {'iid':{'test_size': 2000}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n", 256 | " params_test = gen.process_test(params_test, d=d, params_missing=params_missing)\n", 257 | "\n", 258 | " max_test_size = np.max(params_test['test_size'])\n", 259 | "\n", 260 | " name_method = []\n", 261 | "\n", 262 | " for pipeline in name_pipelines:\n", 263 | "\n", 264 | " name_method = np.append(name_method, '_'.join([imputation, pipeline]))\n", 265 | "\n", 266 | " key = -1\n", 267 | "\n", 268 | " data, results = utils.get_data_results(pipeline, train_size, cal_size, params_test, n_rep, imputation=imputation,\n", 269 | " dataset = dataset_name,\n", 270 | " params_missing=params_missing,\n", 271 | " parent_results='results', parent_data='data', extension='xz')\n", 272 | "\n", 273 | " contains, lengths = utils.compute_PI_metrics(data, results, 'iid')\n", 274 | "\n", 275 | " metrics = utils.compute_metrics_cond(n_rep, data, results, 'fixed_nb_sample_pattern', cond='Pattern')\n", 276 | "\n", 277 | " 
dict_cov[dataset_name][size][pipeline] = np.mean(contains, axis=1)\n", 278 | " dict_len[dataset_name][size][pipeline] = np.mean(lengths, axis=1)\n", 279 | "\n", 280 | " for key_pattern in list(metrics.keys()):\n", 281 | "\n", 282 | " dict_cov_patterns[dataset_name][size][pipeline][key_pattern] = metrics[key_pattern]['avg_cov']\n", 283 | " dict_len_patterns[dataset_name][size][pipeline][key_pattern] = metrics[key_pattern]['avg_len']" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "for dataset_name in datasets_names:\n", 293 | " \n", 294 | " for size in sizes:\n", 295 | " \n", 296 | " for pipeline in name_pipelines:\n", 297 | "\n", 298 | " avg_cov = dict.fromkeys(dict_cov_patterns[dataset_name][size][pipeline].keys())\n", 299 | "\n", 300 | " for key in list(avg_cov.keys()):\n", 301 | " avg_cov[key] = np.mean(dict_cov_patterns[dataset_name][size][pipeline][key])\n", 302 | "\n", 303 | " worst_index = np.argmin(list(avg_cov.values()))\n", 304 | " worst_key = list(avg_cov.keys())[worst_index]\n", 305 | " dict_cov_worst[dataset_name][size][pipeline] = dict_cov_patterns[dataset_name][size][pipeline][worst_key]\n", 306 | " dict_len_worst[dataset_name][size][pipeline] = dict_len_patterns[dataset_name][size][pipeline][worst_key]\n", 307 | "\n", 308 | " best_index = np.argmax(list(avg_cov.values()))\n", 309 | " best_key = list(avg_cov.keys())[best_index]\n", 310 | " dict_cov_best[dataset_name][size][pipeline] = dict_cov_patterns[dataset_name][size][pipeline][best_key]\n", 311 | " dict_len_best[dataset_name][size][pipeline] = dict_len_patterns[dataset_name][size][pipeline][best_key] " 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "colors_blindness = sns.color_palette(\"colorblind\")\n", 321 | "colors_blindness" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15,4.3))\n", 331 | "axes = {'meps_19': ax1, 'bio': ax2, 'concrete': ax3, 'bike': ax4}\n", 332 | "\n", 333 | "dict_markers = {'iid': 'd',\n", 334 | " 'worst': \"v\", \n", 335 | " 'best': \"^\"}\n", 336 | "\n", 337 | "dict_datasets = {'meps_19': r'\\texttt{meps_19} ($d=139$, $l=5$)','meps_20': r'\\texttt{meps} ($d=139$, $l=5$)',\n", 338 | " 'meps_21': r'\\texttt{meps} ($d=139$, $l=5$)','bio': r'\\texttt{bio} ($d=9$, $l=9$)',\n", 339 | " 'concrete': r'\\texttt{concrete} ($d=8$, $l=8$)','bike': r'\\texttt{bike} ($d=18$, $l=4$)'}\n", 340 | "alphas_meps = {'meps_19': 1, 'meps_20':0.7, 'meps_21': 0.4}\n", 341 | "dict_colors = {'QR_TrainCal_NNet_Mask': colors_blindness[2],\n", 342 | " 'CQR_NNet_Mask': colors_blindness[1], \n", 343 | " 'CQR_MDA_Exact_NNet_Mask': colors_blindness[4],\n", 344 | " 'CQR_MDA_Nested_NNet_Mask': colors_blindness[9]}\n", 345 | "\n", 346 | "dict_coverages = {'iid': 'Marginal', 'worst': 'Lowest', 'best': 'Highest'}\n", 347 | "\n", 348 | "marker_size = 60\n", 349 | "\n", 350 | "small = False\n", 351 | "medium = True\n", 352 | "\n", 353 | "name_pipelines_to_plot = name_pipelines\n", 354 | " \n", 355 | "for dataset_name in datasets_names:\n", 356 | " \n", 357 | " ax = axes[dataset_name]\n", 358 | " ax.set_title(dict_datasets[dataset_name])\n", 359 | "\n", 360 | " alpha_data = 1\n", 361 | "\n", 362 | " for pipeline in name_pipelines_to_plot:\n", 363 | "\n", 364 | " if 
medium:\n", 365 | " ax.scatter(np.mean(dict_cov[dataset_name]['normal'][pipeline]),\n", 366 | " np.mean(dict_len[dataset_name]['normal'][pipeline]), \n", 367 | " marker=dict_markers['iid'], color=dict_colors[pipeline],s=marker_size,alpha=alpha_data)\n", 368 | " ax.errorbar(np.mean(dict_cov[dataset_name]['normal'][pipeline]), \n", 369 | " np.mean(dict_len[dataset_name]['normal'][pipeline]),\n", 370 | " xerr=np.std(dict_cov[dataset_name]['normal'][pipeline])/np.sqrt(n_rep),\n", 371 | " yerr=np.std(dict_len[dataset_name]['normal'][pipeline])/np.sqrt(n_rep), \n", 372 | " color=dict_colors[pipeline], alpha=0.3)\n", 373 | " ax.scatter(np.mean(dict_cov_worst[dataset_name]['normal'][pipeline]),\n", 374 | " np.mean(dict_len_worst[dataset_name]['normal'][pipeline]), \n", 375 | " marker=dict_markers['worst'], color=dict_colors[pipeline],s=marker_size,alpha=alpha_data)\n", 376 | " ax.errorbar(np.mean(dict_cov_worst[dataset_name]['normal'][pipeline]), \n", 377 | " np.mean(dict_len_worst[dataset_name]['normal'][pipeline]),\n", 378 | " xerr=np.std(dict_cov_worst[dataset_name]['normal'][pipeline])/np.sqrt(n_rep),\n", 379 | " yerr=np.std(dict_len_worst[dataset_name]['normal'][pipeline])/np.sqrt(n_rep), \n", 380 | " color=dict_colors[pipeline], alpha=0.3)\n", 381 | " ax.scatter(np.mean(dict_cov_best[dataset_name]['normal'][pipeline]),\n", 382 | " np.mean(dict_len_best[dataset_name]['normal'][pipeline]), \n", 383 | " marker=dict_markers['best'], color=dict_colors[pipeline],s=marker_size,alpha=alpha_data)\n", 384 | " ax.errorbar(np.mean(dict_cov_best[dataset_name]['normal'][pipeline]), \n", 385 | " np.mean(dict_len_best[dataset_name]['normal'][pipeline]),\n", 386 | " xerr=np.std(dict_cov_best[dataset_name]['normal'][pipeline])/np.sqrt(n_rep),\n", 387 | " yerr=np.std(dict_len_best[dataset_name]['normal'][pipeline])/np.sqrt(n_rep), \n", 388 | " color=dict_colors[pipeline], alpha=0.3)\n", 389 | "\n", 390 | "\n", 391 | " if small:\n", 392 | " ax.scatter(np.mean(dict_cov[dataset_name]['small'][pipeline]),\n", 393 | " np.mean(dict_len[dataset_name]['small'][pipeline]), \n", 394 | " marker=dict_markers['iid'], color=dict_colors[pipeline],s=marker_size, facecolors='none',alpha=alpha_data)\n", 395 | " ax.errorbar(np.mean(dict_cov[dataset_name]['small'][pipeline]), \n", 396 | " np.mean(dict_len[dataset_name]['small'][pipeline]),\n", 397 | " xerr=np.std(dict_cov[dataset_name]['small'][pipeline])/np.sqrt(n_rep),\n", 398 | " yerr=np.std(dict_len[dataset_name]['small'][pipeline])/np.sqrt(n_rep), \n", 399 | " color=dict_colors[pipeline], alpha=0.3)\n", 400 | " ax.scatter(np.mean(dict_cov_worst[dataset_name]['small'][pipeline]),\n", 401 | " np.mean(dict_len_worst[dataset_name]['small'][pipeline]), \n", 402 | " marker=dict_markers['worst'], color=dict_colors[pipeline],s=marker_size, facecolors='none',alpha=alpha_data)\n", 403 | " ax.errorbar(np.mean(dict_cov_worst[dataset_name]['small'][pipeline]), \n", 404 | " np.mean(dict_len_worst[dataset_name]['small'][pipeline]),\n", 405 | " xerr=np.std(dict_cov_worst[dataset_name]['small'][pipeline])/np.sqrt(n_rep),\n", 406 | " yerr=np.std(dict_len_worst[dataset_name]['small'][pipeline])/np.sqrt(n_rep), \n", 407 | " color=dict_colors[pipeline], alpha=0.3)\n", 408 | " ax.scatter(np.mean(dict_cov_best[dataset_name]['small'][pipeline]),\n", 409 | " np.mean(dict_len_best[dataset_name]['small'][pipeline]), \n", 410 | " marker=dict_markers['best'], color=dict_colors[pipeline],s=marker_size, facecolors='none',alpha=alpha_data)\n", 411 | " 
ax.errorbar(np.mean(dict_cov_best[dataset_name]['small'][pipeline]), \n", 412 | " np.mean(dict_len_best[dataset_name]['small'][pipeline]),\n", 413 | " xerr=np.std(dict_cov_best[dataset_name]['small'][pipeline])/np.sqrt(n_rep),\n", 414 | " yerr=np.std(dict_len_best[dataset_name]['small'][pipeline])/np.sqrt(n_rep), \n", 415 | " color=dict_colors[pipeline], alpha=0.3)\n", 416 | "\n", 417 | "\n", 418 | "for ax in [ax1,ax2,ax3,ax4]:\n", 419 | " ax.axvline(x=1-alpha, color='black', ls=':')\n", 420 | " ax.set_xlabel(\"Average coverage\")\n", 421 | "\n", 422 | " \n", 423 | "ax1.set_ylabel(\"Average length\")\n", 424 | "\n", 425 | "\n", 426 | "# Methods legend\n", 427 | "\n", 428 | "handles = []\n", 429 | "names = list( map(dict_methods.get, name_pipelines_to_plot) )\n", 430 | "for idc,color in enumerate(list( map(dict_colors.get, name_pipelines_to_plot) )):\n", 431 | " handles.append(mlines.Line2D([], [], color=color, marker='o', linestyle='None', markersize=8))\n", 432 | "\n", 433 | "if mask == 'Yes':\n", 434 | " fig.legend(handles, names, ncol=4, bbox_to_anchor=(0.63,0.13),handletextpad=0.1, \n", 435 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n", 436 | "else:\n", 437 | " fig.legend(handles, names, ncol=4, bbox_to_anchor=(0.63,0.1),handletextpad=0.1, \n", 438 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n", 439 | " \n", 440 | "# Coverage legend\n", 441 | "\n", 442 | "handles = []\n", 443 | "labels = []\n", 444 | "for cov in list(dict_coverages.keys()):\n", 445 | " handles.append(mlines.Line2D([], [], color='black', marker=dict_markers[cov], linestyle='None', markersize=8))\n", 446 | " labels.append(dict_coverages[cov])\n", 447 | "\n", 448 | "if mask == 'Yes':\n", 449 | " fig.legend(handles, labels, bbox_to_anchor=(0.95, 0.13),ncol=3,handletextpad=0.1, \n", 450 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n", 451 | "else:\n", 452 | " fig.legend(handles, labels, bbox_to_anchor=(0.95, 0.1),ncol=3,handletextpad=0.1, \n", 453 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n", 454 | "\n", 455 | "\n", 456 | "fig.tight_layout()\n", 457 | "\n", 458 | "name_plot = 'plots/semi_synth'\n", 459 | "\n", 460 | "if mask == 'Yes':\n", 461 | " plt.savefig(name_plot+'.pdf',bbox_inches='tight', dpi=300)\n", 462 | "else:\n", 463 | " plt.savefig(name_plot+'_nomask.pdf',bbox_inches='tight', dpi=300)\n", 464 | "\n", 465 | "plt.show()" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "Python 3", 479 | "language": "python", 480 | "name": "python3" 481 | }, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": "ipython3", 492 | "version": "3.8.5" 493 | } 494 | }, 495 | "nbformat": 4, 496 | "nbformat_minor": 4 497 | } 498 | -------------------------------------------------------------------------------- /CP_NA_Synthetic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import generation as gen\n", 10 | "import imputation as imp\n", 11 | "import prediction\n", 12 
| "import utils\n", 13 | "import files\n", 14 | "import os\n", 15 | "import numpy as np\n", 16 | "from tqdm.autonotebook import tqdm" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "alpha = 0.1\n", 26 | "d = 10\n", 27 | "phi = 0.8\n", 28 | "regression = 'Linear'\n", 29 | "n_rep = 100\n", 30 | "beta = np.array([1, 2, -1, 3, -0.5, -1, 0.3, 1.7, 0.4, -0.3])\n", 31 | "\n", 32 | "train_size = 500\n", 33 | "cal_size = 250\n", 34 | "params_test = {'iid':{'test_size': 2000}, \n", 35 | " 'fixed_nb_sample_pattern':{'nb_sample_pattern': 100}, \n", 36 | " 'fixed_nb_sample_pattern_size':{'nb_sample_pattern': 100}}\n", 37 | "params_test = gen.process_test(params_test, d=d)\n", 38 | "\n", 39 | "params_reg = {'regression':regression, 'beta': beta, 'phi': phi}\n", 40 | "\n", 41 | "params_noise = {'noise':'Gaussian'}\n", 42 | "\n", 43 | "prob_missing = 0.2\n", 44 | "var_missing = np.full(d, 1)\n", 45 | "params_missing = {'prob_missing':prob_missing, 'var_missing':var_missing, 'mechanism': 'MCAR'}\n", 46 | "\n", 47 | "imputations = np.array(['iterative_ridge'])\n", 48 | "\n", 49 | "methods = ['QR', 'QR_TrainCal', 'CQR', 'CQR_MDA']\n", 50 | "basemodels = ['NNet']\n", 51 | "masks = ['Yes']\n", 52 | "protections = ['No']#, 'Pattern', 'Pattern_size']\n", 53 | "exacts = [False, True]\n", 54 | "\n", 55 | "cores = 1\n", 56 | "\n", 57 | "params_basemodel = {'cores':cores}" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "name = files.get_name_data(train_size, cal_size, params_test, dim=d,\n", 67 | " params_reg=params_reg, params_noise=params_noise,\n", 68 | " params_missing=params_missing, seed=n_rep)\n", 69 | "\n", 70 | "if os.path.isfile('data/'+name+'.xz'):\n", 71 | " print('data found')\n", 72 | " data = files.load_file('data', name, 'xz')\n", 73 | "else:\n", 74 | " print('data not found')\n", 75 | " X, X_missing, M, Y, params_missing = gen.generate_multiple_data(train_size, cal_size, params_test, n_rep=n_rep, dim=d, \n", 76 | " params_reg=params_reg, params_noise=params_noise,\n", 77 | " params_missing=params_missing)\n", 78 | " data = {'X': X, 'X_missing': X_missing, 'M': M,'Y': Y}\n", 79 | " files.write_file('data', name, 'xz', data)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "for imputation in tqdm(imputations):\n", 89 | "\n", 90 | " name_imputed = files.get_name_data_imputed(train_size, cal_size, params_test, imputation,\n", 91 | " dim=d, \n", 92 | " params_reg=params_reg, params_noise=params_noise,\n", 93 | " params_missing=params_missing, seed=n_rep)\n", 94 | "\n", 95 | " if os.path.isfile('data/'+name_imputed+'.xz'):\n", 96 | " print('imputation found')\n", 97 | " X_imp = files.load_file('data', name_imputed, 'xz')\n", 98 | " else:\n", 99 | " print('imputation not found')\n", 100 | " if imputation == 'complete':\n", 101 | " X_imp = data['X']\n", 102 | " else:\n", 103 | " X_imp = imp.impute(data, imputation)\n", 104 | " files.write_file('data', name_imputed, 'xz', X_imp)\n", 105 | " data_imputed = {'X': data['X'], 'X_missing': data['X_missing'], 'X_imp': X_imp, 'M': data['M'],'Y': data['Y']}\n", 106 | "\n", 107 | "\n", 108 | "\n", 109 | " results, methods_ran = prediction.run_experiments(data_imputed, alpha=alpha, methods=methods,\n", 110 | " basemodels=basemodels, params_basemodel=params_basemodel,\n", 111 | " masks=masks, 
protections=protections, \n", 112 | " exacts=exacts, imputation=imputation,\n", 113 | " params_reg=params_reg)#, params_noise=params_noise)\n", 114 | "\n", 115 | " for method in methods_ran:\n", 116 | " name_dir, name_method = files.get_name_results(method, train_size, cal_size, n_rep, d=d, imputation=imputation,\n", 117 | " params_reg=params_reg, params_noise=params_noise, params_missing=params_missing)\n", 118 | " results_method = results[method]\n", 119 | " files.write_file('results/'+name_dir, name_method, 'xz', results_method)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 3", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.8.5" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 4 151 | } 152 | -------------------------------------------------------------------------------- /CP_NA_Synthetic_Plots.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import generation as gen\n", 10 | "import prediction as prediction\n", 11 | "import utils\n", 12 | "import files\n", 13 | "import os\n", 14 | "import numpy as np\n", 15 | "from tqdm.autonotebook import tqdm" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import matplotlib as mpl\n", 25 | "from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset\n", 26 | "from matplotlib.backends.backend_pgf import FigureCanvasPgf\n", 27 | "mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import matplotlib.lines as mlines\n", 30 | "import pandas as pd\n", 31 | "from matplotlib import cm" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "size=19\n", 41 | "mpl.rcParams.update({\n", 42 | " \"pgf.texsystem\": \"pdflatex\",\n", 43 | " 'font.family': 'serif',\n", 44 | " 'font.serif': 'Times',\n", 45 | " 'text.usetex': True,\n", 46 | " 'pgf.rcfonts': False,\n", 47 | " 'font.size': size,\n", 48 | " 'axes.labelsize':size,\n", 49 | " 'axes.titlesize':size,\n", 50 | " 'figure.titlesize':size,\n", 51 | " 'xtick.labelsize':size,\n", 52 | " 'ytick.labelsize':size,\n", 53 | " 'legend.fontsize':size,\n", 54 | "})" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "alpha = 0.1\n", 64 | "d = 10\n", 65 | "phi = 0.8\n", 66 | "regression = 'Linear'\n", 67 | "n_rep = 100\n", 68 | "beta = np.array([1, 2, -1, 3, -0.5, -1, 0.3, 1.7, 0.4, -0.3])\n", 69 | "\n", 70 | "train_size = 500\n", 71 | "cal_size = 250\n", 72 | "params_test = {'iid':{'test_size': 2000}, \n", 73 | " 'fixed_nb_sample_pattern':{'nb_sample_pattern': 100}, \n", 74 | " 'fixed_nb_sample_pattern_size':{'nb_sample_pattern': 100}}\n", 75 | "params_test = gen.process_test(params_test, d=d)\n", 76 | 
"\n", 77 | "params_reg = {'regression':regression, 'beta': beta, 'phi': phi}\n", 78 | "\n", 79 | "params_noise = {'noise':'Gaussian'}\n", 80 | "\n", 81 | "prob_missing = 0.2\n", 82 | "var_missing = np.full(d, 1)\n", 83 | "params_missing = {'prob_missing':prob_missing, 'var_missing':var_missing, 'mechanism': 'MCAR'}" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "methods = ['QR_TrainCal','CQR','CQR_MDA']\n", 93 | "\n", 94 | "basemodel = 'NNet'\n", 95 | "mask = 'Yes'\n", 96 | "protection = 'No'\n", 97 | "imputation = 'iterative_ridge'\n", 98 | "\n", 99 | "name_pipeline_to_plot = []\n", 100 | "for method in methods: \n", 101 | " if method == 'CQR_MDA':\n", 102 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=True)\n", 103 | " if not name_temp in name_pipeline_to_plot:\n", 104 | " name_pipeline_to_plot.append(name_temp)\n", 105 | " \n", 106 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=False)\n", 107 | " if not name_temp in name_pipeline_to_plot:\n", 108 | " name_pipeline_to_plot.append(name_temp)\n", 109 | " \n", 110 | "current_pipeline = method+'_'+basemodel" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "dict_cov = dict.fromkeys(name_pipeline_to_plot)\n", 120 | "dict_len = dict.fromkeys(name_pipeline_to_plot)\n", 121 | "\n", 122 | "for pipeline in name_pipeline_to_plot:\n", 123 | " dict_cov[pipeline] = {}\n", 124 | " dict_len[pipeline] = {}" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "keys_pattern = np.arange(d)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "impute_inf = True\n", 143 | "replace_inf = True" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "key = -1\n", 153 | "\n", 154 | "nb_boxplot = len(keys_pattern)+1\n", 155 | "\n", 156 | "name_method = []\n", 157 | "\n", 158 | "for pipeline in tqdm(name_pipeline_to_plot):\n", 159 | "\n", 160 | " name_method = np.append(name_method, '_'.join([imputation, pipeline]))\n", 161 | "\n", 162 | " data, results = utils.get_data_results(pipeline, train_size, cal_size, params_test, n_rep, d=d, imputation=imputation,\n", 163 | " params_reg=params_reg, params_noise=params_noise, params_missing=params_missing,\n", 164 | " parent_results='results', parent_data='data', extension='xz')\n", 165 | "\n", 166 | " contains, lengths = utils.compute_PI_metrics(data, results, 'iid')\n", 167 | " \n", 168 | " if replace_inf:\n", 169 | " max_y_train = np.max(data['Y']['Train'], axis=1)\n", 170 | " max_y_cal = np.max(data['Y']['Cal'], axis=1)\n", 171 | " min_y_train = np.min(data['Y']['Train'], axis=1)\n", 172 | " min_y_cal = np.min(data['Y']['Cal'], axis=1)\n", 173 | " max_length_traincal = np.maximum(max_y_train, max_y_cal)-np.minimum(min_y_train, min_y_cal)\n", 174 | " for k in range(n_rep):\n", 175 | " idx_inf = np.where(np.isinf(lengths[k,:]))[0]\n", 176 | " if len(idx_inf)>0:\n", 177 | " lengths[k,:][idx_inf] = max_length_traincal[k]\n", 178 | " \n", 179 | " metrics = utils.compute_metrics_cond(n_rep, data, results, 'fixed_nb_sample_pattern_size', 
cond='Pattern_Size',\n", 180 | " replace_inf=replace_inf)\n", 181 | " \n", 182 | " dict_cov[pipeline][key] = np.mean(contains, axis=1)\n", 183 | " dict_len[pipeline][key] = np.mean(lengths, axis=1)\n", 184 | "\n", 185 | " #key += 1\n", 186 | "\n", 187 | " for key_pattern in keys_pattern:\n", 188 | "\n", 189 | " dict_cov[pipeline][key_pattern] = metrics[key_pattern]['avg_cov']\n", 190 | " dict_len[pipeline][key_pattern] = metrics[key_pattern]['avg_len']\n", 191 | "\n", 192 | " #key += 1" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "import functools" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "if 'phi' in params_reg:\n", 211 | " phi = params_reg['phi']\n", 212 | "else:\n", 213 | " phi = 0.8\n", 214 | "cov = np.full((d,d),phi)+(1-phi)*np.eye(d)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "len_oracle_marginal = []\n", 224 | "M_test = data['M']['Test']['iid']\n", 225 | "test_size = M_test.shape[1]\n", 226 | "for i in range(n_rep):\n", 227 | " \n", 228 | " M_test_i = M_test[i,:,:]\n", 229 | " patterns = np.unique(M_test_i, axis=0)\n", 230 | " oracles_len_per_pattern = list(map(functools.partial(prediction.oracle_len_pattern, beta=beta, cov=cov, alpha=0.1), patterns))\n", 231 | "\n", 232 | " len_oracle = np.empty(test_size)\n", 233 | " \n", 234 | " for idp, pattern in enumerate(patterns):\n", 235 | " pattern_id = utils.pattern_to_id(pattern.astype(int))\n", 236 | " M_test_id = list(map(utils.pattern_to_id, M_test_i.astype(int)))\n", 237 | " len_oracle[np.where(np.array(M_test_id) == pattern_id)] = oracles_len_per_pattern[idp]\n", 238 | " len_oracle_marginal = np.append(len_oracle_marginal, np.mean(len_oracle))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "len_oracle = {}\n", 248 | "patterns_by_size = dict.fromkeys(np.arange(0,d))\n", 249 | "for k in range(d):\n", 250 | " patterns_by_size[k] = []\n", 251 | "patterns_id = np.arange(0, 2**d-1)\n", 252 | "for pattern_id in patterns_id:\n", 253 | " vec_pattern = utils.bin_to_vec(bin(pattern_id), d)\n", 254 | " size_pattern = utils.pattern_to_size(vec_pattern)\n", 255 | " patterns_by_size[size_pattern] = np.append(patterns_by_size[size_pattern], pattern_id)\n", 256 | "for k in range(d):\n", 257 | " list_len = []\n", 258 | " for pattern_id in patterns_by_size[k]:\n", 259 | " vec_pattern = utils.bin_to_vec(bin(np.int(pattern_id)), d)\n", 260 | " list_len = np.append(list_len, prediction.oracle_len_pattern(vec_pattern, beta, cov))\n", 261 | " len_oracle[k] = np.mean(list_len)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "import seaborn as sns" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "dict_methods = {'QR_TrainCal_NNet_Mask': 'QR',\n", 280 | " 'QR_NNet_Mask': 'QR',\n", 281 | " 'CQR_NNet_Mask': 'CQR', \n", 282 | " 'CQR_MDA_Exact_NNet_Mask': 'CQR-MDA-Exact',\n", 283 | " 'CQR_MDA_Nested_NNet_Mask': 'CQR-MDA-Nested'}" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | 
"outputs": [], 291 | "source": [ 292 | "fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(15,6), sharey='row')\n", 293 | "\n", 294 | "name_ticks = list(map(utils.name_tick, name_method))\n", 295 | "colors_palette = sns.color_palette(\"husl\", nb_boxplot)\n", 296 | "colors = colors_palette \n", 297 | "\n", 298 | "ax = [ax1, ax2, ax3, ax4]\n", 299 | "nb_subplots = len(ax)\n", 300 | "for axi in ax:\n", 301 | " axi.axhline(1-alpha, color='black', ls='--')\n", 302 | "\n", 303 | "axl = [ax5, ax6, ax7, ax8]\n", 304 | " \n", 305 | "for idp, pipeline in enumerate(name_pipeline_to_plot):\n", 306 | " \n", 307 | " ax[idp].set_title(dict_methods[pipeline])\n", 308 | " \n", 309 | " box = ax[idp].violinplot(dict_cov[pipeline].values(), showmeans=True, showextrema=False)#, quantiles=[[0.25, 0.75]]*nb_boxes)#, patch_artist=True)\n", 310 | " for pc,color in zip(box['bodies'], colors):\n", 311 | " pc.set_facecolor(color)\n", 312 | " pc.set_edgecolor('black')\n", 313 | " pc.set_alpha(1)\n", 314 | " box['cmeans'].set_color('black')\n", 315 | " \n", 316 | " box = axl[idp].violinplot(dict_len[pipeline].values(), showmeans=True, showextrema=False)#, quantiles=[[0.25, 0.75]]*nb_boxes)#, patch_artist=True)\n", 317 | " for pc,color in zip(box['bodies'], colors):\n", 318 | " pc.set_facecolor(color)\n", 319 | " pc.set_edgecolor('black')\n", 320 | " pc.set_alpha(1)\n", 321 | " box['cmeans'].set_color('black')\n", 322 | " \n", 323 | "idx = np.arange(d+1)\n", 324 | "idy = np.append([np.mean(len_oracle_marginal)], np.array(list(len_oracle.values())))\n", 325 | "\n", 326 | "for axi in axl:\n", 327 | " axi.scatter(idx+1, idy, color=colors, zorder=2, marker='*', s=100, edgecolor='black')\n", 328 | "\n", 329 | "for axi in ax:\n", 330 | " axi.set_xticks([])\n", 331 | " \n", 332 | "name_ticks_missing = []\n", 333 | "for k in range(d):\n", 334 | " name_ticks_missing = np.append(name_ticks_missing, str(k)+r' \\texttt{NA}')\n", 335 | "name_ticks = np.append(['Marg.'], name_ticks_missing)\n", 336 | "\n", 337 | "for axi in axl:\n", 338 | " ticks = np.arange(0,d+1)\n", 339 | " axi.set_xticks(ticks+1)\n", 340 | " axi.set_xticklabels(name_ticks, rotation=70)\n", 341 | "\n", 342 | "ax1.set_ylabel('Average coverage')\n", 343 | "ax5.set_ylabel('Average length')\n", 344 | "\n", 345 | "ax5.legend(handles = [mlines.Line2D([], [], marker=\"*\", linestyle='None', markersize=15, markeredgecolor='black', markerfacecolor='White')],\n", 346 | " labels=['Oracle length'], loc='upper left', handletextpad=10**(-60))\n", 347 | "\n", 348 | "fig.tight_layout()\n", 349 | "\n", 350 | "name_plot = 'plots/synthetic/Linear_d_'+str(d)+'_NA_'+str(prob_missing)+'_imputation_'+str(imputation)+'_basemodel_'+basemodel\n", 351 | "if mask == 'Yes':\n", 352 | " name_plot = name_plot + '_mask' \n", 353 | "name_plot = name_plot + '_train_'+str(train_size) + '_cal_'+str(cal_size) +'_rep_'+str(n_rep)\n", 354 | "if mask == 'No':\n", 355 | " name_plot = name_plot+'_nomask'\n", 356 | "if impute_inf:\n", 357 | " name_plot = name_plot+'_replaceinf'\n", 358 | "plt.savefig(name_plot+'.pdf',bbox_inches='tight', dpi=300)\n", 359 | "\n", 360 | "plt.show()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [] 369 | } 370 | ], 371 | "metadata": { 372 | "kernelspec": { 373 | "display_name": "Python 3", 374 | "language": "python", 375 | "name": "python3" 376 | }, 377 | "language_info": { 378 | "codemirror_mode": { 379 | "name": "ipython", 380 | "version": 3 381 | }, 
382 | "file_extension": ".py", 383 | "mimetype": "text/x-python", 384 | "name": "python", 385 | "nbconvert_exporter": "python", 386 | "pygments_lexer": "ipython3", 387 | "version": "3.8.5" 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 4 392 | } 393 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Margaux Zaffran 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Conformal Prediction with Missing Values 2 | 3 | This repository contains the code to reproduce the experiments of the paper _Conformal Prediction with Missing Values_, M. Zaffran, A. Dieuleveut, J. Josse and Y. Romano, ICML 2023. 4 | 5 | The notebook ``CP_NA_Synthetic.ipynb`` allows to reproduce the synthetic experiments, while ``CP_NA_Semi-synthetic.ipynb`` focuses on the semi-synthetic experiments. 6 | The corresponding ``_Plots`` notebooks contain the code for displaying the results in the same format as in the paper. 7 | 8 | The core code for the algorithms CP-MDA-Exact and CP-MDA-Nested can be found in the ```prediction.py``` file. 9 | 10 | ``imputation.py`` contains the functions used for imputation of the data sets. 11 | ``generation.py`` allows to generate synthetic data (outcome and features, but also missing values). 12 | ``files.py`` handles the file names, files writing and loading. 13 | ``utils.py`` contains some useful functions like computing the metrics associated to interval predictions, combinatorics on patterns etc. 14 | ``datasets.py`` pre-process the real data sets used in the semi-synthetic experiments. 15 | 16 | Note that, as mentioned in the ```.py``` files, some piece of code are taken from other GitHub repositories, namely: 17 | + CQR (Romano et al., 2019) repository, available [here](https://github.com/yromano/cqr), for the (cleaning of the) data sets used in the semi-synthetic experiments; 18 | + CHR (Sesia and Romano, 2021) repository, available [here](https://github.com/msesia/chr), for the Quantile Neural Network architecture. 19 | 20 | This repository will be updated in the next few days. 
21 | 22 | ## License 23 | 24 | [MIT](LICENSE) © Margaux Zaffran -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/data/.DS_Store -------------------------------------------------------------------------------- /data/cqr_datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/data/cqr_datasets/.DS_Store -------------------------------------------------------------------------------- /data/cqr_datasets/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /data/cqr_datasets/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Publicly Available Datasets 3 | 4 | * Please download the file blogData_train.csv from [this link](https://archive.ics.uci.edu/ml/datasets/BlogFeedback), and save it in this directory. 5 | 6 | * Please download the files Features_Variant_1.csv and Features_Variant_2.csv from 7 | [this link](https://archive.ics.uci.edu/ml/datasets/Facebook+Comment+Volume+Dataset) and store the two under the ./facebook/ directory. 8 | 9 | ## Data subject to copyright/usage rules 10 | 11 | Please follow the instructions in [this README](https://github.com/yromano/cqr/blob/master/get_meps_data/README.md) file, which describes how to download and process the MEPS datasets. 12 | 13 | Once downloaded, copy the three files 'meps_19_reg.csv', 'meps_20_reg.csv', and 'meps_21_reg.csv' to this folder. 14 | -------------------------------------------------------------------------------- /data/cqr_datasets/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "951c0c40", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "from copy import deepcopy\n", 12 | "from tqdm import tqdm\n", 13 | "import scipy.stats as stat\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 12, 22 | "id": "1e1b8e72", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import datasets" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 24, 32 | "id": "b20e5441", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "dataset_base_path = \"./\"\n", 37 | "dataset_name = \"bio\"\n", 38 | "data = pd.read_csv('CASP.csv')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 25, 44 | "id": "5b07be79", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "X, y = datasets.GetDataset(dataset_name, dataset_base_path)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 26, 54 | "id": "d693e22b", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | "
RMSDF1F2F3F4F5F6F7F8F9
017.28413558.304305.350.31754162.17301.872791e+06215.35904287.8710227.0302
16.0216191.961623.160.2621353.38948.034467e+0587.20243328.913938.5468
29.2757725.981726.280.2234367.28871.075648e+0681.79132981.042938.8119
315.8518424.582368.250.2811167.83251.210472e+06109.43903248.227039.0651
47.9627460.841736.940.2328052.41231.021020e+0694.52342814.424139.9147
\n", 158 | "
" 159 | ], 160 | "text/plain": [ 161 | " RMSD F1 F2 F3 F4 F5 F6 \\\n", 162 | "0 17.284 13558.30 4305.35 0.31754 162.1730 1.872791e+06 215.3590 \n", 163 | "1 6.021 6191.96 1623.16 0.26213 53.3894 8.034467e+05 87.2024 \n", 164 | "2 9.275 7725.98 1726.28 0.22343 67.2887 1.075648e+06 81.7913 \n", 165 | "3 15.851 8424.58 2368.25 0.28111 67.8325 1.210472e+06 109.4390 \n", 166 | "4 7.962 7460.84 1736.94 0.23280 52.4123 1.021020e+06 94.5234 \n", 167 | "\n", 168 | " F7 F8 F9 \n", 169 | "0 4287.87 102 27.0302 \n", 170 | "1 3328.91 39 38.5468 \n", 171 | "2 2981.04 29 38.8119 \n", 172 | "3 3248.22 70 39.0651 \n", 173 | "4 2814.42 41 39.9147 " 174 | ] 175 | }, 176 | "execution_count": 26, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "data.head()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 27, 188 | "id": "2300acd2", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "(45730, 10)" 195 | ] 196 | }, 197 | "execution_count": 27, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "data.shape" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 28, 209 | "id": "f6ebe8c8", 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "RMSD 0\n", 216 | "F1 0\n", 217 | "F2 0\n", 218 | "F3 0\n", 219 | "F4 0\n", 220 | "F5 0\n", 221 | "F6 0\n", 222 | "F7 0\n", 223 | "F8 0\n", 224 | "F9 0\n", 225 | "dtype: int64" 226 | ] 227 | }, 228 | "execution_count": 28, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "np.sum(data.isnull())" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 32, 240 | "id": "cbaba292", 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "array([[1.35583e+04, 4.30535e+03, 3.17540e-01, ..., 4.28787e+03,\n", 247 | " 1.02000e+02, 2.70302e+01],\n", 248 | " [6.19196e+03, 1.62316e+03, 2.62130e-01, ..., 3.32891e+03,\n", 249 | " 3.90000e+01, 3.85468e+01],\n", 250 | " [7.72598e+03, 1.72628e+03, 2.23430e-01, ..., 2.98104e+03,\n", 251 | " 2.90000e+01, 3.88119e+01],\n", 252 | " ...,\n", 253 | " [7.72665e+03, 2.48958e+03, 3.22200e-01, ..., 3.29046e+03,\n", 254 | " 4.60000e+01, 3.74718e+01],\n", 255 | " [8.87893e+03, 3.05578e+03, 3.44160e-01, ..., 3.42179e+03,\n", 256 | " 4.10000e+01, 3.56045e+01],\n", 257 | " [1.27324e+04, 4.44436e+03, 3.49050e-01, ..., 4.62685e+03,\n", 258 | " 1.41000e+02, 2.98118e+01]], dtype=float32)" 259 | ] 260 | }, 261 | "execution_count": 32, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "X" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 30, 273 | "id": "5cdfe892", 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "array([17.284, 6.021, 9.275, ..., 10.356, 9.791, 18.827], dtype=float32)" 280 | ] 281 | }, 282 | "execution_count": 30, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "y" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "12d0a8d1", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 3 (ipykernel)", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | 
"codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.9.12" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 5 321 | } 322 | -------------------------------------------------------------------------------- /data/cqr_datasets/communities_attributes.csv: -------------------------------------------------------------------------------- 1 | attributes 2 | state 3 | county 4 | community 5 | communityname 6 | fold 7 | population 8 | householdsize 9 | racepctblack 10 | racePctWhite 11 | racePctAsian 12 | racePctHisp 13 | agePct12t21 14 | agePct12t29 15 | agePct16t24 16 | agePct65up 17 | numbUrban 18 | pctUrban 19 | medIncome 20 | pctWWage 21 | pctWFarmSelf 22 | pctWInvInc 23 | pctWSocSec 24 | pctWPubAsst 25 | pctWRetire 26 | medFamInc 27 | perCapInc 28 | whitePerCap 29 | blackPerCap 30 | indianPerCap 31 | AsianPerCap 32 | OtherPerCap 33 | HispPerCap 34 | NumUnderPov 35 | PctPopUnderPov 36 | PctLess9thGrade 37 | PctNotHSGrad 38 | PctBSorMore 39 | PctUnemployed 40 | PctEmploy 41 | PctEmplManu 42 | PctEmplProfServ 43 | PctOccupManu 44 | PctOccupMgmtProf 45 | MalePctDivorce 46 | MalePctNevMarr 47 | FemalePctDiv 48 | TotalPctDiv 49 | PersPerFam 50 | PctFam2Par 51 | PctKids2Par 52 | PctYoungKids2Par 53 | PctTeen2Par 54 | PctWorkMomYoungKids 55 | PctWorkMom 56 | NumIlleg 57 | PctIlleg 58 | NumImmig 59 | PctImmigRecent 60 | PctImmigRec5 61 | PctImmigRec8 62 | PctImmigRec10 63 | PctRecentImmig 64 | PctRecImmig5 65 | PctRecImmig8 66 | PctRecImmig10 67 | PctSpeakEnglOnly 68 | PctNotSpeakEnglWell 69 | PctLargHouseFam 70 | PctLargHouseOccup 71 | PersPerOccupHous 72 | PersPerOwnOccHous 73 | PersPerRentOccHous 74 | PctPersOwnOccup 75 | PctPersDenseHous 76 | PctHousLess3BR 77 | MedNumBR 78 | HousVacant 79 | PctHousOccup 80 | PctHousOwnOcc 81 | PctVacantBoarded 82 | PctVacMore6Mos 83 | MedYrHousBuilt 84 | PctHousNoPhone 85 | PctWOFullPlumb 86 | OwnOccLowQuart 87 | OwnOccMedVal 88 | OwnOccHiQuart 89 | RentLowQ 90 | RentMedian 91 | RentHighQ 92 | MedRent 93 | MedRentPctHousInc 94 | MedOwnCostPctInc 95 | MedOwnCostPctIncNoMtg 96 | NumInShelters 97 | NumStreet 98 | PctForeignBorn 99 | PctBornSameState 100 | PctSameHouse85 101 | PctSameCity85 102 | PctSameState85 103 | LemasSwornFT 104 | LemasSwFTPerPop 105 | LemasSwFTFieldOps 106 | LemasSwFTFieldPerPop 107 | LemasTotalReq 108 | LemasTotReqPerPop 109 | PolicReqPerOffic 110 | PolicPerPop 111 | RacialMatchCommPol 112 | PctPolicWhite 113 | PctPolicBlack 114 | PctPolicHisp 115 | PctPolicAsian 116 | PctPolicMinor 117 | OfficAssgnDrugUnits 118 | NumKindsDrugsSeiz 119 | PolicAveOTWorked 120 | LandArea 121 | PopDens 122 | PctUsePubTrans 123 | PolicCars 124 | PolicOperBudg 125 | LemasPctPolicOnPatr 126 | LemasGangUnitDeploy 127 | LemasPctOfficDrugUn 128 | PolicBudgPerPop 129 | ViolentCrimesPerPop 130 | -------------------------------------------------------------------------------- /data/cqr_datasets/facebook/README.md: -------------------------------------------------------------------------------- 1 | 2 | Please download the files Features_Variant_1.csv and Features_Variant_2.csv from 3 | [this link](https://archive.ics.uci.edu/ml/datasets/Facebook+Comment+Volume+Dataset) and store the two in this directory. 
4 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # Code adapted from CQR GitHub (Yaniv Romano, 2019) 2 | # https://github.com/yromano/cqr 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def GetDataset(name, base_path): 9 | """ Load a dataset 10 | 11 | Parameters 12 | ---------- 13 | name : string, dataset name 14 | base_path : string, e.g. "path/to/datasets/directory/" 15 | 16 | Returns 17 | ------- 18 | data : dataframe containing the data set (shape n x (1+d)) 19 | response_name : string defining the column to be predicted 20 | continuous_var : boolean vector of length d, 0 meaning categorical variable and 1 continuous 21 | 22 | """ 23 | if name == "meps_19": 24 | df = pd.read_csv(base_path + 'meps_19_reg.csv') 25 | 26 | response_name = "UTILIZATION_reg" 27 | 28 | features_names = ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PERWT15F', 'REGION=1', 29 | 'REGION=2', 'REGION=3', 'REGION=4', 'SEX=1', 'SEX=2', 'MARRY=1', 30 | 'MARRY=2', 'MARRY=3', 'MARRY=4', 'MARRY=5', 'MARRY=6', 'MARRY=7', 31 | 'MARRY=8', 'MARRY=9', 'MARRY=10', 'FTSTU=-1', 'FTSTU=1', 'FTSTU=2', 32 | 'FTSTU=3', 'ACTDTY=1', 'ACTDTY=2', 'ACTDTY=3', 'ACTDTY=4', 33 | 'HONRDC=1', 'HONRDC=2', 'HONRDC=3', 'HONRDC=4', 'RTHLTH=-1', 34 | 'RTHLTH=1', 'RTHLTH=2', 'RTHLTH=3', 'RTHLTH=4', 'RTHLTH=5', 35 | 'MNHLTH=-1', 'MNHLTH=1', 'MNHLTH=2', 'MNHLTH=3', 'MNHLTH=4', 36 | 'MNHLTH=5', 'HIBPDX=-1', 'HIBPDX=1', 'HIBPDX=2', 'CHDDX=-1', 37 | 'CHDDX=1', 'CHDDX=2', 'ANGIDX=-1', 'ANGIDX=1', 'ANGIDX=2', 38 | 'MIDX=-1', 'MIDX=1', 'MIDX=2', 'OHRTDX=-1', 'OHRTDX=1', 'OHRTDX=2', 39 | 'STRKDX=-1', 'STRKDX=1', 'STRKDX=2', 'EMPHDX=-1', 'EMPHDX=1', 40 | 'EMPHDX=2', 'CHBRON=-1', 'CHBRON=1', 'CHBRON=2', 'CHOLDX=-1', 41 | 'CHOLDX=1', 'CHOLDX=2', 'CANCERDX=-1', 'CANCERDX=1', 'CANCERDX=2', 42 | 'DIABDX=-1', 'DIABDX=1', 'DIABDX=2', 'JTPAIN=-1', 'JTPAIN=1', 43 | 'JTPAIN=2', 'ARTHDX=-1', 'ARTHDX=1', 'ARTHDX=2', 'ARTHTYPE=-1', 44 | 'ARTHTYPE=1', 'ARTHTYPE=2', 'ARTHTYPE=3', 'ASTHDX=1', 'ASTHDX=2', 45 | 'ADHDADDX=-1', 'ADHDADDX=1', 'ADHDADDX=2', 'PREGNT=-1', 'PREGNT=1', 46 | 'PREGNT=2', 'WLKLIM=-1', 'WLKLIM=1', 'WLKLIM=2', 'ACTLIM=-1', 47 | 'ACTLIM=1', 'ACTLIM=2', 'SOCLIM=-1', 'SOCLIM=1', 'SOCLIM=2', 48 | 'COGLIM=-1', 'COGLIM=1', 'COGLIM=2', 'DFHEAR42=-1', 'DFHEAR42=1', 49 | 'DFHEAR42=2', 'DFSEE42=-1', 'DFSEE42=1', 'DFSEE42=2', 50 | 'ADSMOK42=-1', 'ADSMOK42=1', 'ADSMOK42=2', 'PHQ242=-1', 'PHQ242=0', 51 | 'PHQ242=1', 'PHQ242=2', 'PHQ242=3', 'PHQ242=4', 'PHQ242=5', 52 | 'PHQ242=6', 'EMPST=-1', 'EMPST=1', 'EMPST=2', 'EMPST=3', 'EMPST=4', 53 | 'POVCAT=1', 'POVCAT=2', 'POVCAT=3', 'POVCAT=4', 'POVCAT=5', 54 | 'INSCOV=1', 'INSCOV=2', 'INSCOV=3', 'RACE'] 55 | 56 | d = len(features_names) 57 | continuous_var = np.append(np.full(5, 1), np.full(d - 5, 0)) 58 | 59 | col_names = np.append(features_names, response_name) 60 | 61 | data = df[col_names] 62 | 63 | if name == "meps_20": 64 | df = pd.read_csv(base_path + 'meps_20_reg.csv') 65 | 66 | response_name = "UTILIZATION_reg" 67 | 68 | features_names = ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PERWT15F', 'REGION=1', 69 | 'REGION=2', 'REGION=3', 'REGION=4', 'SEX=1', 'SEX=2', 'MARRY=1', 70 | 'MARRY=2', 'MARRY=3', 'MARRY=4', 'MARRY=5', 'MARRY=6', 'MARRY=7', 71 | 'MARRY=8', 'MARRY=9', 'MARRY=10', 'FTSTU=-1', 'FTSTU=1', 'FTSTU=2', 72 | 'FTSTU=3', 'ACTDTY=1', 'ACTDTY=2', 'ACTDTY=3', 'ACTDTY=4', 73 | 'HONRDC=1', 'HONRDC=2', 'HONRDC=3', 'HONRDC=4', 'RTHLTH=-1', 74 | 'RTHLTH=1', 'RTHLTH=2', 'RTHLTH=3', 
'RTHLTH=4', 'RTHLTH=5', 75 | 'MNHLTH=-1', 'MNHLTH=1', 'MNHLTH=2', 'MNHLTH=3', 'MNHLTH=4', 76 | 'MNHLTH=5', 'HIBPDX=-1', 'HIBPDX=1', 'HIBPDX=2', 'CHDDX=-1', 77 | 'CHDDX=1', 'CHDDX=2', 'ANGIDX=-1', 'ANGIDX=1', 'ANGIDX=2', 78 | 'MIDX=-1', 'MIDX=1', 'MIDX=2', 'OHRTDX=-1', 'OHRTDX=1', 'OHRTDX=2', 79 | 'STRKDX=-1', 'STRKDX=1', 'STRKDX=2', 'EMPHDX=-1', 'EMPHDX=1', 80 | 'EMPHDX=2', 'CHBRON=-1', 'CHBRON=1', 'CHBRON=2', 'CHOLDX=-1', 81 | 'CHOLDX=1', 'CHOLDX=2', 'CANCERDX=-1', 'CANCERDX=1', 'CANCERDX=2', 82 | 'DIABDX=-1', 'DIABDX=1', 'DIABDX=2', 'JTPAIN=-1', 'JTPAIN=1', 83 | 'JTPAIN=2', 'ARTHDX=-1', 'ARTHDX=1', 'ARTHDX=2', 'ARTHTYPE=-1', 84 | 'ARTHTYPE=1', 'ARTHTYPE=2', 'ARTHTYPE=3', 'ASTHDX=1', 'ASTHDX=2', 85 | 'ADHDADDX=-1', 'ADHDADDX=1', 'ADHDADDX=2', 'PREGNT=-1', 'PREGNT=1', 86 | 'PREGNT=2', 'WLKLIM=-1', 'WLKLIM=1', 'WLKLIM=2', 'ACTLIM=-1', 87 | 'ACTLIM=1', 'ACTLIM=2', 'SOCLIM=-1', 'SOCLIM=1', 'SOCLIM=2', 88 | 'COGLIM=-1', 'COGLIM=1', 'COGLIM=2', 'DFHEAR42=-1', 'DFHEAR42=1', 89 | 'DFHEAR42=2', 'DFSEE42=-1', 'DFSEE42=1', 'DFSEE42=2', 90 | 'ADSMOK42=-1', 'ADSMOK42=1', 'ADSMOK42=2', 'PHQ242=-1', 'PHQ242=0', 91 | 'PHQ242=1', 'PHQ242=2', 'PHQ242=3', 'PHQ242=4', 'PHQ242=5', 92 | 'PHQ242=6', 'EMPST=-1', 'EMPST=1', 'EMPST=2', 'EMPST=3', 'EMPST=4', 93 | 'POVCAT=1', 'POVCAT=2', 'POVCAT=3', 'POVCAT=4', 'POVCAT=5', 94 | 'INSCOV=1', 'INSCOV=2', 'INSCOV=3', 'RACE'] 95 | 96 | d = len(features_names) 97 | continuous_var = np.append(np.full(5, 1), np.full(d - 5, 0)) 98 | 99 | col_names = np.append(features_names, response_name) 100 | 101 | data = df[col_names] 102 | 103 | if name == "meps_21": 104 | df = pd.read_csv(base_path + 'meps_21_reg.csv') 105 | 106 | response_name = "UTILIZATION_reg" 107 | 108 | features_names = ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PERWT16F', 'REGION=1', 109 | 'REGION=2', 'REGION=3', 'REGION=4', 'SEX=1', 'SEX=2', 'MARRY=1', 110 | 'MARRY=2', 'MARRY=3', 'MARRY=4', 'MARRY=5', 'MARRY=6', 'MARRY=7', 111 | 'MARRY=8', 'MARRY=9', 'MARRY=10', 'FTSTU=-1', 'FTSTU=1', 'FTSTU=2', 112 | 'FTSTU=3', 'ACTDTY=1', 'ACTDTY=2', 'ACTDTY=3', 'ACTDTY=4', 113 | 'HONRDC=1', 'HONRDC=2', 'HONRDC=3', 'HONRDC=4', 'RTHLTH=-1', 114 | 'RTHLTH=1', 'RTHLTH=2', 'RTHLTH=3', 'RTHLTH=4', 'RTHLTH=5', 115 | 'MNHLTH=-1', 'MNHLTH=1', 'MNHLTH=2', 'MNHLTH=3', 'MNHLTH=4', 116 | 'MNHLTH=5', 'HIBPDX=-1', 'HIBPDX=1', 'HIBPDX=2', 'CHDDX=-1', 117 | 'CHDDX=1', 'CHDDX=2', 'ANGIDX=-1', 'ANGIDX=1', 'ANGIDX=2', 118 | 'MIDX=-1', 'MIDX=1', 'MIDX=2', 'OHRTDX=-1', 'OHRTDX=1', 'OHRTDX=2', 119 | 'STRKDX=-1', 'STRKDX=1', 'STRKDX=2', 'EMPHDX=-1', 'EMPHDX=1', 120 | 'EMPHDX=2', 'CHBRON=-1', 'CHBRON=1', 'CHBRON=2', 'CHOLDX=-1', 121 | 'CHOLDX=1', 'CHOLDX=2', 'CANCERDX=-1', 'CANCERDX=1', 'CANCERDX=2', 122 | 'DIABDX=-1', 'DIABDX=1', 'DIABDX=2', 'JTPAIN=-1', 'JTPAIN=1', 123 | 'JTPAIN=2', 'ARTHDX=-1', 'ARTHDX=1', 'ARTHDX=2', 'ARTHTYPE=-1', 124 | 'ARTHTYPE=1', 'ARTHTYPE=2', 'ARTHTYPE=3', 'ASTHDX=1', 'ASTHDX=2', 125 | 'ADHDADDX=-1', 'ADHDADDX=1', 'ADHDADDX=2', 'PREGNT=-1', 'PREGNT=1', 126 | 'PREGNT=2', 'WLKLIM=-1', 'WLKLIM=1', 'WLKLIM=2', 'ACTLIM=-1', 127 | 'ACTLIM=1', 'ACTLIM=2', 'SOCLIM=-1', 'SOCLIM=1', 'SOCLIM=2', 128 | 'COGLIM=-1', 'COGLIM=1', 'COGLIM=2', 'DFHEAR42=-1', 'DFHEAR42=1', 129 | 'DFHEAR42=2', 'DFSEE42=-1', 'DFSEE42=1', 'DFSEE42=2', 130 | 'ADSMOK42=-1', 'ADSMOK42=1', 'ADSMOK42=2', 'PHQ242=-1', 'PHQ242=0', 131 | 'PHQ242=1', 'PHQ242=2', 'PHQ242=3', 'PHQ242=4', 'PHQ242=5', 132 | 'PHQ242=6', 'EMPST=-1', 'EMPST=1', 'EMPST=2', 'EMPST=3', 'EMPST=4', 133 | 'POVCAT=1', 'POVCAT=2', 'POVCAT=3', 'POVCAT=4', 'POVCAT=5', 134 | 'INSCOV=1', 'INSCOV=2', 
'INSCOV=3', 'RACE'] 135 | 136 | d = len(features_names) 137 | continuous_var = np.append(np.full(5, 1), np.full(d - 5, 0)) 138 | 139 | col_names = np.append(features_names, response_name) 140 | 141 | data = df[col_names] 142 | 143 | if name == "bio": 144 | # https://github.com/joefavergel/TertiaryPhysicochemicalProperties/blob/master/RMSD-ProteinTertiaryStructures.ipynb 145 | df = pd.read_csv(base_path + 'CASP.csv') 146 | response_name = 'RMSD' 147 | d = df.shape[1]-1 148 | continuous_var = np.full(d, 1) 149 | data = df 150 | 151 | if name == "concrete": 152 | dataset = np.loadtxt(open(base_path + 'Concrete_Data.csv', "rb"), delimiter=",", skiprows=1) 153 | data = pd.DataFrame(data=dataset) 154 | response_name = 8 155 | d = data.shape[1] - 1 156 | continuous_var = np.full(d, 1) 157 | 158 | if name == "bike": 159 | # https://www.kaggle.com/rajmehra03/bike-sharing-demand-rmsle-0-3194 160 | df = pd.read_csv(base_path + 'bike_train.csv') 161 | 162 | # # seperating season as per values. this is bcoz this will enhance features. 163 | season = pd.get_dummies(df['season'], prefix='season') 164 | df = pd.concat([df, season], axis=1) 165 | 166 | # # # same for weather. this is bcoz this will enhance features. 167 | weather = pd.get_dummies(df['weather'], prefix='weather') 168 | df = pd.concat([df, weather], axis=1) 169 | 170 | # # # now can drop weather and season. 171 | df.drop(['season', 'weather'], inplace=True, axis=1) 172 | df.head() 173 | 174 | df["hour"] = [t.hour for t in pd.DatetimeIndex(df.datetime)] 175 | df["day"] = [t.dayofweek for t in pd.DatetimeIndex(df.datetime)] 176 | df["month"] = [t.month for t in pd.DatetimeIndex(df.datetime)] 177 | df['year'] = [t.year for t in pd.DatetimeIndex(df.datetime)] 178 | df['year'] = df['year'].map({2011: 0, 2012: 1}) 179 | 180 | df.drop('datetime', axis=1, inplace=True) 181 | df.drop(['casual', 'registered'], axis=1, inplace=True) 182 | df.columns.to_series().groupby(df.dtypes).groups 183 | 184 | features_names = ['temp', 'atemp', 'humidity', 'windspeed', 'holiday', 'workingday', 185 | 'season_1', 'season_2', 'season_3', 'season_4', 'weather_1', 186 | 'weather_2', 'weather_3', 'weather_4', 'hour', 'day', 'month', 'year'] 187 | response_name = 'count' 188 | 189 | d = len(features_names) 190 | continuous_var = np.append(np.full(4, 1), np.full(d - 5, 0)) 191 | 192 | col_names = np.append(features_names, response_name) 193 | 194 | data = df[col_names] 195 | 196 | if name == "community": 197 | # https://github.com/vbordalo/Communities-Crime/blob/master/Crime_v1.ipynb 198 | attrib = pd.read_csv(base_path + 'communities_attributes.csv', delim_whitespace=True) 199 | data = pd.read_csv(base_path + 'communities.data', names=attrib['attributes']) 200 | data = data.drop(columns=['state', 'county', 201 | 'community', 'communityname', 202 | 'fold'], axis=1) 203 | 204 | data = data.replace('?', np.nan) 205 | response_name = 'ViolentCrimesPerPop' 206 | 207 | return data, response_name, continuous_var 208 | 209 | -------------------------------------------------------------------------------- /files.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import lzma 3 | import numpy as np 4 | import os 5 | 6 | def get_setting(dim=3, params_reg={'regression':'Linear'}, params_noise={'noise':'Gaussian'}, params_missing={}): 7 | 8 | regression = params_reg['regression'] 9 | assert regression in ['Linear'], 'regression must be Linear.' 
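    # Illustrative example (assuming a 'beta' key is present in params_reg, since the code
    # below indexes params_reg['beta'] directly): with dim=3, params_reg={'regression': 'Linear',
    # 'beta': None} and an empty params_missing, the setting name assembled below is
    # 'Linear_d_3_beta_1_1_1_Gaussian_Mean_1_MCAR_0.2'.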
10 | 11 | if 'mean' in params_reg: 12 | mean = params_reg['mean'] 13 | else: 14 | mean = 1 15 | if 'scale' in params_reg: 16 | scale = params_reg['scale'] 17 | else: 18 | scale = 1 19 | if params_reg['beta'] is not None: 20 | beta = params_reg['beta'] 21 | else: 22 | beta = np.full(dim,1) 23 | if dim < 10: 24 | name = 'Linear_d_'+str(dim)+'_beta_'+'_'.join(str(x) for x in beta)+'_Gaussian_Mean_'+str(mean) 25 | else: 26 | name = 'Linear_d_'+str(dim)+'_beta_varies_Gaussian_Mean_'+str(mean)+'_Scale_'+str(scale) 27 | 28 | if 'prob_missing' in params_missing: 29 | prob_missing = params_missing['prob_missing'] 30 | else: 31 | prob_missing = 0.2 32 | 33 | if 'mechanism' in params_missing: 34 | if params_missing['mechanism'] == 'MNAR_mask_quantiles': 35 | name = name + '_' + params_missing['mechanism'] + '_q_' + str(params_missing['q']) + '_' 36 | else: 37 | name = name + '_' + params_missing['mechanism'] + '_' 38 | if 'id_setting' in params_missing: 39 | name = name + 'id_'+str(params_missing['id_setting'])+'_' 40 | else: 41 | name = name + '_MCAR_' 42 | 43 | name = name + str(prob_missing) 44 | 45 | return name 46 | 47 | def get_name_data(train_size, cal_size, params_test, dim=3, params_reg={}, params_noise={}, dataset=None, params_missing={}, seed=1): 48 | """ 49 | Parameters 50 | ---------- 51 | n : experiment sample size 52 | dim : dimension of the covariates (i.e. X lies in R^dim) 53 | regression : regression model, should be Linear 54 | noise : noise type, can be Gaussian 55 | params_reg : parameters of the regression part 56 | params_noise : parameters of the noise, e.g. a dictionary {'ar': [1, ar1], 'ma':[1]} 57 | to generate an AR(1) noise with coefficient -ar1 58 | seed : random seed for reproducibility used in the experiment 59 | 60 | Returns 61 | ------- 62 | name : name of the file containing (if existing) 63 | the generated data with the given parameters of simulations 64 | """ 65 | 66 | max_test_size = np.max(params_test['test_size']) 67 | 68 | if dataset is None: 69 | 70 | regression = params_reg['regression'] 71 | 72 | assert regression in ['Linear'], 'regression must be Linear.' 73 | 74 | name = get_setting(dim=dim, params_reg=params_reg, params_noise=params_noise, params_missing=params_missing) 75 | 76 | else: 77 | name = dataset 78 | 79 | name = name + '_seed_' + str(seed) + '_train_' + str(train_size) + '_cal_' + str(cal_size) + '_test_' + str(max_test_size) 80 | 81 | if 'prob_missing' in list(params_missing.keys()): 82 | name = name + '_prob_' + str(params_missing['prob_missing']) 83 | 84 | return name 85 | 86 | def get_name_data_imputed(train_size, cal_size, params_test, imputation, 87 | dim=3, params_reg={}, params_noise={}, dataset=None, params_missing={}, seed=1): 88 | """ 89 | Parameters 90 | ---------- 91 | n : experiment sample size 92 | dim : dimension of the covariates (i.e. X lies in R^dim) 93 | regression : regression model, should be Linear 94 | noise : noise type, can be Gaussian 95 | params_reg : parameters of the regression part 96 | params_noise : parameters of the noise, e.g. 
a dictionary {'ar': [1, ar1], 'ma':[1]} 97 | to generate an AR(1) noise with coefficient -ar1 98 | seed : random seed for reproducibility used in the experiment 99 | 100 | Returns 101 | ------- 102 | name : name of the file containing (if existing) 103 | the generated data with the given parameters of simulations 104 | """ 105 | 106 | name = get_name_data(train_size, cal_size, params_test, dim=dim, 107 | params_reg=params_reg, params_noise=params_noise, dataset=dataset, params_missing=params_missing, seed=seed) 108 | 109 | if imputation is not None: 110 | name = name + '_imputation_' + imputation 111 | 112 | return name 113 | 114 | def get_name_results(pipeline, train_size, cal_size, n_rep, imputation=None, d=3, 115 | params_reg={}, params_noise={}, dataset=None, params_missing={}): 116 | """ ... 117 | Parameters 118 | ---------- 119 | pipeline : 120 | params_method : 121 | Returns 122 | ------- 123 | name : 124 | """ 125 | 126 | # Results file name, depending on the method 127 | 128 | if pipeline != 'Oracle': 129 | name_method = pipeline+'_Imp_'+imputation 130 | else: 131 | name_method = pipeline 132 | 133 | # Results directory name, depending on the data simulation 134 | 135 | if dataset is not None: 136 | name_directory = dataset 137 | else: 138 | name_directory = get_setting(dim=d, params_reg=params_reg, params_noise=params_noise, params_missing=params_missing) 139 | if 'prob_missing' in list(params_missing.keys()): 140 | name_directory = name_directory + '_train_' + str(train_size) + '_cal_' + str(cal_size) + '_prob_' + str(params_missing['prob_missing']) + '_rep_' + str(n_rep) 141 | else: 142 | name_directory = name_directory + '_train_' + str(train_size) + '_cal_' + str(cal_size) + '_rep_' + str(n_rep) 143 | 144 | return name_directory, name_method 145 | 146 | def load_file(parent, name, ext): 147 | """ ... 148 | Parameters 149 | ---------- 150 | parent : 151 | name : 152 | ext : 153 | Returns 154 | ------- 155 | file : 156 | """ 157 | assert ext in ['pkl', 'xz'], 'ext must be pkl or xz.' 158 | path = parent + '/' + name + '.' + ext 159 | if ext == 'pkl': 160 | with open(path,'rb') as f: 161 | file = pickle.load(f) 162 | elif ext == 'xz': 163 | with lzma.open(path,'rb') as f: 164 | file = pickle.load(f) 165 | 166 | return file 167 | 168 | def write_file(parent, name, ext, file): 169 | """ ... 170 | Parameters 171 | ---------- 172 | parent : 173 | name : 174 | ext : 175 | file : 176 | Returns 177 | ------- 178 | """ 179 | 180 | assert ext in ['pkl', 'xz'], 'ext must be pkl or xz.' 181 | path = parent + '/' + name + '.' + ext 182 | if ext == 'pkl': 183 | if not os.path.isdir(parent): 184 | os.makedirs(parent) 185 | with open(path,'wb') as f: 186 | pickle.dump(file, f) 187 | elif ext == 'xz': 188 | if not os.path.isdir(parent): 189 | os.makedirs(parent) 190 | with lzma.open(path,'wb') as f: 191 | pickle.dump(file, f) 192 | 193 | def get_name_method(method, basemodel=None, mask='No', protection='No', exact=False): 194 | if exact == True: 195 | assert method == 'CQR_MDA', 'With MDA-Exact you should be masking.' 
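        # Illustrative example: get_name_method('CQR_MDA', basemodel='NNet', mask='Yes', exact=True)
        # goes through this branch and returns 'CQR_MDA_Exact_NNet_Mask'.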
196 | method = method + '_Exact' 197 | if method == 'CQR_MDA': 198 | method = method + '_Nested' 199 | if method == 'Oracle': 200 | name = method 201 | elif method == 'Oracle_mean' and protection=='No': 202 | name = method 203 | elif method == 'Oracle_mean' and protection!='No': 204 | name = '_'.join([method, protection]) 205 | elif protection == 'No' and mask == 'No': 206 | name = '_'.join([method, basemodel]) 207 | elif method in ['QR', 'QR_TrainCal', 'CQR_MDA_Nested', 'CQR_MDA_Exact'] and mask == 'No': 208 | name = '_'.join([method, basemodel]) 209 | elif method in ['QR', 'QR_TrainCal', 'CQR_MDA_Nested', 'CQR_MDA_Exact'] and mask == 'Yes': 210 | name = '_'.join([method, basemodel, 'Mask']) 211 | elif protection == 'No': 212 | name = '_'.join([method, basemodel, 'Mask']) 213 | elif mask == 'No': 214 | name = '_'.join([method, basemodel, protection]) 215 | else: 216 | name = '_'.join([method, basemodel, 'Mask', protection]) 217 | return name 218 | -------------------------------------------------------------------------------- /generation.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pandas as pd 4 | import copy 5 | import utils 6 | from tqdm.autonotebook import tqdm 7 | 8 | def generate_data(n, dim=3, params_reg={'regression':'Linear'}, params_noise={'noise':'Gaussian'}, seed=1): 9 | """ 10 | Parameters 11 | ---------- 12 | n : sample size to generate 13 | dim : dimension of the covariates (i.e. X lies in R^dim) 14 | regression : regression model, should be Linear 15 | noise : noise type, can be Gaussian 16 | params_reg : parameters for the regression part 17 | params_noise : parameters for the noise, e.g. a dictionary {'ar': [1, ar1], 'ma':[1]} 18 | to generate an AR(1) noise with coefficient -ar1 19 | seed : random seed for reproducibility 20 | 21 | Returns 22 | ------- 23 | X : covariates values, array of size n x dim 24 | Y : response values, array of size n 25 | """ 26 | 27 | random.seed(seed) 28 | np.random.seed(seed) 29 | 30 | regression = params_reg['regression'] 31 | assert regression in ['Linear'], 'regression must be Linear.' 32 | 33 | noise = params_noise['noise'] 34 | 35 | d = dim 36 | 37 | if 'mean' in params_reg: 38 | mean = params_reg['mean'] 39 | else: 40 | mean = 1 41 | if 'phi' in params_reg: 42 | phi = params_reg['phi'] 43 | else: 44 | phi = 0.8 45 | mean = np.full(d, mean) 46 | cov = np.full((d,d),phi)+(1-phi)*np.eye(d) 47 | X = np.random.multivariate_normal(mean, cov, size=n) 48 | if 'beta' not in params_reg or params_reg['beta'] is None: 49 | beta = np.full(d,1) 50 | else: 51 | beta = params_reg['beta'] 52 | Y_reg = X.dot(beta) 53 | 54 | assert noise in ['Gaussian'], 'noise must be Gaussian.' 
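    # The noise drawn below is i.i.d. Gaussian N(mean, scale^2), defaulting to mean 0 and
    # scale 1 when not given in params_noise; it is added to the linear signal X.dot(beta)
    # computed above.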
55 | if noise == 'Gaussian': 56 | if 'mean' in params_noise: 57 | mean = params_noise['mean'] 58 | else: 59 | mean = 0 60 | if 'scale' in params_noise: 61 | scale = params_noise['scale'] 62 | else: 63 | scale = 1 64 | eps = np.random.normal(loc=mean,scale=scale,size=(n)) 65 | 66 | Y = Y_reg + eps 67 | 68 | data = {'X': X, 'Y': Y} 69 | 70 | return data 71 | 72 | def generate_split(train_size, cal_size, params_test, data): 73 | 74 | X = data['X'] 75 | X_train = X[:train_size,:] 76 | X_cal = X[train_size:(train_size+cal_size),:] 77 | 78 | Y = data['Y'] 79 | Y_train = Y[:train_size] 80 | Y_cal = Y[train_size:(train_size+cal_size)] 81 | 82 | test_size = params_test['test_size'] 83 | 84 | mechanisms_test = params_test['mechanisms_test'] 85 | 86 | #if test_size is list: 87 | X_test = dict.fromkeys(test_size) 88 | Y_test = dict.fromkeys(test_size) 89 | for n_test in test_size: 90 | if (train_size+cal_size+n_test) <= X.shape[0]: 91 | X_test[n_test] = X[(train_size+cal_size):(train_size+cal_size+n_test),:] 92 | Y_test[n_test] = Y[(train_size+cal_size):(train_size+cal_size+n_test)] 93 | else: 94 | if 'iid' in mechanisms_test: 95 | assert params_test['iid']['test_size'] != n_test 96 | for extreme in ['worst_pattern', 'best_pattern']: 97 | if extreme in mechanisms_test: 98 | assert params_test[extreme]['test_size'] != n_test 99 | for fixed in ['fixed_nb_sample_pattern','fixed_nb_sample_pattern_size']: 100 | if fixed in mechanisms_test and params_test[fixed]['test_size'] == n_test: 101 | assert (train_size + cal_size + params_test[fixed]['nb_sample_pattern']) <= X.shape[0] 102 | 103 | X_test_created = np.empty((n_test,X.shape[1])) 104 | Y_test_created = np.empty((n_test)) 105 | 106 | X_to_shuffle = copy.deepcopy(X[(train_size+cal_size):,:]) 107 | Y_to_shuffle = copy.deepcopy(Y[(train_size+cal_size):]) 108 | n_shuffle = X_to_shuffle.shape[0] 109 | nb_exact_shuffle = n_test//n_shuffle 110 | nb_rest = n_test%n_shuffle 111 | for k in range(nb_exact_shuffle): 112 | ido = random.sample(range(n_shuffle), n_shuffle) 113 | X_test_created[(k * n_shuffle):((k+1) * n_shuffle), :] = X_to_shuffle[ido, :] 114 | Y_test_created[(k * n_shuffle):((k + 1) * n_shuffle)] = Y_to_shuffle[ido] 115 | ido = random.sample(range(n_shuffle), nb_rest) 116 | X_test_created[((k+1) * n_shuffle):, :] = X_to_shuffle[ido, :] 117 | Y_test_created[((k+1) * n_shuffle):] = Y_to_shuffle[ido] 118 | X_test[n_test] = X_test_created 119 | Y_test[n_test] = Y_test_created 120 | 121 | X_split = {'Train': X_train, 'Cal': X_cal, 'Test': X_test} 122 | Y_split = {'Train': Y_train, 'Cal': Y_cal, 'Test': Y_test} 123 | 124 | return X_split, Y_split 125 | 126 | def generate_MCAR(X, params_test, params_missing={}, seed=1): 127 | 128 | """ 129 | Parameters 130 | ---------- 131 | X : data array (of shape n x dim) which will suffer missing values 132 | prob_missing : probability of being missing 133 | var_missing : binary vector of length dim, containing 1 if the variables can suffer from missing values, 0 otherwise 134 | (e.g. 
[1,1,0] indicates that X_3 can not have missing values but X_1 and X_2 can) 135 | 136 | Returns 137 | ------- 138 | X_mcar : covariates values (observed or missing, nan in this case), array of size n x dim 139 | M_mcar : Mask array of size n x dim, containing 1 if the realization is missing, 0 otherwise 140 | """ 141 | 142 | random.seed(seed) 143 | np.random.seed(seed) 144 | 145 | d = X['Train'].shape[1] 146 | 147 | if 'prob_missing' in params_missing: 148 | prob_missing = params_missing['prob_missing'] 149 | else: 150 | prob_missing = 0.2 151 | if 'var_missing' in params_missing: 152 | var_missing = params_missing['var_missing'] 153 | else: 154 | var_missing = np.full(d, 1) 155 | 156 | nb_var_missing = np.sum(var_missing) 157 | 158 | train_size = X['Train'].shape[0] 159 | cal_size = X['Cal'].shape[0] 160 | 161 | M_mcar_train = np.full(X['Train'].shape, False) 162 | X_mcar_train = copy.deepcopy(X['Train']) 163 | 164 | M_mcar_cal = np.full(X['Cal'].shape, False) 165 | X_mcar_cal = copy.deepcopy(X['Cal']) 166 | 167 | M_mcar_train[:,np.where(np.array(var_missing) == 1)[0]] = (np.random.uniform(low=0,high=1,size=(train_size,nb_var_missing)) <= (prob_missing)) 168 | X_mcar_train[M_mcar_train] = np.nan 169 | M_mcar_cal[:,np.where(np.array(var_missing) == 1)[0]] = (np.random.uniform(low=0,high=1,size=(cal_size,nb_var_missing)) <= (prob_missing)) 170 | X_mcar_cal[M_mcar_cal] = np.nan 171 | 172 | mechanisms_test = params_test['mechanisms_test'] 173 | 174 | M_mcar = {'Train': M_mcar_train, 'Cal': M_mcar_cal} 175 | M_mcar_test = dict.fromkeys(mechanisms_test) 176 | 177 | X_mcar = {'Train': X_mcar_train, 'Cal': X_mcar_cal} 178 | X_mcar_test = dict.fromkeys(mechanisms_test) 179 | 180 | if 'iid' in mechanisms_test: 181 | test_size = params_test['iid']['test_size'] 182 | M_mcar_iid = np.full((test_size, d), False) 183 | M_mcar_iid[:,np.where(np.array(var_missing) == 1)[0]] = (np.random.uniform(low=0,high=1,size=(test_size,nb_var_missing)) <= (prob_missing)) 184 | M_mcar_test['iid'] = M_mcar_iid 185 | X_mcar_iid = copy.deepcopy(X['Test'][test_size]) 186 | X_mcar_iid[M_mcar_iid] = np.nan 187 | X_mcar_test['iid'] = X_mcar_iid 188 | for extreme in ['worst_pattern', 'best_pattern']: 189 | if extreme in mechanisms_test: 190 | test_size = params_test[extreme]['test_size'] 191 | test_pattern = params_test[extreme]['pattern'] 192 | M_mcar_extreme = np.full((test_size, d), False) 193 | M_mcar_extreme[:,np.where(np.array(test_pattern) == 1)[0]] = 1 194 | M_mcar_test[extreme] = M_mcar_extreme 195 | X_mcar_extreme = copy.deepcopy(X['Test'][test_size]) 196 | X_mcar_extreme[M_mcar_extreme] = np.nan 197 | X_mcar_test[extreme] = X_mcar_extreme 198 | if 'fixed_nb_sample_pattern' in mechanisms_test: 199 | list_patterns = utils.create_patterns(d, var_missing) 200 | test_size = params_test['fixed_nb_sample_pattern']['test_size'] 201 | nb_sample_pattern = params_test['fixed_nb_sample_pattern']['nb_sample_pattern'] 202 | M_mcar_fixed_sample_pattern = np.full((test_size, d), False) 203 | X_mcar_fixed_sample_pattern = copy.deepcopy(X['Test'][test_size]) 204 | for idp, pattern in enumerate(list_patterns): 205 | M_mcar_fixed_sample_pattern[(idp*nb_sample_pattern):((idp+1)*nb_sample_pattern),np.where(np.array(pattern) == 1)[0]] = 1 206 | X_mcar_fixed_sample_pattern[M_mcar_fixed_sample_pattern] = np.nan 207 | M_mcar_test['fixed_nb_sample_pattern'] = M_mcar_fixed_sample_pattern 208 | X_mcar_test['fixed_nb_sample_pattern'] = X_mcar_fixed_sample_pattern 209 | if 'fixed_nb_sample_pattern_size' in mechanisms_test: 210 | 
list_pattern_sizes = np.arange(np.sum(var_missing)) 211 | test_size = params_test['fixed_nb_sample_pattern_size']['test_size'] 212 | nb_sample_pattern_size = params_test['fixed_nb_sample_pattern_size']['nb_sample_pattern'] 213 | M_mcar_fixed_sample_pattern_size = np.full((test_size, d), False) 214 | X_mcar_fixed_sample_pattern_size = copy.deepcopy(X['Test'][test_size]) 215 | 216 | list_patterns = utils.create_patterns(d, var_missing) 217 | size_to_ids = dict.fromkeys(np.arange(0, d)) 218 | for k in np.arange(0, d): 219 | size_to_ids[k] = [] 220 | for pattern in list_patterns: 221 | key_pattern = utils.pattern_to_id(pattern) 222 | size_pattern = utils.pattern_to_size(pattern) 223 | size_to_ids[size_pattern] = np.append(size_to_ids[size_pattern], key_pattern) 224 | 225 | for idp, pattern_size in enumerate(list_pattern_sizes): 226 | keys = random.choices(size_to_ids[pattern_size], k=nb_sample_pattern_size) 227 | unique_keys, count_keys = np.unique(keys, return_counts=True) 228 | min_ind = idp * nb_sample_pattern_size 229 | for idps, key in enumerate(unique_keys): 230 | nb_sample_pattern = count_keys[idps] 231 | pattern = utils.bin_to_vec(bin(int(key)), d) 232 | M_mcar_fixed_sample_pattern_size[min_ind:(min_ind+nb_sample_pattern),np.where(np.array(pattern) == 1)[0]] = 1 233 | min_ind = min_ind + nb_sample_pattern 234 | X_mcar_fixed_sample_pattern_size[M_mcar_fixed_sample_pattern_size] = np.nan 235 | M_mcar_test['fixed_nb_sample_pattern_size'] = M_mcar_fixed_sample_pattern_size 236 | X_mcar_test['fixed_nb_sample_pattern_size'] = X_mcar_fixed_sample_pattern_size 237 | 238 | X_mcar['Test'] = X_mcar_test 239 | M_mcar['Test'] = M_mcar_test 240 | 241 | return X_mcar, M_mcar 242 | 243 | def process_test(params_test, d, params_missing={}): 244 | 245 | test_sizes = [] 246 | mechanisms_test = [] 247 | 248 | for mechanism in list(params_test.keys()): 249 | assert mechanism in ['iid', 'worst_pattern', 'best_pattern', 'test_pattern', 'fixed_nb_sample_pattern', 'fixed_nb_sample_pattern_size'], 'Test mechanism should be among iid, worst_pattern, best_pattern, test_pattern, fixed_nb_sample_pattern, fixed_nb_sample_pattern_size.' 250 | mechanisms_test = np.append(mechanisms_test, mechanism) 251 | if mechanism not in ['fixed_nb_sample_pattern', 'fixed_nb_sample_pattern_size']: 252 | assert 'test_size' in list(params_test[mechanism].keys()), 'test_size should be provided for each test mechanism.' 253 | test_sizes = np.append(test_sizes, int(params_test[mechanism]['test_size'])) 254 | else: 255 | assert 'nb_sample_pattern' in list(params_test[mechanism].keys()), 'nb_sample_pattern should be provided for fixed_nb_sample_pattern mechanism.' 
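            # For these pattern-based mechanisms the test size is not given directly: it is
            # derived below as nb_sample_pattern times the number of missingness patterns
            # ('fixed_nb_sample_pattern'), or times the number of pattern sizes, i.e.
            # np.sum(var_missing) ('fixed_nb_sample_pattern_size').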
256 | nb_sample_pattern = params_test[mechanism]['nb_sample_pattern'] 257 | 258 | if 'var_missing' in params_missing: 259 | var_missing = params_missing['var_missing'] 260 | else: 261 | var_missing = np.full(d, 1) 262 | 263 | if mechanism == 'fixed_nb_sample_pattern': 264 | 265 | list_patterns = utils.create_patterns(d, var_missing) 266 | nb_pattern = len(list_patterns) 267 | test_size = nb_sample_pattern*nb_pattern 268 | test_sizes = np.append(test_sizes, int(test_size)) 269 | params_test[mechanism]['test_size'] = test_size 270 | 271 | else: 272 | 273 | nb_pattern_size = np.sum(var_missing) 274 | test_size = nb_sample_pattern * nb_pattern_size 275 | test_sizes = np.append(test_sizes, int(test_size)) 276 | params_test[mechanism]['test_size'] = test_size 277 | 278 | test_sizes = np.unique(test_sizes).astype(int) 279 | 280 | params_test['test_size'] = test_sizes 281 | params_test['mechanisms_test'] = mechanisms_test 282 | 283 | return params_test 284 | 285 | def generate_multiple_data(train_size, cal_size, params_test, n_rep, dim=3, 286 | params_reg={'regression':'Linear'}, params_noise={'noise':'Gaussian'}, 287 | params_missing={'mechanism':'MCAR'}): 288 | """ 289 | Parameters 290 | ---------- 291 | n : sample size to generate 292 | dim : dimension of the covariates (i.e. X lies in R^dim) 293 | regression : regression model, should be Linear 294 | noise : noise type, can be Gaussian 295 | params_reg : parameters for the regression part 296 | params_noise : parameters for the noise, e.g. a dictionary {'ar': [1, ar1], 'ma':[1]} 297 | to generate an AR(1) noise with coefficient -ar1 298 | seed_max : random seeds for reproducibility, will generate seed_max data-sets, of seeds 0 to seed_max-1 299 | 300 | Returns 301 | ------- 302 | X : covariates values, array of size seedmax x n x dim 303 | Y : response values, array of size seedmax x n 304 | """ 305 | 306 | sets = ['Train', 'Cal', 'Test'] 307 | mechanisms_test = params_test['mechanisms_test'] 308 | max_test_size = np.max(params_test['test_size']) 309 | 310 | n = train_size + cal_size + max_test_size 311 | 312 | X = dict.fromkeys(sets) 313 | X_missing = dict.fromkeys(sets) 314 | M = dict.fromkeys(sets) 315 | Y = dict.fromkeys(sets) 316 | 317 | for k in tqdm(range(n_rep)): 318 | data = generate_data(n, dim=dim, params_reg=params_reg, params_noise=params_noise, seed=k) 319 | Xk, Yk = generate_split(train_size, cal_size, params_test, data) 320 | Xk_missing, Mk_missing = generate_MCAR(Xk, params_test, params_missing, seed=k) 321 | 322 | for set in ['Train', 'Cal']: 323 | if k == 0: 324 | X[set] = np.expand_dims(Xk[set], axis=0) 325 | X_missing[set] = np.expand_dims(Xk_missing[set], axis=0) 326 | M[set] = np.expand_dims(Mk_missing[set], axis=0) 327 | Y[set] = Yk[set] 328 | else: 329 | X[set] = np.vstack((X[set],np.expand_dims(Xk[set], axis=0))) 330 | X_missing[set] = np.vstack((X_missing[set],np.expand_dims(Xk_missing[set], axis=0))) 331 | M[set] = np.vstack((M[set],np.expand_dims(Mk_missing[set], axis=0))) 332 | Y[set] = np.vstack((Y[set],np.array(Yk[set]))) 333 | 334 | set = 'Test' 335 | if k == 0: 336 | X[set] = dict.fromkeys(mechanisms_test) 337 | X_missing[set] = dict.fromkeys(mechanisms_test) 338 | M[set] = dict.fromkeys(mechanisms_test) 339 | Y[set] = dict.fromkeys(mechanisms_test) 340 | for key in mechanisms_test: 341 | n_test = params_test[key]['test_size'] 342 | X[set][key] = np.expand_dims(Xk[set][n_test], axis=0) 343 | Y[set][key] = Yk[set][n_test] 344 | X_missing[set][key] = np.expand_dims(Xk_missing[set][key], axis=0) 345 | M[set][key] 
= np.expand_dims(Mk_missing[set][key], axis=0) 346 | 347 | else: 348 | for key in mechanisms_test: 349 | n_test = params_test[key]['test_size'] 350 | X[set][key] = np.vstack((X[set][key],np.expand_dims(Xk[set][n_test], axis=0))) 351 | Y[set][key] = np.vstack((Y[set][key], np.array(Yk[set][n_test]))) 352 | X_missing[set][key] = np.vstack((X_missing[set][key], np.expand_dims(Xk_missing[set][key], axis=0))) 353 | M[set][key] = np.vstack((M[set][key], np.expand_dims(Mk_missing[set][key], axis=0))) 354 | 355 | 356 | return X, X_missing, M, Y, params_missing 357 | 358 | # Real data 359 | 360 | def real_generate_multiple_split(dataframe, target, prob_test=0.2, seed_max=1): 361 | 362 | data_features = dataframe.loc[:, dataframe.columns != target] 363 | response = dataframe.loc[:, target] 364 | n = dataframe.shape[0] 365 | d = data_features.shape[1] 366 | 367 | test_size = int(n*prob_test) 368 | train_cal_size = int(n-test_size) 369 | train_size = int(2*(train_cal_size//3) + train_cal_size%3) 370 | cal_size = int(train_cal_size//3) 371 | 372 | sizes = {'Train': train_size, 'Cal': cal_size, 'Test':test_size} 373 | 374 | mask_original = data_features.isnull().replace({True: 1, False: 0}) 375 | 376 | vars_categ = data_features.select_dtypes("object").columns 377 | 378 | data_features_categ = data_features[vars_categ] 379 | 380 | vars_categ = data_features.select_dtypes("object").columns 381 | vars_quant = set(data_features.columns).difference(set(vars_categ)) 382 | mask_features = data_features[vars_quant].isnull().replace({True: 1,False: 0}) 383 | 384 | data_features_categ_na = data_features_categ.fillna("-2") 385 | data_features_categ_encoded = pd.DataFrame(index=data_features_categ_na.index) 386 | for var in vars_categ: 387 | if np.sum(data_features_categ_na[var]=="1") > 0: 388 | data_features_categ_encoded[str(var)+"_1"] = data_features_categ_na[var]=="1" 389 | if np.sum(data_features_categ_na[var]=="0") > 0: 390 | data_features_categ_encoded[str(var)+"_0"] = data_features_categ_na[var]=="0" 391 | if np.sum(data_features_categ_na[var]=="-1") > 0: 392 | data_features_categ_encoded[str(var)+"_-1"] = data_features_categ_na[var]=="-1" 393 | if np.sum(data_features_categ_na[var]=="-2") > 0: 394 | data_features_categ_encoded[str(var)+"_-2"] = data_features_categ_na[var]=="-2" 395 | data_features_categ_encoded = data_features_categ_encoded.replace({True:1, False:0}) 396 | data_features = data_features[vars_quant].merge(data_features_categ_encoded, left_index=True, right_index=True) 397 | 398 | mask = data_features.isnull().replace({True: 1, False: 0}) 399 | 400 | col_features = list(data_features.columns) 401 | 402 | d_quant = mask_features.shape[1] 403 | d_aug = data_features.shape[1] 404 | 405 | X_missing = np.empty((seed_max,n,d_aug)) 406 | M_original = np.empty((seed_max,n,d)) 407 | M = np.empty((seed_max, n, d_aug)) 408 | M_quant = np.empty((seed_max,n,d_quant)) 409 | Y = np.empty((seed_max,n)) 410 | 411 | for k in range(seed_max): 412 | 413 | random.seed(k) 414 | np.random.seed(k) 415 | 416 | ido = random.sample(range(n), n) 417 | 418 | X_missing[k,:,:] = data_features.iloc[ido,:] 419 | M_original[k,:,:] = mask_original.iloc[ido,:] 420 | M[k, :, :] = mask.iloc[ido, :] 421 | M_quant[k,:,:] = mask_features.iloc[ido,:] 422 | Y[k,:] = response[ido] 423 | 424 | data = {'X_missing':X_missing, 'M_original':M_original,'M':M, 'M_quant':M_quant, 'Y':Y} 425 | 426 | keys = ['X_missing', 'M_original', 'M', 'M_quant'] 427 | for key in keys: 428 | arr = data[key] 429 | arr_train = arr[:,:train_size,:] 430 | 
arr_cal = arr[:,train_size:(train_size+cal_size),:] 431 | arr_test = arr[:,(n-test_size):n,:] 432 | globals()[key+'_split'] = {'Train': arr_train, 'Cal': arr_cal, 'Test': {'iid': arr_test}} 433 | 434 | Y = data['Y'] 435 | Y_train = Y[:,:train_size] 436 | Y_cal = Y[:,train_size:(train_size+cal_size)] 437 | Y_test = Y[:,(n-test_size):n] 438 | Y_split = {'Train': Y_train, 'Cal': Y_cal, 'Test':{'iid': Y_test}} 439 | 440 | return X_missing_split, M_original_split, M_split, M_quant_split, Y_split, col_features, sizes 441 | 442 | def real_generate_multiple_split_holdout(dataframe, target, prob_test=0.2): 443 | 444 | n = dataframe.shape[0] 445 | ido = random.sample(range(n), n) 446 | dataframe = dataframe.iloc[ido,:] 447 | dataframe = dataframe.reset_index(drop=True) 448 | 449 | data_features = dataframe.loc[:, dataframe.columns != target] 450 | response = dataframe.loc[:, target] 451 | 452 | d = data_features.shape[1] 453 | 454 | test_size = int(n*prob_test) 455 | train_cal_size = int(n-test_size) 456 | train_size = int(2*(train_cal_size//3) + train_cal_size%3) 457 | cal_size = int(train_cal_size//3) 458 | 459 | sizes = {'Train': train_size, 'Cal': cal_size, 'Test':test_size} 460 | 461 | mask_original = data_features.isnull().replace({True: 1, False: 0}) 462 | 463 | vars_categ = data_features.select_dtypes("object").columns 464 | 465 | data_features_categ = data_features[vars_categ] 466 | 467 | vars_categ = data_features.select_dtypes("object").columns 468 | vars_quant = set(data_features.columns).difference(set(vars_categ)) 469 | mask_features = data_features[vars_quant].isnull().replace({True: 1,False: 0}) 470 | 471 | data_features_categ_na = data_features_categ.fillna("-2") 472 | data_features_categ_encoded = pd.DataFrame(index=data_features_categ_na.index) 473 | for var in vars_categ: 474 | if np.sum(data_features_categ_na[var]=="1") > 0: 475 | data_features_categ_encoded[str(var)+"_1"] = data_features_categ_na[var]=="1" 476 | if np.sum(data_features_categ_na[var]=="0") > 0: 477 | data_features_categ_encoded[str(var)+"_0"] = data_features_categ_na[var]=="0" 478 | if np.sum(data_features_categ_na[var]=="-1") > 0: 479 | data_features_categ_encoded[str(var)+"_-1"] = data_features_categ_na[var]=="-1" 480 | if np.sum(data_features_categ_na[var]=="-2") > 0: 481 | data_features_categ_encoded[str(var)+"_-2"] = data_features_categ_na[var]=="-2" 482 | data_features_categ_encoded = data_features_categ_encoded.replace({True:1, False:0}) 483 | data_features = data_features[vars_quant].merge(data_features_categ_encoded, left_index=True, right_index=True) 484 | 485 | mask = data_features.isnull().replace({True: 1, False: 0}) 486 | 487 | col_features = list(data_features.columns) 488 | 489 | d_quant = mask_features.shape[1] 490 | d_aug = data_features.shape[1] 491 | 492 | nb_split = n//(test_size) 493 | 494 | X_missing_train = np.empty((nb_split, sizes['Train'], d_aug)) 495 | M_train = np.empty((nb_split, sizes['Train'], d_aug)) 496 | M_original_train = np.empty((nb_split, sizes['Train'], d)) 497 | M_quant_train = np.empty((nb_split, sizes['Train'], d_quant)) 498 | Y_train = np.empty((nb_split, sizes['Train'])) 499 | 500 | X_missing_cal = np.empty((nb_split, sizes['Cal'], d_aug)) 501 | M_cal = np.empty((nb_split, sizes['Cal'], d_aug)) 502 | M_original_cal = np.empty((nb_split, sizes['Cal'], d)) 503 | M_quant_cal = np.empty((nb_split, sizes['Cal'], d_quant)) 504 | Y_cal = np.empty((nb_split, sizes['Cal'])) 505 | 506 | X_missing_test = np.empty((nb_split, sizes['Test'], d_aug)) 507 | M_test = 
np.empty((nb_split, sizes['Test'], d_aug)) 508 | M_original_test = np.empty((nb_split, sizes['Test'], d)) 509 | M_quant_test = np.empty((nb_split, sizes['Test'], d_quant)) 510 | Y_test = np.empty((nb_split, sizes['Test'])) 511 | 512 | idx = np.array(list(np.arange(n))) 513 | 514 | for k in range(nb_split): 515 | 516 | id_test = idx[(k*sizes['Test']):((k+1)*sizes['Test'])] 517 | idbool = np.full(len(idx), True, dtype=bool) 518 | idbool[id_test] = False 519 | test = list(idx[~idbool]) 520 | traincal = list(idx[idbool]) 521 | train = traincal[:train_size] 522 | cal = traincal[train_size:] 523 | 524 | X_missing_train[k, :, :] = data_features.iloc[train, :] 525 | X_missing_cal[k, :, :] = data_features.iloc[cal, :] 526 | X_missing_test[k,:,:] = data_features.iloc[test, :] 527 | M_train[k, :, :] = mask.iloc[train, :] 528 | M_cal[k, :, :] = mask.iloc[cal, :] 529 | M_test[k, :, :] = mask.iloc[test, :] 530 | M_original_train[k, :, :] = mask_original.iloc[train, :] 531 | M_original_cal[k, :, :] = mask_original.iloc[cal, :] 532 | M_original_test[k, :, :] = mask_original.iloc[test, :] 533 | M_quant_train[k, :, :] = mask_features.iloc[train, :] 534 | M_quant_cal[k, :, :] = mask_features.iloc[cal, :] 535 | M_quant_test[k, :, :] = mask_features.iloc[test, :] 536 | Y_train[k, :] = response[train] 537 | Y_cal[k, :] = response[cal] 538 | Y_test[k, :] = response[test] 539 | 540 | X_missing = {'Train': X_missing_train, 'Cal': X_missing_cal, 'Test': {'iid': X_missing_test}} 541 | M = {'Train': M_train, 'Cal': M_cal, 'Test': {'iid': M_test}} 542 | M_original = {'Train': M_original_train, 'Cal': M_original_cal, 'Test': {'iid': M_original_test}} 543 | M_quant = {'Train': M_quant_train, 'Cal': M_quant_cal, 'Test': {'iid': M_quant_test}} 544 | Y = {'Train': Y_train, 'Cal': Y_cal, 'Test': {'iid': Y_test}} 545 | 546 | return X_missing, M_original, M, M_quant, Y, col_features, sizes 547 | 548 | 549 | def generate_multiple_real_data_MCAR(dataframe, target, train_size, cal_size, params_test, params_missing={}, seed_max=1): 550 | """ 551 | Parameters 552 | ---------- 553 | 554 | seed_max : random seeds for reproducibility, will generate seed_max data-sets, of seeds 0 to seed_max-1 555 | 556 | Returns 557 | ------- 558 | X : covariates values, array of size seedmax x n x dim 559 | Y : response values, array of size seedmax x n 560 | """ 561 | 562 | data_features = dataframe.loc[:, dataframe.columns != target] 563 | response = dataframe.loc[:, target] 564 | 565 | sets = ['Train', 'Cal', 'Test'] 566 | mechanisms_test = params_test['mechanisms_test'] 567 | max_test_size = np.max(params_test['test_size']) 568 | 569 | n = dataframe.shape[0] 570 | 571 | X = dict.fromkeys(sets) 572 | X_missing = dict.fromkeys(sets) 573 | M = dict.fromkeys(sets) 574 | Y = dict.fromkeys(sets) 575 | 576 | for k in range(seed_max): 577 | 578 | random.seed(k) 579 | np.random.seed(k) 580 | 581 | ido = random.sample(range(n), n) 582 | 583 | Xk = np.array(data_features.iloc[ido,:]) 584 | Yk = np.array(response[ido]) 585 | 586 | data = {'X': Xk, 'Y':Yk} 587 | 588 | Xk, Yk = generate_split(train_size, cal_size, params_test, data) 589 | Xk_missing, Mk_missing = generate_MCAR(Xk, params_test, params_missing, seed=k) 590 | 591 | for set in ['Train', 'Cal']: 592 | if k == 0: 593 | X[set] = np.expand_dims(Xk[set], axis=0) 594 | X_missing[set] = np.expand_dims(Xk_missing[set], axis=0) 595 | M[set] = np.expand_dims(Mk_missing[set], axis=0) 596 | Y[set] = Yk[set] 597 | else: 598 | X[set] = np.vstack((X[set], np.expand_dims(Xk[set], axis=0))) 599 | X_missing[set] = 
np.vstack((X_missing[set], np.expand_dims(Xk_missing[set], axis=0))) 600 | M[set] = np.vstack((M[set], np.expand_dims(Mk_missing[set], axis=0))) 601 | Y[set] = np.vstack((Y[set], np.array(Yk[set]))) 602 | 603 | set = 'Test' 604 | if k == 0: 605 | X[set] = dict.fromkeys(mechanisms_test) 606 | X_missing[set] = dict.fromkeys(mechanisms_test) 607 | M[set] = dict.fromkeys(mechanisms_test) 608 | Y[set] = dict.fromkeys(mechanisms_test) 609 | for key in mechanisms_test: 610 | n_test = params_test[key]['test_size'] 611 | X[set][key] = np.expand_dims(Xk[set][n_test], axis=0) 612 | X_missing[set][key] = np.expand_dims(Xk_missing[set][key], axis=0) 613 | M[set][key] = np.expand_dims(Mk_missing[set][key], axis=0) 614 | Y[set][key] = Yk[set][n_test] 615 | else: 616 | for key in mechanisms_test: 617 | n_test = params_test[key]['test_size'] 618 | X[set][key] = np.vstack((X[set][key], np.expand_dims(Xk[set][n_test], axis=0))) 619 | X_missing[set][key] = np.vstack((X_missing[set][key], np.expand_dims(Xk_missing[set][key], axis=0))) 620 | M[set][key] = np.vstack((M[set][key], np.expand_dims(Mk_missing[set][key], axis=0))) 621 | Y[set][key] = np.vstack((Y[set][key], np.array(Yk[set][n_test]))) 622 | 623 | return X, X_missing, M, Y 624 | -------------------------------------------------------------------------------- /imputation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.experimental import enable_iterative_imputer # noqa 3 | from sklearn.impute import SimpleImputer, IterativeImputer 4 | 5 | def impute(data, imputation): 6 | 7 | assert imputation in ['mean', 'constant', 'MICE', 'iterative_ridge'], 'imputation must be constant, mean, iterative_ridge or MICE.' 8 | 9 | X_missing = data['X_missing'] 10 | 11 | if imputation in ['mean', 'constant']: 12 | imputer = SimpleImputer(missing_values=np.nan, strategy=imputation) 13 | elif imputation == 'MICE': 14 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=True) 15 | elif imputation == 'iterative_ridge': 16 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=False) 17 | 18 | n_rep = X_missing['Train'].shape[0] 19 | 20 | X_train_imp = np.empty(X_missing['Train'].shape) 21 | X_cal_imp = np.empty(X_missing['Cal'].shape) 22 | if type(X_missing['Test']) is dict: 23 | multiple_test = True 24 | keys_test = list(X_missing['Test'].keys()) 25 | X_test_imp = dict.fromkeys(keys_test) 26 | for key in keys_test: 27 | X_test_imp[key] = np.empty(X_missing['Test'][key].shape) 28 | else: 29 | multiple_test = False 30 | X_test_imp = np.empty(X_missing['Test'].shape) 31 | 32 | for k in range(n_rep): 33 | 34 | imputer.fit(X_missing['Train'][k,:,:]) 35 | 36 | X_train_imp[k,:,:] = imputer.transform(X_missing['Train'][k,:,:]) 37 | X_cal_imp[k,:,:] = imputer.transform(X_missing['Cal'][k,:,:]) 38 | if multiple_test: 39 | for key in keys_test: 40 | X_test_imp[key][k,:,:] = imputer.transform(X_missing['Test'][key][k,:,:]) 41 | else: 42 | X_test_imp[k,:,:] = imputer.transform(X_missing['Test'][k,:,:]) 43 | 44 | X_imputed = {'Train': X_train_imp, 'Cal': X_cal_imp, 'Test': X_test_imp} 45 | 46 | return X_imputed 47 | 48 | def impute_imputer(X, imputation): 49 | 50 | assert imputation in ['mean', 'constant', 'MICE', 'iterative_ridge'], 'imputation must be constant, mean, iterative_ridge or MICE.' 
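    # impute_imputer mirrors impute() above but only fits: it returns the fitted scikit-learn
    # imputer (SimpleImputer for 'mean'/'constant'; IterativeImputer with sample_posterior=True
    # for 'MICE' and False for 'iterative_ridge'), so that .transform can then be applied to
    # the other splits, as impute() does for each repetition.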
51 | 52 | if imputation in ['mean', 'constant']: 53 | imputer = SimpleImputer(missing_values=np.nan, strategy=imputation) 54 | elif imputation == 'MICE': 55 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=True) 56 | elif imputation == 'iterative_ridge': 57 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=False) 58 | 59 | imputer.fit(X) 60 | 61 | return imputer 62 | -------------------------------------------------------------------------------- /plots/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/plots/.DS_Store -------------------------------------------------------------------------------- /prediction.py: -------------------------------------------------------------------------------- 1 | import files 2 | import utils 3 | import imputation as imp 4 | from tqdm.autonotebook import tqdm 5 | import numpy as np 6 | np.warnings.filterwarnings('ignore') 7 | 8 | from scipy.stats import norm 9 | import functools 10 | 11 | from sklearn.ensemble import RandomForestRegressor 12 | from sklearn.ensemble import GradientBoostingRegressor 13 | 14 | from sklearn.linear_model import LinearRegression 15 | 16 | from sklearn.linear_model import QuantileRegressor 17 | 18 | import torch 19 | import torch.nn as nn 20 | import torch.optim as optim 21 | from torch.utils.data import DataLoader 22 | from torch.utils.data import Dataset 23 | from sklearn.preprocessing import StandardScaler 24 | 25 | import six 26 | import sys 27 | sys.modules['sklearn.externals.six'] = six 28 | 29 | import quantile_forest as qf 30 | 31 | 32 | import copy 33 | 34 | ### The following lines of code are copied from CHR (Sesia and Romano, 2021) public GitHub.` 35 | ### https://github.com/msesia/chr 36 | 37 | class RegressionDataset(Dataset): 38 | 39 | def __init__(self, X_data, y_data): 40 | self.X_data = torch.from_numpy(X_data).float() 41 | self.y_data = torch.from_numpy(y_data).float() 42 | 43 | def __getitem__(self, index): 44 | return self.X_data[index], self.y_data[index] 45 | 46 | def __len__ (self): 47 | return len(self.X_data) 48 | 49 | class NNet(nn.Module): 50 | """ Conditional quantile estimator, formulated as neural net 51 | """ 52 | def __init__(self, quantiles, num_features, num_hidden=64, dropout=0.1, no_crossing=False): 53 | """ Initialization 54 | Parameters 55 | ---------- 56 | quantiles : numpy array of quantile levels (q), each in the range (0,1) 57 | num_features : integer, input signal dimension (p) 58 | num_hidden : integer, hidden layer dimension 59 | dropout : float, dropout rate 60 | no_crossing: boolean, whether to explicitly prevent quantile crossovers 61 | """ 62 | super(NNet, self).__init__() 63 | 64 | self.no_crossing = no_crossing 65 | 66 | self.num_quantiles = len(quantiles) 67 | 68 | # Construct base network 69 | self.base_model = nn.Sequential( 70 | nn.Linear(num_features, num_hidden), 71 | nn.ReLU(), 72 | nn.Dropout(dropout), 73 | nn.Linear(num_hidden, num_hidden), 74 | nn.ReLU(), 75 | nn.Dropout(dropout), 76 | nn.Linear(num_hidden, self.num_quantiles), 77 | ) 78 | self.init_weights() 79 | 80 | def init_weights(self): 81 | """ Initialize the network parameters 82 | """ 83 | for m in self.base_model: 84 | if isinstance(m, nn.Linear): 85 | nn.init.orthogonal_(m.weight) 86 | nn.init.constant_(m.bias, 0) 87 | 88 | def forward(self, x): 89 | """ Run forward pass 90 | """ 91 | x = self.base_model(x) 92 | if 
self.no_crossing: 93 | y,_ = torch.sort(x,1) 94 | else: 95 | y = x 96 | return y 97 | 98 | class AllQuantileLoss(nn.Module): 99 | """ Pinball loss function 100 | """ 101 | def __init__(self, quantiles): 102 | """ Initialize 103 | Parameters 104 | ---------- 105 | quantiles : pytorch vector of quantile levels, each in the range (0,1) 106 | """ 107 | super().__init__() 108 | self.quantiles = quantiles 109 | 110 | def forward(self, preds, target): 111 | """ Compute the pinball loss 112 | Parameters 113 | ---------- 114 | preds : pytorch tensor of estimated labels (n) 115 | target : pytorch tensor of true labels (n) 116 | Returns 117 | ------- 118 | loss : cost function value 119 | """ 120 | #assert not target.requires_grad 121 | #assert preds.size(0) == target.size(0) 122 | 123 | errors = target.unsqueeze(1)-preds 124 | Q = self.quantiles.unsqueeze(0) 125 | loss = torch.max((Q-1.0)*errors, Q*errors).mean() 126 | 127 | return loss 128 | 129 | 130 | class QNet: 131 | """ Fit a neural network (conditional quantile) to training data 132 | """ 133 | def __init__(self, quantiles, num_features, no_crossing=False, dropout=0.2, learning_rate=0.001, 134 | num_epochs=100, batch_size=16, num_hidden=64, random_state=0, calibrate=0, verbose=False): 135 | """ Initialization 136 | Parameters 137 | ---------- 138 | quantiles : numpy array of quantile levels (q), each in the range (0,1) 139 | num_features : integer, input signal dimension (p) 140 | learning_rate : learning rate 141 | random_state : integer, seed used in CV when splitting to train-test 142 | """ 143 | 144 | # Detect whether CUDA is available 145 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 146 | 147 | # Store input (sort the quantiles) 148 | quantiles = np.sort(quantiles) 149 | self.quantiles = torch.from_numpy(quantiles).float().to(self.device) 150 | self.num_features = num_features 151 | 152 | # Define NNet model 153 | self.model = NNet(self.quantiles, self.num_features, num_hidden=num_hidden, dropout=dropout, no_crossing=no_crossing) 154 | self.model.to(self.device) 155 | 156 | # Initialize optimizer 157 | self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate) 158 | 159 | # Initialize loss function 160 | self.loss_func = AllQuantileLoss(self.quantiles) 161 | 162 | # Store variables 163 | self.num_epochs = num_epochs 164 | self.batch_size = batch_size 165 | self.random_state = random_state 166 | self.calibrate = int(calibrate) 167 | 168 | # Initialize training logs 169 | self.loss_history = [] 170 | self.test_loss_history = [] 171 | self.full_loss_history = [] 172 | 173 | # Validation 174 | self.val_period = 10 175 | 176 | self.verbose = verbose 177 | 178 | def fit(self, X, Y, return_loss=False): 179 | 180 | self.scaler = StandardScaler() 181 | self.scaler.fit(X) 182 | X = self.scaler.transform(X) 183 | 184 | Y = Y.flatten().astype(np.float32) 185 | X = X.astype(np.float32) 186 | 187 | dataset = RegressionDataset(X, Y) 188 | num_epochs = self.num_epochs 189 | if self.calibrate>0: 190 | # Train with 80% of samples 191 | n_valid = int(np.round(0.2*X.shape[0])) 192 | loss_stats = [] 193 | for b in range(self.calibrate): 194 | X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=n_valid, random_state=self.random_state+b) 195 | train_dataset = RegressionDataset(X_train, Y_train) 196 | val_dataset = RegressionDataset(X_valid, Y_valid) 197 | loss_stats_tmp = self._fit(train_dataset, num_epochs, val_dataset=val_dataset) 198 | loss_stats.append([loss_stats_tmp['val']]) 199 | # Reset 
model 200 | self.model.init_weights() 201 | 202 | loss_stats = np.matrix(np.concatenate(loss_stats,0)).T 203 | 204 | loss_stats = np.median(loss_stats,1).flatten() 205 | # Find optimal number of epochs 206 | num_epochs = self.val_period*(np.argmin(loss_stats)+1) 207 | loss_stats_cal = loss_stats 208 | 209 | # Train with all samples 210 | loss_stats = self._fit(dataset, num_epochs) 211 | if self.calibrate: 212 | loss_stats = loss_stats_cal 213 | 214 | #if return_loss: 215 | return self 216 | 217 | def _fit(self, train_dataset, num_epochs, val_dataset=None): 218 | batch_size = self.batch_size 219 | 220 | # Initialize data loaders 221 | train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size) 222 | if val_dataset is not None: 223 | val_loader = DataLoader(dataset=val_dataset, batch_size=1) 224 | 225 | num_samples, num_features = train_dataset.X_data.shape 226 | print("Training with {} samples and {} features.". \ 227 | format(num_samples, num_features)) 228 | 229 | loss_stats = {'train': [], "val": []} 230 | 231 | X_train_batch = train_dataset.X_data.to(self.device) 232 | y_train_batch = train_dataset.y_data.to(self.device) 233 | 234 | for e in tqdm(range(1, num_epochs+1)): 235 | 236 | # TRAINING 237 | train_epoch_loss = 0 238 | self.model.train() 239 | 240 | if batch_size<500: 241 | 242 | for X_train_batch, y_train_batch in train_loader: 243 | X_train_batch, y_train_batch = X_train_batch.to(self.device), y_train_batch.to(self.device) 244 | self.optimizer.zero_grad() 245 | 246 | y_train_pred = self.model(X_train_batch).to(self.device) 247 | 248 | train_loss = self.loss_func(y_train_pred, y_train_batch) 249 | 250 | train_loss.backward() 251 | self.optimizer.step() 252 | 253 | train_epoch_loss += train_loss.item() 254 | 255 | else: 256 | self.optimizer.zero_grad() 257 | 258 | y_train_pred = self.model(X_train_batch).to(self.device) 259 | 260 | train_loss = self.loss_func(y_train_pred, y_train_batch) 261 | 262 | train_loss.backward() 263 | self.optimizer.step() 264 | 265 | train_epoch_loss += train_loss.item() 266 | 267 | # VALIDATION 268 | if val_dataset is not None: 269 | if e % self.val_period == 0: 270 | self.model.eval() 271 | with torch.no_grad(): 272 | val_epoch_loss = 0 273 | for X_val_batch, y_val_batch in val_loader: 274 | X_val_batch, y_val_batch = X_val_batch.to(self.device), y_val_batch.to(self.device) 275 | y_val_pred = self.model(X_val_batch).to(self.device) 276 | val_loss = self.loss_func(y_val_pred, y_val_batch) 277 | val_epoch_loss += val_loss.item() 278 | 279 | loss_stats['val'].append(val_epoch_loss/len(val_loader)) 280 | self.model.train() 281 | 282 | else: 283 | loss_stats['val'].append(0) 284 | 285 | if e % self.val_period == 0: 286 | loss_stats['train'].append(train_epoch_loss/len(train_loader)) 287 | 288 | if (e % 10 == 0) and (self.verbose): 289 | if val_dataset is not None: 290 | print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | ', end='') 291 | print(f'Val Loss: {val_epoch_loss/len(val_loader):.5f} | ', flush=True) 292 | else: 293 | print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | ', flush=True) 294 | 295 | return loss_stats 296 | 297 | def predict(self, X): 298 | """ Estimate the label given the features 299 | Parameters 300 | ---------- 301 | x : numpy array of training features (nXp) 302 | Returns 303 | ------- 304 | ret_val : numpy array of predicted labels (n) 305 | """ 306 | X = self.scaler.transform(X) 307 | self.model.eval() 308 | ret_val = 
self.model(torch.from_numpy(X).to(self.device).float().requires_grad_(False))
 309 |         return ret_val.cpu().detach().numpy()
 310 | 
 311 |     def get_quantiles(self):
 312 |         return self.quantiles.cpu().numpy()
 313 | 
 314 | ### The code from CHR ends here; the new code starts below.
 315 | 
 316 | def fit_basemodel(X_train, Y_train, target='Mean', basemodel='Linear', alpha=0.1, params_basemodel={}):
 317 | 
 318 |     assert target in ['Mean', 'Quantiles'], 'target must be Mean or Quantiles.'
 319 |     assert basemodel in ['Linear', 'RF', 'NNet', 'XGBoost'], 'basemodel must be Linear, RF, NNet or XGBoost.'
 320 | 
 321 |     cores = params_basemodel['cores']
 322 | 
 323 |     if basemodel == 'RF':
 324 |         n_estimators = params_basemodel['n_estimators']
 325 |         min_samples_leaf = params_basemodel['min_samples_leaf']
 326 |         max_features = params_basemodel['max_features']
 327 | 
 328 |     if target == 'Mean':
 329 |         if basemodel == 'Linear':
 330 |             trained_model = LinearRegression(n_jobs=cores).fit(X_train,Y_train)
 331 |         elif basemodel == 'RF':
 332 |             trained_model = RandomForestRegressor(n_jobs=cores,n_estimators=n_estimators, min_samples_leaf=min_samples_leaf,
 333 |                                                   max_features=max_features, random_state=1).fit(X_train,Y_train)
 334 |     elif target == 'Quantiles':
 335 |         a_low = alpha/2
 336 |         a_high = 1-alpha/2
 337 |         if basemodel == 'Linear':
 338 |             trained_model = {'q_low': QuantileRegressor(quantile=a_low, solver='highs', alpha=0).fit(X_train,Y_train),
 339 |                              'q_high': QuantileRegressor(quantile=a_high, solver='highs', alpha=0).fit(X_train,Y_train)}
 340 |         elif basemodel == 'RF':
 341 |             trained_model = qf.RandomForestQuantileRegressor(random_state=1, min_samples_leaf=min_samples_leaf,
 342 |                                                              n_estimators=n_estimators, max_features=max_features).fit(X_train,Y_train)
 343 |         elif basemodel == 'XGBoost':
 344 |             trained_model = {'q_low': GradientBoostingRegressor(loss="quantile", alpha=a_low, n_estimators=25).fit(X_train,Y_train),
 345 |                              'q_high': GradientBoostingRegressor(loss="quantile", alpha=a_high, n_estimators=25).fit(X_train,Y_train)}
 346 | 
 347 |         elif basemodel == 'NNet':
 348 | 
 349 |             n_train = len(Y_train)
 350 |             n_features = X_train.shape[1]
 351 |             epochs = 2000
 352 |             lr = 0.0005
 353 |             batch_size = n_train
 354 |             dropout = 0.1
 355 | 
 356 |             grid_quantiles = [alpha/2, 1-alpha/2]
 357 |             trained_model = QNet(grid_quantiles, n_features, no_crossing=True, batch_size=batch_size,
 358 |                                  dropout=dropout, num_epochs=epochs, learning_rate=lr, calibrate=0,
 359 |                                  verbose=False).fit(X_train, Y_train)
 360 | 
 361 |     return trained_model
 362 | 
 363 | def predict_basemodel(fitted_basemodel, X_test, target='Mean', basemodel='Linear', alpha=0.1):
 364 | 
 365 |     assert target in ['Mean', 'Quantiles'], 'target must be Mean or Quantiles.'
 366 |     assert basemodel in ['Linear', 'RF', 'NNet', 'XGBoost'], 'basemodel must be Linear, RF, NNet or XGBoost.'
367 | 368 | if target == 'Mean': 369 | predictions = fitted_basemodel.predict(X_test) 370 | elif target == 'Quantiles': 371 | a_low = alpha/2 372 | a_high = 1-alpha/2 373 | if basemodel == 'Linear': 374 | predictions = {'y_inf': fitted_basemodel['q_low'].predict(X_test), 375 | 'y_sup': fitted_basemodel['q_high'].predict(X_test)} 376 | elif basemodel == 'RF': 377 | both_pred = fitted_basemodel.predict(X_test, quantiles=[a_low, a_high]) 378 | predictions = {'y_inf': both_pred[:, 0], 379 | 'y_sup': both_pred[:, 1]} 380 | elif basemodel == 'XGBoost': 381 | predictions = {'y_inf': fitted_basemodel['q_low'].predict(X_test), 382 | 'y_sup': fitted_basemodel['q_high'].predict(X_test)} 383 | elif basemodel == 'NNet': 384 | both_pred = fitted_basemodel.predict(X_test) 385 | predictions = {'y_inf': both_pred[:, 0], 386 | 'y_sup': both_pred[:, 1]} 387 | 388 | return predictions 389 | 390 | 391 | def quantile_corrected(x, alpha): 392 | n_x = len(x) 393 | if (1-alpha)*(1+1/n_x) > 1: 394 | return np.inf 395 | else: 396 | return np.quantile(x, (1-alpha)*(1+1/n_x)) 397 | 398 | def calibrate_predict_intervals(pred_cal, Y_cal, pred_test, groups_cal=None, groups_test=None, target='Mean', basemodel='Linear', alpha=0.1): 399 | 400 | assert target in ['Mean', 'Quantiles'], 'regression must be Mean or Quantiles.' 401 | assert basemodel in ['Oracle', 'Linear', 'RF', 'NNet', 'XGBoost'], 'regression must be Linear, RF or NNet.' 402 | 403 | if groups_cal == None: 404 | if target == 'Mean': 405 | scores = np.abs(Y_cal-pred_cal) 406 | q_scores = quantile_corrected(scores, alpha) 407 | interval_predictions = {'y_inf': pred_test-q_scores, 408 | 'y_sup': pred_test+q_scores} 409 | elif target == 'Quantiles': 410 | scores = np.maximum(pred_cal['y_inf']-Y_cal, Y_cal-pred_cal['y_sup']) 411 | q_scores = quantile_corrected(scores, alpha) 412 | interval_predictions = {'y_inf': pred_test['y_inf']-q_scores, 413 | 'y_sup': pred_test['y_sup']+q_scores} 414 | else: 415 | if target == 'Mean': 416 | scores = np.abs(Y_cal-pred_cal) 417 | elif target == 'Quantiles': 418 | scores = np.maximum(pred_cal['y_inf']-Y_cal, Y_cal-pred_cal['y_sup']) 419 | 420 | scores_sorted = np.array(scores)[np.array(groups_cal).argsort()] 421 | ids = np.unique(np.array(groups_cal)[np.array(groups_cal).argsort()], return_index=True)[0] 422 | inds = np.unique(np.array(groups_cal)[np.array(groups_cal).argsort()], return_index=True)[1] 423 | scores_splitted = np.split(scores_sorted, inds)[1:] 424 | 425 | q_scores_cal = list(map(functools.partial(quantile_corrected, alpha=alpha), scores_splitted)) 426 | 427 | missing_groups = np.array(groups_test)[~np.isin(groups_test, ids)] 428 | if (len(missing_groups) > 0): 429 | ids = np.concatenate((ids, missing_groups)) 430 | q_scores_cal = np.concatenate((q_scores_cal, np.full(len(missing_groups),np.inf))) 431 | 432 | inds_test = list(map(list(ids).index, groups_test)) 433 | 434 | q_scores_test = np.array(q_scores_cal)[np.array(inds_test)] 435 | 436 | if target == 'Mean': 437 | interval_predictions = {'y_inf': pred_test-q_scores_test, 438 | 'y_sup': pred_test+q_scores_test} 439 | elif target == 'Quantiles': 440 | interval_predictions = {'y_inf': pred_test['y_inf']-q_scores_test, 441 | 'y_sup': pred_test['y_sup']+q_scores_test} 442 | 443 | return interval_predictions 444 | 445 | def calibrate_masking_predict_intervals(fitted_basemodel, imputer, X_cal, M_cal, Y_cal, 446 | X_mis_test, features_test, M_test, mask, 447 | groups_test, exact=True, target='Quantiles', 448 | basemodel='Linear', alpha=0.1): 449 | 450 | assert target in 
['Quantiles'], 'regression must be Quantiles.' 451 | assert basemodel in ['Linear', 'RF', 'NNet', 'XGBoost'], 'regression must be Linear, RF or NNet.' 452 | 453 | patterns = np.unique(M_test, axis=0) 454 | ids = list(map(utils.pattern_to_id, patterns.astype(int))) 455 | 456 | n_test = features_test.shape[0] 457 | q_scores_test = np.empty(n_test) 458 | 459 | if exact == False: 460 | pred_test = {'y_inf': np.empty(n_test), 461 | 'y_sup': np.empty(n_test)} 462 | 463 | for idp, id_pattern in enumerate(ids): 464 | 465 | X_imp_cal_masking = copy.deepcopy(X_cal) 466 | M_cal_masking = copy.deepcopy(M_cal) 467 | Y_cal_masking = copy.deepcopy(Y_cal) 468 | 469 | pattern = patterns[idp] 470 | 471 | empty = False 472 | 473 | if exact == True: 474 | ind_subsample = np.all(M_cal[:, pattern == 0] == 0, axis=1) 475 | if np.sum(ind_subsample) == 0: 476 | empty = True 477 | X_imp_cal_masking = X_imp_cal_masking[ind_subsample, :] 478 | M_cal_masking = M_cal_masking[ind_subsample, :] 479 | Y_cal_masking = Y_cal_masking[ind_subsample] 480 | 481 | if X_imp_cal_masking.shape[1] > len(pattern): 482 | nb = X_imp_cal_masking.shape[1] - len(pattern) 483 | pattern_ext = np.append(pattern, np.full(nb,0)) 484 | else: 485 | pattern_ext = pattern 486 | 487 | X_imp_cal_masking[:, pattern_ext == 1] = np.nan 488 | M_cal_masking[:, pattern == 1] = 1 489 | 490 | if not empty: 491 | X_imp_cal_masking = imputer.transform(X_imp_cal_masking) 492 | 493 | if mask == 'Yes': 494 | features_cal = np.concatenate((X_imp_cal_masking, M_cal_masking), axis=1) 495 | else: 496 | features_cal = X_imp_cal_masking 497 | 498 | cal_predictions = predict_basemodel(fitted_basemodel, features_cal, target, basemodel, alpha) 499 | 500 | scores = np.maximum(cal_predictions['y_inf']-Y_cal_masking, Y_cal_masking-cal_predictions['y_sup']) 501 | 502 | if exact == True: 503 | 504 | if not empty: 505 | 506 | q_scores_cal = quantile_corrected(scores, alpha=alpha) 507 | 508 | q_scores_test[(np.array(groups_test) == id_pattern).flatten()] = q_scores_cal 509 | 510 | else: 511 | q_scores_test[(np.array(groups_test) == id_pattern).flatten()] = np.inf 512 | 513 | else: 514 | 515 | X_to_pred = copy.deepcopy(X_mis_test[(np.array(groups_test) == id_pattern).flatten(), :]) 516 | M_to_pred = copy.deepcopy(M_test[(np.array(groups_test) == id_pattern).flatten(), :]) 517 | 518 | n_current = X_to_pred.shape[0] 519 | 520 | patterns_cal = np.unique(M_cal_masking, axis=0) 521 | ids_cal = list(map(utils.pattern_to_id, patterns_cal.astype(int))) 522 | 523 | groups_cal_masking = list(map(utils.pattern_to_id, M_cal_masking.astype(int))) 524 | 525 | nb_cal = len(scores) 526 | 527 | all_preds = {'y_inf': np.empty((n_current, nb_cal)), 528 | 'y_sup': np.empty((n_current, nb_cal))} 529 | 530 | for idp_cal, id_pattern_cal in enumerate(ids_cal): 531 | 532 | idx_cal_masking = (np.array(groups_cal_masking) == id_pattern_cal).flatten() 533 | nb_mask = np.sum(idx_cal_masking) 534 | 535 | all_preds['y_inf'][:, idx_cal_masking] = np.tile(scores[idx_cal_masking], (n_current, 1)) 536 | all_preds['y_sup'][:, idx_cal_masking] = np.tile(scores[idx_cal_masking], (n_current, 1)) 537 | 538 | pattern_masking = patterns_cal[idp_cal] 539 | 540 | X_to_pred_masking = copy.deepcopy(X_to_pred) 541 | M_to_pred_masking = copy.deepcopy(M_to_pred) 542 | X_to_pred_masking[:, pattern_masking == 1] = np.nan 543 | M_to_pred_masking[:, pattern_masking == 1] = 1 544 | 545 | X_to_pred_masking = imputer.transform(X_to_pred_masking) 546 | 547 | if mask == 'Yes': 548 | features_test_pattern = 
np.concatenate((X_to_pred_masking, M_to_pred_masking), axis=1) 549 | else: 550 | features_test_pattern = X_to_pred_masking 551 | 552 | preds_k = predict_basemodel(fitted_basemodel, features_test_pattern, target, basemodel, alpha) 553 | 554 | all_preds['y_inf'][:, idx_cal_masking] = -np.tile(preds_k['y_inf'], (nb_mask, 1)).T + all_preds['y_inf'][:, idx_cal_masking] 555 | all_preds['y_sup'][:, idx_cal_masking] = np.tile(preds_k['y_sup'], (nb_mask, 1)).T + all_preds['y_sup'][:, idx_cal_masking] 556 | 557 | 558 | if (1 - alpha) * (1 + 1 / nb_cal) > 1: 559 | pred_test['y_inf'][(np.array(groups_test) == id_pattern).flatten()] = [-np.inf] * n_current 560 | pred_test['y_sup'][(np.array(groups_test) == id_pattern).flatten()] = [np.inf] * n_current 561 | else: 562 | pred_test['y_inf'][(np.array(groups_test) == id_pattern).flatten()] = -np.quantile(all_preds['y_inf'], (1 - alpha) * (1 + 1 / nb_cal), axis=1) 563 | pred_test['y_sup'][(np.array(groups_test) == id_pattern).flatten()] = np.quantile(all_preds['y_sup'], (1 - alpha) * (1 + 1 / nb_cal), axis=1) 564 | 565 | if exact == True: 566 | pred_test = predict_basemodel(fitted_basemodel, features_test, target, basemodel, alpha) 567 | interval_predictions = {'y_inf': pred_test['y_inf']-q_scores_test, 568 | 'y_sup': pred_test['y_sup']+q_scores_test} 569 | else: 570 | interval_predictions = {'y_inf': pred_test['y_inf'], 571 | 'y_sup': pred_test['y_sup']} 572 | 573 | return interval_predictions 574 | 575 | def compute_mean_mis_given_obs(X_obs_in_mis, mean_mis, cov_mis_obs, cov_obs_inv, mean_obs): 576 | return mean_mis + np.dot(cov_mis_obs,np.dot(cov_obs_inv, X_obs_in_mis - mean_obs)) 577 | 578 | def oracle_pattern(pattern, X_test, M_test, beta, mean, cov, alpha=0.1): 579 | 580 | a_low = alpha/2 581 | a_high = 1-alpha/2 582 | 583 | pattern_id = utils.pattern_to_id(pattern.astype(int)) 584 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int))) 585 | X_pattern = X_test[np.where(np.array(M_test_id) == pattern_id)] 586 | 587 | pattern = np.array(list(map(bool, pattern))) 588 | 589 | beta_mis = beta[pattern] 590 | beta_obs = beta[~pattern] 591 | 592 | mean_mis = mean[pattern] 593 | mean_obs = mean[~pattern] 594 | 595 | X_obs_in_mis = X_pattern[:,~pattern] 596 | 597 | cov_obs = cov[~pattern][:,~pattern] 598 | cov_obs_inv = np.linalg.pinv(cov_obs) 599 | 600 | cov_mis = cov[pattern][:,pattern] 601 | cov_mis_obs = cov[pattern][:,~pattern] 602 | 603 | mean_mis_given_obs = np.array(list(map(functools.partial(compute_mean_mis_given_obs, 604 | mean_mis=mean_mis, cov_mis_obs=cov_mis_obs, 605 | cov_obs_inv=cov_obs_inv, mean_obs=mean_obs), X_obs_in_mis))) 606 | 607 | beta_mis_mean_mis = np.array(list(map(functools.partial(np.dot, beta_mis), mean_mis_given_obs))) 608 | beta_obs_X_obs = np.array(list(map(functools.partial(np.dot, beta_obs), X_obs_in_mis))) 609 | 610 | cov_mis_given_obs = cov_mis - np.dot(cov_mis_obs,np.dot(cov_obs_inv, cov_mis_obs.T)) 611 | 612 | q_low = beta_obs_X_obs + beta_mis_mean_mis + norm.ppf(a_low)*np.sqrt(np.dot(beta_mis, np.dot(cov_mis_given_obs , beta_mis.T))+1) 613 | q_high = beta_obs_X_obs + beta_mis_mean_mis + norm.ppf(a_high)*np.sqrt(np.dot(beta_mis, np.dot(cov_mis_given_obs , beta_mis.T))+1) 614 | 615 | return {'q_low': q_low, 'q_high': q_high} 616 | 617 | def oracle(M_test, X_test, beta, mean, cov, alpha=0.1): 618 | 619 | n_test = X_test.shape[0] 620 | 621 | interval_predictions = {'y_inf': np.empty(n_test), 622 | 'y_sup': np.empty(n_test)} 623 | 624 | patterns = np.unique(M_test, axis=0) 625 | 626 | oracles_intervals_per_pattern = 
list(map(functools.partial(oracle_pattern, 627 | X_test=X_test, M_test=M_test, beta=beta, 628 | mean=mean, cov=cov, alpha=alpha), patterns)) 629 | 630 | for idp, pattern in enumerate(patterns): 631 | 632 | pattern_id = utils.pattern_to_id(pattern.astype(int)) 633 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int))) 634 | interval_predictions['y_inf'][np.where(np.array(M_test_id) == pattern_id)] = oracles_intervals_per_pattern[idp]['q_low'] 635 | interval_predictions['y_sup'][np.where(np.array(M_test_id) == pattern_id)] = oracles_intervals_per_pattern[idp]['q_high'] 636 | 637 | return interval_predictions 638 | 639 | def oracle_len_pattern(pattern, beta, cov, alpha=0.1): 640 | 641 | pattern = np.array(list(map(bool, pattern))) 642 | 643 | beta_mis = beta[pattern] 644 | 645 | cov_obs = cov[~pattern][:,~pattern] 646 | cov_obs_inv = np.linalg.pinv(cov_obs) 647 | 648 | cov_mis = cov[pattern][:,pattern] 649 | cov_mis_obs = cov[pattern][:,~pattern] 650 | 651 | cov_mis_given_obs = cov_mis - np.dot(cov_mis_obs,np.dot(cov_obs_inv, cov_mis_obs.T)) 652 | 653 | length = 2 * norm.ppf(1-alpha/2) * np.sqrt(np.dot(beta_mis, np.dot(cov_mis_given_obs, beta_mis.T)) + 1) 654 | 655 | return length 656 | 657 | def oracle_mean_pattern(pattern, X_test, M_test, beta, mean, cov): 658 | 659 | pattern_id = utils.pattern_to_id(pattern.astype(int)) 660 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int))) 661 | X_pattern = X_test[np.where(np.array(M_test_id) == pattern_id)] 662 | 663 | pattern = np.array(list(map(bool, pattern))) 664 | 665 | beta_mis = beta[pattern] 666 | beta_obs = beta[~pattern] 667 | 668 | mean_mis = mean[pattern] 669 | mean_obs = mean[~pattern] 670 | 671 | X_obs_in_mis = X_pattern[:,~pattern] 672 | 673 | cov_obs = cov[~pattern][:,~pattern] 674 | cov_obs_inv = np.linalg.pinv(cov_obs) 675 | 676 | cov_mis = cov[pattern][:,pattern] 677 | cov_mis_obs = cov[pattern][:,~pattern] 678 | 679 | mean_mis_given_obs = np.array(list(map(functools.partial(compute_mean_mis_given_obs, 680 | mean_mis=mean_mis, cov_mis_obs=cov_mis_obs, 681 | cov_obs_inv=cov_obs_inv, mean_obs=mean_obs), X_obs_in_mis))) 682 | 683 | beta_mis_mean_mis = np.array(list(map(functools.partial(np.dot, beta_mis), mean_mis_given_obs))) 684 | beta_obs_X_obs = np.array(list(map(functools.partial(np.dot, beta_obs), X_obs_in_mis))) 685 | 686 | mean_pattern = beta_obs_X_obs + beta_mis_mean_mis 687 | 688 | return mean_pattern 689 | 690 | def oracle_mean(M_test, X_test, beta, mean, cov): 691 | 692 | n_test = X_test.shape[0] 693 | 694 | predictions = np.empty(n_test) 695 | 696 | patterns = np.unique(M_test, axis=0) 697 | 698 | oracles_mean_per_pattern = list(map(functools.partial(oracle_mean_pattern, 699 | X_test=X_test, M_test=M_test, beta=beta, 700 | mean=mean, cov=cov), patterns)) 701 | 702 | for idp, pattern in enumerate(patterns): 703 | 704 | pattern_id = utils.pattern_to_id(pattern.astype(int)) 705 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int))) 706 | predictions[np.where(np.array(M_test_id) == pattern_id)] = oracles_mean_per_pattern[idp] 707 | 708 | return predictions 709 | 710 | def run_experiments(data, alpha, methods, basemodels, params_basemodel, masks, protections, exacts=['False'], imputation=None, 711 | params_reg={}, params_noise={}, 712 | parent_results='results'): 713 | 714 | d = data['X_missing']['Train'].shape[2] 715 | n_rep = data['X_missing']['Train'].shape[0] 716 | 717 | name_pipeline = [] 718 | for method in methods: 719 | for basemodel in basemodels: 720 | for mask in masks: 721 | for 
protection in protections: 722 | if method == 'CQR_MDA': 723 | for exact in exacts: 724 | name_temp = files.get_name_method(method, basemodel, mask, protection, exact) 725 | if not name_temp in name_pipeline: 726 | name_pipeline.append(name_temp) 727 | else: 728 | name_temp = files.get_name_method(method, basemodel, mask, protection) 729 | if not name_temp in name_pipeline: 730 | name_pipeline.append(name_temp) 731 | 732 | results_methods = dict.fromkeys(name_pipeline) 733 | 734 | for k in tqdm(range(n_rep)): 735 | 736 | if 'X' in list(data.keys()): 737 | X_train = data['X']['Train'][k,:,:] 738 | X_cal = data['X']['Cal'][k,:,:] 739 | X_mis_train = data['X_missing']['Train'][k,:,:] 740 | X_mis_cal = data['X_missing']['Cal'][k,:,:] 741 | X_imp_train = data['X_imp']['Train'][k,:,:] 742 | X_imp_cal = data['X_imp']['Cal'][k,:,:] 743 | M_train = data['M']['Train'][k,:,:] 744 | M_cal = data['M']['Cal'][k,:,:] 745 | Y_train = data['Y']['Train'][k,:] 746 | Y_cal = data['Y']['Cal'][k,:] 747 | 748 | keys_test = list(data['X_missing']['Test'].keys()) 749 | X_test = dict.fromkeys(keys_test) 750 | X_mis_test = dict.fromkeys(keys_test) 751 | X_imp_test = dict.fromkeys(keys_test) 752 | M_test = dict.fromkeys(keys_test) 753 | Y_test = dict.fromkeys(keys_test) 754 | for key in keys_test: 755 | if 'X' in list(data.keys()): 756 | X_test[key] = data['X']['Test'][key][k,:,:] 757 | X_mis_test[key] = data['X_missing']['Test'][key][k,:,:] 758 | X_imp_test[key] = data['X_imp']['Test'][key][k,:,:] 759 | M_test[key] = data['M']['Test'][key][k,:,:] 760 | Y_test[key] = data['Y']['Test'][key][k,:] 761 | 762 | trained_models = {} 763 | 764 | for method in methods: 765 | 766 | if method in ['Oracle', 'Oracle_mean']: 767 | if 'mean' in params_reg: 768 | mean = params_reg['mean'] 769 | else: 770 | mean = 1 771 | if 'phi' in params_reg: 772 | phi = params_reg['phi'] 773 | else: 774 | phi = 0.8 775 | mean = np.full(d, mean) 776 | cov = np.full((d,d),phi)+(1-phi)*np.eye(d) 777 | if 'beta' not in params_reg or params_reg['beta'] is None: 778 | beta = np.full(d,1) 779 | else: 780 | beta = params_reg['beta'] 781 | 782 | if method == 'Oracle': 783 | preds = dict.fromkeys(keys_test) 784 | for key in keys_test: 785 | pred = oracle(M_test[key], X_test[key], beta, mean, cov, alpha=alpha) 786 | preds[key] = pred 787 | 788 | elif method == 'Oracle_mean': 789 | cal_predictions = oracle_mean(M_cal, X_cal, beta, mean, cov) 790 | 791 | test_predictions = dict.fromkeys(keys_test) 792 | for key in keys_test: 793 | test_predictions[key] = oracle_mean(M_test[key], X_test[key], beta, mean, cov) 794 | 795 | for protection in protections: 796 | pipeline = files.get_name_method(method, basemodel=None, mask='No', protection=protection) 797 | if protection == 'No': 798 | groups_cal = None 799 | groups_test = dict.fromkeys(keys_test) 800 | for key in keys_test: 801 | groups_test[key] = None 802 | elif protection == 'Pattern': 803 | groups_cal = list(map(utils.pattern_to_id, M_cal.astype(int))) 804 | groups_test = dict.fromkeys(keys_test) 805 | for key in keys_test: 806 | groups_test[key] = list(map(utils.pattern_to_id, M_test[key].astype(int))) 807 | elif protection == 'Pattern_Size': 808 | groups_cal = list(map(utils.pattern_to_size, M_cal.astype(int))) 809 | groups_test = dict.fromkeys(keys_test) 810 | for key in keys_test: 811 | groups_test[key] = list(map(utils.pattern_to_size, M_test[key].astype(int))) 812 | preds = dict.fromkeys(keys_test) 813 | for key in keys_test: 814 | 815 | preds[key] = calibrate_predict_intervals(cal_predictions, Y_cal, 
test_predictions[key], 816 | groups_cal=groups_cal, groups_test=groups_test[key], 817 | target='Mean', 818 | basemodel='Oracle', alpha=alpha) 819 | 820 | results = results_methods[pipeline] 821 | if results_methods[pipeline] == None: 822 | results = dict.fromkeys(keys_test) 823 | for key in keys_test: 824 | results[key] = {'Y_inf': np.array(preds[key]['y_inf']), 825 | 'Y_sup': np.array(preds[key]['y_sup'])} 826 | else: 827 | for key in keys_test: 828 | results[key]['Y_inf'] = np.vstack( 829 | (results[key]['Y_inf'], np.array(preds[key]['y_inf']))) 830 | results[key]['Y_sup'] = np.vstack( 831 | (results[key]['Y_sup'], np.array(preds[key]['y_sup']))) 832 | results_methods[pipeline] = results 833 | 834 | elif method == 'CQR_MDA': 835 | 836 | assert imputation is not None, "imputation must be specified for Masking" 837 | 838 | target = 'Quantiles' 839 | 840 | imputer_masking = imp.impute_imputer(X_mis_train, imputation) 841 | 842 | X_imp_train_masking = imputer_masking.transform(X_mis_train) 843 | X_imp_test_masking = dict.fromkeys(keys_test) 844 | 845 | for key in keys_test: 846 | X_imp_test_masking[key] = imputer_masking.transform(X_mis_test[key]) 847 | 848 | if target in trained_models.keys(): 849 | trained_models_target = trained_models[target] 850 | else: 851 | trained_models[target] = {} 852 | trained_models_target = None 853 | 854 | for basemodel in basemodels: 855 | 856 | if trained_models_target is not None and basemodel in trained_models_target.keys(): 857 | trained_models_target_basemodel = trained_models_target[basemodel] 858 | else: 859 | trained_models[target][basemodel] = {} 860 | trained_models_target_basemodel = None 861 | 862 | for mask in masks: 863 | 864 | if mask == 'Yes': 865 | name_mask = 'mask' 866 | features_train = np.concatenate((X_imp_train_masking, M_train), axis=1) 867 | features_test = dict.fromkeys(keys_test) 868 | for key in keys_test: 869 | features_test[key] = np.concatenate((X_imp_test_masking[key], M_test[key]), axis=1) 870 | else: 871 | name_mask = 'no_mask' 872 | features_train = X_imp_train_masking 873 | features_test = X_imp_test_masking 874 | 875 | if trained_models_target_basemodel is not None and name_mask in trained_models_target_basemodel.keys(): 876 | trained_models_target_basemodel_mask = trained_models_target_basemodel[name_mask] 877 | else: 878 | trained_models[target][basemodel][name_mask] = {} 879 | trained_models_target_basemodel_mask = None 880 | 881 | if trained_models_target_basemodel_mask is None: 882 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha, 883 | params_basemodel=params_basemodel) 884 | trained_models[target][basemodel][name_mask] = trained_model 885 | else: 886 | trained_model = trained_models_target_basemodel_mask 887 | 888 | groups_test = dict.fromkeys(keys_test) 889 | for key in keys_test: 890 | groups_test[key] = list(map(utils.pattern_to_id, M_test[key].astype(int))) 891 | 892 | for exact in exacts: 893 | pipeline = files.get_name_method(method, basemodel, mask, exact=exact) 894 | 895 | preds = dict.fromkeys(keys_test) 896 | for key in keys_test: 897 | pred = calibrate_masking_predict_intervals(trained_model, imputer_masking, 898 | X_mis_cal, M_cal, Y_cal, 899 | X_mis_test[key], features_test[key], M_test[key], mask, 900 | groups_test=groups_test[key], exact=exact, 901 | target=target, basemodel=basemodel, alpha=alpha) 902 | preds[key] = pred 903 | 904 | results = results_methods[pipeline] 905 | 906 | if results_methods[pipeline] == None: 907 | results = 
dict.fromkeys(keys_test) 908 | for key in keys_test: 909 | results[key] = {'Y_inf': np.array(preds[key]['y_inf']), 'Y_sup': np.array(preds[key]['y_sup'])} 910 | else: 911 | for key in keys_test: 912 | results[key]['Y_inf'] = np.vstack((results[key]['Y_inf'],np.array(preds[key]['y_inf']))) 913 | results[key]['Y_sup'] = np.vstack((results[key]['Y_sup'],np.array(preds[key]['y_sup']))) 914 | results_methods[pipeline] = results 915 | 916 | else: 917 | 918 | if method == 'SCP': 919 | target = 'Mean' 920 | elif method in ['CQR', 'QR', 'QR_TrainCal']: 921 | target = 'Quantiles' 922 | 923 | if method in ['QR', 'QR_TrainCal']: 924 | conformalized = False 925 | else: 926 | conformalized = True 927 | 928 | if target in trained_models.keys(): 929 | trained_models_target = trained_models[target] 930 | else: 931 | trained_models[target] = {} 932 | trained_models_target = None 933 | 934 | for basemodel in basemodels: 935 | 936 | if method != 'QR_TrainCal' and trained_models_target is not None and basemodel in trained_models_target.keys(): 937 | trained_models_target_basemodel = trained_models_target[basemodel] 938 | elif method != 'QR_TrainCal': 939 | trained_models[target][basemodel] = {} 940 | trained_models_target_basemodel = None 941 | 942 | for mask in masks: 943 | 944 | if mask == 'Yes': 945 | name_mask = 'mask' 946 | features_train = np.concatenate((X_imp_train, M_train), axis=1) 947 | features_cal = np.concatenate((X_imp_cal, M_cal), axis=1) 948 | features_test = dict.fromkeys(keys_test) 949 | for key in keys_test: 950 | features_test[key] = np.concatenate((X_imp_test[key], M_test[key]), axis=1) 951 | else: 952 | name_mask = 'no_mask' 953 | features_train = X_imp_train 954 | features_cal = X_imp_cal 955 | features_test = X_imp_test 956 | 957 | if method == 'QR_TrainCal': 958 | features_train = np.concatenate((features_train, features_cal), axis=0) 959 | Y_traincal = np.concatenate((Y_train, Y_cal), axis=0) 960 | trained_model = fit_basemodel(features_train, Y_traincal, target=target, basemodel=basemodel, 961 | alpha=alpha, 962 | params_basemodel=params_basemodel) 963 | else: 964 | if trained_models_target_basemodel is not None and name_mask in trained_models_target_basemodel.keys(): 965 | trained_models_target_basemodel_mask = trained_models_target_basemodel[name_mask] 966 | else: 967 | trained_models[target][basemodel][name_mask] = {} 968 | trained_models_target_basemodel_mask = None 969 | 970 | if trained_models_target_basemodel_mask is None: 971 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha, 972 | params_basemodel=params_basemodel) 973 | trained_models[target][basemodel][name_mask] = trained_model 974 | else: 975 | trained_model = trained_models_target_basemodel_mask 976 | 977 | cal_predictions = predict_basemodel(trained_model, features_cal, target, basemodel, alpha) 978 | 979 | test_predictions = dict.fromkeys(keys_test) 980 | for key in keys_test: 981 | test_predictions[key] = predict_basemodel(trained_model, features_test[key], target, basemodel, alpha) 982 | 983 | if conformalized: 984 | 985 | for protection in protections: 986 | pipeline = files.get_name_method(method, basemodel, mask, protection) 987 | if protection == 'No': 988 | groups_cal = None 989 | groups_test = dict.fromkeys(keys_test) 990 | for key in keys_test: 991 | groups_test[key] = None 992 | elif protection == 'Pattern': 993 | groups_cal = list(map(utils.pattern_to_id, M_cal.astype(int))) 994 | groups_test = dict.fromkeys(keys_test) 995 | for key in keys_test: 996 | 
groups_test[key] = list(map(utils.pattern_to_id, M_test[key].astype(int))) 997 | elif protection == 'Pattern_Size': 998 | groups_cal = list(map(utils.pattern_to_size, M_cal.astype(int))) 999 | groups_test = dict.fromkeys(keys_test) 1000 | for key in keys_test: 1001 | groups_test[key] = list(map(utils.pattern_to_size, M_test[key].astype(int))) 1002 | preds = dict.fromkeys(keys_test) 1003 | for key in keys_test: 1004 | preds[key] = calibrate_predict_intervals(cal_predictions, Y_cal, test_predictions[key], 1005 | groups_cal=groups_cal, groups_test=groups_test[key], 1006 | target=target, 1007 | basemodel=basemodel, alpha=alpha) 1008 | 1009 | results = results_methods[pipeline] 1010 | if results_methods[pipeline] == None: 1011 | results = dict.fromkeys(keys_test) 1012 | for key in keys_test: 1013 | results[key] = {'Y_inf': np.array(preds[key]['y_inf']), 'Y_sup': np.array(preds[key]['y_sup'])} 1014 | else: 1015 | for key in keys_test: 1016 | results[key]['Y_inf'] = np.vstack((results[key]['Y_inf'],np.array(preds[key]['y_inf']))) 1017 | results[key]['Y_sup'] = np.vstack((results[key]['Y_sup'],np.array(preds[key]['y_sup']))) 1018 | results_methods[pipeline] = results 1019 | 1020 | else: 1021 | interval_predictions = dict.fromkeys(keys_test) 1022 | for key in keys_test: 1023 | interval_predictions[key] = {'y_inf': test_predictions[key]['y_inf'], 1024 | 'y_sup': test_predictions[key]['y_sup']} 1025 | pipeline = files.get_name_method(method, basemodel, mask, conformalized) 1026 | results = results_methods[pipeline] 1027 | if results_methods[pipeline] == None: 1028 | results = dict.fromkeys(keys_test) 1029 | for key in keys_test: 1030 | results[key] = {'Y_inf': np.array(interval_predictions[key]['y_inf']), 1031 | 'Y_sup': np.array(interval_predictions[key]['y_sup'])} 1032 | else: 1033 | for key in keys_test: 1034 | results[key]['Y_inf'] = np.vstack((results[key]['Y_inf'],np.array(interval_predictions[key]['y_inf']))) 1035 | results[key]['Y_sup'] = np.vstack((results[key]['Y_sup'],np.array(interval_predictions[key]['y_sup']))) 1036 | results_methods[pipeline] = results 1037 | 1038 | return results_methods, name_pipeline 1039 | 1040 | def run_real_experiments(data, alpha, methods, basemodels, params_basemodel, masks, conformalized, protections, 1041 | n_rep, parent_results='results', imputation=None, data_missing=None, exact=True): 1042 | 1043 | test_size = len(data['Y']['Test'][0,:]) 1044 | d = data['X_imp']['Train'].shape[2] 1045 | 1046 | name_pipeline = [] 1047 | for method in methods: 1048 | for basemodel in basemodels: 1049 | for mask in masks: 1050 | for protection in protections: 1051 | name_temp = files.get_name_method(method, basemodel, mask, protection, conformalized, exact) 1052 | if not name_temp in name_pipeline: 1053 | name_pipeline.append(name_temp) 1054 | 1055 | results_methods = dict.fromkeys(name_pipeline) 1056 | 1057 | if 'M_original' in data.keys(): 1058 | mask_original = True 1059 | else: 1060 | mask_original = False 1061 | 1062 | for k in tqdm(range(n_rep)): 1063 | 1064 | X_imp_train = data['X_imp']['Train'][k,:,:] 1065 | X_imp_cal = data['X_imp']['Cal'][k,:,:] 1066 | X_imp_test = data['X_imp']['Test'][k,:,:] 1067 | if mask_original: 1068 | M_original_train = data['M_original']['Train'][k,:,:] 1069 | M_original_cal = data['M_original']['Cal'][k,:,:] 1070 | M_original_test = data['M_original']['Test'][k,:,:] 1071 | M_train = data['M']['Train'][k,:,:] 1072 | M_cal = data['M']['Cal'][k,:,:] 1073 | M_test = data['M']['Test'][k,:,:] 1074 | Y_train = data['Y']['Train'][k,:] 1075 | Y_cal 
= data['Y']['Cal'][k,:] 1076 | Y_test = data['Y']['Test'][k,:] 1077 | 1078 | for method in methods: 1079 | if method == 'CQR_MDA': 1080 | 1081 | assert imputation is not None, "imputation must be specified for Masking" 1082 | 1083 | target = 'Quantiles' 1084 | 1085 | X_mis_train = data_missing['X_missing']['Train'][k,:,:] 1086 | X_mis_cal = data_missing['X_missing']['Cal'][k,:,:] 1087 | X_mis_test = data_missing['X_missing']['Test'][k,:,:] 1088 | 1089 | imputer_masking = imp.impute_imputer(X_mis_train, imputation) 1090 | 1091 | X_imp_train_masking = imputer_masking.transform(X_mis_train) 1092 | X_imp_test_masking = imputer_masking.transform(X_mis_test) 1093 | 1094 | for basemodel in basemodels: 1095 | for mask in masks: 1096 | if mask == 'Yes': 1097 | features_train = np.concatenate((X_imp_train_masking, M_train), axis=1) 1098 | features_test = np.concatenate((X_imp_test_masking, M_test), axis=1) 1099 | else: 1100 | features_train = X_imp_train_masking 1101 | features_test = X_imp_test_masking 1102 | 1103 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha, 1104 | params_basemodel=params_basemodel) 1105 | pipeline = files.get_name_method(method, basemodel, mask, exact=exact) 1106 | groups_test = list(map(utils.pattern_to_id, M_test.astype(int))) 1107 | 1108 | pred = calibrate_masking_predict_intervals(trained_model, imputer_masking, 1109 | X_mis_cal, M_cal, Y_cal, features_test, 1110 | M_test, mask, 1111 | groups_test=groups_test, exact=exact, target=target, 1112 | basemodel=basemodel, alpha=alpha) 1113 | results = results_methods[pipeline] 1114 | if results_methods[pipeline] == None: 1115 | results = {'Y_inf': np.array(pred['y_inf']), 'Y_sup': np.array(pred['y_sup'])} 1116 | else: 1117 | results['Y_inf'] = np.vstack((results['Y_inf'],np.array(pred['y_inf']))) 1118 | results['Y_sup'] = np.vstack((results['Y_sup'],np.array(pred['y_sup']))) 1119 | results_methods[pipeline] = results 1120 | else: 1121 | if method == 'SCP': 1122 | target = 'Mean' 1123 | elif method in ['CQR', 'QR']: 1124 | target = 'Quantiles' 1125 | for basemodel in basemodels: 1126 | for mask in masks: 1127 | if mask == 'Yes': 1128 | features_train = np.concatenate((X_imp_train, M_train), axis=1) 1129 | features_cal = np.concatenate((X_imp_cal, M_cal), axis=1) 1130 | features_test = np.concatenate((X_imp_test, M_test), axis=1) 1131 | else: 1132 | features_train = X_imp_train 1133 | features_cal = X_imp_cal 1134 | features_test = X_imp_test 1135 | 1136 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha, 1137 | params_basemodel=params_basemodel) 1138 | 1139 | cal_predictions = predict_basemodel(trained_model, features_cal, target, basemodel, alpha) 1140 | test_predictions = predict_basemodel(trained_model, features_test, target, basemodel, alpha) 1141 | 1142 | if conformalized: 1143 | for protection in protections: 1144 | pipeline = files.get_name_method(method, basemodel, mask, protection) 1145 | if protection == 'No': 1146 | groups_cal = None 1147 | groups_test = None 1148 | elif protection == 'Pattern': 1149 | if mask_original: 1150 | groups_cal = list(map(utils.pattern_to_id, M_original_cal.astype(int))) 1151 | groups_test = list(map(utils.pattern_to_id, M_original_test.astype(int))) 1152 | else: 1153 | groups_cal = list(map(utils.pattern_to_id, M_cal.astype(int))) 1154 | groups_test = list(map(utils.pattern_to_id, M_test.astype(int))) 1155 | elif protection == 'Pattern_Size': 1156 | if mask_original: 1157 | 
groups_cal = list(map(utils.pattern_to_size, M_original_cal.astype(int))) 1158 | groups_test = list(map(utils.pattern_to_size, M_original_test.astype(int))) 1159 | else: 1160 | groups_cal = list(map(utils.pattern_to_size, M_cal.astype(int))) 1161 | groups_test = list(map(utils.pattern_to_size, M_test.astype(int))) 1162 | pred = calibrate_predict_intervals(cal_predictions, Y_cal, test_predictions, 1163 | groups_cal=groups_cal, groups_test=groups_test, target=target, 1164 | basemodel=basemodel, alpha=alpha) 1165 | results = results_methods[pipeline] 1166 | if results_methods[pipeline] == None: 1167 | results = {'Y_inf': np.array(pred['y_inf']), 'Y_sup': np.array(pred['y_sup'])} 1168 | else: 1169 | results['Y_inf'] = np.vstack((results['Y_inf'],np.array(pred['y_inf']))) 1170 | results['Y_sup'] = np.vstack((results['Y_sup'],np.array(pred['y_sup']))) 1171 | results_methods[pipeline] = results 1172 | else: 1173 | interval_predictions = {'y_inf': test_predictions['y_inf'], 1174 | 'y_sup': test_predictions['y_sup']} 1175 | pipeline = files.get_name_method(method, basemodel, mask, protection, conformalized) 1176 | results = results_methods[pipeline] 1177 | if results_methods[pipeline] == None: 1178 | results = {'Y_inf': np.array(interval_predictions['y_inf']), 1179 | 'Y_sup': np.array(interval_predictions['y_sup'])} 1180 | else: 1181 | results['Y_inf'] = np.vstack((results['Y_inf'],np.array(interval_predictions['y_inf']))) 1182 | results['Y_sup'] = np.vstack((results['Y_sup'],np.array(interval_predictions['y_sup']))) 1183 | results_methods[pipeline] = results 1184 | 1185 | return results_methods, name_pipeline 1186 | -------------------------------------------------------------------------------- /results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/results/.DS_Store -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import files 3 | import functools 4 | 5 | def pattern_to_id(m): 6 | return(int(''.join(map(str,m)), 2)) 7 | 8 | def pattern_to_id_float(m): 9 | return(float(int(''.join(map(str,m)), 2))) 10 | 11 | def pattern_to_size(m): 12 | return(int(np.sum(m))) 13 | 14 | def bin_to_vec(bin_pattern, d, var_missing=None): 15 | bin_pattern = bin_pattern[2:] 16 | l = len(bin_pattern) 17 | if var_missing is None: 18 | nb_missing = d 19 | else: 20 | nb_missing = np.sum(var_missing) 21 | if l < nb_missing: 22 | for i in range(nb_missing-l): 23 | bin_pattern = '0'+bin_pattern 24 | vec_bin = [int(x) for x in bin_pattern] 25 | if nb_missing < d: 26 | vec_pattern = [0] * d 27 | vec_pattern = np.array(vec_pattern) 28 | vec_pattern[list(np.where(var_missing == 1)[0])] = vec_bin 29 | vec_pattern = list(vec_pattern) 30 | else: 31 | vec_pattern = vec_bin 32 | return(vec_pattern) 33 | 34 | def create_patterns(d, var_missing): 35 | nb_var_missing = np.sum(var_missing) 36 | if nb_var_missing == d: 37 | keys_patterns = np.arange(0, 2**d-1) 38 | bin_patterns = list(map(bin, keys_patterns)) 39 | vec_patterns = list(map(functools.partial(bin_to_vec, d=d), bin_patterns)) 40 | else: 41 | keys_patterns = np.arange(0, 2**(nb_var_missing)) 42 | bin_patterns = list(map(bin, keys_patterns)) 43 | vec_patterns = list(map(functools.partial(bin_to_vec, d=d, var_missing=var_missing), bin_patterns)) 44 | return(vec_patterns) 
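# --- Illustrative aside (not part of the original utils.py) -----------------
# A quick sanity check of the pattern-encoding helpers above, assuming d = 3
# features: the missingness pattern [1, 0, 1] is read as the binary number
# 101, i.e. the integer 5; its "size" is the number of missing entries; and
# bin_to_vec inverts the binary encoding back to a pattern vector.
#
#   >>> pattern_to_id([1, 0, 1])
#   5
#   >>> pattern_to_size([1, 0, 1])
#   2
#   >>> bin_to_vec(bin(5), d=3)
#   [1, 0, 1]
#
# Note: name_tick() further down uses re.search, but `re` is not imported at
# the top of this file; importing it here (or at the top) avoids a NameError
# when name_tick is called.
import re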
45 | 46 | 47 | def get_data_results(method, train_size, cal_size, params_test, n_rep, imputation, d=3, 48 | params_reg={}, params_noise={}, dataset=None, params_missing={}, 49 | parent_results='results', parent_data='data', extension='pkl'): 50 | 51 | name_dir, name_method = files.get_name_results(method, train_size, cal_size, n_rep, 52 | imputation=imputation, d=d, 53 | params_reg=params_reg, params_noise=params_noise, 54 | dataset=dataset, 55 | params_missing=params_missing) 56 | results = files.load_file(parent_results+'/'+name_dir, name_method, extension) 57 | 58 | name_data = files.get_name_data(train_size, cal_size, params_test, dim=d, 59 | params_reg=params_reg, params_noise=params_noise, 60 | dataset=dataset, 61 | params_missing=params_missing, seed=n_rep) 62 | data = files.load_file(parent_data, name_data, extension) 63 | 64 | return data, results 65 | 66 | def compute_PI_metrics(data, results, mechanism_test): 67 | 68 | contains = (data['Y']['Test'][mechanism_test] <= results[mechanism_test]['Y_sup']) & (data['Y']['Test'][mechanism_test] >= results[mechanism_test]['Y_inf']) 69 | lengths = results[mechanism_test]['Y_sup'] - results[mechanism_test]['Y_inf'] 70 | 71 | return contains, lengths#, 72 | 73 | def compute_metrics_cond(n_rep, data, results, mechanism_test, cond='Pattern', replace_inf=False): 74 | 75 | contains, lengths = compute_PI_metrics(data, results, mechanism_test) 76 | 77 | 78 | if replace_inf: 79 | max_y_train = np.max(data['Y']['Train'], axis=1) 80 | max_y_cal = np.max(data['Y']['Cal'], axis=1) 81 | min_y_train = np.min(data['Y']['Train'], axis=1) 82 | min_y_cal = np.min(data['Y']['Cal'], axis=1) 83 | max_length_traincal = np.maximum(max_y_train, max_y_cal) - np.minimum(min_y_train, min_y_cal) 84 | 85 | M_test = data['M']['Test'][mechanism_test] 86 | 87 | if cond == 'Pattern': 88 | groups = np.apply_along_axis(pattern_to_id_float, 2, M_test.astype(int)) 89 | test_patterns_id = np.unique(groups) 90 | elif cond == 'Pattern_Size': 91 | groups = np.apply_along_axis(pattern_to_size, 2, M_test.astype(int)) 92 | test_patterns_id = np.unique(groups) 93 | 94 | metrics = dict.fromkeys(test_patterns_id) 95 | 96 | for pattern_id in test_patterns_id: 97 | 98 | avg_cov = [] 99 | avg_len = [] 100 | nb_samples = [] 101 | 102 | for k in range(n_rep): 103 | current_lens = lengths[k,groups[k,:] == pattern_id] 104 | 105 | temp_cov = np.nanmean(contains[k,groups[k,:] == pattern_id]) 106 | temp_nb = np.sum(groups[k,:] == pattern_id) 107 | 108 | if replace_inf: 109 | idx_inf = np.where(np.isinf(current_lens)) 110 | if len(idx_inf) > 0: 111 | current_lens[idx_inf] = max_length_traincal[k] 112 | 113 | temp_len = np.nanmean(current_lens) 114 | 115 | avg_cov = np.append(avg_cov, temp_cov) 116 | avg_len = np.append(avg_len, temp_len) 117 | nb_samples = np.append(nb_samples, temp_nb) 118 | 119 | metrics[pattern_id] = {'avg_cov': avg_cov, 'avg_len': avg_len, 'nb_sample': nb_samples} 120 | 121 | return metrics 122 | 123 | def name_tick(name_method): 124 | if name_method[-4:] == 'Mask': 125 | name_tick = '+ mask' 126 | else: 127 | name_tick = re.search(r"[a-zA-Z]*", name_method).group() 128 | if name_tick != 'MICE': 129 | name_tick = name_tick.capitalize() 130 | return name_tick --------------------------------------------------------------------------------
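Editor's note: the conformalization carried out in prediction.py (CQR conformity scores on the calibration set, the finite-sample corrected quantile of quantile_corrected, and the interval construction of calibrate_predict_intervals) reduces, in the ungrouped case with target='Quantiles', to the standard split-CQR recipe. Below is a minimal self-contained sketch of that recipe on synthetic data; it uses scikit-learn's QuantileRegressor as a stand-in base model, and all variable names are illustrative rather than taken from the repository.

import numpy as np
from sklearn.linear_model import QuantileRegressor

rng = np.random.default_rng(0)
alpha = 0.1

# Synthetic data: Y = X beta + Gaussian noise, split into train / calibration / test.
n, d = 600, 3
X = rng.normal(size=(n, d))
Y = X @ np.ones(d) + rng.normal(size=n)
X_train, Y_train = X[:300], Y[:300]
X_cal, Y_cal = X[300:500], Y[300:500]
X_test = X[500:]

# Base quantile models fitted on the proper training set (levels alpha/2 and 1 - alpha/2).
q_low = QuantileRegressor(quantile=alpha / 2, alpha=0, solver='highs').fit(X_train, Y_train)
q_high = QuantileRegressor(quantile=1 - alpha / 2, alpha=0, solver='highs').fit(X_train, Y_train)

# CQR conformity scores on the calibration set: signed distance of Y to the predicted band.
scores = np.maximum(q_low.predict(X_cal) - Y_cal, Y_cal - q_high.predict(X_cal))

# Finite-sample corrected empirical quantile, as in quantile_corrected().
n_cal = len(scores)
level = (1 - alpha) * (1 + 1 / n_cal)
correction = np.inf if level > 1 else np.quantile(scores, level)

# Conformalized intervals on the test set, as in calibrate_predict_intervals()
# with groups_cal=None and target='Quantiles'.
y_inf = q_low.predict(X_test) - correction
y_sup = q_high.predict(X_test) + correction
print('average interval length:', np.mean(y_sup - y_inf))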