├── .DS_Store
├── CP_NA_Semi-synthetic.ipynb
├── CP_NA_Semi-synthetic_Plots.ipynb
├── CP_NA_Synthetic.ipynb
├── CP_NA_Synthetic_Plots.ipynb
├── LICENSE
├── README.md
├── data
│   ├── .DS_Store
│   └── cqr_datasets
│       ├── .DS_Store
│       ├── .ipynb_checkpoints
│       │   └── Untitled-checkpoint.ipynb
│       ├── CASP.csv
│       ├── Concrete_Data.csv
│       ├── README.md
│       ├── STAR.csv
│       ├── Untitled.ipynb
│       ├── bike_train.csv
│       ├── communities.data
│       ├── communities_attributes.csv
│       ├── facebook
│       │   ├── Features_Variant_1.csv
│       │   ├── Features_Variant_2.csv
│       │   └── README.md
│       ├── meps_19_reg.csv
│       ├── meps_20_reg.csv
│       └── meps_21_reg.csv
├── datasets.py
├── files.py
├── generation.py
├── imputation.py
├── plots
│   └── .DS_Store
├── prediction.py
├── results
│   └── .DS_Store
└── utils.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/.DS_Store
--------------------------------------------------------------------------------
/CP_NA_Semi-synthetic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import imputation as imp\n",
10 | "import generation as gen\n",
11 | "import prediction\n",
12 | "import utils\n",
13 | "import files\n",
14 | "import os\n",
15 | "import numpy as np\n",
16 | "import pandas as pd\n",
17 | "from tqdm.autonotebook import tqdm\n",
18 | "import datasets"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "datasets_name = ['meps_19', 'bio', 'concrete', 'bike']"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "datasets_sizes = {'meps_19': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
37 | " 'meps_20': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
38 | " 'meps_21': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
39 | " 'bio': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
40 | " 'concrete': {'train': 630, 'cal': 200, 'test_pattern': 100},\n",
41 | " 'bike': {'train': 1000, 'cal': 500, 'test_pattern': 100}}"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "n_rep = 100\n",
51 | "alpha = 0.1"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "prob_missing = 0.2"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "imputation = 'iterative_ridge'\n",
70 | "\n",
71 | "methods = ['QR', 'QR_TrainCal', 'CQR', 'CQR_MDA']\n",
72 | "basemodels = ['NNet']\n",
73 | "masks = ['Yes']\n",
74 | "protections = ['No']\n",
75 | "exacts = [False, True]\n",
76 | "\n",
77 | "cores = 1\n",
78 | "\n",
79 | "params_basemodel = {'cores':cores}"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "dataset_base_path = \"./data/cqr_datasets/\""
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "for dataset_name in tqdm(datasets_name):\n",
98 | " \n",
99 | " df, target, var_missing = datasets.GetDataset(dataset_name, dataset_base_path)\n",
100 | " \n",
101 | " params_missing = {}\n",
102 | " params_missing['var_missing'] = var_missing\n",
103 | " params_missing['prob_missing'] = prob_missing\n",
104 | " \n",
105 | " d = df.shape[1]-1\n",
106 | " \n",
107 | " if dataset_name == 'concrete':\n",
108 | " nb_sample_pattern = datasets_sizes[dataset_name]['test_pattern']\n",
109 | " params_test = {'iid':{'test_size': 200}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n",
110 | " else:\n",
111 | " nb_sample_pattern = datasets_sizes[dataset_name]['test_pattern']\n",
112 | " params_test = {'iid':{'test_size': 2000}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n",
113 | " params_test = gen.process_test(params_test, d=d, params_missing=params_missing)\n",
114 | " \n",
115 | " max_test_size = np.max(params_test['test_size'])\n",
116 | " \n",
117 | " train_size = datasets_sizes[dataset_name]['train']\n",
118 | " cal_size = datasets_sizes[dataset_name]['cal']\n",
119 | "\n",
120 | " name = files.get_name_data(train_size, cal_size, params_test, \n",
121 | " dataset=dataset_name, params_missing=params_missing, seed=n_rep)\n",
122 | " \n",
123 | " if os.path.isfile('data/'+name+'.xz'):\n",
124 | " print('data found')\n",
125 | " data = files.load_file('data', name, 'xz')\n",
126 | " else:\n",
127 | " print('data not found')\n",
128 | " X, X_missing, M, Y = gen.generate_multiple_real_data_MCAR(df, target, train_size=train_size, \n",
129 | " cal_size=cal_size, params_test=params_test,\n",
130 | " params_missing=params_missing, seed_max=n_rep)\n",
131 | " data = {'X': X, 'X_missing': X_missing, 'M': M,'Y': Y}\n",
132 | " files.write_file('data', name, 'xz', data)\n",
133 | " \n",
134 | " name_imputed = files.get_name_data_imputed(train_size, cal_size, params_test, imputation=imputation,\n",
135 | " dataset=dataset_name, params_missing=params_missing, seed=n_rep)\n",
136 | "\n",
137 | " if os.path.isfile('data/'+name_imputed+'.pkl'):\n",
138 | " print('imputation found')\n",
139 | " X_imp = files.load_file('data', name_imputed, 'pkl')\n",
140 | " else:\n",
141 | " print('imputation not found')\n",
142 | " if imputation == 'complete':\n",
143 | " X_imp = data['X']\n",
144 | " else:\n",
145 | " X_imp = imp.impute(data, imputation)\n",
146 | " files.write_file('data', name_imputed, 'pkl', X_imp)\n",
147 | " data_imputed = {'X': data['X'], 'X_missing': data['X_missing'], 'X_imp': X_imp, 'M': data['M'],'Y': data['Y']}\n",
148 | "\n",
149 | " \n",
150 | " results, methods_ran = prediction.run_experiments(data_imputed, alpha=alpha, methods=methods,\n",
151 | " basemodels=basemodels, params_basemodel=params_basemodel,\n",
152 | " masks=masks, protections=protections, \n",
153 | " exacts=exacts, imputation=imputation)\n",
154 | "\n",
155 | " for method in methods_ran:\n",
156 | " name_dir, name_method = files.get_name_results(method, train_size, cal_size, n_rep, \n",
157 | " dataset=dataset_name, imputation=imputation,\n",
158 | " params_missing=params_missing)\n",
159 | " \n",
160 | " results_method = results[method]\n",
161 | " files.write_file('results/'+name_dir, name_method, 'xz', results_method)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": []
170 | }
171 | ],
172 | "metadata": {
173 | "kernelspec": {
174 | "display_name": "Python 3",
175 | "language": "python",
176 | "name": "python3"
177 | },
178 | "language_info": {
179 | "codemirror_mode": {
180 | "name": "ipython",
181 | "version": 3
182 | },
183 | "file_extension": ".py",
184 | "mimetype": "text/x-python",
185 | "name": "python",
186 | "nbconvert_exporter": "python",
187 | "pygments_lexer": "ipython3",
188 | "version": "3.8.5"
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 4
193 | }
194 |
--------------------------------------------------------------------------------
/CP_NA_Semi-synthetic_Plots.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import generation as gen\n",
10 | "import utils\n",
11 | "import files\n",
12 | "import os\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "from tqdm.autonotebook import tqdm\n",
16 | "import datasets\n",
17 | "import copy"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import matplotlib.pyplot as plt\n",
27 | "import seaborn as sns\n",
28 | "import matplotlib as mpl\n",
29 | "from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset\n",
30 | "from matplotlib.backends.backend_pgf import FigureCanvasPgf\n",
31 | "mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)\n",
32 | "import matplotlib.lines as mlines"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "size=19\n",
42 | "mpl.rcParams.update({\n",
43 | " \"pgf.texsystem\": \"pdflatex\",\n",
44 | " 'font.family': 'serif',\n",
45 | " 'font.serif': 'Times',\n",
46 | " 'text.usetex': True,\n",
47 | " 'pgf.rcfonts': False,\n",
48 | " 'font.size': size,\n",
49 | " 'axes.labelsize':size,\n",
50 | " 'axes.titlesize':size,\n",
51 | " 'figure.titlesize':size,\n",
52 | " 'xtick.labelsize':size,\n",
53 | " 'ytick.labelsize':size,\n",
54 | " 'legend.fontsize':size,\n",
55 | "})"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "datasets_names = ['meps_19', 'bio', 'concrete', 'bike']\n",
65 | "dataset_base_path = \"./data/cqr_datasets/\""
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "datasets_sizes_normal = {'meps_19': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
75 | " 'meps_20': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
76 | " 'meps_21': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
77 | " 'bio': {'train': 1000, 'cal': 500, 'test_pattern': 100},\n",
78 | " 'concrete': {'train': 630, 'cal': 200, 'test_pattern': 100},\n",
79 | " 'bike': {'train': 1000, 'cal': 500, 'test_pattern': 100}}\n",
80 | "\n",
81 | "datasets_sizes_small = {'meps_19': {'train': 500, 'cal': 250, 'test_pattern': 100},\n",
82 | " 'meps_20': {'train': 500, 'cal': 250, 'test_pattern': 100},\n",
83 | " 'meps_21': {'train': 500, 'cal': 250, 'test_pattern': 100},\n",
84 | " 'bio': {'train': 500, 'cal': 250, 'test_pattern': 100},\n",
85 | " 'concrete': {'train': 330, 'cal': 100, 'test_pattern': 100},\n",
86 | " 'bike': {'train': 500, 'cal': 250, 'test_pattern': 100}}"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "n_rep = 100\n",
96 | "alpha = 0.1"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "prob_missing = 0.2"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "imputation = 'iterative_ridge'\n",
115 | "mask = 'Yes'\n",
116 | "protection = 'No'"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "methods = ['CQR', 'CQR_MDA']#QR_TrainCal\n",
126 | "basemodel = 'NNet'\n",
127 | "exacts = [False, True]"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "name_pipelines = []\n",
137 | "for method in methods: \n",
138 | " if method == 'CQR_MDA':\n",
139 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=True)\n",
140 | " if not name_temp in name_pipelines:\n",
141 | " name_pipelines.append(name_temp)\n",
142 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=False)\n",
143 | " if not name_temp in name_pipelines:\n",
144 | " name_pipelines.append(name_temp)\n",
145 | " \n",
146 | "current_pipeline = method+'_'+basemodel"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "if mask == 'Yes':\n",
156 | " dict_methods = {'QR_TrainCal_NNet_Mask': 'QR',\n",
157 | " 'CQR_NNet_Mask': 'CQR', \n",
158 | " 'CQR_MDA_Exact_NNet_Mask': 'CQR-MDA-Exact',\n",
159 | " 'CQR_MDA_Nested_NNet_Mask': 'CQR-MDA-Nested'}\n",
160 | "else:\n",
161 | " dict_methods = {'QR_TrainCal_NNet': 'QR',\n",
162 | " 'CQR_NNet': 'CQR', \n",
163 | " 'CQR_MDA_Exact_NNet': 'CQR-MDA-Exact',\n",
164 | " 'CQR_MDA_Nested_NNet': 'CQR-MDA-Nested'}"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "#sizes = ['small', 'normal']\n",
174 | "sizes = ['normal']\n",
175 | " \n",
176 | "dict_cov = dict.fromkeys(datasets_names)\n",
177 | "dict_len = dict.fromkeys(datasets_names)\n",
178 | "dict_cov_patterns = dict.fromkeys(datasets_names)\n",
179 | "dict_len_patterns = dict.fromkeys(datasets_names)\n",
180 | "dict_cov_worst = dict.fromkeys(datasets_names)\n",
181 | "dict_len_worst = dict.fromkeys(datasets_names)\n",
182 | "dict_cov_best = dict.fromkeys(datasets_names)\n",
183 | "dict_len_best = dict.fromkeys(datasets_names)\n",
184 | "\n",
185 | "for dataset_name in datasets_names:\n",
186 | "\n",
187 | " dict_cov[dataset_name] = dict.fromkeys(sizes)\n",
188 | " dict_len[dataset_name] = dict.fromkeys(sizes)\n",
189 | " dict_cov_patterns[dataset_name] = dict.fromkeys(sizes)\n",
190 | " dict_len_patterns[dataset_name] = dict.fromkeys(sizes)\n",
191 | " dict_cov_worst[dataset_name] = dict.fromkeys(sizes)\n",
192 | " dict_len_worst[dataset_name] = dict.fromkeys(sizes)\n",
193 | " dict_cov_best[dataset_name] = dict.fromkeys(sizes)\n",
194 | " dict_len_best[dataset_name] = dict.fromkeys(sizes)\n",
195 | "\n",
196 | " for size in sizes:\n",
197 | "\n",
198 | " dict_cov[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
199 | " dict_len[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
200 | " dict_cov_patterns[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
201 | " dict_len_patterns[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
202 | " dict_cov_worst[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
203 | " dict_len_worst[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
204 | " dict_cov_best[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
205 | " dict_len_best[dataset_name][size] = dict.fromkeys(name_pipelines)\n",
206 | "\n",
207 | " for pipeline in name_pipelines:\n",
208 | " dict_cov_patterns[dataset_name][size][pipeline] = {}\n",
209 | " dict_len_patterns[dataset_name][size][pipeline] = {}"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "for dataset_name in datasets_names:\n",
219 | " \n",
220 | " print(dataset_name)\n",
221 | " \n",
222 | " df, target, var_missing = datasets.GetDataset(dataset_name, dataset_base_path)\n",
223 | "\n",
224 | " d = df.shape[1]-1\n",
225 | "\n",
226 | " params_missing = {}\n",
227 | " params_missing['var_missing'] = var_missing\n",
228 | " params_missing['prob_missing'] = prob_missing\n",
229 | "\n",
230 | " for size in sizes:\n",
231 | " \n",
232 | " if size == 'normal':\n",
233 | " train_size = datasets_sizes_normal[dataset_name]['train']\n",
234 | " cal_size = datasets_sizes_normal[dataset_name]['cal']\n",
235 | " \n",
236 | " if dataset_name == 'concrete':\n",
237 | " nb_sample_pattern = datasets_sizes_normal[dataset_name]['test_pattern']\n",
238 | " params_test = {'iid':{'test_size': 200}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n",
239 | " else:\n",
240 | " nb_sample_pattern = datasets_sizes_normal[dataset_name]['test_pattern']\n",
241 | " params_test = {'iid':{'test_size': 2000}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n",
242 | " params_test = gen.process_test(params_test, d=d, params_missing=params_missing)\n",
243 | "\n",
244 | " max_test_size = np.max(params_test['test_size'])\n",
245 | " \n",
246 | " elif size == 'small':\n",
247 | " train_size = datasets_sizes_small[dataset_name]['train']\n",
248 | " cal_size = datasets_sizes_small[dataset_name]['cal']\n",
249 | " \n",
250 | " if dataset_name == 'concrete':\n",
251 | " nb_sample_pattern = datasets_sizes_small[dataset_name]['test_pattern']\n",
252 | " params_test = {'iid':{'test_size': 200}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n",
253 | " else:\n",
254 | " nb_sample_pattern = datasets_sizes_small[dataset_name]['test_pattern']\n",
255 | " params_test = {'iid':{'test_size': 2000}, 'fixed_nb_sample_pattern':{'nb_sample_pattern': nb_sample_pattern}}\n",
256 | " params_test = gen.process_test(params_test, d=d, params_missing=params_missing)\n",
257 | "\n",
258 | " max_test_size = np.max(params_test['test_size'])\n",
259 | "\n",
260 | " name_method = []\n",
261 | "\n",
262 | " for pipeline in name_pipelines:\n",
263 | "\n",
264 | " name_method = np.append(name_method, '_'.join([imputation, pipeline]))\n",
265 | "\n",
266 | " key = -1\n",
267 | "\n",
268 | " data, results = utils.get_data_results(pipeline, train_size, cal_size, params_test, n_rep, imputation=imputation,\n",
269 | " dataset = dataset_name,\n",
270 | " params_missing=params_missing,\n",
271 | " parent_results='results', parent_data='data', extension='xz')\n",
272 | "\n",
273 | " contains, lengths = utils.compute_PI_metrics(data, results, 'iid')\n",
274 | "\n",
275 | " metrics = utils.compute_metrics_cond(n_rep, data, results, 'fixed_nb_sample_pattern', cond='Pattern')\n",
276 | "\n",
277 | " dict_cov[dataset_name][size][pipeline] = np.mean(contains, axis=1)\n",
278 | " dict_len[dataset_name][size][pipeline] = np.mean(lengths, axis=1)\n",
279 | "\n",
280 | " for key_pattern in list(metrics.keys()):\n",
281 | "\n",
282 | " dict_cov_patterns[dataset_name][size][pipeline][key_pattern] = metrics[key_pattern]['avg_cov']\n",
283 | " dict_len_patterns[dataset_name][size][pipeline][key_pattern] = metrics[key_pattern]['avg_len']"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "for dataset_name in datasets_names:\n",
293 | " \n",
294 | " for size in sizes:\n",
295 | " \n",
296 | " for pipeline in name_pipelines:\n",
297 | "\n",
298 | " avg_cov = dict.fromkeys(dict_cov_patterns[dataset_name][size][pipeline].keys())\n",
299 | "\n",
300 | " for key in list(avg_cov.keys()):\n",
301 | " avg_cov[key] = np.mean(dict_cov_patterns[dataset_name][size][pipeline][key])\n",
302 | "\n",
303 | " worst_index = np.argmin(list(avg_cov.values()))\n",
304 | " worst_key = list(avg_cov.keys())[worst_index]\n",
305 | " dict_cov_worst[dataset_name][size][pipeline] = dict_cov_patterns[dataset_name][size][pipeline][worst_key]\n",
306 | " dict_len_worst[dataset_name][size][pipeline] = dict_len_patterns[dataset_name][size][pipeline][worst_key]\n",
307 | "\n",
308 | " best_index = np.argmax(list(avg_cov.values()))\n",
309 | " best_key = list(avg_cov.keys())[best_index]\n",
310 | " dict_cov_best[dataset_name][size][pipeline] = dict_cov_patterns[dataset_name][size][pipeline][best_key]\n",
311 | " dict_len_best[dataset_name][size][pipeline] = dict_len_patterns[dataset_name][size][pipeline][best_key] "
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "colors_blindness = sns.color_palette(\"colorblind\")\n",
321 | "colors_blindness"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15,4.3))\n",
331 | "axes = {'meps_19': ax1, 'bio': ax2, 'concrete': ax3, 'bike': ax4}\n",
332 | "\n",
333 | "dict_markers = {'iid': 'd',\n",
334 | " 'worst': \"v\", \n",
335 | " 'best': \"^\"}\n",
336 | "\n",
337 | "dict_datasets = {'meps_19': r'\texttt{meps\_19} ($d=139$, $l=5$)','meps_20': r'\texttt{meps} ($d=139$, $l=5$)',\n",
338 | " 'meps_21': r'\\texttt{meps} ($d=139$, $l=5$)','bio': r'\\texttt{bio} ($d=9$, $l=9$)',\n",
339 | " 'concrete': r'\\texttt{concrete} ($d=8$, $l=8$)','bike': r'\\texttt{bike} ($d=18$, $l=4$)'}\n",
340 | "alphas_meps = {'meps_19': 1, 'meps_20':0.7, 'meps_21': 0.4}\n",
341 | "dict_colors = {'QR_TrainCal_NNet_Mask': colors_blindness[2],\n",
342 | " 'CQR_NNet_Mask': colors_blindness[1], \n",
343 | " 'CQR_MDA_Exact_NNet_Mask': colors_blindness[4],\n",
344 | " 'CQR_MDA_Nested_NNet_Mask': colors_blindness[9]}\n",
345 | "\n",
346 | "dict_coverages = {'iid': 'Marginal', 'worst': 'Lowest', 'best': 'Highest'}\n",
347 | "\n",
348 | "marker_size = 60\n",
349 | "\n",
350 | "small = False\n",
351 | "medium = True\n",
352 | "\n",
353 | "name_pipelines_to_plot = name_pipelines\n",
354 | " \n",
355 | "for dataset_name in datasets_names:\n",
356 | " \n",
357 | " ax = axes[dataset_name]\n",
358 | " ax.set_title(dict_datasets[dataset_name])\n",
359 | "\n",
360 | " alpha_data = 1\n",
361 | "\n",
362 | " for pipeline in name_pipelines_to_plot:\n",
363 | "\n",
364 | " if medium:\n",
365 | " ax.scatter(np.mean(dict_cov[dataset_name]['normal'][pipeline]),\n",
366 | " np.mean(dict_len[dataset_name]['normal'][pipeline]), \n",
367 | " marker=dict_markers['iid'], color=dict_colors[pipeline],s=marker_size,alpha=alpha_data)\n",
368 | " ax.errorbar(np.mean(dict_cov[dataset_name]['normal'][pipeline]), \n",
369 | " np.mean(dict_len[dataset_name]['normal'][pipeline]),\n",
370 | " xerr=np.std(dict_cov[dataset_name]['normal'][pipeline])/np.sqrt(n_rep),\n",
371 | " yerr=np.std(dict_len[dataset_name]['normal'][pipeline])/np.sqrt(n_rep), \n",
372 | " color=dict_colors[pipeline], alpha=0.3)\n",
373 | " ax.scatter(np.mean(dict_cov_worst[dataset_name]['normal'][pipeline]),\n",
374 | " np.mean(dict_len_worst[dataset_name]['normal'][pipeline]), \n",
375 | " marker=dict_markers['worst'], color=dict_colors[pipeline],s=marker_size,alpha=alpha_data)\n",
376 | " ax.errorbar(np.mean(dict_cov_worst[dataset_name]['normal'][pipeline]), \n",
377 | " np.mean(dict_len_worst[dataset_name]['normal'][pipeline]),\n",
378 | " xerr=np.std(dict_cov_worst[dataset_name]['normal'][pipeline])/np.sqrt(n_rep),\n",
379 | " yerr=np.std(dict_len_worst[dataset_name]['normal'][pipeline])/np.sqrt(n_rep), \n",
380 | " color=dict_colors[pipeline], alpha=0.3)\n",
381 | " ax.scatter(np.mean(dict_cov_best[dataset_name]['normal'][pipeline]),\n",
382 | " np.mean(dict_len_best[dataset_name]['normal'][pipeline]), \n",
383 | " marker=dict_markers['best'], color=dict_colors[pipeline],s=marker_size,alpha=alpha_data)\n",
384 | " ax.errorbar(np.mean(dict_cov_best[dataset_name]['normal'][pipeline]), \n",
385 | " np.mean(dict_len_best[dataset_name]['normal'][pipeline]),\n",
386 | " xerr=np.std(dict_cov_best[dataset_name]['normal'][pipeline])/np.sqrt(n_rep),\n",
387 | " yerr=np.std(dict_len_best[dataset_name]['normal'][pipeline])/np.sqrt(n_rep), \n",
388 | " color=dict_colors[pipeline], alpha=0.3)\n",
389 | "\n",
390 | "\n",
391 | " if small:\n",
392 | " ax.scatter(np.mean(dict_cov[dataset_name]['small'][pipeline]),\n",
393 | " np.mean(dict_len[dataset_name]['small'][pipeline]), \n",
394 | " marker=dict_markers['iid'], color=dict_colors[pipeline],s=marker_size, facecolors='none',alpha=alpha_data)\n",
395 | " ax.errorbar(np.mean(dict_cov[dataset_name]['small'][pipeline]), \n",
396 | " np.mean(dict_len[dataset_name]['small'][pipeline]),\n",
397 | " xerr=np.std(dict_cov[dataset_name]['small'][pipeline])/np.sqrt(n_rep),\n",
398 | " yerr=np.std(dict_len[dataset_name]['small'][pipeline])/np.sqrt(n_rep), \n",
399 | " color=dict_colors[pipeline], alpha=0.3)\n",
400 | " ax.scatter(np.mean(dict_cov_worst[dataset_name]['small'][pipeline]),\n",
401 | " np.mean(dict_len_worst[dataset_name]['small'][pipeline]), \n",
402 | " marker=dict_markers['worst'], color=dict_colors[pipeline],s=marker_size, facecolors='none',alpha=alpha_data)\n",
403 | " ax.errorbar(np.mean(dict_cov_worst[dataset_name]['small'][pipeline]), \n",
404 | " np.mean(dict_len_worst[dataset_name]['small'][pipeline]),\n",
405 | " xerr=np.std(dict_cov_worst[dataset_name]['small'][pipeline])/np.sqrt(n_rep),\n",
406 | " yerr=np.std(dict_len_worst[dataset_name]['small'][pipeline])/np.sqrt(n_rep), \n",
407 | " color=dict_colors[pipeline], alpha=0.3)\n",
408 | " ax.scatter(np.mean(dict_cov_best[dataset_name]['small'][pipeline]),\n",
409 | " np.mean(dict_len_best[dataset_name]['small'][pipeline]), \n",
410 | " marker=dict_markers['best'], color=dict_colors[pipeline],s=marker_size, facecolors='none',alpha=alpha_data)\n",
411 | " ax.errorbar(np.mean(dict_cov_best[dataset_name]['small'][pipeline]), \n",
412 | " np.mean(dict_len_best[dataset_name]['small'][pipeline]),\n",
413 | " xerr=np.std(dict_cov_best[dataset_name]['small'][pipeline])/np.sqrt(n_rep),\n",
414 | " yerr=np.std(dict_len_best[dataset_name]['small'][pipeline])/np.sqrt(n_rep), \n",
415 | " color=dict_colors[pipeline], alpha=0.3)\n",
416 | "\n",
417 | "\n",
418 | "for ax in [ax1,ax2,ax3,ax4]:\n",
419 | " ax.axvline(x=1-alpha, color='black', ls=':')\n",
420 | " ax.set_xlabel(\"Average coverage\")\n",
421 | "\n",
422 | " \n",
423 | "ax1.set_ylabel(\"Average length\")\n",
424 | "\n",
425 | "\n",
426 | "# Methods legend\n",
427 | "\n",
428 | "handles = []\n",
429 | "names = list( map(dict_methods.get, name_pipelines_to_plot) )\n",
430 | "for idc,color in enumerate(list( map(dict_colors.get, name_pipelines_to_plot) )):\n",
431 | " handles.append(mlines.Line2D([], [], color=color, marker='o', linestyle='None', markersize=8))\n",
432 | "\n",
433 | "if mask == 'Yes':\n",
434 | " fig.legend(handles, names, ncol=4, bbox_to_anchor=(0.63,0.13),handletextpad=0.1, \n",
435 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n",
436 | "else:\n",
437 | " fig.legend(handles, names, ncol=4, bbox_to_anchor=(0.63,0.1),handletextpad=0.1, \n",
438 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n",
439 | " \n",
440 | "# Coverage legend\n",
441 | "\n",
442 | "handles = []\n",
443 | "labels = []\n",
444 | "for cov in list(dict_coverages.keys()):\n",
445 | " handles.append(mlines.Line2D([], [], color='black', marker=dict_markers[cov], linestyle='None', markersize=8))\n",
446 | " labels.append(dict_coverages[cov])\n",
447 | "\n",
448 | "if mask == 'Yes':\n",
449 | " fig.legend(handles, labels, bbox_to_anchor=(0.95, 0.13),ncol=3,handletextpad=0.1, \n",
450 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n",
451 | "else:\n",
452 | " fig.legend(handles, labels, bbox_to_anchor=(0.95, 0.1),ncol=3,handletextpad=0.1, \n",
453 | " labelspacing=0.2, borderpad=0.3, handlelength=1.2, borderaxespad=1.1)\n",
454 | "\n",
455 | "\n",
456 | "fig.tight_layout()\n",
457 | "\n",
458 | "name_plot = 'plots/semi_synth'\n",
459 | "\n",
460 | "if mask == 'Yes':\n",
461 | " plt.savefig(name_plot+'.pdf',bbox_inches='tight', dpi=300)\n",
462 | "else:\n",
463 | " plt.savefig(name_plot+'_nomask.pdf',bbox_inches='tight', dpi=300)\n",
464 | "\n",
465 | "plt.show()"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": null,
471 | "metadata": {},
472 | "outputs": [],
473 | "source": []
474 | }
475 | ],
476 | "metadata": {
477 | "kernelspec": {
478 | "display_name": "Python 3",
479 | "language": "python",
480 | "name": "python3"
481 | },
482 | "language_info": {
483 | "codemirror_mode": {
484 | "name": "ipython",
485 | "version": 3
486 | },
487 | "file_extension": ".py",
488 | "mimetype": "text/x-python",
489 | "name": "python",
490 | "nbconvert_exporter": "python",
491 | "pygments_lexer": "ipython3",
492 | "version": "3.8.5"
493 | }
494 | },
495 | "nbformat": 4,
496 | "nbformat_minor": 4
497 | }
498 |
--------------------------------------------------------------------------------
/CP_NA_Synthetic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import generation as gen\n",
10 | "import imputation as imp\n",
11 | "import prediction\n",
12 | "import utils\n",
13 | "import files\n",
14 | "import os\n",
15 | "import numpy as np\n",
16 | "from tqdm.autonotebook import tqdm"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "alpha = 0.1\n",
26 | "d = 10\n",
27 | "phi = 0.8\n",
28 | "regression = 'Linear'\n",
29 | "n_rep = 100\n",
30 | "beta = np.array([1, 2, -1, 3, -0.5, -1, 0.3, 1.7, 0.4, -0.3])\n",
31 | "\n",
32 | "train_size = 500\n",
33 | "cal_size = 250\n",
34 | "params_test = {'iid':{'test_size': 2000}, \n",
35 | " 'fixed_nb_sample_pattern':{'nb_sample_pattern': 100}, \n",
36 | " 'fixed_nb_sample_pattern_size':{'nb_sample_pattern': 100}}\n",
37 | "params_test = gen.process_test(params_test, d=d)\n",
38 | "\n",
39 | "params_reg = {'regression':regression, 'beta': beta, 'phi': phi}\n",
40 | "\n",
41 | "params_noise = {'noise':'Gaussian'}\n",
42 | "\n",
43 | "prob_missing = 0.2\n",
44 | "var_missing = np.full(d, 1)\n",
45 | "params_missing = {'prob_missing':prob_missing, 'var_missing':var_missing, 'mechanism': 'MCAR'}\n",
46 | "\n",
47 | "imputations = np.array(['iterative_ridge'])\n",
48 | "\n",
49 | "methods = ['QR', 'QR_TrainCal', 'CQR', 'CQR_MDA']\n",
50 | "basemodels = ['NNet']\n",
51 | "masks = ['Yes']\n",
52 | "protections = ['No']#, 'Pattern', 'Pattern_size']\n",
53 | "exacts = [False, True]\n",
54 | "\n",
55 | "cores = 1\n",
56 | "\n",
57 | "params_basemodel = {'cores':cores}"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "name = files.get_name_data(train_size, cal_size, params_test, dim=d,\n",
67 | " params_reg=params_reg, params_noise=params_noise,\n",
68 | " params_missing=params_missing, seed=n_rep)\n",
69 | "\n",
70 | "if os.path.isfile('data/'+name+'.xz'):\n",
71 | " print('data found')\n",
72 | " data = files.load_file('data', name, 'xz')\n",
73 | "else:\n",
74 | " print('data not found')\n",
75 | " X, X_missing, M, Y, params_missing = gen.generate_multiple_data(train_size, cal_size, params_test, n_rep=n_rep, dim=d, \n",
76 | " params_reg=params_reg, params_noise=params_noise,\n",
77 | " params_missing=params_missing)\n",
78 | " data = {'X': X, 'X_missing': X_missing, 'M': M,'Y': Y}\n",
79 | " files.write_file('data', name, 'xz', data)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "for imputation in tqdm(imputations):\n",
89 | "\n",
90 | " name_imputed = files.get_name_data_imputed(train_size, cal_size, params_test, imputation,\n",
91 | " dim=d, \n",
92 | " params_reg=params_reg, params_noise=params_noise,\n",
93 | " params_missing=params_missing, seed=n_rep)\n",
94 | "\n",
95 | " if os.path.isfile('data/'+name_imputed+'.xz'):\n",
96 | " print('imputation found')\n",
97 | " X_imp = files.load_file('data', name_imputed, 'xz')\n",
98 | " else:\n",
99 | " print('imputation not found')\n",
100 | " if imputation == 'complete':\n",
101 | " X_imp = data['X']\n",
102 | " else:\n",
103 | " X_imp = imp.impute(data, imputation)\n",
104 | " files.write_file('data', name_imputed, 'xz', X_imp)\n",
105 | " data_imputed = {'X': data['X'], 'X_missing': data['X_missing'], 'X_imp': X_imp, 'M': data['M'],'Y': data['Y']}\n",
106 | "\n",
107 | "\n",
108 | "\n",
109 | " results, methods_ran = prediction.run_experiments(data_imputed, alpha=alpha, methods=methods,\n",
110 | " basemodels=basemodels, params_basemodel=params_basemodel,\n",
111 | " masks=masks, protections=protections, \n",
112 | " exacts=exacts, imputation=imputation,\n",
113 | " params_reg=params_reg)#, params_noise=params_noise)\n",
114 | "\n",
115 | " for method in methods_ran:\n",
116 | " name_dir, name_method = files.get_name_results(method, train_size, cal_size, n_rep, d=d, imputation=imputation,\n",
117 | " params_reg=params_reg, params_noise=params_noise, params_missing=params_missing)\n",
118 | " results_method = results[method]\n",
119 | " files.write_file('results/'+name_dir, name_method, 'xz', results_method)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": []
128 | }
129 | ],
130 | "metadata": {
131 | "kernelspec": {
132 | "display_name": "Python 3",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.8.5"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 4
151 | }
152 |
--------------------------------------------------------------------------------
/CP_NA_Synthetic_Plots.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import generation as gen\n",
10 | "import prediction as prediction\n",
11 | "import utils\n",
12 | "import files\n",
13 | "import os\n",
14 | "import numpy as np\n",
15 | "from tqdm.autonotebook import tqdm"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import matplotlib as mpl\n",
25 | "from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset\n",
26 | "from matplotlib.backends.backend_pgf import FigureCanvasPgf\n",
27 | "mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)\n",
28 | "import matplotlib.pyplot as plt\n",
29 | "import matplotlib.lines as mlines\n",
30 | "import pandas as pd\n",
31 | "from matplotlib import cm"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "size=19\n",
41 | "mpl.rcParams.update({\n",
42 | " \"pgf.texsystem\": \"pdflatex\",\n",
43 | " 'font.family': 'serif',\n",
44 | " 'font.serif': 'Times',\n",
45 | " 'text.usetex': True,\n",
46 | " 'pgf.rcfonts': False,\n",
47 | " 'font.size': size,\n",
48 | " 'axes.labelsize':size,\n",
49 | " 'axes.titlesize':size,\n",
50 | " 'figure.titlesize':size,\n",
51 | " 'xtick.labelsize':size,\n",
52 | " 'ytick.labelsize':size,\n",
53 | " 'legend.fontsize':size,\n",
54 | "})"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "alpha = 0.1\n",
64 | "d = 10\n",
65 | "phi = 0.8\n",
66 | "regression = 'Linear'\n",
67 | "n_rep = 100\n",
68 | "beta = np.array([1, 2, -1, 3, -0.5, -1, 0.3, 1.7, 0.4, -0.3])\n",
69 | "\n",
70 | "train_size = 500\n",
71 | "cal_size = 250\n",
72 | "params_test = {'iid':{'test_size': 2000}, \n",
73 | " 'fixed_nb_sample_pattern':{'nb_sample_pattern': 100}, \n",
74 | " 'fixed_nb_sample_pattern_size':{'nb_sample_pattern': 100}}\n",
75 | "params_test = gen.process_test(params_test, d=d)\n",
76 | "\n",
77 | "params_reg = {'regression':regression, 'beta': beta, 'phi': phi}\n",
78 | "\n",
79 | "params_noise = {'noise':'Gaussian'}\n",
80 | "\n",
81 | "prob_missing = 0.2\n",
82 | "var_missing = np.full(d, 1)\n",
83 | "params_missing = {'prob_missing':prob_missing, 'var_missing':var_missing, 'mechanism': 'MCAR'}"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "methods = ['QR_TrainCal','CQR','CQR_MDA']\n",
93 | "\n",
94 | "basemodel = 'NNet'\n",
95 | "mask = 'Yes'\n",
96 | "protection = 'No'\n",
97 | "imputation = 'iterative_ridge'\n",
98 | "\n",
99 | "name_pipeline_to_plot = []\n",
100 | "for method in methods: \n",
101 | " if method == 'CQR_MDA':\n",
102 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=True)\n",
103 | " if not name_temp in name_pipeline_to_plot:\n",
104 | " name_pipeline_to_plot.append(name_temp)\n",
105 | " \n",
106 | " name_temp = files.get_name_method(method, basemodel, mask=mask, protection=protection, exact=False)\n",
107 | " if not name_temp in name_pipeline_to_plot:\n",
108 | " name_pipeline_to_plot.append(name_temp)\n",
109 | " \n",
110 | "current_pipeline = method+'_'+basemodel"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "dict_cov = dict.fromkeys(name_pipeline_to_plot)\n",
120 | "dict_len = dict.fromkeys(name_pipeline_to_plot)\n",
121 | "\n",
122 | "for pipeline in name_pipeline_to_plot:\n",
123 | " dict_cov[pipeline] = {}\n",
124 | " dict_len[pipeline] = {}"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "keys_pattern = np.arange(d)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "impute_inf = True\n",
143 | "replace_inf = True"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "key = -1\n",
153 | "\n",
154 | "nb_boxplot = len(keys_pattern)+1\n",
155 | "\n",
156 | "name_method = []\n",
157 | "\n",
158 | "for pipeline in tqdm(name_pipeline_to_plot):\n",
159 | "\n",
160 | " name_method = np.append(name_method, '_'.join([imputation, pipeline]))\n",
161 | "\n",
162 | " data, results = utils.get_data_results(pipeline, train_size, cal_size, params_test, n_rep, d=d, imputation=imputation,\n",
163 | " params_reg=params_reg, params_noise=params_noise, params_missing=params_missing,\n",
164 | " parent_results='results', parent_data='data', extension='xz')\n",
165 | "\n",
166 | " contains, lengths = utils.compute_PI_metrics(data, results, 'iid')\n",
167 | " \n",
168 | " if replace_inf:\n",
169 | " max_y_train = np.max(data['Y']['Train'], axis=1)\n",
170 | " max_y_cal = np.max(data['Y']['Cal'], axis=1)\n",
171 | " min_y_train = np.min(data['Y']['Train'], axis=1)\n",
172 | " min_y_cal = np.min(data['Y']['Cal'], axis=1)\n",
173 | " max_length_traincal = np.maximum(max_y_train, max_y_cal)-np.minimum(min_y_train, min_y_cal)\n",
174 | " for k in range(n_rep):\n",
175 | " idx_inf = np.where(np.isinf(lengths[k,:]))[0]\n",
176 | " if len(idx_inf)>0:\n",
177 | " lengths[k,:][idx_inf] = max_length_traincal[k]\n",
178 | " \n",
179 | " metrics = utils.compute_metrics_cond(n_rep, data, results, 'fixed_nb_sample_pattern_size', cond='Pattern_Size',\n",
180 | " replace_inf=replace_inf)\n",
181 | " \n",
182 | " dict_cov[pipeline][key] = np.mean(contains, axis=1)\n",
183 | " dict_len[pipeline][key] = np.mean(lengths, axis=1)\n",
184 | "\n",
185 | " #key += 1\n",
186 | "\n",
187 | " for key_pattern in keys_pattern:\n",
188 | "\n",
189 | " dict_cov[pipeline][key_pattern] = metrics[key_pattern]['avg_cov']\n",
190 | " dict_len[pipeline][key_pattern] = metrics[key_pattern]['avg_len']\n",
191 | "\n",
192 | " #key += 1"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "import functools"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "if 'phi' in params_reg:\n",
211 | " phi = params_reg['phi']\n",
212 | "else:\n",
213 | " phi = 0.8\n",
214 | "cov = np.full((d,d),phi)+(1-phi)*np.eye(d)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "len_oracle_marginal = []\n",
224 | "M_test = data['M']['Test']['iid']\n",
225 | "test_size = M_test.shape[1]\n",
226 | "for i in range(n_rep):\n",
227 | " \n",
228 | " M_test_i = M_test[i,:,:]\n",
229 | " patterns = np.unique(M_test_i, axis=0)\n",
230 | " oracles_len_per_pattern = list(map(functools.partial(prediction.oracle_len_pattern, beta=beta, cov=cov, alpha=0.1), patterns))\n",
231 | "\n",
232 | " len_oracle = np.empty(test_size)\n",
233 | " \n",
234 | " for idp, pattern in enumerate(patterns):\n",
235 | " pattern_id = utils.pattern_to_id(pattern.astype(int))\n",
236 | " M_test_id = list(map(utils.pattern_to_id, M_test_i.astype(int)))\n",
237 | " len_oracle[np.where(np.array(M_test_id) == pattern_id)] = oracles_len_per_pattern[idp]\n",
238 | " len_oracle_marginal = np.append(len_oracle_marginal, np.mean(len_oracle))"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "len_oracle = {}\n",
248 | "patterns_by_size = dict.fromkeys(np.arange(0,d))\n",
249 | "for k in range(d):\n",
250 | " patterns_by_size[k] = []\n",
251 | "patterns_id = np.arange(0, 2**d-1)\n",
252 | "for pattern_id in patterns_id:\n",
253 | " vec_pattern = utils.bin_to_vec(bin(pattern_id), d)\n",
254 | " size_pattern = utils.pattern_to_size(vec_pattern)\n",
255 | " patterns_by_size[size_pattern] = np.append(patterns_by_size[size_pattern], pattern_id)\n",
256 | "for k in range(d):\n",
257 | " list_len = []\n",
258 | " for pattern_id in patterns_by_size[k]:\n",
259 | "        vec_pattern = utils.bin_to_vec(bin(int(pattern_id)), d)\n",
260 | " list_len = np.append(list_len, prediction.oracle_len_pattern(vec_pattern, beta, cov))\n",
261 | " len_oracle[k] = np.mean(list_len)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "import seaborn as sns"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "dict_methods = {'QR_TrainCal_NNet_Mask': 'QR',\n",
280 | " 'QR_NNet_Mask': 'QR',\n",
281 | " 'CQR_NNet_Mask': 'CQR', \n",
282 | " 'CQR_MDA_Exact_NNet_Mask': 'CQR-MDA-Exact',\n",
283 | " 'CQR_MDA_Nested_NNet_Mask': 'CQR-MDA-Nested'}"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(15,6), sharey='row')\n",
293 | "\n",
294 | "name_ticks = list(map(utils.name_tick, name_method))\n",
295 | "colors_palette = sns.color_palette(\"husl\", nb_boxplot)\n",
296 | "colors = colors_palette \n",
297 | "\n",
298 | "ax = [ax1, ax2, ax3, ax4]\n",
299 | "nb_subplots = len(ax)\n",
300 | "for axi in ax:\n",
301 | " axi.axhline(1-alpha, color='black', ls='--')\n",
302 | "\n",
303 | "axl = [ax5, ax6, ax7, ax8]\n",
304 | " \n",
305 | "for idp, pipeline in enumerate(name_pipeline_to_plot):\n",
306 | " \n",
307 | " ax[idp].set_title(dict_methods[pipeline])\n",
308 | " \n",
309 | " box = ax[idp].violinplot(dict_cov[pipeline].values(), showmeans=True, showextrema=False)#, quantiles=[[0.25, 0.75]]*nb_boxes)#, patch_artist=True)\n",
310 | " for pc,color in zip(box['bodies'], colors):\n",
311 | " pc.set_facecolor(color)\n",
312 | " pc.set_edgecolor('black')\n",
313 | " pc.set_alpha(1)\n",
314 | " box['cmeans'].set_color('black')\n",
315 | " \n",
316 | " box = axl[idp].violinplot(dict_len[pipeline].values(), showmeans=True, showextrema=False)#, quantiles=[[0.25, 0.75]]*nb_boxes)#, patch_artist=True)\n",
317 | " for pc,color in zip(box['bodies'], colors):\n",
318 | " pc.set_facecolor(color)\n",
319 | " pc.set_edgecolor('black')\n",
320 | " pc.set_alpha(1)\n",
321 | " box['cmeans'].set_color('black')\n",
322 | " \n",
323 | "idx = np.arange(d+1)\n",
324 | "idy = np.append([np.mean(len_oracle_marginal)], np.array(list(len_oracle.values())))\n",
325 | "\n",
326 | "for axi in axl:\n",
327 | " axi.scatter(idx+1, idy, color=colors, zorder=2, marker='*', s=100, edgecolor='black')\n",
328 | "\n",
329 | "for axi in ax:\n",
330 | " axi.set_xticks([])\n",
331 | " \n",
332 | "name_ticks_missing = []\n",
333 | "for k in range(d):\n",
334 | " name_ticks_missing = np.append(name_ticks_missing, str(k)+r' \\texttt{NA}')\n",
335 | "name_ticks = np.append(['Marg.'], name_ticks_missing)\n",
336 | "\n",
337 | "for axi in axl:\n",
338 | " ticks = np.arange(0,d+1)\n",
339 | " axi.set_xticks(ticks+1)\n",
340 | " axi.set_xticklabels(name_ticks, rotation=70)\n",
341 | "\n",
342 | "ax1.set_ylabel('Average coverage')\n",
343 | "ax5.set_ylabel('Average length')\n",
344 | "\n",
345 | "ax5.legend(handles = [mlines.Line2D([], [], marker=\"*\", linestyle='None', markersize=15, markeredgecolor='black', markerfacecolor='White')],\n",
346 | " labels=['Oracle length'], loc='upper left', handletextpad=10**(-60))\n",
347 | "\n",
348 | "fig.tight_layout()\n",
349 | "\n",
350 | "name_plot = 'plots/synthetic/Linear_d_'+str(d)+'_NA_'+str(prob_missing)+'_imputation_'+str(imputation)+'_basemodel_'+basemodel\n",
351 | "if mask == 'Yes':\n",
352 | " name_plot = name_plot + '_mask' \n",
353 | "name_plot = name_plot + '_train_'+str(train_size) + '_cal_'+str(cal_size) +'_rep_'+str(n_rep)\n",
354 | "if mask == 'No':\n",
355 | " name_plot = name_plot+'_nomask'\n",
356 | "if impute_inf:\n",
357 | " name_plot = name_plot+'_replaceinf'\n",
358 | "plt.savefig(name_plot+'.pdf',bbox_inches='tight', dpi=300)\n",
359 | "\n",
360 | "plt.show()"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": null,
366 | "metadata": {},
367 | "outputs": [],
368 | "source": []
369 | }
370 | ],
371 | "metadata": {
372 | "kernelspec": {
373 | "display_name": "Python 3",
374 | "language": "python",
375 | "name": "python3"
376 | },
377 | "language_info": {
378 | "codemirror_mode": {
379 | "name": "ipython",
380 | "version": 3
381 | },
382 | "file_extension": ".py",
383 | "mimetype": "text/x-python",
384 | "name": "python",
385 | "nbconvert_exporter": "python",
386 | "pygments_lexer": "ipython3",
387 | "version": "3.8.5"
388 | }
389 | },
390 | "nbformat": 4,
391 | "nbformat_minor": 4
392 | }
393 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Margaux Zaffran
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Conformal Prediction with Missing Values
2 |
3 | This repository contains the code to reproduce the experiments of the paper _Conformal Prediction with Missing Values_, M. Zaffran, A. Dieuleveut, J. Josse and Y. Romano, ICML 2023.
4 |
 5 | The notebook ``CP_NA_Synthetic.ipynb`` reproduces the synthetic experiments, while ``CP_NA_Semi-synthetic.ipynb`` focuses on the semi-synthetic experiments.
 6 | The corresponding ``_Plots`` notebooks contain the code to display the results in the same format as in the paper.
7 |
8 | The core code for the algorithms CP-MDA-Exact and CP-MDA-Nested can be found in the ```prediction.py``` file.
9 |
10 | ``imputation.py`` contains the functions used to impute the data sets.
11 | ``generation.py`` generates the synthetic data (features and outcome, as well as the missing values).
12 | ``files.py`` handles file naming, writing and loading.
13 | ``utils.py`` contains utility functions, e.g. to compute the metrics associated with interval predictions, combinatorics on missingness patterns, etc.
14 | ``datasets.py`` pre-processes the real data sets used in the semi-synthetic experiments.
15 |
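For reference, here is a minimal sketch of how the notebooks chain these modules together in the synthetic setting; the calls and arguments mirror ``CP_NA_Synthetic.ipynb``, and sizes, methods and the imputer can be adapted:

```python
import numpy as np
import generation as gen
import imputation as imp
import prediction

d = 10
# Test sets: an i.i.d. one, plus sets with a fixed number of samples per missingness pattern.
params_test = gen.process_test({'iid': {'test_size': 2000},
                                'fixed_nb_sample_pattern': {'nb_sample_pattern': 100},
                                'fixed_nb_sample_pattern_size': {'nb_sample_pattern': 100}}, d=d)
params_reg = {'regression': 'Linear', 'phi': 0.8,
              'beta': np.array([1, 2, -1, 3, -0.5, -1, 0.3, 1.7, 0.4, -0.3])}
params_noise = {'noise': 'Gaussian'}
params_missing = {'prob_missing': 0.2, 'var_missing': np.full(d, 1), 'mechanism': 'MCAR'}

# 1. Generate covariates, missingness masks and outcomes over 100 repetitions.
X, X_missing, M, Y, params_missing = gen.generate_multiple_data(
    500, 250, params_test, n_rep=100, dim=d,
    params_reg=params_reg, params_noise=params_noise, params_missing=params_missing)
data = {'X': X, 'X_missing': X_missing, 'M': M, 'Y': Y}

# 2. Impute the missing entries.
data['X_imp'] = imp.impute(data, 'iterative_ridge')

# 3. Run the conformal pipelines; 'CQR_MDA' with exacts=[False, True] runs CP-MDA-Nested and CP-MDA-Exact.
results, methods_ran = prediction.run_experiments(
    data, alpha=0.1, methods=['QR_TrainCal', 'CQR', 'CQR_MDA'],
    basemodels=['NNet'], params_basemodel={'cores': 1},
    masks=['Yes'], protections=['No'], exacts=[False, True],
    imputation='iterative_ridge', params_reg=params_reg)
```
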
16 | Note that, as mentioned in the ```.py``` files, some pieces of code are taken from other GitHub repositories, namely:
17 | + CQR (Romano et al., 2019) repository, available [here](https://github.com/yromano/cqr), for the (cleaning of the) data sets used in the semi-synthetic experiments;
18 | + CHR (Sesia and Romano, 2021) repository, available [here](https://github.com/msesia/chr), for the Quantile Neural Network architecture.
19 |
20 | This repository will be updated in the next few days.
21 |
22 | ## License
23 |
24 | [MIT](LICENSE) © Margaux Zaffran
--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/data/.DS_Store
--------------------------------------------------------------------------------
/data/cqr_datasets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/data/cqr_datasets/.DS_Store
--------------------------------------------------------------------------------
/data/cqr_datasets/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 5
6 | }
7 |
--------------------------------------------------------------------------------
/data/cqr_datasets/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Publicly Available Datasets
3 |
4 | * Please download the file blogData_train.csv from [this link](https://archive.ics.uci.edu/ml/datasets/BlogFeedback), and save it in this directory.
5 |
6 | * Please download the files Features_Variant_1.csv and Features_Variant_2.csv from
 7 | [this link](https://archive.ics.uci.edu/ml/datasets/Facebook+Comment+Volume+Dataset) and store the two files in the ./facebook/ directory.
8 |
9 | ## Data subject to copyright/usage rules
10 |
11 | Please follow the instructions in [this README](https://github.com/yromano/cqr/blob/master/get_meps_data/README.md), which describes how to download and process the MEPS datasets.
12 |
13 | Once downloaded, copy the three files 'meps_19_reg.csv', 'meps_20_reg.csv', and 'meps_21_reg.csv' to this folder.
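
Once these files are in place, this directory is expected to look like the sketch below (in addition to the data sets already shipped with the repository, e.g. CASP.csv, Concrete_Data.csv, STAR.csv, bike_train.csv, communities.data):

```
data/cqr_datasets/
├── blogData_train.csv
├── facebook/
│   ├── Features_Variant_1.csv
│   └── Features_Variant_2.csv
├── meps_19_reg.csv
├── meps_20_reg.csv
└── meps_21_reg.csv
```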
14 |
--------------------------------------------------------------------------------
/data/cqr_datasets/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "951c0c40",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import os\n",
11 | "from copy import deepcopy\n",
12 | "from tqdm import tqdm\n",
13 | "import scipy.stats as stat\n",
14 | "\n",
15 | "import numpy as np\n",
16 | "import pandas as pd"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 12,
22 | "id": "1e1b8e72",
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import datasets"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 24,
32 | "id": "b20e5441",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "dataset_base_path = \"./\"\n",
37 | "dataset_name = \"bio\"\n",
38 | "data = pd.read_csv('CASP.csv')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 25,
44 | "id": "5b07be79",
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "X, y = datasets.GetDataset(dataset_name, dataset_base_path)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 26,
54 | "id": "d693e22b",
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
160 | "text/plain": [
161 | " RMSD F1 F2 F3 F4 F5 F6 \\\n",
162 | "0 17.284 13558.30 4305.35 0.31754 162.1730 1.872791e+06 215.3590 \n",
163 | "1 6.021 6191.96 1623.16 0.26213 53.3894 8.034467e+05 87.2024 \n",
164 | "2 9.275 7725.98 1726.28 0.22343 67.2887 1.075648e+06 81.7913 \n",
165 | "3 15.851 8424.58 2368.25 0.28111 67.8325 1.210472e+06 109.4390 \n",
166 | "4 7.962 7460.84 1736.94 0.23280 52.4123 1.021020e+06 94.5234 \n",
167 | "\n",
168 | " F7 F8 F9 \n",
169 | "0 4287.87 102 27.0302 \n",
170 | "1 3328.91 39 38.5468 \n",
171 | "2 2981.04 29 38.8119 \n",
172 | "3 3248.22 70 39.0651 \n",
173 | "4 2814.42 41 39.9147 "
174 | ]
175 | },
176 | "execution_count": 26,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "data.head()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 27,
188 | "id": "2300acd2",
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "(45730, 10)"
195 | ]
196 | },
197 | "execution_count": 27,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "data.shape"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 28,
209 | "id": "f6ebe8c8",
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "data": {
214 | "text/plain": [
215 | "RMSD 0\n",
216 | "F1 0\n",
217 | "F2 0\n",
218 | "F3 0\n",
219 | "F4 0\n",
220 | "F5 0\n",
221 | "F6 0\n",
222 | "F7 0\n",
223 | "F8 0\n",
224 | "F9 0\n",
225 | "dtype: int64"
226 | ]
227 | },
228 | "execution_count": 28,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "np.sum(data.isnull())"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 32,
240 | "id": "cbaba292",
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/plain": [
246 | "array([[1.35583e+04, 4.30535e+03, 3.17540e-01, ..., 4.28787e+03,\n",
247 | " 1.02000e+02, 2.70302e+01],\n",
248 | " [6.19196e+03, 1.62316e+03, 2.62130e-01, ..., 3.32891e+03,\n",
249 | " 3.90000e+01, 3.85468e+01],\n",
250 | " [7.72598e+03, 1.72628e+03, 2.23430e-01, ..., 2.98104e+03,\n",
251 | " 2.90000e+01, 3.88119e+01],\n",
252 | " ...,\n",
253 | " [7.72665e+03, 2.48958e+03, 3.22200e-01, ..., 3.29046e+03,\n",
254 | " 4.60000e+01, 3.74718e+01],\n",
255 | " [8.87893e+03, 3.05578e+03, 3.44160e-01, ..., 3.42179e+03,\n",
256 | " 4.10000e+01, 3.56045e+01],\n",
257 | " [1.27324e+04, 4.44436e+03, 3.49050e-01, ..., 4.62685e+03,\n",
258 | " 1.41000e+02, 2.98118e+01]], dtype=float32)"
259 | ]
260 | },
261 | "execution_count": 32,
262 | "metadata": {},
263 | "output_type": "execute_result"
264 | }
265 | ],
266 | "source": [
267 | "X"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 30,
273 | "id": "5cdfe892",
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "array([17.284, 6.021, 9.275, ..., 10.356, 9.791, 18.827], dtype=float32)"
280 | ]
281 | },
282 | "execution_count": 30,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "y"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "id": "12d0a8d1",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": []
298 | }
299 | ],
300 | "metadata": {
301 | "kernelspec": {
302 | "display_name": "Python 3 (ipykernel)",
303 | "language": "python",
304 | "name": "python3"
305 | },
306 | "language_info": {
307 | "codemirror_mode": {
308 | "name": "ipython",
309 | "version": 3
310 | },
311 | "file_extension": ".py",
312 | "mimetype": "text/x-python",
313 | "name": "python",
314 | "nbconvert_exporter": "python",
315 | "pygments_lexer": "ipython3",
316 | "version": "3.9.12"
317 | }
318 | },
319 | "nbformat": 4,
320 | "nbformat_minor": 5
321 | }
322 |
--------------------------------------------------------------------------------
/data/cqr_datasets/communities_attributes.csv:
--------------------------------------------------------------------------------
1 | attributes
2 | state
3 | county
4 | community
5 | communityname
6 | fold
7 | population
8 | householdsize
9 | racepctblack
10 | racePctWhite
11 | racePctAsian
12 | racePctHisp
13 | agePct12t21
14 | agePct12t29
15 | agePct16t24
16 | agePct65up
17 | numbUrban
18 | pctUrban
19 | medIncome
20 | pctWWage
21 | pctWFarmSelf
22 | pctWInvInc
23 | pctWSocSec
24 | pctWPubAsst
25 | pctWRetire
26 | medFamInc
27 | perCapInc
28 | whitePerCap
29 | blackPerCap
30 | indianPerCap
31 | AsianPerCap
32 | OtherPerCap
33 | HispPerCap
34 | NumUnderPov
35 | PctPopUnderPov
36 | PctLess9thGrade
37 | PctNotHSGrad
38 | PctBSorMore
39 | PctUnemployed
40 | PctEmploy
41 | PctEmplManu
42 | PctEmplProfServ
43 | PctOccupManu
44 | PctOccupMgmtProf
45 | MalePctDivorce
46 | MalePctNevMarr
47 | FemalePctDiv
48 | TotalPctDiv
49 | PersPerFam
50 | PctFam2Par
51 | PctKids2Par
52 | PctYoungKids2Par
53 | PctTeen2Par
54 | PctWorkMomYoungKids
55 | PctWorkMom
56 | NumIlleg
57 | PctIlleg
58 | NumImmig
59 | PctImmigRecent
60 | PctImmigRec5
61 | PctImmigRec8
62 | PctImmigRec10
63 | PctRecentImmig
64 | PctRecImmig5
65 | PctRecImmig8
66 | PctRecImmig10
67 | PctSpeakEnglOnly
68 | PctNotSpeakEnglWell
69 | PctLargHouseFam
70 | PctLargHouseOccup
71 | PersPerOccupHous
72 | PersPerOwnOccHous
73 | PersPerRentOccHous
74 | PctPersOwnOccup
75 | PctPersDenseHous
76 | PctHousLess3BR
77 | MedNumBR
78 | HousVacant
79 | PctHousOccup
80 | PctHousOwnOcc
81 | PctVacantBoarded
82 | PctVacMore6Mos
83 | MedYrHousBuilt
84 | PctHousNoPhone
85 | PctWOFullPlumb
86 | OwnOccLowQuart
87 | OwnOccMedVal
88 | OwnOccHiQuart
89 | RentLowQ
90 | RentMedian
91 | RentHighQ
92 | MedRent
93 | MedRentPctHousInc
94 | MedOwnCostPctInc
95 | MedOwnCostPctIncNoMtg
96 | NumInShelters
97 | NumStreet
98 | PctForeignBorn
99 | PctBornSameState
100 | PctSameHouse85
101 | PctSameCity85
102 | PctSameState85
103 | LemasSwornFT
104 | LemasSwFTPerPop
105 | LemasSwFTFieldOps
106 | LemasSwFTFieldPerPop
107 | LemasTotalReq
108 | LemasTotReqPerPop
109 | PolicReqPerOffic
110 | PolicPerPop
111 | RacialMatchCommPol
112 | PctPolicWhite
113 | PctPolicBlack
114 | PctPolicHisp
115 | PctPolicAsian
116 | PctPolicMinor
117 | OfficAssgnDrugUnits
118 | NumKindsDrugsSeiz
119 | PolicAveOTWorked
120 | LandArea
121 | PopDens
122 | PctUsePubTrans
123 | PolicCars
124 | PolicOperBudg
125 | LemasPctPolicOnPatr
126 | LemasGangUnitDeploy
127 | LemasPctOfficDrugUn
128 | PolicBudgPerPop
129 | ViolentCrimesPerPop
130 |
--------------------------------------------------------------------------------
/data/cqr_datasets/facebook/README.md:
--------------------------------------------------------------------------------
1 |
2 | Please download the files Features_Variant_1.csv and Features_Variant_2.csv from
3 | [this link](https://archive.ics.uci.edu/ml/datasets/Facebook+Comment+Volume+Dataset) and store both files in this directory.
4 |
--------------------------------------------------------------------------------
/datasets.py:
--------------------------------------------------------------------------------
1 | # Code adapted from CQR GitHub (Yaniv Romano, 2019)
2 | # https://github.com/yromano/cqr
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 |
8 | def GetDataset(name, base_path):
9 | """ Load a dataset
10 |
11 | Parameters
12 | ----------
13 | name : string, dataset name
14 | base_path : string, e.g. "path/to/datasets/directory/"
15 |
16 | Returns
17 | -------
18 | data : dataframe containing the data set (shape n x (1+d))
19 | response_name : string defining the column to be predicted
20 | continuous_var : boolean vector of length d, 0 meaning categorical variable and 1 continuous
21 |
22 | """
23 | if name == "meps_19":
24 | df = pd.read_csv(base_path + 'meps_19_reg.csv')
25 |
26 | response_name = "UTILIZATION_reg"
27 |
28 | features_names = ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PERWT15F', 'REGION=1',
29 | 'REGION=2', 'REGION=3', 'REGION=4', 'SEX=1', 'SEX=2', 'MARRY=1',
30 | 'MARRY=2', 'MARRY=3', 'MARRY=4', 'MARRY=5', 'MARRY=6', 'MARRY=7',
31 | 'MARRY=8', 'MARRY=9', 'MARRY=10', 'FTSTU=-1', 'FTSTU=1', 'FTSTU=2',
32 | 'FTSTU=3', 'ACTDTY=1', 'ACTDTY=2', 'ACTDTY=3', 'ACTDTY=4',
33 | 'HONRDC=1', 'HONRDC=2', 'HONRDC=3', 'HONRDC=4', 'RTHLTH=-1',
34 | 'RTHLTH=1', 'RTHLTH=2', 'RTHLTH=3', 'RTHLTH=4', 'RTHLTH=5',
35 | 'MNHLTH=-1', 'MNHLTH=1', 'MNHLTH=2', 'MNHLTH=3', 'MNHLTH=4',
36 | 'MNHLTH=5', 'HIBPDX=-1', 'HIBPDX=1', 'HIBPDX=2', 'CHDDX=-1',
37 | 'CHDDX=1', 'CHDDX=2', 'ANGIDX=-1', 'ANGIDX=1', 'ANGIDX=2',
38 | 'MIDX=-1', 'MIDX=1', 'MIDX=2', 'OHRTDX=-1', 'OHRTDX=1', 'OHRTDX=2',
39 | 'STRKDX=-1', 'STRKDX=1', 'STRKDX=2', 'EMPHDX=-1', 'EMPHDX=1',
40 | 'EMPHDX=2', 'CHBRON=-1', 'CHBRON=1', 'CHBRON=2', 'CHOLDX=-1',
41 | 'CHOLDX=1', 'CHOLDX=2', 'CANCERDX=-1', 'CANCERDX=1', 'CANCERDX=2',
42 | 'DIABDX=-1', 'DIABDX=1', 'DIABDX=2', 'JTPAIN=-1', 'JTPAIN=1',
43 | 'JTPAIN=2', 'ARTHDX=-1', 'ARTHDX=1', 'ARTHDX=2', 'ARTHTYPE=-1',
44 | 'ARTHTYPE=1', 'ARTHTYPE=2', 'ARTHTYPE=3', 'ASTHDX=1', 'ASTHDX=2',
45 | 'ADHDADDX=-1', 'ADHDADDX=1', 'ADHDADDX=2', 'PREGNT=-1', 'PREGNT=1',
46 | 'PREGNT=2', 'WLKLIM=-1', 'WLKLIM=1', 'WLKLIM=2', 'ACTLIM=-1',
47 | 'ACTLIM=1', 'ACTLIM=2', 'SOCLIM=-1', 'SOCLIM=1', 'SOCLIM=2',
48 | 'COGLIM=-1', 'COGLIM=1', 'COGLIM=2', 'DFHEAR42=-1', 'DFHEAR42=1',
49 | 'DFHEAR42=2', 'DFSEE42=-1', 'DFSEE42=1', 'DFSEE42=2',
50 | 'ADSMOK42=-1', 'ADSMOK42=1', 'ADSMOK42=2', 'PHQ242=-1', 'PHQ242=0',
51 | 'PHQ242=1', 'PHQ242=2', 'PHQ242=3', 'PHQ242=4', 'PHQ242=5',
52 | 'PHQ242=6', 'EMPST=-1', 'EMPST=1', 'EMPST=2', 'EMPST=3', 'EMPST=4',
53 | 'POVCAT=1', 'POVCAT=2', 'POVCAT=3', 'POVCAT=4', 'POVCAT=5',
54 | 'INSCOV=1', 'INSCOV=2', 'INSCOV=3', 'RACE']
55 |
56 | d = len(features_names)
57 | continuous_var = np.append(np.full(5, 1), np.full(d - 5, 0))
58 |
59 | col_names = np.append(features_names, response_name)
60 |
61 | data = df[col_names]
62 |
63 | if name == "meps_20":
64 | df = pd.read_csv(base_path + 'meps_20_reg.csv')
65 |
66 | response_name = "UTILIZATION_reg"
67 |
68 | features_names = ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PERWT15F', 'REGION=1',
69 | 'REGION=2', 'REGION=3', 'REGION=4', 'SEX=1', 'SEX=2', 'MARRY=1',
70 | 'MARRY=2', 'MARRY=3', 'MARRY=4', 'MARRY=5', 'MARRY=6', 'MARRY=7',
71 | 'MARRY=8', 'MARRY=9', 'MARRY=10', 'FTSTU=-1', 'FTSTU=1', 'FTSTU=2',
72 | 'FTSTU=3', 'ACTDTY=1', 'ACTDTY=2', 'ACTDTY=3', 'ACTDTY=4',
73 | 'HONRDC=1', 'HONRDC=2', 'HONRDC=3', 'HONRDC=4', 'RTHLTH=-1',
74 | 'RTHLTH=1', 'RTHLTH=2', 'RTHLTH=3', 'RTHLTH=4', 'RTHLTH=5',
75 | 'MNHLTH=-1', 'MNHLTH=1', 'MNHLTH=2', 'MNHLTH=3', 'MNHLTH=4',
76 | 'MNHLTH=5', 'HIBPDX=-1', 'HIBPDX=1', 'HIBPDX=2', 'CHDDX=-1',
77 | 'CHDDX=1', 'CHDDX=2', 'ANGIDX=-1', 'ANGIDX=1', 'ANGIDX=2',
78 | 'MIDX=-1', 'MIDX=1', 'MIDX=2', 'OHRTDX=-1', 'OHRTDX=1', 'OHRTDX=2',
79 | 'STRKDX=-1', 'STRKDX=1', 'STRKDX=2', 'EMPHDX=-1', 'EMPHDX=1',
80 | 'EMPHDX=2', 'CHBRON=-1', 'CHBRON=1', 'CHBRON=2', 'CHOLDX=-1',
81 | 'CHOLDX=1', 'CHOLDX=2', 'CANCERDX=-1', 'CANCERDX=1', 'CANCERDX=2',
82 | 'DIABDX=-1', 'DIABDX=1', 'DIABDX=2', 'JTPAIN=-1', 'JTPAIN=1',
83 | 'JTPAIN=2', 'ARTHDX=-1', 'ARTHDX=1', 'ARTHDX=2', 'ARTHTYPE=-1',
84 | 'ARTHTYPE=1', 'ARTHTYPE=2', 'ARTHTYPE=3', 'ASTHDX=1', 'ASTHDX=2',
85 | 'ADHDADDX=-1', 'ADHDADDX=1', 'ADHDADDX=2', 'PREGNT=-1', 'PREGNT=1',
86 | 'PREGNT=2', 'WLKLIM=-1', 'WLKLIM=1', 'WLKLIM=2', 'ACTLIM=-1',
87 | 'ACTLIM=1', 'ACTLIM=2', 'SOCLIM=-1', 'SOCLIM=1', 'SOCLIM=2',
88 | 'COGLIM=-1', 'COGLIM=1', 'COGLIM=2', 'DFHEAR42=-1', 'DFHEAR42=1',
89 | 'DFHEAR42=2', 'DFSEE42=-1', 'DFSEE42=1', 'DFSEE42=2',
90 | 'ADSMOK42=-1', 'ADSMOK42=1', 'ADSMOK42=2', 'PHQ242=-1', 'PHQ242=0',
91 | 'PHQ242=1', 'PHQ242=2', 'PHQ242=3', 'PHQ242=4', 'PHQ242=5',
92 | 'PHQ242=6', 'EMPST=-1', 'EMPST=1', 'EMPST=2', 'EMPST=3', 'EMPST=4',
93 | 'POVCAT=1', 'POVCAT=2', 'POVCAT=3', 'POVCAT=4', 'POVCAT=5',
94 | 'INSCOV=1', 'INSCOV=2', 'INSCOV=3', 'RACE']
95 |
96 | d = len(features_names)
97 | continuous_var = np.append(np.full(5, 1), np.full(d - 5, 0))
98 |
99 | col_names = np.append(features_names, response_name)
100 |
101 | data = df[col_names]
102 |
103 | if name == "meps_21":
104 | df = pd.read_csv(base_path + 'meps_21_reg.csv')
105 |
106 | response_name = "UTILIZATION_reg"
107 |
108 | features_names = ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PERWT16F', 'REGION=1',
109 | 'REGION=2', 'REGION=3', 'REGION=4', 'SEX=1', 'SEX=2', 'MARRY=1',
110 | 'MARRY=2', 'MARRY=3', 'MARRY=4', 'MARRY=5', 'MARRY=6', 'MARRY=7',
111 | 'MARRY=8', 'MARRY=9', 'MARRY=10', 'FTSTU=-1', 'FTSTU=1', 'FTSTU=2',
112 | 'FTSTU=3', 'ACTDTY=1', 'ACTDTY=2', 'ACTDTY=3', 'ACTDTY=4',
113 | 'HONRDC=1', 'HONRDC=2', 'HONRDC=3', 'HONRDC=4', 'RTHLTH=-1',
114 | 'RTHLTH=1', 'RTHLTH=2', 'RTHLTH=3', 'RTHLTH=4', 'RTHLTH=5',
115 | 'MNHLTH=-1', 'MNHLTH=1', 'MNHLTH=2', 'MNHLTH=3', 'MNHLTH=4',
116 | 'MNHLTH=5', 'HIBPDX=-1', 'HIBPDX=1', 'HIBPDX=2', 'CHDDX=-1',
117 | 'CHDDX=1', 'CHDDX=2', 'ANGIDX=-1', 'ANGIDX=1', 'ANGIDX=2',
118 | 'MIDX=-1', 'MIDX=1', 'MIDX=2', 'OHRTDX=-1', 'OHRTDX=1', 'OHRTDX=2',
119 | 'STRKDX=-1', 'STRKDX=1', 'STRKDX=2', 'EMPHDX=-1', 'EMPHDX=1',
120 | 'EMPHDX=2', 'CHBRON=-1', 'CHBRON=1', 'CHBRON=2', 'CHOLDX=-1',
121 | 'CHOLDX=1', 'CHOLDX=2', 'CANCERDX=-1', 'CANCERDX=1', 'CANCERDX=2',
122 | 'DIABDX=-1', 'DIABDX=1', 'DIABDX=2', 'JTPAIN=-1', 'JTPAIN=1',
123 | 'JTPAIN=2', 'ARTHDX=-1', 'ARTHDX=1', 'ARTHDX=2', 'ARTHTYPE=-1',
124 | 'ARTHTYPE=1', 'ARTHTYPE=2', 'ARTHTYPE=3', 'ASTHDX=1', 'ASTHDX=2',
125 | 'ADHDADDX=-1', 'ADHDADDX=1', 'ADHDADDX=2', 'PREGNT=-1', 'PREGNT=1',
126 | 'PREGNT=2', 'WLKLIM=-1', 'WLKLIM=1', 'WLKLIM=2', 'ACTLIM=-1',
127 | 'ACTLIM=1', 'ACTLIM=2', 'SOCLIM=-1', 'SOCLIM=1', 'SOCLIM=2',
128 | 'COGLIM=-1', 'COGLIM=1', 'COGLIM=2', 'DFHEAR42=-1', 'DFHEAR42=1',
129 | 'DFHEAR42=2', 'DFSEE42=-1', 'DFSEE42=1', 'DFSEE42=2',
130 | 'ADSMOK42=-1', 'ADSMOK42=1', 'ADSMOK42=2', 'PHQ242=-1', 'PHQ242=0',
131 | 'PHQ242=1', 'PHQ242=2', 'PHQ242=3', 'PHQ242=4', 'PHQ242=5',
132 | 'PHQ242=6', 'EMPST=-1', 'EMPST=1', 'EMPST=2', 'EMPST=3', 'EMPST=4',
133 | 'POVCAT=1', 'POVCAT=2', 'POVCAT=3', 'POVCAT=4', 'POVCAT=5',
134 | 'INSCOV=1', 'INSCOV=2', 'INSCOV=3', 'RACE']
135 |
136 | d = len(features_names)
137 | continuous_var = np.append(np.full(5, 1), np.full(d - 5, 0))
138 |
139 | col_names = np.append(features_names, response_name)
140 |
141 | data = df[col_names]
142 |
143 | if name == "bio":
144 | # https://github.com/joefavergel/TertiaryPhysicochemicalProperties/blob/master/RMSD-ProteinTertiaryStructures.ipynb
145 | df = pd.read_csv(base_path + 'CASP.csv')
146 | response_name = 'RMSD'
147 | d = df.shape[1]-1
148 | continuous_var = np.full(d, 1)
149 | data = df
150 |
151 | if name == "concrete":
152 | dataset = np.loadtxt(open(base_path + 'Concrete_Data.csv', "rb"), delimiter=",", skiprows=1)
153 | data = pd.DataFrame(data=dataset)
154 | response_name = 8
155 | d = data.shape[1] - 1
156 | continuous_var = np.full(d, 1)
157 |
158 | if name == "bike":
159 | # https://www.kaggle.com/rajmehra03/bike-sharing-demand-rmsle-0-3194
160 | df = pd.read_csv(base_path + 'bike_train.csv')
161 |
162 |         # One-hot encode the season variable, since this enriches the feature set.
163 | season = pd.get_dummies(df['season'], prefix='season')
164 | df = pd.concat([df, season], axis=1)
165 |
166 |         # Same for the weather variable.
167 | weather = pd.get_dummies(df['weather'], prefix='weather')
168 | df = pd.concat([df, weather], axis=1)
169 |
170 |         # The original season and weather columns can now be dropped.
171 | df.drop(['season', 'weather'], inplace=True, axis=1)
172 | df.head()
173 |
174 | df["hour"] = [t.hour for t in pd.DatetimeIndex(df.datetime)]
175 | df["day"] = [t.dayofweek for t in pd.DatetimeIndex(df.datetime)]
176 | df["month"] = [t.month for t in pd.DatetimeIndex(df.datetime)]
177 | df['year'] = [t.year for t in pd.DatetimeIndex(df.datetime)]
178 | df['year'] = df['year'].map({2011: 0, 2012: 1})
179 |
180 | df.drop('datetime', axis=1, inplace=True)
181 | df.drop(['casual', 'registered'], axis=1, inplace=True)
182 | df.columns.to_series().groupby(df.dtypes).groups
183 |
184 | features_names = ['temp', 'atemp', 'humidity', 'windspeed', 'holiday', 'workingday',
185 | 'season_1', 'season_2', 'season_3', 'season_4', 'weather_1',
186 | 'weather_2', 'weather_3', 'weather_4', 'hour', 'day', 'month', 'year']
187 | response_name = 'count'
188 |
189 | d = len(features_names)
190 |         continuous_var = np.append(np.full(4, 1), np.full(d - 4, 0))  # 4 continuous features (temp, atemp, humidity, windspeed), d - 4 categorical encodings
191 |
192 | col_names = np.append(features_names, response_name)
193 |
194 | data = df[col_names]
195 |
196 | if name == "community":
197 | # https://github.com/vbordalo/Communities-Crime/blob/master/Crime_v1.ipynb
198 | attrib = pd.read_csv(base_path + 'communities_attributes.csv', delim_whitespace=True)
199 | data = pd.read_csv(base_path + 'communities.data', names=attrib['attributes'])
200 | data = data.drop(columns=['state', 'county',
201 | 'community', 'communityname',
202 | 'fold'], axis=1)
203 |
204 | data = data.replace('?', np.nan)
205 | response_name = 'ViolentCrimesPerPop'
206 | 
207 |         # The remaining attributes in the communities data are all numeric
208 |         # (normalized in the UCI release), so mark them as continuous; without
209 |         # this, continuous_var would be undefined for the "community" dataset.
210 |         d = data.shape[1] - 1
211 |         continuous_var = np.full(d, 1)
212 | 
213 |     return data, response_name, continuous_var
214 | 
215 | 
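# Illustrative usage sketch (not part of the original module): how the notebooks
# call GetDataset. The base path is an assumption matching the repository layout.
if __name__ == "__main__":
    df, target, continuous_var = GetDataset("concrete", "./data/cqr_datasets/")
    print(df.shape, target)       # e.g. (1030, 9) 8 for the UCI concrete data
    print(continuous_var)         # all ones: every concrete feature is continuous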
--------------------------------------------------------------------------------
/files.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import lzma
3 | import numpy as np
4 | import os
5 |
6 | def get_setting(dim=3, params_reg={'regression':'Linear'}, params_noise={'noise':'Gaussian'}, params_missing={}):
7 |
8 | regression = params_reg['regression']
9 | assert regression in ['Linear'], 'regression must be Linear.'
10 |
11 | if 'mean' in params_reg:
12 | mean = params_reg['mean']
13 | else:
14 | mean = 1
15 | if 'scale' in params_reg:
16 | scale = params_reg['scale']
17 | else:
18 | scale = 1
19 |     if 'beta' in params_reg and params_reg['beta'] is not None:
20 | beta = params_reg['beta']
21 | else:
22 | beta = np.full(dim,1)
23 | if dim < 10:
24 | name = 'Linear_d_'+str(dim)+'_beta_'+'_'.join(str(x) for x in beta)+'_Gaussian_Mean_'+str(mean)
25 | else:
26 | name = 'Linear_d_'+str(dim)+'_beta_varies_Gaussian_Mean_'+str(mean)+'_Scale_'+str(scale)
27 |
28 | if 'prob_missing' in params_missing:
29 | prob_missing = params_missing['prob_missing']
30 | else:
31 | prob_missing = 0.2
32 |
33 | if 'mechanism' in params_missing:
34 | if params_missing['mechanism'] == 'MNAR_mask_quantiles':
35 | name = name + '_' + params_missing['mechanism'] + '_q_' + str(params_missing['q']) + '_'
36 | else:
37 | name = name + '_' + params_missing['mechanism'] + '_'
38 | if 'id_setting' in params_missing:
39 | name = name + 'id_'+str(params_missing['id_setting'])+'_'
40 | else:
41 | name = name + '_MCAR_'
42 |
43 | name = name + str(prob_missing)
44 |
45 | return name
46 |
47 | def get_name_data(train_size, cal_size, params_test, dim=3, params_reg={}, params_noise={}, dataset=None, params_missing={}, seed=1):
48 |     """
49 |     Parameters
50 |     ----------
51 |     train_size, cal_size : number of training and calibration samples
52 |     params_test : dictionary of test settings (its 'test_size' entry is used)
53 |     dim : dimension of the covariates (i.e. X lies in R^dim)
54 |     params_reg : parameters of the regression part (synthetic data only, 'regression' must be 'Linear')
55 |     params_noise : parameters of the noise (synthetic data only)
56 |     dataset : name of a real data set, or None to use the synthetic setting
57 |     params_missing : parameters of the missingness mechanism (e.g. 'prob_missing')
58 |     seed : random seed for reproducibility used in the experiment
59 | 
60 |     Returns
61 |     -------
62 |     name : name of the file containing (if existing)
63 |          the generated data with the given parameters of simulations
64 |     """
65 |
66 | max_test_size = np.max(params_test['test_size'])
67 |
68 | if dataset is None:
69 |
70 | regression = params_reg['regression']
71 |
72 | assert regression in ['Linear'], 'regression must be Linear.'
73 |
74 | name = get_setting(dim=dim, params_reg=params_reg, params_noise=params_noise, params_missing=params_missing)
75 |
76 | else:
77 | name = dataset
78 |
79 | name = name + '_seed_' + str(seed) + '_train_' + str(train_size) + '_cal_' + str(cal_size) + '_test_' + str(max_test_size)
80 |
81 | if 'prob_missing' in list(params_missing.keys()):
82 | name = name + '_prob_' + str(params_missing['prob_missing'])
83 |
84 | return name
85 |
86 | def get_name_data_imputed(train_size, cal_size, params_test, imputation,
87 | dim=3, params_reg={}, params_noise={}, dataset=None, params_missing={}, seed=1):
88 |     """
89 |     Parameters
90 |     ----------
91 |     train_size, cal_size : number of training and calibration samples
92 |     params_test : dictionary of test settings (its 'test_size' entry is used)
93 |     imputation : name of the imputation method, appended to the data file name
94 |     dim : dimension of the covariates (i.e. X lies in R^dim)
95 |     params_reg : parameters of the regression part (synthetic data only)
96 |     params_noise : parameters of the noise (synthetic data only)
97 |     dataset : name of a real data set, or None to use the synthetic setting
98 |     params_missing : parameters of the missingness mechanism (e.g. 'prob_missing')
99 |     seed : random seed for reproducibility used in the experiment
100 | 
101 |     Returns
102 |     -------
103 |     name : name of the file containing (if existing) the imputed data
104 |     """
105 |
106 | name = get_name_data(train_size, cal_size, params_test, dim=dim,
107 | params_reg=params_reg, params_noise=params_noise, dataset=dataset, params_missing=params_missing, seed=seed)
108 |
109 | if imputation is not None:
110 | name = name + '_imputation_' + imputation
111 |
112 | return name
113 |
114 | def get_name_results(pipeline, train_size, cal_size, n_rep, imputation=None, d=3,
115 | params_reg={}, params_noise={}, dataset=None, params_missing={}):
116 |     """ Build the directory and file names under which results are stored.
117 |     Parameters
118 |     ----------
119 |     pipeline : name of the prediction pipeline (typically the output of get_name_method, or 'Oracle')
120 |     imputation : name of the imputation method, ignored when pipeline is 'Oracle'
121 |     Returns
122 |     -------
123 |     name_directory, name_method : directory (data setting and sizes) and file (method) names
124 |     """
125 |
126 | # Results file name, depending on the method
127 |
128 | if pipeline != 'Oracle':
129 | name_method = pipeline+'_Imp_'+imputation
130 | else:
131 | name_method = pipeline
132 |
133 | # Results directory name, depending on the data simulation
134 |
135 | if dataset is not None:
136 | name_directory = dataset
137 | else:
138 | name_directory = get_setting(dim=d, params_reg=params_reg, params_noise=params_noise, params_missing=params_missing)
139 | if 'prob_missing' in list(params_missing.keys()):
140 | name_directory = name_directory + '_train_' + str(train_size) + '_cal_' + str(cal_size) + '_prob_' + str(params_missing['prob_missing']) + '_rep_' + str(n_rep)
141 | else:
142 | name_directory = name_directory + '_train_' + str(train_size) + '_cal_' + str(cal_size) + '_rep_' + str(n_rep)
143 |
144 | return name_directory, name_method
145 |
146 | def load_file(parent, name, ext):
147 |     """ Load a pickled object, possibly lzma-compressed.
148 |     Parameters
149 |     ----------
150 |     parent : directory containing the file
151 |     name : file name, without extension
152 |     ext : extension, 'pkl' (plain pickle) or 'xz' (lzma-compressed pickle)
153 |     Returns
154 |     -------
155 |     file : the unpickled object
156 |     """
157 | assert ext in ['pkl', 'xz'], 'ext must be pkl or xz.'
158 | path = parent + '/' + name + '.' + ext
159 | if ext == 'pkl':
160 | with open(path,'rb') as f:
161 | file = pickle.load(f)
162 | elif ext == 'xz':
163 | with lzma.open(path,'rb') as f:
164 | file = pickle.load(f)
165 |
166 | return file
167 |
168 | def write_file(parent, name, ext, file):
169 |     """ Pickle an object to disk, possibly lzma-compressed.
170 |     Parameters
171 |     ----------
172 |     parent : directory in which to write (created if it does not exist)
173 |     name : file name, without extension
174 |     ext : extension, 'pkl' (plain pickle) or 'xz' (lzma-compressed pickle)
175 |     file : the object to pickle
176 |     Returns
177 |     -------
178 |     """
179 |
180 | assert ext in ['pkl', 'xz'], 'ext must be pkl or xz.'
181 | path = parent + '/' + name + '.' + ext
182 | if ext == 'pkl':
183 | if not os.path.isdir(parent):
184 | os.makedirs(parent)
185 | with open(path,'wb') as f:
186 | pickle.dump(file, f)
187 | elif ext == 'xz':
188 | if not os.path.isdir(parent):
189 | os.makedirs(parent)
190 | with lzma.open(path,'wb') as f:
191 | pickle.dump(file, f)
192 |
193 | def get_name_method(method, basemodel=None, mask='No', protection='No', exact=False):
194 |     if exact:
195 |         assert method == 'CQR_MDA', 'exact = True is only supported with the CQR_MDA method.'
196 | method = method + '_Exact'
197 | if method == 'CQR_MDA':
198 | method = method + '_Nested'
199 | if method == 'Oracle':
200 | name = method
201 | elif method == 'Oracle_mean' and protection=='No':
202 | name = method
203 | elif method == 'Oracle_mean' and protection!='No':
204 | name = '_'.join([method, protection])
205 | elif protection == 'No' and mask == 'No':
206 | name = '_'.join([method, basemodel])
207 | elif method in ['QR', 'QR_TrainCal', 'CQR_MDA_Nested', 'CQR_MDA_Exact'] and mask == 'No':
208 | name = '_'.join([method, basemodel])
209 | elif method in ['QR', 'QR_TrainCal', 'CQR_MDA_Nested', 'CQR_MDA_Exact'] and mask == 'Yes':
210 | name = '_'.join([method, basemodel, 'Mask'])
211 | elif protection == 'No':
212 | name = '_'.join([method, basemodel, 'Mask'])
213 | elif mask == 'No':
214 | name = '_'.join([method, basemodel, protection])
215 | else:
216 | name = '_'.join([method, basemodel, 'Mask', protection])
217 | return name
218 |
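# Illustrative usage sketch (not part of the original module): composing file and
# method names, and a write/load round trip. Sizes and names below are assumptions
# mirroring the notebooks.
if __name__ == "__main__":
    params_test = {'test_size': [100]}
    name = get_name_data(1000, 500, params_test, dataset='bio',
                         params_missing={'prob_missing': 0.2}, seed=0)
    print(name)   # bio_seed_0_train_1000_cal_500_test_100_prob_0.2
    print(get_name_method('CQR', basemodel='NNet', mask='Yes'))   # CQR_NNet_Mask
    write_file('./results', name, 'xz', {'demo': np.arange(3)})
    print(load_file('./results', name, 'xz')['demo'])             # [0 1 2]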
--------------------------------------------------------------------------------
/generation.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import pandas as pd
4 | import copy
5 | import utils
6 | from tqdm.autonotebook import tqdm
7 |
8 | def generate_data(n, dim=3, params_reg={'regression':'Linear'}, params_noise={'noise':'Gaussian'}, seed=1):
9 | """
10 | Parameters
11 | ----------
12 | n : sample size to generate
13 | dim : dimension of the covariates (i.e. X lies in R^dim)
14 |     params_reg : parameters for the regression part; the 'regression' entry
15 |          must be 'Linear', with optional entries 'mean', 'phi' and 'beta'
16 |     params_noise : parameters for the noise; the 'noise' entry must be
17 |          'Gaussian', with optional entries 'mean' and 'scale'
18 |     seed : random seed for reproducibility
19 | 
20 |     Returns
21 |     -------
22 |     data : dictionary with entries
23 |          'X' : covariates values, array of size n x dim
24 |          'Y' : response values, array of size n
25 | """
26 |
27 | random.seed(seed)
28 | np.random.seed(seed)
29 |
30 | regression = params_reg['regression']
31 | assert regression in ['Linear'], 'regression must be Linear.'
32 |
33 | noise = params_noise['noise']
34 |
35 | d = dim
36 |
37 | if 'mean' in params_reg:
38 | mean = params_reg['mean']
39 | else:
40 | mean = 1
41 | if 'phi' in params_reg:
42 | phi = params_reg['phi']
43 | else:
44 | phi = 0.8
45 | mean = np.full(d, mean)
46 | cov = np.full((d,d),phi)+(1-phi)*np.eye(d)
47 | X = np.random.multivariate_normal(mean, cov, size=n)
48 | if 'beta' not in params_reg or params_reg['beta'] is None:
49 | beta = np.full(d,1)
50 | else:
51 | beta = params_reg['beta']
52 | Y_reg = X.dot(beta)
53 |
54 | assert noise in ['Gaussian'], 'noise must be Gaussian.'
55 | if noise == 'Gaussian':
56 | if 'mean' in params_noise:
57 | mean = params_noise['mean']
58 | else:
59 | mean = 0
60 | if 'scale' in params_noise:
61 | scale = params_noise['scale']
62 | else:
63 | scale = 1
64 | eps = np.random.normal(loc=mean,scale=scale,size=(n))
65 |
66 | Y = Y_reg + eps
67 |
68 | data = {'X': X, 'Y': Y}
69 |
70 | return data
71 |
72 | def generate_split(train_size, cal_size, params_test, data):
73 |
74 | X = data['X']
75 | X_train = X[:train_size,:]
76 | X_cal = X[train_size:(train_size+cal_size),:]
77 |
78 | Y = data['Y']
79 | Y_train = Y[:train_size]
80 | Y_cal = Y[train_size:(train_size+cal_size)]
81 |
82 | test_size = params_test['test_size']
83 |
84 | mechanisms_test = params_test['mechanisms_test']
85 |
86 | #if test_size is list:
87 | X_test = dict.fromkeys(test_size)
88 | Y_test = dict.fromkeys(test_size)
89 | for n_test in test_size:
90 | if (train_size+cal_size+n_test) <= X.shape[0]:
91 | X_test[n_test] = X[(train_size+cal_size):(train_size+cal_size+n_test),:]
92 | Y_test[n_test] = Y[(train_size+cal_size):(train_size+cal_size+n_test)]
93 | else:
94 | if 'iid' in mechanisms_test:
95 | assert params_test['iid']['test_size'] != n_test
96 | for extreme in ['worst_pattern', 'best_pattern']:
97 | if extreme in mechanisms_test:
98 | assert params_test[extreme]['test_size'] != n_test
99 | for fixed in ['fixed_nb_sample_pattern','fixed_nb_sample_pattern_size']:
100 | if fixed in mechanisms_test and params_test[fixed]['test_size'] == n_test:
101 | assert (train_size + cal_size + params_test[fixed]['nb_sample_pattern']) <= X.shape[0]
102 |
103 | X_test_created = np.empty((n_test,X.shape[1]))
104 | Y_test_created = np.empty((n_test))
105 |
106 | X_to_shuffle = copy.deepcopy(X[(train_size+cal_size):,:])
107 | Y_to_shuffle = copy.deepcopy(Y[(train_size+cal_size):])
108 | n_shuffle = X_to_shuffle.shape[0]
109 | nb_exact_shuffle = n_test//n_shuffle
110 | nb_rest = n_test%n_shuffle
111 | for k in range(nb_exact_shuffle):
112 | ido = random.sample(range(n_shuffle), n_shuffle)
113 | X_test_created[(k * n_shuffle):((k+1) * n_shuffle), :] = X_to_shuffle[ido, :]
114 | Y_test_created[(k * n_shuffle):((k + 1) * n_shuffle)] = Y_to_shuffle[ido]
115 | ido = random.sample(range(n_shuffle), nb_rest)
116 | X_test_created[((k+1) * n_shuffle):, :] = X_to_shuffle[ido, :]
117 | Y_test_created[((k+1) * n_shuffle):] = Y_to_shuffle[ido]
118 | X_test[n_test] = X_test_created
119 | Y_test[n_test] = Y_test_created
120 |
121 | X_split = {'Train': X_train, 'Cal': X_cal, 'Test': X_test}
122 | Y_split = {'Train': Y_train, 'Cal': Y_cal, 'Test': Y_test}
123 |
124 | return X_split, Y_split
125 |
126 | def generate_MCAR(X, params_test, params_missing={}, seed=1):
127 |
128 |     """
129 |     Parameters
130 |     ----------
131 |     X : dictionary of data arrays (keys 'Train', 'Cal', 'Test') which will suffer missing values
132 |     params_test : dictionary of test settings, as returned by process_test
133 |     params_missing : may contain 'prob_missing' (probability of being missing, default 0.2) and 'var_missing' (binary vector of length dim, 1 if the variable can have missing values)
134 |     seed : random seed for reproducibility
135 | 
136 |     Returns
137 |     -------
138 |     X_mcar : dictionary of covariate arrays with missing entries set to nan
139 |     M_mcar : dictionary of mask arrays of the same shapes, True where the entry is missing
140 |     """
141 |
142 | random.seed(seed)
143 | np.random.seed(seed)
144 |
145 | d = X['Train'].shape[1]
146 |
147 | if 'prob_missing' in params_missing:
148 | prob_missing = params_missing['prob_missing']
149 | else:
150 | prob_missing = 0.2
151 | if 'var_missing' in params_missing:
152 | var_missing = params_missing['var_missing']
153 | else:
154 | var_missing = np.full(d, 1)
155 |
156 | nb_var_missing = np.sum(var_missing)
157 |
158 | train_size = X['Train'].shape[0]
159 | cal_size = X['Cal'].shape[0]
160 |
161 | M_mcar_train = np.full(X['Train'].shape, False)
162 | X_mcar_train = copy.deepcopy(X['Train'])
163 |
164 | M_mcar_cal = np.full(X['Cal'].shape, False)
165 | X_mcar_cal = copy.deepcopy(X['Cal'])
166 |
167 | M_mcar_train[:,np.where(np.array(var_missing) == 1)[0]] = (np.random.uniform(low=0,high=1,size=(train_size,nb_var_missing)) <= (prob_missing))
168 | X_mcar_train[M_mcar_train] = np.nan
169 | M_mcar_cal[:,np.where(np.array(var_missing) == 1)[0]] = (np.random.uniform(low=0,high=1,size=(cal_size,nb_var_missing)) <= (prob_missing))
170 | X_mcar_cal[M_mcar_cal] = np.nan
171 |
172 | mechanisms_test = params_test['mechanisms_test']
173 |
174 | M_mcar = {'Train': M_mcar_train, 'Cal': M_mcar_cal}
175 | M_mcar_test = dict.fromkeys(mechanisms_test)
176 |
177 | X_mcar = {'Train': X_mcar_train, 'Cal': X_mcar_cal}
178 | X_mcar_test = dict.fromkeys(mechanisms_test)
179 |
180 | if 'iid' in mechanisms_test:
181 | test_size = params_test['iid']['test_size']
182 | M_mcar_iid = np.full((test_size, d), False)
183 | M_mcar_iid[:,np.where(np.array(var_missing) == 1)[0]] = (np.random.uniform(low=0,high=1,size=(test_size,nb_var_missing)) <= (prob_missing))
184 | M_mcar_test['iid'] = M_mcar_iid
185 | X_mcar_iid = copy.deepcopy(X['Test'][test_size])
186 | X_mcar_iid[M_mcar_iid] = np.nan
187 | X_mcar_test['iid'] = X_mcar_iid
188 | for extreme in ['worst_pattern', 'best_pattern']:
189 | if extreme in mechanisms_test:
190 | test_size = params_test[extreme]['test_size']
191 | test_pattern = params_test[extreme]['pattern']
192 | M_mcar_extreme = np.full((test_size, d), False)
193 | M_mcar_extreme[:,np.where(np.array(test_pattern) == 1)[0]] = 1
194 | M_mcar_test[extreme] = M_mcar_extreme
195 | X_mcar_extreme = copy.deepcopy(X['Test'][test_size])
196 | X_mcar_extreme[M_mcar_extreme] = np.nan
197 | X_mcar_test[extreme] = X_mcar_extreme
198 | if 'fixed_nb_sample_pattern' in mechanisms_test:
199 | list_patterns = utils.create_patterns(d, var_missing)
200 | test_size = params_test['fixed_nb_sample_pattern']['test_size']
201 | nb_sample_pattern = params_test['fixed_nb_sample_pattern']['nb_sample_pattern']
202 | M_mcar_fixed_sample_pattern = np.full((test_size, d), False)
203 | X_mcar_fixed_sample_pattern = copy.deepcopy(X['Test'][test_size])
204 | for idp, pattern in enumerate(list_patterns):
205 | M_mcar_fixed_sample_pattern[(idp*nb_sample_pattern):((idp+1)*nb_sample_pattern),np.where(np.array(pattern) == 1)[0]] = 1
206 | X_mcar_fixed_sample_pattern[M_mcar_fixed_sample_pattern] = np.nan
207 | M_mcar_test['fixed_nb_sample_pattern'] = M_mcar_fixed_sample_pattern
208 | X_mcar_test['fixed_nb_sample_pattern'] = X_mcar_fixed_sample_pattern
209 | if 'fixed_nb_sample_pattern_size' in mechanisms_test:
210 | list_pattern_sizes = np.arange(np.sum(var_missing))
211 | test_size = params_test['fixed_nb_sample_pattern_size']['test_size']
212 | nb_sample_pattern_size = params_test['fixed_nb_sample_pattern_size']['nb_sample_pattern']
213 | M_mcar_fixed_sample_pattern_size = np.full((test_size, d), False)
214 | X_mcar_fixed_sample_pattern_size = copy.deepcopy(X['Test'][test_size])
215 |
216 | list_patterns = utils.create_patterns(d, var_missing)
217 | size_to_ids = dict.fromkeys(np.arange(0, d))
218 | for k in np.arange(0, d):
219 | size_to_ids[k] = []
220 | for pattern in list_patterns:
221 | key_pattern = utils.pattern_to_id(pattern)
222 | size_pattern = utils.pattern_to_size(pattern)
223 | size_to_ids[size_pattern] = np.append(size_to_ids[size_pattern], key_pattern)
224 |
225 | for idp, pattern_size in enumerate(list_pattern_sizes):
226 | keys = random.choices(size_to_ids[pattern_size], k=nb_sample_pattern_size)
227 | unique_keys, count_keys = np.unique(keys, return_counts=True)
228 | min_ind = idp * nb_sample_pattern_size
229 | for idps, key in enumerate(unique_keys):
230 | nb_sample_pattern = count_keys[idps]
231 | pattern = utils.bin_to_vec(bin(int(key)), d)
232 | M_mcar_fixed_sample_pattern_size[min_ind:(min_ind+nb_sample_pattern),np.where(np.array(pattern) == 1)[0]] = 1
233 | min_ind = min_ind + nb_sample_pattern
234 | X_mcar_fixed_sample_pattern_size[M_mcar_fixed_sample_pattern_size] = np.nan
235 | M_mcar_test['fixed_nb_sample_pattern_size'] = M_mcar_fixed_sample_pattern_size
236 | X_mcar_test['fixed_nb_sample_pattern_size'] = X_mcar_fixed_sample_pattern_size
237 |
238 | X_mcar['Test'] = X_mcar_test
239 | M_mcar['Test'] = M_mcar_test
240 |
241 | return X_mcar, M_mcar
242 |
243 | def process_test(params_test, d, params_missing={}):
244 |
245 | test_sizes = []
246 | mechanisms_test = []
247 |
248 | for mechanism in list(params_test.keys()):
249 | assert mechanism in ['iid', 'worst_pattern', 'best_pattern', 'test_pattern', 'fixed_nb_sample_pattern', 'fixed_nb_sample_pattern_size'], 'Test mechanism should be among iid, worst_pattern, best_pattern, test_pattern, fixed_nb_sample_pattern, fixed_nb_sample_pattern_size.'
250 | mechanisms_test = np.append(mechanisms_test, mechanism)
251 | if mechanism not in ['fixed_nb_sample_pattern', 'fixed_nb_sample_pattern_size']:
252 | assert 'test_size' in list(params_test[mechanism].keys()), 'test_size should be provided for each test mechanism.'
253 | test_sizes = np.append(test_sizes, int(params_test[mechanism]['test_size']))
254 | else:
255 | assert 'nb_sample_pattern' in list(params_test[mechanism].keys()), 'nb_sample_pattern should be provided for fixed_nb_sample_pattern mechanism.'
256 | nb_sample_pattern = params_test[mechanism]['nb_sample_pattern']
257 |
258 | if 'var_missing' in params_missing:
259 | var_missing = params_missing['var_missing']
260 | else:
261 | var_missing = np.full(d, 1)
262 |
263 | if mechanism == 'fixed_nb_sample_pattern':
264 |
265 | list_patterns = utils.create_patterns(d, var_missing)
266 | nb_pattern = len(list_patterns)
267 | test_size = nb_sample_pattern*nb_pattern
268 | test_sizes = np.append(test_sizes, int(test_size))
269 | params_test[mechanism]['test_size'] = test_size
270 |
271 | else:
272 |
273 | nb_pattern_size = np.sum(var_missing)
274 | test_size = nb_sample_pattern * nb_pattern_size
275 | test_sizes = np.append(test_sizes, int(test_size))
276 | params_test[mechanism]['test_size'] = test_size
277 |
278 | test_sizes = np.unique(test_sizes).astype(int)
279 |
280 | params_test['test_size'] = test_sizes
281 | params_test['mechanisms_test'] = mechanisms_test
282 |
283 | return params_test
284 |
285 | def generate_multiple_data(train_size, cal_size, params_test, n_rep, dim=3,
286 | params_reg={'regression':'Linear'}, params_noise={'noise':'Gaussian'},
287 | params_missing={'mechanism':'MCAR'}):
288 |     """
289 |     Parameters
290 |     ----------
291 |     train_size, cal_size : number of training and calibration samples per repetition
292 |     params_test : dictionary of test settings, as returned by process_test
293 |     n_rep : number of repetitions; repetition k is generated and masked with seed k
294 |     dim : dimension of the covariates (i.e. X lies in R^dim)
295 |     params_reg : parameters for the regression part ('regression' must be 'Linear')
296 |     params_noise : parameters for the noise ('noise' must be 'Gaussian')
297 |     params_missing : parameters of the missingness mechanism (e.g. 'prob_missing')
298 | 
299 |     Returns
300 |     -------
301 |     X, X_missing, M, Y : dictionaries keyed by 'Train', 'Cal' and 'Test', each entry an
302 |         array of shape n_rep x n_set x dim (n_rep x n_set for Y); 'Test' entries are further keyed by test mechanism
303 |     params_missing : the missingness parameters, returned unchanged
304 |     """
305 |
306 | sets = ['Train', 'Cal', 'Test']
307 | mechanisms_test = params_test['mechanisms_test']
308 | max_test_size = np.max(params_test['test_size'])
309 |
310 | n = train_size + cal_size + max_test_size
311 |
312 | X = dict.fromkeys(sets)
313 | X_missing = dict.fromkeys(sets)
314 | M = dict.fromkeys(sets)
315 | Y = dict.fromkeys(sets)
316 |
317 | for k in tqdm(range(n_rep)):
318 | data = generate_data(n, dim=dim, params_reg=params_reg, params_noise=params_noise, seed=k)
319 | Xk, Yk = generate_split(train_size, cal_size, params_test, data)
320 | Xk_missing, Mk_missing = generate_MCAR(Xk, params_test, params_missing, seed=k)
321 |
322 | for set in ['Train', 'Cal']:
323 | if k == 0:
324 | X[set] = np.expand_dims(Xk[set], axis=0)
325 | X_missing[set] = np.expand_dims(Xk_missing[set], axis=0)
326 | M[set] = np.expand_dims(Mk_missing[set], axis=0)
327 | Y[set] = Yk[set]
328 | else:
329 | X[set] = np.vstack((X[set],np.expand_dims(Xk[set], axis=0)))
330 | X_missing[set] = np.vstack((X_missing[set],np.expand_dims(Xk_missing[set], axis=0)))
331 | M[set] = np.vstack((M[set],np.expand_dims(Mk_missing[set], axis=0)))
332 | Y[set] = np.vstack((Y[set],np.array(Yk[set])))
333 |
334 | set = 'Test'
335 | if k == 0:
336 | X[set] = dict.fromkeys(mechanisms_test)
337 | X_missing[set] = dict.fromkeys(mechanisms_test)
338 | M[set] = dict.fromkeys(mechanisms_test)
339 | Y[set] = dict.fromkeys(mechanisms_test)
340 | for key in mechanisms_test:
341 | n_test = params_test[key]['test_size']
342 | X[set][key] = np.expand_dims(Xk[set][n_test], axis=0)
343 | Y[set][key] = Yk[set][n_test]
344 | X_missing[set][key] = np.expand_dims(Xk_missing[set][key], axis=0)
345 | M[set][key] = np.expand_dims(Mk_missing[set][key], axis=0)
346 |
347 | else:
348 | for key in mechanisms_test:
349 | n_test = params_test[key]['test_size']
350 | X[set][key] = np.vstack((X[set][key],np.expand_dims(Xk[set][n_test], axis=0)))
351 | Y[set][key] = np.vstack((Y[set][key], np.array(Yk[set][n_test])))
352 | X_missing[set][key] = np.vstack((X_missing[set][key], np.expand_dims(Xk_missing[set][key], axis=0)))
353 | M[set][key] = np.vstack((M[set][key], np.expand_dims(Mk_missing[set][key], axis=0)))
354 |
355 |
356 | return X, X_missing, M, Y, params_missing
357 |
358 | # Real data
359 |
360 | def real_generate_multiple_split(dataframe, target, prob_test=0.2, seed_max=1):
361 |
362 | data_features = dataframe.loc[:, dataframe.columns != target]
363 | response = dataframe.loc[:, target]
364 | n = dataframe.shape[0]
365 | d = data_features.shape[1]
366 |
367 | test_size = int(n*prob_test)
368 | train_cal_size = int(n-test_size)
369 | train_size = int(2*(train_cal_size//3) + train_cal_size%3)
370 | cal_size = int(train_cal_size//3)
371 |
372 | sizes = {'Train': train_size, 'Cal': cal_size, 'Test':test_size}
373 |
374 | mask_original = data_features.isnull().replace({True: 1, False: 0})
375 |
376 | vars_categ = data_features.select_dtypes("object").columns
377 |
378 | data_features_categ = data_features[vars_categ]
379 |
380 | vars_categ = data_features.select_dtypes("object").columns
381 | vars_quant = set(data_features.columns).difference(set(vars_categ))
382 | mask_features = data_features[vars_quant].isnull().replace({True: 1,False: 0})
383 |
384 | data_features_categ_na = data_features_categ.fillna("-2")
385 | data_features_categ_encoded = pd.DataFrame(index=data_features_categ_na.index)
386 | for var in vars_categ:
387 | if np.sum(data_features_categ_na[var]=="1") > 0:
388 | data_features_categ_encoded[str(var)+"_1"] = data_features_categ_na[var]=="1"
389 | if np.sum(data_features_categ_na[var]=="0") > 0:
390 | data_features_categ_encoded[str(var)+"_0"] = data_features_categ_na[var]=="0"
391 | if np.sum(data_features_categ_na[var]=="-1") > 0:
392 | data_features_categ_encoded[str(var)+"_-1"] = data_features_categ_na[var]=="-1"
393 | if np.sum(data_features_categ_na[var]=="-2") > 0:
394 | data_features_categ_encoded[str(var)+"_-2"] = data_features_categ_na[var]=="-2"
395 | data_features_categ_encoded = data_features_categ_encoded.replace({True:1, False:0})
396 | data_features = data_features[vars_quant].merge(data_features_categ_encoded, left_index=True, right_index=True)
397 |
398 | mask = data_features.isnull().replace({True: 1, False: 0})
399 |
400 | col_features = list(data_features.columns)
401 |
402 | d_quant = mask_features.shape[1]
403 | d_aug = data_features.shape[1]
404 |
405 | X_missing = np.empty((seed_max,n,d_aug))
406 | M_original = np.empty((seed_max,n,d))
407 | M = np.empty((seed_max, n, d_aug))
408 | M_quant = np.empty((seed_max,n,d_quant))
409 | Y = np.empty((seed_max,n))
410 |
411 | for k in range(seed_max):
412 |
413 | random.seed(k)
414 | np.random.seed(k)
415 |
416 | ido = random.sample(range(n), n)
417 |
418 | X_missing[k,:,:] = data_features.iloc[ido,:]
419 | M_original[k,:,:] = mask_original.iloc[ido,:]
420 | M[k, :, :] = mask.iloc[ido, :]
421 | M_quant[k,:,:] = mask_features.iloc[ido,:]
422 | Y[k,:] = response[ido]
423 |
424 | data = {'X_missing':X_missing, 'M_original':M_original,'M':M, 'M_quant':M_quant, 'Y':Y}
425 |
426 | keys = ['X_missing', 'M_original', 'M', 'M_quant']
427 | for key in keys:
428 | arr = data[key]
429 | arr_train = arr[:,:train_size,:]
430 | arr_cal = arr[:,train_size:(train_size+cal_size),:]
431 | arr_test = arr[:,(n-test_size):n,:]
432 | globals()[key+'_split'] = {'Train': arr_train, 'Cal': arr_cal, 'Test': {'iid': arr_test}}
433 |
434 | Y = data['Y']
435 | Y_train = Y[:,:train_size]
436 | Y_cal = Y[:,train_size:(train_size+cal_size)]
437 | Y_test = Y[:,(n-test_size):n]
438 | Y_split = {'Train': Y_train, 'Cal': Y_cal, 'Test':{'iid': Y_test}}
439 |
440 | return X_missing_split, M_original_split, M_split, M_quant_split, Y_split, col_features, sizes
441 |
442 | def real_generate_multiple_split_holdout(dataframe, target, prob_test=0.2):
443 |
444 | n = dataframe.shape[0]
445 | ido = random.sample(range(n), n)
446 | dataframe = dataframe.iloc[ido,:]
447 | dataframe = dataframe.reset_index(drop=True)
448 |
449 | data_features = dataframe.loc[:, dataframe.columns != target]
450 | response = dataframe.loc[:, target]
451 |
452 | d = data_features.shape[1]
453 |
454 | test_size = int(n*prob_test)
455 | train_cal_size = int(n-test_size)
456 | train_size = int(2*(train_cal_size//3) + train_cal_size%3)
457 | cal_size = int(train_cal_size//3)
458 |
459 | sizes = {'Train': train_size, 'Cal': cal_size, 'Test':test_size}
460 |
461 | mask_original = data_features.isnull().replace({True: 1, False: 0})
462 |
463 | vars_categ = data_features.select_dtypes("object").columns
464 |
465 | data_features_categ = data_features[vars_categ]
466 |
467 | vars_categ = data_features.select_dtypes("object").columns
468 | vars_quant = set(data_features.columns).difference(set(vars_categ))
469 | mask_features = data_features[vars_quant].isnull().replace({True: 1,False: 0})
470 |
471 | data_features_categ_na = data_features_categ.fillna("-2")
472 | data_features_categ_encoded = pd.DataFrame(index=data_features_categ_na.index)
473 | for var in vars_categ:
474 | if np.sum(data_features_categ_na[var]=="1") > 0:
475 | data_features_categ_encoded[str(var)+"_1"] = data_features_categ_na[var]=="1"
476 | if np.sum(data_features_categ_na[var]=="0") > 0:
477 | data_features_categ_encoded[str(var)+"_0"] = data_features_categ_na[var]=="0"
478 | if np.sum(data_features_categ_na[var]=="-1") > 0:
479 | data_features_categ_encoded[str(var)+"_-1"] = data_features_categ_na[var]=="-1"
480 | if np.sum(data_features_categ_na[var]=="-2") > 0:
481 | data_features_categ_encoded[str(var)+"_-2"] = data_features_categ_na[var]=="-2"
482 | data_features_categ_encoded = data_features_categ_encoded.replace({True:1, False:0})
483 | data_features = data_features[vars_quant].merge(data_features_categ_encoded, left_index=True, right_index=True)
484 |
485 | mask = data_features.isnull().replace({True: 1, False: 0})
486 |
487 | col_features = list(data_features.columns)
488 |
489 | d_quant = mask_features.shape[1]
490 | d_aug = data_features.shape[1]
491 |
492 | nb_split = n//(test_size)
493 |
494 | X_missing_train = np.empty((nb_split, sizes['Train'], d_aug))
495 | M_train = np.empty((nb_split, sizes['Train'], d_aug))
496 | M_original_train = np.empty((nb_split, sizes['Train'], d))
497 | M_quant_train = np.empty((nb_split, sizes['Train'], d_quant))
498 | Y_train = np.empty((nb_split, sizes['Train']))
499 |
500 | X_missing_cal = np.empty((nb_split, sizes['Cal'], d_aug))
501 | M_cal = np.empty((nb_split, sizes['Cal'], d_aug))
502 | M_original_cal = np.empty((nb_split, sizes['Cal'], d))
503 | M_quant_cal = np.empty((nb_split, sizes['Cal'], d_quant))
504 | Y_cal = np.empty((nb_split, sizes['Cal']))
505 |
506 | X_missing_test = np.empty((nb_split, sizes['Test'], d_aug))
507 | M_test = np.empty((nb_split, sizes['Test'], d_aug))
508 | M_original_test = np.empty((nb_split, sizes['Test'], d))
509 | M_quant_test = np.empty((nb_split, sizes['Test'], d_quant))
510 | Y_test = np.empty((nb_split, sizes['Test']))
511 |
512 | idx = np.array(list(np.arange(n)))
513 |
514 | for k in range(nb_split):
515 |
516 | id_test = idx[(k*sizes['Test']):((k+1)*sizes['Test'])]
517 | idbool = np.full(len(idx), True, dtype=bool)
518 | idbool[id_test] = False
519 | test = list(idx[~idbool])
520 | traincal = list(idx[idbool])
521 | train = traincal[:train_size]
522 | cal = traincal[train_size:]
523 |
524 | X_missing_train[k, :, :] = data_features.iloc[train, :]
525 | X_missing_cal[k, :, :] = data_features.iloc[cal, :]
526 | X_missing_test[k,:,:] = data_features.iloc[test, :]
527 | M_train[k, :, :] = mask.iloc[train, :]
528 | M_cal[k, :, :] = mask.iloc[cal, :]
529 | M_test[k, :, :] = mask.iloc[test, :]
530 | M_original_train[k, :, :] = mask_original.iloc[train, :]
531 | M_original_cal[k, :, :] = mask_original.iloc[cal, :]
532 | M_original_test[k, :, :] = mask_original.iloc[test, :]
533 | M_quant_train[k, :, :] = mask_features.iloc[train, :]
534 | M_quant_cal[k, :, :] = mask_features.iloc[cal, :]
535 | M_quant_test[k, :, :] = mask_features.iloc[test, :]
536 | Y_train[k, :] = response[train]
537 | Y_cal[k, :] = response[cal]
538 | Y_test[k, :] = response[test]
539 |
540 | X_missing = {'Train': X_missing_train, 'Cal': X_missing_cal, 'Test': {'iid': X_missing_test}}
541 | M = {'Train': M_train, 'Cal': M_cal, 'Test': {'iid': M_test}}
542 | M_original = {'Train': M_original_train, 'Cal': M_original_cal, 'Test': {'iid': M_original_test}}
543 | M_quant = {'Train': M_quant_train, 'Cal': M_quant_cal, 'Test': {'iid': M_quant_test}}
544 | Y = {'Train': Y_train, 'Cal': Y_cal, 'Test': {'iid': Y_test}}
545 |
546 | return X_missing, M_original, M, M_quant, Y, col_features, sizes
547 |
548 |
549 | def generate_multiple_real_data_MCAR(dataframe, target, train_size, cal_size, params_test, params_missing={}, seed_max=1):
550 | """
551 | Parameters
552 | ----------
553 |     dataframe, target : real data set and name of its response column
554 |     train_size, cal_size : number of training and calibration samples per repetition
555 |     seed_max : random seeds for reproducibility, will generate seed_max data-sets, of seeds 0 to seed_max-1
556 | 
557 |     Returns
558 |     -------
559 |     X, X_missing, M, Y : dictionaries keyed by 'Train', 'Cal' and 'Test' ('Test' entries keyed by mechanism)
560 | """
561 |
562 | data_features = dataframe.loc[:, dataframe.columns != target]
563 | response = dataframe.loc[:, target]
564 |
565 | sets = ['Train', 'Cal', 'Test']
566 | mechanisms_test = params_test['mechanisms_test']
567 | max_test_size = np.max(params_test['test_size'])
568 |
569 | n = dataframe.shape[0]
570 |
571 | X = dict.fromkeys(sets)
572 | X_missing = dict.fromkeys(sets)
573 | M = dict.fromkeys(sets)
574 | Y = dict.fromkeys(sets)
575 |
576 | for k in range(seed_max):
577 |
578 | random.seed(k)
579 | np.random.seed(k)
580 |
581 | ido = random.sample(range(n), n)
582 |
583 | Xk = np.array(data_features.iloc[ido,:])
584 | Yk = np.array(response[ido])
585 |
586 | data = {'X': Xk, 'Y':Yk}
587 |
588 | Xk, Yk = generate_split(train_size, cal_size, params_test, data)
589 | Xk_missing, Mk_missing = generate_MCAR(Xk, params_test, params_missing, seed=k)
590 |
591 | for set in ['Train', 'Cal']:
592 | if k == 0:
593 | X[set] = np.expand_dims(Xk[set], axis=0)
594 | X_missing[set] = np.expand_dims(Xk_missing[set], axis=0)
595 | M[set] = np.expand_dims(Mk_missing[set], axis=0)
596 | Y[set] = Yk[set]
597 | else:
598 | X[set] = np.vstack((X[set], np.expand_dims(Xk[set], axis=0)))
599 | X_missing[set] = np.vstack((X_missing[set], np.expand_dims(Xk_missing[set], axis=0)))
600 | M[set] = np.vstack((M[set], np.expand_dims(Mk_missing[set], axis=0)))
601 | Y[set] = np.vstack((Y[set], np.array(Yk[set])))
602 |
603 | set = 'Test'
604 | if k == 0:
605 | X[set] = dict.fromkeys(mechanisms_test)
606 | X_missing[set] = dict.fromkeys(mechanisms_test)
607 | M[set] = dict.fromkeys(mechanisms_test)
608 | Y[set] = dict.fromkeys(mechanisms_test)
609 | for key in mechanisms_test:
610 | n_test = params_test[key]['test_size']
611 | X[set][key] = np.expand_dims(Xk[set][n_test], axis=0)
612 | X_missing[set][key] = np.expand_dims(Xk_missing[set][key], axis=0)
613 | M[set][key] = np.expand_dims(Mk_missing[set][key], axis=0)
614 | Y[set][key] = Yk[set][n_test]
615 | else:
616 | for key in mechanisms_test:
617 | n_test = params_test[key]['test_size']
618 | X[set][key] = np.vstack((X[set][key], np.expand_dims(Xk[set][n_test], axis=0)))
619 | X_missing[set][key] = np.vstack((X_missing[set][key], np.expand_dims(Xk_missing[set][key], axis=0)))
620 | M[set][key] = np.vstack((M[set][key], np.expand_dims(Mk_missing[set][key], axis=0)))
621 | Y[set][key] = np.vstack((Y[set][key], np.array(Yk[set][n_test])))
622 |
623 | return X, X_missing, M, Y
624 |
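# Illustrative usage sketch (not part of the original module): a small synthetic
# run with an iid test set, mirroring how the notebooks drive this module. The
# sizes below are assumptions chosen to keep the example fast.
if __name__ == "__main__":
    params_test = process_test({'iid': {'test_size': 50}}, d=3)
    X, X_missing, M, Y, _ = generate_multiple_data(
        train_size=100, cal_size=50, params_test=params_test, n_rep=2, dim=3,
        params_reg={'regression': 'Linear', 'beta': None},
        params_noise={'noise': 'Gaussian'},
        params_missing={'prob_missing': 0.2})
    # Arrays are indexed by repetition first; test sets are keyed by mechanism.
    print(X_missing['Train'].shape)           # (2, 100, 3)
    print(M['Test']['iid'].shape)             # (2, 50, 3)
    print(np.isnan(X_missing['Cal']).mean())  # close to prob_missing = 0.2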
--------------------------------------------------------------------------------
/imputation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.experimental import enable_iterative_imputer # noqa
3 | from sklearn.impute import SimpleImputer, IterativeImputer
4 |
5 | def impute(data, imputation):
6 |
7 | assert imputation in ['mean', 'constant', 'MICE', 'iterative_ridge'], 'imputation must be constant, mean, iterative_ridge or MICE.'
8 |
9 | X_missing = data['X_missing']
10 |
11 | if imputation in ['mean', 'constant']:
12 | imputer = SimpleImputer(missing_values=np.nan, strategy=imputation)
13 | elif imputation == 'MICE':
14 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=True)
15 | elif imputation == 'iterative_ridge':
16 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=False)
17 |
18 | n_rep = X_missing['Train'].shape[0]
19 |
20 | X_train_imp = np.empty(X_missing['Train'].shape)
21 | X_cal_imp = np.empty(X_missing['Cal'].shape)
22 | if type(X_missing['Test']) is dict:
23 | multiple_test = True
24 | keys_test = list(X_missing['Test'].keys())
25 | X_test_imp = dict.fromkeys(keys_test)
26 | for key in keys_test:
27 | X_test_imp[key] = np.empty(X_missing['Test'][key].shape)
28 | else:
29 | multiple_test = False
30 | X_test_imp = np.empty(X_missing['Test'].shape)
31 |
32 | for k in range(n_rep):
33 |
34 | imputer.fit(X_missing['Train'][k,:,:])
35 |
36 | X_train_imp[k,:,:] = imputer.transform(X_missing['Train'][k,:,:])
37 | X_cal_imp[k,:,:] = imputer.transform(X_missing['Cal'][k,:,:])
38 | if multiple_test:
39 | for key in keys_test:
40 | X_test_imp[key][k,:,:] = imputer.transform(X_missing['Test'][key][k,:,:])
41 | else:
42 | X_test_imp[k,:,:] = imputer.transform(X_missing['Test'][k,:,:])
43 |
44 | X_imputed = {'Train': X_train_imp, 'Cal': X_cal_imp, 'Test': X_test_imp}
45 |
46 | return X_imputed
47 |
48 | def impute_imputer(X, imputation):
49 |
50 | assert imputation in ['mean', 'constant', 'MICE', 'iterative_ridge'], 'imputation must be constant, mean, iterative_ridge or MICE.'
51 |
52 | if imputation in ['mean', 'constant']:
53 | imputer = SimpleImputer(missing_values=np.nan, strategy=imputation)
54 | elif imputation == 'MICE':
55 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=True)
56 | elif imputation == 'iterative_ridge':
57 | imputer = IterativeImputer(missing_values=np.nan, sample_posterior=False)
58 |
59 | imputer.fit(X)
60 |
61 | return imputer
62 |
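# Illustrative usage sketch (not part of the original module): imputing a dictionary
# of splits shaped like the output of generation.generate_multiple_data (n_rep x n x d
# arrays, with the test sets in a nested dictionary). All shapes here are assumptions.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    def with_nans(shape, prob=0.2):
        X = rng.normal(size=shape)
        X[rng.uniform(size=shape) < prob] = np.nan
        return X
    data = {'X_missing': {'Train': with_nans((2, 100, 3)),
                          'Cal': with_nans((2, 50, 3)),
                          'Test': {'iid': with_nans((2, 50, 3))}}}
    X_imputed = impute(data, 'iterative_ridge')
    print(X_imputed['Train'].shape)                  # (2, 100, 3)
    print(np.isnan(X_imputed['Test']['iid']).any())  # False: every value was imputed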
--------------------------------------------------------------------------------
/plots/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/plots/.DS_Store
--------------------------------------------------------------------------------
/prediction.py:
--------------------------------------------------------------------------------
1 | import files
2 | import utils
3 | import imputation as imp
4 | from tqdm.autonotebook import tqdm
5 | import numpy as np
6 | import warnings; warnings.filterwarnings('ignore')  # np.warnings is not available in recent NumPy releases
7 |
8 | from scipy.stats import norm
9 | import functools
10 |
11 | from sklearn.ensemble import RandomForestRegressor
12 | from sklearn.ensemble import GradientBoostingRegressor
13 |
14 | from sklearn.linear_model import LinearRegression
15 |
16 | from sklearn.linear_model import QuantileRegressor
17 |
18 | import torch
19 | import torch.nn as nn
20 | import torch.optim as optim
21 | from torch.utils.data import DataLoader
22 | from torch.utils.data import Dataset
23 | from sklearn.preprocessing import StandardScaler
24 | from sklearn.model_selection import train_test_split  # needed by QNet.fit when calibrate > 0
25 | import six
26 | import sys
27 | sys.modules['sklearn.externals.six'] = six
28 |
29 | import quantile_forest as qf
30 |
31 |
32 | import copy
33 |
34 | ### The following lines of code are copied from CHR (Sesia and Romano, 2021) public GitHub.
35 | ### https://github.com/msesia/chr
36 |
37 | class RegressionDataset(Dataset):
38 |
39 | def __init__(self, X_data, y_data):
40 | self.X_data = torch.from_numpy(X_data).float()
41 | self.y_data = torch.from_numpy(y_data).float()
42 |
43 | def __getitem__(self, index):
44 | return self.X_data[index], self.y_data[index]
45 |
46 | def __len__ (self):
47 | return len(self.X_data)
48 |
49 | class NNet(nn.Module):
50 | """ Conditional quantile estimator, formulated as neural net
51 | """
52 | def __init__(self, quantiles, num_features, num_hidden=64, dropout=0.1, no_crossing=False):
53 | """ Initialization
54 | Parameters
55 | ----------
56 | quantiles : numpy array of quantile levels (q), each in the range (0,1)
57 | num_features : integer, input signal dimension (p)
58 | num_hidden : integer, hidden layer dimension
59 | dropout : float, dropout rate
60 | no_crossing: boolean, whether to explicitly prevent quantile crossovers
61 | """
62 | super(NNet, self).__init__()
63 |
64 | self.no_crossing = no_crossing
65 |
66 | self.num_quantiles = len(quantiles)
67 |
68 | # Construct base network
69 | self.base_model = nn.Sequential(
70 | nn.Linear(num_features, num_hidden),
71 | nn.ReLU(),
72 | nn.Dropout(dropout),
73 | nn.Linear(num_hidden, num_hidden),
74 | nn.ReLU(),
75 | nn.Dropout(dropout),
76 | nn.Linear(num_hidden, self.num_quantiles),
77 | )
78 | self.init_weights()
79 |
80 | def init_weights(self):
81 | """ Initialize the network parameters
82 | """
83 | for m in self.base_model:
84 | if isinstance(m, nn.Linear):
85 | nn.init.orthogonal_(m.weight)
86 | nn.init.constant_(m.bias, 0)
87 |
88 | def forward(self, x):
89 | """ Run forward pass
90 | """
91 | x = self.base_model(x)
92 | if self.no_crossing:
93 | y,_ = torch.sort(x,1)
94 | else:
95 | y = x
96 | return y
97 |
98 | class AllQuantileLoss(nn.Module):
99 | """ Pinball loss function
100 | """
101 | def __init__(self, quantiles):
102 | """ Initialize
103 | Parameters
104 | ----------
105 | quantiles : pytorch vector of quantile levels, each in the range (0,1)
106 | """
107 | super().__init__()
108 | self.quantiles = quantiles
109 |
110 | def forward(self, preds, target):
111 | """ Compute the pinball loss
112 | Parameters
113 | ----------
114 | preds : pytorch tensor of estimated labels (n)
115 | target : pytorch tensor of true labels (n)
116 | Returns
117 | -------
118 | loss : cost function value
119 | """
120 | #assert not target.requires_grad
121 | #assert preds.size(0) == target.size(0)
122 |
123 | errors = target.unsqueeze(1)-preds
124 | Q = self.quantiles.unsqueeze(0)
125 | loss = torch.max((Q-1.0)*errors, Q*errors).mean()
126 |
127 | return loss
128 |
129 |
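# Worked example of the pinball loss above (illustrative; the numbers are assumed):
# with quantile levels (0.1, 0.9), one target y = 1.0 and predictions
# (q_0.1, q_0.9) = (0.5, 2.0), the errors are (0.5, -1.0) and the two terms are
#   max((0.1 - 1) * 0.5,    0.1 * 0.5)    = 0.05
#   max((0.9 - 1) * (-1.0), 0.9 * (-1.0)) = 0.10
# so AllQuantileLoss returns their mean, (0.05 + 0.10) / 2 = 0.075.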
130 | class QNet:
131 | """ Fit a neural network (conditional quantile) to training data
132 | """
133 | def __init__(self, quantiles, num_features, no_crossing=False, dropout=0.2, learning_rate=0.001,
134 | num_epochs=100, batch_size=16, num_hidden=64, random_state=0, calibrate=0, verbose=False):
135 | """ Initialization
136 | Parameters
137 | ----------
138 | quantiles : numpy array of quantile levels (q), each in the range (0,1)
139 | num_features : integer, input signal dimension (p)
140 | learning_rate : learning rate
141 | random_state : integer, seed used in CV when splitting to train-test
142 | """
143 |
144 | # Detect whether CUDA is available
145 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
146 |
147 | # Store input (sort the quantiles)
148 | quantiles = np.sort(quantiles)
149 | self.quantiles = torch.from_numpy(quantiles).float().to(self.device)
150 | self.num_features = num_features
151 |
152 | # Define NNet model
153 | self.model = NNet(self.quantiles, self.num_features, num_hidden=num_hidden, dropout=dropout, no_crossing=no_crossing)
154 | self.model.to(self.device)
155 |
156 | # Initialize optimizer
157 | self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
158 |
159 | # Initialize loss function
160 | self.loss_func = AllQuantileLoss(self.quantiles)
161 |
162 | # Store variables
163 | self.num_epochs = num_epochs
164 | self.batch_size = batch_size
165 | self.random_state = random_state
166 | self.calibrate = int(calibrate)
167 |
168 | # Initialize training logs
169 | self.loss_history = []
170 | self.test_loss_history = []
171 | self.full_loss_history = []
172 |
173 | # Validation
174 | self.val_period = 10
175 |
176 | self.verbose = verbose
177 |
178 | def fit(self, X, Y, return_loss=False):
179 |
180 | self.scaler = StandardScaler()
181 | self.scaler.fit(X)
182 | X = self.scaler.transform(X)
183 |
184 | Y = Y.flatten().astype(np.float32)
185 | X = X.astype(np.float32)
186 |
187 | dataset = RegressionDataset(X, Y)
188 | num_epochs = self.num_epochs
189 | if self.calibrate>0:
190 | # Train with 80% of samples
191 | n_valid = int(np.round(0.2*X.shape[0]))
192 | loss_stats = []
193 | for b in range(self.calibrate):
194 | X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=n_valid, random_state=self.random_state+b)
195 | train_dataset = RegressionDataset(X_train, Y_train)
196 | val_dataset = RegressionDataset(X_valid, Y_valid)
197 | loss_stats_tmp = self._fit(train_dataset, num_epochs, val_dataset=val_dataset)
198 | loss_stats.append([loss_stats_tmp['val']])
199 | # Reset model
200 | self.model.init_weights()
201 |
202 |             loss_stats = np.concatenate(loss_stats, 0).T  # plain ndarray of shape (n_checkpoints, n_splits)
203 |
204 | loss_stats = np.median(loss_stats,1).flatten()
205 | # Find optimal number of epochs
206 | num_epochs = self.val_period*(np.argmin(loss_stats)+1)
207 | loss_stats_cal = loss_stats
208 |
209 | # Train with all samples
210 | loss_stats = self._fit(dataset, num_epochs)
211 | if self.calibrate:
212 | loss_stats = loss_stats_cal
213 |
214 | #if return_loss:
215 | return self
216 |
217 | def _fit(self, train_dataset, num_epochs, val_dataset=None):
218 | batch_size = self.batch_size
219 |
220 | # Initialize data loaders
221 | train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size)
222 | if val_dataset is not None:
223 | val_loader = DataLoader(dataset=val_dataset, batch_size=1)
224 |
225 | num_samples, num_features = train_dataset.X_data.shape
226 | print("Training with {} samples and {} features.". \
227 | format(num_samples, num_features))
228 |
229 | loss_stats = {'train': [], "val": []}
230 |
231 | X_train_batch = train_dataset.X_data.to(self.device)
232 | y_train_batch = train_dataset.y_data.to(self.device)
233 |
234 | for e in tqdm(range(1, num_epochs+1)):
235 |
236 | # TRAINING
237 | train_epoch_loss = 0
238 | self.model.train()
239 |
240 | if batch_size<500:
241 |
242 | for X_train_batch, y_train_batch in train_loader:
243 | X_train_batch, y_train_batch = X_train_batch.to(self.device), y_train_batch.to(self.device)
244 | self.optimizer.zero_grad()
245 |
246 | y_train_pred = self.model(X_train_batch).to(self.device)
247 |
248 | train_loss = self.loss_func(y_train_pred, y_train_batch)
249 |
250 | train_loss.backward()
251 | self.optimizer.step()
252 |
253 | train_epoch_loss += train_loss.item()
254 |
255 | else:
256 | self.optimizer.zero_grad()
257 |
258 | y_train_pred = self.model(X_train_batch).to(self.device)
259 |
260 | train_loss = self.loss_func(y_train_pred, y_train_batch)
261 |
262 | train_loss.backward()
263 | self.optimizer.step()
264 |
265 | train_epoch_loss += train_loss.item()
266 |
267 | # VALIDATION
268 | if val_dataset is not None:
269 | if e % self.val_period == 0:
270 | self.model.eval()
271 | with torch.no_grad():
272 | val_epoch_loss = 0
273 | for X_val_batch, y_val_batch in val_loader:
274 | X_val_batch, y_val_batch = X_val_batch.to(self.device), y_val_batch.to(self.device)
275 | y_val_pred = self.model(X_val_batch).to(self.device)
276 | val_loss = self.loss_func(y_val_pred, y_val_batch)
277 | val_epoch_loss += val_loss.item()
278 |
279 | loss_stats['val'].append(val_epoch_loss/len(val_loader))
280 | self.model.train()
281 |
282 | else:
283 | loss_stats['val'].append(0)
284 |
285 | if e % self.val_period == 0:
286 | loss_stats['train'].append(train_epoch_loss/len(train_loader))
287 |
288 | if (e % 10 == 0) and (self.verbose):
289 | if val_dataset is not None:
290 | print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | ', end='')
291 | print(f'Val Loss: {val_epoch_loss/len(val_loader):.5f} | ', flush=True)
292 | else:
293 | print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | ', flush=True)
294 |
295 | return loss_stats
296 |
297 | def predict(self, X):
298 | """ Estimate the label given the features
299 | Parameters
300 | ----------
301 |         X : numpy array of test features (n x p)
302 |         Returns
303 |         -------
304 |         ret_val : numpy array of predicted quantiles (n x q), one column per quantile level
305 | """
306 | X = self.scaler.transform(X)
307 | self.model.eval()
308 | ret_val = self.model(torch.from_numpy(X).to(self.device).float().requires_grad_(False))
309 | return ret_val.cpu().detach().numpy()
310 |
311 | def get_quantiles(self):
312 | return self.quantiles.cpu().numpy()
313 |
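# Example usage (illustrative sketch only; the toy data and hyper-parameters below
# are hypothetical placeholders):
#   X = np.random.randn(200, 3).astype(np.float32)
#   Y = X.sum(axis=1) + np.random.randn(200)
#   qnet = QNet(np.array([0.05, 0.95]), num_features=3, no_crossing=True,
#               num_epochs=50, batch_size=200).fit(X, Y)
#   bands = qnet.predict(X)   # shape (200, 2): estimated 5% and 95% conditional quantiles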
314 | ### Here ends the code from CHR and the new code starts.
315 |
316 | def fit_basemodel(X_train, Y_train, target='Mean', basemodel='Linear', alpha=0.1, params_basemodel={}):
317 |
318 |     assert target in ['Mean', 'Quantiles'], 'target must be Mean or Quantiles.'
319 |     assert basemodel in ['Linear', 'RF', 'NNet', 'XGBoost'], 'basemodel must be Linear, RF, NNet or XGBoost.'
320 |
321 | cores = params_basemodel['cores']
322 |
323 | if basemodel == 'RF':
324 | n_estimators = params_basemodel['n_estimators']
325 | min_samples_leaf = params_basemodel['min_samples_leaf']
326 | max_features = params_basemodel['max_features']
327 |
328 | if target == 'Mean':
329 | if basemodel == 'Linear':
330 | trained_model = LinearRegression(n_jobs=cores).fit(X_train,Y_train)
331 | elif basemodel == 'RF':
332 | trained_model = RandomForestRegressor(n_jobs=cores,n_estimators=n_estimators, min_samples_leaf=min_samples_leaf,
333 | max_features=max_features, random_state=1).fit(X_train,Y_train)
334 | elif target == 'Quantiles':
335 | a_low = alpha/2
336 | a_high = 1-alpha/2
337 | if basemodel == 'Linear':
338 | trained_model = {'q_low': QuantileRegressor(quantile=a_low, solver='highs', alpha=0).fit(X_train,Y_train),
339 | 'q_high': QuantileRegressor(quantile=a_high, solver='highs', alpha=0).fit(X_train,Y_train)}
340 | elif basemodel == 'RF':
341 | trained_model = qf.RandomForestQuantileRegressor(random_state=1, min_samples_leaf=min_samples_leaf,
342 | n_estimators=n_estimators, max_features=max_features).fit(X_train,Y_train)
343 | elif basemodel == 'XGBoost':
344 | trained_model = {'q_low': GradientBoostingRegressor(loss="quantile", alpha=a_low, n_estimators=25).fit(X_train,Y_train),
345 | 'q_high': GradientBoostingRegressor(loss="quantile", alpha=a_high, n_estimators=25).fit(X_train,Y_train)}
346 |
347 | elif basemodel == 'NNet':
348 |
349 | n_train = len(Y_train)
350 | n_features = X_train.shape[1]
351 | epochs = 2000
352 | lr = 0.0005
353 | batch_size = n_train
354 | dropout = 0.1
355 |
356 | grid_quantiles = [alpha/2, 1-alpha/2]
357 | trained_model = QNet(grid_quantiles, n_features, no_crossing=True, batch_size=batch_size,
358 | dropout=dropout, num_epochs=epochs, learning_rate=lr, calibrate=0,
359 | verbose=False).fit(X_train, Y_train)
360 |
361 | return trained_model
362 |
363 | def predict_basemodel(fitted_basemodel, X_test, target='Mean', basemodel='Linear', alpha=0.1):
364 |
365 |     assert target in ['Mean', 'Quantiles'], 'target must be Mean or Quantiles.'
366 |     assert basemodel in ['Linear', 'RF', 'NNet', 'XGBoost'], 'basemodel must be Linear, RF, NNet or XGBoost.'
367 |
368 | if target == 'Mean':
369 | predictions = fitted_basemodel.predict(X_test)
370 | elif target == 'Quantiles':
371 | a_low = alpha/2
372 | a_high = 1-alpha/2
373 | if basemodel == 'Linear':
374 | predictions = {'y_inf': fitted_basemodel['q_low'].predict(X_test),
375 | 'y_sup': fitted_basemodel['q_high'].predict(X_test)}
376 | elif basemodel == 'RF':
377 | both_pred = fitted_basemodel.predict(X_test, quantiles=[a_low, a_high])
378 | predictions = {'y_inf': both_pred[:, 0],
379 | 'y_sup': both_pred[:, 1]}
380 | elif basemodel == 'XGBoost':
381 | predictions = {'y_inf': fitted_basemodel['q_low'].predict(X_test),
382 | 'y_sup': fitted_basemodel['q_high'].predict(X_test)}
383 | elif basemodel == 'NNet':
384 | both_pred = fitted_basemodel.predict(X_test)
385 | predictions = {'y_inf': both_pred[:, 0],
386 | 'y_sup': both_pred[:, 1]}
387 |
388 | return predictions
389 |
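# Example usage (illustrative sketch; X_train, Y_train and X_test are hypothetical placeholders):
#   params = {'cores': 1}
#   model = fit_basemodel(X_train, Y_train, target='Quantiles', basemodel='Linear',
#                         alpha=0.1, params_basemodel=params)
#   bands = predict_basemodel(model, X_test, target='Quantiles', basemodel='Linear', alpha=0.1)
#   # bands['y_inf'] / bands['y_sup'] hold the estimated lower / upper quantiles.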
390 |
391 | def quantile_corrected(x, alpha):
392 | n_x = len(x)
393 | if (1-alpha)*(1+1/n_x) > 1:
394 | return np.inf
395 | else:
396 | return np.quantile(x, (1-alpha)*(1+1/n_x))
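# Worked example (comment only): with n_x = 19 calibration scores and alpha = 0.1,
# the corrected level is (1 - 0.1) * (1 + 1/19) = 18/19, approximately 0.947, which
# approximates the ceil((1 - alpha) * (n_x + 1))-th smallest score up to np.quantile's
# interpolation. With fewer than 9 scores the level exceeds 1 and np.inf is returned,
# giving an uninformative (but still valid) interval.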
397 |
398 | def calibrate_predict_intervals(pred_cal, Y_cal, pred_test, groups_cal=None, groups_test=None, target='Mean', basemodel='Linear', alpha=0.1):
399 |
400 |     assert target in ['Mean', 'Quantiles'], 'target must be Mean or Quantiles.'
401 |     assert basemodel in ['Oracle', 'Linear', 'RF', 'NNet', 'XGBoost'], 'basemodel must be Oracle, Linear, RF, NNet or XGBoost.'
402 |
403 |     if groups_cal is None:
404 | if target == 'Mean':
405 | scores = np.abs(Y_cal-pred_cal)
406 | q_scores = quantile_corrected(scores, alpha)
407 | interval_predictions = {'y_inf': pred_test-q_scores,
408 | 'y_sup': pred_test+q_scores}
409 | elif target == 'Quantiles':
410 | scores = np.maximum(pred_cal['y_inf']-Y_cal, Y_cal-pred_cal['y_sup'])
411 | q_scores = quantile_corrected(scores, alpha)
412 | interval_predictions = {'y_inf': pred_test['y_inf']-q_scores,
413 | 'y_sup': pred_test['y_sup']+q_scores}
414 | else:
415 | if target == 'Mean':
416 | scores = np.abs(Y_cal-pred_cal)
417 | elif target == 'Quantiles':
418 | scores = np.maximum(pred_cal['y_inf']-Y_cal, Y_cal-pred_cal['y_sup'])
419 |
420 | scores_sorted = np.array(scores)[np.array(groups_cal).argsort()]
421 | ids = np.unique(np.array(groups_cal)[np.array(groups_cal).argsort()], return_index=True)[0]
422 | inds = np.unique(np.array(groups_cal)[np.array(groups_cal).argsort()], return_index=True)[1]
423 | scores_splitted = np.split(scores_sorted, inds)[1:]
424 |
425 | q_scores_cal = list(map(functools.partial(quantile_corrected, alpha=alpha), scores_splitted))
426 |
427 | missing_groups = np.array(groups_test)[~np.isin(groups_test, ids)]
428 | if (len(missing_groups) > 0):
429 | ids = np.concatenate((ids, missing_groups))
430 | q_scores_cal = np.concatenate((q_scores_cal, np.full(len(missing_groups),np.inf)))
431 |
432 | inds_test = list(map(list(ids).index, groups_test))
433 |
434 | q_scores_test = np.array(q_scores_cal)[np.array(inds_test)]
435 |
436 | if target == 'Mean':
437 | interval_predictions = {'y_inf': pred_test-q_scores_test,
438 | 'y_sup': pred_test+q_scores_test}
439 | elif target == 'Quantiles':
440 | interval_predictions = {'y_inf': pred_test['y_inf']-q_scores_test,
441 | 'y_sup': pred_test['y_sup']+q_scores_test}
442 |
443 | return interval_predictions
444 |
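# Example usage (illustrative sketch; cal_predictions / test_predictions come from
# predict_basemodel above, Y_cal is a hypothetical placeholder):
#   intervals = calibrate_predict_intervals(cal_predictions, Y_cal, test_predictions,
#                                           target='Quantiles', basemodel='Linear', alpha=0.1)
#   # Without groups this is plain split CQR; passing groups_cal / groups_test instead
#   # calibrates one threshold per group (e.g. per missingness pattern or pattern size).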
445 | def calibrate_masking_predict_intervals(fitted_basemodel, imputer, X_cal, M_cal, Y_cal,
446 | X_mis_test, features_test, M_test, mask,
447 | groups_test, exact=True, target='Quantiles',
448 | basemodel='Linear', alpha=0.1):
449 |
450 |     assert target in ['Quantiles'], 'target must be Quantiles.'
451 |     assert basemodel in ['Linear', 'RF', 'NNet', 'XGBoost'], 'basemodel must be Linear, RF, NNet or XGBoost.'
452 |
453 | patterns = np.unique(M_test, axis=0)
454 | ids = list(map(utils.pattern_to_id, patterns.astype(int)))
455 |
456 | n_test = features_test.shape[0]
457 | q_scores_test = np.empty(n_test)
458 |
459 | if exact == False:
460 | pred_test = {'y_inf': np.empty(n_test),
461 | 'y_sup': np.empty(n_test)}
462 |
463 | for idp, id_pattern in enumerate(ids):
464 |
465 | X_imp_cal_masking = copy.deepcopy(X_cal)
466 | M_cal_masking = copy.deepcopy(M_cal)
467 | Y_cal_masking = copy.deepcopy(Y_cal)
468 |
469 | pattern = patterns[idp]
470 |
471 | empty = False
472 |
473 | if exact == True:
474 | ind_subsample = np.all(M_cal[:, pattern == 0] == 0, axis=1)
475 | if np.sum(ind_subsample) == 0:
476 | empty = True
477 | X_imp_cal_masking = X_imp_cal_masking[ind_subsample, :]
478 | M_cal_masking = M_cal_masking[ind_subsample, :]
479 | Y_cal_masking = Y_cal_masking[ind_subsample]
480 |
481 | if X_imp_cal_masking.shape[1] > len(pattern):
482 | nb = X_imp_cal_masking.shape[1] - len(pattern)
483 | pattern_ext = np.append(pattern, np.full(nb,0))
484 | else:
485 | pattern_ext = pattern
486 |
487 | X_imp_cal_masking[:, pattern_ext == 1] = np.nan
488 | M_cal_masking[:, pattern == 1] = 1
489 |
490 | if not empty:
491 | X_imp_cal_masking = imputer.transform(X_imp_cal_masking)
492 |
493 | if mask == 'Yes':
494 | features_cal = np.concatenate((X_imp_cal_masking, M_cal_masking), axis=1)
495 | else:
496 | features_cal = X_imp_cal_masking
497 |
498 | cal_predictions = predict_basemodel(fitted_basemodel, features_cal, target, basemodel, alpha)
499 |
500 | scores = np.maximum(cal_predictions['y_inf']-Y_cal_masking, Y_cal_masking-cal_predictions['y_sup'])
501 |
502 | if exact == True:
503 |
504 | if not empty:
505 |
506 | q_scores_cal = quantile_corrected(scores, alpha=alpha)
507 |
508 | q_scores_test[(np.array(groups_test) == id_pattern).flatten()] = q_scores_cal
509 |
510 | else:
511 | q_scores_test[(np.array(groups_test) == id_pattern).flatten()] = np.inf
512 |
513 | else:
514 |
515 | X_to_pred = copy.deepcopy(X_mis_test[(np.array(groups_test) == id_pattern).flatten(), :])
516 | M_to_pred = copy.deepcopy(M_test[(np.array(groups_test) == id_pattern).flatten(), :])
517 |
518 | n_current = X_to_pred.shape[0]
519 |
520 | patterns_cal = np.unique(M_cal_masking, axis=0)
521 | ids_cal = list(map(utils.pattern_to_id, patterns_cal.astype(int)))
522 |
523 | groups_cal_masking = list(map(utils.pattern_to_id, M_cal_masking.astype(int)))
524 |
525 | nb_cal = len(scores)
526 |
527 | all_preds = {'y_inf': np.empty((n_current, nb_cal)),
528 | 'y_sup': np.empty((n_current, nb_cal))}
529 |
530 | for idp_cal, id_pattern_cal in enumerate(ids_cal):
531 |
532 | idx_cal_masking = (np.array(groups_cal_masking) == id_pattern_cal).flatten()
533 | nb_mask = np.sum(idx_cal_masking)
534 |
535 | all_preds['y_inf'][:, idx_cal_masking] = np.tile(scores[idx_cal_masking], (n_current, 1))
536 | all_preds['y_sup'][:, idx_cal_masking] = np.tile(scores[idx_cal_masking], (n_current, 1))
537 |
538 | pattern_masking = patterns_cal[idp_cal]
539 |
540 | X_to_pred_masking = copy.deepcopy(X_to_pred)
541 | M_to_pred_masking = copy.deepcopy(M_to_pred)
542 | X_to_pred_masking[:, pattern_masking == 1] = np.nan
543 | M_to_pred_masking[:, pattern_masking == 1] = 1
544 |
545 | X_to_pred_masking = imputer.transform(X_to_pred_masking)
546 |
547 | if mask == 'Yes':
548 | features_test_pattern = np.concatenate((X_to_pred_masking, M_to_pred_masking), axis=1)
549 | else:
550 | features_test_pattern = X_to_pred_masking
551 |
552 | preds_k = predict_basemodel(fitted_basemodel, features_test_pattern, target, basemodel, alpha)
553 |
554 | all_preds['y_inf'][:, idx_cal_masking] = -np.tile(preds_k['y_inf'], (nb_mask, 1)).T + all_preds['y_inf'][:, idx_cal_masking]
555 | all_preds['y_sup'][:, idx_cal_masking] = np.tile(preds_k['y_sup'], (nb_mask, 1)).T + all_preds['y_sup'][:, idx_cal_masking]
556 |
557 |
558 | if (1 - alpha) * (1 + 1 / nb_cal) > 1:
559 | pred_test['y_inf'][(np.array(groups_test) == id_pattern).flatten()] = [-np.inf] * n_current
560 | pred_test['y_sup'][(np.array(groups_test) == id_pattern).flatten()] = [np.inf] * n_current
561 | else:
562 | pred_test['y_inf'][(np.array(groups_test) == id_pattern).flatten()] = -np.quantile(all_preds['y_inf'], (1 - alpha) * (1 + 1 / nb_cal), axis=1)
563 | pred_test['y_sup'][(np.array(groups_test) == id_pattern).flatten()] = np.quantile(all_preds['y_sup'], (1 - alpha) * (1 + 1 / nb_cal), axis=1)
564 |
565 | if exact == True:
566 | pred_test = predict_basemodel(fitted_basemodel, features_test, target, basemodel, alpha)
567 | interval_predictions = {'y_inf': pred_test['y_inf']-q_scores_test,
568 | 'y_sup': pred_test['y_sup']+q_scores_test}
569 | else:
570 | interval_predictions = {'y_inf': pred_test['y_inf'],
571 | 'y_sup': pred_test['y_sup']}
572 |
573 | return interval_predictions
574 |
575 | def compute_mean_mis_given_obs(X_obs_in_mis, mean_mis, cov_mis_obs, cov_obs_inv, mean_obs):
576 | return mean_mis + np.dot(cov_mis_obs,np.dot(cov_obs_inv, X_obs_in_mis - mean_obs))
577 |
578 | def oracle_pattern(pattern, X_test, M_test, beta, mean, cov, alpha=0.1):
579 |
580 | a_low = alpha/2
581 | a_high = 1-alpha/2
582 |
583 | pattern_id = utils.pattern_to_id(pattern.astype(int))
584 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int)))
585 | X_pattern = X_test[np.where(np.array(M_test_id) == pattern_id)]
586 |
587 | pattern = np.array(list(map(bool, pattern)))
588 |
589 | beta_mis = beta[pattern]
590 | beta_obs = beta[~pattern]
591 |
592 | mean_mis = mean[pattern]
593 | mean_obs = mean[~pattern]
594 |
595 | X_obs_in_mis = X_pattern[:,~pattern]
596 |
597 | cov_obs = cov[~pattern][:,~pattern]
598 | cov_obs_inv = np.linalg.pinv(cov_obs)
599 |
600 | cov_mis = cov[pattern][:,pattern]
601 | cov_mis_obs = cov[pattern][:,~pattern]
602 |
603 | mean_mis_given_obs = np.array(list(map(functools.partial(compute_mean_mis_given_obs,
604 | mean_mis=mean_mis, cov_mis_obs=cov_mis_obs,
605 | cov_obs_inv=cov_obs_inv, mean_obs=mean_obs), X_obs_in_mis)))
606 |
607 | beta_mis_mean_mis = np.array(list(map(functools.partial(np.dot, beta_mis), mean_mis_given_obs)))
608 | beta_obs_X_obs = np.array(list(map(functools.partial(np.dot, beta_obs), X_obs_in_mis)))
609 |
610 | cov_mis_given_obs = cov_mis - np.dot(cov_mis_obs,np.dot(cov_obs_inv, cov_mis_obs.T))
611 |
612 | q_low = beta_obs_X_obs + beta_mis_mean_mis + norm.ppf(a_low)*np.sqrt(np.dot(beta_mis, np.dot(cov_mis_given_obs , beta_mis.T))+1)
613 | q_high = beta_obs_X_obs + beta_mis_mean_mis + norm.ppf(a_high)*np.sqrt(np.dot(beta_mis, np.dot(cov_mis_given_obs , beta_mis.T))+1)
614 |
615 | return {'q_low': q_low, 'q_high': q_high}
616 |
617 | def oracle(M_test, X_test, beta, mean, cov, alpha=0.1):
618 |
619 | n_test = X_test.shape[0]
620 |
621 | interval_predictions = {'y_inf': np.empty(n_test),
622 | 'y_sup': np.empty(n_test)}
623 |
624 | patterns = np.unique(M_test, axis=0)
625 |
626 | oracles_intervals_per_pattern = list(map(functools.partial(oracle_pattern,
627 | X_test=X_test, M_test=M_test, beta=beta,
628 | mean=mean, cov=cov, alpha=alpha), patterns))
629 |
630 | for idp, pattern in enumerate(patterns):
631 |
632 | pattern_id = utils.pattern_to_id(pattern.astype(int))
633 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int)))
634 | interval_predictions['y_inf'][np.where(np.array(M_test_id) == pattern_id)] = oracles_intervals_per_pattern[idp]['q_low']
635 | interval_predictions['y_sup'][np.where(np.array(M_test_id) == pattern_id)] = oracles_intervals_per_pattern[idp]['q_high']
636 |
637 | return interval_predictions
638 |
639 | def oracle_len_pattern(pattern, beta, cov, alpha=0.1):
640 |
641 | pattern = np.array(list(map(bool, pattern)))
642 |
643 | beta_mis = beta[pattern]
644 |
645 | cov_obs = cov[~pattern][:,~pattern]
646 | cov_obs_inv = np.linalg.pinv(cov_obs)
647 |
648 | cov_mis = cov[pattern][:,pattern]
649 | cov_mis_obs = cov[pattern][:,~pattern]
650 |
651 | cov_mis_given_obs = cov_mis - np.dot(cov_mis_obs,np.dot(cov_obs_inv, cov_mis_obs.T))
652 |
653 | length = 2 * norm.ppf(1-alpha/2) * np.sqrt(np.dot(beta_mis, np.dot(cov_mis_given_obs, beta_mis.T)) + 1)
654 |
655 | return length
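# Sanity check (comment only): with no missing coordinates, beta_mis is empty, the
# variance term reduces to the unit noise variance (the "+ 1"), and the oracle length
# is 2 * norm.ppf(1 - alpha/2), roughly 3.29 for alpha = 0.1.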
656 |
657 | def oracle_mean_pattern(pattern, X_test, M_test, beta, mean, cov):
658 |
659 | pattern_id = utils.pattern_to_id(pattern.astype(int))
660 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int)))
661 | X_pattern = X_test[np.where(np.array(M_test_id) == pattern_id)]
662 |
663 | pattern = np.array(list(map(bool, pattern)))
664 |
665 | beta_mis = beta[pattern]
666 | beta_obs = beta[~pattern]
667 |
668 | mean_mis = mean[pattern]
669 | mean_obs = mean[~pattern]
670 |
671 | X_obs_in_mis = X_pattern[:,~pattern]
672 |
673 | cov_obs = cov[~pattern][:,~pattern]
674 | cov_obs_inv = np.linalg.pinv(cov_obs)
675 |
676 | cov_mis = cov[pattern][:,pattern]
677 | cov_mis_obs = cov[pattern][:,~pattern]
678 |
679 | mean_mis_given_obs = np.array(list(map(functools.partial(compute_mean_mis_given_obs,
680 | mean_mis=mean_mis, cov_mis_obs=cov_mis_obs,
681 | cov_obs_inv=cov_obs_inv, mean_obs=mean_obs), X_obs_in_mis)))
682 |
683 | beta_mis_mean_mis = np.array(list(map(functools.partial(np.dot, beta_mis), mean_mis_given_obs)))
684 | beta_obs_X_obs = np.array(list(map(functools.partial(np.dot, beta_obs), X_obs_in_mis)))
685 |
686 | mean_pattern = beta_obs_X_obs + beta_mis_mean_mis
687 |
688 | return mean_pattern
689 |
690 | def oracle_mean(M_test, X_test, beta, mean, cov):
691 |
692 | n_test = X_test.shape[0]
693 |
694 | predictions = np.empty(n_test)
695 |
696 | patterns = np.unique(M_test, axis=0)
697 |
698 | oracles_mean_per_pattern = list(map(functools.partial(oracle_mean_pattern,
699 | X_test=X_test, M_test=M_test, beta=beta,
700 | mean=mean, cov=cov), patterns))
701 |
702 | for idp, pattern in enumerate(patterns):
703 |
704 | pattern_id = utils.pattern_to_id(pattern.astype(int))
705 | M_test_id = list(map(utils.pattern_to_id, M_test.astype(int)))
706 | predictions[np.where(np.array(M_test_id) == pattern_id)] = oracles_mean_per_pattern[idp]
707 |
708 | return predictions
709 |
710 | def run_experiments(data, alpha, methods, basemodels, params_basemodel, masks, protections, exacts=[False], imputation=None,
711 | params_reg={}, params_noise={},
712 | parent_results='results'):
713 |
714 | d = data['X_missing']['Train'].shape[2]
715 | n_rep = data['X_missing']['Train'].shape[0]
716 |
717 | name_pipeline = []
718 | for method in methods:
719 | for basemodel in basemodels:
720 | for mask in masks:
721 | for protection in protections:
722 | if method == 'CQR_MDA':
723 | for exact in exacts:
724 | name_temp = files.get_name_method(method, basemodel, mask, protection, exact)
725 | if not name_temp in name_pipeline:
726 | name_pipeline.append(name_temp)
727 | else:
728 | name_temp = files.get_name_method(method, basemodel, mask, protection)
729 | if not name_temp in name_pipeline:
730 | name_pipeline.append(name_temp)
731 |
732 | results_methods = dict.fromkeys(name_pipeline)
733 |
734 | for k in tqdm(range(n_rep)):
735 |
736 | if 'X' in list(data.keys()):
737 | X_train = data['X']['Train'][k,:,:]
738 | X_cal = data['X']['Cal'][k,:,:]
739 | X_mis_train = data['X_missing']['Train'][k,:,:]
740 | X_mis_cal = data['X_missing']['Cal'][k,:,:]
741 | X_imp_train = data['X_imp']['Train'][k,:,:]
742 | X_imp_cal = data['X_imp']['Cal'][k,:,:]
743 | M_train = data['M']['Train'][k,:,:]
744 | M_cal = data['M']['Cal'][k,:,:]
745 | Y_train = data['Y']['Train'][k,:]
746 | Y_cal = data['Y']['Cal'][k,:]
747 |
748 | keys_test = list(data['X_missing']['Test'].keys())
749 | X_test = dict.fromkeys(keys_test)
750 | X_mis_test = dict.fromkeys(keys_test)
751 | X_imp_test = dict.fromkeys(keys_test)
752 | M_test = dict.fromkeys(keys_test)
753 | Y_test = dict.fromkeys(keys_test)
754 | for key in keys_test:
755 | if 'X' in list(data.keys()):
756 | X_test[key] = data['X']['Test'][key][k,:,:]
757 | X_mis_test[key] = data['X_missing']['Test'][key][k,:,:]
758 | X_imp_test[key] = data['X_imp']['Test'][key][k,:,:]
759 | M_test[key] = data['M']['Test'][key][k,:,:]
760 | Y_test[key] = data['Y']['Test'][key][k,:]
761 |
762 | trained_models = {}
763 |
764 | for method in methods:
765 |
766 | if method in ['Oracle', 'Oracle_mean']:
767 | if 'mean' in params_reg:
768 | mean = params_reg['mean']
769 | else:
770 | mean = 1
771 | if 'phi' in params_reg:
772 | phi = params_reg['phi']
773 | else:
774 | phi = 0.8
775 | mean = np.full(d, mean)
776 | cov = np.full((d,d),phi)+(1-phi)*np.eye(d)
777 | if 'beta' not in params_reg or params_reg['beta'] is None:
778 | beta = np.full(d,1)
779 | else:
780 | beta = params_reg['beta']
781 |
782 | if method == 'Oracle':
783 | preds = dict.fromkeys(keys_test)
784 | for key in keys_test:
785 | pred = oracle(M_test[key], X_test[key], beta, mean, cov, alpha=alpha)
786 | preds[key] = pred
787 |
788 | elif method == 'Oracle_mean':
789 | cal_predictions = oracle_mean(M_cal, X_cal, beta, mean, cov)
790 |
791 | test_predictions = dict.fromkeys(keys_test)
792 | for key in keys_test:
793 | test_predictions[key] = oracle_mean(M_test[key], X_test[key], beta, mean, cov)
794 |
795 | for protection in protections:
796 | pipeline = files.get_name_method(method, basemodel=None, mask='No', protection=protection)
797 | if protection == 'No':
798 | groups_cal = None
799 | groups_test = dict.fromkeys(keys_test)
800 | for key in keys_test:
801 | groups_test[key] = None
802 | elif protection == 'Pattern':
803 | groups_cal = list(map(utils.pattern_to_id, M_cal.astype(int)))
804 | groups_test = dict.fromkeys(keys_test)
805 | for key in keys_test:
806 | groups_test[key] = list(map(utils.pattern_to_id, M_test[key].astype(int)))
807 | elif protection == 'Pattern_Size':
808 | groups_cal = list(map(utils.pattern_to_size, M_cal.astype(int)))
809 | groups_test = dict.fromkeys(keys_test)
810 | for key in keys_test:
811 | groups_test[key] = list(map(utils.pattern_to_size, M_test[key].astype(int)))
812 | preds = dict.fromkeys(keys_test)
813 | for key in keys_test:
814 |
815 | preds[key] = calibrate_predict_intervals(cal_predictions, Y_cal, test_predictions[key],
816 | groups_cal=groups_cal, groups_test=groups_test[key],
817 | target='Mean',
818 | basemodel='Oracle', alpha=alpha)
819 |
820 | results = results_methods[pipeline]
821 |                     if results_methods[pipeline] is None:
822 | results = dict.fromkeys(keys_test)
823 | for key in keys_test:
824 | results[key] = {'Y_inf': np.array(preds[key]['y_inf']),
825 | 'Y_sup': np.array(preds[key]['y_sup'])}
826 | else:
827 | for key in keys_test:
828 | results[key]['Y_inf'] = np.vstack(
829 | (results[key]['Y_inf'], np.array(preds[key]['y_inf'])))
830 | results[key]['Y_sup'] = np.vstack(
831 | (results[key]['Y_sup'], np.array(preds[key]['y_sup'])))
832 | results_methods[pipeline] = results
833 |
834 | elif method == 'CQR_MDA':
835 |
836 | assert imputation is not None, "imputation must be specified for Masking"
837 |
838 | target = 'Quantiles'
839 |
840 | imputer_masking = imp.impute_imputer(X_mis_train, imputation)
841 |
842 | X_imp_train_masking = imputer_masking.transform(X_mis_train)
843 | X_imp_test_masking = dict.fromkeys(keys_test)
844 |
845 | for key in keys_test:
846 | X_imp_test_masking[key] = imputer_masking.transform(X_mis_test[key])
847 |
848 | if target in trained_models.keys():
849 | trained_models_target = trained_models[target]
850 | else:
851 | trained_models[target] = {}
852 | trained_models_target = None
853 |
854 | for basemodel in basemodels:
855 |
856 | if trained_models_target is not None and basemodel in trained_models_target.keys():
857 | trained_models_target_basemodel = trained_models_target[basemodel]
858 | else:
859 | trained_models[target][basemodel] = {}
860 | trained_models_target_basemodel = None
861 |
862 | for mask in masks:
863 |
864 | if mask == 'Yes':
865 | name_mask = 'mask'
866 | features_train = np.concatenate((X_imp_train_masking, M_train), axis=1)
867 | features_test = dict.fromkeys(keys_test)
868 | for key in keys_test:
869 | features_test[key] = np.concatenate((X_imp_test_masking[key], M_test[key]), axis=1)
870 | else:
871 | name_mask = 'no_mask'
872 | features_train = X_imp_train_masking
873 | features_test = X_imp_test_masking
874 |
875 | if trained_models_target_basemodel is not None and name_mask in trained_models_target_basemodel.keys():
876 | trained_models_target_basemodel_mask = trained_models_target_basemodel[name_mask]
877 | else:
878 | trained_models[target][basemodel][name_mask] = {}
879 | trained_models_target_basemodel_mask = None
880 |
881 | if trained_models_target_basemodel_mask is None:
882 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha,
883 | params_basemodel=params_basemodel)
884 | trained_models[target][basemodel][name_mask] = trained_model
885 | else:
886 | trained_model = trained_models_target_basemodel_mask
887 |
888 | groups_test = dict.fromkeys(keys_test)
889 | for key in keys_test:
890 | groups_test[key] = list(map(utils.pattern_to_id, M_test[key].astype(int)))
891 |
892 | for exact in exacts:
893 | pipeline = files.get_name_method(method, basemodel, mask, exact=exact)
894 |
895 | preds = dict.fromkeys(keys_test)
896 | for key in keys_test:
897 | pred = calibrate_masking_predict_intervals(trained_model, imputer_masking,
898 | X_mis_cal, M_cal, Y_cal,
899 | X_mis_test[key], features_test[key], M_test[key], mask,
900 | groups_test=groups_test[key], exact=exact,
901 | target=target, basemodel=basemodel, alpha=alpha)
902 | preds[key] = pred
903 |
904 | results = results_methods[pipeline]
905 |
906 |                             if results_methods[pipeline] is None:
907 | results = dict.fromkeys(keys_test)
908 | for key in keys_test:
909 | results[key] = {'Y_inf': np.array(preds[key]['y_inf']), 'Y_sup': np.array(preds[key]['y_sup'])}
910 | else:
911 | for key in keys_test:
912 | results[key]['Y_inf'] = np.vstack((results[key]['Y_inf'],np.array(preds[key]['y_inf'])))
913 | results[key]['Y_sup'] = np.vstack((results[key]['Y_sup'],np.array(preds[key]['y_sup'])))
914 | results_methods[pipeline] = results
915 |
916 | else:
917 |
918 | if method == 'SCP':
919 | target = 'Mean'
920 | elif method in ['CQR', 'QR', 'QR_TrainCal']:
921 | target = 'Quantiles'
922 |
923 | if method in ['QR', 'QR_TrainCal']:
924 | conformalized = False
925 | else:
926 | conformalized = True
927 |
928 | if target in trained_models.keys():
929 | trained_models_target = trained_models[target]
930 | else:
931 | trained_models[target] = {}
932 | trained_models_target = None
933 |
934 | for basemodel in basemodels:
935 |
936 | if method != 'QR_TrainCal' and trained_models_target is not None and basemodel in trained_models_target.keys():
937 | trained_models_target_basemodel = trained_models_target[basemodel]
938 | elif method != 'QR_TrainCal':
939 | trained_models[target][basemodel] = {}
940 | trained_models_target_basemodel = None
941 |
942 | for mask in masks:
943 |
944 | if mask == 'Yes':
945 | name_mask = 'mask'
946 | features_train = np.concatenate((X_imp_train, M_train), axis=1)
947 | features_cal = np.concatenate((X_imp_cal, M_cal), axis=1)
948 | features_test = dict.fromkeys(keys_test)
949 | for key in keys_test:
950 | features_test[key] = np.concatenate((X_imp_test[key], M_test[key]), axis=1)
951 | else:
952 | name_mask = 'no_mask'
953 | features_train = X_imp_train
954 | features_cal = X_imp_cal
955 | features_test = X_imp_test
956 |
957 | if method == 'QR_TrainCal':
958 | features_train = np.concatenate((features_train, features_cal), axis=0)
959 | Y_traincal = np.concatenate((Y_train, Y_cal), axis=0)
960 | trained_model = fit_basemodel(features_train, Y_traincal, target=target, basemodel=basemodel,
961 | alpha=alpha,
962 | params_basemodel=params_basemodel)
963 | else:
964 | if trained_models_target_basemodel is not None and name_mask in trained_models_target_basemodel.keys():
965 | trained_models_target_basemodel_mask = trained_models_target_basemodel[name_mask]
966 | else:
967 | trained_models[target][basemodel][name_mask] = {}
968 | trained_models_target_basemodel_mask = None
969 |
970 | if trained_models_target_basemodel_mask is None:
971 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha,
972 | params_basemodel=params_basemodel)
973 | trained_models[target][basemodel][name_mask] = trained_model
974 | else:
975 | trained_model = trained_models_target_basemodel_mask
976 |
977 | cal_predictions = predict_basemodel(trained_model, features_cal, target, basemodel, alpha)
978 |
979 | test_predictions = dict.fromkeys(keys_test)
980 | for key in keys_test:
981 | test_predictions[key] = predict_basemodel(trained_model, features_test[key], target, basemodel, alpha)
982 |
983 | if conformalized:
984 |
985 | for protection in protections:
986 | pipeline = files.get_name_method(method, basemodel, mask, protection)
987 | if protection == 'No':
988 | groups_cal = None
989 | groups_test = dict.fromkeys(keys_test)
990 | for key in keys_test:
991 | groups_test[key] = None
992 | elif protection == 'Pattern':
993 | groups_cal = list(map(utils.pattern_to_id, M_cal.astype(int)))
994 | groups_test = dict.fromkeys(keys_test)
995 | for key in keys_test:
996 | groups_test[key] = list(map(utils.pattern_to_id, M_test[key].astype(int)))
997 | elif protection == 'Pattern_Size':
998 | groups_cal = list(map(utils.pattern_to_size, M_cal.astype(int)))
999 | groups_test = dict.fromkeys(keys_test)
1000 | for key in keys_test:
1001 | groups_test[key] = list(map(utils.pattern_to_size, M_test[key].astype(int)))
1002 | preds = dict.fromkeys(keys_test)
1003 | for key in keys_test:
1004 | preds[key] = calibrate_predict_intervals(cal_predictions, Y_cal, test_predictions[key],
1005 | groups_cal=groups_cal, groups_test=groups_test[key],
1006 | target=target,
1007 | basemodel=basemodel, alpha=alpha)
1008 |
1009 | results = results_methods[pipeline]
1010 |                             if results_methods[pipeline] is None:
1011 | results = dict.fromkeys(keys_test)
1012 | for key in keys_test:
1013 | results[key] = {'Y_inf': np.array(preds[key]['y_inf']), 'Y_sup': np.array(preds[key]['y_sup'])}
1014 | else:
1015 | for key in keys_test:
1016 | results[key]['Y_inf'] = np.vstack((results[key]['Y_inf'],np.array(preds[key]['y_inf'])))
1017 | results[key]['Y_sup'] = np.vstack((results[key]['Y_sup'],np.array(preds[key]['y_sup'])))
1018 | results_methods[pipeline] = results
1019 |
1020 | else:
1021 | interval_predictions = dict.fromkeys(keys_test)
1022 | for key in keys_test:
1023 | interval_predictions[key] = {'y_inf': test_predictions[key]['y_inf'],
1024 | 'y_sup': test_predictions[key]['y_sup']}
1025 | pipeline = files.get_name_method(method, basemodel, mask, conformalized)
1026 | results = results_methods[pipeline]
1027 |                         if results_methods[pipeline] is None:
1028 | results = dict.fromkeys(keys_test)
1029 | for key in keys_test:
1030 | results[key] = {'Y_inf': np.array(interval_predictions[key]['y_inf']),
1031 | 'Y_sup': np.array(interval_predictions[key]['y_sup'])}
1032 | else:
1033 | for key in keys_test:
1034 | results[key]['Y_inf'] = np.vstack((results[key]['Y_inf'],np.array(interval_predictions[key]['y_inf'])))
1035 | results[key]['Y_sup'] = np.vstack((results[key]['Y_sup'],np.array(interval_predictions[key]['y_sup'])))
1036 | results_methods[pipeline] = results
1037 |
1038 | return results_methods, name_pipeline
1039 |
1040 | def run_real_experiments(data, alpha, methods, basemodels, params_basemodel, masks, conformalized, protections,
1041 | n_rep, parent_results='results', imputation=None, data_missing=None, exact=True):
1042 |
1043 | test_size = len(data['Y']['Test'][0,:])
1044 | d = data['X_imp']['Train'].shape[2]
1045 |
1046 | name_pipeline = []
1047 | for method in methods:
1048 | for basemodel in basemodels:
1049 | for mask in masks:
1050 | for protection in protections:
1051 | name_temp = files.get_name_method(method, basemodel, mask, protection, conformalized, exact)
1052 | if not name_temp in name_pipeline:
1053 | name_pipeline.append(name_temp)
1054 |
1055 | results_methods = dict.fromkeys(name_pipeline)
1056 |
1057 | if 'M_original' in data.keys():
1058 | mask_original = True
1059 | else:
1060 | mask_original = False
1061 |
1062 | for k in tqdm(range(n_rep)):
1063 |
1064 | X_imp_train = data['X_imp']['Train'][k,:,:]
1065 | X_imp_cal = data['X_imp']['Cal'][k,:,:]
1066 | X_imp_test = data['X_imp']['Test'][k,:,:]
1067 | if mask_original:
1068 | M_original_train = data['M_original']['Train'][k,:,:]
1069 | M_original_cal = data['M_original']['Cal'][k,:,:]
1070 | M_original_test = data['M_original']['Test'][k,:,:]
1071 | M_train = data['M']['Train'][k,:,:]
1072 | M_cal = data['M']['Cal'][k,:,:]
1073 | M_test = data['M']['Test'][k,:,:]
1074 | Y_train = data['Y']['Train'][k,:]
1075 | Y_cal = data['Y']['Cal'][k,:]
1076 | Y_test = data['Y']['Test'][k,:]
1077 |
1078 | for method in methods:
1079 | if method == 'CQR_MDA':
1080 |
1081 | assert imputation is not None, "imputation must be specified for Masking"
1082 |
1083 | target = 'Quantiles'
1084 |
1085 | X_mis_train = data_missing['X_missing']['Train'][k,:,:]
1086 | X_mis_cal = data_missing['X_missing']['Cal'][k,:,:]
1087 | X_mis_test = data_missing['X_missing']['Test'][k,:,:]
1088 |
1089 | imputer_masking = imp.impute_imputer(X_mis_train, imputation)
1090 |
1091 | X_imp_train_masking = imputer_masking.transform(X_mis_train)
1092 | X_imp_test_masking = imputer_masking.transform(X_mis_test)
1093 |
1094 | for basemodel in basemodels:
1095 | for mask in masks:
1096 | if mask == 'Yes':
1097 | features_train = np.concatenate((X_imp_train_masking, M_train), axis=1)
1098 | features_test = np.concatenate((X_imp_test_masking, M_test), axis=1)
1099 | else:
1100 | features_train = X_imp_train_masking
1101 | features_test = X_imp_test_masking
1102 |
1103 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha,
1104 | params_basemodel=params_basemodel)
1105 | pipeline = files.get_name_method(method, basemodel, mask, exact=exact)
1106 | groups_test = list(map(utils.pattern_to_id, M_test.astype(int)))
1107 |
1108 | pred = calibrate_masking_predict_intervals(trained_model, imputer_masking,
1109 | X_mis_cal, M_cal, Y_cal,
1110 | X_mis_test, features_test, M_test, mask,
1111 | groups_test=groups_test, exact=exact, target=target,
1112 | basemodel=basemodel, alpha=alpha)
1113 | results = results_methods[pipeline]
1114 | if results_methods[pipeline] is None:
1115 | results = {'Y_inf': np.array(pred['y_inf']), 'Y_sup': np.array(pred['y_sup'])}
1116 | else:
1117 | results['Y_inf'] = np.vstack((results['Y_inf'],np.array(pred['y_inf'])))
1118 | results['Y_sup'] = np.vstack((results['Y_sup'],np.array(pred['y_sup'])))
1119 | results_methods[pipeline] = results
1120 | else:
1121 | if method == 'SCP':
1122 | target = 'Mean'
1123 | elif method in ['CQR', 'QR']:
1124 | target = 'Quantiles'
1125 | for basemodel in basemodels:
1126 | for mask in masks:
1127 | if mask == 'Yes':
1128 | features_train = np.concatenate((X_imp_train, M_train), axis=1)
1129 | features_cal = np.concatenate((X_imp_cal, M_cal), axis=1)
1130 | features_test = np.concatenate((X_imp_test, M_test), axis=1)
1131 | else:
1132 | features_train = X_imp_train
1133 | features_cal = X_imp_cal
1134 | features_test = X_imp_test
1135 |
1136 | trained_model = fit_basemodel(features_train, Y_train, target=target, basemodel=basemodel, alpha=alpha,
1137 | params_basemodel=params_basemodel)
1138 |
1139 | cal_predictions = predict_basemodel(trained_model, features_cal, target, basemodel, alpha)
1140 | test_predictions = predict_basemodel(trained_model, features_test, target, basemodel, alpha)
1141 |
1142 | if conformalized:
1143 | for protection in protections:
1144 | pipeline = files.get_name_method(method, basemodel, mask, protection)
1145 | if protection == 'No':
1146 | groups_cal = None
1147 | groups_test = None
1148 | elif protection == 'Pattern':
1149 | if mask_original:
1150 | groups_cal = list(map(utils.pattern_to_id, M_original_cal.astype(int)))
1151 | groups_test = list(map(utils.pattern_to_id, M_original_test.astype(int)))
1152 | else:
1153 | groups_cal = list(map(utils.pattern_to_id, M_cal.astype(int)))
1154 | groups_test = list(map(utils.pattern_to_id, M_test.astype(int)))
1155 | elif protection == 'Pattern_Size':
1156 | if mask_original:
1157 | groups_cal = list(map(utils.pattern_to_size, M_original_cal.astype(int)))
1158 | groups_test = list(map(utils.pattern_to_size, M_original_test.astype(int)))
1159 | else:
1160 | groups_cal = list(map(utils.pattern_to_size, M_cal.astype(int)))
1161 | groups_test = list(map(utils.pattern_to_size, M_test.astype(int)))
1162 | pred = calibrate_predict_intervals(cal_predictions, Y_cal, test_predictions,
1163 | groups_cal=groups_cal, groups_test=groups_test, target=target,
1164 | basemodel=basemodel, alpha=alpha)
1165 | results = results_methods[pipeline]
1166 | if results_methods[pipeline] is None:
1167 | results = {'Y_inf': np.array(pred['y_inf']), 'Y_sup': np.array(pred['y_sup'])}
1168 | else:
1169 | results['Y_inf'] = np.vstack((results['Y_inf'],np.array(pred['y_inf'])))
1170 | results['Y_sup'] = np.vstack((results['Y_sup'],np.array(pred['y_sup'])))
1171 | results_methods[pipeline] = results
1172 | else:
1173 | interval_predictions = {'y_inf': test_predictions['y_inf'],
1174 | 'y_sup': test_predictions['y_sup']}
1175 | pipeline = files.get_name_method(method, basemodel, mask, protection, conformalized)
1176 | results = results_methods[pipeline]
1177 | if results_methods[pipeline] is None:
1178 | results = {'Y_inf': np.array(interval_predictions['y_inf']),
1179 | 'Y_sup': np.array(interval_predictions['y_sup'])}
1180 | else:
1181 | results['Y_inf'] = np.vstack((results['Y_inf'],np.array(interval_predictions['y_inf'])))
1182 | results['Y_sup'] = np.vstack((results['Y_sup'],np.array(interval_predictions['y_sup'])))
1183 | results_methods[pipeline] = results
1184 |
1185 | return results_methods, name_pipeline
1186 |
--------------------------------------------------------------------------------
/results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzaffran/ConformalPredictionMissingValues/e1ff0c83a4943e6468b78bfec49af64fc61c3561/results/.DS_Store
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import files
3 | import functools
4 | import re
4 |
5 | def pattern_to_id(m):
6 | return(int(''.join(map(str,m)), 2))
7 |
8 | def pattern_to_id_float(m):
9 | return(float(int(''.join(map(str,m)), 2)))
10 |
11 | def pattern_to_size(m):
12 | return(int(np.sum(m)))
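# Example (comment only): for the mask m = [1, 0, 1] (first and third features missing),
# pattern_to_id(m) reads '101' in base 2 and returns 5, while pattern_to_size(m) returns 2.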
13 |
14 | def bin_to_vec(bin_pattern, d, var_missing=None):
15 | bin_pattern = bin_pattern[2:]
16 | l = len(bin_pattern)
17 | if var_missing is None:
18 | nb_missing = d
19 | else:
20 | nb_missing = np.sum(var_missing)
21 | if l < nb_missing:
22 | for i in range(nb_missing-l):
23 | bin_pattern = '0'+bin_pattern
24 | vec_bin = [int(x) for x in bin_pattern]
25 | if nb_missing < d:
26 | vec_pattern = [0] * d
27 | vec_pattern = np.array(vec_pattern)
28 | vec_pattern[list(np.where(var_missing == 1)[0])] = vec_bin
29 | vec_pattern = list(vec_pattern)
30 | else:
31 | vec_pattern = vec_bin
32 | return(vec_pattern)
33 |
34 | def create_patterns(d, var_missing):
35 | nb_var_missing = np.sum(var_missing)
36 | if nb_var_missing == d:
37 | keys_patterns = np.arange(0, 2**d-1)
38 | bin_patterns = list(map(bin, keys_patterns))
39 | vec_patterns = list(map(functools.partial(bin_to_vec, d=d), bin_patterns))
40 | else:
41 | keys_patterns = np.arange(0, 2**(nb_var_missing))
42 | bin_patterns = list(map(bin, keys_patterns))
43 | vec_patterns = list(map(functools.partial(bin_to_vec, d=d, var_missing=var_missing), bin_patterns))
44 | return(vec_patterns)
45 |
46 |
47 | def get_data_results(method, train_size, cal_size, params_test, n_rep, imputation, d=3,
48 | params_reg={}, params_noise={}, dataset=None, params_missing={},
49 | parent_results='results', parent_data='data', extension='pkl'):
50 |
51 | name_dir, name_method = files.get_name_results(method, train_size, cal_size, n_rep,
52 | imputation=imputation, d=d,
53 | params_reg=params_reg, params_noise=params_noise,
54 | dataset=dataset,
55 | params_missing=params_missing)
56 | results = files.load_file(parent_results+'/'+name_dir, name_method, extension)
57 |
58 | name_data = files.get_name_data(train_size, cal_size, params_test, dim=d,
59 | params_reg=params_reg, params_noise=params_noise,
60 | dataset=dataset,
61 | params_missing=params_missing, seed=n_rep)
62 | data = files.load_file(parent_data, name_data, extension)
63 |
64 | return data, results
65 |
66 | def compute_PI_metrics(data, results, mechanism_test):
67 |
68 | contains = (data['Y']['Test'][mechanism_test] <= results[mechanism_test]['Y_sup']) & (data['Y']['Test'][mechanism_test] >= results[mechanism_test]['Y_inf'])
69 | lengths = results[mechanism_test]['Y_sup'] - results[mechanism_test]['Y_inf']
70 |
71 |     return contains, lengths
72 |
73 | def compute_metrics_cond(n_rep, data, results, mechanism_test, cond='Pattern', replace_inf=False):
74 |
75 | contains, lengths = compute_PI_metrics(data, results, mechanism_test)
76 |
77 |
78 | if replace_inf:
79 | max_y_train = np.max(data['Y']['Train'], axis=1)
80 | max_y_cal = np.max(data['Y']['Cal'], axis=1)
81 | min_y_train = np.min(data['Y']['Train'], axis=1)
82 | min_y_cal = np.min(data['Y']['Cal'], axis=1)
83 | max_length_traincal = np.maximum(max_y_train, max_y_cal) - np.minimum(min_y_train, min_y_cal)
84 |
85 | M_test = data['M']['Test'][mechanism_test]
86 |
87 | if cond == 'Pattern':
88 | groups = np.apply_along_axis(pattern_to_id_float, 2, M_test.astype(int))
89 | test_patterns_id = np.unique(groups)
90 | elif cond == 'Pattern_Size':
91 | groups = np.apply_along_axis(pattern_to_size, 2, M_test.astype(int))
92 | test_patterns_id = np.unique(groups)
93 |
94 | metrics = dict.fromkeys(test_patterns_id)
95 |
96 | for pattern_id in test_patterns_id:
97 |
98 | avg_cov = []
99 | avg_len = []
100 | nb_samples = []
101 |
102 | for k in range(n_rep):
103 | current_lens = lengths[k,groups[k,:] == pattern_id]
104 |
105 | temp_cov = np.nanmean(contains[k,groups[k,:] == pattern_id])
106 | temp_nb = np.sum(groups[k,:] == pattern_id)
107 |
108 | if replace_inf:
109 |                 idx_inf = np.where(np.isinf(current_lens))[0]
110 |                 if len(idx_inf) > 0:
111 | current_lens[idx_inf] = max_length_traincal[k]
112 |
113 | temp_len = np.nanmean(current_lens)
114 |
115 | avg_cov = np.append(avg_cov, temp_cov)
116 | avg_len = np.append(avg_len, temp_len)
117 | nb_samples = np.append(nb_samples, temp_nb)
118 |
119 | metrics[pattern_id] = {'avg_cov': avg_cov, 'avg_len': avg_len, 'nb_sample': nb_samples}
120 |
121 | return metrics
122 |
123 | def name_tick(name_method):
124 | if name_method[-4:] == 'Mask':
125 | name_tick = '+ mask'
126 | else:
127 | name_tick = re.search(r"[a-zA-Z]*", name_method).group()
128 | if name_tick != 'MICE':
129 | name_tick = name_tick.capitalize()
130 | return name_tick
--------------------------------------------------------------------------------