├── .github └── workflows │ ├── testlinux.yml │ ├── testmacos.yml │ └── testwindows.yml ├── .gitignore ├── Examples ├── .Rhistory ├── adult_data.csv ├── cces_jss_format.csv ├── midas_demo.ipynb ├── midaspy_demo_cces.ipynb └── tmp │ ├── MIDAS.data-00000-of-00001 │ ├── MIDAS.index │ ├── MIDAS.meta │ └── checkpoint ├── LICENSE.txt ├── MIDASpy ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-312.pyc │ ├── __init__.cpython-38.pyc │ ├── midas_base.cpython-312.pyc │ └── midas_base.cpython-38.pyc └── midas_base.py ├── MIDASpy_logo.png ├── README.md ├── build └── lib │ └── MIDASpy │ ├── __init__.py │ └── midas_base.py ├── midas_functions.md ├── pytest.ini ├── setup.cfg ├── setup.py ├── tests ├── test_data │ └── adult_data.csv └── test_midas.py └── tmp ├── MIDAS.data-00000-of-00001 ├── MIDAS.index ├── MIDAS.meta └── checkpoint /.github/workflows/testlinux.yml: -------------------------------------------------------------------------------- 1 | name: CI-Linux 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | jobs: 14 | test: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest] 19 | python-version: ["3.7", "3.8", "3.9", "3.10"] 20 | steps: 21 | - name: Check out code 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -e .[test] 33 | 34 | - name: Run tests 35 | run: | 36 | pytest 37 | -------------------------------------------------------------------------------- /.github/workflows/testmacos.yml: -------------------------------------------------------------------------------- 1 | name: CI-macOS 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | - develop 9 | pull_request: 10 | branches: 11 | - main 12 | - master 13 | - develop 14 | 15 | jobs: 16 | test: 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | include: 22 | - os: macos-11 23 | python-version: "3.7" 24 | - os: macos-latest 25 | python-version: "3.8" 26 | - os: macos-latest 27 | python-version: "3.9" 28 | - os: macos-latest 29 | python-version: "3.10" 30 | steps: 31 | - name: Check out code 32 | uses: actions/checkout@v4 33 | 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install -e .[test] 43 | - name: Run tests 44 | run: | 45 | pytest 46 | -------------------------------------------------------------------------------- /.github/workflows/testwindows.yml: -------------------------------------------------------------------------------- 1 | name: CI-Windows 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | jobs: 14 | test: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [windows-latest] 19 | python-version: ["3.7", "3.8", "3.9", "3.10"] 20 | steps: 21 | - name: Check out code 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install 
--upgrade pip 32 | pip install -e .[test] 33 | - name: Run tests 34 | run: | 35 | pytest 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | MIDASpy.egg-info -------------------------------------------------------------------------------- /Examples/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/.Rhistory -------------------------------------------------------------------------------- /Examples/midas_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# MIDASpy demonstration" 7 | ], 8 | "metadata": {} 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "source": [ 13 | "This notebook provides a brief demonstration of **MIDASpy**'s core functionalities. We show how to use the package to impute missing values in the [Adult census dataset](https://github.com/MIDASverse/MIDASpy/blob/master/Examples/adult_data.csv) (which is commonly used for benchmarking machine learning tasks).\n", 14 | "\n", 15 | "Users of **MIDASpy** must have **TensorFlow** installed as a **pip** package in their Python environment. **MIDASpy** is compatible with both **TensorFlow** 1.X and **TensorFlow** >= 2.2 versions.\n", 16 | "\n\nOnce these packages are installed, users can import the dependencies and load the data:" 17 | ], 18 | "metadata": {} 19 | }, 20 | { 21 | "cell_type": "code", 22 | "source": [ 23 | "from sklearn.preprocessing import MinMaxScaler\n", 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import tensorflow as tf\n", 27 | "import MIDASpy as md\n", 28 | "\n", 29 | "data_0 = pd.read_csv('adult_data.csv')\n", 30 | "data_0.columns.str.strip()" 31 | ], 32 | "outputs": [ 33 | { 34 | "output_type": "execute_result", 35 | "execution_count": 1, 36 | "data": { 37 | "text/plain": [ 38 | "Index(['Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education',\n", 39 | " 'education_num', 'marital_status', 'occupation', 'relationship', 'race',\n", 40 | " 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',\n", 41 | " 'native_country', 'class_labels'],\n", 42 | " dtype='object')" 43 | ] 44 | }, 45 | "metadata": {} 46 | } 47 | ], 48 | "execution_count": 1, 49 | "metadata": {} 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "source": [ 54 | "As the Adult dataset has very little missingness, we randomly set 5,000 observed values as missing in each column:" 55 | ], 56 | "metadata": {} 57 | }, 58 | { 59 | "cell_type": "code", 60 | "source": [ 61 | "np.random.seed(441)\n", 62 | "\n", 63 | "def spike_in_generation(data):\n", 64 | " spike_in = pd.DataFrame(np.zeros_like(data), columns= data.columns)\n", 65 | " for column in data.columns:\n", 66 | " subset = np.random.choice(data[column].index[data[column].notnull()], 5000, replace= False)\n", 67 | " spike_in.loc[subset, column] = 1\n", 68 | " return spike_in\n", 69 | "\n", 70 | "spike_in = spike_in_generation(data_0)\n", 71 | "original_value = data_0.loc[4, 'hours_per_week']\n", 72 | "data_0[spike_in == 1] = np.nan" 73 | ], 74 | "outputs": [], 75 | "execution_count": 2, 76 | "metadata": {} 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "source": [ 81 | "Next, we list categorical variables in a vector and one-hot encode them using **MIDASpy**'s inbuilt 
preprocessing function `cat_conv`, which returns both the encoded data and a nested list of categorical column names we can pass to the imputation algorithm. To construct the final, pre-processed data we append the one-hot encoded categorical data to the non-cateogrical data, and replace null values with `np.nan` values:" 82 | ], 83 | "metadata": {} 84 | }, 85 | { 86 | "cell_type": "code", 87 | "source": [ 88 | "categorical = ['workclass','marital_status','relationship','race','class_labels','sex','education','occupation','native_country']\n", 89 | "data_cat, cat_cols_list = md.cat_conv(data_0[categorical])\n", 90 | "\n", 91 | "data_0.drop(categorical, axis = 1, inplace = True)\n", 92 | "constructor_list = [data_0]\n", 93 | "constructor_list.append(data_cat)\n", 94 | "data_in = pd.concat(constructor_list, axis=1)\n", 95 | "\n", 96 | "na_loc = data_in.isnull()\n", 97 | "data_in[na_loc] = np.nan" 98 | ], 99 | "outputs": [], 100 | "execution_count": 3, 101 | "metadata": {} 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "source": [ 106 | "To visualize the results:\n" 107 | ], 108 | "metadata": {} 109 | }, 110 | { 111 | "cell_type": "code", 112 | "source": [ 113 | "print(data_in.head())" 114 | ], 115 | "outputs": [ 116 | { 117 | "output_type": "stream", 118 | "name": "stdout", 119 | "text": [ 120 | " Unnamed: 0 age fnlwgt education_num capital_gain capital_loss \\\n", 121 | "0 0.0 39.0 77516.0 13.0 2174.0 0.0 \n", 122 | "1 1.0 50.0 83311.0 13.0 0.0 0.0 \n", 123 | "2 2.0 38.0 215646.0 9.0 0.0 0.0 \n", 124 | "3 3.0 53.0 234721.0 NaN 0.0 0.0 \n", 125 | "4 4.0 28.0 NaN 13.0 0.0 NaN \n", 126 | "\n", 127 | " hours_per_week workclass_Federal-gov workclass_Local-gov \\\n", 128 | "0 40.0 0.0 0.0 \n", 129 | "1 13.0 0.0 0.0 \n", 130 | "2 40.0 0.0 0.0 \n", 131 | "3 40.0 0.0 0.0 \n", 132 | "4 NaN 0.0 0.0 \n", 133 | "\n", 134 | " workclass_Never-worked ... native_country_Portugal \\\n", 135 | "0 0.0 ... 0.0 \n", 136 | "1 0.0 ... 0.0 \n", 137 | "2 0.0 ... 0.0 \n", 138 | "3 0.0 ... 0.0 \n", 139 | "4 0.0 ... 0.0 \n", 140 | "\n", 141 | " native_country_Puerto-Rico native_country_Scotland native_country_South \\\n", 142 | "0 0.0 0.0 0.0 \n", 143 | "1 0.0 0.0 0.0 \n", 144 | "2 0.0 0.0 0.0 \n", 145 | "3 0.0 0.0 0.0 \n", 146 | "4 0.0 0.0 0.0 \n", 147 | "\n", 148 | " native_country_Taiwan native_country_Thailand \\\n", 149 | "0 0.0 0.0 \n", 150 | "1 0.0 0.0 \n", 151 | "2 0.0 0.0 \n", 152 | "3 0.0 0.0 \n", 153 | "4 0.0 0.0 \n", 154 | "\n", 155 | " native_country_Trinadad&Tobago native_country_United-States \\\n", 156 | "0 0.0 1.0 \n", 157 | "1 0.0 1.0 \n", 158 | "2 0.0 1.0 \n", 159 | "3 0.0 1.0 \n", 160 | "4 0.0 0.0 \n", 161 | "\n", 162 | " native_country_Vietnam native_country_Yugoslavia \n", 163 | "0 0.0 0.0 \n", 164 | "1 0.0 0.0 \n", 165 | "2 0.0 0.0 \n", 166 | "3 0.0 0.0 \n", 167 | "4 0.0 0.0 \n", 168 | "\n", 169 | "[5 rows x 108 columns]\n" 170 | ] 171 | } 172 | ], 173 | "execution_count": 4, 174 | "metadata": {} 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "source": [ 179 | "The data are now ready to be fed into the imputation algorithm, which involves three steps. First, we specify the dimensions, input corruption proportion, and other hyperparameters of the MIDAS neural network. Second, we build a MIDAS model based on the data. The vector of one-hot-encoded column names should be passed to the softmax_columns argument, as MIDAS employs a softmax final-layer activation function for categorical variables. 
Third, we train the model on the data, setting the number of training epochs as 20 in this example:" 180 | ], 181 | "metadata": {} 182 | }, 183 | { 184 | "cell_type": "code", 185 | "source": [ 186 | "imputer = md.Midas(layer_structure = [256,256], vae_layer = False, seed = 89, input_drop = 0.75)\n", 187 | "imputer.build_model(data_in, softmax_columns = cat_cols_list)\n", 188 | "imputer.train_model(training_epochs = 20)" 189 | ], 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "name": "stdout", 194 | "text": [ 195 | "Size index: [7, 8, 7, 6, 5, 2, 2, 16, 14, 41]\n", 196 | "\n", 197 | "Computation graph constructed\n", 198 | "\n", 199 | "Model initialised\n", 200 | "\n", 201 | "Epoch: 0 , loss: 131055.20626587074\n", 202 | "Epoch: 1 , loss: 94882.5758455009\n", 203 | "Epoch: 2 , loss: 90956.90158796997\n", 204 | "Epoch: 3 , loss: 88764.57763543885\n", 205 | "Epoch: 4 , loss: 85847.00143988573\n", 206 | "Epoch: 5 , loss: 80933.15996490518\n", 207 | "Epoch: 6 , loss: 76754.09316700627\n", 208 | "Epoch: 7 , loss: 75646.90740190858\n", 209 | "Epoch: 8 , loss: 74589.6067678469\n", 210 | "Epoch: 9 , loss: 74155.46380383252\n", 211 | "Epoch: 10 , loss: 74159.95000204784\n", 212 | "Epoch: 11 , loss: 74705.84092718402\n", 213 | "Epoch: 12 , loss: 73753.75950004607\n", 214 | "Epoch: 13 , loss: 73959.30564486403\n", 215 | "Epoch: 14 , loss: 73135.93429385444\n", 216 | "Epoch: 15 , loss: 74014.20066695508\n", 217 | "Epoch: 16 , loss: 73246.82324794705\n", 218 | "Epoch: 17 , loss: 74179.63132589798\n", 219 | "Epoch: 18 , loss: 73412.0879309418\n", 220 | "Epoch: 19 , loss: 73584.05688892529\n", 221 | "Training complete. Saving file...\n", 222 | "Model saved in file: tmp/MIDAS\n" 223 | ] 224 | }, 225 | { 226 | "output_type": "execute_result", 227 | "execution_count": 5, 228 | "data": { 229 | "text/plain": [ 230 | "" 231 | ] 232 | }, 233 | "metadata": {} 234 | } 235 | ], 236 | "execution_count": 5, 237 | "metadata": {} 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "source": [ 242 | "Once training is complete, we can generate any number of imputed datasets (M) using the `generate_samples` function (here we set M as 10). 
Users can then either write these imputations to separate .CSV files or work with them directly in Python:" 243 | ], 244 | "metadata": {} 245 | }, 246 | { 247 | "cell_type": "code", 248 | "source": [ 249 | "imputations = imputer.generate_samples(m=10).output_list \n", 250 | "\n", 251 | "# for i in imputations:\n", 252 | "# file_out = \"midas_imp_\" + str(n) + \".csv\"\n", 253 | "# i.to_csv(file_out, index=False)\n", 254 | "# n += 1" 255 | ], 256 | "outputs": [ 257 | { 258 | "output_type": "stream", 259 | "name": "stdout", 260 | "text": [ 261 | "INFO:tensorflow:Restoring parameters from tmp/MIDAS\n", 262 | "Model restored.\n" 263 | ] 264 | } 265 | ], 266 | "execution_count": 6, 267 | "metadata": {} 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "source": [ 272 | "Finally, using the list of generated imputations, we can estimate M separate regression models and combine the parameter and variance estimates (see Rubin 1987) using **MIDASpy's** `combine` function:" 273 | ], 274 | "metadata": {} 275 | }, 276 | { 277 | "cell_type": "code", 278 | "source": [ 279 | "model = md.combine(y_var = \"capital_gain\", \n", 280 | " X_vars = [\"education_num\",\"age\"],\n", 281 | " df_list = imputations)\n", 282 | "\nmodel" 283 | ], 284 | "outputs": [ 285 | { 286 | "output_type": "execute_result", 287 | "execution_count": 7, 288 | "data": { 289 | "text/plain": [ 290 | " term estimate std.error statistic df p.value\n", 291 | "0 const -936.114554 136.800095 -6.842938 75.658615 1.764065e-09\n", 292 | "1 education_num 67.955119 9.202229 7.384637 26.664184 6.556180e-08\n", 293 | "2 age 31.339538 2.383158 13.150427 522.516002 0.000000e+00" 294 | ], 295 | "text/html": [ 296 | "
\n", 297 | "\n", 310 | "\n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | "
termestimatestd.errorstatisticdfp.value
0const-936.114554136.800095-6.84293875.6586151.764065e-09
1education_num67.9551199.2022297.38463726.6641846.556180e-08
2age31.3395382.38315813.150427522.5160020.000000e+00
\n", 352 | "
" 353 | ] 354 | }, 355 | "metadata": {} 356 | } 357 | ], 358 | "execution_count": 7, 359 | "metadata": { 360 | "collapsed": false, 361 | "outputHidden": false, 362 | "inputHidden": false 363 | } 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "source": [ 368 | "### Handling categorical data post-imputation\n", 369 | "\nTo impute missing data in a categorical variable, we one-hot encode this variable and then impute the probability of each class for each observation. For example, the imputed version of the one-hot-encoded `workclass` variable is represented by 8 columns, one for each category in the data:" 370 | ], 371 | "metadata": {} 372 | }, 373 | { 374 | "cell_type": "code", 375 | "source": [ 376 | "workclasses = [x for x in imputations[0].columns if \"workclass\" in x]\n", 377 | "imputations[0][workclasses].head()" 378 | ], 379 | "outputs": [ 380 | { 381 | "output_type": "execute_result", 382 | "execution_count": 8, 383 | "data": { 384 | "text/plain": [ 385 | " workclass_Federal-gov workclass_Local-gov workclass_Never-worked \\\n", 386 | "0 0.0 0.0 0.0 \n", 387 | "1 0.0 0.0 0.0 \n", 388 | "2 0.0 0.0 0.0 \n", 389 | "3 0.0 0.0 0.0 \n", 390 | "4 0.0 0.0 0.0 \n", 391 | "\n", 392 | " workclass_Private workclass_Self-emp-inc workclass_Self-emp-not-inc \\\n", 393 | "0 0.0 0.0 0.0 \n", 394 | "1 0.0 0.0 1.0 \n", 395 | "2 1.0 0.0 0.0 \n", 396 | "3 1.0 0.0 0.0 \n", 397 | "4 1.0 0.0 0.0 \n", 398 | "\n", 399 | " workclass_State-gov workclass_Without-pay \n", 400 | "0 1.0 0.0 \n", 401 | "1 0.0 0.0 \n", 402 | "2 0.0 0.0 \n", 403 | "3 0.0 0.0 \n", 404 | "4 0.0 0.0 " 405 | ], 406 | "text/html": [ 407 | "
\n", 408 | "\n", 421 | "\n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | "
workclass_Federal-govworkclass_Local-govworkclass_Never-workedworkclass_Privateworkclass_Self-emp-incworkclass_Self-emp-not-incworkclass_State-govworkclass_Without-pay
00.00.00.00.00.00.01.00.0
10.00.00.00.00.01.00.00.0
20.00.00.01.00.00.00.00.0
30.00.00.01.00.00.00.00.0
40.00.00.01.00.00.00.00.0
\n", 493 | "
" 494 | ] 495 | }, 496 | "metadata": {} 497 | } 498 | ], 499 | "execution_count": 8, 500 | "metadata": {} 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "source": [ 505 | "If we want to analyze the original categorical version of the variable post-imputation, we can transform these probabilities back into a vector of labels. The simplest approach is to select the category with the highest probability for each observation. Having used `md.conv()` to one-hot encode categorical variables earlier, we can use the associated `cat_cols_list` object to do just that. The following code collapses all one-hot-encoded columns back into single categorical columns:" 506 | ], 507 | "metadata": {} 508 | }, 509 | { 510 | "cell_type": "code", 511 | "source": [ 512 | "flat_cats = [cat for variable in cat_cols_list for cat in variable]\n", 513 | "\n", 514 | "for i in range(len(imputations)):\n", 515 | " tmp_cat = [imputations[i][x].idxmax(axis=1) for x in cat_cols_list]\n", 516 | " cat_df = pd.DataFrame({categorical[i]:tmp_cat[i] for i in range(len(categorical))})\n", 517 | " imputations[i] = pd.concat([imputations[i], cat_df], axis = 1).drop(flat_cats, axis = 1)\n" 518 | ], 519 | "outputs": [], 520 | "execution_count": 9, 521 | "metadata": {} 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "source": [ 526 | "If we now inspect the imputations, we can see that the data are back to their original shape. For example, the `workclass` variable's categories correspond to the one-hot-encoded variables shown earlier:" 527 | ], 528 | "metadata": {} 529 | }, 530 | { 531 | "cell_type": "code", 532 | "source": [ 533 | "print(imputations[0].columns)\n", 534 | "\nimputations[0]['workclass'].head()" 535 | ], 536 | "outputs": [ 537 | { 538 | "output_type": "stream", 539 | "name": "stdout", 540 | "text": [ 541 | "Index(['Unnamed: 0', 'age', 'fnlwgt', 'education_num', 'capital_gain',\n", 542 | " 'capital_loss', 'hours_per_week', 'workclass', 'marital_status',\n", 543 | " 'relationship', 'race', 'class_labels', 'sex', 'education',\n", 544 | " 'occupation', 'native_country'],\n", 545 | " dtype='object')\n" 546 | ] 547 | }, 548 | { 549 | "output_type": "execute_result", 550 | "execution_count": 10, 551 | "data": { 552 | "text/plain": [ 553 | "0 workclass_State-gov\n", 554 | "1 workclass_Self-emp-not-inc\n", 555 | "2 workclass_Private\n", 556 | "3 workclass_Private\n", 557 | "4 workclass_Private\n", 558 | "Name: workclass, dtype: object" 559 | ] 560 | }, 561 | "metadata": {} 562 | } 563 | ], 564 | "execution_count": 10, 565 | "metadata": {} 566 | } 567 | ], 568 | "metadata": { 569 | "kernelspec": { 570 | "name": "python3", 571 | "language": "python", 572 | "display_name": "Python 3" 573 | }, 574 | "language_info": { 575 | "name": "python", 576 | "version": "3.8.3", 577 | "mimetype": "text/x-python", 578 | "codemirror_mode": { 579 | "name": "ipython", 580 | "version": 3 581 | }, 582 | "pygments_lexer": "ipython3", 583 | "nbconvert_exporter": "python", 584 | "file_extension": ".py" 585 | }, 586 | "kernel_info": { 587 | "name": "python3" 588 | }, 589 | "nteract": { 590 | "version": "0.12.3" 591 | }, 592 | "interpreter": { 593 | "hash": "88f65ce5382ce20a2dfcb3047ae19453970fdb3147747ad8e6ead051daaa71e6" 594 | } 595 | }, 596 | "nbformat": 4, 597 | "nbformat_minor": 2 598 | } 599 | -------------------------------------------------------------------------------- /Examples/midaspy_demo_cces.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "id": "609c91af", 6 | "metadata": {}, 7 | "source": [ 8 | "### __MIDASpy demonstration__" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "a1239507", 14 | "metadata": {}, 15 | "source": [ 16 | "MIDASpy's core functionalities are demonstrated here by using it to impute missing responses to the 2018 Cooperative Congressional Election Study (CCES), an electoral survey conducted in the United States whose size and complexity poses computational difficulties for many existing multiple imputation algorithms." 17 | ] 18 | }, 19 | { 20 | "attachments": {}, 21 | "cell_type": "markdown", 22 | "id": "e8752d9c", 23 | "metadata": {}, 24 | "source": [ 25 | "The full CCES has 525 columns and 60,000 rows, the latter corresponding to individual survey respondents. After removing variables that either require extensive preprocessing or are unhelpful for imputation purposes — open-ended string variables, time indices, and ZIP code variables — the dataset contains 349 columns. The vast majority of these variables are categorical and must therefore be one-hot encoded for most multiple imputation software packages — that is, each 1 × 60,000 categorical variable with K unique classes must be expanded into a K × 60,000 matrix of 1s and 0s — increasing their number to 1,914." 26 | ] 27 | }, 28 | { 29 | "attachments": {}, 30 | "cell_type": "markdown", 31 | "id": "48c09dfe", 32 | "metadata": {}, 33 | "source": [ 34 | "_**Loading and preprocessing the data**_" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "abb5b5f7", 40 | "metadata": {}, 41 | "source": [ 42 | "We begin by loading MIDASpy, its dependencies, and additional packages called in the workflow. We then read in the formatted CCES data and sort variables into continuous, binary, and categorical types." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "id": "e5e9ff71", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import numpy as np\n", 53 | "import pandas as pd\n", 54 | "import tensorflow as tf\n", 55 | "from sklearn.preprocessing import MinMaxScaler\n", 56 | "import sys\n", 57 | "import MIDASpy as md" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "id": "7608d31b", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "data_in = pd.read_csv(\"cces_jss_format.csv\")\n", 68 | "cont_vars = [\"citylength_1\",\"numchildren\",\"birthyr\"]\n", 69 | "vals = data_in.nunique()\n", 70 | "cat_vars = list(data_in.columns[(vals.values > 2) & ~(data_in.columns.isin(cont_vars))])\n", 71 | "bin_vars = list(data_in.columns[vals.values == 2])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "d325b60a", 77 | "metadata": {}, 78 | "source": [ 79 | "Next, we apply the `.binary_conv()` function to the list of binary variables (which are not in dummy form), before appending them and the continuous variables to a `constructor_list` object, the basis for our final preprocessed dataset." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "5e23551f", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "data_bin = data_in[bin_vars].apply(md.binary_conv)\n", 90 | "constructor_list = [data_in[cont_vars], data_bin]" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "7b6e0da5", 96 | "metadata": {}, 97 | "source": [ 98 | "To one-hot encode categorical variables, we apply the `.cat_conv()` function to a dataframe containing them. 
We concatenate the resulting matrix to the existing `constructor_list` object." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "id": "1bd9e10c", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "data_cat = data_in[cat_vars]\n", 109 | "data_oh, cat_col_list = md.cat_conv(data_cat)\n", 110 | "constructor_list.append(data_oh)\n", 111 | "data_0 = pd.concat(constructor_list, axis=1)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "6f598191", 117 | "metadata": {}, 118 | "source": [ 119 | "The final preprocessing step, which is nonessential, is to scale all variables between 0 and 1 to aid model convergence. We use scikit-learn’s `MinMaxScaler()` function for this step." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "afcc8148", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "scaler = MinMaxScaler()\n", 130 | "data_scaled = scaler.fit_transform(data_0)\n", 131 | "data_scaled = pd.DataFrame(data_scaled, columns = data_0.columns)\n", 132 | "na_loc = data_scaled.isnull()\n", 133 | "data_scaled[na_loc] = np.nan" 134 | ] 135 | }, 136 | { 137 | "attachments": {}, 138 | "cell_type": "markdown", 139 | "id": "c75e2495", 140 | "metadata": {}, 141 | "source": [ 142 | "_**Imputation**_" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "f9f88f8b", 148 | "metadata": {}, 149 | "source": [ 150 | "Once the data are preprocessed, training a MIDAS network with MIDASpy is straightforward. We declare an instance of the `Midas` class, pass our data to this object (including the sorted variable names) with the `.build_model()` function, and train the network for 10 epochs with the `.train_model()` function. For the purposes of this illustration, we maintain most of MIDASpy’s default hyperparameter settings." 
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 6, 156 | "id": "381c6ffc", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "imputer = md.Midas(layer_structure= [256,256], vae_layer = False, seed= 89, input_drop = 0.75)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "id": "a6d34c74", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "Size index: [3, 178, 6, 8, 6, 3, 3, 6, 6, 4, 3, 59, 3, 3, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 3, 5, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 6, 6, 6, 6, 6, 6, 6, 10, 10, 7, 4, 7, 8, 5, 8, 3, 5, 9, 5, 52, 17, 3, 3, 3, 3, 3, 6, 3, 23, 4, 7, 8, 12, 14, 11, 6, 6, 4, 7, 10, 5, 4, 4, 7, 3, 4, 6, 3, 7, 5, 4, 4, 4, 6, 5, 17, 51, 53, 53, 3, 98, 6, 6, 5, 17, 17, 4, 6, 3, 3, 3, 6, 6, 6, 10, 5, 5, 5, 5, 6, 5, 7, 5, 5, 5, 5, 224, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 51, 53, 53, 5, 51, 14, 5, 6, 5]\n", 174 | "\n", 175 | "Computation graph constructed\n", 176 | "\n" 177 | ] 178 | }, 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "" 183 | ] 184 | }, 185 | "execution_count": 7, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "imputer.build_model(data_scaled, binary_columns = bin_vars, softmax_columns = cat_col_list)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 8, 197 | "id": "7ab7e7ce", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Model initialised\n", 205 | "\n", 206 | "Epoch: 0 , loss: 186.26737846679688\n", 207 | "Epoch: 1 , loss: 169.38942487792968\n", 208 | "Epoch: 2 , loss: 163.48311638997396\n", 209 | "Epoch: 3 , loss: 159.68743997802736\n", 210 | "Epoch: 4 , loss: 157.04094825032553\n", 211 | "Epoch: 5 , loss: 154.82602157389323\n", 212 | "Epoch: 6 , loss: 153.35590602010092\n", 213 | "Epoch: 7 , loss: 152.05749235839843\n", 214 | "Epoch: 8 , loss: 151.08395079345703\n", 215 | "Epoch: 9 , loss: 150.22736969604492\n", 216 | "Training complete. Saving file...\n", 217 | "Model saved in file: tmp/MIDAS\n" 218 | ] 219 | }, 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "" 224 | ] 225 | }, 226 | "execution_count": 8, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "imputer.train_model(training_epochs = 10)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "ea0ffaca", 238 | "metadata": {}, 239 | "source": [ 240 | "Once the model is trained, we draw a list of 10 completed datasets. When datasets are very large, as in this case, we recommend accessing each one separately rather than simultaneously holding all of them in memory. We thus construct a dataset generator using the `.yield_samples()` function." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 9, 246 | "id": "fb0bd2da", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "imputations = imputer.yield_samples(m=10)" 251 | ] 252 | }, 253 | { 254 | "attachments": {}, 255 | "cell_type": "markdown", 256 | "id": "94f4131a", 257 | "metadata": {}, 258 | "source": [ 259 | "_**Analysis of completed datasets**_" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "c5f2c772", 265 | "metadata": {}, 266 | "source": [ 267 | "We analyze the 10 completed datasets using MIDASpy’s inbuilt `combine()` function. 
We estimate a simple linear probability model in which `\"CC18_415a\"`, a respondent’s degree of support for giving the United States Environmental Protection Agency power to regulate carbon dioxide emissions, is regressed on `\"age\" (2018 − \"birthyr\")`, a respondent’s age.\n", 268 | "\n", 269 | "Users can ensure exact reproducibility of analytical results by saving completed datasets to disk. The trained MIDAS model itself is also saved by default to the location specified in the `savepath` argument of `Midas()`.\n", 270 | "\n", 271 | "As we scaled the input dataset prior to imputation with the `MinMaxScaler()` function, for each completed dataset we first invert this transformation via scikit-learn’s `.inverse_transform()` function and also convert predicted probabilities for `CC18_415a` into binary categories using a threshold of 0.5. To save memory, we append only the subset of the data needed for analysis to a list." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 10, 277 | "id": "43f664b7", 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "analysis_dfs = []" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 11, 287 | "id": "c23fdeeb", 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "INFO:tensorflow:Restoring parameters from tmp/MIDAS\n", 295 | "Model restored.\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "for df in imputations:\n", 301 | "    df_unscaled = scaler.inverse_transform(df)\n", 302 | "    df_unscaled = pd.DataFrame(df_unscaled, columns = data_scaled.columns) \n", 303 | "    df['age'] = 2018 - df_unscaled['birthyr']\n", 304 | "    df['CC18_415a'] = np.where(df_unscaled['CC18_415a'] >= 0.5,1,0)\n", 305 | "    analysis_dfs.append(df.loc[:,[\"age\",\"CC18_415a\"]])" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 12, 311 | "id": "393ba27d", 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "model = md.combine(y_var = \"CC18_415a\", X_vars = [\"age\"], df_list = analysis_dfs)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 13, 321 | "id": "605ef806", 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/html": [ 327 | "
\n", 328 | "\n", 341 | "\n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | "
termestimatestd.errorstatisticdfp.value
0const0.9344930.005515169.4597003056.4212380.0
1age-0.0052590.000107-49.1606654565.1255180.0
\n", 374 | "
" 375 | ], 376 | "text/plain": [ 377 | " term estimate std.error statistic df p.value\n", 378 | "0 const 0.934493 0.005515 169.459700 3056.421238 0.0\n", 379 | "1 age -0.005259 0.000107 -49.160665 4565.125518 0.0" 380 | ] 381 | }, 382 | "execution_count": 13, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "model" 389 | ] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3 (ipykernel)", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.10.10" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 5 413 | } 414 | -------------------------------------------------------------------------------- /Examples/tmp/MIDAS.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/tmp/MIDAS.data-00000-of-00001 -------------------------------------------------------------------------------- /Examples/tmp/MIDAS.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/tmp/MIDAS.index -------------------------------------------------------------------------------- /Examples/tmp/MIDAS.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/tmp/MIDAS.meta -------------------------------------------------------------------------------- /Examples/tmp/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "MIDAS" 2 | all_model_checkpoint_paths: "MIDAS" 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MIDASpy/__init__.py: -------------------------------------------------------------------------------- 1 | from .midas_base import * 2 | -------------------------------------------------------------------------------- /MIDASpy/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MIDASpy/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /MIDASpy/__pycache__/midas_base.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/midas_base.cpython-312.pyc -------------------------------------------------------------------------------- /MIDASpy/__pycache__/midas_base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/midas_base.cpython-38.pyc -------------------------------------------------------------------------------- /MIDASpy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy_logo.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # MIDASpy 3 | 4 | [![PyPI Latest Release](https://img.shields.io/pypi/v/midaspy.svg)](https://pypi.org/project/midaspy/) 5 | [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)](https://pypi.org/project/midaspy/) 6 | [![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) 7 | [![CI Linux](https://github.com/edvinskis/MIDASpy/actions/workflows/testlinux.yml/badge.svg)](https://github.com/edvinskis/MIDASpy/actions/workflows/testlinux.yml) 8 | [![CI macOS](https://github.com/edvinskis/MIDASpy/actions/workflows/testmacos.yml/badge.svg)](https://github.com/edvinskis/MIDASpy/actions/workflows/testmacos.yml) 9 | [![CI Windows](https://github.com/edvinskis/MIDASpy/actions/workflows/testwindows.yml/badge.svg)](https://github.com/edvinskis/MIDASpy/actions/workflows/testwindows.yml) 10 | 11 | ## Overview 12 | 13 | **MIDASpy** is a Python package for multiply imputing missing data using 14 | deep learning methods. The **MIDASpy** algorithm offers significant 15 | accuracy and efficiency advantages over other multiple imputation 16 | strategies, particularly when applied to large datasets with complex 17 | features. 
In addition to implementing the algorithm, the package contains 18 | functions for processing data before and after model training, running 19 | imputation model diagnostics, generating multiple completed datasets, 20 | and estimating regression models on these datasets. 21 | 22 | For an implementation in R, see our **rMIDAS** repository 23 | [here](https://github.com/MIDASverse/rMIDAS). 24 | 25 | ## Background and suggested citations 26 | 27 | For more information on MIDAS, the method underlying the software, see: 28 | 29 | Lall, Ranjit, and Thomas Robinson. 2022. "The MIDAS Touch: Accurate and Scalable Missing-Data Imputation with Deep Learning." _Political Analysis_ 30, no. 2: 179-196. doi:10.1017/pan.2020.49. [Published version](https://ranjitlall.github.io/assets/pdf/Lall%20and%20Robinson%202022%20PA.pdf). [Accepted version](http://eprints.lse.ac.uk/108170/1/Lall_Robinson_PA_Forthcoming.pdf). 30 | 31 | Lall, Ranjit, and Thomas Robinson. 2023. "Efficient Multiple Imputation for Diverse Data in Python and R: MIDASpy and rMIDAS." _Journal of Statistical Software_ 107, no. 9: 1-38. doi:10.18637/jss.v107.i09. [Published version](https://ranjitlall.github.io/assets/pdf/Lall%20and%20Robinson%202023%20JSS.pdf). 32 | 33 | ## Installation 34 | 35 | To install via pip, enter the following command into the terminal: 36 | `pip install MIDASpy` 37 | 38 | The latest development version (potentially unstable) can be installed 39 | via the terminal with: 40 | `pip install git+https://github.com/MIDASverse/MIDASpy.git` 41 | 42 | MIDAS requires: 43 | 44 | - Python (>=3.6; <3.11) 45 | - Numpy (>=1.5) 46 | - Pandas (>=0.19) 47 | - TensorFlow (<2.12) 48 | - Matplotlib 49 | - Statmodels 50 | - Scipy 51 | - TensorFlow Addons (<0.20) 52 | 53 | Tensorflow also has a number of requirements, particularly if GPU acceleration is desired. See for details. 54 | 55 | ## Examples 56 | 57 | For a simple demonstration of **MIDASpy**, see our Jupyter Notebook 58 | [examples](https://github.com/MIDASverse/MIDASpy/blob/master/Examples/). 59 | 60 | ## Contributing to MIDASpy 61 | 62 | Interested in contributing to **MIDASpy**? We are looking to hire a research assistant to work part-time (flexibly) to help us build out new features and integrate our software with existing machine learning pipelines. You would be paid the standard research assistant rate at the University of Oxford. To apply, please send your CV (or a summary of relevant skills/experience) to ranjit.lall@sjc.ox.ac.uk. 63 | 64 | 65 | ## Version 1.4.0 (August 2024) 66 | 67 | - Adds support for non-negative output columns, with a `positive_columns` argument 68 | 69 | 70 | ## Version 1.3.1 (October 2023) 71 | 72 | - Minor update to reflect publication of accompanying article in Journal of Statistical Software 73 | - Further updates to make documentation and URLs consistent, including removing unused metadata 74 | 75 | ## Version 1.2.4 (August 2023) 76 | 77 | - Adds support for Python 3.9 and 3.10 78 | - Addresses deprecation warnings and other minor bug fixes 79 | - Resolves dependency issues and includes an updated `setup.py` file 80 | - Adds GitHub Actions workflows that trigger automatic tests on the latest Ubuntu, macOS, and Windows for Python versions 3.7 to 3.10 each time a push or pull request is made to the main branch 81 | - An additional Jupyter Notebook example that demonstrates the core functionalities of **MIDASpy** 82 | 83 | ## Version 1.2.3 (December 2022) 84 | 85 | *v1.2.3 adds support for installation on Apple Silicon hardware (i.e. 
M1 and M2 Macs).* 86 | 87 | ## Version 1.2.2 (July 2022) 88 | 89 | *v1.2.2 makes minor efficiency changes to the codebase. Full details are available in the Release logs.* 90 | 91 | ## Version 1.2.1 (January 2021) 92 | 93 | *v1.2.1 adds new pre-processing functionality and a multiple imputation regression function.* 94 | 95 | Users can now automatically preprocess binary and categorical columns prior to running the MIDAS algorithm using `binary_conv()` and `cat_conv()`. 96 | 97 | The new `combine()` function allows users to run regression analysis across the complete data, following Rubin’s combination rules. 98 | 99 | ## Previous versions 100 | 101 | *Version 1.1.1 (October 2020)* 102 | 103 | Key changes: 104 | 105 | - Update adds **full Tensorflow 2.X support**: 106 | 107 | - Users can now run the MIDAS algorithm in TensorFlow 2.X (TF1 support 108 | retained) 109 | 110 | - Tidier handling of random seed setting across both TensorFlow and 111 | NumPy 112 | 113 | - Fixes a minor dependency bug 114 | 115 | - Other minor bug fixes 116 | 117 | *Version 1.0.2 (September 2020)* 118 | 119 | Key changes: 120 | 121 | - Minor, mainly cosmetic, changes to the underlying source code. 122 | - Renamed ‘categorical\_columns’ argument in build\_model() to 123 | ‘binary\_columns’ to avoid confusion 124 | - Added plotting arguments to overimputation() method to suppress 125 | intermediary overimputation plots (plot\_main) and all plots 126 | (skip\_plot). 127 | - Changed overimputation() plot titles, labels and legends 128 | - Added tensorflow 2.0 version check on import 129 | - Fixed seed-setting bug in earlier versions 130 | 131 | *Alpha 0.2:* 132 | 133 | Variational autoencoder enabled. More flexibility in model 134 | specification, although defaulting to a simple mirrored system. Deeper 135 | analysis tools within .overimpute() for checking fit on continuous 136 | values. Constructor code deconflicted. Individual output specification 137 | enabled for very large datasets. 138 | 139 | Key added features: 140 | 141 | - Variational autoencoder capacity added, including encoding to and 142 | sampling from latent space 143 | 144 | Planned features: 145 | 146 | - Time dependence handling through recurrent cells 147 | - Improving the pipeline methods for very large datasets 148 | - Tensorboard integration 149 | - Dropout scaling 150 | - A modified constructor that can generate embeddings for better 151 | interpolation of features 152 | - R support 153 | 154 | Wish list: 155 | 156 | - Smoothing for time series (LOESS?) 157 | - Informative priors? 158 | 159 | *Alpha 0.1:* 160 | 161 | - Basic functionality feature-complete. 
162 | - Support for mixed categorical and continuous data types 163 | - An “additional data” pipeline, allowing data that may be relevant to 164 | the imputation to be included (without being included in error 165 | generating statistics) 166 | - Simplified calibration for model complexity through the 167 | “overimputation” function, including visualization of 168 | reconstructed features 169 | - Basic large dataset functionality 170 | -------------------------------------------------------------------------------- /build/lib/MIDASpy/__init__.py: -------------------------------------------------------------------------------- 1 | from .midas_base import * 2 | -------------------------------------------------------------------------------- /build/lib/MIDASpy/midas_base.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # 3 | # 888b d888 8888888 8888888b. d8888 .d8888b. 4 | # 8888b d8888 888 888 "Y88b d88888 d88P Y88b 5 | # 88888b.d88888 888 888 888 d88P888 Y88b. 6 | # 888Y88888P888 888 888 888 d88P 888 "Y888b. 7 | # 888 Y888P 888 888 888 888 d88P 888 "Y88b. 8 | # 888 Y8P 888 888 888 888 d88P 888 "888 9 | # 888 " 888 888 888 .d88P d8888888888 Y88b d88P 10 | # 888 888 8888888 8888888P" d88P 888 "Y8888P" 11 | # 12 | # --- Multiple Imputation with Denoising Autoencoders 13 | # Copyright 2020 Ranjit Lall, Alex Stenlake, and Thomas Robinson. All Rights Reserved. 14 | # 15 | # Licensed under the Apache License, Version 2.0 (the "License"); 16 | # you may not use this file except in compliance with the License. 17 | # You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, software 22 | # distributed under the License is distributed on an "AS IS" BASIS, 23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | # See the License for the specific language governing permissions and 25 | # limitations under the License. 26 | # ============================================================================== 27 | 28 | import matplotlib.pyplot as plt 29 | import numpy as np 30 | import pandas as pd 31 | import tensorflow as tf 32 | import os 33 | import statsmodels.api as sm 34 | from scipy import stats 35 | 36 | if tf.__version__[0] == '2': 37 | import tensorflow_addons as tfa 38 | 39 | from sklearn.metrics import mean_squared_error as mse 40 | from typing import List, Optional 41 | import random 42 | 43 | 44 | class Midas(object): 45 | """ 46 | MIDASpy is a Python package for multiply imputing missing data using deep learning methods. The MIDASpy algorithm 47 | offers significant accuracy and efficiency advantages over other multiple imputation strategies, particularly when 48 | applied to large datasets with complex features. In addition to implementing the algorithm, the package contains 49 | functions for processing data before and after model training, running imputation model diagnostics, generating 50 | multiple completed datasets, and estimating regression models on these datasets. 
51 | """ 52 | def __init__(self, 53 | layer_structure: Optional[List[int]] = None, 54 | learn_rate: float = 1e-4, 55 | input_drop: float = 0.8, 56 | train_batch: int = 16, 57 | savepath: str = 'tmp/MIDAS', 58 | seed: (int, type(None)) = None, 59 | output_layers: str = 'reversed', 60 | loss_scale: int = 1, 61 | init_scale: int = 1, 62 | vae_layer: bool = False, 63 | individual_outputs: bool = False, 64 | manual_outputs: bool = False, 65 | output_structure: Optional[List[int]] = None, 66 | latent_space_size: int = 4, 67 | cont_adj: float = 1.0, 68 | binary_adj: float = 1.0, 69 | softmax_adj: float = 1.0, 70 | dropout_level: float = 0.5, 71 | weight_decay: str = 'default', 72 | vae_alpha: float = 1.0, 73 | act=tf.nn.elu, 74 | vae_sample_var: float = 1.0, 75 | noise_type: str = 'bernoulli', 76 | kld_min: float = 0.01, 77 | ): 78 | """ 79 | Initialiser. Called separately to 'build_model' to allow for out-of-memory 80 | datasets. All key hyperparameters are entered at this stage, as the model 81 | construction methods only deal with the dataset. 82 | 83 | Args: 84 | layer_structure: List of integers. The number of nodes in each layer of the 85 | network (default = [256, 256, 256], denoting a three-layer network with 256 86 | nodes per layer). Larger networks can learn more complex data structures but 87 | require longer training and are more prone to overfitting. 88 | 89 | learn_rate: Float. The learning rate (gamma; default = 0.0001), which 90 | controls the size of the weight adjustment in each training epoch. In general, 91 | higher values reduce training time at the expense of less accurate results. 92 | 93 | input_drop: Float between 0 and 1. The probability of corruption for input 94 | columns in training mini-batches (default = 0.8). Higher values increase 95 | training time but reduce the risk of overfitting. In our experience, values 96 | between 0.7 and 0.95 deliver the best performance. 97 | 98 | train_batch: Integer. The number of observations in training mini-batches 99 | (default = 16). Common choices are 8, 16, 32, 64, and 128; powers of 2 tend to 100 | enhance memory efficiency. In general, smaller sizes lead to faster convergence 101 | at the cost of greater noise and thus less accurate estimates of the error 102 | gradient. Where memory management is a concern, they should be favored. 103 | 104 | savepath: String. The location to which the trained model will be saved. 105 | 106 | seed: Integer. The value to which Python's pseudo-random number 107 | generator is initialized. This enables users to ensure that data shuffling, 108 | weight and bias initialization, and missingness indicator vectors are 109 | reproducible. 110 | 111 | loss_scale: Float. A constant by which the RMSE loss functions are multiplied 112 | (default = 1). This hyperparameter performs a similar function to the learning 113 | rate. If loss during training is very large, increasing its value can help to 114 | prevent overtraining. 115 | 116 | init_scale: Float. The numerator of the variance component of Xavier Initialisation 117 | equation (default = 1). In very deep networks, higher values may help to prevent 118 | extreme gradients (though this problem is less common with ELU activation functions). 119 | 120 | softmax_adj: Float. A constant by which the cross-entropy loss functions are 121 | multiplied (default = 1). This hyperparameter is the equivalent of loss_scale 122 | for categorical variables. 
If cross-entropy loss falls at a consistently faster 123 | rate than RMSE during training, a lower value may help to redress this imbalance. 124 | 125 | vae_layer: Boolean. Specifies whether to include a variational autoencoder layer in 126 | the network (default = False), one of the key diagnostic tools included in midas. 127 | If set to true, variational autoencoder hyperparameters must be specified via a number 128 | of additional arguments. 129 | 130 | latent_space_size: Integer. The number of normal dimensions used to parameterize the 131 | latent space when vae_layer = True. 132 | 133 | vae_sample_var: Float. The sampling variance of the normal distributions used to 134 | parameterize the latent space when vae_layer = True. 135 | 136 | vae_alpha: Float. The strength of the prior imposed on the Kullback-Leibler divergence term 137 | in the variational autoencoder loss functions. 138 | 139 | kld_min: Float. The minimum value of the Kullback-Leibler divergence term in the variational 140 | autoencoder loss functions. 141 | 142 | Returns: 143 | Self 144 | 145 | """ 146 | # Sanity Check layer_structure: 147 | if not layer_structure: 148 | layer_structure = [256, 256, 256] 149 | if not isinstance(layer_structure, list): 150 | raise TypeError("The layer structure must be specified within a list type.") 151 | if not all(isinstance(v, int) for v in layer_structure): 152 | raise ValueError("The elements of the layer_structure must all be specified as integer types.") 153 | 154 | # Sanity Check output_layers: 155 | if not isinstance(output_layers, (str, list)): 156 | raise TypeError("The 'output_layers' argument must be a string or a list type.") 157 | if isinstance(output_layers, str): 158 | if not output_layers == "reversed": 159 | raise ValueError("The only string argument accepted for output_layers is 'reversed'.") 160 | self.output_layers = layer_structure.copy() 161 | self.output_layers.reverse() 162 | if isinstance(output_layers, list): 163 | self.output_layers = output_layers 164 | 165 | # Sanity Check weight_decay: 166 | if not isinstance(weight_decay, (str, float)): 167 | raise TypeError("The 'weight_decay' argument must be a string or float type.") 168 | if isinstance(weight_decay, str): 169 | if not weight_decay == 'default': 170 | raise ValueError("The 'weight_decay' argument must be 'default' if a string.") 171 | self.weight_decay = 'default' 172 | if isinstance(weight_decay, float): 173 | self.weight_decay = weight_decay 174 | 175 | # Sanity Check output_structure: 176 | if output_structure is None: 177 | output_structure = [16, 16, 32] 178 | if isinstance(output_structure, int): 179 | self.output_structure = [output_structure] * 3 180 | elif (individual_outputs is True) | (len(output_structure) == 3): 181 | self.output_structure = output_structure 182 | else: 183 | raise TypeError("The output transform assignment must take the form of an integer, a list of three " 184 | "elements (cont, bin, cat), or individual values must be specified.") 185 | 186 | if seed is not None: 187 | os.environ['PYTHONHASHSEED'] = str(seed) 188 | os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 189 | os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 190 | tf.compat.v1.set_random_seed(seed) 191 | 192 | # Sanity Check savepath: 193 | if not isinstance(savepath, str): 194 | raise TypeError("The 'savepath' argument must be a string type.") 195 | if os.path.exists(savepath) and not os.path.isdir(savepath): 196 | raise FileExistsError("The passed argument was a file, not a directory.") 197 | if not 
os.path.exists(savepath): 198 | os.makedirs(savepath) 199 | 200 | self.layer_structure = layer_structure 201 | self.learn_rate = learn_rate 202 | self.input_drop = input_drop 203 | self.model_built = False 204 | self.savepath = savepath 205 | self.model = None 206 | self.additional_data = None 207 | self.train_batch = train_batch 208 | self.seed = seed 209 | self.input_is_pipeline = False 210 | self.input_pipeline = None 211 | self.vae_layer = vae_layer 212 | self.loss_scale = loss_scale 213 | self.init_scale = init_scale 214 | self.individual_outputs = individual_outputs 215 | self.manual_outputs = manual_outputs 216 | self.vae_sample_var = vae_sample_var 217 | self.latent_space_size = latent_space_size 218 | self.dropout_level = dropout_level 219 | self.prior_strength = vae_alpha 220 | self.kld_min = kld_min 221 | self.seed = seed 222 | self.cont_adj = cont_adj 223 | self.binary_adj = binary_adj 224 | self.softmax_adj = softmax_adj 225 | self.act = act 226 | self.noise_type = noise_type 227 | 228 | def _batch_iter(self, 229 | train_data, 230 | na_mask, 231 | b_size=16, 232 | rng=np.random): 233 | """ 234 | Function for handling the batch feeds for training loops 235 | """ 236 | indices = np.arange(train_data.shape[0]) 237 | rng.shuffle(indices) 238 | 239 | for start_idx in range(0, train_data.shape[0] - b_size + 1, b_size): 240 | excerpt = indices[start_idx:start_idx + b_size] 241 | if self.additional_data is None: 242 | yield train_data[excerpt], na_mask[excerpt] 243 | else: 244 | yield train_data[excerpt], na_mask[excerpt], self.additional_data.values[excerpt] 245 | 246 | def _batch_iter_output(self, 247 | train_data, 248 | b_size=256): 249 | """ 250 | Identical to _batch_iter(), although designed for a single datasource 251 | """ 252 | 253 | indices = np.arange(train_data.shape[0]) 254 | for start_idx in range(0, train_data.shape[0], b_size): 255 | excerpt = indices[start_idx:start_idx + b_size] 256 | if self.additional_data is None: 257 | yield train_data[excerpt] 258 | else: 259 | yield train_data[excerpt], self.additional_data.values[excerpt] 260 | 261 | @staticmethod 262 | def _batch_iter_zsample(data, 263 | b_size: int = 256): 264 | """ 265 | Identical to _batch_iter(), although designed for sampling from latent 266 | """ 267 | indices = np.arange(data.shape[0]) 268 | for start_idx in range(0, data.shape[0], b_size): 269 | excerpt = indices[start_idx:start_idx + b_size] 270 | yield data[excerpt] 271 | 272 | def _build_layer(self, 273 | X, 274 | weight_matrix, 275 | bias_vec, 276 | dropout_rate=0.5, 277 | output_layer=False): 278 | """ 279 | Constructs layers for the build function 280 | """ 281 | X_tx = tf.matmul(tf.compat.v1.nn.dropout(X, 282 | rate=(1 - dropout_rate)), 283 | weight_matrix) + bias_vec 284 | if output_layer: 285 | return X_tx 286 | else: 287 | return self.act(X_tx) 288 | 289 | @staticmethod 290 | def _build_variables(weights, 291 | biases, 292 | num_in, 293 | num_out, 294 | scale=1): 295 | """ 296 | Custom initialiser for a weights, using a variation on Xavier initialisation 297 | with smaller starting weights. 
Allows for faster convergence on low learn 298 | rates, useful in the presence of multiple loss functions 299 | """ 300 | weights.append(tf.Variable(tf.random.truncated_normal([num_in, num_out], 301 | mean=0, 302 | stddev=scale / np.sqrt(num_in + num_out)))) 303 | biases.append(tf.Variable(tf.zeros([num_out]))) # Bias can be zero 304 | return weights, biases 305 | 306 | @staticmethod 307 | def _sort_cols(data, 308 | subset): 309 | """ 310 | This function is used to sequence the columns of the dataset, so as to be in 311 | the order [Continuous data], [Binary data], [Categorical data]. It simply 312 | rearranges a column, done functionally to minimise memory overhead 313 | """ 314 | if not isinstance(subset, list): 315 | subset = list(subset) 316 | data_1 = data[subset] 317 | data_0 = data.drop(subset, axis=1) 318 | chunk = data_1.shape[1] 319 | return pd.concat([data_0, data_1], axis=1), chunk 320 | 321 | def build_model(self, 322 | imputation_target, 323 | binary_columns=None, 324 | softmax_columns=None, 325 | unsorted=True, 326 | additional_data=None, 327 | verbose=True, 328 | ): 329 | """ 330 | This method is called to construct the neural network that is the heart of 331 | MIDAS. This includes the assignment of loss functions to the appropriate 332 | data types. 333 | 334 | THIS FUNCTION MUST BE CALLED BEFORE ANY TRAINING OR IMPUTATION OCCURS. Failing 335 | to do so will simply raise an error. 336 | 337 | The categorical columns should be a list of column names. Softmax columns 338 | should be a list of lists of column names. This will allow the model to 339 | dynamically assign cost functions to the correct variables. If, however, 340 | the data comes pre-sorted, arranged can be set to "true", in which case 341 | the arguments can be passed in as integers of size, ie. shape[1] attributes 342 | for each of the relevant categories. 343 | 344 | In other words, if you're experienced at using MIDAS and understand how its 345 | indexing works, pre-sort your data and pass in the integers so specifying 346 | reindexing values doesn't become too onerous. 347 | 348 | Alternatively, list(df.columns.values) will output a list of column names, 349 | which can be easily implemented in the 'for' loop which constructs your dummy 350 | variables. 351 | 352 | Args: 353 | imputation_target: DataFrame. The name of the incomplete input dataset. 354 | Upon being read in, the dataset will be appropriately formatted and stored 355 | for training. 356 | 357 | binary_columns: List of names. A list of all binary variables in the input 358 | dataset. 359 | 360 | softmax_columns: List of lists. The outer list should include all non-binary 361 | categorical variables in the input dataset. Each inner list should contain 362 | the mutually exclusive set of possible classes for each of these variables. 363 | 364 | unsorted: Boolean. Specifies whether the input dataset has been pre-ordered 365 | in terms of variable type (default = True, denoting no sorting). If 366 | set to False, binary_columns and softmax_columns should be a list of integers 367 | denoting shape attributes for each category. 368 | 369 | additional_data: DataFrame. Data that should be included in the imputation 370 | model but are not required for later analyses. Such data will not be 371 | formatted, rearranged, or included in the loss functions, reducing training 372 | time. 373 | 374 | verbose: Boolean. Specifies whether to print messages to the terminal 375 | (default = True). 
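        Example of constructing these column arguments (an added sketch: the toy
        DataFrame and column names are hypothetical, and the package's cat_conv()
        helper can produce an equivalent structure automatically):

            import numpy as np
            import pandas as pd

            raw = pd.DataFrame({"age": [25.0, 40.0, np.nan],
                                "sex": ["m", "f", "m"],
                                "job": ["a", "b", np.nan]})

            softmax_columns = []
            parts = [raw[["age"]]]
            for col in ["sex", "job"]:
                dummies = pd.get_dummies(raw[col], prefix=col)
                dummies.loc[raw[col].isnull(), :] = np.nan     # keep missingness visible
                softmax_columns.append(list(dummies.columns))  # one inner list per variable
                parts.append(dummies)
            data_in = pd.concat(parts, axis=1)
            # data_in and softmax_columns can now be passed to this method.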
376 | 377 | Returns: 378 | Self 379 | 380 | """ 381 | if not isinstance(imputation_target, pd.DataFrame): 382 | raise TypeError("Input data must be in a DataFrame") 383 | if imputation_target.isnull().sum().sum() == 0: 384 | raise ValueError("Imputation target contains no missing values. Please ensure " 385 | "missing values are encoded as type np.nan") 386 | self.original_columns = imputation_target.columns 387 | cont_exists = False 388 | cat_exists = False 389 | in_size = imputation_target.shape[1] 390 | if additional_data is not None: 391 | add_size = additional_data.shape[1] 392 | else: 393 | add_size = 0 394 | 395 | # Establishing indices for cost function 396 | size_index = [] 397 | if binary_columns is not None: 398 | if unsorted: 399 | imputation_target, chunk = self._sort_cols(imputation_target, 400 | binary_columns) 401 | size_index.append(chunk) 402 | else: 403 | size_index.append(binary_columns) 404 | cat_exists = True 405 | if softmax_columns is not None: 406 | if unsorted: 407 | for subset in softmax_columns: 408 | imputation_target, chunk = self._sort_cols(imputation_target, 409 | subset) 410 | size_index.append(chunk) 411 | else: 412 | for digit in softmax_columns: 413 | size_index.append(digit) 414 | if sum(size_index) < in_size: 415 | chunk = in_size - sum(size_index) 416 | size_index.insert(0, chunk) 417 | cont_exists = True 418 | if not sum(size_index) == in_size: 419 | raise ValueError("Sorting columns has failed") 420 | if verbose: 421 | print("Size index:", size_index) 422 | 423 | # Commit some variables to the instance of the class 424 | self.size_index = size_index 425 | if not self.input_is_pipeline: 426 | self.na_matrix = imputation_target.notnull().astype(bool) 427 | self.imputation_target = imputation_target.fillna(0) 428 | if additional_data is not None: 429 | self.additional_data = additional_data.fillna(0) 430 | 431 | # Build graph 432 | tf.compat.v1.reset_default_graph() 433 | self.graph = tf.Graph() 434 | with self.graph.as_default(): 435 | if self.seed is not None: 436 | # np.random.seed(self.seed) 437 | tf.compat.v1.set_random_seed(self.seed) 438 | 439 | # Placeholders 440 | self.X = tf.compat.v1.placeholder(tf.float32, [None, in_size]) 441 | self.na_idx = tf.compat.v1.placeholder(tf.bool, [None, in_size]) 442 | if additional_data is not None: 443 | self.X_add = tf.compat.v1.placeholder(tf.float32, [None, add_size]) 444 | if self.vae_layer: 445 | self.latent_inputs = tf.compat.v1.placeholder(tf.float32, [None, self.latent_space_size]) 446 | 447 | # Build list for determining input and output structures 448 | struc_list = self.layer_structure.copy() 449 | struc_list.insert(0, in_size + add_size) 450 | outputs_struc = [] 451 | for n in range(len(size_index)): 452 | if n == 0: 453 | if cont_exists: 454 | outputs_struc += ["cont"] * size_index[n] 455 | elif cat_exists: 456 | outputs_struc += ["bin"] * size_index[n] 457 | 458 | else: 459 | outputs_struc += [size_index[n]] 460 | 461 | elif n == 1: 462 | if cont_exists and cat_exists: 463 | outputs_struc += ["bin"] * size_index[n] 464 | 465 | else: 466 | outputs_struc += [size_index[n]] 467 | else: 468 | outputs_struc += [size_index[n]] 469 | 470 | if self.manual_outputs is True: 471 | output_layer_size = np.sum(self.output_structure) 472 | output_layer_structure = self.output_structure 473 | else: 474 | output_layer_structure = [] 475 | for item in outputs_struc: 476 | if item == "cont": 477 | output_layer_structure.append(self.output_structure[0]) 478 | if item == "bin": 479 | 
output_layer_structure.append(self.output_structure[1]) 480 | if type(item) == int: 481 | output_layer_structure.append(self.output_structure[2]) 482 | output_layer_size = np.sum(output_layer_structure) 483 | 484 | # Instantiate and initialise variables 485 | _w = [] 486 | _b = [] 487 | _zw = [] 488 | _zb = [] 489 | _ow = [] 490 | _ob = [] 491 | 492 | # Input, denoising 493 | for n in range(len(struc_list) - 1): 494 | _w, _b = self._build_variables(weights=_w, biases=_b, 495 | num_in=struc_list[n], 496 | num_out=struc_list[n + 1], 497 | scale=self.init_scale) 498 | if self.vae_layer: 499 | mapped_dist = tf.compat.v1.distributions.Normal(tf.constant(0.), 500 | tf.constant(self.vae_sample_var)) 501 | # mapped_dist = tf.distributions.StudentT(tf.constant(3.0), 502 | # tf.constant(0.0), 503 | # tf.constant(1.0)) 504 | # Latent state, variance 505 | _zw, _wb = self._build_variables(weights=_zw, biases=_zb, 506 | num_in=struc_list[-1], 507 | num_out=self.latent_space_size * 2, 508 | scale=self.init_scale) 509 | _zw, _wb = self._build_variables(weights=_zw, biases=_zb, 510 | num_in=self.latent_space_size, 511 | num_out=self.output_layers[0], 512 | scale=self.init_scale) 513 | 514 | t_l = len(self.output_layers) 515 | # Output, specialisation 516 | assert len(output_layer_structure) == len(outputs_struc) 517 | output_split = [] 518 | if self.individual_outputs: 519 | self.output_layers.append(output_layer_size) 520 | for n in range(t_l): 521 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 522 | num_in=self.output_layers[n], 523 | num_out=self.output_layers[n + 1], 524 | scale=self.init_scale) 525 | for n in range(len(outputs_struc)): 526 | if type(outputs_struc[n]) == str: 527 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 528 | num_in=output_layer_structure[n], 529 | num_out=1, 530 | scale=self.init_scale) 531 | output_split.append(1) 532 | elif type(outputs_struc[n]) == int: 533 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 534 | num_in=output_layer_structure[n], 535 | num_out=outputs_struc[n], 536 | scale=self.init_scale) 537 | output_split.append(outputs_struc[n]) 538 | else: 539 | self.output_layers.append(in_size) 540 | for n in range(t_l): 541 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 542 | num_in=self.output_layers[n], 543 | num_out=self.output_layers[n + 1]) 544 | for n in range(len(outputs_struc)): 545 | if type(outputs_struc[n]) == str: 546 | output_split.append(1) 547 | elif type(outputs_struc[n]) == int: 548 | output_split.append(outputs_struc[n]) 549 | 550 | # Build the neural network. 
Each layer is determined by the struc list 551 | def denoise(X): 552 | # Input tx 553 | for n in range(len(struc_list) - 1): 554 | if n == 0: 555 | if self.noise_type == 'bernoulli': 556 | X = self._build_layer(X, _w[n], _b[n], 557 | dropout_rate=self.input_drop) 558 | elif self.noise_type == 'gaussian': 559 | X = X + tf.compat.v1.distributions.Normal(loc=tf.constant(0.), 560 | scale=tf.constant(self.input_drop)).sample( 561 | sample_shape=tf.shape(input=X)) 562 | X = self._build_layer(X, _w[n], _b[n], 563 | dropout_rate=self.input_drop) 564 | else: 565 | X = self._build_layer(X, _w[n], _b[n], 566 | dropout_rate=self.dropout_level) 567 | return X 568 | 569 | if self.vae_layer: 570 | def to_z(X): 571 | # Latent tx 572 | X = self._build_layer(X, _zw[0], _zb[0], dropout_rate=self.dropout_level, 573 | output_layer=True) 574 | x_mu, x_log_sigma = tf.split(X, [self.latent_space_size] * 2, axis=1) 575 | return x_mu, x_log_sigma 576 | 577 | def from_z(z): 578 | # Joint transform 579 | X = self._build_layer(z, _zw[1], _zb[1], dropout_rate=1) 580 | return X 581 | 582 | def vae(X, output=False): 583 | x_mu, x_log_sigma = to_z(X) 584 | if output: 585 | reparam_z = mapped_dist.sample(sample_shape=tf.shape(input=x_mu)) 586 | # reparam_z = tf.random_normal(tf.shape(x_mu)) 587 | else: 588 | reparam_z = tf.random.normal(tf.shape(input=x_mu)) 589 | z = x_mu + reparam_z * tf.exp(x_log_sigma) 590 | kld = tf.maximum( 591 | tf.reduce_mean(input_tensor=1 + 2 * x_log_sigma * x_mu ** 2 - tf.exp(2 - x_log_sigma), 592 | axis=1) * self.prior_strength * - 0.5, 593 | self.kld_min) 594 | X = from_z(z) 595 | return X, kld 596 | 597 | if self.individual_outputs: 598 | def decode(X): 599 | for n in range(t_l): 600 | X = self._build_layer(X, _ow[n], _ob[n], dropout_rate=self.dropout_level) 601 | # Output tx 602 | base_splits = tf.split(X, output_layer_structure, axis=1) 603 | decombined = [] 604 | for n in range(len(outputs_struc)): 605 | decombined.append(self._build_layer(base_splits[n], _ow[n + t_l], _ob[n + t_l], 606 | dropout_rate=self.dropout_level, 607 | output_layer=True)) 608 | return decombined 609 | 610 | else: 611 | def decode(X): 612 | for n in range(t_l): 613 | if n == t_l - 1: 614 | X = self._build_layer(X, _ow[n], _ob[n], 615 | dropout_rate=self.dropout_level, 616 | output_layer=True) 617 | else: 618 | X = self._build_layer(X, _ow[n], _ob[n], 619 | dropout_rate=self.dropout_level) 620 | decombined = tf.split(X, output_split, axis=1) 621 | return decombined 622 | 623 | if self.vae_layer: 624 | def decode_z(z): 625 | X = from_z(z) 626 | X = decode(X) 627 | return X 628 | 629 | # Determine which imputation function is to be used. This is constructed to 630 | # take advantage of additional data provided. 
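            # --- Added explanatory sketch (editor's note, not original code) ---
            # The vae() helper defined above applies the standard reparameterisation
            # trick: a latent draw is formed as z = mu + eps * exp(log_sigma), with
            # eps ~ N(0, 1), so gradients can flow through mu and log_sigma. A purely
            # illustrative NumPy analogue, independent of the graph built here:
            #
            #     import numpy as np
            #     rng = np.random.default_rng(0)
            #     x_mu = np.zeros(4)                       # stand-in for the encoded mean
            #     x_log_sigma = np.zeros(4)                # stand-in for the encoded log-sd
            #     eps = rng.standard_normal(4)
            #     z = x_mu + eps * np.exp(x_log_sigma)     # mirrors the line in vae() above
            #
            # The block below wires denoise()/vae()/decode() together, using any
            # additional data that was provided.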
631 | if additional_data is not None: 632 | encoded = denoise(tf.concat([self.X, self.X_add], axis=1)) 633 | else: 634 | encoded = denoise(self.X) 635 | 636 | if self.vae_layer: 637 | perturb, kld = vae(encoded) 638 | perturb_out, _ = vae(encoded, True) 639 | pred_split = decode(perturb) 640 | out_split = decode(perturb_out) 641 | else: 642 | pred_split = decode(encoded) 643 | 644 | # Output functions 645 | cost_list = [] 646 | self.output_types = [] 647 | 648 | # Build L2 loss and KL-Divergence 649 | if self.weight_decay == 'default': 650 | lmbda = 1 / self.imputation_target.shape[0] 651 | else: 652 | lmbda = self.weight_decay 653 | # if self.vae_layer: 654 | # l2_penalty = tf.multiply(tf.reduce_mean( 655 | # [tf.nn.l2_loss(w) for w in _w]+\ 656 | # [tf.nn.l2_loss(w) for w in _zw]+\ 657 | # [tf.nn.l2_loss(w) for w in _ow] 658 | # ), lmbda) 659 | # else: 660 | # l2_penalty = tf.multiply(tf.reduce_mean( 661 | # [tf.nn.l2_loss(w) for w in _w]+\ 662 | # [tf.nn.l2_loss(w) for w in _ow] 663 | # ), lmbda) 664 | 665 | # Assign cost and loss functions 666 | na_split = tf.split(self.na_idx, output_split, axis=1) 667 | true_split = tf.split(self.X, output_split, axis=1) 668 | for n in range(len(outputs_struc)): 669 | na_adj = tf.cast(tf.math.count_nonzero(na_split[n]), tf.float32) \ 670 | / tf.cast(tf.size(input=na_split[n]), tf.float32) 671 | if outputs_struc[n] == 'cont': 672 | if 'rmse' not in self.output_types: 673 | self.output_types.append('rmse') 674 | cost_list.append(tf.sqrt( 675 | tf.compat.v1.losses.mean_squared_error(tf.boolean_mask(tensor=true_split[n], mask=na_split[n]), 676 | tf.boolean_mask(tensor=pred_split[n], mask=na_split[n]) 677 | )) * self.cont_adj * na_adj) 678 | elif outputs_struc[n] == 'bin': 679 | if 'bacc' not in self.output_types: 680 | self.output_types.append('bacc') 681 | cost_list.append( 682 | tf.compat.v1.losses.sigmoid_cross_entropy( 683 | tf.boolean_mask(tensor=true_split[n], mask=na_split[n]), 684 | tf.boolean_mask(tensor=pred_split[n], mask=na_split[n])) 685 | * self.binary_adj * na_adj) 686 | elif type(outputs_struc[n]) == int: 687 | self.output_types.append('sacc') 688 | cost_list.append(tf.compat.v1.losses.softmax_cross_entropy( 689 | tf.reshape(tf.boolean_mask(tensor=true_split[n], mask=na_split[n]), [-1, outputs_struc[n]]), 690 | tf.reshape(tf.boolean_mask(tensor=pred_split[n], mask=na_split[n]), [-1, outputs_struc[n]])) 691 | * self.softmax_adj * na_adj) 692 | 693 | def output_function(out_split): 694 | output_list = [] 695 | # Break outputs into their parts 696 | for n in range(len(outputs_struc)): 697 | if outputs_struc[n] == 'cont': 698 | output_list.append(out_split[n]) 699 | elif outputs_struc[n] == 'bin': 700 | output_list.append(tf.nn.sigmoid(out_split[n])) 701 | elif type(outputs_struc[n]) == int: 702 | output_list.append(tf.nn.softmax(out_split[n])) 703 | return tf.concat(output_list, axis=1) 704 | 705 | self.outputs_struc = outputs_struc 706 | if self.vae_layer: 707 | self.output_op = output_function(out_split) 708 | self.joint_loss = tf.reduce_mean( 709 | input_tensor=tf.reduce_sum(input_tensor=cost_list) + kld) # + l2_penalty) 710 | self.encode_to_z = to_z(encoded) 711 | self.gen_from_z_sample = output_function(decode_z(mapped_dist.sample( 712 | sample_shape=tf.shape(input=self.latent_inputs)))) 713 | self.gen_from_z_inputs = output_function(decode_z(self.latent_inputs)) 714 | 715 | else: 716 | self.output_op = output_function(pred_split) 717 | self.joint_loss = tf.reduce_mean(input_tensor=tf.reduce_sum(input_tensor=cost_list)) # + l2_penalty) 
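            # --- Added explanatory sketch (editor's note, not original code) ---
            # Each term in cost_list above is evaluated only on cells that are observed
            # in the input: na_split carries the missingness indicators, tf.boolean_mask
            # drops the unobserved entries before the RMSE / cross-entropy is taken, and
            # na_adj rescales the term by the observed fraction. A purely illustrative
            # NumPy analogue of the continuous case:
            #
            #     import numpy as np
            #     true = np.array([1.0, 2.0, 3.0, 4.0])
            #     pred = np.array([1.1, 1.8, 0.0, 0.0])
            #     observed = np.array([True, True, False, False])
            #     rmse = np.sqrt(np.mean((true[observed] - pred[observed]) ** 2))
            #     loss_term = rmse * observed.mean()       # cont_adj assumed to be 1.0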
718 | 719 | if tf.__version__[0] == '2': 720 | optim = tfa.optimizers.AdamW(lmbda, self.learn_rate) 721 | self.train_step = optim.get_updates(loss=self.joint_loss, params=tf.compat.v1.trainable_variables()) 722 | else: 723 | optim = tf.contrib.opt.AdamWOptimizer(lmbda, self.learn_rate) 724 | self.train_step = optim.minimize(loss=self.joint_loss, var_list=tf.compat.v1.trainable_variables()) 725 | 726 | self.init = tf.compat.v1.global_variables_initializer() 727 | self.saver = tf.compat.v1.train.Saver() 728 | 729 | self.model_built = True 730 | if verbose: 731 | print() 732 | print("Computation graph constructed") 733 | print() 734 | return self 735 | 736 | def train_model(self, 737 | training_epochs=100, 738 | verbose=True, 739 | verbosity_ival=1, 740 | excessive=False): 741 | """ 742 | This is the standard method for optimising the model's parameters. Must be 743 | called before imputation can be performed. 744 | 745 | Args: 746 | training_epochs: Integer. The number of complete cycles (forward passes) 747 | through the network during training (default = 100). 748 | 749 | verbose: Boolean. Specifies whether to print messages to the terminal 750 | during training, including loss values (default = True). 751 | 752 | verbosity_ival: Integer. The number of training epochs between messages 753 | (default = 1). 754 | 755 | excessive: Boolean. Specifies whether to print loss for each mini-batch 756 | to the terminal (default = \code{False}), which can help with 757 | troubleshooting. 758 | 759 | Returns: 760 | Self. Model is automatically saved upon reaching specified number of epochs 761 | 762 | """ 763 | if not self.model_built: 764 | raise AttributeError("The computation graph must be built before the model" 765 | " can be trained") 766 | 767 | if self.input_is_pipeline: 768 | raise AttributeError("Model was constructed to accept pipeline data, either" 769 | " use 'train_model_pipeline' method or rebuild model " 770 | "with in-memory dataset.") 771 | 772 | feed_data = self.imputation_target.values 773 | na_loc = self.na_matrix.values 774 | with tf.compat.v1.Session(graph=self.graph) as sess: 775 | if self.seed is not None: 776 | train_rng = np.random.default_rng(self.seed) 777 | # tf.compat.v1.set_random_seed(self.seed) 778 | else: 779 | train_rng = np.random.default_rng() 780 | 781 | sess.run(self.init) 782 | if verbose: 783 | print("Model initialised", flush=True) 784 | print(flush=True) 785 | for epoch in range(training_epochs): 786 | count = 0 787 | run_loss = 0 788 | for batch in self._batch_iter(feed_data, na_loc, self.train_batch, train_rng): 789 | if np.sum(batch[1]) == 0: 790 | continue 791 | feedin = {self.X: batch[0], self.na_idx: batch[1]} 792 | if self.additional_data is not None: 793 | feedin[self.X_add] = batch[2] 794 | loss, _ = sess.run([self.joint_loss, self.train_step], 795 | feed_dict=feedin) 796 | if excessive: 797 | print("Current cost:", loss) 798 | count += 1 799 | if not np.isnan(loss): 800 | run_loss += loss 801 | if verbose: 802 | if epoch % verbosity_ival == 0: 803 | print('Epoch:', epoch, ", loss:", str(run_loss / count), flush=True) 804 | if verbose: 805 | print("Training complete. Saving file...") 806 | save_path = self.saver.save(sess, self.savepath) 807 | if verbose: 808 | print("Model saved in file: %s" % save_path) 809 | return self 810 | 811 | def generate_samples(self, 812 | m=50, 813 | verbose=True): 814 | """ 815 | Method used to generate a set of m imputations to the .output_list attribute. 
816 | Imputations are stored within a list in memory, and can be accessed in any 817 | order. 818 | 819 | If a model has been pre-trained, on subsequent runs this function can be 820 | directly called without having to train first. An 'if' statement checking 821 | the default save location is useful for this. 822 | 823 | Args: 824 | m: Integer. The number of completed datasets to produce (default = 50) 825 | 826 | verbose: Boolean. Specifies whether to print messages to the terminal 827 | (default = True). 828 | Returns: 829 | Self 830 | """ 831 | 832 | if not self.model_built: 833 | raise AttributeError("The computation graph must be built before the model" 834 | " can be trained") 835 | 836 | if self.input_is_pipeline: 837 | raise AttributeError("Model was constructed to accept pipeline data, either" 838 | " use 'pipeline_yield_samples' method or rebuild model " 839 | "with in-memory dataset.") 840 | self.output_list = [] 841 | with tf.compat.v1.Session(graph=self.graph) as sess: 842 | self.saver.restore(sess, self.savepath) 843 | if verbose: 844 | print("Model restored.") 845 | for n in range(m): 846 | feed_data = self.imputation_target.values 847 | feedin = {self.X: feed_data} 848 | if self.additional_data is not None: 849 | feedin[self.X_add] = self.additional_data 850 | y_out = pd.DataFrame(sess.run(self.output_op, 851 | feed_dict=feedin), 852 | columns=self.imputation_target.columns) 853 | output_df = self.imputation_target.copy() 854 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 855 | self.output_list.append(output_df) 856 | return self 857 | 858 | def yield_samples(self, 859 | m=50, 860 | verbose=True): 861 | """ 862 | Method used to generate a set of m imputations via the 'yield' command, allowing 863 | imputations to be used in a 'for' loop' 864 | 865 | If a model has been pre-trained, on subsequent runs this function can be 866 | directly called without having to train first. An 'if' statement checking 867 | the default save location is useful for this. 868 | 869 | Args: 870 | m: Integer. Number of imputations to generate. 871 | 872 | verbose: Boolean. Prints out messages. 873 | 874 | Returns: 875 | Self 876 | """ 877 | 878 | if not self.model_built: 879 | raise AttributeError("The computation graph must be built before the model" 880 | " can be trained") 881 | 882 | if self.input_is_pipeline: 883 | raise AttributeError("Model was constructed to accept pipeline data, either" 884 | " use 'pipeline_yield_samples' method or rebuild model " 885 | "with in-memory dataset.") 886 | with tf.compat.v1.Session(graph=self.graph) as sess: 887 | self.saver.restore(sess, self.savepath) 888 | if verbose: 889 | print("Model restored.") 890 | for n in range(m): 891 | feed_data = self.imputation_target.values 892 | feedin = {self.X: feed_data} 893 | if self.additional_data is not None: 894 | feedin[self.X_add] = self.additional_data 895 | y_out = pd.DataFrame(sess.run(self.output_op, 896 | feed_dict=feedin), 897 | columns=self.imputation_target.columns) 898 | output_df = self.imputation_target.copy() 899 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 900 | yield output_df 901 | return self 902 | 903 | def batch_generate_samples(self, 904 | m=50, 905 | b_size=256, 906 | verbose=True): 907 | """ 908 | Method used to generate a set of m imputations to the .output_list attribute. 909 | Imputations are stored within a list in memory, and can be accessed in any 910 | order. 
As batch generation implies very large datasets, this method is only 911 | provided for completeness' sake. 912 | 913 | This function is for a dataset large enough to be stored in memory, but 914 | too large to be passed into the model in its entirety. This may be due to 915 | GPU memory limitations, or just the size of the model 916 | 917 | If a model has been pre-trained, on subsequent runs this function can be 918 | directly called without having to train first. An 'if' statement checking 919 | the default save location is useful for this. 920 | 921 | Args: 922 | m: Integer. Number of imputations to generate. 923 | 924 | b_size: Integer. Number of data entries to process at once. For managing 925 | wider datasets, smaller numbers may be required. 926 | 927 | verbose: Boolean. Prints out messages. 928 | 929 | Returns: 930 | Self 931 | """ 932 | if not self.model_built: 933 | raise AttributeError("The computation graph must be built before the model" 934 | " can be trained") 935 | 936 | if self.input_is_pipeline: 937 | raise AttributeError("Model was constructed to accept pipeline data, either" 938 | " use 'pipeline_yield_samples' method or rebuild model " 939 | "with in-memory dataset.") 940 | self.output_list = [] 941 | with tf.compat.v1.Session(graph=self.graph) as sess: 942 | self.saver.restore(sess, self.savepath) 943 | if verbose: 944 | print("Model restored.") 945 | for n in range(m): 946 | feed_data = self.imputation_target.values 947 | minibatch_list = [] 948 | for batch in self._batch_iter_output(feed_data, b_size): 949 | if self.additional_data is not None: 950 | feedin = {self.X: batch[0], self.X_add: batch[1]} 951 | else: 952 | feedin = {self.X: batch} 953 | y_batch = pd.DataFrame(sess.run(self.output_op, 954 | feed_dict=feedin), 955 | columns=self.imputation_target.columns) 956 | minibatch_list.append(y_batch) 957 | y_out = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 958 | columns=self.imputation_target.columns) 959 | output_df = self.imputation_target.copy() 960 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 961 | self.output_list.append(output_df) 962 | return self 963 | 964 | def batch_yield_samples(self, 965 | m=50, 966 | b_size=256, 967 | verbose=True): 968 | """ 969 | Method used to generate a set of m imputations via the 'yield' command, allowing 970 | imputations to be used in a 'for' loop' 971 | 972 | This function is for a dataset large enough to be stored in memory, but 973 | too large to be passed into the model in its entirety. This may be due to 974 | GPU memory limitations, or just the size of the model 975 | 976 | If a model has been pre-trained, on subsequent runs this function can be 977 | directly called without having to train first. An 'if' statement checking 978 | the default save location is useful for this. 979 | 980 | Args: 981 | m: Integer. Number of imputations to generate. 982 | 983 | b_size: Integer. Number of data entries to process at once. For managing 984 | wider datasets, smaller numbers may be required. 985 | 986 | verbose: Boolean. Prints out messages. 
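        Example of the save-location check mentioned above (an added sketch:
        'imputer', the epoch count, and the checkpoint suffix are illustrative
        assumptions rather than part of this docstring):

            import os

            # 'imputer' is a Midas instance on which build_model() has been called
            if not os.path.exists(imputer.savepath + ".index"):
                imputer.train_model(training_epochs=20)
            for completed_df in imputer.batch_yield_samples(m=5, b_size=256):
                ...  # use each completed DataFrame as it is generated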
987 | 988 | Returns: 989 | Self """ 990 | if not self.model_built: 991 | raise AttributeError("The computation graph must be built before the model" 992 | " can be trained") 993 | 994 | if self.input_is_pipeline: 995 | raise AttributeError("Model was constructed to accept pipeline data, either" 996 | " use 'pipeline_yield_samples' method or rebuild model " 997 | "with in-memory dataset.") 998 | with tf.compat.v1.Session(graph=self.graph) as sess: 999 | self.saver.restore(sess, self.savepath) 1000 | if verbose: 1001 | print("Model restored.") 1002 | for n in range(m): 1003 | feed_data = self.imputation_target.values 1004 | minibatch_list = [] 1005 | for batch in self._batch_iter_output(feed_data, b_size): 1006 | if self.additional_data is not None: 1007 | feedin = {self.X: batch[0], self.X_add: batch[1]} 1008 | else: 1009 | feedin = {self.X: batch} 1010 | y_batch = pd.DataFrame(sess.run(self.output_op, 1011 | feed_dict=feedin), 1012 | columns=self.imputation_target.columns) 1013 | minibatch_list.append(y_batch) 1014 | y_out = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 1015 | columns=self.imputation_target.columns) 1016 | output_df = self.imputation_target.copy() 1017 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 1018 | yield output_df 1019 | return self 1020 | 1021 | def overimpute(self, 1022 | spikein=0.1, 1023 | training_epochs=100, 1024 | report_ival=10, 1025 | report_samples=32, 1026 | plot_vars=True, 1027 | verbose=True, 1028 | verbosity_ival=1, 1029 | spike_seed=42, 1030 | cont_kdes=False, 1031 | excessive=False, 1032 | plot_main=True, 1033 | skip_plot=False, 1034 | save_figs=False, 1035 | save_path="", 1036 | ): 1037 | """ 1038 | This function spikes in additional missingness, so that known values can be 1039 | used to help adjust the complexity of the model. As conventional train/ 1040 | validation splits can still lead to autoencoders overtraining, the method for 1041 | limiting complexity is overimputation and early stopping. This gives an 1042 | estimate of how the model will react to unseen variables. 1043 | 1044 | Error is defined as RMSE for continuous variables, and classification error 1045 | for binary and categorical variables (ie. 1 - accuracy). Note that this means 1046 | that binary classification is inherently dependent on a selection threshold 1047 | of 0.5, and softmax accuracy will automatically decrease as a function of the 1048 | number of classes within the model. All three will be affected by the degree 1049 | of imbalance within the dataset. 1050 | 1051 | The accuracy measures provided here may not be ideal for all problems, but 1052 | they are generally appropriate for selecting optimum complexity. Should the 1053 | lines denoting error begin to trend upwards, this indicates overtraining and 1054 | is a sign that the training_epochs parameter to the .train_model() method should 1055 | be capped before this point. 1056 | 1057 | The actual optimal point may differ from that indicated by the .overimpute() 1058 | method for two reasons: 1059 | -The loss that is spiked in reduces the overall data available to the algorithm 1060 | to learn the patterns inherent, so there should be some improvement in performance 1061 | when .train_model() is called. If this is a concern, then it should be possible 1062 | to compare the behaviour of the loss figure between .train_model() and 1063 | .overimpute(). 1064 | -The missingness inherent to the data may depend on some unobserved factor. 
1065 | In this case, the bias in the observed data may lead to inaccurate inference. 1066 | 1067 | It is worth visually inspecting the distribution of the overimputed values 1068 | against imputed values (using plot_vars) to ensure that they fall within a 1069 | sensible range. 1070 | 1071 | The plots block execution of the code until they are closed. To only plot a 1072 | single overimputation graph at the end of the run, you can supply plot_main = False 1073 | and plot_vars = False. To run the imputation without plotting any graphs, 1074 | set skip_plot = True in addition. The overimputation function will still print 1075 | predicted errors to the console. 1076 | 1077 | Args: 1078 | spikein: Float, between 0 and 1. The proportion of observed values in the 1079 | input dataset to be randomly removed (default = 0.1). 1080 | 1081 | training_epochs: Integer. The number of overimputation training epochs 1082 | (default = 100). Selecting a low value increases the risk that trends in the 1083 | loss metrics have not stabilized by the end of training, in which case 1084 | additional epochs may be necessary. 1085 | 1086 | report_ival: Integer. The number of overimputation training epochs between 1087 | calculations of loss (default = 10). Shorter intervals provide a more granular 1088 | view of model performance but slow down the overimputation process. 1089 | 1090 | report_samples: The number of Monte Carlo samples drawn from the estimated 1091 | missing-data posterior for loss calculations (default = 32). A larger number 1092 | increases overimputation runtime and may thus necessitate a lower value of 1093 | report_ival. 1094 | 1095 | plot_vars: Specifies whether to plot the distribution of original versus 1096 | overimputed values (default = True). This takes the form of a density 1097 | plot for continuous variables and a barplot for categorical variables (showing 1098 | proportions of each class). 1099 | 1100 | plot_main: Boolean. Specifies whether to display the main graphical output 1101 | (overimputation error during training) at every reporting interval (default = True). 1102 | If set to False, it will only appear at the end of the overimputation training 1103 | process. Error values are still shown at each report_ival. 1104 | 1105 | skip_plot: Boolean. Specifies whether to suppress the main graphical output 1106 | (default = False). This may be desirable when users are conducting multiple 1107 | overimputation exercises sequentially and are primarily interested in the console 1108 | output. 1109 | 1110 | save_figs: Boolean. Specifies whether to save generated figures instead of 1111 | displaying graphical output (default = False). 1112 | 1113 | save_path: String. Specifies path to save pyplots if save_figs = True 1114 | (default = working directory). 1115 | 1116 | verbose: Boolean. Prints out messages, including loss, to the terminal (default = True). 1117 | 1118 | verbosity_ival: Integer. The number of overimputation training epochs between 1119 | messages (default = True). 1120 | 1121 | spike_seed: Integer. The value to which Python's pseudo-random number generator is initialized 1122 | for the missingness spike-in. This is separate to the seed specified in the Midas() 1123 | call. 1124 | 1125 | cont_kdes: Boolean. Whether to plot kernel density estimates for continuous variables. 1126 | 1127 | excessive: Specifies whether to print aggregate mini-batch loss to the terminal 1128 | (default = False). 
This argument differs from the .train_model()'s excessive argument, 1129 | which prints individual mini-batch loss. This allows users to check for unusual imputations, 1130 | which may be helpful if loss is not declining during overimputation training. 1131 | 1132 | 1133 | """ 1134 | if not self.model_built: 1135 | raise AttributeError("The computation graph must be built before the model can be trained") 1136 | 1137 | if self.input_is_pipeline: 1138 | raise AttributeError("Overimputation not currently supported for models" 1139 | " which use a pipeline function for input.") 1140 | # These values simplify control flow used later for error calculation and 1141 | # visualisation of convergence. 1142 | if cont_kdes & (plot_vars is False): 1143 | raise ValueError("Cannot plot KDEs if plot_vars is False") 1144 | 1145 | if excessive: 1146 | import time 1147 | 1148 | overimp_rng = np.random.default_rng(spike_seed) 1149 | 1150 | rmse_in = False 1151 | sacc_in = False 1152 | bacc_in = False 1153 | if 'rmse' in self.output_types: 1154 | rmse_in = True 1155 | if 'sacc' in self.output_types: 1156 | def sacc(true, pred, spike): # Softmax accuracy 1157 | a = np.argmax(true, 1) 1158 | b = np.argmax(pred, 1) 1159 | return np.sum(a[spike.flatten()] == b[spike.flatten()]) / np.sum(spike) 1160 | 1161 | def findcatname(strlist): 1162 | return strlist[0][:([min([x[0] == elem for elem in x]) for x in zip(*strlist)] + [0]).index(0)] 1163 | 1164 | sacc_in = True 1165 | 1166 | if 'bacc' in self.output_types: 1167 | def bacc(true, pred, spike): 1168 | pred = (pred > 0.5).astype(np.int_) 1169 | return np.sum(true[spike] == pred[spike]) / np.sum(spike) 1170 | 1171 | bacc_in = True 1172 | 1173 | feed_data = self.imputation_target.copy() 1174 | na_loc = self.na_matrix 1175 | # np.random.seed(spike_seed) 1176 | n_softmax = 0 # Necessary to derive the average classification error 1177 | 1178 | # Pandas lacks an equivalent to tf.split, so this is used to divide columns 1179 | # for their respective error metrics 1180 | break_list = list(np.cumsum(self.size_index)) 1181 | break_list.insert(0, 0) 1182 | 1183 | # Generate spike-in 1184 | spike = [] 1185 | for n in range(len(self.size_index)): 1186 | if self.output_types[n] == 'sacc': 1187 | temp_spike = pd.Series(overimp_rng.choice([True, False], 1188 | size=self.imputation_target.shape[0], 1189 | p=[spikein, 1 - spikein])) 1190 | 1191 | spike.append(pd.concat([temp_spike] * self.size_index[n], axis=1)) 1192 | n_softmax += 1 1193 | 1194 | else: 1195 | spike.append(pd.DataFrame(overimp_rng.choice([True, False], 1196 | size=[self.imputation_target.shape[0], 1197 | self.size_index[n]], 1198 | p=[spikein, 1 - spikein]))) 1199 | spike = pd.concat(spike, axis=1) 1200 | spike.columns = self.imputation_target.columns 1201 | spike[np.invert(na_loc)] = False 1202 | feed_data[spike] = 0 1203 | feed_data = feed_data.values 1204 | na_loc[spike] = False 1205 | spike = spike.values 1206 | na_loc = na_loc.values 1207 | 1208 | # Initialise lists for plotting 1209 | s_rmse = [] 1210 | a_rmse = [] 1211 | s_bacc = [] 1212 | a_bacc = [] 1213 | s_sacc = [] 1214 | a_sacc = [] 1215 | with tf.compat.v1.Session(graph=self.graph) as sess: 1216 | if self.seed is not None: 1217 | train_rng = np.random.default_rng(self.seed) 1218 | 1219 | sess.run(self.init) 1220 | print("Model initialised", flush=True) 1221 | print(flush=True) 1222 | for epoch in range(training_epochs + 1): 1223 | count = 0 1224 | run_loss = 0 1225 | for batch in self._batch_iter(feed_data, na_loc, self.train_batch, train_rng): 1226 | 
if np.sum(batch[1]) == 0: 1227 | continue 1228 | feedin = {self.X: batch[0], self.na_idx: batch[1]} 1229 | if self.additional_data is not None: 1230 | feedin[self.X_add] = batch[2] 1231 | if excessive: 1232 | out, loss, _ = sess.run([self.output_op, self.joint_loss, self.train_step], 1233 | feed_dict=feedin) 1234 | print("Current cost:", loss) 1235 | print(out) 1236 | time.sleep(5) 1237 | else: 1238 | loss, _ = sess.run([self.joint_loss, self.train_step], 1239 | feed_dict=feedin) 1240 | count += 1 1241 | 1242 | if not np.isnan(loss): 1243 | run_loss += loss 1244 | if verbose: 1245 | if epoch % verbosity_ival == 0: 1246 | print('Epoch:', epoch, ", loss:", str(run_loss / count), flush=True) 1247 | 1248 | if epoch % report_ival == 0: 1249 | """ 1250 | For each report interval, generate report_samples worth of imputations 1251 | and measure both individual and aggregate error values 1252 | """ 1253 | # Initialise losses 1254 | single_rmse = 0 1255 | single_sacc = 0 1256 | single_bacc = 0 1257 | first = True 1258 | if cont_kdes: 1259 | plot_first = True 1260 | 1261 | for sample in range(report_samples): 1262 | 1263 | minibatch_list = [] 1264 | for batch in self._batch_iter_output(feed_data, self.train_batch): 1265 | feedin = {self.X: batch} 1266 | if self.additional_data is not None: 1267 | feedin = {self.X: batch[0]} 1268 | feedin[self.X_add] = batch[1] 1269 | else: 1270 | feedin = {self.X: batch} 1271 | y_batch = pd.DataFrame(sess.run(self.output_op, 1272 | feed_dict=feedin), 1273 | columns=self.imputation_target.columns) 1274 | minibatch_list.append(y_batch) 1275 | y_out = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 1276 | columns=self.imputation_target.columns) 1277 | if cont_kdes: 1278 | if 'rmse' in self.output_types: 1279 | for n in range(self.size_index[0]): 1280 | plt.figure(n + 1) 1281 | t_t = self.imputation_target.iloc[:, n] 1282 | t_p = y_out.iloc[:, n] 1283 | t_s = spike[:, n] 1284 | if plot_first: 1285 | t_p[t_s].plot(kind='density', color='k', alpha=0.5, label='Single imputation') 1286 | else: 1287 | t_p[t_s].plot(kind='density', color='k', alpha=0.5, label='_nolegend_') 1288 | plot_first = False 1289 | 1290 | # Calculate individual imputation losses 1291 | for n in range(len(self.size_index)): 1292 | temp_pred = y_out.iloc[:, break_list[n]:break_list[n + 1]] 1293 | temp_true = self.imputation_target.iloc[:, break_list[n]:break_list[n + 1]] 1294 | temp_spike = spike[:, break_list[n]:break_list[n + 1]] 1295 | if self.output_types[n] == 'sacc': 1296 | temp_spike = temp_spike[:, 0] 1297 | single_sacc += (1 - sacc(temp_true.values, 1298 | temp_pred.values, temp_spike)) / n_softmax 1299 | 1300 | elif self.output_types[n] == 'rmse': 1301 | single_rmse += np.sqrt(mse(temp_true[temp_spike], 1302 | temp_pred[temp_spike])) 1303 | else: 1304 | single_bacc += 1 - bacc(temp_true.values, temp_pred.values, temp_spike) 1305 | 1306 | if first: 1307 | running_output = y_out 1308 | first = False 1309 | else: 1310 | running_output += y_out 1311 | single_rmse = single_rmse / report_samples 1312 | single_sacc = single_sacc / report_samples 1313 | single_bacc = single_bacc / report_samples 1314 | y_out = running_output / report_samples 1315 | 1316 | # Calculate aggregate imputation losses 1317 | agg_rmse = 0 1318 | agg_sacc = 0 1319 | agg_bacc = 0 1320 | for n in range(len(self.size_index)): 1321 | temp_pred = y_out.iloc[:, break_list[n]:break_list[n + 1]] 1322 | temp_true = self.imputation_target.iloc[:, break_list[n]:break_list[n + 1]] 1323 | temp_spike = spike[:, 
break_list[n]:break_list[n + 1]] 1324 | if self.output_types[n] == 'sacc': 1325 | temp_spike = temp_spike[:, 0] 1326 | if plot_vars: 1327 | temp_pred[temp_spike].mean().plot(kind='bar', 1328 | label='Imputed values (mean)', color='C0') 1329 | temp_true[temp_spike].mean().plot(kind='bar', alpha=0.5, 1330 | color='r', align='edge', 1331 | label='Removed observed values (mean)') 1332 | temp_true_name = findcatname(temp_true[temp_spike].columns)[:-1] 1333 | plt.title('Overimputation density plot: ' + temp_true_name + ' (categorical)') 1334 | plt.xlabel(temp_true_name) 1335 | plt.ylabel('Proportion') 1336 | plt.legend() 1337 | 1338 | if save_figs: 1339 | plt.tight_layout() 1340 | plt.savefig(save_path + temp_true_name + "_epoch_" + str(epoch) + ".png") 1341 | plt.clf() 1342 | else: 1343 | plt.show() 1344 | 1345 | agg_sacc += (1 - sacc(temp_true.values, temp_pred.values, 1346 | temp_spike)) / n_softmax 1347 | elif self.output_types[n] == 'rmse': 1348 | if plot_vars: 1349 | for n_rmse in range(len(temp_pred.columns)): 1350 | plt.figure(n_rmse + 1) 1351 | t_p = temp_pred.iloc[:, n_rmse] 1352 | t_t = temp_true.iloc[:, n_rmse] 1353 | t_s = temp_spike[:, n_rmse] 1354 | t_p[t_s].plot(kind='density', label='Imputed values (mean)') 1355 | t_t[t_s].plot(kind='density', color='r', label='Removed observed values') 1356 | t_t.plot(kind='kde', color='g', label='All observed values') 1357 | hyp_output = pd.concat([t_t[np.invert(t_s)], t_p[t_s]]) 1358 | hyp_output.plot(kind='kde', color='m', label='Completed data') 1359 | plt.title('Overimputation density plot: ' + \ 1360 | temp_pred.columns[n_rmse] + ' (continuous)') 1361 | plt.xlabel(temp_pred.columns[n_rmse]) 1362 | plt.ylabel('Density') 1363 | plt.legend() 1364 | 1365 | if save_figs: 1366 | plt.tight_layout() 1367 | plt.savefig( 1368 | save_path + temp_pred.columns[n_rmse] + "_epoch_" + str(epoch) + ".png") 1369 | plt.clf() 1370 | else: 1371 | plt.show() 1372 | 1373 | agg_rmse += np.sqrt(mse(temp_true[temp_spike], 1374 | temp_pred[temp_spike])) 1375 | else: 1376 | if plot_vars: 1377 | temp_pred[temp_spike].mean().plot(kind='bar', 1378 | label='Imputed values', 1379 | color='C0') 1380 | temp_true[temp_spike].mean().plot(kind='bar', alpha=0.5, 1381 | color='r', align='edge', label='Observed values') 1382 | plt.title('Overimputation binary proportions') 1383 | plt.xlabel('Variables') 1384 | plt.ylabel('Proportion') 1385 | plt.legend() 1386 | 1387 | if save_figs: 1388 | plt.tight_layout() 1389 | plt.savefig(save_path + "binary_vars_epoch_" + str(epoch) + ".png") 1390 | plt.clf() 1391 | else: 1392 | plt.show() 1393 | 1394 | agg_bacc += 1 - bacc(temp_true.values, temp_pred.values, temp_spike) 1395 | 1396 | # Plot losses depending on which loss values present in data 1397 | if rmse_in: 1398 | s_rmse.append(single_rmse) 1399 | a_rmse.append(agg_rmse) 1400 | print("Individual RMSE on spike-in:", single_rmse, flush=True) 1401 | print("Aggregated RMSE on spike-in:", agg_rmse, flush=True) 1402 | 1403 | if sacc_in: 1404 | s_sacc.append(single_sacc) 1405 | a_sacc.append(agg_sacc) 1406 | print("Individual error on softmax spike-in:", single_sacc, flush=True) 1407 | print("Aggregated error on softmax spike-in:", agg_sacc, flush=True) 1408 | 1409 | if bacc_in: 1410 | s_bacc.append(single_bacc) 1411 | a_bacc.append(agg_bacc) 1412 | print("Individual error on binary spike-in:", single_bacc, flush=True) 1413 | print("Aggregated error on binary spike-in:", agg_bacc, flush=True) 1414 | 1415 | if plot_main or ((training_epochs - epoch) < report_ival): 1416 | if rmse_in: 1417 | 
plt.plot(s_rmse, 'k-', label="Individual RMSE") 1418 | plt.plot(a_rmse, 'k--', label="Aggregated RMSE") 1419 | min_sr = min(s_rmse) 1420 | min_ar = min(a_rmse) 1421 | plt.plot([min_sr] * len(s_rmse), 'r:') 1422 | plt.plot([min_ar] * len(a_rmse), 'r:') 1423 | plt.plot(s_rmse.index(min(s_rmse)), 1424 | min_sr, 'rx') 1425 | plt.plot(a_rmse.index(min(a_rmse)), 1426 | min_ar, 'rx') 1427 | 1428 | if sacc_in: 1429 | plt.plot(s_sacc, 'g-', label="Individual classification error") 1430 | plt.plot(a_sacc, 'g--', label="Aggregated classification error") 1431 | min_ss = min(s_sacc) 1432 | min_as = min(a_sacc) 1433 | plt.plot([min_ss] * len(s_sacc), 'r:') 1434 | plt.plot([min_as] * len(a_sacc), 'r:') 1435 | plt.plot(s_sacc.index(min(s_sacc)), 1436 | min_ss, 'rx') 1437 | plt.plot(a_sacc.index(min(a_sacc)), 1438 | min_as, 'rx') 1439 | 1440 | if bacc_in: 1441 | plt.plot(s_bacc, 'b-', label="Individual binary error") 1442 | plt.plot(a_bacc, 'b--', label="Aggregated binary error") 1443 | min_sb = min(s_bacc) 1444 | min_ab = min(a_bacc) 1445 | plt.plot([min_sb] * len(s_bacc), 'r:') 1446 | plt.plot([min_ab] * len(a_bacc), 'r:') 1447 | plt.plot(s_bacc.index(min(s_bacc)), 1448 | min_sb, 'rx') 1449 | plt.plot(a_bacc.index(min(a_bacc)), 1450 | min_ab, 'rx') 1451 | 1452 | # Complete plots 1453 | if not skip_plot: 1454 | plt.title("Overimputation error during training") 1455 | plt.ylabel("Error") 1456 | plt.legend(loc=4) 1457 | plt.ylim(ymin=0) 1458 | plt.xlabel("Reporting interval") 1459 | 1460 | if save_figs: 1461 | plt.tight_layout() 1462 | plt.savefig(save_path + "overimputation_error.png") 1463 | plt.clf() 1464 | else: 1465 | plt.show() 1466 | 1467 | print("Overimputation complete. Adjust complexity as needed.", flush=True) 1468 | return self 1469 | 1470 | def build_model_pipeline(self, 1471 | data_sample, 1472 | binary_columns=None, 1473 | softmax_columns=None, 1474 | unsorted=True, 1475 | additional_data_sample=None, 1476 | verbose=True, 1477 | crossentropy_adj=1, 1478 | loss_scale=1): 1479 | """ 1480 | This function is for integration with databasing or any dataset that needs 1481 | to be batched into memory. The data sample is simply there to allow the 1482 | original constructor to be recycled. The head of the data should be sufficient 1483 | to build the imputation model. The input pipeline itself should pre-scale 1484 | the data, and code null values as type np.nan. The pipeline ought to output 1485 | a Pandas DataFrame. If additional data will be passed in, then the return must 1486 | be a list of two DataFrames. The columns of the dataframe will be re-arranged 1487 | so that error functions are efficiently generated. 1488 | 1489 | IT IS IMPERITIVE that this ordering is respected. Design the input batching 1490 | function accordingly. 1491 | 1492 | The categorical columns should be a list of column names. Softmax columns 1493 | should be a list of lists of column names. This will allow the model to 1494 | dynamically assign cost functions to the correct variables. If, however, 1495 | the data comes pre-sorted, arranged can be set to "true", in which case 1496 | the arguments can be passed in as integers of size, ie. shape[1] attributes 1497 | for each of the relevant categories. 1498 | 1499 | In other words, pre-sort your data and pass in the integers, so indexing 1500 | dynamically doesn't become too difficult. Alternatively, list(df.columns.values) 1501 | will output a list of column names, which can be easily implemented in the 1502 | 'for' loop which constructs your dummy variables. 
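        A minimal example of such a batching function (an added sketch: the CSV
        path, chunk size, and class name are hypothetical stand-ins for whatever
        database or flat-file source is actually used, and the chunks are assumed
        to be pre-scaled with nulls coded as np.nan):

            import pandas as pd

            class CSVPipeline:
                def __init__(self, path, chunksize=1024):
                    self.path, self.chunksize = path, chunksize
                def __iter__(self):
                    # A fresh pass over the file each time the object is iterated,
                    # so every training epoch sees the full dataset.
                    return iter(pd.read_csv(self.path, chunksize=self.chunksize))

            pipeline = CSVPipeline("big_file.csv")
            # 'pipeline' is what train_model_pipeline() iterates over, once per epoch.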
1503 | """ 1504 | self.input_is_pipeline = True 1505 | b_c = binary_columns 1506 | s_c = softmax_columns 1507 | us = unsorted 1508 | a_d = additional_data_sample 1509 | vb = verbose 1510 | cea = crossentropy_adj 1511 | l_s = loss_scale 1512 | 1513 | self.build_model(data_sample, b_c, s_c, us, a_d, vb, cea, l_s) 1514 | 1515 | return self 1516 | 1517 | def train_model_pipeline(self, 1518 | input_pipeline, 1519 | training_epochs=100, 1520 | verbose=True, 1521 | verbosity_ival=1, 1522 | excessive=False): 1523 | """ 1524 | This is the alternative method for optimising the model's parameters when input 1525 | data must be batched into memory. Must be called before imputation can be 1526 | performed. The model will then be saved to the specified directory 1527 | 1528 | Args: 1529 | input_pipeline: Function which yields a pre-processed and scaled DataFrame 1530 | from the designated source, be it a server or large flat file. 1531 | 1532 | training_epochs: Integer. The number of epochs the model will run for 1533 | 1534 | verbose: Boolean. Prints out messages, including loss 1535 | 1536 | verbosity_ival: Integer. This number determines the interval between 1537 | messages. 1538 | 1539 | excessive: Boolean. Used for troubleshooting, this argument will cause the 1540 | cost of each batch to be printed to the terminal. 1541 | 1542 | Returns: 1543 | Self. Model is automatically saved upon reaching specified number of epochs 1544 | 1545 | """ 1546 | self.input_pipeline = input_pipeline 1547 | if not self.model_built: 1548 | raise AttributeError("The computation graph must be built before the model" 1549 | " can be trained") 1550 | if not self.input_is_pipeline: 1551 | raise AttributeError("Model was constructed to accept locally-stored data," 1552 | "either use 'train_model' method or rebuild model " 1553 | "with the 'build_model_pipeline' method.") 1554 | 1555 | # if self.seed is not None: 1556 | # np.random.seed(self.seed) 1557 | with tf.compat.v1.Session(graph=self.graph) as sess: 1558 | sess.run(self.init) 1559 | if verbose: 1560 | print("Model initialised") 1561 | print() 1562 | for epoch in range(training_epochs): 1563 | count = 0 1564 | run_loss = 0 1565 | 1566 | for feed_data in input_pipeline: 1567 | if self.additional_data is None: 1568 | if not isinstance(feed_data, pd.DataFrame): 1569 | raise TypeError("Input data must be in a DataFrame") 1570 | na_loc = feed_data.notnull().astype(bool).values 1571 | feedin = {self.X: feed_data.values, 1572 | self.na_idx: na_loc} 1573 | else: 1574 | if not isinstance(feed_data, list): 1575 | raise TypeError("Input should be a list of two DataFrames, with " 1576 | "index 0 containing the target imputation data, and" 1577 | " the data at index 1 containing additional data") 1578 | if len(feed_data) != 2: 1579 | raise TypeError("Input should be a list of two DataFrames, with " 1580 | "index 0 containing the target imputation data, and" 1581 | " the data at index 1 containing additional data") 1582 | if not isinstance(feed_data[0], pd.DataFrame): 1583 | raise TypeError("Input data must be in a DataFrame") 1584 | if not isinstance(feed_data[1], pd.DataFrame): 1585 | raise TypeError("Additional data must be in a DataFrame") 1586 | na_loc = feed_data[0].notnull().astype(bool).values 1587 | feedin = {self.X: feed_data[0].fillna(0).values, 1588 | self.X_add: feed_data[1].fillna(0).values, 1589 | self.na_idx: na_loc} 1590 | 1591 | if np.sum(na_loc) == 0: 1592 | continue 1593 | loss, _ = sess.run([self.joint_loss, self.train_step], 1594 | feed_dict=feedin) 1595 | if 
excessive: 1596 | print("Current cost:", loss) 1597 | count += 1 1598 | if not np.isnan(loss): 1599 | run_loss += loss 1600 | if verbose: 1601 | if epoch % verbosity_ival == 0: 1602 | print('Epoch:', epoch, ", loss:", str(run_loss / count)) 1603 | if verbose: 1604 | print("Training complete. Saving file...") 1605 | save_path = self.saver.save(sess, self.savepath) 1606 | if verbose: 1607 | print("Model saved in file: %s" % save_path) 1608 | return self 1609 | 1610 | def yield_samples_pipeline(self, 1611 | verbose=False): 1612 | """ 1613 | As it's impossible to know the specifics of the pipeline, this method simply 1614 | cycles through all data provided by the input function. The number of imputations 1615 | can be specified by the user, depending on their needs. 1616 | 1617 | Args: 1618 | verbose: Prints out messages 1619 | 1620 | Yields: 1621 | A 'DataFrame' of the size specified by the input function passed to the 1622 | 'train_model_pipeline' method. 1623 | 1624 | Returns: 1625 | Self 1626 | 1627 | """ 1628 | if not self.model_built: 1629 | raise AttributeError("The computation graph must be built before the model" 1630 | " can be trained") 1631 | if not self.input_is_pipeline: 1632 | raise AttributeError("Model was constructed to accept locally-stored data," 1633 | "either use 'train_model' method or rebuild model " 1634 | "with the 'build_model_pipeline' method.") 1635 | 1636 | # if self.seed is not None: 1637 | # np.random.seed(self.seed) 1638 | # tf.compat.v1.set_random_seed(self.seed) 1639 | with tf.compat.v1.Session(graph=self.graph) as sess: 1640 | self.saver.restore(sess, self.savepath) 1641 | if verbose: 1642 | print("Model restored.") 1643 | 1644 | for feed_data in self.input_pipeline: 1645 | if self.additional_data is None: 1646 | if not isinstance(feed_data, pd.DataFrame): 1647 | raise TypeError("Input data must be in a DataFrame") 1648 | na_loc = feed_data.notnull().astype(bool).values 1649 | feedin = {self.X: feed_data.fillna(0).values} 1650 | else: 1651 | if not isinstance(feed_data, list): 1652 | raise TypeError("Input should be a list of two DataFrames, with " 1653 | "index 0 containing the target imputation data, and" 1654 | " the data at index 1 containing additional data") 1655 | if len(feed_data) != 2: 1656 | raise TypeError("Input should be a list of two DataFrames, with " 1657 | "index 0 containing the target imputation data, and" 1658 | " the data at index 1 containing additional data") 1659 | if not isinstance(feed_data[0], pd.DataFrame): 1660 | raise TypeError("Input data must be in a DataFrame") 1661 | if not isinstance(feed_data[1], pd.DataFrame): 1662 | raise TypeError("Additional data must be in a DataFrame") 1663 | na_loc = feed_data[0].notnull().astype(bool).values 1664 | feedin = {self.X: feed_data[0].fillna(0).values, 1665 | self.X_add: feed_data[1].fillna(0).values} 1666 | feed_data = feed_data[0] 1667 | na_loc = feed_data.notnull().astype(bool).values 1668 | 1669 | y_out = pd.DataFrame(sess.run(self.output_op, feed_dict=feedin), 1670 | columns=self.imputation_target.columns) 1671 | output_df = self.imputation_target.copy() 1672 | output_df[np.invert(na_loc)] = y_out[np.invert(na_loc)] 1673 | yield output_df 1674 | 1675 | return self 1676 | 1677 | def sample_from_z(self, 1678 | sample_size=256, 1679 | verbose=True): 1680 | """ 1681 | Method used to generate new samples by drawing on the default Student-T(3) 1682 | sampling distribution. In effect, generates new data samples. 1683 | Arguments: 1684 | 1685 | sample_size: Integer.
Number of sample observations to draw at once. 1686 | 1687 | verbose: Boolean. Prints out messages. 1688 | 1689 | Returns: 1690 | Sampled_output 1691 | """ 1692 | if not self.model_built: 1693 | raise AttributeError("The computation graph must be built before the model" 1694 | " can be trained") 1695 | if not self.vae_layer: 1696 | raise AttributeError("The model must include a VAE layer to be used to generate" 1697 | " new observations from a latent distribution") 1698 | if self.input_is_pipeline: 1699 | raise AttributeError("Model was constructed to accept pipeline data, either" 1700 | " use 'pipeline_yield_samples' method or rebuild model " 1701 | "with in-memory dataset.") 1702 | with tf.compat.v1.Session(graph=self.graph) as sess: 1703 | self.saver.restore(sess, self.savepath) 1704 | if verbose: 1705 | print("Model restored.") 1706 | feedin = {self.latent_inputs: np.zeros([sample_size, self.latent_space_size])} 1707 | out = sess.run(self.gen_from_z_sample, feed_dict=feedin) 1708 | sampled_output = pd.DataFrame(out, 1709 | columns=self.imputation_target.columns) 1710 | return sampled_output 1711 | 1712 | def transform_from_z(self, 1713 | data, 1714 | b_size=256, 1715 | verbose=True): 1716 | """ 1717 | Method used to generate new samples by drawing on the default Student-T(3) 1718 | sampling distribution. In effect, generates new data samples. 1719 | Arguments: 1720 | 1721 | data: Pandas dataframe or numpy array, as wide as latent_space_size. These 1722 | numbers can be sampled from some distribution, or can be structured vectors 1723 | to enable sweeping through the data space. 1724 | 1725 | b_size: Integer. Number of data entries to process at once. For managing 1726 | larger input datasets, smaller numbers may be required. 1727 | 1728 | verbose: Boolean. Prints out messages. 1729 | 1730 | Returns: 1731 | Generated_output 1732 | """ 1733 | if not self.model_built: 1734 | raise AttributeError("The computation graph must be built before the model" 1735 | " can be trained") 1736 | if not self.vae_layer: 1737 | raise AttributeError("The model must include a VAE layer to be used to generate" 1738 | " new observations from a latent distribution") 1739 | if self.input_is_pipeline: 1740 | raise AttributeError("Model was constructed to accept pipeline data, either" 1741 | " use 'pipeline_yield_samples' method or rebuild model " 1742 | "with in-memory dataset.") 1743 | assert data.shape[1] == self.latent_space_size 1744 | with tf.compat.v1.Session(graph=self.graph) as sess: 1745 | self.saver.restore(sess, self.savepath) 1746 | if verbose: 1747 | print("Model restored.") 1748 | feed_data = data 1749 | minibatch_list = [] 1750 | for batch in self._batch_iter_zsample(feed_data, b_size): 1751 | feedin = {self.latent_inputs: batch} 1752 | y_batch = pd.DataFrame(sess.run(self.gen_from_z_inputs, 1753 | feed_dict=feedin), 1754 | columns=self.imputation_target.columns) 1755 | minibatch_list.append(y_batch) 1756 | generated_output = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 1757 | columns=self.imputation_target.columns) 1758 | return generated_output 1759 | 1760 | def inputs_to_z(self, 1761 | b_size=256, 1762 | verbose=True): 1763 | """ 1764 | Method used for transforming imputation_target into a latent representation 1765 | for analysis. Can be used for observing how data behaves in a lower dimensional 1766 | space, etc. 1767 | 1768 | Args: 1769 | m: Integer. Number of imputations to generate. 1770 | 1771 | b_size: Integer. Number of data entries to process at once. 
For managing 1772 | wider datasets, smaller numbers may be required. 1773 | 1774 | verbose: Boolean. Prints out messages. 1775 | 1776 | Returns: 1777 | Self, z_mu, z_log_sigma 1778 | """ 1779 | if not self.model_built: 1780 | raise AttributeError("The computation graph must be built before the model" 1781 | " can be trained") 1782 | if not self.vae_layer: 1783 | raise AttributeError("The model must include a VAE layer to be used to encode" 1784 | " the dataset into the latent space") 1785 | 1786 | if self.input_is_pipeline: 1787 | raise AttributeError("Model was constructed to accept pipeline data, either" 1788 | " use 'pipeline_yield_samples' method or rebuild model " 1789 | "with in-memory dataset.") 1790 | with tf.compat.v1.Session(graph=self.graph) as sess: 1791 | self.saver.restore(sess, self.savepath) 1792 | if verbose: 1793 | print("Model restored.") 1794 | feed_data = self.imputation_target.values 1795 | mu_list = [] 1796 | sigma_list = [] 1797 | for batch in self._batch_iter_output(feed_data, b_size): 1798 | if self.additional_data is not None: 1799 | feedin = {self.X: batch[0], self.X_add: batch[1]} 1800 | else: 1801 | feedin = {self.X: batch} 1802 | batch_mu, batch_sigma = sess.run(self.encode_to_z, 1803 | feed_dict=feedin) 1804 | batch_mu = pd.DataFrame(batch_mu) 1805 | batch_sigma = pd.DataFrame(batch_sigma) 1806 | mu_list.append(batch_mu) 1807 | sigma_list.append(batch_sigma) 1808 | x_mu = pd.concat(mu_list, ignore_index=True) 1809 | x_log_sigma = pd.concat(sigma_list, ignore_index=True) 1810 | return x_mu, x_log_sigma 1811 | 1812 | def change_imputation_target(self, new_target, additional_data=None): 1813 | """ 1814 | Helper method to allow for imputed dataset to be hotswapped. MIDAS is not 1815 | designed with such a function in mind, but this should allow for more flexible 1816 | workflows. 
1817 | """ 1818 | if type(self.imputation_target) != type(new_target): 1819 | raise ValueError("New target must be of same type as original target dataset") 1820 | if type(self.imputation_target) == pd.core.series.Series: 1821 | if self.imputation_target.name != new_target.name: 1822 | raise ValueError("Ensure input series are from same source") 1823 | elif type(self.imputation_target) == pd.core.frame.DataFrame: 1824 | test_1 = new_target.shape[1] == self.imputation_target.shape[1] 1825 | test_2 = new_target.columns.isin(self.imputation_target.columns).sum() \ 1826 | == new_target.shape[1] 1827 | if not test_1 & test_2: 1828 | raise ValueError("New target must have same columns as original target dataframe") 1829 | if self.additional_data is not None: 1830 | test_1 = new_target.shape[1] == self.additional_data.shape[1] 1831 | test_2 = additional_data.columns.isin(self.additional_data.columns).sum() \ 1832 | == additional_data.shape[1] 1833 | if not test_1 & test_2: 1834 | raise ValueError("New target must have same columns as original target dataframe") 1835 | else: 1836 | raise ValueError("Target must be Pandas dataframe or series") 1837 | self.imputation_target = new_target.copy() 1838 | if self.additional_data is not None: 1839 | self.additional_data = additional_data.copy() 1840 | self.additional_data.fillna(0, inplace=True) 1841 | self.na_matrix = self.imputation_target.notnull().astype(bool) 1842 | self.imputation_target.fillna(0, inplace=True) 1843 | return self 1844 | 1845 | 1846 | def combine(y_var, 1847 | X_vars, 1848 | df_list=None, 1849 | dof_adjust=True, 1850 | incl_constant=True, 1851 | **glm_args, 1852 | ): 1853 | """ 1854 | Function used to run a GLM model across multiple datasets, aggregating the 1855 | results using Rubin's combination rules -- i.e. multiple imputation analysis. 1856 | 1857 | This function regresses the outcome variable on a linear combination of 1858 | independent variables, given a user-specified model family and link function. 1859 | For example if y_var = 'y' and X_vars = ['x1','x2','x3'], then by default this 1860 | function estimates the model y = a + x1 + x2 + x3, where a is the constant term. 1861 | Note, the constant term is added by default, but can be excluded by setting 1862 | incl_constant = False. 1863 | 1864 | This function wraps statsmodels.GLM() and allows users to specify linear 1865 | models using GLM families including Gaussian, Binomial, and Poisson. 1866 | 1867 | The function can be called on the completed dataframes generated from a MIDAS 1868 | model or users can supply their own list of completed datasets to analyse. 1869 | 1870 | Args: 1871 | df_list: A list of pd.DataFrames. The M completed datasets to be analyzed. 1872 | 1873 | y_var: String. The name of the outcome variable. 1874 | 1875 | X_vars: List of strings. The names of the predictor variables. 1876 | 1877 | dof_adjust: Boolean. Indicates whether to apply the Barnard and Rubin (1999) 1878 | degrees of freedom adjustment for small-samples. 1879 | 1880 | incl_constant: Boolean. Indicates whether to include an intercept in the null model (the default in 1881 | most generalized linear model software packages). 1882 | 1883 | **glm_args: Further arguments to be passed to statsmodels.GLM(), e.g., to 1884 | specify model family, offsets, and variance and frequency weights (see the 1885 | statsmodels documentation for full details). If None, a Gaussian (ordinary 1886 | least squares) model will be estimated. 
1887 | 1888 | Returns: 1889 | DataFrame of combined model results """ 1890 | 1891 | ind_models = [] 1892 | mods_est = [] 1893 | mods_var = [] 1894 | m = len(df_list) 1895 | 1896 | for i in range(m): 1897 | df_mod = df_list[i] 1898 | df_endog = df_mod[y_var] 1899 | df_exog = df_mod[X_vars] 1900 | 1901 | if incl_constant: 1902 | df_exog = sm.add_constant(df_exog) 1903 | 1904 | ind_model = sm.GLM(df_endog, df_exog, **glm_args) 1905 | ind_results = ind_model.fit() 1906 | mods_est.append(ind_results.params) 1907 | mods_var.append(np.diag(ind_results.cov_params())) 1908 | 1909 | if i == 0: 1910 | mods_df_resid = ind_results.df_resid 1911 | mods_coef_names = ind_results.model.exog_names 1912 | 1913 | Q_bar = np.multiply((1 / m), np.sum(np.array(mods_est), 0)) 1914 | U_bar = np.multiply((1 / m), np.sum(np.array(mods_var), 0)) 1915 | 1916 | models_demean = list(map(lambda x: np.square(x - Q_bar), mods_est)) 1917 | 1918 | B = np.multiply(1 / (m - 1), np.sum(np.array(models_demean), 0)) 1919 | 1920 | Q_bar_var = U_bar + ((1 + (1 / m)) * B) 1921 | Q_bar_se = np.sqrt(Q_bar_var) 1922 | 1923 | v_m = (m - 1) * np.square(1 + (U_bar / ((1 + m ** (-1)) * B))) 1924 | 1925 | if dof_adjust: 1926 | 1927 | v_complete = mods_df_resid 1928 | 1929 | gamma = ((1 + m ** (-1)) * B) / Q_bar_var 1930 | 1931 | v_obs = ((v_complete + 1) / (v_complete + 3)) * v_complete * (1 - gamma) 1932 | 1933 | v_corrected = ((1 / v_m) + (1 / v_obs)) ** (-1) 1934 | 1935 | dof = v_corrected 1936 | 1937 | else: 1938 | 1939 | dof = v_m 1940 | 1941 | est = Q_bar 1942 | std_err = Q_bar_se 1943 | stat = est / std_err 1944 | 1945 | combined_mat = {'term': mods_coef_names, 1946 | 'estimate': est, 1947 | 'std.error': std_err, 1948 | 'statistic': stat, 1949 | 'df': dof, 1950 | 'p.value': (2 * (1 - stats.t.cdf(abs(stat), df=dof)))} 1951 | 1952 | return pd.DataFrame(combined_mat) 1953 | 1954 | 1955 | def binary_conv(x): 1956 | """ 1957 | Convenience function used to convert a binary column vector of data to 1958 | 1/0 encoding. 1959 | 1960 | Args: 1961 | x: pd.Series. An indexable array containing only two unique values. 1962 | 1963 | Returns: 1964 | A pd.Series the same length as x, with 0s and 1s corresponding to the first 1965 | and unique values in x respectively. """ 1966 | 1967 | labs = x.unique()[~pd.isnull(x.unique())] 1968 | x = np.where(x == labs[0], 0, x) 1969 | x = np.where(x == labs[1], 1, x) 1970 | x = np.where(pd.isnull(x), np.NaN, x) 1971 | 1972 | return x 1973 | 1974 | 1975 | def cat_conv(cat_data): 1976 | """ 1977 | Convenience function used to one-hot encode a categorical column in a panda 1978 | dataframe. 1979 | 1980 | Args: 1981 | cat_data: A pd.DataFrame. A dataframe containing only categorical columns to be 1982 | one-hot encoded. 1983 | 1984 | Returns: 1985 | cat_construct: pd.DataFrame. A one-hot encoded version of the input data. 1986 | cat_col_names: List of lists. 
Nested list of the one-hot encoded variable names, 1987 | that can be passed into the MIDASpy .build_model() function.""" 1988 | 1989 | cat_col_names = [] 1990 | 1991 | cat_construct = [] 1992 | 1993 | for column in cat_data.columns: 1994 | na_temp = cat_data[column].isnull() 1995 | temp = pd.get_dummies(cat_data[column], prefix=column, dtype=np.uint8) 1996 | temp[na_temp] = np.nan 1997 | cat_construct.append(temp) 1998 | cat_col_names.append(list(temp.columns.values)) 1999 | 2000 | cat_construct = pd.concat(cat_construct, axis=1) 2001 | return cat_construct, cat_col_names 2002 | -------------------------------------------------------------------------------- /midas_functions.md: -------------------------------------------------------------------------------- 1 | # Guide to the methods and arguments of MIDAS 2 | 3 | Model construction first requires an instantiation of MIDAS. The model then needs to be constructed and trained before imputations can be generated. Calibration is optional, but strongly recommended. 4 | 5 | This class doesn't explicitly return values: values are either stored internally, saved to file, or yielded rather than returned. The key attribute is .output_list, which stores the completed datasets once samples have been generated. 6 | 7 | #### Instantiation: 8 | 9 | - Midas() 10 | 11 | #### Model construction: 12 | 13 | - .build_model() 14 | - .build_model_pipeline() 15 | 16 | #### Model calibration: 17 | 18 | - .overimpute() 19 | 20 | #### Model training: 21 | 22 | - .train_model() 23 | - .train_model_pipeline() 24 | 25 | #### Imputation generation: 26 | 27 | - .batch_generate_samples() 28 | - .batch_yield_samples() 29 | - .generate_samples() 30 | - .yield_samples() 31 | - .yield_samples_pipeline() 32 | 33 | --- 34 | 35 | ### Midas() 36 | 37 | - layer_structure= \[256, 256, 256\] 38 | - learn_rate= 1e-4 39 | - input_drop= 0.8 40 | - train_batch = 16 41 | - savepath= 'tmp/MIDAS' 42 | - seed= None 43 | - loss_scale= 1 44 | - init_scale= 1 45 | - softmax_adj= 1 46 | 47 | Initialiser. Called separately to 'build_model' to allow for out-of-memory datasets. All key hyperparameters are entered at this stage, as the model construction methods only deal with the dataset. 48 | 49 | #### Args: 50 | - **layer_structure:** List of integers. The number of nodes in each layer of the network (default = [256, 256, 256], denoting a three-layer network with 256 nodes per layer). Larger networks can learn more complex data structures but require longer training and are more prone to overfitting. 51 | 52 | - **learn_rate:** Float. The learning rate $\gamma$ (default = 0.0001), which controls the size of the weight adjustment in each training epoch. In general, higher values reduce training time at the expense of less accurate results. 53 | 54 | - **input_drop:** Float between 0 and 1. The probability of corruption for input columns in training mini-batches (default = 0.8). Higher values increase training time but reduce the risk of overfitting. In our experience, values between 0.7 and 0.95 deliver the best performance. 55 | 56 | - **train_batch:** Integer. The number of observations in training mini-batches (default = 16). Common choices are 8, 16, 32, 64, and 128; powers of 2 tend to enhance memory efficiency. In general, smaller sizes lead to faster convergence at the cost of greater noise and thus less accurate estimates of the error gradient. Where memory management is a concern, they should be favored. 57 | 58 | - **savepath:** String. The location to which the trained model will be saved.
59 | 60 | - **seed:** Integer. The value to which Python's pseudo-random number generator is initialized. This enables users to ensure that data shuffling, weight and bias initialization, and missingness indicator vectors are reproducible. 61 | 62 | - **loss_scale:** Float. A constant by which the RMSE loss functions are multiplied (default = 1). This hyperparameter performs a similar function to the learning rate. If loss during training is very large, increasing its value can help to prevent overtraining. 63 | 64 | - **init_scale:** Float. The numerator of the variance component of Xavier Initialisation equation (default = 1). In very deep networks, higher values may help to prevent extreme gradients (though this problem is less common with ELU activation functions). 65 | 66 | - **softmax_adj:** Float. A constant by which the cross-entropy loss functions are multiplied (default = 1). This hyperparameter is the equivalent of loss_scale for categorical variables. If cross-entropy loss falls at a consistently faster rate than RMSE during training, a lower value may help to redress this imbalance. 67 | 68 | - **vae_layer:** Boolean. Specifies whether to include a variational autoencoder layer in the network (default = False), one of the key diagnostic tools included in midas. If set to True, variational autoencoder hyperparameters must be specified via a number of additional arguments. 69 | 70 | - **latent_space_size:** Integer. The number of normal dimensions used to parameterize the latent space. 71 | 72 | - **vae_sample_var:** Float. The sampling variance of the normal distributions used to parameterize the latent space. 73 | 74 | - **vae_alpha:** Float. The strength of the prior imposed on the Kullback-Leibler divergence term in the variational autoencoder loss functions. 75 | 76 | - **kld_min:** Float. The minimum value of the Kullback-Leibler divergence term in the variational autoencoder loss functions. 77 | 78 | --- 79 | 80 | ### .build_model() 81 | 82 | - imputation_target 83 | - categorical_columns= None 84 | - softmax_columns= None 85 | - unsorted= True 86 | - additional_data = None 87 | - verbose= True 88 | 89 | This method is called to construct the neural network that is the heart of MIDAS. This includes the assignment of loss functions to the appropriate data types. 90 | 91 | THIS FUNCTION MUST BE CALLED BEFORE ANY TRAINING OR IMPUTATION OCCURS. Failing to do so will simply raise an error. 92 | 93 | The categorical columns should be a list of column names. Softmax columns should be a list of lists of column names. This will allow the model to dynamically assign cost functions to the correct variables. If, however, the data comes pre-sorted, 'arranged' can be set to "True", in which case the arguments can be passed in as integers of size, ie. shape[1] attributes for each of the relevant categories. 94 | 95 | In other words, if you're experienced at using MIDAS and understand how its indexing works, pre-sort your data and pass in the integers so specifying reindexing values doesn't become too onerous. 96 | 97 | Alternatively, list(df.columns.values) will output a list of column names, which can be easily implemented in the 'for' loop which constructs your dummy variables. 98 | 99 | #### Args: 100 | - **imputation_target:** DataFrame. The name of the incomplete input dataset. Upon being read in, the dataset will be appropriately formatted and stored for training. 101 | 102 | - **binary_columns:** List of names. A list of all binary variables in the input dataset. 
103 | 104 | - **softmax_columns:** List of lists. The outer list should include all non-binary categorical variables in the input dataset. Each inner list should contain the mutually exclusive set of possible classes for each of these variables. 105 | 106 | - **unsorted:** Boolean. Specifies whether the input dataset has been pre-ordered in terms of variable type (default = True, denoting no sorting). If set to False, binary_columns and softmax_columns should be a list of integers denoting shape attributes for each category. 107 | 108 | - **additional_data:** DataFrame. Data that should be included in the imputation model but are not required for later analyses. Such data will not be formatted, rearranged, or included in the loss functions, reducing training time. 109 | 110 | - **verbose:** Boolean. Specifies whether to print messages to the terminal (default = True). 111 | 112 | --- 113 | 114 | ### .build_model_pipeline() 115 | 116 | - data_sample 117 | - categorical_columns= None 118 | - softmax_columns= None 119 | - unsorted= True 120 | - additional_data_sample= None 121 | - verbose= True 122 | - crossentropy_adj= 1 123 | - loss_scale = 1 124 | 125 | This function is for integration with databasing or any dataset that needs to be batched into memory. The data sample is simply there to allow the original constructor to be recycled. The head of the data should be sufficient to build the imputation model. The input pipeline itself should pre-scale the data, and code null values as type np.nan. The pipeline ought to output a Pandas DataFrame. If additional data will be passed in, then the return must be a list of two DataFrames. The columns of the dataframe will be re-arranged so that error functions are efficiently generated. 126 | 127 | IT IS IMPERATIVE that this ordering is respected. Design the input batching function accordingly. 128 | 129 | The categorical columns should be a list of column names. Softmax columns should be a list of lists of column names. This will allow the model to dynamically assign cost functions to the correct variables. If, however, the data comes pre-sorted, arranged can be set to "true", in which case the arguments can be passed in as integers of size, ie. shape[1] attributes for each of the relevant categories. 130 | 131 | In other words, pre-sort your data and pass in the integers, so indexing dynamically doesn't become too difficult. Alternatively, list(df.columns.values) will output a list of column names, which can be easily implemented in the 'for' loop which constructs your dummy variables. 132 | 133 | #### Args: 134 | - **data_sample:** DataFrame. The head of the data that will be fed in via a batching pipeline. This sample is just used to enforce indexing and to allow code recyling. 135 | 136 | - **categorical_columns:** List of names. Specifies the binary (ie. non-exclusive categories) to be imputed. If unsorted = False, this value can be an integer 137 | 138 | - **softmax_columns:** List of lists. Every inner list should contain column names. Each inner list should represent a set of mutually exclusive categories, such as current day of the week. if unsorted = False, this should be a list of integers. 139 | 140 | - **unsorted:** Boolean. Specifies to MIDAS that data has been pre-sorted, and indices can simply be appended to the size index. 141 | 142 | - **additional_data:** DataFrame. Any data that shoud be included in the imputation model, but is not required from the output. 
By passing data here, the data will neither be rearranged nor will it generate a cost function. This reduces the regularising effects of multiple loss functions, but reduces both networksize requirements and training time. 143 | 144 | - **verbose:** Boolean. Set to False to suppress messages printing to terminal. 145 | 146 | --- 147 | 148 | ### .overimpute() 149 | 150 | - spikein = 0.1 151 | - training_epochs= 100 152 | - report_ival = 10 153 | - report_samples = 32 154 | - plot_all= True 155 | - verbose= True 156 | - verbosity_ival= 1 157 | - spike_seed= 42 158 | - excessive= False 159 | 160 | This function spikes in additional missingness, so that known values can be used to help adjust the complexity of the model. As conventional train/validation splits can still lead to autoencoders overtraining, the method for limiting complexity is overimputation and early stopping. This gives an estimate of how the model will react to unseen variables. 161 | 162 | Error is defined as RMSE for continuous variables, and classification error for binary and categorical variables (ie. 1 - accuracy). Note that this means that binary classification is inherently dependent on a selection threshold of 0.5, and softmax accuracy will naturally decrease as a function of the number of classes within the model. All three will be affected by the degree of imbalance within the dataset. 163 | 164 | The accuracy measures provided here may not be ideal for all problems, but they are generally appropriate for selecting optimum complexity. Should the lines denoting error begin to trend upwards, this indicates overtraining and is a sign that the training_epochs parameter to the .train_model() method should be capped before this point. 165 | 166 | The actual optimal point may differ from that indicated by the .overimpute() method for two reasons: 167 | - The loss that is spiked in reduces the overall data available to the algorithm to learn the patterns inherent, so there should be some improvement in performance when .train_model() is called. If this is a concern, then it should be possible to compare the behaviour of the loss figure between .train_model() and .overimpute(). 168 | - The missingness inherent to the data may depend on some unobserved factor. 169 | In this case, the bias in the observed data may lead to inaccurate inference. 170 | 171 | It is worth visually inspecting the distribution of the overimputed values against imputed values (using plot_all) to ensure that they fall within a sensible range. 172 | 173 | #### Args: 174 | 175 | - **spikein:** Float, between 0 and 1. The proportion of observed values in the input dataset to be randomly removed (default = 0.1). 176 | 177 | - **training_epochs:** Integer. The number of overimputation training epochs (default = 100). Selecting a low value increases the risk that trends in the loss metrics have not stabilized by the end of training, in which case additional epochs may be necessary. 178 | 179 | - **report_ival:** Integer. The number of overimputation training epochs between calculations of loss (default = 10). Shorter intervals provide a more granular view of model performance but slow down the overimputation process. 180 | 181 | - **report_samples:** The number of Monte Carlo samples drawn from the estimated missing-data posterior for loss calculations (default = 32). A larger number increases overimputation runtime and may thus necessitate a lower value of report_ival. 182 | 183 | - **plot_vars:** Boolean. 
Specifies whether to plot the distribution of original versus overimputed values (default = True). This takes the form of a density plot for continuous variables and a barplot for categorical variables (showing proportions of each class). 184 | 185 | - **plot_main:** Boolean. Specifies whether to display the main graphical output (overimputation error during training) at every reporting interval (default = True). If set to False, it will only appear at the end of the overimputation training process. Error values are still shown at each report_ival. 186 | 187 | - **skip_plot:** Boolean. Specifies whether to suppress the main graphical output (default = False). This may be desirable when users are conducting multiple overimputation exercises sequentially and are primarily interested in the console output. 188 | 189 | - **verbose:** Boolean. Prints out messages, including loss, to the terminal (default = True). 190 | 191 | - **verbosity_ival:** Integer. The number of overimputation training epochs between messages (default = 1). 192 | 193 | - **spike_seed:** Integer. The value to which Python's pseudo-random number generator is initialized for the missingness spike-in. This is separate to the seed specified in the Midas() call. 194 | 195 | - **excessive:** Boolean. Specifies whether to print aggregate mini-batch loss to the terminal (default = False). This argument differs from the .train\_model()'s excessive argument, which prints individual mini-batch loss. This allows users to check for unusual imputations, which may be helpful if loss is not declining during overimputation training. 196 | 197 | --- 198 | 199 | ### .train_model() 200 | 201 | - training_epochs= 100 202 | - verbose= True 203 | - verbosity_ival= 1 204 | - excessive= False 205 | 206 | This is the standard method for optimising the model's parameters. Must be called before imputation can be performed. The model is automatically saved upon conclusion of training. 207 | 208 | #### Args: 209 | 210 | - **training_epochs:** Integer. The number of complete cycles (forward passes) through the network during training (default = 100). 211 | 212 | - **verbose:** Boolean. Specifies whether to print messages to the terminal during training, including loss values (default = True). 213 | 214 | - **verbosity_ival:** Integer. The number of training epochs between messages (default = 1). 215 | 216 | - **excessive:** Boolean. Specifies whether to print loss for each mini-batch to the terminal (default = False), which can help with troubleshooting. 217 | 218 | --- 219 | 220 | ### .train_model_pipeline() 221 | 222 | - input_pipeline 223 | - training_epochs= 100 224 | - verbose= True 225 | - verbosity_ival= 1 226 | - excessive= False 227 | 228 | This is the alternative method for optimising the model's parameters when input data must be batched into memory. Must be called before imputation can be performed. The model will then be saved to the specified directory. 229 | 230 | #### Args: 231 | 232 | - **input_pipeline:** Function which yields a pre-processed and scaled DataFrame from the designated source, be it a server or large flat file. An illustrative pipeline is sketched below. 233 | 234 | - **training_epochs:** Integer. The number of epochs the model will run for (default = 100). 235 | 236 | - **verbose:** Boolean. Prints out messages, including loss (default = True). 237 | 238 | - **verbosity_ival:** Integer. This number determines the interval between messages (default = 1). 239 | 240 | - **excessive:** Boolean. Used for troubleshooting, this argument will cause the cost of each minibatch to be printed to the terminal (default = False).
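To make these requirements concrete, the sketch below shows one way to supply a pipeline. It is illustrative only: the file name `preprocessed_data.csv` is hypothetical, and the example assumes the data are entirely continuous, already scaled, and code missing values as np.nan (categorical data would additionally need one-hot encoding and a softmax_columns argument). Because .train_model_pipeline() iterates over the pipeline once per training epoch, the object passed in should be re-iterable rather than a single-use generator.

```python
import pandas as pd
import MIDASpy as md

class CSVPipeline:
    """Re-iterable pipeline: .train_model_pipeline() loops over the data once
    per epoch, so __iter__ re-opens the file each time instead of exhausting
    a single generator."""

    def __init__(self, path, chunksize=1024):
        self.path = path
        self.chunksize = chunksize

    def __iter__(self):
        # Each chunk is a DataFrame whose columns are assumed to be pre-scaled
        # and whose missing entries are coded as np.nan.
        return iter(pd.read_csv(self.path, chunksize=self.chunksize))

pipeline = CSVPipeline("preprocessed_data.csv")  # hypothetical pre-processed file

data_sample = next(iter(pipeline))               # head of the data, used only to build the graph
imputer = md.Midas(layer_structure=[256, 256], seed=89)
imputer.build_model_pipeline(data_sample)
imputer.train_model_pipeline(pipeline, training_epochs=20)

for completed_df in imputer.yield_samples_pipeline(verbose=False):
    pass  # analyse each completed DataFrame here
```
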
241 | 242 | ---- 243 | 244 | ### .batch_generate_samples() 245 | 246 | - m= 50 247 | - b_size= 256 248 | - verbose= True 249 | 250 | Method used to generate a set of m imputations to the .output_list attribute. Imputations are stored within a list in memory, and can be accessed in any order. As batch generation implies very large datasets, this method is only provided for internal troubleshooting. 251 | 252 | This function is for a dataset large enough to be stored in memory, but too large to be passed into the model in its entirety. This may be due to GPU memory limitations, or just the size of the model 253 | 254 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 255 | 256 | #### Args: 257 | - **m:** Integer. Number of imputations to generate. 258 | 259 | - **b_size:** Integer. Number of data entries to process at once. For managing wider datasets, smaller numbers may be required. 260 | 261 | - **verbose:** Boolean. Prints out messages. 262 | 263 | --- 264 | 265 | ### .batch_yield_samples() 266 | 267 | - m= 50 268 | - b_size= 256 269 | - verbose= True 270 | 271 | Method used to generate a set of m imputations via the 'yield' command, allowing imputations to be used in a 'for' loop' 272 | 273 | This function is for a dataset large enough to be stored in memory, but too large to be passed into the model in its entirety. This may be due to GPU memory limitations, or just the size of the model or dataset. 274 | 275 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 276 | 277 | #### Args: 278 | - **m:** Integer. Number of imputations to generate. 279 | 280 | - **b_size:** Integer. Number of data entries to process at once. For managing wider datasets, smaller numbers may be required. 281 | 282 | - **verbose:** Boolean. Prints out messages. 283 | 284 | --- 285 | 286 | ### .generate_samples() 287 | 288 | - m= 50 289 | - verbose= True 290 | 291 | Method used to generate a set of m imputations to the .output_list attribute. Imputations are stored within a list in memory, and can be accessed in any order. 292 | 293 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 294 | 295 | #### Args: 296 | - **m:** Integer. The number of completed datasets to produce (default = 50) 297 | 298 | - **verbose:** Boolean. Specifies whether to print messages to the terminal (default = True). 299 | 300 | --- 301 | 302 | ### .yield_samples() 303 | 304 | - m= 50 305 | - verbose= True 306 | 307 | Method used to generate a set of m imputations via the 'yield' command, allowing imputations to be used in a 'for' loop. 308 | 309 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 310 | 311 | #### Args: 312 | 313 | - **m:** Integer. Number of imputations to generate. 314 | 315 | - **verbose:** Boolean. Prints out messages. 316 | 317 | --- 318 | 319 | ### .yield_samples_pipeline() 320 | 321 | - verbose= False 322 | 323 | As it's impossible to know the specifics of the pipeline, this method simply cycles through all data provided by the input function. 
The number of imputations can be specified by the user, depending on their needs. The size of the output DataFrame depends on the size specified by the input function that was passed to 'train_model_pipeline'. 324 | 325 | #### Args: 326 | 327 | - **verbose: Prints out messages 328 | 329 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | 3 | [pytest] 4 | python_files = *.py 5 | addopts = --ignore=setup.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | from setuptools import setup, find_packages 4 | from pathlib import Path 5 | 6 | this_directory = Path(__file__).parent 7 | long_description = (this_directory / "README.md").read_text(encoding="utf-8") 8 | 9 | if sys.version_info < (3, 6): 10 | sys.exit("Sorry, Python 3.5 is no longer supported. Please use Python versions from 3.6 to 3.10") 11 | 12 | install_requires = ['numpy>=1.5,<=1.26.4', 'scikit-learn', 'matplotlib', 'pandas>=0.19', 'tensorflow_addons<0.20', 'statsmodels', 'scipy'] 13 | if sys.version_info >= (3, 8) and sys.version_info < (3, 11): 14 | install_requires.append('tensorflow<2.12.0; sys_platform != "darwin" or platform_machine != "arm64"') 15 | install_requires.append('tensorflow-macos<2.12.0; sys_platform == "darwin" and platform_machine == "arm64"') 16 | else: 17 | install_requires.append('tensorflow>=1.10; sys_platform != "darwin" or platform_machine != "arm64"') 18 | install_requires.append('tensorflow-macos>=1.10; sys_platform == "darwin" and platform_machine == "arm64"') 19 | 20 | setup( 21 | name='MIDASpy', 22 | packages=['MIDASpy'], 23 | version='1.4.0', 24 | license='Apache', 25 | description='Multiple Imputation with Denoising Autoencoders', 26 | long_description_content_type='text/markdown', 27 | long_description=long_description, 28 | url='http://github.com/MIDASverse/MIDASpy', 29 | project_urls={ 30 | 'Method article': 'https://doi.org/10.1017/pan.2020.49', 31 | 'Software article': 'https://doi.org/10.18637/jss.v107.i09', 32 | 'Source': 'https://github.com/MIDASverse/MIDASpy', 33 | 'Issues': 'https://github.com/MIDASverse/MIDASpy/issues', 34 | }, 35 | author='Ranjit Lall, Alex Stenlake, and Thomas Robinson', 36 | author_email='R.Lall@lse.ac.uk', 37 | python_requires='>=3.6, <3.11', 38 | install_requires=install_requires, 39 | keywords=['multiple imputation', 'neural networks', 'tensorflow'], 40 | extras_require={'test': ['pytest','matplotlib']}, 41 | 42 | classifiers=[ 43 | 'Development Status :: 5 - Production/Stable', 44 | 'Intended Audience :: Science/Research', 45 | 'Topic :: Scientific/Engineering', 46 | 'License :: OSI Approved :: Apache Software License', 47 | 'Programming Language :: Python :: 3', 48 | 'Programming Language :: Python :: 3.6', 49 | 'Programming Language :: Python :: 3.7', 50 | 'Programming Language :: Python :: 3.8', 51 | 'Programming Language :: Python :: 3.9', 52 | 'Programming Language :: Python :: 3.10', 53 | ], 54 | ) 55 | -------------------------------------------------------------------------------- /tests/test_midas.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | from sklearn.preprocessing import MinMaxScaler 5 | import sys 6 | import os 7 | import csv 8 | import MIDASpy as md 9 | 10 | def test_some_functionality(): 11 | # Load the data 12 | np.random.seed(441) 13 | data_path = os.path.join(os.path.dirname(__file__), "test_data", "adult_data.csv") 14 | data_0 = pd.read_csv(data_path) 15 | data_0.columns.str.strip() 16 | 17 | def spike_in_generation(data): 18 | spike_in = pd.DataFrame(np.zeros_like(data), columns= data.columns) 19 | for column in data.columns: 20 | subset = np.random.choice(data[column].index[data[column].notnull()], 5000, replace= False) 21 | spike_in.loc[subset, column] = 1 22 | return spike_in 23 | 24 | spike_in = spike_in_generation(data_0) 25 | original_value = data_0.loc[4, 'hours_per_week'] 26 | data_0[spike_in == 1] = np.nan 27 | 28 | categorical = ['workclass','marital_status','relationship','race','class_labels','sex','education','occupation','native_country'] 29 | data_cat, cat_cols_list = md.cat_conv(data_0[categorical]) 30 | 31 | data_0.drop(categorical, axis = 1, inplace = True) 32 | constructor_list = [data_0] 33 | constructor_list.append(data_cat) 34 | data_in = pd.concat(constructor_list, axis=1) 35 | 36 | na_loc = data_in.isnull() 37 | data_in[na_loc] = np.nan 38 | 39 | imputer = md.Midas(layer_structure = [256,256], vae_layer = False, seed = 89, input_drop = 0.75) 40 | imputer.build_model(data_in, softmax_columns = cat_cols_list) 41 | imputer.train_model(training_epochs = 2) 42 | 43 | imputations = imputer.generate_samples(m=2).output_list 44 | model = md.combine(y_var = "capital_gain", X_vars = ["education_num","age"], df_list = imputations) 45 | -------------------------------------------------------------------------------- /tmp/MIDAS.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/tmp/MIDAS.data-00000-of-00001 -------------------------------------------------------------------------------- /tmp/MIDAS.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/tmp/MIDAS.index -------------------------------------------------------------------------------- /tmp/MIDAS.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/tmp/MIDAS.meta -------------------------------------------------------------------------------- /tmp/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "MIDAS" 2 | all_model_checkpoint_paths: "MIDAS" 3 | --------------------------------------------------------------------------------