├── .github └── workflows │ ├── testlinux.yml │ ├── testmacos.yml │ └── testwindows.yml ├── .gitignore ├── Examples ├── .Rhistory ├── adult_data.csv ├── cces_jss_format.csv ├── midas_demo.ipynb ├── midaspy_demo_cces.ipynb └── tmp │ ├── MIDAS.data-00000-of-00001 │ ├── MIDAS.index │ ├── MIDAS.meta │ └── checkpoint ├── LICENSE.txt ├── MIDASpy ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-312.pyc │ ├── __init__.cpython-38.pyc │ ├── midas_base.cpython-312.pyc │ └── midas_base.cpython-38.pyc └── midas_base.py ├── MIDASpy_logo.png ├── README.md ├── build └── lib │ └── MIDASpy │ ├── __init__.py │ └── midas_base.py ├── midas_functions.md ├── pytest.ini ├── setup.cfg ├── setup.py ├── tests ├── test_data │ └── adult_data.csv └── test_midas.py └── tmp ├── MIDAS.data-00000-of-00001 ├── MIDAS.index ├── MIDAS.meta └── checkpoint /.github/workflows/testlinux.yml: -------------------------------------------------------------------------------- 1 | name: CI-Linux 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | jobs: 14 | test: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest] 19 | python-version: ["3.7", "3.8", "3.9", "3.10"] 20 | steps: 21 | - name: Check out code 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -e .[test] 33 | 34 | - name: Run tests 35 | run: | 36 | pytest 37 | -------------------------------------------------------------------------------- /.github/workflows/testmacos.yml: -------------------------------------------------------------------------------- 1 | name: CI-macOS 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | - develop 9 | pull_request: 10 | branches: 11 | - main 12 | - master 13 | - develop 14 | 15 | jobs: 16 | test: 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | include: 22 | - os: macos-11 23 | python-version: "3.7" 24 | - os: macos-latest 25 | python-version: "3.8" 26 | - os: macos-latest 27 | python-version: "3.9" 28 | - os: macos-latest 29 | python-version: "3.10" 30 | steps: 31 | - name: Check out code 32 | uses: actions/checkout@v4 33 | 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | 39 | - name: Install dependencies 40 | run: | 41 | python -m pip install --upgrade pip 42 | pip install -e .[test] 43 | - name: Run tests 44 | run: | 45 | pytest 46 | -------------------------------------------------------------------------------- /.github/workflows/testwindows.yml: -------------------------------------------------------------------------------- 1 | name: CI-Windows 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | jobs: 14 | test: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [windows-latest] 19 | python-version: ["3.7", "3.8", "3.9", "3.10"] 20 | steps: 21 | - name: Check out code 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install 
--upgrade pip 32 | pip install -e .[test] 33 | - name: Run tests 34 | run: | 35 | pytest 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | MIDASpy.egg-info -------------------------------------------------------------------------------- /Examples/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/.Rhistory -------------------------------------------------------------------------------- /Examples/midas_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# MIDASpy demonstration" 7 | ], 8 | "metadata": {} 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "source": [ 13 | "This notebook provides a brief demonstration of **MIDASpy**'s core functionalities. We show how to use the package to impute missing values in the [Adult census dataset](https://github.com/MIDASverse/MIDASpy/blob/master/Examples/adult_data.csv) (which is commonly used for benchmarking machine learning tasks).\n", 14 | "\n", 15 | "Users of **MIDASpy** must have **TensorFlow** installed as a **pip** package in their Python environment. **MIDASpy** is compatible with both **TensorFlow** 1.X and **TensorFlow** >= 2.2 versions.\n", 16 | "\n\nOnce these packages are installed, users can import the dependencies and load the data:" 17 | ], 18 | "metadata": {} 19 | }, 20 | { 21 | "cell_type": "code", 22 | "source": [ 23 | "from sklearn.preprocessing import MinMaxScaler\n", 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import tensorflow as tf\n", 27 | "import MIDASpy as md\n", 28 | "\n", 29 | "data_0 = pd.read_csv('adult_data.csv')\n", 30 | "data_0.columns.str.strip()" 31 | ], 32 | "outputs": [ 33 | { 34 | "output_type": "execute_result", 35 | "execution_count": 1, 36 | "data": { 37 | "text/plain": [ 38 | "Index(['Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education',\n", 39 | " 'education_num', 'marital_status', 'occupation', 'relationship', 'race',\n", 40 | " 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',\n", 41 | " 'native_country', 'class_labels'],\n", 42 | " dtype='object')" 43 | ] 44 | }, 45 | "metadata": {} 46 | } 47 | ], 48 | "execution_count": 1, 49 | "metadata": {} 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "source": [ 54 | "As the Adult dataset has very little missingness, we randomly set 5,000 observed values as missing in each column:" 55 | ], 56 | "metadata": {} 57 | }, 58 | { 59 | "cell_type": "code", 60 | "source": [ 61 | "np.random.seed(441)\n", 62 | "\n", 63 | "def spike_in_generation(data):\n", 64 | " spike_in = pd.DataFrame(np.zeros_like(data), columns= data.columns)\n", 65 | " for column in data.columns:\n", 66 | " subset = np.random.choice(data[column].index[data[column].notnull()], 5000, replace= False)\n", 67 | " spike_in.loc[subset, column] = 1\n", 68 | " return spike_in\n", 69 | "\n", 70 | "spike_in = spike_in_generation(data_0)\n", 71 | "original_value = data_0.loc[4, 'hours_per_week']\n", 72 | "data_0[spike_in == 1] = np.nan" 73 | ], 74 | "outputs": [], 75 | "execution_count": 2, 76 | "metadata": {} 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "source": [ 81 | "Next, we list categorical variables in a vector and one-hot encode them using **MIDASpy**'s inbuilt 
preprocessing function `cat_conv`, which returns both the encoded data and a nested list of categorical column names we can pass to the imputation algorithm. To construct the final, pre-processed data we append the one-hot encoded categorical data to the non-cateogrical data, and replace null values with `np.nan` values:" 82 | ], 83 | "metadata": {} 84 | }, 85 | { 86 | "cell_type": "code", 87 | "source": [ 88 | "categorical = ['workclass','marital_status','relationship','race','class_labels','sex','education','occupation','native_country']\n", 89 | "data_cat, cat_cols_list = md.cat_conv(data_0[categorical])\n", 90 | "\n", 91 | "data_0.drop(categorical, axis = 1, inplace = True)\n", 92 | "constructor_list = [data_0]\n", 93 | "constructor_list.append(data_cat)\n", 94 | "data_in = pd.concat(constructor_list, axis=1)\n", 95 | "\n", 96 | "na_loc = data_in.isnull()\n", 97 | "data_in[na_loc] = np.nan" 98 | ], 99 | "outputs": [], 100 | "execution_count": 3, 101 | "metadata": {} 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "source": [ 106 | "To visualize the results:\n" 107 | ], 108 | "metadata": {} 109 | }, 110 | { 111 | "cell_type": "code", 112 | "source": [ 113 | "print(data_in.head())" 114 | ], 115 | "outputs": [ 116 | { 117 | "output_type": "stream", 118 | "name": "stdout", 119 | "text": [ 120 | " Unnamed: 0 age fnlwgt education_num capital_gain capital_loss \\\n", 121 | "0 0.0 39.0 77516.0 13.0 2174.0 0.0 \n", 122 | "1 1.0 50.0 83311.0 13.0 0.0 0.0 \n", 123 | "2 2.0 38.0 215646.0 9.0 0.0 0.0 \n", 124 | "3 3.0 53.0 234721.0 NaN 0.0 0.0 \n", 125 | "4 4.0 28.0 NaN 13.0 0.0 NaN \n", 126 | "\n", 127 | " hours_per_week workclass_Federal-gov workclass_Local-gov \\\n", 128 | "0 40.0 0.0 0.0 \n", 129 | "1 13.0 0.0 0.0 \n", 130 | "2 40.0 0.0 0.0 \n", 131 | "3 40.0 0.0 0.0 \n", 132 | "4 NaN 0.0 0.0 \n", 133 | "\n", 134 | " workclass_Never-worked ... native_country_Portugal \\\n", 135 | "0 0.0 ... 0.0 \n", 136 | "1 0.0 ... 0.0 \n", 137 | "2 0.0 ... 0.0 \n", 138 | "3 0.0 ... 0.0 \n", 139 | "4 0.0 ... 0.0 \n", 140 | "\n", 141 | " native_country_Puerto-Rico native_country_Scotland native_country_South \\\n", 142 | "0 0.0 0.0 0.0 \n", 143 | "1 0.0 0.0 0.0 \n", 144 | "2 0.0 0.0 0.0 \n", 145 | "3 0.0 0.0 0.0 \n", 146 | "4 0.0 0.0 0.0 \n", 147 | "\n", 148 | " native_country_Taiwan native_country_Thailand \\\n", 149 | "0 0.0 0.0 \n", 150 | "1 0.0 0.0 \n", 151 | "2 0.0 0.0 \n", 152 | "3 0.0 0.0 \n", 153 | "4 0.0 0.0 \n", 154 | "\n", 155 | " native_country_Trinadad&Tobago native_country_United-States \\\n", 156 | "0 0.0 1.0 \n", 157 | "1 0.0 1.0 \n", 158 | "2 0.0 1.0 \n", 159 | "3 0.0 1.0 \n", 160 | "4 0.0 0.0 \n", 161 | "\n", 162 | " native_country_Vietnam native_country_Yugoslavia \n", 163 | "0 0.0 0.0 \n", 164 | "1 0.0 0.0 \n", 165 | "2 0.0 0.0 \n", 166 | "3 0.0 0.0 \n", 167 | "4 0.0 0.0 \n", 168 | "\n", 169 | "[5 rows x 108 columns]\n" 170 | ] 171 | } 172 | ], 173 | "execution_count": 4, 174 | "metadata": {} 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "source": [ 179 | "The data are now ready to be fed into the imputation algorithm, which involves three steps. First, we specify the dimensions, input corruption proportion, and other hyperparameters of the MIDAS neural network. Second, we build a MIDAS model based on the data. The vector of one-hot-encoded column names should be passed to the softmax_columns argument, as MIDAS employs a softmax final-layer activation function for categorical variables. 
Third, we train the model on the data, setting the number of training epochs as 20 in this example:" 180 | ], 181 | "metadata": {} 182 | }, 183 | { 184 | "cell_type": "code", 185 | "source": [ 186 | "imputer = md.Midas(layer_structure = [256,256], vae_layer = False, seed = 89, input_drop = 0.75)\n", 187 | "imputer.build_model(data_in, softmax_columns = cat_cols_list)\n", 188 | "imputer.train_model(training_epochs = 20)" 189 | ], 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "name": "stdout", 194 | "text": [ 195 | "Size index: [7, 8, 7, 6, 5, 2, 2, 16, 14, 41]\n", 196 | "\n", 197 | "Computation graph constructed\n", 198 | "\n", 199 | "Model initialised\n", 200 | "\n", 201 | "Epoch: 0 , loss: 131055.20626587074\n", 202 | "Epoch: 1 , loss: 94882.5758455009\n", 203 | "Epoch: 2 , loss: 90956.90158796997\n", 204 | "Epoch: 3 , loss: 88764.57763543885\n", 205 | "Epoch: 4 , loss: 85847.00143988573\n", 206 | "Epoch: 5 , loss: 80933.15996490518\n", 207 | "Epoch: 6 , loss: 76754.09316700627\n", 208 | "Epoch: 7 , loss: 75646.90740190858\n", 209 | "Epoch: 8 , loss: 74589.6067678469\n", 210 | "Epoch: 9 , loss: 74155.46380383252\n", 211 | "Epoch: 10 , loss: 74159.95000204784\n", 212 | "Epoch: 11 , loss: 74705.84092718402\n", 213 | "Epoch: 12 , loss: 73753.75950004607\n", 214 | "Epoch: 13 , loss: 73959.30564486403\n", 215 | "Epoch: 14 , loss: 73135.93429385444\n", 216 | "Epoch: 15 , loss: 74014.20066695508\n", 217 | "Epoch: 16 , loss: 73246.82324794705\n", 218 | "Epoch: 17 , loss: 74179.63132589798\n", 219 | "Epoch: 18 , loss: 73412.0879309418\n", 220 | "Epoch: 19 , loss: 73584.05688892529\n", 221 | "Training complete. Saving file...\n", 222 | "Model saved in file: tmp/MIDAS\n" 223 | ] 224 | }, 225 | { 226 | "output_type": "execute_result", 227 | "execution_count": 5, 228 | "data": { 229 | "text/plain": [ 230 | "" 231 | ] 232 | }, 233 | "metadata": {} 234 | } 235 | ], 236 | "execution_count": 5, 237 | "metadata": {} 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "source": [ 242 | "Once training is complete, we can generate any number of imputed datasets (M) using the `generate_samples` function (here we set M as 10). 
Users can then either write these imputations to separate .CSV files or work with them directly in Python:" 243 | ], 244 | "metadata": {} 245 | }, 246 | { 247 | "cell_type": "code", 248 | "source": [ 249 | "imputations = imputer.generate_samples(m=10).output_list \n", 250 | "\n", 251 | "# for i in imputations:\n", 252 | "# file_out = \"midas_imp_\" + str(n) + \".csv\"\n", 253 | "# i.to_csv(file_out, index=False)\n", 254 | "# n += 1" 255 | ], 256 | "outputs": [ 257 | { 258 | "output_type": "stream", 259 | "name": "stdout", 260 | "text": [ 261 | "INFO:tensorflow:Restoring parameters from tmp/MIDAS\n", 262 | "Model restored.\n" 263 | ] 264 | } 265 | ], 266 | "execution_count": 6, 267 | "metadata": {} 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "source": [ 272 | "Finally, using the list of generated imputations, we can estimate M separate regression models and combine the parameter and variance estimates (see Rubin 1987) using **MIDASpy's** `combine` function:" 273 | ], 274 | "metadata": {} 275 | }, 276 | { 277 | "cell_type": "code", 278 | "source": [ 279 | "model = md.combine(y_var = \"capital_gain\", \n", 280 | " X_vars = [\"education_num\",\"age\"],\n", 281 | " df_list = imputations)\n", 282 | "\nmodel" 283 | ], 284 | "outputs": [ 285 | { 286 | "output_type": "execute_result", 287 | "execution_count": 7, 288 | "data": { 289 | "text/plain": [ 290 | " term estimate std.error statistic df p.value\n", 291 | "0 const -936.114554 136.800095 -6.842938 75.658615 1.764065e-09\n", 292 | "1 education_num 67.955119 9.202229 7.384637 26.664184 6.556180e-08\n", 293 | "2 age 31.339538 2.383158 13.150427 522.516002 0.000000e+00" 294 | ], 295 | "text/html": [ 296 | "
\n", 297 | "\n", 310 | "\n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | "
termestimatestd.errorstatisticdfp.value
0const-936.114554136.800095-6.84293875.6586151.764065e-09
1education_num67.9551199.2022297.38463726.6641846.556180e-08
2age31.3395382.38315813.150427522.5160020.000000e+00
\n", 352 | "
" 353 | ] 354 | }, 355 | "metadata": {} 356 | } 357 | ], 358 | "execution_count": 7, 359 | "metadata": { 360 | "collapsed": false, 361 | "outputHidden": false, 362 | "inputHidden": false 363 | } 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "source": [ 368 | "### Handling categorical data post-imputation\n", 369 | "\nTo impute missing data in a categorical variable, we one-hot encode this variable and then impute the probability of each class for each observation. For example, the imputed version of the one-hot-encoded `workclass` variable is represented by 8 columns, one for each category in the data:" 370 | ], 371 | "metadata": {} 372 | }, 373 | { 374 | "cell_type": "code", 375 | "source": [ 376 | "workclasses = [x for x in imputations[0].columns if \"workclass\" in x]\n", 377 | "imputations[0][workclasses].head()" 378 | ], 379 | "outputs": [ 380 | { 381 | "output_type": "execute_result", 382 | "execution_count": 8, 383 | "data": { 384 | "text/plain": [ 385 | " workclass_Federal-gov workclass_Local-gov workclass_Never-worked \\\n", 386 | "0 0.0 0.0 0.0 \n", 387 | "1 0.0 0.0 0.0 \n", 388 | "2 0.0 0.0 0.0 \n", 389 | "3 0.0 0.0 0.0 \n", 390 | "4 0.0 0.0 0.0 \n", 391 | "\n", 392 | " workclass_Private workclass_Self-emp-inc workclass_Self-emp-not-inc \\\n", 393 | "0 0.0 0.0 0.0 \n", 394 | "1 0.0 0.0 1.0 \n", 395 | "2 1.0 0.0 0.0 \n", 396 | "3 1.0 0.0 0.0 \n", 397 | "4 1.0 0.0 0.0 \n", 398 | "\n", 399 | " workclass_State-gov workclass_Without-pay \n", 400 | "0 1.0 0.0 \n", 401 | "1 0.0 0.0 \n", 402 | "2 0.0 0.0 \n", 403 | "3 0.0 0.0 \n", 404 | "4 0.0 0.0 " 405 | ], 406 | "text/html": [ 407 | "
\n", 408 | "\n", 421 | "\n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | "
workclass_Federal-govworkclass_Local-govworkclass_Never-workedworkclass_Privateworkclass_Self-emp-incworkclass_Self-emp-not-incworkclass_State-govworkclass_Without-pay
00.00.00.00.00.00.01.00.0
10.00.00.00.00.01.00.00.0
20.00.00.01.00.00.00.00.0
30.00.00.01.00.00.00.00.0
40.00.00.01.00.00.00.00.0
\n", 493 | "
" 494 | ] 495 | }, 496 | "metadata": {} 497 | } 498 | ], 499 | "execution_count": 8, 500 | "metadata": {} 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "source": [ 505 | "If we want to analyze the original categorical version of the variable post-imputation, we can transform these probabilities back into a vector of labels. The simplest approach is to select the category with the highest probability for each observation. Having used `md.conv()` to one-hot encode categorical variables earlier, we can use the associated `cat_cols_list` object to do just that. The following code collapses all one-hot-encoded columns back into single categorical columns:" 506 | ], 507 | "metadata": {} 508 | }, 509 | { 510 | "cell_type": "code", 511 | "source": [ 512 | "flat_cats = [cat for variable in cat_cols_list for cat in variable]\n", 513 | "\n", 514 | "for i in range(len(imputations)):\n", 515 | " tmp_cat = [imputations[i][x].idxmax(axis=1) for x in cat_cols_list]\n", 516 | " cat_df = pd.DataFrame({categorical[i]:tmp_cat[i] for i in range(len(categorical))})\n", 517 | " imputations[i] = pd.concat([imputations[i], cat_df], axis = 1).drop(flat_cats, axis = 1)\n" 518 | ], 519 | "outputs": [], 520 | "execution_count": 9, 521 | "metadata": {} 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "source": [ 526 | "If we now inspect the imputations, we can see that the data are back to their original shape. For example, the `workclass` variable's categories correspond to the one-hot-encoded variables shown earlier:" 527 | ], 528 | "metadata": {} 529 | }, 530 | { 531 | "cell_type": "code", 532 | "source": [ 533 | "print(imputations[0].columns)\n", 534 | "\nimputations[0]['workclass'].head()" 535 | ], 536 | "outputs": [ 537 | { 538 | "output_type": "stream", 539 | "name": "stdout", 540 | "text": [ 541 | "Index(['Unnamed: 0', 'age', 'fnlwgt', 'education_num', 'capital_gain',\n", 542 | " 'capital_loss', 'hours_per_week', 'workclass', 'marital_status',\n", 543 | " 'relationship', 'race', 'class_labels', 'sex', 'education',\n", 544 | " 'occupation', 'native_country'],\n", 545 | " dtype='object')\n" 546 | ] 547 | }, 548 | { 549 | "output_type": "execute_result", 550 | "execution_count": 10, 551 | "data": { 552 | "text/plain": [ 553 | "0 workclass_State-gov\n", 554 | "1 workclass_Self-emp-not-inc\n", 555 | "2 workclass_Private\n", 556 | "3 workclass_Private\n", 557 | "4 workclass_Private\n", 558 | "Name: workclass, dtype: object" 559 | ] 560 | }, 561 | "metadata": {} 562 | } 563 | ], 564 | "execution_count": 10, 565 | "metadata": {} 566 | } 567 | ], 568 | "metadata": { 569 | "kernelspec": { 570 | "name": "python3", 571 | "language": "python", 572 | "display_name": "Python 3" 573 | }, 574 | "language_info": { 575 | "name": "python", 576 | "version": "3.8.3", 577 | "mimetype": "text/x-python", 578 | "codemirror_mode": { 579 | "name": "ipython", 580 | "version": 3 581 | }, 582 | "pygments_lexer": "ipython3", 583 | "nbconvert_exporter": "python", 584 | "file_extension": ".py" 585 | }, 586 | "kernel_info": { 587 | "name": "python3" 588 | }, 589 | "nteract": { 590 | "version": "0.12.3" 591 | }, 592 | "interpreter": { 593 | "hash": "88f65ce5382ce20a2dfcb3047ae19453970fdb3147747ad8e6ead051daaa71e6" 594 | } 595 | }, 596 | "nbformat": 4, 597 | "nbformat_minor": 2 598 | } 599 | -------------------------------------------------------------------------------- /Examples/midaspy_demo_cces.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "id": "609c91af", 6 | "metadata": {}, 7 | "source": [ 8 | "### __MIDASpy demonstration__" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "a1239507", 14 | "metadata": {}, 15 | "source": [ 16 | "MIDASpy's core functionalities are demonstrated here by using it to impute missing responses to the 2018 Cooperative Congressional Election Study (CCES), an electoral survey conducted in the United States whose size and complexity poses computational difficulties for many existing multiple imputation algorithms." 17 | ] 18 | }, 19 | { 20 | "attachments": {}, 21 | "cell_type": "markdown", 22 | "id": "e8752d9c", 23 | "metadata": {}, 24 | "source": [ 25 | "The full CCES has 525 columns and 60,000 rows, the latter corresponding to individual survey respondents. After removing variables that either require extensive preprocessing or are unhelpful for imputation purposes — open-ended string variables, time indices, and ZIP code variables — the dataset contains 349 columns. The vast majority of these variables are categorical and must therefore be one-hot encoded for most multiple imputation software packages — that is, each 1 × 60,000 categorical variable with K unique classes must be expanded into a K × 60,000 matrix of 1s and 0s — increasing their number to 1,914." 26 | ] 27 | }, 28 | { 29 | "attachments": {}, 30 | "cell_type": "markdown", 31 | "id": "48c09dfe", 32 | "metadata": {}, 33 | "source": [ 34 | "_**Loading and preprocessing the data**_" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "abb5b5f7", 40 | "metadata": {}, 41 | "source": [ 42 | "We begin by loading MIDASpy, its dependencies, and additional packages called in the workflow. We then read in the formatted CCES data and sort variables into continuous, binary, and categorical types." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "id": "e5e9ff71", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import numpy as np\n", 53 | "import pandas as pd\n", 54 | "import tensorflow as tf\n", 55 | "from sklearn.preprocessing import MinMaxScaler\n", 56 | "import sys\n", 57 | "import MIDASpy as md" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "id": "7608d31b", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "data_in = pd.read_csv(\"cces_jss_format.csv\")\n", 68 | "cont_vars = [\"citylength_1\",\"numchildren\",\"birthyr\"]\n", 69 | "vals = data_in.nunique()\n", 70 | "cat_vars = list(data_in.columns[(vals.values > 2) & ~(data_in.columns.isin(cont_vars))])\n", 71 | "bin_vars = list(data_in.columns[vals.values == 2])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "d325b60a", 77 | "metadata": {}, 78 | "source": [ 79 | "Next, we apply the `.binary_conv()` function to the list of binary variables (which are not in dummy form), before appending them and the continuous variables to a `constructor_list` object, the basis for our final preprocessed dataset." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "5e23551f", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "data_bin = data_in[bin_vars].apply(md.binary_conv)\n", 90 | "constructor_list = [data_in[cont_vars], data_bin]" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "7b6e0da5", 96 | "metadata": {}, 97 | "source": [ 98 | "To one-hot encode categorical variables, we apply the `.cat_conv()` function to a dataframe containing them. 
We concatenate the resulting matrix to the existing `constructor_list` object." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "id": "1bd9e10c", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "data_cat = data_in[cat_vars]\n", 109 | "data_oh, cat_col_list = md.cat_conv(data_cat)\n", 110 | "constructor_list.append(data_oh)\n", 111 | "data_0 = pd.concat(constructor_list, axis=1)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "6f598191", 117 | "metadata": {}, 118 | "source": [ 119 | "The final preprocessing step, which is nonessential, is to scale all variables between 0 and 1 to aid model convergence. We use scikit-learn’s `MinMaxScaler()` function for this step." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "afcc8148", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "scaler = MinMaxScaler()\n", 130 | "data_scaled = scaler.fit_transform(data_0)\n", 131 | "data_scaled = pd.DataFrame(data_scaled, columns = data_0.columns)\n", 132 | "na_loc = data_scaled.isnull()\n", 133 | "data_scaled[na_loc] = np.nan" 134 | ] 135 | }, 136 | { 137 | "attachments": {}, 138 | "cell_type": "markdown", 139 | "id": "c75e2495", 140 | "metadata": {}, 141 | "source": [ 142 | "_**Imputation**_" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "f9f88f8b", 148 | "metadata": {}, 149 | "source": [ 150 | "Once the data are preprocessed, training a MIDAS network with MIDASpy is straightforward. We declare an instance of the `Midas` class, pass our data to this object (including the sorted variable names) with the `.build_model()` function, and train the network for 10 epochs with the `.train_model()` function. For the purposes of this illustration, we maintain most of MIDASpy’s default hyperparameter settings." 
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 6, 156 | "id": "381c6ffc", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "imputer = md.Midas(layer_structure= [256,256], vae_layer = False, seed= 89, input_drop = 0.75)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "id": "a6d34c74", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "Size index: [3, 178, 6, 8, 6, 3, 3, 6, 6, 4, 3, 59, 3, 3, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 3, 5, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 6, 6, 6, 6, 6, 6, 6, 10, 10, 7, 4, 7, 8, 5, 8, 3, 5, 9, 5, 52, 17, 3, 3, 3, 3, 3, 6, 3, 23, 4, 7, 8, 12, 14, 11, 6, 6, 4, 7, 10, 5, 4, 4, 7, 3, 4, 6, 3, 7, 5, 4, 4, 4, 6, 5, 17, 51, 53, 53, 3, 98, 6, 6, 5, 17, 17, 4, 6, 3, 3, 3, 6, 6, 6, 10, 5, 5, 5, 5, 6, 5, 7, 5, 5, 5, 5, 224, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 51, 53, 53, 5, 51, 14, 5, 6, 5]\n", 174 | "\n", 175 | "Computation graph constructed\n", 176 | "\n" 177 | ] 178 | }, 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "" 183 | ] 184 | }, 185 | "execution_count": 7, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "imputer.build_model(data_scaled, binary_columns = bin_vars, softmax_columns = cat_col_list)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 8, 197 | "id": "7ab7e7ce", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Model initialised\n", 205 | "\n", 206 | "Epoch: 0 , loss: 186.26737846679688\n", 207 | "Epoch: 1 , loss: 169.38942487792968\n", 208 | "Epoch: 2 , loss: 163.48311638997396\n", 209 | "Epoch: 3 , loss: 159.68743997802736\n", 210 | "Epoch: 4 , loss: 157.04094825032553\n", 211 | "Epoch: 5 , loss: 154.82602157389323\n", 212 | "Epoch: 6 , loss: 153.35590602010092\n", 213 | "Epoch: 7 , loss: 152.05749235839843\n", 214 | "Epoch: 8 , loss: 151.08395079345703\n", 215 | "Epoch: 9 , loss: 150.22736969604492\n", 216 | "Training complete. Saving file...\n", 217 | "Model saved in file: tmp/MIDAS\n" 218 | ] 219 | }, 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "" 224 | ] 225 | }, 226 | "execution_count": 8, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "imputer.train_model(training_epochs = 10)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "ea0ffaca", 238 | "metadata": {}, 239 | "source": [ 240 | "Once the model is trained, we draw a list of 10 completed datasets. When datasets are very large, as in this case, we recommend accessing each one separately rather than simultaneously holding all of them in memory. We thus construct a dataset generator using the `.yield_samples()` function." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 9, 246 | "id": "fb0bd2da", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "imputations = imputer.yield_samples(m=10)" 251 | ] 252 | }, 253 | { 254 | "attachments": {}, 255 | "cell_type": "markdown", 256 | "id": "94f4131a", 257 | "metadata": {}, 258 | "source": [ 259 | "_**Analysis of completed datasets**_" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "c5f2c772", 265 | "metadata": {}, 266 | "source": [ 267 | "We analyze the 10 completed datasets using MIDASpy’s inbuilt `combine()` function. 
We estimate a simple linear probability model in which `\"CC18_415a\"`, a respondent’s degree of support for giving the United States Environmental Protection Agency power to regulate carbon dioxide emissions, is regressed on `\"age\" (2018 − \"birthyr\")`, a respondent’s age.\n", 268 | "\n", 269 | "Users can ensure exact reproducibility of analytical results by saving completed datasets to disk. The trained MIDAS model itself is also saved by default to the location specified in the `savepath` argument of `Midas()`.\n", 270 | "\n", 271 | "As we scaled the input dataset prior to imputation with the `MinMaxScaler()` function, for each completed dataset we first invert this transformation via scikit-learn’s `.inverse_transform()` function and also convert predicted probabilities for `CC18_415a` into binary categories using a threshold of 0.5. To save memory, we append only the subset of the data needed for analysis to a list." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 10, 277 | "id": "43f664b7", 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "analysis_dfs = []" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 11, 287 | "id": "c23fdeeb", 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "INFO:tensorflow:Restoring parameters from tmp/MIDAS\n", 295 | "Model restored.\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "for df in imputations:\n", 301 | "    df_unscaled = scaler.inverse_transform(df)\n", 302 | "    df_unscaled = pd.DataFrame(df_unscaled, columns = data_scaled.columns) \n", 303 | "    df['age'] = 2018 - df_unscaled['birthyr']\n", 304 | "    df['CC18_415a'] = np.where(df_unscaled['CC18_415a'] >= 0.5,1,0)\n", 305 | "    analysis_dfs.append(df.loc[:,[\"age\",\"CC18_415a\"]])" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 12, 311 | "id": "393ba27d", 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "model = md.combine(y_var = \"CC18_415a\", X_vars = [\"age\"], df_list = analysis_dfs)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 13, 321 | "id": "605ef806", 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/html": [ 327 | "
\n", 328 | "\n", 341 | "\n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | "
termestimatestd.errorstatisticdfp.value
0const0.9344930.005515169.4597003056.4212380.0
1age-0.0052590.000107-49.1606654565.1255180.0
\n", 374 | "
" 375 | ], 376 | "text/plain": [ 377 | " term estimate std.error statistic df p.value\n", 378 | "0 const 0.934493 0.005515 169.459700 3056.421238 0.0\n", 379 | "1 age -0.005259 0.000107 -49.160665 4565.125518 0.0" 380 | ] 381 | }, 382 | "execution_count": 13, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "model" 389 | ] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3 (ipykernel)", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.10.10" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 5 413 | } 414 | -------------------------------------------------------------------------------- /Examples/tmp/MIDAS.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/tmp/MIDAS.data-00000-of-00001 -------------------------------------------------------------------------------- /Examples/tmp/MIDAS.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/tmp/MIDAS.index -------------------------------------------------------------------------------- /Examples/tmp/MIDAS.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/Examples/tmp/MIDAS.meta -------------------------------------------------------------------------------- /Examples/tmp/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "MIDAS" 2 | all_model_checkpoint_paths: "MIDAS" 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MIDASpy/__init__.py: -------------------------------------------------------------------------------- 1 | from .midas_base import * 2 | -------------------------------------------------------------------------------- /MIDASpy/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /MIDASpy/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /MIDASpy/__pycache__/midas_base.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/midas_base.cpython-312.pyc -------------------------------------------------------------------------------- /MIDASpy/__pycache__/midas_base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy/__pycache__/midas_base.cpython-38.pyc -------------------------------------------------------------------------------- /MIDASpy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/MIDASpy_logo.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # MIDASpy 3 | 4 | [![PyPI Latest Release](https://img.shields.io/pypi/v/midaspy.svg)](https://pypi.org/project/midaspy/) 5 | [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)](https://pypi.org/project/midaspy/) 6 | [![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) 7 | [![CI Linux](https://github.com/edvinskis/MIDASpy/actions/workflows/testlinux.yml/badge.svg)](https://github.com/edvinskis/MIDASpy/actions/workflows/testlinux.yml) 8 | [![CI macOS](https://github.com/edvinskis/MIDASpy/actions/workflows/testmacos.yml/badge.svg)](https://github.com/edvinskis/MIDASpy/actions/workflows/testmacos.yml) 9 | [![CI Windows](https://github.com/edvinskis/MIDASpy/actions/workflows/testwindows.yml/badge.svg)](https://github.com/edvinskis/MIDASpy/actions/workflows/testwindows.yml) 10 | 11 | ## Overview 12 | 13 | **MIDASpy** is a Python package for multiply imputing missing data using 14 | deep learning methods. The **MIDASpy** algorithm offers significant 15 | accuracy and efficiency advantages over other multiple imputation 16 | strategies, particularly when applied to large datasets with complex 17 | features. 
In addition to implementing the algorithm, the package contains 18 | functions for processing data before and after model training, running 19 | imputation model diagnostics, generating multiple completed datasets, 20 | and estimating regression models on these datasets. 21 | 22 | For an implementation in R, see our **rMIDAS** repository 23 | [here](https://github.com/MIDASverse/rMIDAS). 24 | 25 | ## Background and suggested citations 26 | 27 | For more information on MIDAS, the method underlying the software, see: 28 | 29 | Lall, Ranjit, and Thomas Robinson. 2022. "The MIDAS Touch: Accurate and Scalable Missing-Data Imputation with Deep Learning." _Political Analysis_ 30, no. 2: 179-196. doi:10.1017/pan.2020.49. [Published version](https://ranjitlall.github.io/assets/pdf/Lall%20and%20Robinson%202022%20PA.pdf). [Accepted version](http://eprints.lse.ac.uk/108170/1/Lall_Robinson_PA_Forthcoming.pdf). 30 | 31 | Lall, Ranjit, and Thomas Robinson. 2023. "Efficient Multiple Imputation for Diverse Data in Python and R: MIDASpy and rMIDAS." _Journal of Statistical Software_ 107, no. 9: 1-38. doi:10.18637/jss.v107.i09. [Published version](https://ranjitlall.github.io/assets/pdf/Lall%20and%20Robinson%202023%20JSS.pdf). 32 | 33 | ## Installation 34 | 35 | To install via pip, enter the following command into the terminal: 36 | `pip install MIDASpy` 37 | 38 | The latest development version (potentially unstable) can be installed 39 | via the terminal with: 40 | `pip install git+https://github.com/MIDASverse/MIDASpy.git` 41 | 42 | MIDAS requires: 43 | 44 | - Python (>=3.6; <3.11) 45 | - Numpy (>=1.5) 46 | - Pandas (>=0.19) 47 | - TensorFlow (<2.12) 48 | - Matplotlib 49 | - Statmodels 50 | - Scipy 51 | - TensorFlow Addons (<0.20) 52 | 53 | Tensorflow also has a number of requirements, particularly if GPU acceleration is desired. See for details. 54 | 55 | ## Examples 56 | 57 | For a simple demonstration of **MIDASpy**, see our Jupyter Notebook 58 | [examples](https://github.com/MIDASverse/MIDASpy/blob/master/Examples/). 59 | 60 | ## Contributing to MIDASpy 61 | 62 | Interested in contributing to **MIDASpy**? We are looking to hire a research assistant to work part-time (flexibly) to help us build out new features and integrate our software with existing machine learning pipelines. You would be paid the standard research assistant rate at the University of Oxford. To apply, please send your CV (or a summary of relevant skills/experience) to ranjit.lall@sjc.ox.ac.uk. 63 | 64 | 65 | ## Version 1.4.0 (August 2024) 66 | 67 | - Adds support for non-negative output columns, with a `positive_columns` argument 68 | 69 | 70 | ## Version 1.3.1 (October 2023) 71 | 72 | - Minor update to reflect publication of accompanying article in Journal of Statistical Software 73 | - Further updates to make documentation and URLs consistent, including removing unused metadata 74 | 75 | ## Version 1.2.4 (August 2023) 76 | 77 | - Adds support for Python 3.9 and 3.10 78 | - Addresses deprecation warnings and other minor bug fixes 79 | - Resolves dependency issues and includes an updated `setup.py` file 80 | - Adds GitHub Actions workflows that trigger automatic tests on the latest Ubuntu, macOS, and Windows for Python versions 3.7 to 3.10 each time a push or pull request is made to the main branch 81 | - An additional Jupyter Notebook example that demonstrates the core functionalities of **MIDASpy** 82 | 83 | ## Version 1.2.3 (December 2022) 84 | 85 | *v1.2.3 adds support for installation on Apple Silicon hardware (i.e. 
M1 and M2 Macs).* 86 | 87 | ## Version 1.2.2 (July 2022) 88 | 89 | *v1.2.2 makes minor efficiency changes to the codebase. Full details are available in the Release logs.* 90 | 91 | ## Version 1.2.1 (January 2021) 92 | 93 | *v1.2.1 adds new pre-processing functionality and a multiple imputation regression function.* 94 | 95 | Users can now automatically preprocess binary and categorical columns prior to running the MIDAS algorithm using `binary_conv()` and `cat_conv()`. 96 | 97 | The new `combine()` function allows users to run regression analysis across the complete data, following Rubin’s combination rules. 98 | 99 | ## Previous versions 100 | 101 | *Version 1.1.1 (October 2020)* 102 | 103 | Key changes: 104 | 105 | - Update adds **full Tensorflow 2.X support**: 106 | 107 | - Users can now run the MIDAS algorithm in TensorFlow 2.X (TF1 support 108 | retained) 109 | 110 | - Tidier handling of random seed setting across both TensorFlow and 111 | NumPy 112 | 113 | - Fixes a minor dependency bug 114 | 115 | - Other minor bug fixes 116 | 117 | *Version 1.0.2 (September 2020)* 118 | 119 | Key changes: 120 | 121 | - Minor, mainly cosmetic, changes to the underlying source code. 122 | - Renamed ‘categorical\_columns’ argument in build\_model() to 123 | ‘binary\_columns’ to avoid confusion 124 | - Added plotting arguments to overimputation() method to suppress 125 | intermediary overimputation plots (plot\_main) and all plots 126 | (skip\_plot). 127 | - Changed overimputation() plot titles, labels and legends 128 | - Added tensorflow 2.0 version check on import 129 | - Fixed seed-setting bug in earlier versions 130 | 131 | *Alpha 0.2:* 132 | 133 | Variational autoencoder enabled. More flexibility in model 134 | specification, although defaulting to a simple mirrored system. Deeper 135 | analysis tools within .overimpute() for checking fit on continuous 136 | values. Constructor code deconflicted. Individual output specification 137 | enabled for very large datasets. 138 | 139 | Key added features: 140 | 141 | - Variational autoencoder capacity added, including encoding to and 142 | sampling from latent space 143 | 144 | Planned features: 145 | 146 | - Time dependence handling through recurrent cells 147 | - Improving the pipeline methods for very large datasets 148 | - Tensorboard integration 149 | - Dropout scaling 150 | - A modified constructor that can generate embeddings for better 151 | interpolation of features 152 | - R support 153 | 154 | Wish list: 155 | 156 | - Smoothing for time series (LOESS?) 157 | - Informative priors? 158 | 159 | *Alpha 0.1:* 160 | 161 | - Basic functionality feature-complete. 
162 | - Support for mixed categorical and continuous data types 163 | - An “additional data” pipeline, allowing data that may be relevant to 164 | the imputation to be included (without being included in error 165 | generating statistics) 166 | - Simplified calibration for model complexity through the 167 | “overimputation” function, including visualization of 168 | reconstructed features 169 | - Basic large dataset functionality 170 | -------------------------------------------------------------------------------- /build/lib/MIDASpy/__init__.py: -------------------------------------------------------------------------------- 1 | from .midas_base import * 2 | -------------------------------------------------------------------------------- /build/lib/MIDASpy/midas_base.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # 3 | # 888b d888 8888888 8888888b. d8888 .d8888b. 4 | # 8888b d8888 888 888 "Y88b d88888 d88P Y88b 5 | # 88888b.d88888 888 888 888 d88P888 Y88b. 6 | # 888Y88888P888 888 888 888 d88P 888 "Y888b. 7 | # 888 Y888P 888 888 888 888 d88P 888 "Y88b. 8 | # 888 Y8P 888 888 888 888 d88P 888 "888 9 | # 888 " 888 888 888 .d88P d8888888888 Y88b d88P 10 | # 888 888 8888888 8888888P" d88P 888 "Y8888P" 11 | # 12 | # --- Multiple Imputation with Denoising Autoencoders 13 | # Copyright 2020 Ranjit Lall, Alex Stenlake, and Thomas Robinson. All Rights Reserved. 14 | # 15 | # Licensed under the Apache License, Version 2.0 (the "License"); 16 | # you may not use this file except in compliance with the License. 17 | # You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, software 22 | # distributed under the License is distributed on an "AS IS" BASIS, 23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | # See the License for the specific language governing permissions and 25 | # limitations under the License. 26 | # ============================================================================== 27 | 28 | import matplotlib.pyplot as plt 29 | import numpy as np 30 | import pandas as pd 31 | import tensorflow as tf 32 | import os 33 | import statsmodels.api as sm 34 | from scipy import stats 35 | 36 | if tf.__version__[0] == '2': 37 | import tensorflow_addons as tfa 38 | 39 | from sklearn.metrics import mean_squared_error as mse 40 | from typing import List, Optional 41 | import random 42 | 43 | 44 | class Midas(object): 45 | """ 46 | MIDASpy is a Python package for multiply imputing missing data using deep learning methods. The MIDASpy algorithm 47 | offers significant accuracy and efficiency advantages over other multiple imputation strategies, particularly when 48 | applied to large datasets with complex features. In addition to implementing the algorithm, the package contains 49 | functions for processing data before and after model training, running imputation model diagnostics, generating 50 | multiple completed datasets, and estimating regression models on these datasets. 
51 | """ 52 | def __init__(self, 53 | layer_structure: Optional[List[int]] = None, 54 | learn_rate: float = 1e-4, 55 | input_drop: float = 0.8, 56 | train_batch: int = 16, 57 | savepath: str = 'tmp/MIDAS', 58 | seed: (int, type(None)) = None, 59 | output_layers: str = 'reversed', 60 | loss_scale: int = 1, 61 | init_scale: int = 1, 62 | vae_layer: bool = False, 63 | individual_outputs: bool = False, 64 | manual_outputs: bool = False, 65 | output_structure: Optional[List[int]] = None, 66 | latent_space_size: int = 4, 67 | cont_adj: float = 1.0, 68 | binary_adj: float = 1.0, 69 | softmax_adj: float = 1.0, 70 | dropout_level: float = 0.5, 71 | weight_decay: str = 'default', 72 | vae_alpha: float = 1.0, 73 | act=tf.nn.elu, 74 | vae_sample_var: float = 1.0, 75 | noise_type: str = 'bernoulli', 76 | kld_min: float = 0.01, 77 | ): 78 | """ 79 | Initialiser. Called separately to 'build_model' to allow for out-of-memory 80 | datasets. All key hyperparameters are entered at this stage, as the model 81 | construction methods only deal with the dataset. 82 | 83 | Args: 84 | layer_structure: List of integers. The number of nodes in each layer of the 85 | network (default = [256, 256, 256], denoting a three-layer network with 256 86 | nodes per layer). Larger networks can learn more complex data structures but 87 | require longer training and are more prone to overfitting. 88 | 89 | learn_rate: Float. The learning rate (gamma; default = 0.0001), which 90 | controls the size of the weight adjustment in each training epoch. In general, 91 | higher values reduce training time at the expense of less accurate results. 92 | 93 | input_drop: Float between 0 and 1. The probability of corruption for input 94 | columns in training mini-batches (default = 0.8). Higher values increase 95 | training time but reduce the risk of overfitting. In our experience, values 96 | between 0.7 and 0.95 deliver the best performance. 97 | 98 | train_batch: Integer. The number of observations in training mini-batches 99 | (default = 16). Common choices are 8, 16, 32, 64, and 128; powers of 2 tend to 100 | enhance memory efficiency. In general, smaller sizes lead to faster convergence 101 | at the cost of greater noise and thus less accurate estimates of the error 102 | gradient. Where memory management is a concern, they should be favored. 103 | 104 | savepath: String. The location to which the trained model will be saved. 105 | 106 | seed: Integer. The value to which Python's pseudo-random number 107 | generator is initialized. This enables users to ensure that data shuffling, 108 | weight and bias initialization, and missingness indicator vectors are 109 | reproducible. 110 | 111 | loss_scale: Float. A constant by which the RMSE loss functions are multiplied 112 | (default = 1). This hyperparameter performs a similar function to the learning 113 | rate. If loss during training is very large, increasing its value can help to 114 | prevent overtraining. 115 | 116 | init_scale: Float. The numerator of the variance component of Xavier Initialisation 117 | equation (default = 1). In very deep networks, higher values may help to prevent 118 | extreme gradients (though this problem is less common with ELU activation functions). 119 | 120 | softmax_adj: Float. A constant by which the cross-entropy loss functions are 121 | multiplied (default = 1). This hyperparameter is the equivalent of loss_scale 122 | for categorical variables. 
If cross-entropy loss falls at a consistently faster 123 | rate than RMSE during training, a lower value may help to redress this imbalance. 124 | 125 | vae_layer: Boolean. Specifies whether to include a variational autoencoder layer in 126 | the network (default = False), one of the key diagnostic tools included in midas. 127 | If set to true, variational autoencoder hyperparameters must be specified via a number 128 | of additional arguments. 129 | 130 | latent_space_size: Integer. The number of normal dimensions used to parameterize the 131 | latent space when vae_layer = True. 132 | 133 | vae_sample_var: Float. The sampling variance of the normal distributions used to 134 | parameterize the latent space when vae_layer = True. 135 | 136 | vae_alpha: Float. The strength of the prior imposed on the Kullback-Leibler divergence term 137 | in the variational autoencoder loss functions. 138 | 139 | kld_min: Float. The minimum value of the Kullback-Leibler divergence term in the variational 140 | autoencoder loss functions. 141 | 142 | Returns: 143 | Self 144 | 145 | """ 146 | # Sanity Check layer_structure: 147 | if not layer_structure: 148 | layer_structure = [256, 256, 256] 149 | if not isinstance(layer_structure, list): 150 | raise TypeError("The layer structure must be specified within a list type.") 151 | if not all(isinstance(v, int) for v in layer_structure): 152 | raise ValueError("The elements of the layer_structure must all be specified as integer types.") 153 | 154 | # Sanity Check output_layers: 155 | if not isinstance(output_layers, (str, list)): 156 | raise TypeError("The 'output_layers' argument must be a string or a list type.") 157 | if isinstance(output_layers, str): 158 | if not output_layers == "reversed": 159 | raise ValueError("The only string argument accepted for output_layers is 'reversed'.") 160 | self.output_layers = layer_structure.copy() 161 | self.output_layers.reverse() 162 | if isinstance(output_layers, list): 163 | self.output_layers = output_layers 164 | 165 | # Sanity Check weight_decay: 166 | if not isinstance(weight_decay, (str, float)): 167 | raise TypeError("The 'weight_decay' argument must be a string or float type.") 168 | if isinstance(weight_decay, str): 169 | if not weight_decay == 'default': 170 | raise ValueError("The 'weight_decay' argument must be 'default' if a string.") 171 | self.weight_decay = 'default' 172 | if isinstance(weight_decay, float): 173 | self.weight_decay = weight_decay 174 | 175 | # Sanity Check output_structure: 176 | if output_structure is None: 177 | output_structure = [16, 16, 32] 178 | if isinstance(output_structure, int): 179 | self.output_structure = [output_structure] * 3 180 | elif (individual_outputs is True) | (len(output_structure) == 3): 181 | self.output_structure = output_structure 182 | else: 183 | raise TypeError("The output transform assignment must take the form of an integer, a list of three " 184 | "elements (cont, bin, cat), or individual values must be specified.") 185 | 186 | if seed is not None: 187 | os.environ['PYTHONHASHSEED'] = str(seed) 188 | os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 189 | os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 190 | tf.compat.v1.set_random_seed(seed) 191 | 192 | # Sanity Check savepath: 193 | if not isinstance(savepath, str): 194 | raise TypeError("The 'savepath' argument must be a string type.") 195 | if os.path.exists(savepath) and not os.path.isdir(savepath): 196 | raise FileExistsError("The passed argument was a file, not a directory.") 197 | if not 
os.path.exists(savepath): 198 | os.makedirs(savepath) 199 | 200 | self.layer_structure = layer_structure 201 | self.learn_rate = learn_rate 202 | self.input_drop = input_drop 203 | self.model_built = False 204 | self.savepath = savepath 205 | self.model = None 206 | self.additional_data = None 207 | self.train_batch = train_batch 208 | self.seed = seed 209 | self.input_is_pipeline = False 210 | self.input_pipeline = None 211 | self.vae_layer = vae_layer 212 | self.loss_scale = loss_scale 213 | self.init_scale = init_scale 214 | self.individual_outputs = individual_outputs 215 | self.manual_outputs = manual_outputs 216 | self.vae_sample_var = vae_sample_var 217 | self.latent_space_size = latent_space_size 218 | self.dropout_level = dropout_level 219 | self.prior_strength = vae_alpha 220 | self.kld_min = kld_min 221 | self.seed = seed 222 | self.cont_adj = cont_adj 223 | self.binary_adj = binary_adj 224 | self.softmax_adj = softmax_adj 225 | self.act = act 226 | self.noise_type = noise_type 227 | 228 | def _batch_iter(self, 229 | train_data, 230 | na_mask, 231 | b_size=16, 232 | rng=np.random): 233 | """ 234 | Function for handling the batch feeds for training loops 235 | """ 236 | indices = np.arange(train_data.shape[0]) 237 | rng.shuffle(indices) 238 | 239 | for start_idx in range(0, train_data.shape[0] - b_size + 1, b_size): 240 | excerpt = indices[start_idx:start_idx + b_size] 241 | if self.additional_data is None: 242 | yield train_data[excerpt], na_mask[excerpt] 243 | else: 244 | yield train_data[excerpt], na_mask[excerpt], self.additional_data.values[excerpt] 245 | 246 | def _batch_iter_output(self, 247 | train_data, 248 | b_size=256): 249 | """ 250 | Identical to _batch_iter(), although designed for a single datasource 251 | """ 252 | 253 | indices = np.arange(train_data.shape[0]) 254 | for start_idx in range(0, train_data.shape[0], b_size): 255 | excerpt = indices[start_idx:start_idx + b_size] 256 | if self.additional_data is None: 257 | yield train_data[excerpt] 258 | else: 259 | yield train_data[excerpt], self.additional_data.values[excerpt] 260 | 261 | @staticmethod 262 | def _batch_iter_zsample(data, 263 | b_size: int = 256): 264 | """ 265 | Identical to _batch_iter(), although designed for sampling from latent 266 | """ 267 | indices = np.arange(data.shape[0]) 268 | for start_idx in range(0, data.shape[0], b_size): 269 | excerpt = indices[start_idx:start_idx + b_size] 270 | yield data[excerpt] 271 | 272 | def _build_layer(self, 273 | X, 274 | weight_matrix, 275 | bias_vec, 276 | dropout_rate=0.5, 277 | output_layer=False): 278 | """ 279 | Constructs layers for the build function 280 | """ 281 | X_tx = tf.matmul(tf.compat.v1.nn.dropout(X, 282 | rate=(1 - dropout_rate)), 283 | weight_matrix) + bias_vec 284 | if output_layer: 285 | return X_tx 286 | else: 287 | return self.act(X_tx) 288 | 289 | @staticmethod 290 | def _build_variables(weights, 291 | biases, 292 | num_in, 293 | num_out, 294 | scale=1): 295 | """ 296 | Custom initialiser for a weights, using a variation on Xavier initialisation 297 | with smaller starting weights. 
Allows for faster convergence on low learn 298 | rates, useful in the presence of multiple loss functions 299 | """ 300 | weights.append(tf.Variable(tf.random.truncated_normal([num_in, num_out], 301 | mean=0, 302 | stddev=scale / np.sqrt(num_in + num_out)))) 303 | biases.append(tf.Variable(tf.zeros([num_out]))) # Bias can be zero 304 | return weights, biases 305 | 306 | @staticmethod 307 | def _sort_cols(data, 308 | subset): 309 | """ 310 | This function is used to sequence the columns of the dataset, so as to be in 311 | the order [Continuous data], [Binary data], [Categorical data]. It simply 312 | rearranges a column, done functionally to minimise memory overhead 313 | """ 314 | if not isinstance(subset, list): 315 | subset = list(subset) 316 | data_1 = data[subset] 317 | data_0 = data.drop(subset, axis=1) 318 | chunk = data_1.shape[1] 319 | return pd.concat([data_0, data_1], axis=1), chunk 320 | 321 | def build_model(self, 322 | imputation_target, 323 | binary_columns=None, 324 | softmax_columns=None, 325 | unsorted=True, 326 | additional_data=None, 327 | verbose=True, 328 | ): 329 | """ 330 | This method is called to construct the neural network that is the heart of 331 | MIDAS. This includes the assignment of loss functions to the appropriate 332 | data types. 333 | 334 | THIS FUNCTION MUST BE CALLED BEFORE ANY TRAINING OR IMPUTATION OCCURS. Failing 335 | to do so will simply raise an error. 336 | 337 | The categorical columns should be a list of column names. Softmax columns 338 | should be a list of lists of column names. This will allow the model to 339 | dynamically assign cost functions to the correct variables. If, however, 340 | the data comes pre-sorted, arranged can be set to "true", in which case 341 | the arguments can be passed in as integers of size, ie. shape[1] attributes 342 | for each of the relevant categories. 343 | 344 | In other words, if you're experienced at using MIDAS and understand how its 345 | indexing works, pre-sort your data and pass in the integers so specifying 346 | reindexing values doesn't become too onerous. 347 | 348 | Alternatively, list(df.columns.values) will output a list of column names, 349 | which can be easily implemented in the 'for' loop which constructs your dummy 350 | variables. 351 | 352 | Args: 353 | imputation_target: DataFrame. The name of the incomplete input dataset. 354 | Upon being read in, the dataset will be appropriately formatted and stored 355 | for training. 356 | 357 | binary_columns: List of names. A list of all binary variables in the input 358 | dataset. 359 | 360 | softmax_columns: List of lists. The outer list should include all non-binary 361 | categorical variables in the input dataset. Each inner list should contain 362 | the mutually exclusive set of possible classes for each of these variables. 363 | 364 | unsorted: Boolean. Specifies whether the input dataset has been pre-ordered 365 | in terms of variable type (default = True, denoting no sorting). If 366 | set to False, binary_columns and softmax_columns should be a list of integers 367 | denoting shape attributes for each category. 368 | 369 | additional_data: DataFrame. Data that should be included in the imputation 370 | model but are not required for later analyses. Such data will not be 371 | formatted, rearranged, or included in the loss functions, reducing training 372 | time. 373 | 374 | verbose: Boolean. Specifies whether to print messages to the terminal 375 | (default = True). 
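        Example of constructing these column arguments (an added sketch: the toy
        DataFrame and column names are hypothetical, and the package's cat_conv()
        helper can produce an equivalent structure automatically):

            import numpy as np
            import pandas as pd

            raw = pd.DataFrame({"age": [25.0, 40.0, np.nan],
                                "sex": ["m", "f", "m"],
                                "job": ["a", "b", np.nan]})

            softmax_columns = []
            parts = [raw[["age"]]]
            for col in ["sex", "job"]:
                dummies = pd.get_dummies(raw[col], prefix=col)
                dummies.loc[raw[col].isnull(), :] = np.nan     # keep missingness visible
                softmax_columns.append(list(dummies.columns))  # one inner list per variable
                parts.append(dummies)
            data_in = pd.concat(parts, axis=1)
            # data_in and softmax_columns can now be passed to this method.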
376 | 377 | Returns: 378 | Self 379 | 380 | """ 381 | if not isinstance(imputation_target, pd.DataFrame): 382 | raise TypeError("Input data must be in a DataFrame") 383 | if imputation_target.isnull().sum().sum() == 0: 384 | raise ValueError("Imputation target contains no missing values. Please ensure " 385 | "missing values are encoded as type np.nan") 386 | self.original_columns = imputation_target.columns 387 | cont_exists = False 388 | cat_exists = False 389 | in_size = imputation_target.shape[1] 390 | if additional_data is not None: 391 | add_size = additional_data.shape[1] 392 | else: 393 | add_size = 0 394 | 395 | # Establishing indices for cost function 396 | size_index = [] 397 | if binary_columns is not None: 398 | if unsorted: 399 | imputation_target, chunk = self._sort_cols(imputation_target, 400 | binary_columns) 401 | size_index.append(chunk) 402 | else: 403 | size_index.append(binary_columns) 404 | cat_exists = True 405 | if softmax_columns is not None: 406 | if unsorted: 407 | for subset in softmax_columns: 408 | imputation_target, chunk = self._sort_cols(imputation_target, 409 | subset) 410 | size_index.append(chunk) 411 | else: 412 | for digit in softmax_columns: 413 | size_index.append(digit) 414 | if sum(size_index) < in_size: 415 | chunk = in_size - sum(size_index) 416 | size_index.insert(0, chunk) 417 | cont_exists = True 418 | if not sum(size_index) == in_size: 419 | raise ValueError("Sorting columns has failed") 420 | if verbose: 421 | print("Size index:", size_index) 422 | 423 | # Commit some variables to the instance of the class 424 | self.size_index = size_index 425 | if not self.input_is_pipeline: 426 | self.na_matrix = imputation_target.notnull().astype(bool) 427 | self.imputation_target = imputation_target.fillna(0) 428 | if additional_data is not None: 429 | self.additional_data = additional_data.fillna(0) 430 | 431 | # Build graph 432 | tf.compat.v1.reset_default_graph() 433 | self.graph = tf.Graph() 434 | with self.graph.as_default(): 435 | if self.seed is not None: 436 | # np.random.seed(self.seed) 437 | tf.compat.v1.set_random_seed(self.seed) 438 | 439 | # Placeholders 440 | self.X = tf.compat.v1.placeholder(tf.float32, [None, in_size]) 441 | self.na_idx = tf.compat.v1.placeholder(tf.bool, [None, in_size]) 442 | if additional_data is not None: 443 | self.X_add = tf.compat.v1.placeholder(tf.float32, [None, add_size]) 444 | if self.vae_layer: 445 | self.latent_inputs = tf.compat.v1.placeholder(tf.float32, [None, self.latent_space_size]) 446 | 447 | # Build list for determining input and output structures 448 | struc_list = self.layer_structure.copy() 449 | struc_list.insert(0, in_size + add_size) 450 | outputs_struc = [] 451 | for n in range(len(size_index)): 452 | if n == 0: 453 | if cont_exists: 454 | outputs_struc += ["cont"] * size_index[n] 455 | elif cat_exists: 456 | outputs_struc += ["bin"] * size_index[n] 457 | 458 | else: 459 | outputs_struc += [size_index[n]] 460 | 461 | elif n == 1: 462 | if cont_exists and cat_exists: 463 | outputs_struc += ["bin"] * size_index[n] 464 | 465 | else: 466 | outputs_struc += [size_index[n]] 467 | else: 468 | outputs_struc += [size_index[n]] 469 | 470 | if self.manual_outputs is True: 471 | output_layer_size = np.sum(self.output_structure) 472 | output_layer_structure = self.output_structure 473 | else: 474 | output_layer_structure = [] 475 | for item in outputs_struc: 476 | if item == "cont": 477 | output_layer_structure.append(self.output_structure[0]) 478 | if item == "bin": 479 | 
output_layer_structure.append(self.output_structure[1]) 480 | if type(item) == int: 481 | output_layer_structure.append(self.output_structure[2]) 482 | output_layer_size = np.sum(output_layer_structure) 483 | 484 | # Instantiate and initialise variables 485 | _w = [] 486 | _b = [] 487 | _zw = [] 488 | _zb = [] 489 | _ow = [] 490 | _ob = [] 491 | 492 | # Input, denoising 493 | for n in range(len(struc_list) - 1): 494 | _w, _b = self._build_variables(weights=_w, biases=_b, 495 | num_in=struc_list[n], 496 | num_out=struc_list[n + 1], 497 | scale=self.init_scale) 498 | if self.vae_layer: 499 | mapped_dist = tf.compat.v1.distributions.Normal(tf.constant(0.), 500 | tf.constant(self.vae_sample_var)) 501 | # mapped_dist = tf.distributions.StudentT(tf.constant(3.0), 502 | # tf.constant(0.0), 503 | # tf.constant(1.0)) 504 | # Latent state, variance 505 | _zw, _wb = self._build_variables(weights=_zw, biases=_zb, 506 | num_in=struc_list[-1], 507 | num_out=self.latent_space_size * 2, 508 | scale=self.init_scale) 509 | _zw, _wb = self._build_variables(weights=_zw, biases=_zb, 510 | num_in=self.latent_space_size, 511 | num_out=self.output_layers[0], 512 | scale=self.init_scale) 513 | 514 | t_l = len(self.output_layers) 515 | # Output, specialisation 516 | assert len(output_layer_structure) == len(outputs_struc) 517 | output_split = [] 518 | if self.individual_outputs: 519 | self.output_layers.append(output_layer_size) 520 | for n in range(t_l): 521 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 522 | num_in=self.output_layers[n], 523 | num_out=self.output_layers[n + 1], 524 | scale=self.init_scale) 525 | for n in range(len(outputs_struc)): 526 | if type(outputs_struc[n]) == str: 527 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 528 | num_in=output_layer_structure[n], 529 | num_out=1, 530 | scale=self.init_scale) 531 | output_split.append(1) 532 | elif type(outputs_struc[n]) == int: 533 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 534 | num_in=output_layer_structure[n], 535 | num_out=outputs_struc[n], 536 | scale=self.init_scale) 537 | output_split.append(outputs_struc[n]) 538 | else: 539 | self.output_layers.append(in_size) 540 | for n in range(t_l): 541 | _ow, _ob = self._build_variables(weights=_ow, biases=_ob, 542 | num_in=self.output_layers[n], 543 | num_out=self.output_layers[n + 1]) 544 | for n in range(len(outputs_struc)): 545 | if type(outputs_struc[n]) == str: 546 | output_split.append(1) 547 | elif type(outputs_struc[n]) == int: 548 | output_split.append(outputs_struc[n]) 549 | 550 | # Build the neural network. 
Each layer is determined by the struc list 551 | def denoise(X): 552 | # Input tx 553 | for n in range(len(struc_list) - 1): 554 | if n == 0: 555 | if self.noise_type == 'bernoulli': 556 | X = self._build_layer(X, _w[n], _b[n], 557 | dropout_rate=self.input_drop) 558 | elif self.noise_type == 'gaussian': 559 | X = X + tf.compat.v1.distributions.Normal(loc=tf.constant(0.), 560 | scale=tf.constant(self.input_drop)).sample( 561 | sample_shape=tf.shape(input=X)) 562 | X = self._build_layer(X, _w[n], _b[n], 563 | dropout_rate=self.input_drop) 564 | else: 565 | X = self._build_layer(X, _w[n], _b[n], 566 | dropout_rate=self.dropout_level) 567 | return X 568 | 569 | if self.vae_layer: 570 | def to_z(X): 571 | # Latent tx 572 | X = self._build_layer(X, _zw[0], _zb[0], dropout_rate=self.dropout_level, 573 | output_layer=True) 574 | x_mu, x_log_sigma = tf.split(X, [self.latent_space_size] * 2, axis=1) 575 | return x_mu, x_log_sigma 576 | 577 | def from_z(z): 578 | # Joint transform 579 | X = self._build_layer(z, _zw[1], _zb[1], dropout_rate=1) 580 | return X 581 | 582 | def vae(X, output=False): 583 | x_mu, x_log_sigma = to_z(X) 584 | if output: 585 | reparam_z = mapped_dist.sample(sample_shape=tf.shape(input=x_mu)) 586 | # reparam_z = tf.random_normal(tf.shape(x_mu)) 587 | else: 588 | reparam_z = tf.random.normal(tf.shape(input=x_mu)) 589 | z = x_mu + reparam_z * tf.exp(x_log_sigma) 590 | kld = tf.maximum( 591 | tf.reduce_mean(input_tensor=1 + 2 * x_log_sigma * x_mu ** 2 - tf.exp(2 - x_log_sigma), 592 | axis=1) * self.prior_strength * - 0.5, 593 | self.kld_min) 594 | X = from_z(z) 595 | return X, kld 596 | 597 | if self.individual_outputs: 598 | def decode(X): 599 | for n in range(t_l): 600 | X = self._build_layer(X, _ow[n], _ob[n], dropout_rate=self.dropout_level) 601 | # Output tx 602 | base_splits = tf.split(X, output_layer_structure, axis=1) 603 | decombined = [] 604 | for n in range(len(outputs_struc)): 605 | decombined.append(self._build_layer(base_splits[n], _ow[n + t_l], _ob[n + t_l], 606 | dropout_rate=self.dropout_level, 607 | output_layer=True)) 608 | return decombined 609 | 610 | else: 611 | def decode(X): 612 | for n in range(t_l): 613 | if n == t_l - 1: 614 | X = self._build_layer(X, _ow[n], _ob[n], 615 | dropout_rate=self.dropout_level, 616 | output_layer=True) 617 | else: 618 | X = self._build_layer(X, _ow[n], _ob[n], 619 | dropout_rate=self.dropout_level) 620 | decombined = tf.split(X, output_split, axis=1) 621 | return decombined 622 | 623 | if self.vae_layer: 624 | def decode_z(z): 625 | X = from_z(z) 626 | X = decode(X) 627 | return X 628 | 629 | # Determine which imputation function is to be used. This is constructed to 630 | # take advantage of additional data provided. 
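            # --- Added explanatory sketch (editor's note, not original code) ---
            # The vae() helper defined above applies the standard reparameterisation
            # trick: a latent draw is formed as z = mu + eps * exp(log_sigma), with
            # eps ~ N(0, 1), so gradients can flow through mu and log_sigma. A purely
            # illustrative NumPy analogue, independent of the graph built here:
            #
            #     import numpy as np
            #     rng = np.random.default_rng(0)
            #     x_mu = np.zeros(4)                       # stand-in for the encoded mean
            #     x_log_sigma = np.zeros(4)                # stand-in for the encoded log-sd
            #     eps = rng.standard_normal(4)
            #     z = x_mu + eps * np.exp(x_log_sigma)     # mirrors the line in vae() above
            #
            # The block below wires denoise()/vae()/decode() together, using any
            # additional data that was provided.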
631 | if additional_data is not None: 632 | encoded = denoise(tf.concat([self.X, self.X_add], axis=1)) 633 | else: 634 | encoded = denoise(self.X) 635 | 636 | if self.vae_layer: 637 | perturb, kld = vae(encoded) 638 | perturb_out, _ = vae(encoded, True) 639 | pred_split = decode(perturb) 640 | out_split = decode(perturb_out) 641 | else: 642 | pred_split = decode(encoded) 643 | 644 | # Output functions 645 | cost_list = [] 646 | self.output_types = [] 647 | 648 | # Build L2 loss and KL-Divergence 649 | if self.weight_decay == 'default': 650 | lmbda = 1 / self.imputation_target.shape[0] 651 | else: 652 | lmbda = self.weight_decay 653 | # if self.vae_layer: 654 | # l2_penalty = tf.multiply(tf.reduce_mean( 655 | # [tf.nn.l2_loss(w) for w in _w]+\ 656 | # [tf.nn.l2_loss(w) for w in _zw]+\ 657 | # [tf.nn.l2_loss(w) for w in _ow] 658 | # ), lmbda) 659 | # else: 660 | # l2_penalty = tf.multiply(tf.reduce_mean( 661 | # [tf.nn.l2_loss(w) for w in _w]+\ 662 | # [tf.nn.l2_loss(w) for w in _ow] 663 | # ), lmbda) 664 | 665 | # Assign cost and loss functions 666 | na_split = tf.split(self.na_idx, output_split, axis=1) 667 | true_split = tf.split(self.X, output_split, axis=1) 668 | for n in range(len(outputs_struc)): 669 | na_adj = tf.cast(tf.math.count_nonzero(na_split[n]), tf.float32) \ 670 | / tf.cast(tf.size(input=na_split[n]), tf.float32) 671 | if outputs_struc[n] == 'cont': 672 | if 'rmse' not in self.output_types: 673 | self.output_types.append('rmse') 674 | cost_list.append(tf.sqrt( 675 | tf.compat.v1.losses.mean_squared_error(tf.boolean_mask(tensor=true_split[n], mask=na_split[n]), 676 | tf.boolean_mask(tensor=pred_split[n], mask=na_split[n]) 677 | )) * self.cont_adj * na_adj) 678 | elif outputs_struc[n] == 'bin': 679 | if 'bacc' not in self.output_types: 680 | self.output_types.append('bacc') 681 | cost_list.append( 682 | tf.compat.v1.losses.sigmoid_cross_entropy( 683 | tf.boolean_mask(tensor=true_split[n], mask=na_split[n]), 684 | tf.boolean_mask(tensor=pred_split[n], mask=na_split[n])) 685 | * self.binary_adj * na_adj) 686 | elif type(outputs_struc[n]) == int: 687 | self.output_types.append('sacc') 688 | cost_list.append(tf.compat.v1.losses.softmax_cross_entropy( 689 | tf.reshape(tf.boolean_mask(tensor=true_split[n], mask=na_split[n]), [-1, outputs_struc[n]]), 690 | tf.reshape(tf.boolean_mask(tensor=pred_split[n], mask=na_split[n]), [-1, outputs_struc[n]])) 691 | * self.softmax_adj * na_adj) 692 | 693 | def output_function(out_split): 694 | output_list = [] 695 | # Break outputs into their parts 696 | for n in range(len(outputs_struc)): 697 | if outputs_struc[n] == 'cont': 698 | output_list.append(out_split[n]) 699 | elif outputs_struc[n] == 'bin': 700 | output_list.append(tf.nn.sigmoid(out_split[n])) 701 | elif type(outputs_struc[n]) == int: 702 | output_list.append(tf.nn.softmax(out_split[n])) 703 | return tf.concat(output_list, axis=1) 704 | 705 | self.outputs_struc = outputs_struc 706 | if self.vae_layer: 707 | self.output_op = output_function(out_split) 708 | self.joint_loss = tf.reduce_mean( 709 | input_tensor=tf.reduce_sum(input_tensor=cost_list) + kld) # + l2_penalty) 710 | self.encode_to_z = to_z(encoded) 711 | self.gen_from_z_sample = output_function(decode_z(mapped_dist.sample( 712 | sample_shape=tf.shape(input=self.latent_inputs)))) 713 | self.gen_from_z_inputs = output_function(decode_z(self.latent_inputs)) 714 | 715 | else: 716 | self.output_op = output_function(pred_split) 717 | self.joint_loss = tf.reduce_mean(input_tensor=tf.reduce_sum(input_tensor=cost_list)) # + l2_penalty) 
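            # --- Added explanatory sketch (editor's note, not original code) ---
            # Each term in cost_list above is evaluated only on cells that are observed
            # in the input: na_split carries the missingness indicators, tf.boolean_mask
            # drops the unobserved entries before the RMSE / cross-entropy is taken, and
            # na_adj rescales the term by the observed fraction. A purely illustrative
            # NumPy analogue of the continuous case:
            #
            #     import numpy as np
            #     true = np.array([1.0, 2.0, 3.0, 4.0])
            #     pred = np.array([1.1, 1.8, 0.0, 0.0])
            #     observed = np.array([True, True, False, False])
            #     rmse = np.sqrt(np.mean((true[observed] - pred[observed]) ** 2))
            #     loss_term = rmse * observed.mean()       # cont_adj assumed to be 1.0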
718 | 719 | if tf.__version__[0] == '2': 720 | optim = tfa.optimizers.AdamW(lmbda, self.learn_rate) 721 | self.train_step = optim.get_updates(loss=self.joint_loss, params=tf.compat.v1.trainable_variables()) 722 | else: 723 | optim = tf.contrib.opt.AdamWOptimizer(lmbda, self.learn_rate) 724 | self.train_step = optim.minimize(loss=self.joint_loss, var_list=tf.compat.v1.trainable_variables()) 725 | 726 | self.init = tf.compat.v1.global_variables_initializer() 727 | self.saver = tf.compat.v1.train.Saver() 728 | 729 | self.model_built = True 730 | if verbose: 731 | print() 732 | print("Computation graph constructed") 733 | print() 734 | return self 735 | 736 | def train_model(self, 737 | training_epochs=100, 738 | verbose=True, 739 | verbosity_ival=1, 740 | excessive=False): 741 | """ 742 | This is the standard method for optimising the model's parameters. Must be 743 | called before imputation can be performed. 744 | 745 | Args: 746 | training_epochs: Integer. The number of complete cycles (forward passes) 747 | through the network during training (default = 100). 748 | 749 | verbose: Boolean. Specifies whether to print messages to the terminal 750 | during training, including loss values (default = True). 751 | 752 | verbosity_ival: Integer. The number of training epochs between messages 753 | (default = 1). 754 | 755 | excessive: Boolean. Specifies whether to print loss for each mini-batch 756 | to the terminal (default = \code{False}), which can help with 757 | troubleshooting. 758 | 759 | Returns: 760 | Self. Model is automatically saved upon reaching specified number of epochs 761 | 762 | """ 763 | if not self.model_built: 764 | raise AttributeError("The computation graph must be built before the model" 765 | " can be trained") 766 | 767 | if self.input_is_pipeline: 768 | raise AttributeError("Model was constructed to accept pipeline data, either" 769 | " use 'train_model_pipeline' method or rebuild model " 770 | "with in-memory dataset.") 771 | 772 | feed_data = self.imputation_target.values 773 | na_loc = self.na_matrix.values 774 | with tf.compat.v1.Session(graph=self.graph) as sess: 775 | if self.seed is not None: 776 | train_rng = np.random.default_rng(self.seed) 777 | # tf.compat.v1.set_random_seed(self.seed) 778 | else: 779 | train_rng = np.random.default_rng() 780 | 781 | sess.run(self.init) 782 | if verbose: 783 | print("Model initialised", flush=True) 784 | print(flush=True) 785 | for epoch in range(training_epochs): 786 | count = 0 787 | run_loss = 0 788 | for batch in self._batch_iter(feed_data, na_loc, self.train_batch, train_rng): 789 | if np.sum(batch[1]) == 0: 790 | continue 791 | feedin = {self.X: batch[0], self.na_idx: batch[1]} 792 | if self.additional_data is not None: 793 | feedin[self.X_add] = batch[2] 794 | loss, _ = sess.run([self.joint_loss, self.train_step], 795 | feed_dict=feedin) 796 | if excessive: 797 | print("Current cost:", loss) 798 | count += 1 799 | if not np.isnan(loss): 800 | run_loss += loss 801 | if verbose: 802 | if epoch % verbosity_ival == 0: 803 | print('Epoch:', epoch, ", loss:", str(run_loss / count), flush=True) 804 | if verbose: 805 | print("Training complete. Saving file...") 806 | save_path = self.saver.save(sess, self.savepath) 807 | if verbose: 808 | print("Model saved in file: %s" % save_path) 809 | return self 810 | 811 | def generate_samples(self, 812 | m=50, 813 | verbose=True): 814 | """ 815 | Method used to generate a set of m imputations to the .output_list attribute. 
816 | Imputations are stored within a list in memory, and can be accessed in any 817 | order. 818 | 819 | If a model has been pre-trained, on subsequent runs this function can be 820 | directly called without having to train first. An 'if' statement checking 821 | the default save location is useful for this. 822 | 823 | Args: 824 | m: Integer. The number of completed datasets to produce (default = 50) 825 | 826 | verbose: Boolean. Specifies whether to print messages to the terminal 827 | (default = True). 828 | Returns: 829 | Self 830 | """ 831 | 832 | if not self.model_built: 833 | raise AttributeError("The computation graph must be built before the model" 834 | " can be trained") 835 | 836 | if self.input_is_pipeline: 837 | raise AttributeError("Model was constructed to accept pipeline data, either" 838 | " use 'pipeline_yield_samples' method or rebuild model " 839 | "with in-memory dataset.") 840 | self.output_list = [] 841 | with tf.compat.v1.Session(graph=self.graph) as sess: 842 | self.saver.restore(sess, self.savepath) 843 | if verbose: 844 | print("Model restored.") 845 | for n in range(m): 846 | feed_data = self.imputation_target.values 847 | feedin = {self.X: feed_data} 848 | if self.additional_data is not None: 849 | feedin[self.X_add] = self.additional_data 850 | y_out = pd.DataFrame(sess.run(self.output_op, 851 | feed_dict=feedin), 852 | columns=self.imputation_target.columns) 853 | output_df = self.imputation_target.copy() 854 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 855 | self.output_list.append(output_df) 856 | return self 857 | 858 | def yield_samples(self, 859 | m=50, 860 | verbose=True): 861 | """ 862 | Method used to generate a set of m imputations via the 'yield' command, allowing 863 | imputations to be used in a 'for' loop' 864 | 865 | If a model has been pre-trained, on subsequent runs this function can be 866 | directly called without having to train first. An 'if' statement checking 867 | the default save location is useful for this. 868 | 869 | Args: 870 | m: Integer. Number of imputations to generate. 871 | 872 | verbose: Boolean. Prints out messages. 873 | 874 | Returns: 875 | Self 876 | """ 877 | 878 | if not self.model_built: 879 | raise AttributeError("The computation graph must be built before the model" 880 | " can be trained") 881 | 882 | if self.input_is_pipeline: 883 | raise AttributeError("Model was constructed to accept pipeline data, either" 884 | " use 'pipeline_yield_samples' method or rebuild model " 885 | "with in-memory dataset.") 886 | with tf.compat.v1.Session(graph=self.graph) as sess: 887 | self.saver.restore(sess, self.savepath) 888 | if verbose: 889 | print("Model restored.") 890 | for n in range(m): 891 | feed_data = self.imputation_target.values 892 | feedin = {self.X: feed_data} 893 | if self.additional_data is not None: 894 | feedin[self.X_add] = self.additional_data 895 | y_out = pd.DataFrame(sess.run(self.output_op, 896 | feed_dict=feedin), 897 | columns=self.imputation_target.columns) 898 | output_df = self.imputation_target.copy() 899 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 900 | yield output_df 901 | return self 902 | 903 | def batch_generate_samples(self, 904 | m=50, 905 | b_size=256, 906 | verbose=True): 907 | """ 908 | Method used to generate a set of m imputations to the .output_list attribute. 909 | Imputations are stored within a list in memory, and can be accessed in any 910 | order. 
As batch generation implies very large datasets, this method is only 911 | provided for completeness' sake. 912 | 913 | This function is for a dataset large enough to be stored in memory, but 914 | too large to be passed into the model in its entirety. This may be due to 915 | GPU memory limitations, or just the size of the model 916 | 917 | If a model has been pre-trained, on subsequent runs this function can be 918 | directly called without having to train first. An 'if' statement checking 919 | the default save location is useful for this. 920 | 921 | Args: 922 | m: Integer. Number of imputations to generate. 923 | 924 | b_size: Integer. Number of data entries to process at once. For managing 925 | wider datasets, smaller numbers may be required. 926 | 927 | verbose: Boolean. Prints out messages. 928 | 929 | Returns: 930 | Self 931 | """ 932 | if not self.model_built: 933 | raise AttributeError("The computation graph must be built before the model" 934 | " can be trained") 935 | 936 | if self.input_is_pipeline: 937 | raise AttributeError("Model was constructed to accept pipeline data, either" 938 | " use 'pipeline_yield_samples' method or rebuild model " 939 | "with in-memory dataset.") 940 | self.output_list = [] 941 | with tf.compat.v1.Session(graph=self.graph) as sess: 942 | self.saver.restore(sess, self.savepath) 943 | if verbose: 944 | print("Model restored.") 945 | for n in range(m): 946 | feed_data = self.imputation_target.values 947 | minibatch_list = [] 948 | for batch in self._batch_iter_output(feed_data, b_size): 949 | if self.additional_data is not None: 950 | feedin = {self.X: batch[0], self.X_add: batch[1]} 951 | else: 952 | feedin = {self.X: batch} 953 | y_batch = pd.DataFrame(sess.run(self.output_op, 954 | feed_dict=feedin), 955 | columns=self.imputation_target.columns) 956 | minibatch_list.append(y_batch) 957 | y_out = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 958 | columns=self.imputation_target.columns) 959 | output_df = self.imputation_target.copy() 960 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 961 | self.output_list.append(output_df) 962 | return self 963 | 964 | def batch_yield_samples(self, 965 | m=50, 966 | b_size=256, 967 | verbose=True): 968 | """ 969 | Method used to generate a set of m imputations via the 'yield' command, allowing 970 | imputations to be used in a 'for' loop' 971 | 972 | This function is for a dataset large enough to be stored in memory, but 973 | too large to be passed into the model in its entirety. This may be due to 974 | GPU memory limitations, or just the size of the model 975 | 976 | If a model has been pre-trained, on subsequent runs this function can be 977 | directly called without having to train first. An 'if' statement checking 978 | the default save location is useful for this. 979 | 980 | Args: 981 | m: Integer. Number of imputations to generate. 982 | 983 | b_size: Integer. Number of data entries to process at once. For managing 984 | wider datasets, smaller numbers may be required. 985 | 986 | verbose: Boolean. Prints out messages. 
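        Example of the save-location check mentioned above (an added sketch:
        'imputer', the epoch count, and the checkpoint suffix are illustrative
        assumptions rather than part of this docstring):

            import os

            # 'imputer' is a Midas instance on which build_model() has been called
            if not os.path.exists(imputer.savepath + ".index"):
                imputer.train_model(training_epochs=20)
            for completed_df in imputer.batch_yield_samples(m=5, b_size=256):
                ...  # use each completed DataFrame as it is generated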
987 | 988 | Returns: 989 | Self """ 990 | if not self.model_built: 991 | raise AttributeError("The computation graph must be built before the model" 992 | " can be trained") 993 | 994 | if self.input_is_pipeline: 995 | raise AttributeError("Model was constructed to accept pipeline data, either" 996 | " use 'pipeline_yield_samples' method or rebuild model " 997 | "with in-memory dataset.") 998 | with tf.compat.v1.Session(graph=self.graph) as sess: 999 | self.saver.restore(sess, self.savepath) 1000 | if verbose: 1001 | print("Model restored.") 1002 | for n in range(m): 1003 | feed_data = self.imputation_target.values 1004 | minibatch_list = [] 1005 | for batch in self._batch_iter_output(feed_data, b_size): 1006 | if self.additional_data is not None: 1007 | feedin = {self.X: batch[0], self.X_add: batch[1]} 1008 | else: 1009 | feedin = {self.X: batch} 1010 | y_batch = pd.DataFrame(sess.run(self.output_op, 1011 | feed_dict=feedin), 1012 | columns=self.imputation_target.columns) 1013 | minibatch_list.append(y_batch) 1014 | y_out = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 1015 | columns=self.imputation_target.columns) 1016 | output_df = self.imputation_target.copy() 1017 | output_df[np.invert(self.na_matrix)] = y_out[np.invert(self.na_matrix)] 1018 | yield output_df 1019 | return self 1020 | 1021 | def overimpute(self, 1022 | spikein=0.1, 1023 | training_epochs=100, 1024 | report_ival=10, 1025 | report_samples=32, 1026 | plot_vars=True, 1027 | verbose=True, 1028 | verbosity_ival=1, 1029 | spike_seed=42, 1030 | cont_kdes=False, 1031 | excessive=False, 1032 | plot_main=True, 1033 | skip_plot=False, 1034 | save_figs=False, 1035 | save_path="", 1036 | ): 1037 | """ 1038 | This function spikes in additional missingness, so that known values can be 1039 | used to help adjust the complexity of the model. As conventional train/ 1040 | validation splits can still lead to autoencoders overtraining, the method for 1041 | limiting complexity is overimputation and early stopping. This gives an 1042 | estimate of how the model will react to unseen variables. 1043 | 1044 | Error is defined as RMSE for continuous variables, and classification error 1045 | for binary and categorical variables (ie. 1 - accuracy). Note that this means 1046 | that binary classification is inherently dependent on a selection threshold 1047 | of 0.5, and softmax accuracy will automatically decrease as a function of the 1048 | number of classes within the model. All three will be affected by the degree 1049 | of imbalance within the dataset. 1050 | 1051 | The accuracy measures provided here may not be ideal for all problems, but 1052 | they are generally appropriate for selecting optimum complexity. Should the 1053 | lines denoting error begin to trend upwards, this indicates overtraining and 1054 | is a sign that the training_epochs parameter to the .train_model() method should 1055 | be capped before this point. 1056 | 1057 | The actual optimal point may differ from that indicated by the .overimpute() 1058 | method for two reasons: 1059 | -The loss that is spiked in reduces the overall data available to the algorithm 1060 | to learn the patterns inherent, so there should be some improvement in performance 1061 | when .train_model() is called. If this is a concern, then it should be possible 1062 | to compare the behaviour of the loss figure between .train_model() and 1063 | .overimpute(). 1064 | -The missingness inherent to the data may depend on some unobserved factor. 
1065 | In this case, the bias in the observed data may lead to inaccurate inference. 1066 | 1067 | It is worth visually inspecting the distribution of the overimputed values 1068 | against imputed values (using plot_vars) to ensure that they fall within a 1069 | sensible range. 1070 | 1071 | The plots block execution of the code until they are closed. To only plot a 1072 | single overimputation graph at the end of the run, you can supply plot_main = False 1073 | and plot_vars = False. To run the imputation without plotting any graphs, 1074 | set skip_plot = True in addition. The overimputation function will still print 1075 | predicted errors to the console. 1076 | 1077 | Args: 1078 | spikein: Float, between 0 and 1. The proportion of observed values in the 1079 | input dataset to be randomly removed (default = 0.1). 1080 | 1081 | training_epochs: Integer. The number of overimputation training epochs 1082 | (default = 100). Selecting a low value increases the risk that trends in the 1083 | loss metrics have not stabilized by the end of training, in which case 1084 | additional epochs may be necessary. 1085 | 1086 | report_ival: Integer. The number of overimputation training epochs between 1087 | calculations of loss (default = 10). Shorter intervals provide a more granular 1088 | view of model performance but slow down the overimputation process. 1089 | 1090 | report_samples: The number of Monte Carlo samples drawn from the estimated 1091 | missing-data posterior for loss calculations (default = 32). A larger number 1092 | increases overimputation runtime and may thus necessitate a lower value of 1093 | report_ival. 1094 | 1095 | plot_vars: Specifies whether to plot the distribution of original versus 1096 | overimputed values (default = True). This takes the form of a density 1097 | plot for continuous variables and a barplot for categorical variables (showing 1098 | proportions of each class). 1099 | 1100 | plot_main: Boolean. Specifies whether to display the main graphical output 1101 | (overimputation error during training) at every reporting interval (default = True). 1102 | If set to False, it will only appear at the end of the overimputation training 1103 | process. Error values are still shown at each report_ival. 1104 | 1105 | skip_plot: Boolean. Specifies whether to suppress the main graphical output 1106 | (default = False). This may be desirable when users are conducting multiple 1107 | overimputation exercises sequentially and are primarily interested in the console 1108 | output. 1109 | 1110 | save_figs: Boolean. Specifies whether to save generated figures instead of 1111 | displaying graphical output (default = False). 1112 | 1113 | save_path: String. Specifies path to save pyplots if save_figs = True 1114 | (default = working directory). 1115 | 1116 | verbose: Boolean. Prints out messages, including loss, to the terminal (default = True). 1117 | 1118 | verbosity_ival: Integer. The number of overimputation training epochs between 1119 | messages (default = True). 1120 | 1121 | spike_seed: Integer. The value to which Python's pseudo-random number generator is initialized 1122 | for the missingness spike-in. This is separate to the seed specified in the Midas() 1123 | call. 1124 | 1125 | cont_kdes: Boolean. Whether to plot kernel density estimates for continuous variables. 1126 | 1127 | excessive: Specifies whether to print aggregate mini-batch loss to the terminal 1128 | (default = False). 
This argument differs from the .train_model()'s excessive argument, 1129 | which prints individual mini-batch loss. This allows users to check for unusual imputations, 1130 | which may be helpful if loss is not declining during overimputation training. 1131 | 1132 | 1133 | """ 1134 | if not self.model_built: 1135 | raise AttributeError("The computation graph must be built before the model can be trained") 1136 | 1137 | if self.input_is_pipeline: 1138 | raise AttributeError("Overimputation not currently supported for models" 1139 | " which use a pipeline function for input.") 1140 | # These values simplify control flow used later for error calculation and 1141 | # visualisation of convergence. 1142 | if cont_kdes & (plot_vars is False): 1143 | raise ValueError("Cannot plot KDEs if plot_vars is False") 1144 | 1145 | if excessive: 1146 | import time 1147 | 1148 | overimp_rng = np.random.default_rng(spike_seed) 1149 | 1150 | rmse_in = False 1151 | sacc_in = False 1152 | bacc_in = False 1153 | if 'rmse' in self.output_types: 1154 | rmse_in = True 1155 | if 'sacc' in self.output_types: 1156 | def sacc(true, pred, spike): # Softmax accuracy 1157 | a = np.argmax(true, 1) 1158 | b = np.argmax(pred, 1) 1159 | return np.sum(a[spike.flatten()] == b[spike.flatten()]) / np.sum(spike) 1160 | 1161 | def findcatname(strlist): 1162 | return strlist[0][:([min([x[0] == elem for elem in x]) for x in zip(*strlist)] + [0]).index(0)] 1163 | 1164 | sacc_in = True 1165 | 1166 | if 'bacc' in self.output_types: 1167 | def bacc(true, pred, spike): 1168 | pred = (pred > 0.5).astype(np.int_) 1169 | return np.sum(true[spike] == pred[spike]) / np.sum(spike) 1170 | 1171 | bacc_in = True 1172 | 1173 | feed_data = self.imputation_target.copy() 1174 | na_loc = self.na_matrix 1175 | # np.random.seed(spike_seed) 1176 | n_softmax = 0 # Necessary to derive the average classification error 1177 | 1178 | # Pandas lacks an equivalent to tf.split, so this is used to divide columns 1179 | # for their respective error metrics 1180 | break_list = list(np.cumsum(self.size_index)) 1181 | break_list.insert(0, 0) 1182 | 1183 | # Generate spike-in 1184 | spike = [] 1185 | for n in range(len(self.size_index)): 1186 | if self.output_types[n] == 'sacc': 1187 | temp_spike = pd.Series(overimp_rng.choice([True, False], 1188 | size=self.imputation_target.shape[0], 1189 | p=[spikein, 1 - spikein])) 1190 | 1191 | spike.append(pd.concat([temp_spike] * self.size_index[n], axis=1)) 1192 | n_softmax += 1 1193 | 1194 | else: 1195 | spike.append(pd.DataFrame(overimp_rng.choice([True, False], 1196 | size=[self.imputation_target.shape[0], 1197 | self.size_index[n]], 1198 | p=[spikein, 1 - spikein]))) 1199 | spike = pd.concat(spike, axis=1) 1200 | spike.columns = self.imputation_target.columns 1201 | spike[np.invert(na_loc)] = False 1202 | feed_data[spike] = 0 1203 | feed_data = feed_data.values 1204 | na_loc[spike] = False 1205 | spike = spike.values 1206 | na_loc = na_loc.values 1207 | 1208 | # Initialise lists for plotting 1209 | s_rmse = [] 1210 | a_rmse = [] 1211 | s_bacc = [] 1212 | a_bacc = [] 1213 | s_sacc = [] 1214 | a_sacc = [] 1215 | with tf.compat.v1.Session(graph=self.graph) as sess: 1216 | if self.seed is not None: 1217 | train_rng = np.random.default_rng(self.seed) 1218 | 1219 | sess.run(self.init) 1220 | print("Model initialised", flush=True) 1221 | print(flush=True) 1222 | for epoch in range(training_epochs + 1): 1223 | count = 0 1224 | run_loss = 0 1225 | for batch in self._batch_iter(feed_data, na_loc, self.train_batch, train_rng): 1226 | 
if np.sum(batch[1]) == 0: 1227 | continue 1228 | feedin = {self.X: batch[0], self.na_idx: batch[1]} 1229 | if self.additional_data is not None: 1230 | feedin[self.X_add] = batch[2] 1231 | if excessive: 1232 | out, loss, _ = sess.run([self.output_op, self.joint_loss, self.train_step], 1233 | feed_dict=feedin) 1234 | print("Current cost:", loss) 1235 | print(out) 1236 | time.sleep(5) 1237 | else: 1238 | loss, _ = sess.run([self.joint_loss, self.train_step], 1239 | feed_dict=feedin) 1240 | count += 1 1241 | 1242 | if not np.isnan(loss): 1243 | run_loss += loss 1244 | if verbose: 1245 | if epoch % verbosity_ival == 0: 1246 | print('Epoch:', epoch, ", loss:", str(run_loss / count), flush=True) 1247 | 1248 | if epoch % report_ival == 0: 1249 | """ 1250 | For each report interval, generate report_samples worth of imputations 1251 | and measure both individual and aggregate error values 1252 | """ 1253 | # Initialise losses 1254 | single_rmse = 0 1255 | single_sacc = 0 1256 | single_bacc = 0 1257 | first = True 1258 | if cont_kdes: 1259 | plot_first = True 1260 | 1261 | for sample in range(report_samples): 1262 | 1263 | minibatch_list = [] 1264 | for batch in self._batch_iter_output(feed_data, self.train_batch): 1265 | feedin = {self.X: batch} 1266 | if self.additional_data is not None: 1267 | feedin = {self.X: batch[0]} 1268 | feedin[self.X_add] = batch[1] 1269 | else: 1270 | feedin = {self.X: batch} 1271 | y_batch = pd.DataFrame(sess.run(self.output_op, 1272 | feed_dict=feedin), 1273 | columns=self.imputation_target.columns) 1274 | minibatch_list.append(y_batch) 1275 | y_out = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 1276 | columns=self.imputation_target.columns) 1277 | if cont_kdes: 1278 | if 'rmse' in self.output_types: 1279 | for n in range(self.size_index[0]): 1280 | plt.figure(n + 1) 1281 | t_t = self.imputation_target.iloc[:, n] 1282 | t_p = y_out.iloc[:, n] 1283 | t_s = spike[:, n] 1284 | if plot_first: 1285 | t_p[t_s].plot(kind='density', color='k', alpha=0.5, label='Single imputation') 1286 | else: 1287 | t_p[t_s].plot(kind='density', color='k', alpha=0.5, label='_nolegend_') 1288 | plot_first = False 1289 | 1290 | # Calculate individual imputation losses 1291 | for n in range(len(self.size_index)): 1292 | temp_pred = y_out.iloc[:, break_list[n]:break_list[n + 1]] 1293 | temp_true = self.imputation_target.iloc[:, break_list[n]:break_list[n + 1]] 1294 | temp_spike = spike[:, break_list[n]:break_list[n + 1]] 1295 | if self.output_types[n] == 'sacc': 1296 | temp_spike = temp_spike[:, 0] 1297 | single_sacc += (1 - sacc(temp_true.values, 1298 | temp_pred.values, temp_spike)) / n_softmax 1299 | 1300 | elif self.output_types[n] == 'rmse': 1301 | single_rmse += np.sqrt(mse(temp_true[temp_spike], 1302 | temp_pred[temp_spike])) 1303 | else: 1304 | single_bacc += 1 - bacc(temp_true.values, temp_pred.values, temp_spike) 1305 | 1306 | if first: 1307 | running_output = y_out 1308 | first = False 1309 | else: 1310 | running_output += y_out 1311 | single_rmse = single_rmse / report_samples 1312 | single_sacc = single_sacc / report_samples 1313 | single_bacc = single_bacc / report_samples 1314 | y_out = running_output / report_samples 1315 | 1316 | # Calculate aggregate imputation losses 1317 | agg_rmse = 0 1318 | agg_sacc = 0 1319 | agg_bacc = 0 1320 | for n in range(len(self.size_index)): 1321 | temp_pred = y_out.iloc[:, break_list[n]:break_list[n + 1]] 1322 | temp_true = self.imputation_target.iloc[:, break_list[n]:break_list[n + 1]] 1323 | temp_spike = spike[:, 
break_list[n]:break_list[n + 1]] 1324 | if self.output_types[n] == 'sacc': 1325 | temp_spike = temp_spike[:, 0] 1326 | if plot_vars: 1327 | temp_pred[temp_spike].mean().plot(kind='bar', 1328 | label='Imputed values (mean)', color='C0') 1329 | temp_true[temp_spike].mean().plot(kind='bar', alpha=0.5, 1330 | color='r', align='edge', 1331 | label='Removed observed values (mean)') 1332 | temp_true_name = findcatname(temp_true[temp_spike].columns)[:-1] 1333 | plt.title('Overimputation density plot: ' + temp_true_name + ' (categorical)') 1334 | plt.xlabel(temp_true_name) 1335 | plt.ylabel('Proportion') 1336 | plt.legend() 1337 | 1338 | if save_figs: 1339 | plt.tight_layout() 1340 | plt.savefig(save_path + temp_true_name + "_epoch_" + str(epoch) + ".png") 1341 | plt.clf() 1342 | else: 1343 | plt.show() 1344 | 1345 | agg_sacc += (1 - sacc(temp_true.values, temp_pred.values, 1346 | temp_spike)) / n_softmax 1347 | elif self.output_types[n] == 'rmse': 1348 | if plot_vars: 1349 | for n_rmse in range(len(temp_pred.columns)): 1350 | plt.figure(n_rmse + 1) 1351 | t_p = temp_pred.iloc[:, n_rmse] 1352 | t_t = temp_true.iloc[:, n_rmse] 1353 | t_s = temp_spike[:, n_rmse] 1354 | t_p[t_s].plot(kind='density', label='Imputed values (mean)') 1355 | t_t[t_s].plot(kind='density', color='r', label='Removed observed values') 1356 | t_t.plot(kind='kde', color='g', label='All observed values') 1357 | hyp_output = pd.concat([t_t[np.invert(t_s)], t_p[t_s]]) 1358 | hyp_output.plot(kind='kde', color='m', label='Completed data') 1359 | plt.title('Overimputation density plot: ' + \ 1360 | temp_pred.columns[n_rmse] + ' (continuous)') 1361 | plt.xlabel(temp_pred.columns[n_rmse]) 1362 | plt.ylabel('Density') 1363 | plt.legend() 1364 | 1365 | if save_figs: 1366 | plt.tight_layout() 1367 | plt.savefig( 1368 | save_path + temp_pred.columns[n_rmse] + "_epoch_" + str(epoch) + ".png") 1369 | plt.clf() 1370 | else: 1371 | plt.show() 1372 | 1373 | agg_rmse += np.sqrt(mse(temp_true[temp_spike], 1374 | temp_pred[temp_spike])) 1375 | else: 1376 | if plot_vars: 1377 | temp_pred[temp_spike].mean().plot(kind='bar', 1378 | label='Imputed values', 1379 | color='C0') 1380 | temp_true[temp_spike].mean().plot(kind='bar', alpha=0.5, 1381 | color='r', align='edge', label='Observed values') 1382 | plt.title('Overimputation binary proportions') 1383 | plt.xlabel('Variables') 1384 | plt.ylabel('Proportion') 1385 | plt.legend() 1386 | 1387 | if save_figs: 1388 | plt.tight_layout() 1389 | plt.savefig(save_path + "binary_vars_epoch_" + str(epoch) + ".png") 1390 | plt.clf() 1391 | else: 1392 | plt.show() 1393 | 1394 | agg_bacc += 1 - bacc(temp_true.values, temp_pred.values, temp_spike) 1395 | 1396 | # Plot losses depending on which loss values present in data 1397 | if rmse_in: 1398 | s_rmse.append(single_rmse) 1399 | a_rmse.append(agg_rmse) 1400 | print("Individual RMSE on spike-in:", single_rmse, flush=True) 1401 | print("Aggregated RMSE on spike-in:", agg_rmse, flush=True) 1402 | 1403 | if sacc_in: 1404 | s_sacc.append(single_sacc) 1405 | a_sacc.append(agg_sacc) 1406 | print("Individual error on softmax spike-in:", single_sacc, flush=True) 1407 | print("Aggregated error on softmax spike-in:", agg_sacc, flush=True) 1408 | 1409 | if bacc_in: 1410 | s_bacc.append(single_bacc) 1411 | a_bacc.append(agg_bacc) 1412 | print("Individual error on binary spike-in:", single_bacc, flush=True) 1413 | print("Aggregated error on binary spike-in:", agg_bacc, flush=True) 1414 | 1415 | if plot_main or ((training_epochs - epoch) < report_ival): 1416 | if rmse_in: 1417 | 
plt.plot(s_rmse, 'k-', label="Individual RMSE") 1418 | plt.plot(a_rmse, 'k--', label="Aggregated RMSE") 1419 | min_sr = min(s_rmse) 1420 | min_ar = min(a_rmse) 1421 | plt.plot([min_sr] * len(s_rmse), 'r:') 1422 | plt.plot([min_ar] * len(a_rmse), 'r:') 1423 | plt.plot(s_rmse.index(min(s_rmse)), 1424 | min_sr, 'rx') 1425 | plt.plot(a_rmse.index(min(a_rmse)), 1426 | min_ar, 'rx') 1427 | 1428 | if sacc_in: 1429 | plt.plot(s_sacc, 'g-', label="Individual classification error") 1430 | plt.plot(a_sacc, 'g--', label="Aggregated classification error") 1431 | min_ss = min(s_sacc) 1432 | min_as = min(a_sacc) 1433 | plt.plot([min_ss] * len(s_sacc), 'r:') 1434 | plt.plot([min_as] * len(a_sacc), 'r:') 1435 | plt.plot(s_sacc.index(min(s_sacc)), 1436 | min_ss, 'rx') 1437 | plt.plot(a_sacc.index(min(a_sacc)), 1438 | min_as, 'rx') 1439 | 1440 | if bacc_in: 1441 | plt.plot(s_bacc, 'b-', label="Individual binary error") 1442 | plt.plot(a_bacc, 'b--', label="Aggregated binary error") 1443 | min_sb = min(s_bacc) 1444 | min_ab = min(a_bacc) 1445 | plt.plot([min_sb] * len(s_bacc), 'r:') 1446 | plt.plot([min_ab] * len(a_bacc), 'r:') 1447 | plt.plot(s_bacc.index(min(s_bacc)), 1448 | min_sb, 'rx') 1449 | plt.plot(a_bacc.index(min(a_bacc)), 1450 | min_ab, 'rx') 1451 | 1452 | # Complete plots 1453 | if not skip_plot: 1454 | plt.title("Overimputation error during training") 1455 | plt.ylabel("Error") 1456 | plt.legend(loc=4) 1457 | plt.ylim(ymin=0) 1458 | plt.xlabel("Reporting interval") 1459 | 1460 | if save_figs: 1461 | plt.tight_layout() 1462 | plt.savefig(save_path + "overimputation_error.png") 1463 | plt.clf() 1464 | else: 1465 | plt.show() 1466 | 1467 | print("Overimputation complete. Adjust complexity as needed.", flush=True) 1468 | return self 1469 | 1470 | def build_model_pipeline(self, 1471 | data_sample, 1472 | binary_columns=None, 1473 | softmax_columns=None, 1474 | unsorted=True, 1475 | additional_data_sample=None, 1476 | verbose=True, 1477 | crossentropy_adj=1, 1478 | loss_scale=1): 1479 | """ 1480 | This function is for integration with databasing or any dataset that needs 1481 | to be batched into memory. The data sample is simply there to allow the 1482 | original constructor to be recycled. The head of the data should be sufficient 1483 | to build the imputation model. The input pipeline itself should pre-scale 1484 | the data, and code null values as type np.nan. The pipeline ought to output 1485 | a Pandas DataFrame. If additional data will be passed in, then the return must 1486 | be a list of two DataFrames. The columns of the dataframe will be re-arranged 1487 | so that error functions are efficiently generated. 1488 | 1489 | IT IS IMPERITIVE that this ordering is respected. Design the input batching 1490 | function accordingly. 1491 | 1492 | The categorical columns should be a list of column names. Softmax columns 1493 | should be a list of lists of column names. This will allow the model to 1494 | dynamically assign cost functions to the correct variables. If, however, 1495 | the data comes pre-sorted, arranged can be set to "true", in which case 1496 | the arguments can be passed in as integers of size, ie. shape[1] attributes 1497 | for each of the relevant categories. 1498 | 1499 | In other words, pre-sort your data and pass in the integers, so indexing 1500 | dynamically doesn't become too difficult. Alternatively, list(df.columns.values) 1501 | will output a list of column names, which can be easily implemented in the 1502 | 'for' loop which constructs your dummy variables. 
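        A minimal example of such a batching function (an added sketch: the CSV
        path, chunk size, and class name are hypothetical stand-ins for whatever
        database or flat-file source is actually used, and the chunks are assumed
        to be pre-scaled with nulls coded as np.nan):

            import pandas as pd

            class CSVPipeline:
                def __init__(self, path, chunksize=1024):
                    self.path, self.chunksize = path, chunksize
                def __iter__(self):
                    # A fresh pass over the file each time the object is iterated,
                    # so every training epoch sees the full dataset.
                    return iter(pd.read_csv(self.path, chunksize=self.chunksize))

            pipeline = CSVPipeline("big_file.csv")
            # 'pipeline' is what train_model_pipeline() iterates over, once per epoch.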
1503 | """ 1504 | self.input_is_pipeline = True 1505 | b_c = binary_columns 1506 | s_c = softmax_columns 1507 | us = unsorted 1508 | a_d = additional_data_sample 1509 | vb = verbose 1510 | cea = crossentropy_adj 1511 | l_s = loss_scale 1512 | 1513 | self.build_model(data_sample, b_c, s_c, us, a_d, vb, cea, l_s) 1514 | 1515 | return self 1516 | 1517 | def train_model_pipeline(self, 1518 | input_pipeline, 1519 | training_epochs=100, 1520 | verbose=True, 1521 | verbosity_ival=1, 1522 | excessive=False): 1523 | """ 1524 | This is the alternative method for optimising the model's parameters when input 1525 | data must be batched into memory. Must be called before imputation can be 1526 | performed. The model will then be saved to the specified directory 1527 | 1528 | Args: 1529 | input_pipeline: Function which yields a pre-processed and scaled DataFrame 1530 | from the designated source, be it a server or large flat file. 1531 | 1532 | training_epochs: Integer. The number of epochs the model will run for 1533 | 1534 | verbose: Boolean. Prints out messages, including loss 1535 | 1536 | verbosity_ival: Integer. This number determines the interval between 1537 | messages. 1538 | 1539 | excessive: Boolean. Used for troubleshooting, this argument will cause the 1540 | cost of each batch to be printed to the terminal. 1541 | 1542 | Returns: 1543 | Self. Model is automatically saved upon reaching specified number of epochs 1544 | 1545 | """ 1546 | self.input_pipeline = input_pipeline 1547 | if not self.model_built: 1548 | raise AttributeError("The computation graph must be built before the model" 1549 | " can be trained") 1550 | if not self.input_is_pipeline: 1551 | raise AttributeError("Model was constructed to accept locally-stored data," 1552 | "either use 'train_model' method or rebuild model " 1553 | "with the 'build_model_pipeline' method.") 1554 | 1555 | # if self.seed is not None: 1556 | # np.random.seed(self.seed) 1557 | with tf.compat.v1.Session(graph=self.graph) as sess: 1558 | sess.run(self.init) 1559 | if verbose: 1560 | print("Model initialised") 1561 | print() 1562 | for epoch in range(training_epochs): 1563 | count = 0 1564 | run_loss = 0 1565 | 1566 | for feed_data in input_pipeline: 1567 | if self.additional_data is None: 1568 | if not isinstance(feed_data, pd.DataFrame): 1569 | raise TypeError("Input data must be in a DataFrame") 1570 | na_loc = feed_data.notnull().astype(bool).values 1571 | feedin = {self.X: feed_data.values, 1572 | self.na_idx: na_loc} 1573 | else: 1574 | if not isinstance(feed_data, list): 1575 | raise TypeError("Input should be a list of two DataFrames, with " 1576 | "index 0 containing the target imputation data, and" 1577 | " the data at index 1 containing additional data") 1578 | if len(feed_data) != 2: 1579 | raise TypeError("Input should be a list of two DataFrames, with " 1580 | "index 0 containing the target imputation data, and" 1581 | " the data at index 1 containing additional data") 1582 | if not isinstance(feed_data[0], pd.DataFrame): 1583 | raise TypeError("Input data must be in a DataFrame") 1584 | if not isinstance(feed_data[1], pd.DataFrame): 1585 | raise TypeError("Additional data must be in a DataFrame") 1586 | na_loc = feed_data[0].notnull().astype(bool).values 1587 | feedin = {self.X: feed_data[0].fillna(0).values, 1588 | self.X_add: feed_data[1].fillna(0).values, 1589 | self.na_idx: na_loc} 1590 | 1591 | if np.sum(na_loc) == 0: 1592 | continue 1593 | loss, _ = sess.run([self.joint_loss, self.train_step], 1594 | feed_dict=feedin) 1595 | if 
excessive: 1596 | print("Current cost:", loss) 1597 | count += 1 1598 | if not np.isnan(loss): 1599 | run_loss += loss 1600 | if verbose: 1601 | if epoch % verbosity_ival == 0: 1602 | print('Epoch:', epoch, ", loss:", str(run_loss / count)) 1603 | if verbose: 1604 | print("Training complete. Saving file...") 1605 | save_path = self.saver.save(sess, self.savepath) 1606 | if verbose: 1607 | print("Model saved in file: %s" % save_path) 1608 | return self 1609 | 1610 | def yield_samples_pipeline(self, 1611 | verbose=False): 1612 | """ 1613 | As it's impossible to know the specifics of the pipeline, this method simply 1614 | cycles through all data provided by the input function. The number of imputations 1615 | can be specified by the user, depending on their needs. 1616 | 1617 | Args: 1618 | verbose: Prints out messages 1619 | 1620 | Yields: 1621 | A 'DataFrame' of the size specified by the input function passed to the 1622 | 'train_model_pipeline' method. 1623 | 1624 | Returns: 1625 | Self 1626 | 1627 | """ 1628 | if not self.model_built: 1629 | raise AttributeError("The computation graph must be built before the model" 1630 | " can be trained") 1631 | if not self.input_is_pipeline: 1632 | raise AttributeError("Model was constructed to accept locally-stored data," 1633 | "either use 'train_model' method or rebuild model " 1634 | "with the 'build_model_pipeline' method.") 1635 | 1636 | # if self.seed is not None: 1637 | # np.random.seed(self.seed) 1638 | # tf.compat.v1.set_random_seed(self.seed) 1639 | with tf.compat.v1.Session(graph=self.graph) as sess: 1640 | self.saver.restore(sess, self.savepath) 1641 | if verbose: 1642 | print("Model restored.") 1643 | 1644 | for feed_data in self.input_pipeline: 1645 | if self.additional_data is None: 1646 | if not isinstance(feed_data, pd.DataFrame): 1647 | raise TypeError("Input data must be in a DataFrame") 1648 | na_loc = feed_data.notnull().astype(bool).values 1649 | feedin = {self.X: feed_data.fillna(0).values} 1650 | else: 1651 | if not isinstance(feed_data, list): 1652 | raise TypeError("Input should be a list of two DataFrames, with " 1653 | "index 0 containing the target imputation data, and" 1654 | " the data at index 1 containing additional data") 1655 | if len(feed_data) != 2: 1656 | raise TypeError("Input should be a list of two DataFrames, with " 1657 | "index 0 containing the target imputation data, and" 1658 | " the data at index 1 containing additional data") 1659 | if not isinstance(feed_data[0], pd.DataFrame): 1660 | raise TypeError("Input data must be in a DataFrame") 1661 | if not isinstance(feed_data[1], pd.DataFrame): 1662 | raise TypeError("Additional data must be in a DataFrame") 1663 | na_loc = feed_data[0].notnull().astype(bool).values 1664 | feedin = {self.X: feed_data[0].fillna(0).values, 1665 | self.X_add: feed_data[1].fillna(0).values} 1666 | feed_data = feed_data[0] 1667 | na_loc = feed_data.notnull().astype(bool).values 1668 | 1669 | y_out = pd.DataFrame(sess.run(self.output_op, feed_dict=feedin), 1670 | columns=self.imputation_target.columns) 1671 | output_df = self.imputation_target.copy() 1672 | output_df[np.invert(na_loc)] = y_out[np.invert(na_loc)] 1673 | yield output_df 1674 | 1675 | return self 1676 | 1677 | def sample_from_z(self, 1678 | sample_size=256, 1679 | verbose=True): 1680 | """ 1681 | Method used to generate new samples by drawing on the default Student-T(3) 1682 | sampling distribution. In effect, generates new data samples. 1683 | Arguments: 1684 | 1685 | sample_size: Integer.
Number of sample observations to draw at once. 1686 | 1687 | verbose: Boolean. Prints out messages. 1688 | 1689 | Returns: 1690 | Sampled_output 1691 | """ 1692 | if not self.model_built: 1693 | raise AttributeError("The computation graph must be built before the model" 1694 | " can be trained") 1695 | if not self.vae_layer: 1696 | raise AttributeError("The model must include a VAE layer to be used to generate" 1697 | " new observations from a latent distribution") 1698 | if self.input_is_pipeline: 1699 | raise AttributeError("Model was constructed to accept pipeline data, either" 1700 | " use 'pipeline_yield_samples' method or rebuild model " 1701 | "with in-memory dataset.") 1702 | with tf.compat.v1.Session(graph=self.graph) as sess: 1703 | self.saver.restore(sess, self.savepath) 1704 | if verbose: 1705 | print("Model restored.") 1706 | feedin = {self.latent_inputs: np.zeros([sample_size, self.latent_space_size])} 1707 | out = sess.run(self.gen_from_z_sample, feed_dict=feedin) 1708 | sampled_output = pd.DataFrame(out, 1709 | columns=self.imputation_target.columns) 1710 | return sampled_output 1711 | 1712 | def transform_from_z(self, 1713 | data, 1714 | b_size=256, 1715 | verbose=True): 1716 | """ 1717 | Method used to generate new samples by drawing on the default Student-T(3) 1718 | sampling distribution. In effect, generates new data samples. 1719 | Arguments: 1720 | 1721 | data: Pandas dataframe or numpy array, as wide as latent_space_size. These 1722 | numbers can be sampled from some distribution, or can be structured vectors 1723 | to enable sweeping through the data space. 1724 | 1725 | b_size: Integer. Number of data entries to process at once. For managing 1726 | larger input datasets, smaller numbers may be required. 1727 | 1728 | verbose: Boolean. Prints out messages. 1729 | 1730 | Returns: 1731 | Generated_output 1732 | """ 1733 | if not self.model_built: 1734 | raise AttributeError("The computation graph must be built before the model" 1735 | " can be trained") 1736 | if not self.vae_layer: 1737 | raise AttributeError("The model must include a VAE layer to be used to generate" 1738 | " new observations from a latent distribution") 1739 | if self.input_is_pipeline: 1740 | raise AttributeError("Model was constructed to accept pipeline data, either" 1741 | " use 'pipeline_yield_samples' method or rebuild model " 1742 | "with in-memory dataset.") 1743 | assert data.shape[1] == self.latent_space_size 1744 | with tf.compat.v1.Session(graph=self.graph) as sess: 1745 | self.saver.restore(sess, self.savepath) 1746 | if verbose: 1747 | print("Model restored.") 1748 | feed_data = data 1749 | minibatch_list = [] 1750 | for batch in self._batch_iter_zsample(feed_data, b_size): 1751 | feedin = {self.latent_inputs: batch} 1752 | y_batch = pd.DataFrame(sess.run(self.gen_from_z_inputs, 1753 | feed_dict=feedin), 1754 | columns=self.imputation_target.columns) 1755 | minibatch_list.append(y_batch) 1756 | generated_output = pd.DataFrame(pd.concat(minibatch_list, ignore_index=True), 1757 | columns=self.imputation_target.columns) 1758 | return generated_output 1759 | 1760 | def inputs_to_z(self, 1761 | b_size=256, 1762 | verbose=True): 1763 | """ 1764 | Method used for transforming imputation_target into a latent representation 1765 | for analysis. Can be used for observing how data behaves in a lower dimensional 1766 | space, etc. 1767 | 1768 | Args: 1769 | m: Integer. Number of imputations to generate. 1770 | 1771 | b_size: Integer. Number of data entries to process at once. 
For managing 1772 | wider datasets, smaller numbers may be required. 1773 | 1774 | verbose: Boolean. Prints out messages. 1775 | 1776 | Returns: 1777 | Self, z_mu, z_log_sigma 1778 | """ 1779 | if not self.model_built: 1780 | raise AttributeError("The computation graph must be built before the model" 1781 | " can be trained") 1782 | if not self.vae_layer: 1783 | raise AttributeError("The model must include a VAE layer to be used to encode" 1784 | " the dataset into the latent space") 1785 | 1786 | if self.input_is_pipeline: 1787 | raise AttributeError("Model was constructed to accept pipeline data, either" 1788 | " use 'pipeline_yield_samples' method or rebuild model " 1789 | "with in-memory dataset.") 1790 | with tf.compat.v1.Session(graph=self.graph) as sess: 1791 | self.saver.restore(sess, self.savepath) 1792 | if verbose: 1793 | print("Model restored.") 1794 | feed_data = self.imputation_target.values 1795 | mu_list = [] 1796 | sigma_list = [] 1797 | for batch in self._batch_iter_output(feed_data, b_size): 1798 | if self.additional_data is not None: 1799 | feedin = {self.X: batch[0], self.X_add: batch[1]} 1800 | else: 1801 | feedin = {self.X: batch} 1802 | batch_mu, batch_sigma = sess.run(self.encode_to_z, 1803 | feed_dict=feedin) 1804 | batch_mu = pd.DataFrame(batch_mu) 1805 | batch_sigma = pd.DataFrame(batch_sigma) 1806 | mu_list.append(batch_mu) 1807 | sigma_list.append(batch_sigma) 1808 | x_mu = pd.concat(mu_list, ignore_index=True) 1809 | x_log_sigma = pd.concat(sigma_list, ignore_index=True) 1810 | return x_mu, x_log_sigma 1811 | 1812 | def change_imputation_target(self, new_target, additional_data=None): 1813 | """ 1814 | Helper method to allow for imputed dataset to be hotswapped. MIDAS is not 1815 | designed with such a function in mind, but this should allow for more flexible 1816 | workflows. 
1817 | """ 1818 | if type(self.imputation_target) != type(new_target): 1819 | raise ValueError("New target must be of same type as original target dataset") 1820 | if type(self.imputation_target) == pd.core.series.Series: 1821 | if self.imputation_target.name != new_target.name: 1822 | raise ValueError("Ensure input series are from same source") 1823 | elif type(self.imputation_target) == pd.core.frame.DataFrame: 1824 | test_1 = new_target.shape[1] == self.imputation_target.shape[1] 1825 | test_2 = new_target.columns.isin(self.imputation_target.columns).sum() \ 1826 | == new_target.shape[1] 1827 | if not test_1 & test_2: 1828 | raise ValueError("New target must have same columns as original target dataframe") 1829 | if self.additional_data is not None: 1830 | test_1 = new_target.shape[1] == self.additional_data.shape[1] 1831 | test_2 = additional_data.columns.isin(self.additional_data.columns).sum() \ 1832 | == additional_data.shape[1] 1833 | if not test_1 & test_2: 1834 | raise ValueError("New target must have same columns as original target dataframe") 1835 | else: 1836 | raise ValueError("Target must be Pandas dataframe or series") 1837 | self.imputation_target = new_target.copy() 1838 | if self.additional_data is not None: 1839 | self.additional_data = additional_data.copy() 1840 | self.additional_data.fillna(0, inplace=True) 1841 | self.na_matrix = self.imputation_target.notnull().astype(bool) 1842 | self.imputation_target.fillna(0, inplace=True) 1843 | return self 1844 | 1845 | 1846 | def combine(y_var, 1847 | X_vars, 1848 | df_list=None, 1849 | dof_adjust=True, 1850 | incl_constant=True, 1851 | **glm_args, 1852 | ): 1853 | """ 1854 | Function used to run a GLM model across multiple datasets, aggregating the 1855 | results using Rubin's combination rules -- i.e. multiple imputation analysis. 1856 | 1857 | This function regresses the outcome variable on a linear combination of 1858 | independent variables, given a user-specified model family and link function. 1859 | For example if y_var = 'y' and X_vars = ['x1','x2','x3'], then by default this 1860 | function estimates the model y = a + x1 + x2 + x3, where a is the constant term. 1861 | Note, the constant term is added by default, but can be excluded by setting 1862 | incl_constant = False. 1863 | 1864 | This function wraps statsmodels.GLM() and allows users to specify linear 1865 | models using GLM families including Gaussian, Binomial, and Poisson. 1866 | 1867 | The function can be called on the completed dataframes generated from a MIDAS 1868 | model or users can supply their own list of completed datasets to analyse. 1869 | 1870 | Args: 1871 | df_list: A list of pd.DataFrames. The M completed datasets to be analyzed. 1872 | 1873 | y_var: String. The name of the outcome variable. 1874 | 1875 | X_vars: List of strings. The names of the predictor variables. 1876 | 1877 | dof_adjust: Boolean. Indicates whether to apply the Barnard and Rubin (1999) 1878 | degrees of freedom adjustment for small-samples. 1879 | 1880 | incl_constant: Boolean. Indicates whether to include an intercept in the null model (the default in 1881 | most generalized linear model software packages). 1882 | 1883 | **glm_args: Further arguments to be passed to statsmodels.GLM(), e.g., to 1884 | specify model family, offsets, and variance and frequency weights (see the 1885 | statsmodels documentation for full details). If None, a Gaussian (ordinary 1886 | least squares) model will be estimated. 
1887 | 1888 | Returns: 1889 | DataFrame of combined model results """ 1890 | 1891 | ind_models = [] 1892 | mods_est = [] 1893 | mods_var = [] 1894 | m = len(df_list) 1895 | 1896 | for i in range(m): 1897 | df_mod = df_list[i] 1898 | df_endog = df_mod[y_var] 1899 | df_exog = df_mod[X_vars] 1900 | 1901 | if incl_constant: 1902 | df_exog = sm.add_constant(df_exog) 1903 | 1904 | ind_model = sm.GLM(df_endog, df_exog, **glm_args) 1905 | ind_results = ind_model.fit() 1906 | mods_est.append(ind_results.params) 1907 | mods_var.append(np.diag(ind_results.cov_params())) 1908 | 1909 | if i == 0: 1910 | mods_df_resid = ind_results.df_resid 1911 | mods_coef_names = ind_results.model.exog_names 1912 | 1913 | Q_bar = np.multiply((1 / m), np.sum(np.array(mods_est), 0)) 1914 | U_bar = np.multiply((1 / m), np.sum(np.array(mods_var), 0)) 1915 | 1916 | models_demean = list(map(lambda x: np.square(x - Q_bar), mods_est)) 1917 | 1918 | B = np.multiply(1 / (m - 1), np.sum(np.array(models_demean), 0)) 1919 | 1920 | Q_bar_var = U_bar + ((1 + (1 / m)) * B) 1921 | Q_bar_se = np.sqrt(Q_bar_var) 1922 | 1923 | v_m = (m - 1) * np.square(1 + (U_bar / ((1 + m ** (-1)) * B))) 1924 | 1925 | if dof_adjust: 1926 | 1927 | v_complete = mods_df_resid 1928 | 1929 | gamma = ((1 + m ** (-1)) * B) / Q_bar_var 1930 | 1931 | v_obs = ((v_complete + 1) / (v_complete + 3)) * v_complete * (1 - gamma) 1932 | 1933 | v_corrected = ((1 / v_m) + (1 / v_obs)) ** (-1) 1934 | 1935 | dof = v_corrected 1936 | 1937 | else: 1938 | 1939 | dof = v_m 1940 | 1941 | est = Q_bar 1942 | std_err = Q_bar_se 1943 | stat = est / std_err 1944 | 1945 | combined_mat = {'term': mods_coef_names, 1946 | 'estimate': est, 1947 | 'std.error': std_err, 1948 | 'statistic': stat, 1949 | 'df': dof, 1950 | 'p.value': (2 * (1 - stats.t.cdf(abs(stat), df=dof)))} 1951 | 1952 | return pd.DataFrame(combined_mat) 1953 | 1954 | 1955 | def binary_conv(x): 1956 | """ 1957 | Convenience function used to convert a binary column vector of data to 1958 | 1/0 encoding. 1959 | 1960 | Args: 1961 | x: pd.Series. An indexable array containing only two unique values. 1962 | 1963 | Returns: 1964 | A pd.Series the same length as x, with 0s and 1s corresponding to the first 1965 | and unique values in x respectively. """ 1966 | 1967 | labs = x.unique()[~pd.isnull(x.unique())] 1968 | x = np.where(x == labs[0], 0, x) 1969 | x = np.where(x == labs[1], 1, x) 1970 | x = np.where(pd.isnull(x), np.NaN, x) 1971 | 1972 | return x 1973 | 1974 | 1975 | def cat_conv(cat_data): 1976 | """ 1977 | Convenience function used to one-hot encode a categorical column in a panda 1978 | dataframe. 1979 | 1980 | Args: 1981 | cat_data: A pd.DataFrame. A dataframe containing only categorical columns to be 1982 | one-hot encoded. 1983 | 1984 | Returns: 1985 | cat_construct: pd.DataFrame. A one-hot encoded version of the input data. 1986 | cat_col_names: List of lists. 
Nested list of the one-hot encoded variable names, 1987 | that can be passed into the MIDASpy .build_model() function.""" 1988 | 1989 | cat_col_names = [] 1990 | 1991 | cat_construct = [] 1992 | 1993 | for column in cat_data.columns: 1994 | na_temp = cat_data[column].isnull() 1995 | temp = pd.get_dummies(cat_data[column], prefix=column, dtype=np.uint8) 1996 | temp[na_temp] = np.nan 1997 | cat_construct.append(temp) 1998 | cat_col_names.append(list(temp.columns.values)) 1999 | 2000 | cat_construct = pd.concat(cat_construct, axis=1) 2001 | return cat_construct, cat_col_names 2002 | -------------------------------------------------------------------------------- /midas_functions.md: -------------------------------------------------------------------------------- 1 | # Guide to the methods and arguments of MIDAS 2 | 3 | Model construction first requires an instantiation of MIDAS. The model then needs to be constructed and trained before imputations can be generated. Calibration is optional, but strongly recommended. 4 | 5 | This class doesn't explicitly return values: values are either stored internally, saved to file, or yielded rather than returned. The key attribute is .output_list, which stores the completed datasets once samples have been generated. 6 | 7 | #### Instantiation: 8 | 9 | - Midas() 10 | 11 | #### Model construction: 12 | 13 | - .build_model() 14 | - .build_model_pipeline() 15 | 16 | #### Model calibration: 17 | 18 | - .overimpute() 19 | 20 | #### Model training: 21 | 22 | - .train_model() 23 | - .train_model_pipeline() 24 | 25 | #### Imputation generation: 26 | 27 | - .batch_generate_samples() 28 | - .batch_yield_samples() 29 | - .generate_samples() 30 | - .yield_samples() 31 | - .yield_samples_pipeline() 32 | 33 | --- 34 | 35 | ### Midas() 36 | 37 | - layer_structure= \[256, 256, 256\] 38 | - learn_rate= 1e-4 39 | - input_drop= 0.8 40 | - train_batch = 16 41 | - savepath= 'tmp/MIDAS' 42 | - seed= None 43 | - loss_scale= 1 44 | - init_scale= 1 45 | - softmax_adj= 1 46 | 47 | Initialiser. Called separately to 'build_model' to allow for out-of-memory datasets. All key hyperparameters are entered at this stage, as the model construction methods only deal with the dataset. 48 | 49 | #### Args: 50 | - **layer_structure:** List of integers. The number of nodes in each layer of the network (default = [256, 256, 256], denoting a three-layer network with 256 nodes per layer). Larger networks can learn more complex data structures but require longer training and are more prone to overfitting. 51 | 52 | - **learn_rate:** Float. The learning rate $\gamma$ (default = 0.0001), which controls the size of the weight adjustment in each training epoch. In general, higher values reduce training time at the expense of less accurate results. 53 | 54 | - **input_drop:** Float between 0 and 1. The probability of corruption for input columns in training mini-batches (default = 0.8). Higher values increase training time but reduce the risk of overfitting. In our experience, values between 0.7 and 0.95 deliver the best performance. 55 | 56 | - **train_batch:** Integer. The number of observations in training mini-batches (default = 16). Common choices are 8, 16, 32, 64, and 128; powers of 2 tend to enhance memory efficiency. In general, smaller sizes lead to faster convergence at the cost of greater noise and thus less accurate estimates of the error gradient. Where memory management is a concern, they should be favored. 57 | 58 | - **savepath:** String. The location to which the trained model will be saved.
59 | 60 | - **seed:** Integer. The value to which Python's pseudo-random number generator is initialized. This enables users to ensure that data shuffling, weight and bias initialization, and missingness indicator vectors are reproducible. 61 | 62 | - **loss_scale:** Float. A constant by which the RMSE loss functions are multiplied (default = 1). This hyperparameter performs a similar function to the learning rate. If loss during training is very large, increasing its value can help to prevent overtraining. 63 | 64 | - **init_scale:** Float. The numerator of the variance component of Xavier Initialisation equation (default = 1). In very deep networks, higher values may help to prevent extreme gradients (though this problem is less common with ELU activation functions). 65 | 66 | - **softmax_adj:** Float. A constant by which the cross-entropy loss functions are multiplied (default = 1). This hyperparameter is the equivalent of loss_scale for categorical variables. If cross-entropy loss falls at a consistently faster rate than RMSE during training, a lower value may help to redress this imbalance. 67 | 68 | - **vae_layer:** Boolean. Specifies whether to include a variational autoencoder layer in the network (default = False), one of the key diagnostic tools included in midas. If set to True, variational autoencoder hyperparameters must be specified via a number of additional arguments. 69 | 70 | - **latent_space_size:** Integer. The number of normal dimensions used to parameterize the latent space. 71 | 72 | - **vae_sample_var:** Float. The sampling variance of the normal distributions used to parameterize the latent space. 73 | 74 | - **vae_alpha:** Float. The strength of the prior imposed on the Kullback-Leibler divergence term in the variational autoencoder loss functions. 75 | 76 | - **kld_min:** Float. The minimum value of the Kullback-Leibler divergence term in the variational autoencoder loss functions. 77 | 78 | --- 79 | 80 | ### .build_model() 81 | 82 | - imputation_target 83 | - categorical_columns= None 84 | - softmax_columns= None 85 | - unsorted= True 86 | - additional_data = None 87 | - verbose= True 88 | 89 | This method is called to construct the neural network that is the heart of MIDAS. This includes the assignment of loss functions to the appropriate data types. 90 | 91 | THIS FUNCTION MUST BE CALLED BEFORE ANY TRAINING OR IMPUTATION OCCURS. Failing to do so will simply raise an error. 92 | 93 | The categorical columns should be a list of column names. Softmax columns should be a list of lists of column names. This will allow the model to dynamically assign cost functions to the correct variables. If, however, the data comes pre-sorted, 'arranged' can be set to "True", in which case the arguments can be passed in as integers of size, ie. shape[1] attributes for each of the relevant categories. 94 | 95 | In other words, if you're experienced at using MIDAS and understand how its indexing works, pre-sort your data and pass in the integers so specifying reindexing values doesn't become too onerous. 96 | 97 | Alternatively, list(df.columns.values) will output a list of column names, which can be easily implemented in the 'for' loop which constructs your dummy variables. 98 | 99 | #### Args: 100 | - **imputation_target:** DataFrame. The name of the incomplete input dataset. Upon being read in, the dataset will be appropriately formatted and stored for training. 101 | 102 | - **binary_columns:** List of names. A list of all binary variables in the input dataset. 
103 | 104 | - **softmax_columns:** List of lists. The outer list should include all non-binary categorical variables in the input dataset. Each inner list should contain the mutually exclusive set of possible classes for each of these variables. 105 | 106 | - **unsorted:** Boolean. Specifies whether the input dataset has been pre-ordered in terms of variable type (default = True, denoting no sorting). If set to False, binary_columns and softmax_columns should be a list of integers denoting shape attributes for each category. 107 | 108 | - **additional_data:** DataFrame. Data that should be included in the imputation model but are not required for later analyses. Such data will not be formatted, rearranged, or included in the loss functions, reducing training time. 109 | 110 | - **verbose:** Boolean. Specifies whether to print messages to the terminal (default = True). 111 | 112 | --- 113 | 114 | ### .build_model_pipeline() 115 | 116 | - data_sample 117 | - categorical_columns= None 118 | - softmax_columns= None 119 | - unsorted= True 120 | - additional_data_sample= None 121 | - verbose= True 122 | - crossentropy_adj= 1 123 | - loss_scale = 1 124 | 125 | This function is for integration with databasing or any dataset that needs to be batched into memory. The data sample is simply there to allow the original constructor to be recycled. The head of the data should be sufficient to build the imputation model. The input pipeline itself should pre-scale the data, and code null values as type np.nan. The pipeline ought to output a Pandas DataFrame. If additional data will be passed in, then the return must be a list of two DataFrames. The columns of the dataframe will be re-arranged so that error functions are efficiently generated. 126 | 127 | IT IS IMPERATIVE that this ordering is respected. Design the input batching function accordingly. 128 | 129 | The categorical columns should be a list of column names. Softmax columns should be a list of lists of column names. This will allow the model to dynamically assign cost functions to the correct variables. If, however, the data comes pre-sorted, arranged can be set to "true", in which case the arguments can be passed in as integers of size, ie. shape[1] attributes for each of the relevant categories. 130 | 131 | In other words, pre-sort your data and pass in the integers, so indexing dynamically doesn't become too difficult. Alternatively, list(df.columns.values) will output a list of column names, which can be easily implemented in the 'for' loop which constructs your dummy variables. 132 | 133 | #### Args: 134 | - **data_sample:** DataFrame. The head of the data that will be fed in via a batching pipeline. This sample is just used to enforce indexing and to allow code recyling. 135 | 136 | - **categorical_columns:** List of names. Specifies the binary (ie. non-exclusive categories) to be imputed. If unsorted = False, this value can be an integer 137 | 138 | - **softmax_columns:** List of lists. Every inner list should contain column names. Each inner list should represent a set of mutually exclusive categories, such as current day of the week. if unsorted = False, this should be a list of integers. 139 | 140 | - **unsorted:** Boolean. Specifies to MIDAS that data has been pre-sorted, and indices can simply be appended to the size index. 141 | 142 | - **additional_data:** DataFrame. Any data that shoud be included in the imputation model, but is not required from the output. 
By passing data here, the data will neither be rearranged nor will it generate a cost function. This reduces the regularising effects of multiple loss functions, but reduces both networksize requirements and training time. 143 | 144 | - **verbose:** Boolean. Set to False to suppress messages printing to terminal. 145 | 146 | --- 147 | 148 | ### .overimpute() 149 | 150 | - spikein = 0.1 151 | - training_epochs= 100 152 | - report_ival = 10 153 | - report_samples = 32 154 | - plot_all= True 155 | - verbose= True 156 | - verbosity_ival= 1 157 | - spike_seed= 42 158 | - excessive= False 159 | 160 | This function spikes in additional missingness, so that known values can be used to help adjust the complexity of the model. As conventional train/validation splits can still lead to autoencoders overtraining, the method for limiting complexity is overimputation and early stopping. This gives an estimate of how the model will react to unseen variables. 161 | 162 | Error is defined as RMSE for continuous variables, and classification error for binary and categorical variables (ie. 1 - accuracy). Note that this means that binary classification is inherently dependent on a selection threshold of 0.5, and softmax accuracy will naturally decrease as a function of the number of classes within the model. All three will be affected by the degree of imbalance within the dataset. 163 | 164 | The accuracy measures provided here may not be ideal for all problems, but they are generally appropriate for selecting optimum complexity. Should the lines denoting error begin to trend upwards, this indicates overtraining and is a sign that the training_epochs parameter to the .train_model() method should be capped before this point. 165 | 166 | The actual optimal point may differ from that indicated by the .overimpute() method for two reasons: 167 | - The loss that is spiked in reduces the overall data available to the algorithm to learn the patterns inherent, so there should be some improvement in performance when .train_model() is called. If this is a concern, then it should be possible to compare the behaviour of the loss figure between .train_model() and .overimpute(). 168 | - The missingness inherent to the data may depend on some unobserved factor. 169 | In this case, the bias in the observed data may lead to inaccurate inference. 170 | 171 | It is worth visually inspecting the distribution of the overimputed values against imputed values (using plot_all) to ensure that they fall within a sensible range. 172 | 173 | #### Args: 174 | 175 | - **spikein:** Float, between 0 and 1. The proportion of observed values in the input dataset to be randomly removed (default = 0.1). 176 | 177 | - **training_epochs:** Integer. The number of overimputation training epochs (default = 100). Selecting a low value increases the risk that trends in the loss metrics have not stabilized by the end of training, in which case additional epochs may be necessary. 178 | 179 | - **report_ival:** Integer. The number of overimputation training epochs between calculations of loss (default = 10). Shorter intervals provide a more granular view of model performance but slow down the overimputation process. 180 | 181 | - **report_samples:** The number of Monte Carlo samples drawn from the estimated missing-data posterior for loss calculations (default = 32). A larger number increases overimputation runtime and may thus necessitate a lower value of report_ival. 182 | 183 | - **plot_vars:** Boolean. 
Specifies whether to plot the distribution of original versus overimputed values (default = True). This takes the form of a density plot for continuous variables and a barplot for categorical variables (showing proportions of each class). 184 | 185 | - **plot_main:** Boolean. Specifies whether to display the main graphical output (overimputation error during training) at every reporting interval (default = True). If set to False, it will only appear at the end of the overimputation training process. Error values are still shown at each report_ival. 186 | 187 | - **skip_plot:** Boolean. Specifies whether to suppress the main graphical output (default = False). This may be desirable when users are conducting multiple overimputation exercises sequentially and are primarily interested in the console output. 188 | 189 | - **verbose:** Boolean. Prints out messages, including loss, to the terminal (default = True). 190 | 191 | - **verbosity_ival:** Integer. The number of overimputation training epochs between messages (default = 1). 192 | 193 | - **spike_seed:** Integer. The value to which Python's pseudo-random number generator is initialized for the missingness spike-in. This is separate to the seed specified in the Midas() call. 194 | 195 | - **excessive:** Boolean. Specifies whether to print aggregate mini-batch loss to the terminal (default = False). This argument differs from the .train\_model()'s excessive argument, which prints individual mini-batch loss. This allows users to check for unusual imputations, which may be helpful if loss is not declining during overimputation training. 196 | 197 | --- 198 | 199 | ### .train_model() 200 | 201 | - training_epochs= 100 202 | - verbose= True 203 | - verbosity_ival= 1 204 | - excessive= False 205 | 206 | This is the standard method for optimising the model's parameters. Must be called before imputation can be performed. The model is automatically saved upon conclusion of training. 207 | 208 | #### Args: 209 | 210 | - **training_epochs:** Integer. The number of complete cycles (forward passes) through the network during training (default = 100). 211 | 212 | - **verbose:** Boolean. Specifies whether to print messages to the terminal during training, including loss values (default = True). 213 | 214 | - **verbosity_ival:** Integer. The number of training epochs between messages (default = 1). 215 | 216 | - **excessive:** Boolean. Specifies whether to print loss for each mini-batch to the terminal (default = False), which can help with troubleshooting. 217 | 218 | --- 219 | 220 | ### .train_model_pipeline() 221 | 222 | - input_pipeline 223 | - training_epochs= 100 224 | - verbose= True 225 | - verbosity_ival= 1 226 | - excessive= False 227 | 228 | This is the alternative method for optimising the model's parameters when input data must be batched into memory. Must be called before imputation can be performed. The model will then be saved to the specified directory. 229 | 230 | #### Args: 231 | 232 | - **input_pipeline:** Function which yields a pre-processed and scaled DataFrame from the designated source, be it a server or large flat file. An illustrative pipeline is sketched below. 233 | 234 | - **training_epochs:** Integer. The number of epochs the model will run for (default = 100). 235 | 236 | - **verbose:** Boolean. Prints out messages, including loss (default = True). 237 | 238 | - **verbosity_ival:** Integer. This number determines the interval between messages (default = 1). 239 | 240 | - **excessive:** Boolean. Used for troubleshooting, this argument will cause the cost of each minibatch to be printed to the terminal (default = False).
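To make these requirements concrete, the sketch below shows one way to supply a pipeline. It is illustrative only: the file name `preprocessed_data.csv` is hypothetical, and the example assumes the data are entirely continuous, already scaled, and code missing values as np.nan (categorical data would additionally need one-hot encoding and a softmax_columns argument). Because .train_model_pipeline() iterates over the pipeline once per training epoch, the object passed in should be re-iterable rather than a single-use generator.

```python
import pandas as pd
import MIDASpy as md

class CSVPipeline:
    """Re-iterable pipeline: .train_model_pipeline() loops over the data once
    per epoch, so __iter__ re-opens the file each time instead of exhausting
    a single generator."""

    def __init__(self, path, chunksize=1024):
        self.path = path
        self.chunksize = chunksize

    def __iter__(self):
        # Each chunk is a DataFrame whose columns are assumed to be pre-scaled
        # and whose missing entries are coded as np.nan.
        return iter(pd.read_csv(self.path, chunksize=self.chunksize))

pipeline = CSVPipeline("preprocessed_data.csv")  # hypothetical pre-processed file

data_sample = next(iter(pipeline))               # head of the data, used only to build the graph
imputer = md.Midas(layer_structure=[256, 256], seed=89)
imputer.build_model_pipeline(data_sample)
imputer.train_model_pipeline(pipeline, training_epochs=20)

for completed_df in imputer.yield_samples_pipeline(verbose=False):
    pass  # analyse each completed DataFrame here
```
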
241 | 242 | ---- 243 | 244 | ### .batch_generate_samples() 245 | 246 | - m= 50 247 | - b_size= 256 248 | - verbose= True 249 | 250 | Method used to generate a set of m imputations to the .output_list attribute. Imputations are stored within a list in memory, and can be accessed in any order. As batch generation implies very large datasets, this method is only provided for internal troubleshooting. 251 | 252 | This function is for a dataset large enough to be stored in memory, but too large to be passed into the model in its entirety. This may be due to GPU memory limitations, or just the size of the model 253 | 254 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 255 | 256 | #### Args: 257 | - **m:** Integer. Number of imputations to generate. 258 | 259 | - **b_size:** Integer. Number of data entries to process at once. For managing wider datasets, smaller numbers may be required. 260 | 261 | - **verbose:** Boolean. Prints out messages. 262 | 263 | --- 264 | 265 | ### .batch_yield_samples() 266 | 267 | - m= 50 268 | - b_size= 256 269 | - verbose= True 270 | 271 | Method used to generate a set of m imputations via the 'yield' command, allowing imputations to be used in a 'for' loop' 272 | 273 | This function is for a dataset large enough to be stored in memory, but too large to be passed into the model in its entirety. This may be due to GPU memory limitations, or just the size of the model or dataset. 274 | 275 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 276 | 277 | #### Args: 278 | - **m:** Integer. Number of imputations to generate. 279 | 280 | - **b_size:** Integer. Number of data entries to process at once. For managing wider datasets, smaller numbers may be required. 281 | 282 | - **verbose:** Boolean. Prints out messages. 283 | 284 | --- 285 | 286 | ### .generate_samples() 287 | 288 | - m= 50 289 | - verbose= True 290 | 291 | Method used to generate a set of m imputations to the .output_list attribute. Imputations are stored within a list in memory, and can be accessed in any order. 292 | 293 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 294 | 295 | #### Args: 296 | - **m:** Integer. The number of completed datasets to produce (default = 50) 297 | 298 | - **verbose:** Boolean. Specifies whether to print messages to the terminal (default = True). 299 | 300 | --- 301 | 302 | ### .yield_samples() 303 | 304 | - m= 50 305 | - verbose= True 306 | 307 | Method used to generate a set of m imputations via the 'yield' command, allowing imputations to be used in a 'for' loop. 308 | 309 | If a model has been pre-trained, on subsequent runs this function can be directly called without having to train first. An 'if' statement checking the default save location is useful for this. 310 | 311 | #### Args: 312 | 313 | - **m:** Integer. Number of imputations to generate. 314 | 315 | - **verbose:** Boolean. Prints out messages. 316 | 317 | --- 318 | 319 | ### .yield_samples_pipeline() 320 | 321 | - verbose= False 322 | 323 | As it's impossible to know the specifics of the pipeline, this method simply cycles through all data provided by the input function. 
The number of imputations can be specified by the user, depending on their needs. The size of the output DataFrame depends on the size specified by the input function that was passed to 'train_model_pipeline'. 324 | 325 | #### Args: 326 | 327 | - **verbose: Prints out messages 328 | 329 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | 3 | [pytest] 4 | python_files = *.py 5 | addopts = --ignore=setup.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | from setuptools import setup, find_packages 4 | from pathlib import Path 5 | 6 | this_directory = Path(__file__).parent 7 | long_description = (this_directory / "README.md").read_text(encoding="utf-8") 8 | 9 | if sys.version_info < (3, 6): 10 | sys.exit("Sorry, Python 3.5 is no longer supported. Please use Python versions from 3.6 to 3.10") 11 | 12 | install_requires = ['numpy>=1.5,<=1.26.4', 'scikit-learn', 'matplotlib', 'pandas>=0.19', 'tensorflow_addons<0.20', 'statsmodels', 'scipy'] 13 | if sys.version_info >= (3, 8) and sys.version_info < (3, 11): 14 | install_requires.append('tensorflow<2.12.0; sys_platform != "darwin" or platform_machine != "arm64"') 15 | install_requires.append('tensorflow-macos<2.12.0; sys_platform == "darwin" and platform_machine == "arm64"') 16 | else: 17 | install_requires.append('tensorflow>=1.10; sys_platform != "darwin" or platform_machine != "arm64"') 18 | install_requires.append('tensorflow-macos>=1.10; sys_platform == "darwin" and platform_machine == "arm64"') 19 | 20 | setup( 21 | name='MIDASpy', 22 | packages=['MIDASpy'], 23 | version='1.4.0', 24 | license='Apache', 25 | description='Multiple Imputation with Denoising Autoencoders', 26 | long_description_content_type='text/markdown', 27 | long_description=long_description, 28 | url='http://github.com/MIDASverse/MIDASpy', 29 | project_urls={ 30 | 'Method article': 'https://doi.org/10.1017/pan.2020.49', 31 | 'Software article': 'https://doi.org/10.18637/jss.v107.i09', 32 | 'Source': 'https://github.com/MIDASverse/MIDASpy', 33 | 'Issues': 'https://github.com/MIDASverse/MIDASpy/issues', 34 | }, 35 | author='Ranjit Lall, Alex Stenlake, and Thomas Robinson', 36 | author_email='R.Lall@lse.ac.uk', 37 | python_requires='>=3.6, <3.11', 38 | install_requires=install_requires, 39 | keywords=['multiple imputation', 'neural networks', 'tensorflow'], 40 | extras_require={'test': ['pytest','matplotlib']}, 41 | 42 | classifiers=[ 43 | 'Development Status :: 5 - Production/Stable', 44 | 'Intended Audience :: Science/Research', 45 | 'Topic :: Scientific/Engineering', 46 | 'License :: OSI Approved :: Apache Software License', 47 | 'Programming Language :: Python :: 3', 48 | 'Programming Language :: Python :: 3.6', 49 | 'Programming Language :: Python :: 3.7', 50 | 'Programming Language :: Python :: 3.8', 51 | 'Programming Language :: Python :: 3.9', 52 | 'Programming Language :: Python :: 3.10', 53 | ], 54 | ) 55 | -------------------------------------------------------------------------------- /tests/test_midas.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | from sklearn.preprocessing import MinMaxScaler 5 | import sys 6 | import os 7 | import csv 8 | import MIDASpy as md 9 | 10 | def test_some_functionality(): 11 | # Load the data 12 | np.random.seed(441) 13 | data_path = os.path.join(os.path.dirname(__file__), "test_data", "adult_data.csv") 14 | data_0 = pd.read_csv(data_path) 15 | data_0.columns.str.strip() 16 | 17 | def spike_in_generation(data): 18 | spike_in = pd.DataFrame(np.zeros_like(data), columns= data.columns) 19 | for column in data.columns: 20 | subset = np.random.choice(data[column].index[data[column].notnull()], 5000, replace= False) 21 | spike_in.loc[subset, column] = 1 22 | return spike_in 23 | 24 | spike_in = spike_in_generation(data_0) 25 | original_value = data_0.loc[4, 'hours_per_week'] 26 | data_0[spike_in == 1] = np.nan 27 | 28 | categorical = ['workclass','marital_status','relationship','race','class_labels','sex','education','occupation','native_country'] 29 | data_cat, cat_cols_list = md.cat_conv(data_0[categorical]) 30 | 31 | data_0.drop(categorical, axis = 1, inplace = True) 32 | constructor_list = [data_0] 33 | constructor_list.append(data_cat) 34 | data_in = pd.concat(constructor_list, axis=1) 35 | 36 | na_loc = data_in.isnull() 37 | data_in[na_loc] = np.nan 38 | 39 | imputer = md.Midas(layer_structure = [256,256], vae_layer = False, seed = 89, input_drop = 0.75) 40 | imputer.build_model(data_in, softmax_columns = cat_cols_list) 41 | imputer.train_model(training_epochs = 2) 42 | 43 | imputations = imputer.generate_samples(m=2).output_list 44 | model = md.combine(y_var = "capital_gain", X_vars = ["education_num","age"], df_list = imputations) 45 | -------------------------------------------------------------------------------- /tmp/MIDAS.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/tmp/MIDAS.data-00000-of-00001 -------------------------------------------------------------------------------- /tmp/MIDAS.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/tmp/MIDAS.index -------------------------------------------------------------------------------- /tmp/MIDAS.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIDASverse/MIDASpy/b43f3ccedc1a150cc74c6a82d049239fe27d10ee/tmp/MIDAS.meta -------------------------------------------------------------------------------- /tmp/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "MIDAS" 2 | all_model_checkpoint_paths: "MIDAS" 3 | --------------------------------------------------------------------------------