├── .gitignore
├── 1_Temperature.ipynb
├── 1_data_preparation.ipynb
├── 2_imputations_concatenating.ipynb
├── 3_distance_cor_continents.ipynb
├── 3_distance_cor_groups.ipynb
├── 5_Additions.ipynb
├── LICENSE
├── README.md
└── TLPH_undirected_networks_supp_material.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | un_data.csv
2 | csv/
3 | info.csv
4 |
--------------------------------------------------------------------------------
/1_Temperature.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Temperature data per country"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import pickle\n",
18 | "import pandas as pd\n",
19 | "import os\n",
20 | "from sklearn.preprocessing import scale"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "# Using CRU dataset"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "['Afghanistan',\n",
39 | " 'Albania',\n",
40 | " 'Algeria',\n",
41 | " 'Angola',\n",
42 | " 'Antigua and Barbuda',\n",
43 | " 'Argentina',\n",
44 | " 'Armenia',\n",
45 | " 'Australia',\n",
46 | " 'Austria',\n",
47 | " 'Azerbaijan',\n",
48 | " 'Bahamas, The',\n",
49 | " 'Bahrain',\n",
50 | " 'Bangladesh',\n",
51 | " 'Barbados',\n",
52 | " 'Belarus',\n",
53 | " 'Belgium',\n",
54 | " 'Belize',\n",
55 | " 'Benin',\n",
56 | " 'Bhutan',\n",
57 | " 'Bolivia',\n",
58 | " 'Bosnia and Herzegovina',\n",
59 | " 'Botswana',\n",
60 | " 'Brazil',\n",
61 | " 'Brunei Darussalam',\n",
62 | " 'Bulgaria',\n",
63 | " 'Burkina Faso',\n",
64 | " 'Burundi',\n",
65 | " 'Cambodia',\n",
66 | " 'Cameroon',\n",
67 | " 'Canada',\n",
68 | " 'Central African Republic',\n",
69 | " 'Chad',\n",
70 | " 'Chile',\n",
71 | " 'China',\n",
72 | " 'Colombia',\n",
73 | " 'Comoros',\n",
74 | " 'Congo, Dem. Rep.',\n",
75 | " 'Congo, Rep.',\n",
76 | " 'Costa Rica',\n",
77 | " \"Cote d'Ivoire\",\n",
78 | " 'Croatia',\n",
79 | " 'Cuba',\n",
80 | " 'Cyprus',\n",
81 | " 'Czech Republic',\n",
82 | " 'Denmark',\n",
83 | " 'Djibouti',\n",
84 | " 'Dominica',\n",
85 | " 'Dominican Republic',\n",
86 | " 'Ecuador',\n",
87 | " 'Egypt, Arab Rep.',\n",
88 | " 'El Salvador',\n",
89 | " 'Equatorial Guinea',\n",
90 | " 'Eritrea',\n",
91 | " 'Estonia',\n",
92 | " 'Ethiopia',\n",
93 | " 'Fiji',\n",
94 | " 'Finland',\n",
95 | " 'France',\n",
96 | " 'Gabon',\n",
97 | " 'Gambia, The',\n",
98 | " 'Georgia',\n",
99 | " 'Germany',\n",
100 | " 'Ghana',\n",
101 | " 'Greece',\n",
102 | " 'Greenland',\n",
103 | " 'Grenada',\n",
104 | " 'Guatemala',\n",
105 | " 'Guinea',\n",
106 | " 'Guinea-Bissau',\n",
107 | " 'Guyana',\n",
108 | " 'Haiti',\n",
109 | " 'Honduras',\n",
110 | " 'Hungary',\n",
111 | " 'Iceland',\n",
112 | " 'India',\n",
113 | " 'Indonesia',\n",
114 | " 'Iran, Islamic Rep.',\n",
115 | " 'Iraq',\n",
116 | " 'Ireland',\n",
117 | " 'Israel',\n",
118 | " 'Italy',\n",
119 | " 'Jamaica',\n",
120 | " 'Japan',\n",
121 | " 'Jordan',\n",
122 | " 'Kazakhstan',\n",
123 | " 'Kenya',\n",
124 | " 'Kiribati',\n",
125 | " \"Korea, Dem. People's Rep.\",\n",
126 | " 'Korea, Rep.',\n",
127 | " 'Kuwait',\n",
128 | " 'Kyrgyz Republic',\n",
129 | " 'Lao PDR',\n",
130 | " 'Latvia',\n",
131 | " 'Lebanon',\n",
132 | " 'Lesotho',\n",
133 | " 'Liberia',\n",
134 | " 'Libya',\n",
135 | " 'Liechtenstein',\n",
136 | " 'Lithuania',\n",
137 | " 'Luxembourg',\n",
138 | " 'Madagascar',\n",
139 | " 'Malawi',\n",
140 | " 'Malaysia',\n",
141 | " 'Maldives',\n",
142 | " 'Mali',\n",
143 | " 'Malta',\n",
144 | " 'Mauritania',\n",
145 | " 'Mauritius',\n",
146 | " 'Mexico',\n",
147 | " 'Micronesia, Fed. Sts.',\n",
148 | " 'Moldova',\n",
149 | " 'Mongolia',\n",
150 | " 'Montenegro',\n",
151 | " 'Morocco',\n",
152 | " 'Mozambique',\n",
153 | " 'Myanmar',\n",
154 | " 'Namibia',\n",
155 | " 'Nepal',\n",
156 | " 'Netherlands',\n",
157 | " 'New Zealand',\n",
158 | " 'Nicaragua',\n",
159 | " 'Niger',\n",
160 | " 'Nigeria',\n",
161 | " 'Norway',\n",
162 | " 'Oman',\n",
163 | " 'Pakistan',\n",
164 | " 'Palau',\n",
165 | " 'Panama',\n",
166 | " 'Papua New Guinea',\n",
167 | " 'Paraguay',\n",
168 | " 'Peru',\n",
169 | " 'Philippines',\n",
170 | " 'Poland',\n",
171 | " 'Portugal',\n",
172 | " 'Puerto Rico',\n",
173 | " 'Qatar',\n",
174 | " 'Romania',\n",
175 | " 'Russian Federation',\n",
176 | " 'Rwanda',\n",
177 | " 'Samoa',\n",
178 | " 'Sao Tome and Principe',\n",
179 | " 'Saudi Arabia',\n",
180 | " 'Senegal',\n",
181 | " 'Serbia',\n",
182 | " 'Seychelles',\n",
183 | " 'Sierra Leone',\n",
184 | " 'Singapore',\n",
185 | " 'Slovak Republic',\n",
186 | " 'Slovenia',\n",
187 | " 'Solomon Islands',\n",
188 | " 'Somalia',\n",
189 | " 'South Africa',\n",
190 | " 'South Sudan',\n",
191 | " 'Spain',\n",
192 | " 'Sri Lanka',\n",
193 | " 'Sudan',\n",
194 | " 'Suriname',\n",
195 | " 'Sweden',\n",
196 | " 'Switzerland',\n",
197 | " 'Syrian Arab Republic',\n",
198 | " 'Tajikistan',\n",
199 | " 'Tanzania',\n",
200 | " 'Thailand',\n",
201 | " 'Timor-Leste',\n",
202 | " 'Togo',\n",
203 | " 'Tonga',\n",
204 | " 'Trinidad and Tobago',\n",
205 | " 'Tunisia',\n",
206 | " 'Turkey',\n",
207 | " 'Turkmenistan',\n",
208 | " 'Tuvalu',\n",
209 | " 'Uganda',\n",
210 | " 'Ukraine',\n",
211 | " 'United Arab Emirates',\n",
212 | " 'United Kingdom',\n",
213 | " 'United States',\n",
214 | " 'Uruguay',\n",
215 | " 'Uzbekistan',\n",
216 | " 'Vanuatu',\n",
217 | " 'Venezuela, RB',\n",
218 | " 'Vietnam',\n",
219 | " 'Yemen, Rep.',\n",
220 | " 'Zambia',\n",
221 | " 'Zimbabwe']"
222 | ]
223 | },
224 | "execution_count": 2,
225 | "metadata": {},
226 | "output_type": "execute_result"
227 | }
228 | ],
229 | "source": [
230 | "c = pd.read_csv('utils/countries_wb.csv', dtype=str, delimiter=';', header=None)\n",
231 | "countries = list(c[0])\n",
232 | "countries"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 3,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "countries_monthly = {}\n",
242 | "\n",
243 | "for country in countries:\n",
244 | " for file in os.listdir('utils/data/temp_country/'):\n",
245 | " if country in file:\n",
246 | " countries_monthly[country] = pd.read_csv('utils/data/temp_country/{}'.format(file), header=3, delimiter=' ', engine='python')"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 4,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "# countries whose World Bank name does not directly match a CRU file name\n",
256 | "\n",
257 | "not_included = []\n",
258 | "\n",
259 | "for country in countries:\n",
260 | " if country not in list(countries_monthly.keys()):\n",
261 | " not_included.append(country)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 5,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "name": "stdout",
271 | "output_type": "stream",
272 | "text": [
273 | "43\n"
274 | ]
275 | },
276 | {
277 | "data": {
278 | "text/plain": [
279 | "['Antigua and Barbuda',\n",
280 | " 'Bahamas, The',\n",
281 | " 'Bosnia and Herzegovina',\n",
282 | " 'Brunei Darussalam',\n",
283 | " 'Burkina Faso',\n",
284 | " 'Central African Republic',\n",
285 | " 'Congo, Dem. Rep.',\n",
286 | " 'Congo, Rep.',\n",
287 | " 'Costa Rica',\n",
288 | " \"Cote d'Ivoire\",\n",
289 | " 'Czech Republic',\n",
290 | " 'Dominican Republic',\n",
291 | " 'Egypt, Arab Rep.',\n",
292 | " 'El Salvador',\n",
293 | " 'Equatorial Guinea',\n",
294 | " 'Gambia, The',\n",
295 | " 'Iran, Islamic Rep.',\n",
296 | " \"Korea, Dem. People's Rep.\",\n",
297 | " 'Korea, Rep.',\n",
298 | " 'Kyrgyz Republic',\n",
299 | " 'Lao PDR',\n",
300 | " 'Micronesia, Fed. Sts.',\n",
301 | " 'New Zealand',\n",
302 | " 'Papua New Guinea',\n",
303 | " 'Puerto Rico',\n",
304 | " 'Russian Federation',\n",
305 | " 'Sao Tome and Principe',\n",
306 | " 'Saudi Arabia',\n",
307 | " 'Sierra Leone',\n",
308 | " 'Slovak Republic',\n",
309 | " 'Solomon Islands',\n",
310 | " 'South Africa',\n",
311 | " 'South Sudan',\n",
312 | " 'Sri Lanka',\n",
313 | " 'Syrian Arab Republic',\n",
314 | " 'Timor-Leste',\n",
315 | " 'Trinidad and Tobago',\n",
316 | " 'United Arab Emirates',\n",
317 | " 'United Kingdom',\n",
318 | " 'United States',\n",
319 | " 'Vanuatu',\n",
320 | " 'Venezuela, RB',\n",
321 | " 'Yemen, Rep.']"
322 | ]
323 | },
324 | "execution_count": 5,
325 | "metadata": {},
326 | "output_type": "execute_result"
327 | }
328 | ],
329 | "source": [
330 | "print(len(not_included))\n",
331 | "not_included"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 6,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "# manually matching these World Bank country names with the CRU country names\n",
341 | "countries_monthly['Antigua and Barbuda'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Antigua_and_Barbuda.tmp.per', header=3, delimiter=' ', engine='python')\n",
342 | "countries_monthly['Bahamas, The'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Bahamas.tmp.per', header=3, delimiter=' ', engine='python')\n",
343 | "countries_monthly['Bosnia and Herzegovina'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Bosnia-Herzegovinia.tmp.per', header=3, delimiter=' ', engine='python')\n",
344 | "countries_monthly['Brunei Darussalam'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Brunei.tmp.per', header=3, delimiter=' ', engine='python')\n",
345 | "countries_monthly['Burkina Faso'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Burkina_Faso.tmp.per', header=3, delimiter=' ', engine='python')\n",
346 | "countries_monthly['Central African Republic'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Central_African_Rep.tmp.per', header=3, delimiter=' ', engine='python')\n",
347 | "countries_monthly['Congo, Dem. Rep.'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.DR_Congo.tmp.per', header=3, delimiter=' ', engine='python')\n",
348 | "countries_monthly['Congo, Rep.'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Congo.tmp.per', header=3, delimiter=' ', engine='python')\n",
349 | "countries_monthly['Costa Rica'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Costa_Rica.tmp.per', header=3, delimiter=' ', engine='python')\n",
350 | "countries_monthly[\"Cote d'Ivoire\"] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Ivory_Coast.tmp.per', header=3, delimiter=' ', engine='python')\n",
351 | "countries_monthly['Czech Republic'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Czech_Republic.tmp.per', header=3, delimiter=' ', engine='python')\n",
352 | "countries_monthly['Dominican Republic'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Dominican_Republic.tmp.per', header=3, delimiter=' ', engine='python')\n",
353 | "countries_monthly['Egypt, Arab Rep.'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Egypt.tmp.per', header=3, delimiter=' ', engine='python')\n",
354 | "countries_monthly['El Salvador'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.El_Salvador.tmp.per', header=3, delimiter=' ', engine='python')\n",
355 | "countries_monthly['Equatorial Guinea'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Equatorial_Guinea.tmp.per', header=3, delimiter=' ', engine='python')\n",
356 | "countries_monthly['Gambia, The'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Gambia.tmp.per', header=3, delimiter=' ', engine='python')\n",
357 | "countries_monthly['Iran, Islamic Rep.'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Iran.tmp.per', header=3, delimiter=' ', engine='python')\n",
358 | "countries_monthly[\"Korea, Dem. People's Rep.\"] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.North_Korea.tmp.per', header=3, delimiter=' ', engine='python')\n",
359 | "countries_monthly['Korea, Rep.'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.South_Korea.tmp.per', header=3, delimiter=' ', engine='python')\n",
360 | "countries_monthly['Kyrgyz Republic'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Kyrgyzstan.tmp.per', header=3, delimiter=' ', engine='python')\n",
361 | "countries_monthly['Lao PDR'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Laos.tmp.per', header=3, delimiter=' ', engine='python')\n",
362 | "countries_monthly['New Zealand'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.New_Zealand.tmp.per', header=3, delimiter=' ', engine='python')\n",
363 | "countries_monthly['Papua New Guinea'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Papua_New_Guinea.tmp.per', header=3, delimiter=' ', engine='python')\n",
364 | "countries_monthly['Puerto Rico'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Puerto_Rica.tmp.per', header=3, delimiter=' ', engine='python')\n",
365 | "countries_monthly['Russian Federation'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Russia.tmp.per', header=3, delimiter=' ', engine='python')\n",
366 | "countries_monthly['Sao Tome and Principe'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Sao_Tome_+_Principe.tmp.per', header=3, delimiter=' ', engine='python')\n",
367 | "countries_monthly['Saudi Arabia'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Saudi_Arabia.tmp.per', header=3, delimiter=' ', engine='python')\n",
368 | "countries_monthly['Sierra Leone'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Sierra_Leone.tmp.per', header=3, delimiter=' ', engine='python')\n",
369 | "countries_monthly['Slovak Republic'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Slovakia.tmp.per', header=3, delimiter=' ', engine='python')\n",
370 | "countries_monthly['Solomon Islands'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Solomon_Isl.tmp.per', header=3, delimiter=' ', engine='python')\n",
371 | "countries_monthly['South Africa'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.South_Africa.tmp.per', header=3, delimiter=' ', engine='python')\n",
372 | "countries_monthly['South Sudan'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.South_Sudan.tmp.per', header=3, delimiter=' ', engine='python')\n",
373 | "countries_monthly['Sri Lanka'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Sri_Lanka.tmp.per', header=3, delimiter=' ', engine='python')\n",
374 | "countries_monthly['Syrian Arab Republic'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Syria.tmp.per', header=3, delimiter=' ', engine='python')\n",
375 | "countries_monthly['Timor-Leste'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.East_Timor.tmp.per', header=3, delimiter=' ', engine='python')\n",
376 | "countries_monthly['Trinidad and Tobago'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Trinidad_and_Tobago.tmp.per', header=3, delimiter=' ', engine='python')\n",
377 | "countries_monthly['United Arab Emirates'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.United_Arab_Emirates.tmp.per', header=3, delimiter=' ', engine='python')\n",
378 | "countries_monthly['United Kingdom'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.United_Kingdom.tmp.per', header=3, delimiter=' ', engine='python')\n",
379 | "countries_monthly['United States'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.USA.tmp.per', header=3, delimiter=' ', engine='python')\n",
380 | "countries_monthly['Vanuatu'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Vanatu.tmp.per', header=3, delimiter=' ', engine='python')\n",
381 | "countries_monthly['Venezuela, RB'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Venezuela.tmp.per', header=3, delimiter=' ', engine='python')\n",
382 | "countries_monthly['Yemen, Rep.'] = pd.read_csv('utils/data/temp_country/crucy.v4.04.1901.2019.Yemen.tmp.per', header=3, delimiter=' ', engine='python')"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 7,
388 | "metadata": {
389 | "scrolled": true
390 | },
391 | "outputs": [],
392 | "source": [
393 | "for country, df in countries_monthly.items():\n",
394 | " df.columns = df.columns.str.replace(' ', '')\n",
395 | " #countries_monthly[country].drop(labels=['MAM', 'JJA', 'SON', 'DJF', 'ANN'], axis=1, inplace=True)\n",
396 | " months = df.loc[:, 'JAN':'DEC']\n",
397 | " countries_monthly[country]['AVG'] = months.mean(axis=1)"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 8,
403 | "metadata": {},
404 | "outputs": [
405 | {
406 | "data": {
695 | "text/plain": [
696 | " YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV \\\n",
697 | "0 1901 3.8 2.7 8.0 11.8 14.6 19.9 22.0 21.6 19.1 14.5 8.8 \n",
698 | "1 1902 6.1 7.2 8.4 12.7 12.4 17.9 22.2 21.9 19.1 14.2 8.9 \n",
699 | "2 1903 5.2 6.8 9.1 9.1 15.2 17.7 21.0 21.7 19.0 15.0 9.8 \n",
700 | "3 1904 5.4 7.3 8.7 12.3 16.1 20.2 23.4 22.2 17.4 14.1 8.4 \n",
701 | "4 1905 2.9 4.6 8.7 11.3 14.4 19.0 23.3 22.2 19.8 11.6 10.4 \n",
702 | ".. ... ... ... ... ... ... ... ... ... ... ... ... \n",
703 | "114 2015 7.0 6.6 9.6 12.4 17.2 21.1 25.4 24.3 20.0 15.6 12.0 \n",
704 | "115 2016 7.3 9.2 9.4 13.7 16.0 20.5 23.6 23.1 20.5 15.5 11.6 \n",
705 | "116 2017 4.3 8.6 11.1 12.5 17.2 22.4 24.1 24.8 18.7 15.5 10.4 \n",
706 | "117 2018 8.4 6.0 9.1 14.5 17.6 21.1 24.1 24.3 21.1 16.7 12.1 \n",
707 | "118 2019 5.1 7.9 10.4 12.5 14.2 22.2 24.4 24.5 20.7 17.0 12.0 \n",
708 | "\n",
709 | " DEC MAM JJA SON DJF ANN AVG \n",
710 | "0 7.2 11.5 21.2 14.1 6.8 12.8 12.833333 \n",
711 | "1 5.7 11.2 20.7 14.1 5.9 13.1 13.058333 \n",
712 | "2 7.0 11.1 20.1 14.6 6.6 13.0 13.050000 \n",
713 | "3 6.4 12.4 21.9 13.3 4.6 13.5 13.491667 \n",
714 | "4 6.8 11.5 21.5 13.9 5.6 12.9 12.916667 \n",
715 | ".. ... ... ... ... ... ... ... \n",
716 | "114 8.5 13.1 23.6 15.9 8.4 15.0 14.975000 \n",
717 | "115 7.8 13.0 22.4 15.9 6.9 14.9 14.850000 \n",
718 | "116 6.5 13.6 23.8 14.9 7.0 14.7 14.675000 \n",
719 | "117 7.9 13.8 23.1 16.7 7.0 15.3 15.241667 \n",
720 | "118 9.3 12.4 23.7 16.6 -999.0 15.0 NaN 15.016667 \n",
721 | "\n",
722 | "[119 rows x 19 columns]"
723 | ]
724 | },
725 | "execution_count": 8,
726 | "metadata": {},
727 | "output_type": "execute_result"
728 | }
729 | ],
730 | "source": [
731 | "# check\n",
732 | "countries_monthly['Italy']"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 9,
738 | "metadata": {},
739 | "outputs": [],
740 | "source": [
741 | "# better save these precious data\n",
742 | "t = open('utils/data/temp.pkl', 'wb')\n",
743 | "pickle.dump(countries_monthly, t)\n",
744 | "t.close()"
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": null,
750 | "metadata": {},
751 | "outputs": [],
752 | "source": []
753 | },
754 | {
755 | "cell_type": "markdown",
756 | "metadata": {},
757 | "source": [
758 | "## *old:* Data until 2016\n",
759 | "\n",
760 | "Monthly temperature data for countries comes from the [World Bank](https://climateknowledgeportal.worldbank.org/download-data)"
761 | ]
762 | },
763 | {
764 | "cell_type": "code",
765 | "execution_count": null,
766 | "metadata": {},
767 | "outputs": [],
768 | "source": [
769 | "temp = pd.read_csv('utils/data/temp.csv', sep=', ', usecols=range(4), dtype=object)\n",
770 | "temp.tail()"
771 | ]
772 | },
773 | {
774 | "cell_type": "code",
775 | "execution_count": null,
776 | "metadata": {},
777 | "outputs": [],
778 | "source": [
779 | "countries = []\n",
780 | "c = temp.Country.unique()\n",
781 | "\n",
782 | "for country in c:\n",
783 | " countries.append(country.strip())"
784 | ]
785 | },
786 | {
787 | "cell_type": "code",
788 | "execution_count": null,
789 | "metadata": {},
790 | "outputs": [],
791 | "source": [
792 | "temp_dict = {}\n",
793 | "t_dict = {}\n",
794 | "t_dict_std = {}\n",
795 | "\n",
796 | "for country in countries:\n",
797 | " temp_dict[country] = temp[temp.Country == country]\n",
798 | " temp_dict[country].drop(columns=['Statistics', 'Country'], inplace=True)\n",
799 | " t_dict[country] = temp_dict[country].astype(float).groupby('Year')['Temperature - (Celsius)'].mean()\n",
800 | " t_dict[country].rename('Temperature', inplace=True)\n",
801 | " t_dict_std[country] = pd.Series(index=t_dict[country].index, data=scale(list(t_dict[country])))\n",
802 | " t_dict_std[country].rename('Temperature', inplace=True)"
803 | ]
804 | },
805 | {
806 | "cell_type": "code",
807 | "execution_count": null,
808 | "metadata": {
809 | "scrolled": true
810 | },
811 | "outputs": [],
812 | "source": [
813 | "# check\n",
814 | "t_dict_std['Albania']"
815 | ]
816 | }
817 | ],
818 | "metadata": {
819 | "kernelspec": {
820 | "display_name": "Python 3",
821 | "language": "python",
822 | "name": "python3"
823 | },
824 | "language_info": {
825 | "codemirror_mode": {
826 | "name": "ipython",
827 | "version": 3
828 | },
829 | "file_extension": ".py",
830 | "mimetype": "text/x-python",
831 | "name": "python",
832 | "nbconvert_exporter": "python",
833 | "pygments_lexer": "ipython3",
834 | "version": "3.7.6"
835 | }
836 | },
837 | "nbformat": 4,
838 | "nbformat_minor": 2
839 | }
840 |
--------------------------------------------------------------------------------
/3_distance_cor_continents.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Non-linear dependencies amongst the SDGs and climate change by distance correlation\n",
8 | "\n",
9 | "We start by investigating dependencies amongst the SDGs on different levels. The method we use to investigate these dependencies should make as few assumptions as possible. Hence, Pearson's linear correlation coefficient and rank correlation coefficients are not suitable choices, since they assume linearity and monotonicity, respectively.\n",
10 | "\n",
11 | "We choose to compute the [distance correlation](https://projecteuclid.org/euclid.aos/1201012979), more precisely the [partial distance correlation](https://projecteuclid.org/download/pdfview_1/euclid.aos/1413810731), because of the following properties:\n",
12 | "1. we have an absolute measure of dependence ranging from $0$ to $1$, $0 \\leq \\mathcal{R}(X,Y) \\leq 1$\n",
13 | "2. $\\mathcal{R}(X,Y) = 0$ if and only if $X$ and $Y$ are independent,\n",
14 | "3. $\\mathcal{R}(X,Y) = \\mathcal{R}(Y,X)$\n",
15 | "4. we are able to investigate non-linear and non-monotone relationships,\n",
16 | "5. we can find dependencies between indicators with different numbers of measurements,\n",
17 | "6. the only assumption we need to make is that the probability distributions have finite first moments.\n",
18 | "\n",
19 | "The conditional distance correlation has the advantage that it removes the influence of the other targets or goals when we compute the correlation between any two targets or goals. This procedure is also known as controlling for confounders.\n",
20 | "\n",
21 | "The **distance correlation** is defined as:\n",
22 | "\n",
23 | "$$\n",
24 | "\\mathcal{R}^2(X,Y) = \\begin{cases}\n",
25 | "\\frac{\\mathcal{V}^2 (X,Y)}{\\sqrt{\\mathcal{V}^2 (X)\\mathcal{V}^2 (Y)}} &\\text{, if $\\mathcal{V}^2 (X)\\mathcal{V}^2 (Y) > 0$} \\\\\n",
26 | "0 &\\text{, if $\\mathcal{V}^2 (X)\\mathcal{V}^2 (Y) = 0$}\n",
27 | "\\end{cases}\n",
28 | "$$\n",
29 | "\n",
30 | "\n",
31 | "where\n",
32 | "\n",
33 | "\n",
34 | "$$\n",
35 | "\\mathcal{V}^2 (X,Y) = \\| f_{X,Y}(t) - f_X(t)f_Y(t) \\|^2\n",
36 | "$$\n",
37 | "\n",
38 | "\n",
39 | "is the distance covariance with **characteristic functions** $f(t)$. Bear in mind that characteristic functions include the imaginary unit $i$, $i^2 = -1$:\n",
40 | "\n",
41 | "$$\n",
42 | "f_X(t) = \\mathbb{E}[e^{itX}]\n",
43 | "$$\n",
44 | "\n",
45 | "Thus, we are in the space of complex numbers $\\mathbb{C}$. Unfortunately, this means we most likely cannot obtain exact results, but we'll get back to this under Estimators below.\n",
46 | "\n",
47 | "The **conditional distance correlation** is defined as:\n",
48 | "\n",
49 | "$$\n",
50 | "\\mathcal{R}^2(X,Y \\ | \\ Z) = \\begin{cases}\n",
51 | "\\frac{\\mathcal{R}^2 (X,Y) - \\mathcal{R}^2 (X,Z) \\mathcal{R}^2 (Y,Z)}{\\sqrt{1 - \\mathcal{R}^4 (X,Z)} \\sqrt{1 - \\mathcal{R}^4 (Y,Z)}} &\\text{, if $\\mathcal{R}^4 (X,Z) \\neq 1$ and $\\mathcal{R}^4 (Y,Z) \\neq 1$} \\\\\n",
52 | "0 &\\text{, if $\\mathcal{R}^4 (X,Z) = 1$ or $\\mathcal{R}^4 (Y,Z) = 1$}\n",
53 | "\\end{cases}\n",
54 | "$$\n",
55 | "\n",
56 | "# Distance covariance\n",
57 | "Let's dismantle the distance covariance equation to see what we actually compute in the distance correlation:\n",
58 | "\n",
59 | "$$\n",
60 | "\\mathcal{V}^2 (X,Y) = \\| f_{X,Y}(t) - f_X(t) \\ f_Y(t) \\|^2 = \\frac{1}{c_p c_q} \\int_{\\mathbb{R}^{p+q}} \\frac{| f_{X,Y}(t) - f_X(t)f_Y(t) |^2}{| t |_p^{1+p} \\ | t |_q^{1+q}} dt\n",
61 | "$$\n",
62 | "\n",
63 | "where\n",
64 | "\n",
65 | "$$\n",
66 | "c_d = \\frac{\\pi^{(1+d)/2}}{\\Gamma \\Big( (1+d)/2 \\Big)}\n",
67 | "$$\n",
68 | "\n",
69 | "where the (complete) Gamma function $\\Gamma$ is\n",
70 | "\n",
71 | "$$\n",
72 | "\\Gamma (z) = \\int_0^{\\infty} x^{z-1} \\ e^{-x} \\ dx\n",
73 | "$$\n",
74 | "\n",
75 | "with $z \\in \\mathbb{R}^{+}$. \n",
76 | "\n",
77 | "$p$ and $q$ are the dimensions of the random vectors $X$ and $Y$, i.e. the number of samples available at each time point of the respective time-series. The number of samples must not vary across time points within the same time-series. We can write this as: \n",
78 | "\n",
79 | "$$X \\in \\mathbb{R}^p$$\n",
80 | "\n",
81 | "$$Y \\in \\mathbb{R}^q$$\n",
82 | "\n",
83 | "\n",
84 | "A preliminary conclusion of this formulation: **we can compute dependencies between time-series with different numbers of samples**. \n",
85 | "\n",
86 | "But we still have some terms in the distance covariance $\\mathcal{V}^2 (X,Y)$ which we need to define:\n",
87 | "\n",
88 | "$ | t |_p^{1+p} $ is the Euclidean norm of $t$ in $\\mathbb{R}^p$ raised to the power $1+p$, and $ | t |_q^{1+q} $ is the Euclidean norm of $t$ in $\\mathbb{R}^q$ raised to the power $1+q$.\n",
89 | "\n",
90 | "The numerator in the integral of $\\mathcal{V}^2 (X,Y)$ is:\n",
91 | "$$\n",
92 | "| f_{X,Y}(t) - f_X(t) \\ f_Y(t) |^2 = \\Big( 1- |f_X(t) | ^2 \\Big) \\ \\Big( 1- |f_Y(t) |^2 \\Big)\n",
93 | "$$\n",
94 | "\n",
95 | "where $|f_X(t) |$ and $|f_Y(t) |$ are the moduli of the characteristic functions of the random vectors $X$ and $Y$ with $p$ and $q$ samples, respectively.\n",
96 | "\n",
97 | "\n",
98 | "## Estimators\n",
99 | "\n",
100 | "Since the characteristic functions include the imaginary unit $i$, we cannot recover the exact solution for the distance covariance. However, we can estimate it with a fairly simple formula. We compute these estimators according to [Huo & Szekely, 2016](https://arxiv.org/abs/1410.1503).\n",
101 | "\n",
102 | "We denote the pairwise distances of the $X$ observations by $a_{ij} := \\|X_i - X_j \\|$ and of the $Y$ observations by $b_{ij} = \\|Y_i - Y_j \\|$ for $i,j = 1, ..., n$, where $n$ is the number of measurements in $X$ and $Y$. The corresponding distance matrices are denoted by $(A_{ij})^n_{i,j=1}$ and $(B_{ij})^n_{i,j=1}$, where\n",
103 | "\n",
104 | "$$\n",
105 | "A_{ij} = \\begin{cases}\n",
106 | "a_{ij} - \\frac{1}{n} \\sum_{l=1}^n a_{il} - \\frac{1}{n} \\sum_{k=1}^n a_{kj} + \\frac{1}{n^2} \\sum_{k,l=1}^n a_{kl} & i \\neq j; \\\\\n",
107 | "0 & i = j.\n",
108 | "\\end{cases}\n",
109 | "$$\n",
110 | "\n",
111 | "and\n",
112 | "\n",
113 | "$$\n",
114 | "B_{ij} = \\begin{cases}\n",
115 | "b_{ij} - \\frac{1}{n} \\sum_{l=1}^n b_{il} - \\frac{1}{n} \\sum_{k=1}^n b_{kj} + \\frac{1}{n^2} \\sum_{k,l=1}^n b_{kl} & i \\neq j; \\\\\n",
116 | "0 & i = j.\n",
117 | "\\end{cases}\n",
118 | "$$\n",
119 | "\n",
120 | "\n",
121 | "Having computed these, we can estimate the sample distance covariance $\\hat{\\mathcal{V}}^2(X,Y)$ by\n",
122 | "\n",
123 | "$$\n",
124 | "\\hat{\\mathcal{V}}^2(X,Y) = \\frac{1}{n^2} \\sum_{i,j=1}^n A_{ij} \\ B_{ij}\n",
125 | "$$\n",
126 | "\n",
127 | "The corresponding sample variance $\\hat{\\mathcal{V}}^2(X)$ is consequently:\n",
128 | "\n",
129 | "$$\n",
130 | "\\hat{\\mathcal{V}}^2(X) = \\frac{1}{n^2} \\sum_{i,j=1}^n A^2_{ij}\n",
131 | "$$\n",
132 | "\n",
133 | "\n",
134 | "Then, we can scale these covariances to finally arrive at the sample distance correlation $\\hat{\\mathcal{R}}^2(X,Y)$:\n",
135 | "\n",
136 | "$$\n",
137 | "\\hat{\\mathcal{R}}^2(X,Y) = \\begin{cases}\n",
138 | "\\frac{\\hat{\\mathcal{V}}^2 (X,Y)}{\\sqrt{\\hat{\\mathcal{V}}^2 (X)\\hat{\\mathcal{V}}^2 (Y)}} &\\text{, if $\\hat{\\mathcal{V}}^2 (X)\\hat{\\mathcal{V}}^2 (Y) > 0$} \\\\\n",
139 | "0 &\\text{, if $\\hat{\\mathcal{V}}^2 (X)\\hat{\\mathcal{V}}^2 (Y) = 0$}\n",
140 | "\\end{cases}\n",
141 | "$$\n",
142 | "\n",
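"To make this concrete, here is a minimal NumPy sketch of the biased estimator on hypothetical toy data (not part of the analysis below). It uses the usual double-centering on all entries, which differs from the display above only on the diagonal, and should agree with `dcor.distance_correlation_sqr`:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"def double_center(d):\n",
"    # A_ij = a_ij - row mean - column mean + grand mean\n",
"    return d - d.mean(axis=0, keepdims=True) - d.mean(axis=1, keepdims=True) + d.mean()\n",
"\n",
"def distance_correlation_sq(x, y):\n",
"    # biased sample distance correlation R^2(X, Y) from pairwise Euclidean distances\n",
"    x = np.asarray(x, dtype=float).reshape(len(x), -1)\n",
"    y = np.asarray(y, dtype=float).reshape(len(y), -1)\n",
"    a = np.sqrt(((x[:, None, :] - x[None, :, :]) ** 2).sum(-1))  # a_ij = ||X_i - X_j||\n",
"    b = np.sqrt(((y[:, None, :] - y[None, :, :]) ** 2).sum(-1))  # b_ij = ||Y_i - Y_j||\n",
"    A, B = double_center(a), double_center(b)\n",
"    dcov2_xy = (A * B).mean()  # (1/n^2) * sum_ij A_ij B_ij\n",
"    denom = np.sqrt((A * A).mean() * (B * B).mean())\n",
"    return dcov2_xy / denom if denom > 0 else 0.0\n",
"\n",
"rng = np.random.default_rng(0)\n",
"x = rng.normal(size=50)\n",
"y = x ** 2 + 0.1 * rng.normal(size=50)  # non-linear, non-monotone dependence\n",
"print(distance_correlation_sq(x, y))  # clearly larger than 0\n",
"```\n",
"\n",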
143 | "### Unbiased estimators\n",
144 | "These estimators are biased, but we can define an unbiased estimator of the distance covariance $\\hat{\\mathcal{V}}^2(X,Y)$ and call it $\\Omega_n(X,Y)$. We must first redefine our distance matrices $(A_{ij})^n_{i,j=1}$ and $(B_{ij})^n_{i,j=1}$, which we will call $(\\tilde{A}_{ij})^n_{i,j=1}$ and $(\\tilde{B}_{ij})^n_{i,j=1}$:\n",
145 | "\n",
146 | "$$\n",
147 | "\\tilde{A}_{ij} = \\begin{cases}\n",
148 | "a_{ij} - \\frac{1}{n-2} \\sum_{l=1}^n a_{il} - \\frac{1}{n-2} \\sum_{k=1}^n a_{kj} + \\frac{1}{(n-1)(n-2)} \\sum_{k,l=1}^n a_{kl} & i \\neq j; \\\\\n",
149 | "0 & i = j.\n",
150 | "\\end{cases}\n",
151 | "$$\n",
152 | "\n",
153 | "and\n",
154 | "\n",
155 | "$$\n",
156 | "\\tilde{B}_{ij} = \\begin{cases}\n",
157 | "b_{ij} - \\frac{1}{n-2} \\sum_{l=1}^n b_{il} - \\frac{1}{n-2} \\sum_{k=1}^n b_{kj} + \\frac{1}{(n-1)(n-2)} \\sum_{k,l=1}^n b_{kl} & i \\neq j; \\\\\n",
158 | "0 & i = j.\n",
159 | "\\end{cases}\n",
160 | "$$\n",
161 | "\n",
162 | "Finally, we can compute the unbiased estimator $\\Omega_n(X,Y)$ for $\\mathcal{V}^2(X,Y)$ as the dot product $\\langle \\tilde{A}, \\tilde{B} \\rangle$:\n",
163 | "\n",
164 | "$$\n",
165 | "\\Omega_n(X,Y) = \\langle \\tilde{A}, \\tilde{B} \\rangle = \\frac{1}{n(n-3)} \\sum_{i,j=1}^n \\tilde{A}_{ij} \\ \\tilde{B}_{ij}\n",
166 | "$$\n",
167 | "\n",
168 | "\n",
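"A corresponding minimal sketch of the U-centred matrices and of the unbiased estimator Omega_n (again hypothetical toy data; in practice `dcor.u_distance_covariance_sqr` computes the same quantity):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"def u_center(d):\n",
"    # tilde-A: denominators n-2 and (n-1)(n-2), zero diagonal\n",
"    n = len(d)\n",
"    u = (d\n",
"         - d.sum(axis=1, keepdims=True) / (n - 2)  # row sums\n",
"         - d.sum(axis=0, keepdims=True) / (n - 2)  # column sums\n",
"         + d.sum() / ((n - 1) * (n - 2)))\n",
"    np.fill_diagonal(u, 0.0)\n",
"    return u\n",
"\n",
"def omega_n(x, y):\n",
"    # Omega_n(X, Y) = 1/(n(n-3)) * sum_ij tilde-A_ij * tilde-B_ij\n",
"    x = np.asarray(x, dtype=float).reshape(len(x), -1)\n",
"    y = np.asarray(y, dtype=float).reshape(len(y), -1)\n",
"    a = np.sqrt(((x[:, None, :] - x[None, :, :]) ** 2).sum(-1))\n",
"    b = np.sqrt(((y[:, None, :] - y[None, :, :]) ** 2).sum(-1))\n",
"    n = len(x)\n",
"    return (u_center(a) * u_center(b)).sum() / (n * (n - 3))\n",
"\n",
"rng = np.random.default_rng(1)\n",
"x = rng.normal(size=30)\n",
"y = np.sin(x) + 0.1 * rng.normal(size=30)\n",
"print(omega_n(x, y))  # should agree with dcor.u_distance_covariance_sqr(x, y)\n",
"```\n",
"\n",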
169 | "Interestingly, [Lyons (2013)](https://arxiv.org/abs/1106.5758) showed that not only the sample distance correlation but also the population distance correlation can be formulated without characteristic functions. This is good to acknowledge, but we do not need to focus on it here. \n",
170 | "\n",
171 | "# Conditional distance covariance\n",
172 | "\n",
173 | "We start with computing the unbiased distance matrices $(\\tilde{A}_{ij})^n_{i,j=1}$, $(\\tilde{B}_{ij})^n_{i,j=1}$, and $(\\tilde{C}_{ij})^n_{i,j=1}$ for $X$, $Y$, and $Z$, respectively, as we have done previously for the distance covariance. We define the dot product\n",
174 | "\n",
175 | "$$\n",
176 | "\\Omega_n(X,Y) = \\langle \\tilde{A}, \\tilde{B} \\rangle = \\frac{1}{n(n-3)} \\sum_{i,j=1}^n \\tilde{A}_{ij} \\tilde{B}_{ij}\n",
177 | "$$\n",
178 | "\n",
179 | "and project the sample $x$ onto $z$ as \n",
180 | "\n",
181 | "$$\n",
182 | "P_z (x) = \\frac{\\langle \\tilde{A}, \\tilde{C} \\rangle}{\\langle \\tilde{C}, \\tilde{C} \\rangle} \\tilde{C} .\n",
183 | "$$\n",
184 | "\n",
185 | "The complementary projection is consequently\n",
186 | "\n",
187 | "$$\n",
188 | "P_{z^{\\bot}} (x) = \\tilde{A} - P_z (x) = \\tilde{A} - \\frac{\\langle \\tilde{A}, \\tilde{C} \\rangle}{\\langle \\tilde{C}, \\tilde{C} \\rangle} \\tilde{C} .\n",
189 | "$$\n",
190 | "\n",
191 | "Hence, the sample conditional distance covariance is\n",
192 | "\n",
193 | "$$\n",
194 | "\\hat{\\mathcal{V}}^2(X,Y \\ | \\ Z) = \\langle P_{z^{\\bot}} (x), P_{z^{\\bot}} (y) \\rangle .\n",
195 | "$$\n",
196 | "\n",
197 | "Then, we can scale these covariances to finally arrive at the sample conditional distance correlation $\\hat{\\mathcal{R}}^2(X,Y \\ | \\ Z)$:\n",
198 | "\n",
199 | "$$\n",
200 | "\\hat{\\mathcal{R}}^2(X,Y \\ | \\ Z) = \\begin{cases}\n",
201 | "\\frac{\\langle P_{z^{\\bot}} (x), P_{z^{\\bot}} (y) \\rangle}{\\| P_{z^{\\bot}} (x) \\| \\ \\| P_{z^{\\bot}} (y) \\|} &\\text{, if} \\ \\| P_{z^{\\bot}} (x) \\| \\ \\| P_{z^{\\bot}} (y) \\| \\neq 0 \\\\\n",
202 | "0 &\\text{, if} \\ \\| P_{z^{\\bot}} (x) \\| \\ \\| P_{z^{\\bot}} (y) \\| = 0\n",
203 | "\\end{cases}\n",
204 | "$$\n",
205 | "\n",
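"The projection construction above can be checked numerically with the same `dcor` internals that the imports cell below pulls in, again on hypothetical toy data (the 1/(n(n-3)) scaling of the inner product cancels in the ratio, so plain sums suffice):\n",
"\n",
"```python\n",
"import numpy as np\n",
"import dcor\n",
"from dcor._dcor_internals import _u_distance_matrix, u_complementary_projection\n",
"\n",
"rng = np.random.default_rng(2)\n",
"z = rng.normal(size=40)\n",
"x = z + 0.3 * rng.normal(size=40)  # X and Y both depend on the confounder Z\n",
"y = z ** 2 + 0.3 * rng.normal(size=40)\n",
"\n",
"# U-centred distance matrices tilde-A, tilde-B, tilde-C\n",
"A = _u_distance_matrix(x.reshape(-1, 1))\n",
"B = _u_distance_matrix(y.reshape(-1, 1))\n",
"C = _u_distance_matrix(z.reshape(-1, 1))\n",
"\n",
"# complementary projections P_z-perp(x) and P_z-perp(y)\n",
"proj = u_complementary_projection(C)\n",
"Px, Py = proj(A), proj(B)\n",
"\n",
"den = np.sqrt((Px * Px).sum() * (Py * Py).sum())\n",
"r = (Px * Py).sum() / den if den > 0 else 0.0\n",
"\n",
"print(r)\n",
"print(dcor.partial_distance_correlation(x, y, z))  # should agree\n",
"```\n",
"\n",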
206 | "## Implementation\n",
207 | "For our computations, we'll use the packages [`dcor`](https://dcor.readthedocs.io/en/latest/?badge=latest) for the partial distance correlation and [`community`](https://github.com/taynaud/python-louvain) for the clustering."
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "import dcor\n",
217 | "import numpy as np\n",
218 | "import pickle\n",
219 | "import itertools\n",
220 | "import pandas as pd\n",
221 | "import os\n",
222 | "import math\n",
223 | "from tqdm.notebook import tqdm\n",
224 | "\n",
225 | "import matplotlib.pyplot as plt\n",
226 | "import seaborn as sns\n",
227 | "import networkx as nx\n",
228 | "import matplotlib.image as mpimg\n",
230 | "from matplotlib.offsetbox import OffsetImage, AnnotationBbox\n",
231 | "\n",
232 | "from community import community_louvain as community\n",
233 | "\n",
234 | "from dcor._dcor_internals import _u_distance_matrix, u_complementary_projection\n",
235 | "from sklearn.manifold import MDS\n",
236 | "import gc\n",
237 | "import warnings \n",
238 | "warnings.filterwarnings('ignore')"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "### Loading standardised imputed data set\n",
246 | "We first load the standardised, imputed data set which we generated in the previous notebook."
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "targets_values_i = pickle.load(open('utils/data/targets_values_i_up_arr_wb.pkl', 'rb'))\n",
256 | "goals_values_i = pickle.load(open('utils/data/goals_values_i_up_arr_wb.pkl', 'rb'))"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "# check whether the temperature series (T) has been appended\n",
266 | "len(targets_values_i['Belgium'])"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "# read amended csv file\n",
276 | "c = pd.read_csv('utils/countries_wb.csv', dtype=str, delimiter=';', header=None)\n",
277 | "countries = list(c[0])\n",
278 | "continents = pd.read_csv(r'utils/continents.csv')\n",
279 | "continents.replace({\"Democratic People's Republic of Korea\": \"Korea, Dem. People's Rep.\", 'Gambia': 'Gambia, The', 'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom', 'Congo': 'Congo, Rep.', 'Democratic Republic of the Congo': 'Congo, Dem. Rep.', 'Czechia': 'Czech Republic', 'Iran (Islamic Republic of)': 'Iran, Islamic Rep.', \"Côte d'Ivoire\": \"Cote d'Ivoire\", 'Kyrgyzstan': 'Kyrgyz Republic', \"Lao People's Democratic Republic\": 'Lao PDR', 'Republic of Moldova': 'Moldova', 'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.', 'Slovakia': 'Slovak Republic', 'Viet Nam': 'Vietnam', 'Egypt': 'Egypt, Arab Rep.', 'United Republic of Tanzania': 'Tanzania','United States of America': 'United States', 'Venezuela (Bolivarian Republic of)': 'Venezuela, RB', 'Yemen': 'Yemen, Rep.', 'Bahamas': 'Bahamas, The', 'Bolivia (Plurinational State of)': 'Bolivia'}, inplace=True)\n",
280 | "info = pd.read_csv(r'utils/wb_info.csv', header=None)"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "# remove Micronesia, Fed. Sts. from the country list in-place\n",
290 | "countries.remove('Micronesia, Fed. Sts.')\n",
291 | "continents['Oceania (excl. AUS + NZ)'] = continents['Oceania (excl. AUS + NZ)'].drop(index=4) # removing Micronesia\n",
292 | "continents['Oceania (incl. AUS + NZ)'] = continents['Oceania (incl. AUS + NZ)'].drop(index=6) # removing Micronesia\n",
293 | "continents['World'] = continents['World'].drop(index=170) # removing Micronesia\n",
294 | "#continents.drop(['Northern Africa', 'Southern Africa', 'North America', 'Australia and New Zealand'], axis=1, inplace=True)"
295 | ]
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {},
300 | "source": [
301 | "We later compute the correlations on an indicator level, but this is too detailed for any network visualisation and for an overarching understanding. Hence, we first group all sub-indicators at the indicator level. Then, we compute the distance correlations for the indicators, targets and goals.\n",
302 | "\n",
303 | "We work with the `info` file again, so we don't need to assign all of this by hand."
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "# check\n",
313 | "info"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {
320 | "scrolled": true
321 | },
322 | "outputs": [],
323 | "source": [
324 | "# check\n",
325 | "#targets_values_i['France'].tail()"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {},
331 | "source": [
332 | "We would like to have values at the target level, so we must first generate a list of all unique **targets**."
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": [
341 | "targets = list(info[4].unique())\n",
342 | "\n",
343 | "dict_targets = {}\n",
344 | "\n",
345 | "for target in targets:\n",
346 | " t = info[0].where(info[4] == target)\n",
347 | "\n",
348 | " dict_targets[target] = [i for i in t if str(i) != 'nan']"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "#check \n",
358 | "dict_targets['1.2']"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {},
364 | "source": [
365 | "Finally we also generate a list of all unique **goals**."
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "goals = list(info[3].unique())\n",
375 | "\n",
376 | "dict_goals = {}\n",
377 | "\n",
378 | "for goal in goals:\n",
379 | " g = info[4].where(info[3] == goal)\n",
380 | "\n",
381 | " dict_goals[goal] = [t for t in g if str(t) != 'nan']\n",
382 | " dict_goals[goal] = list(set(dict_goals[goal]))"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {
389 | "scrolled": true
390 | },
391 | "outputs": [],
392 | "source": [
393 | "#check \n",
394 | "print(dict_goals['1'])"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "## Distance correlations between goals\n",
402 | "\n",
403 | "The next step is to compute the distance correlations at the goal level.\n",
404 | "\n",
405 | "We work with the **concatenated time-series** to compute the conditional distance correlation directly on goal-level data. Visually speaking, this means that we fit one non-linear function to the data for all targets of these two goals. Since goals often have diverse targets, this may end up fitting a non-linear curve to very noisy data.\n",
406 | "\n",
407 | "## Working with concatenated time-series\n",
408 | "\n",
409 | "### Conditioning iteratively on subsets of joint distributions of all goals\n",
410 | "We condition each pair of goals iteratively on subsets of all remaining goals. We start by conditioning on the empty set, i.e. we first compute the pairwise distance correlation. Afterwards, we enlarge the conditioning set until it contains all 15 remaining goals. These sets are represented by the joint distributions of the goals contained in them."
411 | ]
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "metadata": {},
416 | "source": [
417 | "We need to condition on all **subsets** of the remaining SDGs in order to isolate the dependence that stems solely from the two SDGs under consideration:"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "def combinations(iterable, r):\n",
427 | " # combinations('ABCD', 2) --> AB AC AD BC BD CD\n",
428 | " # combinations(range(4), 3) --> 012 013 023 123\n",
429 | " pool = tuple(iterable)\n",
430 | " n = len(pool)\n",
431 | " if r > n:\n",
432 | " return\n",
433 | " indices = list(range(r))\n",
434 | " yield list(pool[i] for i in indices)\n",
435 | " while True:\n",
436 | " for i in reversed(range(r)):\n",
437 | " if indices[i] != i + n - r:\n",
438 | " break\n",
439 | " else:\n",
440 | " return\n",
441 | " indices[i] += 1\n",
442 | " for j in range(i+1, r):\n",
443 | " indices[j] = indices[j-1] + 1\n",
444 | " yield list(pool[i] for i in indices)"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": null,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "def combinations_tuple(iterable, r):\n",
454 | " # combinations('ABCD', 2) --> AB AC AD BC BD CD\n",
455 | " # combinations(range(4), 3) --> 012 013 023 123\n",
456 | " pool = tuple(iterable)\n",
457 | " n = len(pool)\n",
458 | " if r > n:\n",
459 | " return\n",
460 | " indices = list(range(r))\n",
461 | " yield tuple(pool[i] for i in indices)\n",
462 | " while True:\n",
463 | " for i in reversed(range(r)):\n",
464 | " if indices[i] != i + n - r:\n",
465 | " break\n",
466 | " else:\n",
467 | " return\n",
468 | " indices[i] += 1\n",
469 | " for j in range(i+1, r):\n",
470 | " indices[j] = indices[j-1] + 1\n",
471 | " yield tuple(pool[i] for i in indices)"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "def product(pool_0, pool_1):\n",
481 | " result = [[x, y]+[z] for x, y in pool_0 for z in pool_1] # ~ 40 Mio rows\n",
482 | " for prod in result:\n",
483 | " yield tuple(prod)"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": null,
489 | "metadata": {},
490 | "outputs": [],
491 | "source": [
492 | "# create list out of all unique combinations of goals\n",
493 | "g_combinations = list(combinations(goals, 2))\n",
494 | "conditions_g = []\n",
495 | "conditions_g_tuple = []\n",
496 | "for i in range(1, 18):\n",
497 | " conditions_g.extend(list(combinations(goals, i)))\n",
498 | " conditions_g_tuple.extend(tuple(combinations_tuple(goals, i)))\n",
499 | "\n",
500 | "# divide conditions_g_tuple into four sub-lists to save memory\n",
501 | "conditions_g_tuple_1 = conditions_g_tuple[:int(len(conditions_g_tuple)/4)]\n",
502 | "conditions_g_tuple_2 = conditions_g_tuple[int(len(conditions_g_tuple)/4):2*int(len(conditions_g_tuple)/4)]\n",
503 | "conditions_g_tuple_3 = conditions_g_tuple[2*int(len(conditions_g_tuple)/4):3*int(len(conditions_g_tuple)/4)]\n",
504 | "conditions_g_tuple_4 = conditions_g_tuple[3*int(len(conditions_g_tuple)/4):]\n",
505 | " \n",
506 | "pairs = list(product(g_combinations, conditions_g_tuple))\n",
507 | "pairs_g0 = pd.DataFrame.from_records(pairs, columns=['pair_0', 'pair_1', 'condition'])\n",
508 | "\n",
509 | "pairs_1 = list(product(g_combinations, conditions_g_tuple_1))\n",
510 | "pairs_g0_1 = pd.DataFrame.from_records(pairs_1, columns=['pair_0', 'pair_1', 'condition'])\n",
511 | "pairs_2 = list(product(g_combinations, conditions_g_tuple_2))\n",
512 | "pairs_g0_2 = pd.DataFrame.from_records(pairs_2, columns=['pair_0', 'pair_1', 'condition'])\n",
513 | "pairs_3 = list(product(g_combinations, conditions_g_tuple_3))\n",
514 | "pairs_g0_3 = pd.DataFrame.from_records(pairs_3, columns=['pair_0', 'pair_1', 'condition'])\n",
515 | "pairs_4 = list(product(g_combinations, conditions_g_tuple_4))\n",
516 | "pairs_g0_4 = pd.DataFrame.from_records(pairs_4, columns=['pair_0', 'pair_1', 'condition'])"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": null,
522 | "metadata": {
523 | "scrolled": true
524 | },
525 | "outputs": [],
526 | "source": [
527 | "# how many rows?\n",
528 | "print(len(pairs_g0))\n",
529 | "print(len(pairs_g0_1), len(pairs_g0_2), len(pairs_g0_3), len(pairs_g0_4))"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": null,
535 | "metadata": {},
536 | "outputs": [],
537 | "source": [
538 | "# adding empty condition set for pairwise dcor\n",
539 | "pairs_g1 = pd.DataFrame.from_records(data=g_combinations, columns=['pair_0', 'pair_1'])\n",
540 | "pairs_g1['condition'] = '0'"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "metadata": {},
546 | "source": [
547 | "# Continents"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {
554 | "scrolled": true
555 | },
556 | "outputs": [],
557 | "source": [
558 | "# data preparation\n",
559 | "continents_prep_g = {}\n",
560 | "\n",
561 | "for continent in continents:\n",
562 | " print(continent)\n",
563 | " \n",
564 | " continents_prep_g[continent] = np.empty(18, dtype=object)\n",
565 | " \n",
566 | " for g, goal in enumerate(goals):\n",
567 | " g_list = []\n",
568 | " for country in continents[continent].dropna():\n",
569 | " g_list.append(np.asarray(goals_values_i[country][g]))\n",
570 | "\n",
571 | " continents_prep_g[continent][g] = np.asarray(g_list)"
572 | ]
573 | },
574 | {
575 | "cell_type": "markdown",
576 | "metadata": {},
577 | "source": [
578 | "Now we use these data in our `dcor` computations. We first compute the pairwise distance covariance and correlation, then the partial ones conditioned on all the previously defined sets in `pairs_g`."
579 | ]
580 | },
581 | {
582 | "cell_type": "markdown",
583 | "metadata": {},
584 | "source": [
585 | "### Preparations\n",
586 | "Filtering out the conditions that contain goals $X$ (`pair_0`) or $Y$ (`pair_1`):"
587 | ]
588 | },
589 | {
590 | "cell_type": "code",
591 | "execution_count": null,
592 | "metadata": {},
593 | "outputs": [],
594 | "source": [
595 | "import multiprocessing as mp\n",
596 | "print(\"Number of processors: \", mp.cpu_count())"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "metadata": {},
603 | "outputs": [],
604 | "source": [
605 | "# CHECKPOINT\n",
606 | "pairs_g0_left_0 = pd.read_csv('utils/pairs_g0_left_0.zip', dtype=str, compression='zip')\n",
607 | "\n",
608 | "pairs_g0_left_0_1 = pd.read_csv('utils/pairs_g0_left_0_1.zip', dtype=str, compression='zip')\n",
609 | "pairs_g0_left_0_2 = pd.read_csv('utils/pairs_g0_left_0_2.zip', dtype=str, compression='zip')\n",
610 | "pairs_g0_left_0_3 = pd.read_csv('utils/pairs_g0_left_0_3.zip', dtype=str, compression='zip')\n",
611 | "pairs_g0_left_0_4 = pd.read_csv('utils/pairs_g0_left_0_4.zip', dtype=str, compression='zip')"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": null,
617 | "metadata": {
618 | "scrolled": true
619 | },
620 | "outputs": [],
621 | "source": [
622 | "# check\n",
623 | "pairs_g0_left_0_3.tail()"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "metadata": {
630 | "scrolled": true
631 | },
632 | "outputs": [],
633 | "source": [
634 | "# only keep rows whose condition does not contain either SDG of the pair\n",
635 | "pairs_g0_left = []\n",
636 | "\n",
637 | "pairs_g0_left_1 = []\n",
638 | "pairs_g0_left_2 = []\n",
639 | "pairs_g0_left_3 = []\n",
640 | "pairs_g0_left_4 = []\n",
641 | "\n",
642 | "for i in tqdm(pairs_g0.index):\n",
643 | " if (pairs_g0.loc[i, 'pair_0'] not in pairs_g0.loc[i, 'condition'] and pairs_g0.loc[i, 'pair_1'] not in pairs_g0.loc[i, 'condition']):\n",
644 | " pairs_g0_left.append(i)\n",
645 | "\n",
646 | "for i in tqdm(pairs_g0_1.index):\n",
647 | " if (pairs_g0_1.loc[i, 'pair_0'] not in pairs_g0_1.loc[i, 'condition'] and pairs_g0_1.loc[i, 'pair_1'] not in pairs_g0_1.loc[i, 'condition']):\n",
648 | " pairs_g0_left_1.append(i)\n",
649 | "\n",
650 | "for i in tqdm(pairs_g0_2.index):\n",
651 | " if (pairs_g0_2.loc[i, 'pair_0'] not in pairs_g0_2.loc[i, 'condition'] and pairs_g0_2.loc[i, 'pair_1'] not in pairs_g0_2.loc[i, 'condition']):\n",
652 | " pairs_g0_left_2.append(i)\n",
653 | " \n",
654 | "for i in tqdm(pairs_g0_3.index):\n",
655 | " if (pairs_g0_3.loc[i, 'pair_0'] not in pairs_g0_3.loc[i, 'condition'] and pairs_g0_3.loc[i, 'pair_1'] not in pairs_g0_3.loc[i, 'condition']):\n",
656 | " pairs_g0_left_3.append(i)\n",
657 | " \n",
658 | "for i in tqdm(pairs_g0_4.index):\n",
659 | " if (pairs_g0_4.loc[i, 'pair_0'] not in pairs_g0_4.loc[i, 'condition'] and pairs_g0_4.loc[i, 'pair_1'] not in pairs_g0_4.loc[i, 'condition']):\n",
660 | " pairs_g0_left_4.append(i)"
661 | ]
662 | },
663 | {
664 | "cell_type": "code",
665 | "execution_count": null,
666 | "metadata": {},
667 | "outputs": [],
668 | "source": [
669 | "# how many rows are left?\n",
670 | "print(len(pairs_g0_left_1), len(pairs_g0_left_2), len(pairs_g0_left_3), len(pairs_g0_left_4))\n",
671 | "\n",
672 | "pairs_g0_left_0_1 = pairs_g0_1.iloc[pairs_g0_left_1]\n",
673 | "pairs_g0_left_0_2 = pairs_g0_2.iloc[pairs_g0_left_2]\n",
674 | "pairs_g0_left_0_3 = pairs_g0_3.iloc[pairs_g0_left_3]\n",
675 | "pairs_g0_left_0_4 = pairs_g0_4.iloc[pairs_g0_left_4]"
676 | ]
677 | },
678 | {
679 | "cell_type": "code",
680 | "execution_count": null,
681 | "metadata": {},
682 | "outputs": [],
683 | "source": [
684 | "# check\n",
685 | "pairs_g0_left_0_3.head()"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": null,
691 | "metadata": {},
692 | "outputs": [],
693 | "source": [
694 | "# save pairs_g0_left_0's\n",
695 | "compression_opts = dict(method='zip', archive_name='pairs_g0_left_0.csv') \n",
696 | "pairs_g0_left_0.to_csv('utils/pairs_g0_left_0.zip', index=False, compression=compression_opts)\n",
697 | "\n",
698 | "compression_opts = dict(method='zip', archive_name='pairs_g0_left_0_1.csv') \n",
699 | "pairs_g0_left_0_1.to_csv('utils/pairs_g0_left_0_1.zip', index=False, compression=compression_opts)\n",
700 | "\n",
701 | "compression_opts = dict(method='zip', archive_name='pairs_g0_left_0_2.csv') \n",
702 | "pairs_g0_left_0_2.to_csv('utils/pairs_g0_left_0_2.zip', index=False, compression=compression_opts)\n",
703 | "\n",
704 | "compression_opts = dict(method='zip', archive_name='pairs_g0_left_0_3.csv') \n",
705 | "pairs_g0_left_0_3.to_csv('utils/pairs_g0_left_0_3.zip', index=False, compression=compression_opts)\n",
706 | "\n",
707 | "compression_opts = dict(method='zip', archive_name='pairs_g0_left_0_4.csv') \n",
708 | "pairs_g0_left_0_4.to_csv('utils/pairs_g0_left_0_4.zip', index=False, compression=compression_opts)"
709 | ]
710 | },
711 | {
712 | "cell_type": "markdown",
713 | "metadata": {},
714 | "source": [
715 | "# With `multiprocessing` parallelisation\n",
716 | "\n",
717 | "\n",
718 | " \n",
719 | "### Partial distance correlation"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": null,
725 | "metadata": {},
726 | "outputs": [],
727 | "source": [
728 | "if not os.path.exists('distance_cor'):\n",
729 | " os.mkdir('distance_cor')\n",
730 | " \n",
731 | "if not os.path.exists('distance_cor/goals'):\n",
732 | " os.mkdir('distance_cor/goals')"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": null,
738 | "metadata": {},
739 | "outputs": [],
740 | "source": [
741 | "def partial_distance_cor(row):\n",
742 | " pair_0, pair_1, cond = row\n",
743 | " if pair_0=='T':\n",
744 | " pair_0 = 18\n",
745 | " if pair_1=='T':\n",
746 | " pair_1 = 18\n",
747 | " pair_0_array = continents_prep_g[continent][int(pair_0)-1]\n",
748 | " pair_1_array = continents_prep_g[continent][int(pair_1)-1]\n",
749 | " condition_array = conditions_dict[str(cond)].T\n",
750 | " \n",
751 | " return dcor.partial_distance_correlation(pair_0_array, pair_1_array, condition_array)**2"
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": null,
757 | "metadata": {},
758 | "outputs": [],
759 | "source": [
760 | "# meta-list of regions containing continents\n",
761 | "regions = {'Africa': ['Eastern and Southern Africa', 'Northern and Western Africa', 'Sub-Saharan Africa', 'Africa'], \n",
762 | " 'Americas': ['Caribbean', 'North and Central America', 'South America', 'Latin America and the Caribbean', 'Americas'], \n",
763 | " 'Asia': ['Central and Eastern Asia', 'South-eastern Asia', 'Southern Asia', 'Western Asia', 'Asia'], \n",
764 | " 'Europe': ['Eastern Europe', 'Northern Europe', 'Southern Europe', 'Western Europe', 'Europe'], \n",
765 | " 'Oceania': ['Oceania (excl. AUS + NZ)', 'Oceania (incl. AUS + NZ)']}"
766 | ]
767 | },
768 | {
769 | "cell_type": "code",
770 | "execution_count": null,
771 | "metadata": {},
772 | "outputs": [],
773 | "source": [
774 | "#regions = {'World': ['World']}"
775 | ]
776 | },
777 | {
778 | "cell_type": "code",
779 | "execution_count": null,
780 | "metadata": {
781 | "scrolled": true
782 | },
783 | "outputs": [],
784 | "source": [
785 | "# continents\n",
786 | "\n",
787 | "for region in regions: # to save memory\n",
788 | " print(region)\n",
789 | " \n",
790 | " dict_cor_goals_continents_2_cond = {}\n",
791 | "\n",
792 | " for continent in regions[region]:\n",
793 | " print(continent)\n",
794 | "\n",
795 | " dict_cor_goa_c = pairs_g0_left_0.copy(deep=True)\n",
796 | " #dict_cor_goa_c = pairs_g0_left_0_4.copy(deep=True) # pairs_g0_left_0 has all non-empty conditional sets\n",
797 | "\n",
798 | " # preparing conditional set\n",
799 | " conditions_dict = {}\n",
800 | "\n",
801 | " for cond in tqdm(conditions_g_tuple):\n",
802 | " #for cond in conditions_g_tuple_4:\n",
803 | " condition = []\n",
804 | "\n",
805 | " for c in cond: \n",
806 | " if c=='T':\n",
807 | " condition.extend(continents_prep_g[continent][17].T)\n",
808 | " else:\n",
809 | " condition.extend(continents_prep_g[continent][int(c)-1].T)\n",
810 | "\n",
811 | " conditions_dict[str(cond)] = np.asarray(condition)\n",
812 | "\n",
813 | " # partial distance correlation\n",
814 | " pool = mp.Pool(int(mp.cpu_count()/2))\n",
815 | "\n",
816 | " dict_cor_goa_c_list = dict_cor_goa_c.values.tolist()\n",
817 | "\n",
818 | " print('start dcor...')\n",
819 | "\n",
820 | " cor_results = pool.map(partial_distance_cor, dict_cor_goa_c_list, chunksize=1000)\n",
821 | "\n",
822 | " pool.close()\n",
823 | " pool.join()\n",
824 | "\n",
825 | " dict_cor_goa_c['dcor'] = cor_results\n",
826 | "\n",
827 | " print('...dcor done')\n",
828 | "\n",
829 | " # find minimum distance correlation between any two goals\n",
830 | " dict_cor_goa_con = dict_cor_goa_c.groupby(['pair_0', 'pair_1'])['dcor'].apply(list).reset_index(name='list_dcor')\n",
831 | "\n",
832 | " for i, row_c in dict_cor_goa_con.iterrows():\n",
833 | " dict_cor_goa_con.loc[i, 'min_dcor_cond'] = min(dict_cor_goa_con.loc[i, 'list_dcor'])\n",
834 | "\n",
835 | " dict_cor_goa_con.drop(columns=['list_dcor'], inplace=True)\n",
836 | " \n",
837 | " # finding conditional set of minimum partial distance correlation\n",
838 | " dict_cor_goa_cond = dict_cor_goa_con.merge(dict_cor_goa_c, left_on='min_dcor_cond', right_on='dcor').drop(['pair_0_y', 'pair_1_y', 'dcor'], axis=1).rename(columns={'pair_0_x': 'pair_0', 'pair_1_x': 'pair_1'})\n",
839 | " \n",
840 | " dict_cor_goals_continents_2_cond[continent] = dict_cor_goa_cond\n",
841 | " \n",
842 | " # save every entry region separately to save memory\n",
843 | " g_cor = open('distance_cor/goals/dict_cor_goals_continents_2_cond_{}.pkl'.format(region), 'wb')\n",
844 | " #g_cor = open('distance_cor/goals/dict_cor_goals_continents_2_cond_{}_4.pkl'.format(region), 'wb')\n",
845 | " pickle.dump(dict_cor_goals_continents_2_cond, g_cor)\n",
846 | " g_cor.close()\n",
847 | " \n",
848 | " gc.collect()"
849 | ]
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": null,
854 | "metadata": {},
855 | "outputs": [],
856 | "source": [
857 | "# for World (disaggregated because of memory restrictions)\n",
858 | "dict_World_1 = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_World_1.pkl', 'rb'))\n",
859 | "dict_World_2 = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_World_2.pkl', 'rb'))\n",
860 | "dict_World_3 = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_World_3.pkl', 'rb'))\n",
861 | "dict_World_4 = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_World_4.pkl', 'rb'))\n",
862 | "\n",
863 | "cor_goals_continents_2_World = pd.concat([dict_World_1['World'], dict_World_2['World'], dict_World_3['World'], dict_World_4['World']])\n",
864 | "\n",
865 | "# find minimum distance correlation between any two goals\n",
866 | "dict_cor_goa_con = cor_goals_continents_2_World.groupby(['pair_0', 'pair_1'])['min_dcor_cond'].apply(list).reset_index(name='list_dcor')\n",
867 | "\n",
868 | "for i, row_c in dict_cor_goa_con.iterrows():\n",
869 | " dict_cor_goa_con.loc[i, 'min_dcor_cond'] = min(dict_cor_goa_con.loc[i, 'list_dcor'])\n",
870 | "\n",
871 | "dict_cor_goa_con.drop(columns=['list_dcor'], inplace=True)\n",
872 | "\n",
873 | "# finding conditional set of minimum partial distance correlation\n",
874 | "dict_cor_goa_cond = dict_cor_goa_con.merge(cor_goals_continents_2_World, left_on='min_dcor_cond', right_on='min_dcor_cond').drop(['pair_0_y', 'pair_1_y'], axis=1).rename(columns={'pair_0_x': 'pair_0', 'pair_1_x': 'pair_1'})\n",
875 | "\n",
876 | "# save every entry region separately to save memory\n",
877 | "g_cor = open('distance_cor/goals/dict_cor_goals_continents_2_cond_World.pkl', 'wb')\n",
878 | "pickle.dump(dict_cor_goa_cond, g_cor)\n",
879 | "g_cor.close()"
880 | ]
881 | },
882 | {
883 | "cell_type": "code",
884 | "execution_count": null,
885 | "metadata": {},
886 | "outputs": [],
887 | "source": [
888 | "dict_Africa = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_Africa.pkl', 'rb'))\n",
889 | "dict_Americas = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_Americas.pkl', 'rb'))\n",
890 | "dict_Asia = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_Asia.pkl', 'rb'))\n",
891 | "dict_Europe = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_Europe.pkl', 'rb'))\n",
892 | "dict_Oceania = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_Oceania.pkl', 'rb'))\n",
893 | "dict_World = {}\n",
894 | "dict_World['World'] = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2_cond_World.pkl', 'rb'))"
895 | ]
896 | },
897 | {
898 | "cell_type": "code",
899 | "execution_count": null,
900 | "metadata": {},
901 | "outputs": [],
902 | "source": [
903 | "dict_cor_goals_continents_2_condition = {**dict_Africa, **dict_Americas, **dict_Asia, **dict_Europe, **dict_Oceania, **dict_World}"
904 | ]
905 | },
906 | {
907 | "cell_type": "code",
908 | "execution_count": null,
909 | "metadata": {},
910 | "outputs": [],
911 | "source": [
912 | "# check\n",
913 | "print(dict_cor_goals_continents_2_condition.keys())\n",
914 | "dict_cor_goals_continents_2_condition['Southern Europe']"
915 | ]
916 | },
917 | {
918 | "cell_type": "markdown",
919 | "metadata": {},
920 | "source": [
921 | "The following continents which all have a sample size less or equal to 4 returned either NaNs or 1's. This is due to the assumption of a sample size greater or equal to 4 of the partial distance correlation.\n",
922 | "\n",
923 | "Northern Africa, Southern Africa, North America, Central Asia, Eastern Asia, Australia and New Zealand"
924 | ]
925 | },
926 | {
927 | "cell_type": "markdown",
928 | "metadata": {},
929 | "source": [
930 | "### Pairwise distance correlation"
931 | ]
932 | },
933 | {
934 | "cell_type": "code",
935 | "execution_count": null,
936 | "metadata": {},
937 | "outputs": [],
938 | "source": [
939 | "def distance_cor(row):\n",
940 | " pair_0, pair_1 = row\n",
941 | " if pair_0=='T':\n",
942 | " pair_0 = 18\n",
943 | " if pair_1=='T':\n",
944 | " pair_1 = 18\n",
945 | " pair_0_array = continents_prep_g[continent][int(pair_0)-1]\n",
946 | " pair_1_array = continents_prep_g[continent][int(pair_1)-1]\n",
947 | " \n",
948 | " return dcor.distance_correlation(pair_0_array, pair_1_array)**2"
949 | ]
950 | },
951 | {
952 | "cell_type": "code",
953 | "execution_count": null,
954 | "metadata": {
955 | "scrolled": true
956 | },
957 | "outputs": [],
958 | "source": [
959 | "# continents\n",
960 | "dict_cor_goals_continents_2_pair = {}\n",
961 | "\n",
962 | "for continent in continents:\n",
963 | " print(continent)\n",
964 | " \n",
965 | " dict_cor_goa_c_pair = pairs_g1.drop(columns=['condition']).copy(deep=True) # pairs_g1 has empty conditional sets for pairwise dcor\n",
966 | " \n",
967 | " pool = mp.Pool(int(mp.cpu_count()/2))\n",
968 | " \n",
969 | " print('start dcor...')\n",
970 | " \n",
971 | " dict_cor_goa_c_pair_list = dict_cor_goa_c_pair.values.tolist()\n",
972 | " \n",
973 | " cor_results = pool.map(distance_cor, dict_cor_goa_c_pair_list, chunksize=1000)\n",
974 | " \n",
975 | " pool.close()\n",
976 | " pool.join()\n",
977 | " \n",
978 | " dict_cor_goa_c_pair['min_dcor_pair'] = cor_results\n",
979 | " \n",
980 | " print('...dcor done')\n",
981 | " \n",
982 | " dict_cor_goals_continents_2_pair[continent] = dict_cor_goa_c_pair"
983 | ]
984 | },
985 | {
986 | "cell_type": "code",
987 | "execution_count": null,
988 | "metadata": {},
989 | "outputs": [],
990 | "source": [
991 | "# check\n",
992 | "dict_cor_goals_continents_2_pair['Europe']"
993 | ]
994 | },
995 | {
996 | "cell_type": "code",
997 | "execution_count": null,
998 | "metadata": {
999 | "scrolled": true
1000 | },
1001 | "outputs": [],
1002 | "source": [
1003 | "# merge dictionaries\n",
1004 | "dict_cor_goals_continents_2 = {}\n",
1005 | "\n",
1006 | "for continent in continents:\n",
1007 | " print(continent)\n",
1008 | " \n",
1009 | " dict_cor_goals_continents_2[continent] = pd.DataFrame(index=range(153), columns=['pair_0', 'pair_1', 'min_dcor'])\n",
1010 | " \n",
1011 | " for i in dict_cor_goals_continents_2_pair[continent].index:\n",
1012 | " for j in dict_cor_goals_continents_2_condition[continent].index:\n",
1013 | " if dict_cor_goals_continents_2_pair[continent].loc[i, 'pair_0']==dict_cor_goals_continents_2_condition[continent].loc[j, 'pair_0'] and dict_cor_goals_continents_2_pair[continent].loc[i, 'pair_1']==dict_cor_goals_continents_2_condition[continent].loc[j, 'pair_1']:\n",
1014 | " dict_cor_goals_continents_2[continent].loc[i, 'pair_0'] = dict_cor_goals_continents_2_pair[continent].loc[i, 'pair_0']\n",
1015 | " dict_cor_goals_continents_2[continent].loc[i, 'pair_1'] = dict_cor_goals_continents_2_pair[continent].loc[i, 'pair_1']\n",
1016 | " dict_cor_goals_continents_2[continent].loc[i, 'min_dcor'] = min(dict_cor_goals_continents_2_pair[continent].loc[i, 'min_dcor_pair'], dict_cor_goals_continents_2_condition[continent].loc[j, 'min_dcor_cond'])\n",
1017 | " if dict_cor_goals_continents_2_pair[continent].loc[i, 'min_dcor_pair'] < dict_cor_goals_continents_2_condition[continent].loc[j, 'min_dcor_cond']:\n",
1018 | " dict_cor_goals_continents_2[continent].loc[i, 'condition'] = 0\n",
1019 | " else:\n",
1020 | " dict_cor_goals_continents_2[continent].loc[i, 'condition'] = dict_cor_goals_continents_2_condition[continent].loc[j, 'condition']"
1021 | ]
1022 | },
1023 | {
1024 | "cell_type": "code",
1025 | "execution_count": null,
1026 | "metadata": {},
1027 | "outputs": [],
1028 | "source": [
1029 | "# check\n",
1030 | "dict_cor_goals_continents_2['World']"
1031 | ]
1032 | },
1033 | {
1034 | "cell_type": "code",
1035 | "execution_count": null,
1036 | "metadata": {},
1037 | "outputs": [],
1038 | "source": [
1039 | "# CHECKPOINT\n",
1040 | "dict_cor_goals_continents_2 = pickle.load(open('distance_cor/goals/dict_cor_goals_continents_2.pkl', 'rb'))"
1041 | ]
1042 | },
1043 | {
1044 | "cell_type": "markdown",
1045 | "metadata": {},
1046 | "source": [
1047 | "### Testing for statistical significance\n",
1048 | "We calculate the p-values of our partial distance correlations, i.e., the probability that the null hypothesis of (partial) independence can be accepted."
1049 | ]
1050 | },
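{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before running the test on our data, the next cell is a small sanity check on made-up toy arrays (`x_toy`, `y_toy` and `z_toy` are hypothetical and only serve as an illustration): it shows how a single p-value is obtained from `dcor.independence.partial_distance_covariance_test`, the same call we use below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toy sanity check of the permutation test used in the next cell (illustration only, hypothetical data);\n",
"# y_toy is generated independently of x_toy and z_toy, so the p-value should typically be large,\n",
"# i.e. there is no evidence against the null hypothesis of partial independence\n",
"np.random.seed(0)\n",
"z_toy = np.random.normal(size=(30, 5))\n",
"x_toy = z_toy[:, :3] + 0.1 * np.random.normal(size=(30, 3))  # depends on z_toy only\n",
"y_toy = np.random.normal(size=(30, 4))                       # independent of x_toy and z_toy\n",
"\n",
"toy_test = dcor.independence.partial_distance_covariance_test(x_toy, y_toy, z_toy, num_resamples=1000)\n",
"print(toy_test.p_value)"
]
},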
1051 | {
1052 | "cell_type": "code",
1053 | "execution_count": null,
1054 | "metadata": {
1055 | "scrolled": true
1056 | },
1057 | "outputs": [],
1058 | "source": [
1059 | "for continent in continents:\n",
1060 | " print(continent)\n",
1061 | " dict_cor_goals_continents_2[continent]['p-value'] = -1\n",
1062 | " for r, row in dict_cor_goals_continents_2[continent].iterrows():\n",
1063 | " \n",
1064 | " # preparing pair_0 and pair_1\n",
1065 | " if row.pair_1=='T':\n",
1066 | " row.pair_1 = 18\n",
1067 | " pair_0_array = continents_prep_g[continent][int(row.pair_0)-1]\n",
1068 | " pair_1_array = continents_prep_g[continent][int(row.pair_1)-1]\n",
1069 | " \n",
1070 | " # extracting conditional variables from column 'condition'\n",
1071 | " cond_list = []\n",
1072 | " for i in row.condition.split():\n",
1073 | " newstr = ''.join((ch if ch in '0123456789.-eT' else ' ') for ch in i)\n",
1074 | " cond_list.extend([i for i in newstr.split()])\n",
1075 | "\n",
1076 | " condition = []\n",
1077 | " for c in cond_list:\n",
1078 | " if c=='T':\n",
1079 | " condition.extend(continents_prep_g[continent][17].T)\n",
1080 | " else:\n",
1081 | " condition.extend(continents_prep_g[continent][int(c)-1].T)\n",
1082 | "\n",
1083 | " cond_array = np.asarray(condition).T\n",
1084 | " \n",
1085 | " dict_cor_goals_continents_2[continent].iloc[r, 4] = dcor.independence.partial_distance_covariance_test(pair_0_array, pair_1_array, cond_array, num_resamples=10000).p_value"
1086 | ]
1087 | },
1088 | {
1089 | "cell_type": "code",
1090 | "execution_count": null,
1091 | "metadata": {},
1092 | "outputs": [],
1093 | "source": [
1094 | "# save\n",
1095 | "g_cor = open('distance_cor/goals/dict_cor_goals_continents_2.pkl', 'wb')\n",
1096 | "pickle.dump(dict_cor_goals_continents_2, g_cor)\n",
1097 | "g_cor.close()\n",
1098 | "\n",
1099 | "# saving as csv's\n",
1100 | "for continent in continents:\n",
1101 | " dict_cor_goals_continents_2[continent] = dict_cor_goals_continents_2[continent][['pair_0', 'pair_1', 'min_dcor', 'p-value', 'condition']]\n",
1102 | " dict_cor_goals_continents_2[continent]['p-value'] = dict_cor_goals_continents_2[continent]['p-value'].astype(float).round(5)\n",
1103 | " dict_cor_goals_continents_2[continent].min_dcor = dict_cor_goals_continents_2[continent].min_dcor.astype(float).round(5)\n",
1104 | " dict_cor_goals_continents_2[continent].to_csv('distance_cor/goals/conditions_{}.csv'.format(continent))"
1105 | ]
1106 | },
1107 | {
1108 | "cell_type": "markdown",
1109 | "metadata": {},
1110 | "source": [
1111 | "We want to keep the minimum significant distance correlation of each pair of two goals, pairwise or conditioned on any potential subset.\n",
1112 | "\n",
1113 | "The last step is to insert these values into the right cell in a matrix."
1114 | ]
1115 | },
1116 | {
1117 | "cell_type": "code",
1118 | "execution_count": null,
1119 | "metadata": {},
1120 | "outputs": [],
1121 | "source": [
1122 | "cor_goals_continents_2 = {}\n",
1123 | "\n",
1124 | "for continent in continents:\n",
1125 | " print(continent)\n",
1126 | " cor_goals_continents_2[continent] = pd.DataFrame(index=goals, columns=goals)\n",
1127 | "\n",
1128 | " for i in list(dict_cor_goals_continents_2[continent].index):\n",
1129 | " goal_0 = dict_cor_goals_continents_2[continent].loc[i, 'pair_0']\n",
1130 | " goal_1 = dict_cor_goals_continents_2[continent].loc[i, 'pair_1']\n",
1131 | " \n",
1132 | " # take square root because we have previously squared the distance correlation\n",
1133 | " cor_goals_continents_2[continent].loc[goal_1, goal_0] = np.sqrt(dict_cor_goals_continents_2[continent].loc[i, 'min_dcor'])"
1134 | ]
1135 | },
1136 | {
1137 | "cell_type": "code",
1138 | "execution_count": null,
1139 | "metadata": {
1140 | "scrolled": false
1141 | },
1142 | "outputs": [],
1143 | "source": [
1144 | "# check\n",
1145 | "cor_goals_continents_2['Africa']"
1146 | ]
1147 | },
1148 | {
1149 | "cell_type": "markdown",
1150 | "metadata": {},
1151 | "source": [
1152 | "In `cor_goals_continents_2` are the conditional distance correlations for all continents in a setting of 18 random vectors $X$, $Y$, and $Z_1, Z_2, ..., Z_{16}$, where $\\boldsymbol{Z}$ is the array containing all random vectors we want to condition on."
1153 | ]
1154 | },
1155 | {
1156 | "cell_type": "code",
1157 | "execution_count": null,
1158 | "metadata": {},
1159 | "outputs": [],
1160 | "source": [
1161 | "# save\n",
1162 | "g_cor = open('distance_cor/goals/dcor_goals_continents_2.pkl', 'wb')\n",
1163 | "pickle.dump(cor_goals_continents_2, g_cor)\n",
1164 | "g_cor.close()"
1165 | ]
1166 | },
1167 | {
1168 | "cell_type": "code",
1169 | "execution_count": null,
1170 | "metadata": {},
1171 | "outputs": [],
1172 | "source": [
1173 | "# CHECKPOINT\n",
1174 | "g_cor = pickle.load(open('distance_cor/goals/dcor_goals_continents_2.pkl', 'rb'))"
1175 | ]
1176 | },
1177 | {
1178 | "cell_type": "markdown",
1179 | "metadata": {},
1180 | "source": [
1181 | "## Visualisation on goal-level\n",
1182 | "Additionally to the matrices with numbers, we would also like to visualise these matrices and plot these correlations as networks."
1183 | ]
1184 | },
1185 | {
1186 | "cell_type": "code",
1187 | "execution_count": null,
1188 | "metadata": {
1189 | "scrolled": true
1190 | },
1191 | "outputs": [],
1192 | "source": [
1193 | "# continents\n",
1194 | "for continent in continents:\n",
1195 | " # generate a mask for the upper triangle\n",
1196 | " mask = np.zeros_like(cor_goals_continents_2[continent].fillna(0), dtype=np.bool)\n",
1197 | " mask[np.triu_indices_from(mask)] = True\n",
1198 | "\n",
1199 | " # set up the matplotlib figure\n",
1200 | " f, ax = plt.subplots(figsize=(25, 22))\n",
1201 | "\n",
1202 | " # generate a custom diverging colormap\n",
1203 | " cmap = sns.color_palette(\"Reds\", 100)\n",
1204 | "\n",
1205 | " # draw the heatmap with the mask and correct aspect ratio\n",
1206 | " sns.heatmap(cor_goals_continents_2[continent].fillna(0), mask=mask, cmap=cmap, vmax=1, center=0.5, vmin=0,\n",
1207 | " square=True, linewidths=.5, cbar_kws={\"shrink\": .8})\n",
1208 | " \n",
1209 | " plt.title('{}'.format(continent), fontdict={'fontsize': 52})\n",
1210 | " plt.savefig('distance_cor/goals/{}_cor_goals.png'.format(continent))"
1211 | ]
1212 | },
1213 | {
1214 | "cell_type": "code",
1215 | "execution_count": null,
1216 | "metadata": {},
1217 | "outputs": [],
1218 | "source": [
1219 | "# data preparation for networkX\n",
1220 | "dcor_dict_g = {}\n",
1221 | "\n",
1222 | "for continent in cor_goals_continents_2.keys():\n",
1223 | " dcor_dict_g[continent] = {}\n",
1224 | "\n",
1225 | " for goalcombination in g_combinations:\n",
1226 | " dcor_dict_g[continent][tuple(goalcombination)] = [cor_goals_continents_2[continent].loc[goalcombination[1], goalcombination[0]], float(dict_cor_goals_continents_2[continent].loc[(dict_cor_goals_continents_2[continent]['pair_0']=='{}'.format(goalcombination[0])) & (dict_cor_goals_continents_2[continent]['pair_1']=='{}'.format(goalcombination[1]))]['p-value'])]"
1227 | ]
1228 | },
1229 | {
1230 | "cell_type": "code",
1231 | "execution_count": null,
1232 | "metadata": {},
1233 | "outputs": [],
1234 | "source": [
1235 | "for continent in cor_goals_continents_2.keys():\n",
1236 | " for key in dcor_dict_g[continent].keys():\n",
1237 | " if key[1] == 'T':\n",
1238 | " dcor_dict_g[continent][tuple((key[0], '18'))] = dcor_dict_g[continent].pop(tuple((key[0], 'T')))\n",
1239 | " elif key[0] == 'T':\n",
1240 | " dcor_dict_g[continent][tuple(('18', key[1]))] = dcor_dict_g[continent].pop(tuple(('T', key[1])))"
1241 | ]
1242 | },
1243 | {
1244 | "cell_type": "code",
1245 | "execution_count": null,
1246 | "metadata": {},
1247 | "outputs": [],
1248 | "source": [
1249 | "# color palettes to choose from\n",
1250 | "'PuBu'\n",
1251 | "'YlOrRd'\n",
1252 | "'Reds'"
1253 | ]
1254 | },
1255 | {
1256 | "cell_type": "code",
1257 | "execution_count": null,
1258 | "metadata": {
1259 | "scrolled": true
1260 | },
1261 | "outputs": [],
1262 | "source": [
1263 | "# plotting networks with weighted edges\n",
1264 | "\n",
1265 | "layout = 'circular'\n",
1266 | "\n",
1267 | "centrality_C = {} # dictionary to save centralities\n",
1268 | "degree_C = {} # dictionary to save degrees\n",
1269 | "density_C = {} # dictionary to save weighted densities\n",
1270 | "p_C = {} # auxiliary\n",
1271 | "partition_C = {} # dictionary to save clusters\n",
1272 | "\n",
1273 | "for continent in cor_goals_continents_2.keys():\n",
1274 | " G_C = nx.Graph()\n",
1275 | "\n",
1276 | " for key, value in dcor_dict_g[continent].items():\n",
1277 | " if value[1] <= 0.01:\n",
1278 | " w = value[0]\n",
1279 | " s = 'solid'\n",
1280 | " c = sns.color_palette('Reds', 100)[int(value[0]*100)]\n",
1281 | " elif 0.01 < value[1] <= 0.05:\n",
1282 | " w = value[0]\n",
1283 | " s = 'dashed'\n",
1284 | " c = sns.color_palette('Reds', 100)[int(value[0]*100)]\n",
1285 | " elif 0.05 < value[1] <= 0.1:\n",
1286 | " w = value[0]\n",
1287 | " s = 'dotted'\n",
1288 | " c = sns.color_palette('Reds', 100)[int(value[0]*100)]\n",
1289 | " else:\n",
1290 | " w = 0\n",
1291 | " s = 'solid'\n",
1292 | " c = 'white'\n",
1293 | " G_C.add_edge(int(key[0]), int(key[1]), style=s, weight=w, color=c, alpha=value[0])\n",
1294 | " \n",
1295 | " if layout == 'circular':\n",
1296 | " pos = nx.circular_layout(G_C)\n",
1297 | " elif layout == 'spring':\n",
1298 | " pos = nx.spring_layout(G_C)\n",
1299 | " \n",
1300 | " plt.figure(figsize=(24,16))\n",
1301 | " plt.tight_layout()\n",
1302 | "\n",
1303 | " # nodes\n",
1304 | " nx.draw_networkx_nodes(G_C, pos, node_size=1000)\n",
1305 | "\n",
1306 | " # labels\n",
1307 | " nx.draw_networkx_labels(G_C, pos, font_size=46, font_family='sans-serif')\n",
1308 | "\n",
1309 | " nodes = G_C.nodes()\n",
1310 | " edges = G_C.edges()\n",
1311 | " colors = [G_C[u][v]['color'] for u,v in edges]\n",
1312 | " weights = [G_C[u][v]['weight'] for u,v in edges]\n",
1313 | " alphas = [G_C[u][v]['alpha'] for u,v in edges]\n",
1314 | " styles = [G_C[u][v]['style'] for u,v in edges]\n",
1315 | "\n",
1316 | " nx.draw_networkx_nodes(G_C, pos, nodelist=nodes, node_color='white', node_size=1000)\n",
1317 | "\n",
1318 | " for i, edge in enumerate(edges):\n",
1319 | " pos_edge = {edge[0]: pos[edge[0]], edge[1]: pos[edge[1]]}\n",
1320 | " nx.draw_networkx_edges(G_C, pos_edge, edgelist=[edge], edge_color=colors[i], style=styles[i], width=np.multiply(weights[i],25)) #alpha=np.multiply(alphas[i],2.5))\n",
1321 | " \n",
1322 | " #nx.draw_networkx(G_C, pos, with_labels=False, edges=edges, edge_color=colors, node_color='white', node_size=1000, width=np.multiply(weights,25))\n",
1323 | " \n",
1324 | " ax=plt.gca()\n",
1325 | " fig=plt.gcf()\n",
1326 | " trans = ax.transData.transform\n",
1327 | " trans_axes = fig.transFigure.inverted().transform\n",
1328 | " imsize = 0.08 # this is the image size\n",
1329 | " plt.title('{}'.format(continent), y=1.05, fontdict={'fontsize': 52})\n",
1330 | "\n",
1331 | " for node in G_C.nodes():\n",
1332 | " (x,y) = pos[node] \n",
1333 | " xx,yy = trans((x,y)) # figure coordinates\n",
1334 | " xa,ya = trans_axes((xx,yy)) # axes coordinates\n",
1335 | " a = plt.axes([xa-imsize/2.0,ya-imsize/2.0, imsize, imsize])\n",
1336 | " a.imshow(mpimg.imread('utils/images/E_SDG goals_icons-individual-rgb-{}.png'.format(node)))\n",
1337 | " a.axis('off')\n",
1338 | "\n",
1339 | "\n",
1340 | " plt.axis('off')\n",
1341 | " ax.axis('off')\n",
1342 | " \n",
1343 | " plt.savefig('distance_cor/goals/{}_{}_network_logos.png'.format(continent, layout), format='png')\n",
1344 | "\n",
1345 | " plt.show()\n",
1346 | " \n",
1347 | " # weighted centrality\n",
1348 | " centr = nx.eigenvector_centrality(G_C, weight='weight', max_iter=100000)\n",
1349 | " centrality_C[continent] = sorted((v, '{:0.2f}'.format(c)) for v, c in centr.items())\n",
1350 | " \n",
1351 | " degree_C[continent] = dict(G_C.degree(weight='weight'))\n",
1352 | " \n",
1353 | " # weighted density\n",
1354 | " density_C[continent] = 2 * np.sum(weights) / (len(nodes) * (len(nodes) - 1))\n",
1355 | " \n",
1356 | " # weighted clustering with Louvain algorithm\n",
1357 | " part_C = {}\n",
1358 | " modularity_C = {}\n",
1359 | " for i in range(100):\n",
1360 | " part_C[i] = community.best_partition(G_C, random_state=i)\n",
1361 | " modularity_C[i] = community.modularity(part_C[i], G_C)\n",
1362 | " \n",
1363 | " p_C[continent] = part_C[max(modularity_C, key=modularity_C.get)]\n",
1364 | "\n",
1365 | " # having lists with nodes being in different clusters\n",
1366 | " partition_C[continent] = {}\n",
1367 | " for com in set(p_C[continent].values()) :\n",
1368 | " partition_C[continent][com] = [nodes for nodes in p_C[continent].keys() if p_C[continent][nodes] == com]"
1369 | ]
1370 | },
1371 | {
1372 | "cell_type": "code",
1373 | "execution_count": null,
1374 | "metadata": {
1375 | "scrolled": true
1376 | },
1377 | "outputs": [],
1378 | "source": [
1379 | "# clusters\n",
1380 | "for continent in continents:\n",
1381 | " print(continent)\n",
1382 | " print(partition_C[continent])\n",
1383 | " print('-------------------------')\n",
1384 | "\n",
1385 | "g_part = open('distance_cor/goals/partition_continents.pkl', 'wb')\n",
1386 | "pickle.dump(partition_C, g_part)\n",
1387 | "g_part.close()"
1388 | ]
1389 | },
1390 | {
1391 | "cell_type": "code",
1392 | "execution_count": null,
1393 | "metadata": {},
1394 | "outputs": [],
1395 | "source": [
1396 | "# centralities\n",
1397 | "for continent in continents:\n",
1398 | " print(continent)\n",
1399 | " print(centrality_C[continent])\n",
1400 | " print('-------------------------')\n",
1401 | "\n",
1402 | "g_cent = open('distance_cor/goals/centrality_continents.pkl', 'wb')\n",
1403 | "pickle.dump(centrality_C, g_cent)\n",
1404 | "g_cent.close()"
1405 | ]
1406 | },
1407 | {
1408 | "cell_type": "code",
1409 | "execution_count": null,
1410 | "metadata": {},
1411 | "outputs": [],
1412 | "source": [
1413 | "# degrees\n",
1414 | "for continent in continents:\n",
1415 | " print(continent)\n",
1416 | " print(degree_C[continent])\n",
1417 | " print('-------------------------')\n",
1418 | "\n",
1419 | "g_deg = open('distance_cor/goals/degree_continents.pkl', 'wb')\n",
1420 | "pickle.dump(degree_C, g_deg)\n",
1421 | "g_deg.close()"
1422 | ]
1423 | },
1424 | {
1425 | "cell_type": "code",
1426 | "execution_count": null,
1427 | "metadata": {
1428 | "scrolled": true
1429 | },
1430 | "outputs": [],
1431 | "source": [
1432 | "# densities\n",
1433 | "for continent in continents:\n",
1434 | " print(continent)\n",
1435 | " print(density_C[continent])\n",
1436 | " print('-------------------------')\n",
1437 | " \n",
1438 | "g_dens = open('distance_cor/goals/density_continents.pkl', 'wb')\n",
1439 | "pickle.dump(degree_C, g_dens)\n",
1440 | "g_dens.close()"
1441 | ]
1442 | },
1443 | {
1444 | "cell_type": "markdown",
1445 | "metadata": {},
1446 | "source": [
1447 | "### Eigenvector visualisation"
1448 | ]
1449 | },
1450 | {
1451 | "cell_type": "code",
1452 | "execution_count": null,
1453 | "metadata": {},
1454 | "outputs": [],
1455 | "source": [
1456 | "def get_image(goal):\n",
1457 | " return OffsetImage(plt.imread('utils/images/E_SDG goals_icons-individual-rgb-{}.png'.format(goal)), zoom=0.06)"
1458 | ]
1459 | },
1460 | {
1461 | "cell_type": "code",
1462 | "execution_count": null,
1463 | "metadata": {
1464 | "scrolled": true
1465 | },
1466 | "outputs": [],
1467 | "source": [
1468 | "for continent in cor_goals_continents_2.keys():\n",
1469 | " # separating goals from their centralities\n",
1470 | " x = []\n",
1471 | " y = []\n",
1472 | " for cent in centrality_C[continent]:\n",
1473 | " x.append(cent[0])\n",
1474 | " y.append(float(cent[1]))\n",
1475 | "\n",
1476 | " fig, ax = plt.subplots(figsize=(24,16))\n",
1477 | " #plt.tight_layout()\n",
1478 | " plt.title('{}'.format(continent), y=1.05, fontdict={'fontsize': 52})\n",
1479 | " ax.scatter(x, y) \n",
1480 | " \n",
1481 | " # adding images\n",
1482 | " for x0, y0, goal in zip(x, y, list(nodes)):\n",
1483 | " ab = AnnotationBbox(get_image(goal), (x0, y0), frameon=False)\n",
1484 | " ax.add_artist(ab)\n",
1485 | "\n",
1486 | " ax.set_xticks([])\n",
1487 | " ax.set_yticklabels([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], fontsize=28)\n",
1488 | " ax.yaxis.grid()\n",
1489 | " ax.set_ylim(0, 0.75)\n",
1490 | " ax.set_ylabel('Eigenvector centrality', labelpad=24, fontdict={'fontsize': 38})\n",
1491 | " ax.set_xlabel('Variables (SDGs + climate change)', labelpad=54, fontdict={'fontsize': 38})\n",
1492 | " \n",
1493 | " plt.savefig('distance_cor/goals/{}_eigenvector_centrality.png'.format(continent), format='png')\n",
1494 | " \n",
1495 | " plt.show()"
1496 | ]
1497 | },
1498 | {
1499 | "cell_type": "markdown",
1500 | "metadata": {},
1501 | "source": [
1502 | "### Cluster visualisation"
1503 | ]
1504 | },
1505 | {
1506 | "cell_type": "code",
1507 | "execution_count": null,
1508 | "metadata": {
1509 | "scrolled": true
1510 | },
1511 | "outputs": [],
1512 | "source": [
1513 | "# plotting clusters in networks with weighted edges\n",
1514 | "\n",
1515 | "from matplotlib.patches import Polygon\n",
1516 | "from matplotlib.collections import PatchCollection\n",
1517 | "\n",
1518 | "layout = 'multipartite'\n",
1519 | "\n",
1520 | "for continent in cor_goals_continents_2.keys():\n",
1521 | " G_C = nx.Graph()\n",
1522 | "\n",
1523 | " for key, value in dcor_dict_g[continent].items():\n",
1524 | " G_C.add_edge(int(key[0]), int(key[1]), weight=value[0], color=sns.color_palette(\"Reds\", 100)[int(np.around(value[0]*100))], alpha=value[0])\n",
1525 | " \n",
1526 | " for node in nodes:\n",
1527 | " G_C.nodes[node]['subset'] = p_C[continent][node]\n",
1528 | " \n",
1529 | " if layout == 'circular':\n",
1530 | " pos = nx.circular_layout(G_C)\n",
1531 | " elif layout == 'spring':\n",
1532 | " pos = nx.spring_layout(G_C, iterations=100, seed=42)\n",
1533 | " elif layout == 'multipartite':\n",
1534 | " pos = nx.multipartite_layout(G_C, align='vertical')\n",
1535 | " \n",
1536 | " plt.figure(figsize=(24,16))\n",
1537 | "\n",
1538 | " # nodes\n",
1539 | " nx.draw_networkx_nodes(G_C, pos, node_size=1000)\n",
1540 | "\n",
1541 | " # labels\n",
1542 | " nx.draw_networkx_labels(G_C, pos, font_size=46, font_family='sans-serif')\n",
1543 | "\n",
1544 | " nodes = G_C.nodes()\n",
1545 | " edges = G_C.edges()\n",
1546 | " colors = [G_C[u][v]['color'] for u,v in edges]\n",
1547 | " weights = [G_C[u][v]['weight'] for u,v in edges]\n",
1548 | "\n",
1549 | " nx.draw_networkx(G_C, pos, with_labels=False, edgelist=edges, edge_color=colors, node_color='white', node_size=1000, width=np.multiply(weights,25))\n",
1550 | "\n",
1551 | " ax=plt.gca()\n",
1552 | " fig=plt.gcf()\n",
1553 | " trans = ax.transData.transform\n",
1554 | " trans_axes = fig.transFigure.inverted().transform\n",
1555 | " imsize = 0.08 # this is the image size\n",
1556 | " plt.title('{}'.format(continent), y=1.05, fontdict={'fontsize': 52})\n",
1557 | "\n",
1558 | " for node in G_C.nodes():\n",
1559 | " x,y = pos[node] \n",
1560 | " xx,yy = trans((x,y)) # figure coordinates\n",
1561 | " xa,ya = trans_axes((xx,yy)) # axes coordinates\n",
1562 | " a = plt.axes([xa-imsize/2.0,ya-imsize/2.0, imsize, imsize])\n",
1563 | " a.imshow(mpimg.imread('utils/images/E_SDG goals_icons-individual-rgb-{}.png'.format(node)))\n",
1564 | " a.axis('off')\n",
1565 | " \n",
1566 | " # finding clusters with maximum modularity\n",
1567 | " clusters = []\n",
1568 | " for com, goals in partition_C[continent].items():\n",
1569 | " position = []\n",
1570 | " for goal in goals:\n",
1571 | " x,y = pos[goal]\n",
1572 | " position.append((x,y))\n",
1573 | " \n",
1574 | " positions = []\n",
1575 | " for i in range(6000):\n",
1576 | " np.random.shuffle(position)\n",
1577 | " positions.extend(position)\n",
1578 | " \n",
1579 | " # polygens\n",
1580 | " polygon = Polygon(positions, closed=False)\n",
1581 | " clusters.append(polygon)\n",
1582 | " \n",
1583 | " np.random.seed(72)\n",
1584 | " colors = 100*np.random.rand(len(clusters))\n",
1585 | " p = PatchCollection(clusters, alpha=0.4)\n",
1586 | " p.set_array(np.array(colors))\n",
1587 | " ax.add_collection(p)\n",
1588 | " \n",
1589 | " plt.axis('off')\n",
1590 | " ax.axis('off')\n",
1591 | " \n",
1592 | " plt.savefig('distance_cor/goals/{}_{}_network_logos_cluster.png'.format(continent, layout), format='png')\n",
1593 | "\n",
1594 | " plt.show()"
1595 | ]
1596 | },
1597 | {
1598 | "cell_type": "code",
1599 | "execution_count": null,
1600 | "metadata": {},
1601 | "outputs": [],
1602 | "source": []
1603 | },
1604 | {
1605 | "cell_type": "code",
1606 | "execution_count": null,
1607 | "metadata": {},
1608 | "outputs": [],
1609 | "source": []
1610 | }
1611 | ],
1612 | "metadata": {
1613 | "kernelspec": {
1614 | "display_name": "Python 3.7 - Spark (local)",
1615 | "language": "python",
1616 | "name": "spark-3-python"
1617 | },
1618 | "language_info": {
1619 | "codemirror_mode": {
1620 | "name": "ipython",
1621 | "version": 3
1622 | },
1623 | "file_extension": ".py",
1624 | "mimetype": "text/x-python",
1625 | "name": "python",
1626 | "nbconvert_exporter": "python",
1627 | "pygments_lexer": "ipython3",
1628 | "version": "3.7.7"
1629 | }
1630 | },
1631 | "nbformat": 4,
1632 | "nbformat_minor": 2
1633 | }
1634 |
--------------------------------------------------------------------------------
/3_distance_cor_groups.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Non-linear dependencies amongst the SDGs and climate change by distance correlation\n",
8 | "\n",
9 | "We start with investigating dependencies amongst the SDGs on different levels. The method how we investigate these dependencies should take as few assumptions as possible. So, a Pearson linear correlation coefficient or a rank correlation coefficient are not our choice since they assume linearity and/or monotony, respectively.\n",
10 | "\n",
11 | "We choose to compute the [distance correlation](https://projecteuclid.org/euclid.aos/1201012979), precisely the [partial distance correlation](https://projecteuclid.org/download/pdfview_1/euclid.aos/1413810731), because of the following properties:\n",
12 | "1. we have an absolute measure of dependence ranging from $0$ to $1$, $0 \\leq \\mathcal{R}(X,Y) \\leq 1$\n",
13 | "2. $\\mathcal{R}(X,Y) = 0$ if and only if $X$ and $Y$ are independent,\n",
14 | "3. $\\mathcal{R}(X,Y) = \\mathcal{R}(Y,X)$\n",
15 | "4. we are able to investigate non-linear and non-monotone relationships,\n",
16 | "5. we can find dependencies between indicators with differently many measurements,\n",
17 | "6. the only assumptions we need to take is that probability distributions have finite first moments.\n",
18 | "\n",
19 | "The conditional distance correlation has the advantage that we ignore the influence of any other targets or goals when we compute the correlation between any two targets or goals. This procedure is also called controlling for confounders.\n",
20 | "\n",
21 | "The **distance correlation** is defined as:\n",
22 | "\n",
23 | "$$\n",
24 | "\\mathcal{R}^2(X,Y) = \\begin{cases}\n",
25 | "\\frac{\\mathcal{V}^2 (X,Y)}{\\sqrt{\\mathcal{V}^2 (X)\\mathcal{V}^2 (Y)}} &\\text{, if $\\mathcal{V}^2 (X)\\mathcal{V}^2 (Y) > 0$} \\\\\n",
26 | "0 &\\text{, if $\\mathcal{V}^2 (X)\\mathcal{V}^2 (Y) = 0$}\n",
27 | "\\end{cases}\n",
28 | "$$\n",
29 | "\n",
30 | "\n",
31 | "where\n",
32 | "\n",
33 | "\n",
34 | "$$\n",
35 | "\\mathcal{V}^2 (X,Y) = \\| f_{X,Y}(t) - f_X(t)f_Y(t) \\|^2\n",
36 | "$$\n",
37 | "\n",
38 | "\n",
39 | "is the distance covariance with **characteristic functions** $f(t)$. Bear in mind that characteristic functions include the imaginary unit $i$, $i^2 = -1$:\n",
40 | "\n",
41 | "$$\n",
42 | "f_X(t) = \\mathbb{E}[e^{itX}]\n",
43 | "$$\n",
44 | "\n",
45 | "Thus, we are in the space of complex numbers $\\mathbb{C}$. Unfortunately, this means we can most likely not find exact results, but we'll get back to this later under Estimators.\n",
46 | "\n",
47 | "The **conditional distance correlation** is defined as:\n",
48 | "\n",
49 | "$$\n",
50 | "\\mathcal{R}^2(X,Y \\ | \\ Z) = \\begin{cases}\n",
51 | "\\frac{\\mathcal{R}^2 (X,Y) - \\mathcal{R}^2 (X,Z) \\mathcal{R}^2 (Y,Z)}{\\sqrt{1 - \\mathcal{R}^4 (X,Z)} \\sqrt{1 - \\mathcal{R}^4 (Y,Z)}} &\\text{, if $\\mathcal{R}^4 (X,Z) \\neq 1$ and $\\mathcal{R}^4 (Y,Z) \\neq 1$} \\\\\n",
52 | "0 &\\text{, if $\\mathcal{R}^4 (X,Z) = 1$ and $\\mathcal{R}^4 (Y,Z) = 1$}\n",
53 | "\\end{cases}\n",
54 | "$$\n",
55 | "\n",
56 | "# Distance covariance\n",
57 | "Let's dismantle the distance covariance equation to know what we actually compute in the distance correlation:\n",
58 | "\n",
59 | "$$\n",
60 | "\\mathcal{V}^2 (X,Y) = \\| f_{X,Y}(t) - f_X(t) \\ f_Y(t) \\|^2 = \\frac{1}{c_p c_q} \\int_{\\mathbb{R}^{p+q}} \\frac{| f_{X,Y}(t) - f_X(t)f_Y(t) |^2}{| t |_p^{1+p} \\ | t |_q^{1+q}} dt\n",
61 | "$$\n",
62 | "\n",
63 | "where\n",
64 | "\n",
65 | "$$\n",
66 | "c_d = \\frac{\\pi^{(1+d)/2}}{\\Gamma \\Big( (1+d)/2 \\Big)}\n",
67 | "$$\n",
68 | "\n",
69 | "where the (complete) Gamma function $\\Gamma$ is\n",
70 | "\n",
71 | "$$\n",
72 | "\\Gamma (z) = \\int_0^{\\infty} x^{z-1} \\ e^{-x} \\ dx\n",
73 | "$$\n",
74 | "\n",
75 | "with $z \\in \\mathbb{R}^{+}$. \n",
76 | "\n",
77 | "$p$ and $q$ are the samples of time-series. We can see this as a random vector with multiple samples available for each time point. However, the number of samples for time points must not vary over the same time-series. We can write this as: \n",
78 | "\n",
79 | "$$X \\ \\text{in} \\ \\mathbb{R}^p$$\n",
80 | "\n",
81 | "$$Y \\ \\text{in} \\ \\mathbb{R}^q$$\n",
82 | "\n",
83 | "\n",
84 | "A preliminary conclusion of this formulation: **we can compute dependencies between time-series with different numbers of samples**. \n",
85 | "\n",
86 | "But we still have some terms in the distance covariance $\\mathcal{V}^2 (X,Y)$ which we need to define:\n",
87 | "\n",
88 | "$ | t |_p^{1+p} $ is the Euclidean distance of $t$ in $\\mathbb{R}^p$, $ | t |_q^{1+q} $ is the Euclidean distance of $t$ in $\\mathbb{R}^q$.\n",
89 | "\n",
90 | "The numerator in the integral of $\\mathcal{V}^2 (X,Y)$ is:\n",
91 | "$$\n",
92 | "| f_{X,Y}(t) - f_X(t) \\ f_Y(t) |^2 = \\Big( 1- |f_X(t) | ^2 \\Big) \\ \\Big( 1- |f_Y(t) |^2 \\Big)\n",
93 | "$$\n",
94 | "\n",
95 | "where $|f_X(t) |$ and $|f_Y(t) |$ are absolute random vectors of the characteristic functions $f(t)$ with $p$ and $q$ samples, respectively.\n",
96 | "\n",
97 | "\n",
98 | "## Estimators\n",
99 | "\n",
100 | "Since the characteristic functions include the imaginary unit $i$, we cannot recover the exact solution for the distance covariance. However, we can estimate it by a quite simple form. We compute these estimators according to [Huo & Szekely, 2016](https://arxiv.org/abs/1410.1503).\n",
101 | "\n",
102 | "We denote the pairwise distances of the $X$ observations by $a_{ij} := \\|X_i - X_j \\|$ and of the $Y$ observations by $b_{ij} = \\|Y_i - Y_j \\|$ for $i,j = 1, ..., n$, where $n$ is the number of measurements in $X$ and $Y$. The corresponding distance matrices are denoted by $(A_{ij})^n_{i,j=1}$ and $(B_{ij})^n_{i,j=1}$, where\n",
103 | "\n",
104 | "$$\n",
105 | "A_{ij} = \\begin{cases}\n",
106 | "a_{ij} - \\frac{1}{n} \\sum_{l=1}^n a_{il} - \\frac{1}{n} \\sum_{k=1}^n a_{kj} + \\frac{1}{n^2} \\sum_{k,l=1}^n a_{kl} & i \\neq j; \\\\\n",
107 | "0 & i = j.\n",
108 | "\\end{cases}\n",
109 | "$$\n",
110 | "\n",
111 | "and\n",
112 | "\n",
113 | "$$\n",
114 | "B_{ij} = \\begin{cases}\n",
115 | "b_{ij} - \\frac{1}{n} \\sum_{l=1}^n b_{il} - \\frac{1}{n} \\sum_{k=1}^n b_{kj} + \\frac{1}{n^2} \\sum_{k,l=1}^n b_{kl} & i \\neq j; \\\\\n",
116 | "0 & i = j.\n",
117 | "\\end{cases}\n",
118 | "$$\n",
119 | "\n",
120 | "\n",
121 | "Having computed these, we can estimate the sample distance covariance $\\hat{\\mathcal{V}}^2(X,Y)$ by\n",
122 | "\n",
123 | "$$\n",
124 | "\\hat{\\mathcal{V}}^2(X,Y) = \\frac{1}{n^2} \\sum_{i,j=1}^n A_{ij} \\ B_{ij}\n",
125 | "$$\n",
126 | "\n",
127 | "The corresponding sample variance $\\hat{\\mathcal{V}}^2(X)$ is consequently:\n",
128 | "\n",
129 | "$$\n",
130 | "\\hat{\\mathcal{V}}^2(X) = \\frac{1}{n^2} \\sum_{i,j=1}^n A^2_{ij}\n",
131 | "$$\n",
132 | "\n",
133 | "\n",
134 | "Then, we can scale these covariances to finally arrive at the sample distance correlation $\\hat{\\mathcal{R}}^2(X,Y)$:\n",
135 | "\n",
136 | "$$\n",
137 | "\\hat{\\mathcal{R}}^2(X,Y) = \\begin{cases}\n",
138 | "\\frac{\\hat{\\mathcal{V}}^2 (X,Y)}{\\sqrt{\\hat{\\mathcal{V}}^2 (X)\\hat{\\mathcal{V}}^2 (Y)}} &\\text{, if $\\hat{\\mathcal{V}}^2 (X)\\mathcal{V}^2 (Y) > 0$} \\\\\n",
139 | "0 &\\text{, if $\\hat{\\mathcal{V}}^2 (X)\\hat{\\mathcal{V}}^2 (Y) = 0$}\n",
140 | "\\end{cases}\n",
141 | "$$\n",
142 | "\n",
143 | "### Unbiased estimators\n",
144 | "These estimators are biased, but we can define unbiased estimators of the distance covariance $\\hat{\\mathcal{V}}^2(X,Y)$ and call them $\\Omega_n(x,y)$. We must first redefine our distance matrices $(A_{ij})^n_{i,j=1}$ and $(B_{ij})^n_{i,j=1}$, which we will call $(\\tilde{A}_{ij})^n_{i,j=1}$ and $(\\tilde{B}_{ij})^n_{i,j=1}$:\n",
145 | "\n",
146 | "$$\n",
147 | "\\tilde{A}_{ij} = \\begin{cases}\n",
148 | "a_{ij} - \\frac{1}{n-2} \\sum_{l=1}^n a_{il} - \\frac{1}{n-2} \\sum_{k=1}^n a_{kj} + \\frac{1}{(n-1)(n-2)} \\sum_{k,l=1}^n a_{kl} & i \\neq j; \\\\\n",
149 | "0 & i = j.\n",
150 | "\\end{cases}\n",
151 | "$$\n",
152 | "\n",
153 | "and\n",
154 | "\n",
155 | "$$\n",
156 | "\\tilde{B}_{ij} = \\begin{cases}\n",
157 | "b_{ij} - \\frac{1}{n-2} \\sum_{l=1}^n b_{il} - \\frac{1}{n-2} \\sum_{k=1}^n b_{kj} + \\frac{1}{(n-1)(n-2)} \\sum_{k,l=1}^n b_{kl} & i \\neq j; \\\\\n",
158 | "0 & i = j.\n",
159 | "\\end{cases}\n",
160 | "$$\n",
161 | "\n",
162 | "Finally, we can compute the unbiased estimator $\\Omega_n(X,Y)$ for $\\mathcal{V}^2(X,Y)$ as the dot product $\\langle \\tilde{A}, \\tilde{B} \\rangle$:\n",
163 | "\n",
164 | "$$\n",
165 | "\\Omega_n(X,Y) = \\langle \\tilde{A}, \\tilde{B} \\rangle = \\frac{1}{n(n-3)} \\sum_{i,j=1}^n \\tilde{A}_{ij} \\ \\tilde{B}_{ij}\n",
166 | "$$\n",
167 | "\n",
168 | "\n",
169 | "Interestingly, [Lyons (2013)](https://arxiv.org/abs/1106.5758) found another solution how not only the sample distance correlation can be computed, but also the population distance correlation without characteristic functions. This is good to acknowledge, but it is not necessary to focus on it. \n",
170 | "\n",
171 | "# Conditional distance covariance\n",
172 | "\n",
173 | "We start with computing the unbiased distance matrices $(\\tilde{A}_{ij})^n_{i,j=1}$, $(\\tilde{B}_{ij})^n_{i,j=1}$, and $(\\tilde{C}_{ij})^n_{i,j=1}$ for $X$, $Y$, and $Z$, respectively, as we have done previously for the distance covariance. We define the dot product\n",
174 | "\n",
175 | "$$\n",
176 | "\\Omega_n(X,Y) = \\langle \\tilde{A}, \\tilde{B} \\rangle = \\frac{1}{n(n-3)} \\sum_{i,j=1}^n \\tilde{A}_{ij} \\tilde{B}_{ij}\n",
177 | "$$\n",
178 | "\n",
179 | "and project the sample $x$ onto $z$ as \n",
180 | "\n",
181 | "$$\n",
182 | "P_z (x) = \\frac{\\langle \\tilde{A}, \\tilde{C} \\rangle}{\\langle \\tilde{C}, \\tilde{C} \\rangle} \\tilde{C} .\n",
183 | "$$\n",
184 | "\n",
185 | "The complementary projection is consequently\n",
186 | "\n",
187 | "$$\n",
188 | "P_{z^{\\bot}} (x) = \\tilde{A} - P_z (x) = \\tilde{A} - \\frac{\\langle \\tilde{A}, \\tilde{C} \\rangle}{\\langle \\tilde{C}, \\tilde{C} \\rangle} \\tilde{C} .\n",
189 | "$$\n",
190 | "\n",
191 | "Hence, the sample conditional distance covariance is\n",
192 | "\n",
193 | "$$\n",
194 | "\\hat{\\mathcal{V}}^2(X,Y \\ | \\ Z) = \\langle P_{z^{\\bot}} (x), P_{z^{\\bot}} (y) \\rangle .\n",
195 | "$$\n",
196 | "\n",
197 | "Then, we can scale these covariances to finally arrive at the sample conditional distance correlation $\\hat{\\mathcal{R}}^2(X,Y \\ | \\ Z)$:\n",
198 | "\n",
199 | "$$\n",
200 | "\\hat{\\mathcal{R}}^2(X,Y \\ | \\ Z) = \\begin{cases}\n",
201 | "\\frac{\\langle P_{z^{\\bot}} (x), P_{z^{\\bot}} (y) \\rangle}{\\| P_{z^{\\bot}} (x) \\| \\ \\| P_{z^{\\bot}} (y) \\|} &\\text{, if} \\ \\| P_{z^{\\bot}} (x) \\| \\ \\| P_{z^{\\bot}} (y) \\| \\neq 0 \\\\\n",
202 | "0 &\\text{, if} \\ \\| P_{z^{\\bot}} (x) \\| \\ \\| P_{z^{\\bot}} (y) \\| = 0\n",
203 | "\\end{cases}\n",
204 | "$$\n",
205 | "\n",
206 | "## Implementation\n",
207 | "For our computations, we'll use the packages [`dcor`](https://dcor.readthedocs.io/en/latest/?badge=latest) for the partial distance correlation and [`community`](https://github.com/taynaud/python-louvain) for the clustering."
208 | ]
209 | },
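{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following cell is a minimal `numpy` sketch of the unbiased estimator $\\Omega_n(X,Y)$ and the U-centred distance matrices $\\tilde{A}$ and $\\tilde{B}$ exactly as defined above. It is purely illustrative and runs on made-up toy arrays; all actual computations in this notebook rely on the `dcor` package."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# minimal sketch of the unbiased estimator Omega_n(X, Y) defined above (illustration only, toy data)\n",
"import numpy as np\n",
"\n",
"def u_centred(x):\n",
"    # pairwise Euclidean distances a_ij = ||x_i - x_j||\n",
"    d = np.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)\n",
"    n = d.shape[0]\n",
"    row = d.sum(axis=1, keepdims=True) / (n - 2)\n",
"    col = d.sum(axis=0, keepdims=True) / (n - 2)\n",
"    total = d.sum() / ((n - 1) * (n - 2))\n",
"    a_tilde = d - row - col + total\n",
"    np.fill_diagonal(a_tilde, 0.0)  # entries with i == j are set to zero\n",
"    return a_tilde\n",
"\n",
"def omega_n(x, y):\n",
"    # unbiased sample distance covariance <A~, B~> = 1/(n(n-3)) * sum_ij A~_ij * B~_ij\n",
"    a_tilde, b_tilde = u_centred(x), u_centred(y)\n",
"    n = a_tilde.shape[0]\n",
"    return (a_tilde * b_tilde).sum() / (n * (n - 3))\n",
"\n",
"np.random.seed(0)\n",
"x_toy = np.random.normal(size=(50, 3))                       # 50 observations with 3 measurements each\n",
"y_toy = x_toy[:, :2] + 0.1 * np.random.normal(size=(50, 2))  # dependent on x_toy, different dimension\n",
"print(omega_n(x_toy, y_toy))"
]
},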
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "import dcor\n",
217 | "import numpy as np\n",
218 | "import pickle\n",
219 | "import itertools\n",
220 | "import pandas as pd\n",
221 | "import os\n",
222 | "import math\n",
223 | "from tqdm.notebook import tqdm\n",
224 | "\n",
225 | "import matplotlib.pyplot as plt\n",
226 | "import seaborn as sns\n",
227 | "import networkx as nx\n",
228 | "import matplotlib.image as mpimg\n",
229 | "import matplotlib.pyplot as plt\n",
230 | "from matplotlib.offsetbox import OffsetImage, AnnotationBbox\n",
231 | "\n",
232 | "from community import community_louvain as community\n",
233 | "from scipy.spatial import distance\n",
234 | "\n",
235 | "from dcor._dcor_internals import _u_distance_matrix, u_complementary_projection\n",
236 | "from sklearn.manifold import MDS\n",
237 | "import gc\n",
238 | "import warnings \n",
239 | "warnings.filterwarnings('ignore')"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "### Loading standardised imputed data set\n",
247 | "We load first of all the standardised imputed data set which we have generated with the previous notebook."
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "#dict_all = pickle.load(open('utils/data/dict_all_wb.pkl', 'rb'))\n",
257 | "dict_all_std = pickle.load(open('utils/data/dict_all_wb_std.pkl', 'rb'))\n",
258 | "#indicators_values_i = pickle.load(open('utils/data/indicators_values_i_up_wb.pkl', 'rb'))\n",
259 | "targets_values_i = pickle.load(open('utils/data/targets_values_i_up_arr_wb.pkl', 'rb'))\n",
260 | "goals_values_i = pickle.load(open('utils/data/goals_values_i_up_arr_wb.pkl', 'rb'))"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "# check whether T appended\n",
270 | "len(targets_values_i['Belgium'])"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "# read amended csv file\n",
280 | "c = pd.read_csv('utils/countries_wb.csv', dtype=str, delimiter=';', header=None)\n",
281 | "countries = list(c[0])\n",
282 | "groups = pd.read_csv(r'utils/groups.csv')\n",
283 | "groups.replace({\"Democratic People's Republic of Korea\": \"Korea, Dem. People's Rep.\", 'Gambia': 'Gambia, The', 'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom', 'Congo': 'Congo, Rep.', 'Democratic Republic of the Congo': 'Congo, Dem. Rep.', 'Czechia': 'Czech Republic', 'Iran (Islamic Republic of)': 'Iran, Islamic Rep.', \"Côte d'Ivoire\": \"Cote d'Ivoire\", 'Kyrgyzstan': 'Kyrgyz Republic', \"Lao People's Democratic Republic\": 'Lao PDR', 'Republic of Moldova': 'Moldova', 'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.', 'Slovakia': 'Slovak Republic', 'Viet Nam': 'Vietnam', 'Egypt': 'Egypt, Arab Rep.', 'United Republic of Tanzania': 'Tanzania','United States of America': 'United States', 'Venezuela (Bolivarian Republic of)': 'Venezuela, RB', 'Yemen': 'Yemen, Rep.', 'Bahamas': 'Bahamas, The', 'Bolivia (Plurinational State of)': 'Bolivia'}, inplace=True)\n",
284 | "info = pd.read_csv(r'utils/wb_info.csv', header=None)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "# removes some countries in-place\n",
294 | "countries.remove('Micronesia, Fed. Sts.')\n",
295 | "groups['Global South'].drop(index=1, inplace=True)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "We later compute the correlations on an indicator level, but this is too detailed for any network visualisation and for an overarching understanding. Hence, we group here all sub-indicators first on an indicator-level. Then, we compute the distance correlations for the indicators, targets and goals.\n",
303 | "\n",
304 | "We work with the `info` file again, so we don't need to assign all of this by hand."
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "# check\n",
314 | "info"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {
321 | "scrolled": true
322 | },
323 | "outputs": [],
324 | "source": [
325 | "# check\n",
326 | "#targets_values_i['France'].tail()"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {},
332 | "source": [
333 | "We would like to have values for targets, so we must, first of all, generate a list of all unique **targets**."
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "targets = list(info[4].unique())\n",
343 | "\n",
344 | "dict_targets = {}\n",
345 | "\n",
346 | "for target in targets:\n",
347 | " t = info[0].where(info[4] == target)\n",
348 | "\n",
349 | " dict_targets[target] = [i for i in t if str(i) != 'nan']"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "#check \n",
359 | "dict_targets['1.2']"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "Finally we also generate a list of all unique **goals**."
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 | "goals = list(info[3].unique())\n",
376 | "\n",
377 | "dict_goals = {}\n",
378 | "\n",
379 | "for goal in goals:\n",
380 | " g = info[4].where(info[3] == goal)\n",
381 | "\n",
382 | " dict_goals[goal] = [t for t in g if str(t) != 'nan']\n",
383 | " dict_goals[goal] = list(set(dict_goals[goal]))"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": null,
389 | "metadata": {
390 | "scrolled": true
391 | },
392 | "outputs": [],
393 | "source": [
394 | "#check \n",
395 | "print(dict_goals['13'])"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "## Distance correlations between goals\n",
403 | "\n",
404 | "The next step is to compute the distance correlations on a goal-level.\n",
405 | "\n",
406 | "We work with the **concatenated time-series** to compute the conditioned distance correlation directly on goal-level data. Visually speaking, this means that we fit one non-linear function to the data for all targets of these two goals. Since goals often have diverse targets, this may end up in fitting a non-linear curve to very noisy data.\n",
407 | "\n",
408 | "## Working with concatenated time-series\n",
409 | "\n",
410 | "### Conditioning iteratively on subsets of joint distributions of all goals\n",
411 | "We condition pairs of two goals iteratively on subsets of all remaining goals. We start with conditioning on the empty set, i.e. we compute the pairwise distance correlation first. Afterwards, we increase the set to condition on until we have reached the set of all remaining 15 goals to condition on. These sets are represented by the joint distributions of the goals entailed in them."
412 | ]
413 | },
414 | {
415 | "cell_type": "markdown",
416 | "metadata": {},
417 | "source": [
418 | "We need to condition on all **subsets** of these lists of SDGs we condition on to find the dependence which solely stems from either of the two SDGs we condition the others on:"
419 | ]
420 | },
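{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a miniature illustration of this procedure (with a made-up set of only four goals), the pair $(1, 2)$ is first evaluated without conditioning and then conditioned on every non-empty subset of the remaining goals $\\{3, 4\\}$:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# miniature illustration with four made-up goals; the real computation below uses all goals\n",
"toy_goals = ['1', '2', '3', '4']\n",
"toy_pair = ('1', '2')\n",
"remaining = [g for g in toy_goals if g not in toy_pair]\n",
"\n",
"conditioning_sets = [()]  # the empty set corresponds to the pairwise distance correlation\n",
"for r in range(1, len(remaining) + 1):\n",
"    conditioning_sets.extend(itertools.combinations(remaining, r))\n",
"\n",
"print(conditioning_sets)  # [(), ('3',), ('4',), ('3', '4')]"
]
},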
421 | {
422 | "cell_type": "code",
423 | "execution_count": null,
424 | "metadata": {},
425 | "outputs": [],
426 | "source": [
427 | "def combinations(iterable, r):\n",
428 | " # combinations('ABCD', 2) --> AB AC AD BC BD CD\n",
429 | " # combinations(range(4), 3) --> 012 013 023 123\n",
430 | " pool = tuple(iterable)\n",
431 | " n = len(pool)\n",
432 | " if r > n:\n",
433 | " return\n",
434 | " indices = list(range(r))\n",
435 | " yield list(pool[i] for i in indices)\n",
436 | " while True:\n",
437 | " for i in reversed(range(r)):\n",
438 | " if indices[i] != i + n - r:\n",
439 | " break\n",
440 | " else:\n",
441 | " return\n",
442 | " indices[i] += 1\n",
443 | " for j in range(i+1, r):\n",
444 | " indices[j] = indices[j-1] + 1\n",
445 | " yield list(pool[i] for i in indices)"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "def combinations_tuple(iterable, r):\n",
455 | " # combinations('ABCD', 2) --> AB AC AD BC BD CD\n",
456 | " # combinations(range(4), 3) --> 012 013 023 123\n",
457 | " pool = tuple(iterable)\n",
458 | " n = len(pool)\n",
459 | " if r > n:\n",
460 | " return\n",
461 | " indices = list(range(r))\n",
462 | " yield tuple(pool[i] for i in indices)\n",
463 | " while True:\n",
464 | " for i in reversed(range(r)):\n",
465 | " if indices[i] != i + n - r:\n",
466 | " break\n",
467 | " else:\n",
468 | " return\n",
469 | " indices[i] += 1\n",
470 | " for j in range(i+1, r):\n",
471 | " indices[j] = indices[j-1] + 1\n",
472 | " yield tuple(pool[i] for i in indices)"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": null,
478 | "metadata": {},
479 | "outputs": [],
480 | "source": [
481 | "def product(pool_0, pool_1):\n",
482 | " #result = [[x, y]+[z] for x, y in pool_0 for z in pool_1 if x not in z and y not in z] # ~ 10 Mio rows\n",
483 | " result = [[x, y]+[z] for x, y in pool_0 for z in pool_1] # ~ 40 Mio rows\n",
484 | " for prod in result:\n",
485 | " yield tuple(prod)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": null,
491 | "metadata": {},
492 | "outputs": [],
493 | "source": [
494 | "# create list out of all unique combinations of goals\n",
495 | "g_combinations = list(combinations(goals, 2))\n",
496 | "conditions_g = []\n",
497 | "conditions_g_tuple = []\n",
498 | "for i in range(1, 18):\n",
499 | " conditions_g.extend(list(combinations(goals, i)))\n",
500 | " conditions_g_tuple.extend(tuple(combinations_tuple(goals, i)))\n",
501 | "\n",
502 | "# divide conditions_g_tuple into four sub-lists to save memory\n",
503 | "conditions_g_tuple_1 = conditions_g_tuple[:int(len(conditions_g_tuple)/4)]\n",
504 | "conditions_g_tuple_2 = conditions_g_tuple[int(len(conditions_g_tuple)/4)+1:2*int(len(conditions_g_tuple)/4)]\n",
505 | "conditions_g_tuple_3 = conditions_g_tuple[2*int(len(conditions_g_tuple)/4)+1:3*int(len(conditions_g_tuple)/4)]\n",
506 | "conditions_g_tuple_4 = conditions_g_tuple[3*int(len(conditions_g_tuple)/4)+1:]\n",
507 | " \n",
508 | "pairs = list(product(g_combinations, conditions_g_tuple))\n",
509 | "pairs_g0 = pd.DataFrame.from_records(pairs, columns=['pair_0', 'pair_1', 'condition'])\n",
510 | "\n",
511 | "pairs_1 = list(product(g_combinations, conditions_g_tuple_1))\n",
512 | "pairs_g0_1 = pd.DataFrame.from_records(pairs_1, columns=['pair_0', 'pair_1', 'condition'])\n",
513 | "pairs_2 = list(product(g_combinations, conditions_g_tuple_2))\n",
514 | "pairs_g0_2 = pd.DataFrame.from_records(pairs_2, columns=['pair_0', 'pair_1', 'condition'])\n",
515 | "pairs_3 = list(product(g_combinations, conditions_g_tuple_3))\n",
516 | "pairs_g0_3 = pd.DataFrame.from_records(pairs_3, columns=['pair_0', 'pair_1', 'condition'])\n",
517 | "pairs_4 = list(product(g_combinations, conditions_g_tuple_4))\n",
518 | "pairs_g0_4 = pd.DataFrame.from_records(pairs_4, columns=['pair_0', 'pair_1', 'condition'])"
519 | ]
520 | },
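{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough size check (assuming the 18 variables are the 17 SDGs plus temperature): there are $\\binom{18}{2}=153$ goal pairs and $2^{18}-2$ conditioning sets of size 1 to 17, which is where the \"~ 40 Mio rows\" estimate in `product` comes from."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# expected sizes of the objects created above (illustrative check)\n",
"from math import factorial\n",
"\n",
"def n_choose_k(n, k):\n",
"    return factorial(n) // (factorial(k) * factorial(n - k))\n",
"\n",
"n_pairs = n_choose_k(18, 2)                                  # 153 goal pairs\n",
"n_conditions = sum(n_choose_k(18, i) for i in range(1, 18))  # 2**18 - 2 conditioning sets\n",
"print(n_pairs, n_conditions, n_pairs * n_conditions)         # roughly 40 million pair-condition rows"
]
},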
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "# how many rows?\n",
528 | "print(len(pairs_g0))\n",
529 | "print(len(pairs_g0_1), len(pairs_g0_2), len(pairs_g0_3), len(pairs_g0_4))"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": null,
535 | "metadata": {},
536 | "outputs": [],
537 | "source": [
538 | "# adding empty condition set for pairwise dcor\n",
539 | "pairs_g1 = pd.DataFrame.from_records(data=g_combinations, columns=['pair_0', 'pair_1'])\n",
540 | "pairs_g1['condition'] = '0'"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "metadata": {},
546 | "source": [
547 | "# Groups"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "# data preparation\n",
557 | "groups_prep_g = {}\n",
558 | "\n",
559 | "for group in groups:\n",
560 | " print(group)\n",
561 | " \n",
562 | " groups_prep_g[group] = np.empty(18, dtype=object)\n",
563 | " \n",
564 | " for g, goal in enumerate(goals):\n",
565 | " g_list = []\n",
566 | " for country in groups[group].dropna():\n",
567 | " g_list.append(np.asarray(goals_values_i[country][g]))\n",
568 | "\n",
569 | " groups_prep_g[group][g] = np.asarray(g_list)"
570 | ]
571 | },
572 | {
573 | "cell_type": "markdown",
574 | "metadata": {},
575 | "source": [
576 | "Now we call these data in our `dcor` computations. We first compute the pairwise distance covariance and correlation, then the partial ones with conditioning on all the previously defined sets in `pairs_g`."
577 | ]
578 | },
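{
"cell_type": "markdown",
"metadata": {},
"source": [
"For orientation, a minimal illustration of the `dcor` calls used below on toy data (the arrays passed to `dcor` have samples in the rows; as in the following cells, both quantities are squared):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toy illustration of the dcor calls used below (samples in rows)\n",
"import numpy as np\n",
"import dcor\n",
"\n",
"rng = np.random.default_rng(0)\n",
"x = rng.normal(size=(50, 3))\n",
"z = rng.normal(size=(50, 3))\n",
"y = 0.5 * x + 0.5 * z + 0.1 * rng.normal(size=(50, 3))\n",
"\n",
"print(dcor.distance_correlation(x, y) ** 2)             # pairwise\n",
"print(dcor.partial_distance_correlation(x, y, z) ** 2)  # conditioned on z"
]
},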
579 | {
580 | "cell_type": "markdown",
581 | "metadata": {},
582 | "source": [
583 | "### Preparations\n",
584 | "Filtering out the conditions that contain goals $X$ (`pair_0`) or $Y$ (`pair_1`):"
585 | ]
586 | },
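{
"cell_type": "markdown",
"metadata": {},
"source": [
"The filtering itself was run outside this notebook; the checkpoint cell below loads its saved output from `utils/`. A minimal sketch of such a filter, assuming `condition` still holds the tuples of goal labels created above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch of the filtering that produced utils/pairs_g0_left_0.zip (not re-run here):\n",
"# keep only rows whose conditioning set contains neither pair_0 nor pair_1\n",
"keep = [\n",
"    (p0 not in cond) and (p1 not in cond)\n",
"    for p0, p1, cond in zip(pairs_g0['pair_0'], pairs_g0['pair_1'], pairs_g0['condition'])\n",
"]\n",
"pairs_g0_left_0_sketch = pairs_g0[keep].reset_index(drop=True)\n",
"# pairs_g0_left_0_sketch.to_csv('utils/pairs_g0_left_0.zip', index=False, compression='zip')"
]
},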
587 | {
588 | "cell_type": "code",
589 | "execution_count": null,
590 | "metadata": {},
591 | "outputs": [],
592 | "source": [
593 | "import multiprocessing as mp\n",
594 | "print(\"Number of processors: \", mp.cpu_count())"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": null,
600 | "metadata": {},
601 | "outputs": [],
602 | "source": [
603 | "# CHECKPOINT\n",
604 | "pairs_g0_left_0 = pd.read_csv('utils/pairs_g0_left_0.zip', dtype=str, compression='zip')\n",
605 | "\n",
606 | "pairs_g0_left_0_1 = pd.read_csv('utils/pairs_g0_left_0_1.zip', dtype=str, compression='zip')\n",
607 | "pairs_g0_left_0_2 = pd.read_csv('utils/pairs_g0_left_0_2.zip', dtype=str, compression='zip')\n",
608 | "pairs_g0_left_0_3 = pd.read_csv('utils/pairs_g0_left_0_3.zip', dtype=str, compression='zip')\n",
609 | "pairs_g0_left_0_4 = pd.read_csv('utils/pairs_g0_left_0_4.zip', dtype=str, compression='zip')"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": null,
615 | "metadata": {},
616 | "outputs": [],
617 | "source": [
618 | "# check\n",
619 | "pairs_g0_left_0_3.tail()"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": null,
625 | "metadata": {},
626 | "outputs": [],
627 | "source": [
628 | "pairs_g0_left_0.shape[0] / 153"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": null,
634 | "metadata": {},
635 | "outputs": [],
636 | "source": [
637 | "len(g_combinations)"
638 | ]
639 | },
640 | {
641 | "cell_type": "markdown",
642 | "metadata": {},
643 | "source": [
644 | "# With `multiprocessing` parallelisation\n",
645 | "\n",
646 | "\n",
647 | " \n",
648 | "### Partial distance correlation"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": null,
654 | "metadata": {},
655 | "outputs": [],
656 | "source": [
657 | "def partial_distance_cor(row):\n",
658 | " pair_0, pair_1, cond = row\n",
659 | " if pair_0=='T':\n",
660 | " pair_0 = 18\n",
661 | " if pair_1=='T':\n",
662 | " pair_1 = 18\n",
663 | " pair_0_array = groups_prep_g[group][int(pair_0)-1]\n",
664 | " pair_1_array = groups_prep_g[group][int(pair_1)-1]\n",
665 | " condition_array = conditions_dict[str(cond)].T\n",
666 | " \n",
667 | " return dcor.partial_distance_correlation(pair_0_array, pair_1_array, condition_array)**2"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": null,
673 | "metadata": {},
674 | "outputs": [],
675 | "source": [
676 | "#groups.drop(columns=['Global North', 'Global South'], inplace=True)"
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": null,
682 | "metadata": {},
683 | "outputs": [],
684 | "source": [
685 | "groups.columns"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": null,
691 | "metadata": {
692 | "scrolled": true
693 | },
694 | "outputs": [],
695 | "source": [
696 | "# groups\n",
697 | " \n",
698 | "dict_cor_goals_groups_2_cond = {}\n",
699 | "\n",
700 | "for group in ['Global South']:\n",
701 | " print(group)\n",
702 | "\n",
703 | " #dict_cor_goa_c = pairs_g0_left_0.copy(deep=True)\n",
704 | " dict_cor_goa_c = pairs_g0_left_0_4.copy(deep=True) # pairs_g0_left_0 has all non-empty conditional sets\n",
705 | "\n",
706 | " # preparing conditional set\n",
707 | " conditions_dict = {}\n",
708 | "\n",
709 | " #for cond in conditions_g_tuple:\n",
710 | " for cond in conditions_g_tuple_4:\n",
711 | " condition = []\n",
712 | "\n",
713 | " for c in cond:\n",
714 | " if c=='T':\n",
715 | " condition.extend(groups_prep_g[group][17].T)\n",
716 | " else:\n",
717 | " condition.extend(groups_prep_g[group][int(c)-1].T)\n",
718 | "\n",
719 | " conditions_dict[str(cond)] = np.asarray(condition)\n",
720 | " \n",
721 | " # partial distance correlation\n",
722 | " pool = mp.Pool(int(mp.cpu_count()/2))\n",
723 | "\n",
724 | " dict_cor_goa_c_list = dict_cor_goa_c.values.tolist()\n",
725 | "\n",
726 | " print('start dcor...')\n",
727 | "\n",
728 | " cor_results = pool.map(partial_distance_cor, dict_cor_goa_c_list, chunksize=1000)\n",
729 | "\n",
730 | " pool.close()\n",
731 | " pool.join()\n",
732 | "\n",
733 | " dict_cor_goa_c['dcor'] = cor_results\n",
734 | "\n",
735 | " print('...dcor done')\n",
736 | "\n",
737 | " # find minimum distance correlation between any two goals\n",
738 | " dict_cor_goa_con = dict_cor_goa_c.groupby(['pair_0', 'pair_1'])['dcor'].apply(list).reset_index(name='list_dcor')\n",
739 | "\n",
740 | " for i, row_con in dict_cor_goa_con.iterrows():\n",
741 | " dict_cor_goa_con.loc[i, 'min_dcor'] = min(dict_cor_goa_con.loc[i, 'list_dcor'])\n",
742 | "\n",
743 | " dict_cor_goa_con.drop(columns=['list_dcor'], inplace=True)\n",
744 | " \n",
745 | " # finding conditional set of minimum partial distance correlation\n",
746 | " dict_cor_goa_cond = dict_cor_goa_con.merge(dict_cor_goa_c, left_on='min_dcor', right_on='dcor').drop(['pair_0_y', 'pair_1_y', 'dcor'], axis=1).rename(columns={'pair_0_x': 'pair_0', 'pair_1_x': 'pair_1'})\n",
747 | " \n",
748 | " dict_cor_goals_groups_2_cond[group] = dict_cor_goa_cond\n",
749 | " \n",
750 | " # save every group separately to save memory\n",
751 | " #g_cor = open('distance_cor/goals/dict_cor_goals_groups_2_cond_{}.pkl'.format(group), 'wb')\n",
752 | " g_cor = open('distance_cor/goals/dict_cor_goals_groups_2_cond_{}_4.pkl'.format(group), 'wb')\n",
753 | " pickle.dump(dict_cor_goals_groups_2_cond, g_cor)\n",
754 | " g_cor.close()\n",
755 | "\n",
756 | " gc.collect()"
757 | ]
758 | },
759 | {
760 | "cell_type": "code",
761 | "execution_count": null,
762 | "metadata": {},
763 | "outputs": [],
764 | "source": [
765 | "# for Global South (disaggregated because of memory restrictions)\n",
766 | "dict_GS_1 = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Global South_1.pkl', 'rb'))\n",
767 | "dict_GS_2 = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Global South_2.pkl', 'rb'))\n",
768 | "dict_GS_3 = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Global South_3.pkl', 'rb'))\n",
769 | "dict_GS_4 = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Global South_4.pkl', 'rb'))\n",
770 | "\n",
771 | "cor_goals_continents_2_GS = pd.concat([dict_GS_1['Global South'], dict_GS_2['Global South'], dict_GS_3['Global South'], dict_GS_4['Global South']])\n",
772 | "\n",
773 | "# find minimum distance correlation between any two goals\n",
774 | "dict_cor_goa_con = cor_goals_continents_2_GS.groupby(['pair_0', 'pair_1'])['min_dcor'].apply(list).reset_index(name='list_dcor')\n",
775 | "\n",
776 | "for i, row_c in dict_cor_goa_con.iterrows():\n",
777 | " dict_cor_goa_con.loc[i, 'min_dcor'] = min(dict_cor_goa_con.loc[i, 'list_dcor'])\n",
778 | "\n",
779 | "dict_cor_goa_con.drop(columns=['list_dcor'], inplace=True)\n",
780 | "\n",
781 | "# finding conditional set of minimum partial distance correlation\n",
782 | "dict_cor_goa_cond = dict_cor_goa_con.merge(cor_goals_continents_2_GS, left_on='min_dcor', right_on='min_dcor').drop(['pair_0_y', 'pair_1_y'], axis=1).rename(columns={'pair_0_x': 'pair_0', 'pair_1_x': 'pair_1'})\n",
783 | "\n",
784 | "# save every entry region separately to save memory\n",
785 | "g_cor = open('distance_cor/goals/dict_cor_goals_groups_2_cond_Global South.pkl', 'wb')\n",
786 | "pickle.dump(dict_cor_goa_cond, g_cor)\n",
787 | "g_cor.close()"
788 | ]
789 | },
790 | {
791 | "cell_type": "code",
792 | "execution_count": null,
793 | "metadata": {},
794 | "outputs": [],
795 | "source": [
796 | "dict_GN = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Global North.pkl', 'rb'))\n",
797 | "dict_GS = {}\n",
798 | "dict_GS['Global South'] = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Global South.pkl', 'rb'))\n",
799 | "dict_LCD = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Least Developed Countries (LDC).pkl', 'rb'))\n",
800 | "dict_LLDC = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Land Locked Developing Countries (LLDC).pkl', 'rb'))\n",
801 | "dict_SIDS = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Small Island Developing States (SIDS).pkl', 'rb'))\n",
802 | "dict_G20 = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_G20.pkl', 'rb'))\n",
803 | "dict_EM = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Emerging Markets (BRICS + N-11).pkl', 'rb'))\n",
804 | "dict_OPEC = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_OPEC.pkl', 'rb'))\n",
805 | "dict_LI = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Low Income.pkl', 'rb'))\n",
806 | "dict_LMI = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Lower middle Income.pkl', 'rb'))\n",
807 | "dict_UMI = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_Upper middle Income.pkl', 'rb'))\n",
808 | "dict_HI = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2_cond_High Income.pkl', 'rb'))"
809 | ]
810 | },
811 | {
812 | "cell_type": "code",
813 | "execution_count": null,
814 | "metadata": {},
815 | "outputs": [],
816 | "source": [
817 | "dict_cor_goals_groups_2_condition = {**dict_GN, **dict_GS, **dict_LCD, **dict_LLDC, **dict_SIDS, **dict_G20, **dict_EM, **dict_OPEC, **dict_LI, **dict_LMI, **dict_UMI, **dict_HI}"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": null,
823 | "metadata": {},
824 | "outputs": [],
825 | "source": [
826 | "# check\n",
827 | "print(dict_cor_goals_groups_2_condition.keys())\n",
828 | "dict_cor_goals_groups_2_condition['Global South']"
829 | ]
830 | },
831 | {
832 | "cell_type": "markdown",
833 | "metadata": {},
834 | "source": [
835 | "### Pairwise distance correlation"
836 | ]
837 | },
838 | {
839 | "cell_type": "code",
840 | "execution_count": null,
841 | "metadata": {},
842 | "outputs": [],
843 | "source": [
844 | "def distance_cor(row):\n",
845 | " pair_0, pair_1 = row\n",
846 | " if pair_0=='T':\n",
847 | " pair_0 = 18\n",
848 | " if pair_1=='T':\n",
849 | " pair_1 = 18\n",
850 | " pair_0_array = groups_prep_g[group][int(pair_0)-1]\n",
851 | " pair_1_array = groups_prep_g[group][int(pair_1)-1]\n",
852 | " \n",
853 | " return dcor.distance_correlation(pair_0_array, pair_1_array)**2"
854 | ]
855 | },
856 | {
857 | "cell_type": "code",
858 | "execution_count": null,
859 | "metadata": {
860 | "scrolled": true
861 | },
862 | "outputs": [],
863 | "source": [
864 | "# groups\n",
865 | "dict_cor_goals_groups_2_pair = {}\n",
866 | "\n",
867 | "for group in groups:\n",
868 | " print(group)\n",
869 | " \n",
870 | " dict_cor_goa_c_pair = pairs_g1.drop(columns=['condition']).copy(deep=True) # pairs_g1 has empty conditional sets for pairwise dcor\n",
871 | " \n",
872 | " pool = mp.Pool(int(mp.cpu_count()/2))\n",
873 | " \n",
874 | " print('start dcor...')\n",
875 | " \n",
876 | " dict_cor_goa_c_pair_list = dict_cor_goa_c_pair.values.tolist()\n",
877 | " \n",
878 | " cor_results = pool.map(distance_cor, dict_cor_goa_c_pair_list, chunksize=1000)\n",
879 | " \n",
880 | " pool.close()\n",
881 | " pool.join()\n",
882 | " \n",
883 | " dict_cor_goa_c_pair['min_dcor_pair'] = cor_results\n",
884 | " \n",
885 | " print('...dcor done')\n",
886 | " \n",
887 | " dict_cor_goals_groups_2_pair[group] = dict_cor_goa_c_pair"
888 | ]
889 | },
890 | {
891 | "cell_type": "code",
892 | "execution_count": null,
893 | "metadata": {},
894 | "outputs": [],
895 | "source": [
896 | "# check\n",
897 | "dict_cor_goals_groups_2_pair['Least Developed Countries (LDC)']"
898 | ]
899 | },
900 | {
901 | "cell_type": "code",
902 | "execution_count": null,
903 | "metadata": {},
904 | "outputs": [],
905 | "source": [
906 | "# merge dictionaries\n",
907 | "dict_cor_goals_groups_2 = {}\n",
908 | "\n",
909 | "for group in dict_cor_goals_groups_2_condition.keys():\n",
910 | " print(group)\n",
911 | " \n",
912 | " dict_cor_goals_groups_2[group] = pd.DataFrame(index=range(153), columns=['pair_0', 'pair_1', 'min_dcor', 'condition'])\n",
913 | " \n",
914 | " for i in dict_cor_goals_groups_2_pair[group].index:\n",
915 | " for j in dict_cor_goals_groups_2_condition[group].index:\n",
916 | " if dict_cor_goals_groups_2_pair[group].loc[i, 'pair_0']==dict_cor_goals_groups_2_condition[group].loc[j, 'pair_0'] and dict_cor_goals_groups_2_pair[group].loc[i, 'pair_1']==dict_cor_goals_groups_2_condition[group].loc[j, 'pair_1']:\n",
917 | " dict_cor_goals_groups_2[group].loc[i, 'pair_0'] = dict_cor_goals_groups_2_pair[group].loc[i, 'pair_0']\n",
918 | " dict_cor_goals_groups_2[group].loc[i, 'pair_1'] = dict_cor_goals_groups_2_pair[group].loc[i, 'pair_1']\n",
919 | " dict_cor_goals_groups_2[group].loc[i, 'min_dcor'] = min(dict_cor_goals_groups_2_pair[group].loc[i, 'min_dcor_pair'], dict_cor_goals_groups_2_condition[group].loc[j, 'min_dcor'])\n",
920 | " if dict_cor_goals_groups_2_pair[group].loc[i, 'min_dcor_pair'] < dict_cor_goals_groups_2_condition[group].loc[j, 'min_dcor']:\n",
921 | " dict_cor_goals_groups_2[group].loc[i, 'condition'] = 0\n",
922 | " else:\n",
923 | " dict_cor_goals_groups_2[group].loc[i, 'condition'] = dict_cor_goals_groups_2_condition[group].loc[j, 'condition']"
924 | ]
925 | },
926 | {
927 | "cell_type": "code",
928 | "execution_count": null,
929 | "metadata": {},
930 | "outputs": [],
931 | "source": [
932 | "# CHECKPOINT\n",
933 | "dict_cor_goals_groups_2 = pickle.load(open('distance_cor/goals/dict_cor_goals_groups_2.pkl', 'rb'))"
934 | ]
935 | },
936 | {
937 | "cell_type": "markdown",
938 | "metadata": {},
939 | "source": [
940 | "### Testing for statistical significance\n",
941 | "We calculate the p-values of our partial distance correlations, i.e., the probability that the null hypothesis of (partial) independence can be accepted."
942 | ]
943 | },
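{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy illustration of the permutation test used below; `num_resamples` sets the number of permutations and the returned object exposes the p-value:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toy example of the partial distance covariance permutation test (illustration only)\n",
"import numpy as np\n",
"import dcor\n",
"\n",
"rng = np.random.default_rng(1)\n",
"x = rng.normal(size=(40, 2))\n",
"z = rng.normal(size=(40, 2))\n",
"y = rng.normal(size=(40, 2))  # independent of x given z, so a large p-value is expected\n",
"\n",
"test = dcor.independence.partial_distance_covariance_test(x, y, z, num_resamples=200)\n",
"print(test.p_value)"
]
},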
944 | {
945 | "cell_type": "code",
946 | "execution_count": null,
947 | "metadata": {},
948 | "outputs": [],
949 | "source": [
950 | "for group in groups:\n",
951 | " print(group)\n",
952 | " dict_cor_goals_groups_2[group]['p-value'] = -1\n",
953 | " for r, row in dict_cor_goals_groups_2[group].iterrows():\n",
954 | " \n",
955 | " # preparing pair_0 and pair_1\n",
956 | " if row.pair_1=='T':\n",
957 | " row.pair_1 = 18\n",
958 | " pair_0_array = groups_prep_g[group][int(row.pair_0)-1]\n",
959 | " pair_1_array = groups_prep_g[group][int(row.pair_1)-1]\n",
960 | " \n",
961 | " # extracting conditional variables from column 'condition'\n",
962 | " cond_list = []\n",
963 | " for i in row.condition.split():\n",
964 | " newstr = ''.join((ch if ch in '0123456789.-eT' else ' ') for ch in i)\n",
965 | " cond_list.extend([i for i in newstr.split()])\n",
966 | "\n",
967 | " condition = []\n",
968 | " for c in cond_list:\n",
969 | " if c=='T':\n",
970 | " condition.extend(groups_prep_g[group][17].T)\n",
971 | " else:\n",
972 | " condition.extend(groups_prep_g[group][int(c)-1].T)\n",
973 | "\n",
974 | " cond_array = np.asarray(condition).T\n",
975 | " \n",
976 | " dict_cor_goals_groups_2[group].iloc[r, 4] = dcor.independence.partial_distance_covariance_test(pair_0_array, pair_1_array, cond_array, num_resamples=10000).p_value"
977 | ]
978 | },
979 | {
980 | "cell_type": "code",
981 | "execution_count": null,
982 | "metadata": {},
983 | "outputs": [],
984 | "source": [
985 | "# save\n",
986 | "if not os.path.exists('distance_cor'):\n",
987 | " os.mkdir('distance_cor')\n",
988 | " \n",
989 | "if not os.path.exists('distance_cor/goals'):\n",
990 | " os.mkdir('distance_cor/goals')\n",
991 | "\n",
992 | "g_cor = open('distance_cor/goals/dict_cor_goals_groups_2.pkl', 'wb')\n",
993 | "pickle.dump(dict_cor_goals_groups_2, g_cor)\n",
994 | "g_cor.close()"
995 | ]
996 | },
997 | {
998 | "cell_type": "code",
999 | "execution_count": null,
1000 | "metadata": {},
1001 | "outputs": [],
1002 | "source": [
1003 | "# saving as csv's\n",
1004 | "for group in groups:\n",
1005 | " dict_cor_goals_groups_2[group] = dict_cor_goals_groups_2[group][['pair_0', 'pair_1', 'min_dcor', 'p-value', 'condition']]\n",
1006 | " dict_cor_goals_groups_2[group]['p-value'] = dict_cor_goals_groups_2[group]['p-value'].astype(float).round(5)\n",
1007 | " dict_cor_goals_groups_2[group].min_dcor = dict_cor_goals_groups_2[group].min_dcor.astype(float).round(5)\n",
1008 | " dict_cor_goals_groups_2[group].to_csv('distance_cor/goals/conditions_{}.csv'.format(group))"
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "markdown",
1013 | "metadata": {},
1014 | "source": [
1015 | "We want to keep the minimum significant distance correlation of each pair of two goals, pairwise or conditioned on any potential subset.\n",
1016 | "\n",
1017 | "The last step is to insert these values into the right cell in a matrix."
1018 | ]
1019 | },
1020 | {
1021 | "cell_type": "code",
1022 | "execution_count": null,
1023 | "metadata": {},
1024 | "outputs": [],
1025 | "source": [
1026 | "cor_goals_groups_2 = {}\n",
1027 | "\n",
1028 | "for group in dict_cor_goals_groups_2.keys():\n",
1029 | " print(group)\n",
1030 | " cor_goals_groups_2[group] = pd.DataFrame(index=goals, columns=goals)\n",
1031 | "\n",
1032 | " for i in list(dict_cor_goals_groups_2[group].index):\n",
1033 | " goal_0 = dict_cor_goals_groups_2[group].loc[i, 'pair_0']\n",
1034 | " goal_1 = dict_cor_goals_groups_2[group].loc[i, 'pair_1']\n",
1035 | " \n",
1036 | " # take square root because we have previously squared the distance correlation\n",
1037 | " cor_goals_groups_2[group].loc[goal_1, goal_0] = np.sqrt(dict_cor_goals_groups_2[group].loc[i, 'min_dcor'])"
1038 | ]
1039 | },
1040 | {
1041 | "cell_type": "markdown",
1042 | "metadata": {},
1043 | "source": [
1044 | "In `cor_goals_groups_2` are the conditional distance correlations for all continents in a setting of 18 random vectors $X$, $Y$, and $Z_1, Z_2, ..., Z_{16}$, where $\\boldsymbol{Z}$ is the array containing all random vectors we want to condition on."
1045 | ]
1046 | },
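{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, a brief sketch of the quantity behind these entries, following Székely and Rizzo (2014): with $\\tilde{A}$, $\\tilde{B}$ and $\\tilde{C}$ the $U$-centred distance matrices of the samples of $X$, $Y$ and the concatenated conditioning array $\\boldsymbol{Z}$, the sample partial distance correlation is\n",
"\n",
"$$R^{*}(X, Y; \\boldsymbol{Z}) = \\frac{P_{\\boldsymbol{Z}^{\\perp}}(X) \\cdot P_{\\boldsymbol{Z}^{\\perp}}(Y)}{\\|P_{\\boldsymbol{Z}^{\\perp}}(X)\\| \\, \\|P_{\\boldsymbol{Z}^{\\perp}}(Y)\\|}, \\qquad P_{\\boldsymbol{Z}^{\\perp}}(X) = \\tilde{A} - \\frac{\\tilde{A} \\cdot \\tilde{C}}{\\tilde{C} \\cdot \\tilde{C}} \\, \\tilde{C},$$\n",
"\n",
"and analogously for $P_{\\boldsymbol{Z}^{\\perp}}(Y)$, where $\\cdot$ denotes the inner product on the space of $U$-centred matrices."
]
},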
1047 | {
1048 | "cell_type": "code",
1049 | "execution_count": null,
1050 | "metadata": {},
1051 | "outputs": [],
1052 | "source": [
1053 | "# save\n",
1054 | "g_cor = open('distance_cor/goals/dcor_goals_groups_2.pkl', 'wb')\n",
1055 | "pickle.dump(cor_goals_groups_2, g_cor)\n",
1056 | "g_cor.close()"
1057 | ]
1058 | },
1059 | {
1060 | "cell_type": "code",
1061 | "execution_count": null,
1062 | "metadata": {},
1063 | "outputs": [],
1064 | "source": [
1065 | "# CHECKPOINT\n",
1066 | "g_cor = pickle.load(open('distance_cor/goals/dcor_goals_groups_2.pkl', 'rb'))"
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "markdown",
1071 | "metadata": {},
1072 | "source": [
1073 | "## Visualisation on goal-level\n",
1074 | "Additionally to the matrices with numbers, we would also like to visualise these matrices and plot these correlations as networks."
1075 | ]
1076 | },
1077 | {
1078 | "cell_type": "code",
1079 | "execution_count": null,
1080 | "metadata": {
1081 | "scrolled": true
1082 | },
1083 | "outputs": [],
1084 | "source": [
1085 | "# groups\n",
1086 | "for group in dict_cor_goals_groups_2.keys():\n",
1087 | " # generate a mask for the upper triangle\n",
1088 | " mask = np.zeros_like(cor_goals_groups_2[group].fillna(0), dtype=np.bool)\n",
1089 | " mask[np.triu_indices_from(mask)] = True\n",
1090 | "\n",
1091 | " # set up the matplotlib figure\n",
1092 | " f, ax = plt.subplots(figsize=(25, 22))\n",
1093 | "\n",
1094 | " # generate a custom diverging colormap\n",
1095 | " cmap = sns.color_palette(\"Reds\", 100)\n",
1096 | "\n",
1097 | " # draw the heatmap with the mask and correct aspect ratio\n",
1098 | " sns.heatmap(cor_goals_groups_2[group].fillna(0), mask=mask, cmap=cmap, vmax=1, center=0.5, vmin=0,\n",
1099 | " square=True, linewidths=.5, cbar_kws={\"shrink\": .8})\n",
1100 | " \n",
1101 | " plt.title('{}'.format(group), fontdict={'fontsize': 52})\n",
1102 | " plt.savefig('distance_cor/goals/{}_cor_goals.png'.format(group))"
1103 | ]
1104 | },
1105 | {
1106 | "cell_type": "code",
1107 | "execution_count": null,
1108 | "metadata": {},
1109 | "outputs": [],
1110 | "source": [
1111 | "# data preparation for networkX\n",
1112 | "dcor_dict_g = {}\n",
1113 | "\n",
1114 | "for group in cor_goals_groups_2.keys():\n",
1115 | " dcor_dict_g[group] = {}\n",
1116 | "\n",
1117 | " for goalcombination in g_combinations:\n",
1118 | " dcor_dict_g[group][tuple(goalcombination)] = [cor_goals_groups_2[group].loc[goalcombination[1], goalcombination[0]], float(dict_cor_goals_groups_2[group].loc[(dict_cor_goals_groups_2[group]['pair_0']=='{}'.format(goalcombination[0])) & (dict_cor_goals_groups_2[group]['pair_1']=='{}'.format(goalcombination[1]))]['p-value'])]"
1119 | ]
1120 | },
1121 | {
1122 | "cell_type": "code",
1123 | "execution_count": null,
1124 | "metadata": {},
1125 | "outputs": [],
1126 | "source": [
1127 | "for group in cor_goals_groups_2.keys():\n",
1128 | " for key in dcor_dict_g[group].keys():\n",
1129 | " if key[1] == 'T':\n",
1130 | " dcor_dict_g[group][tuple((key[0], '18'))] = dcor_dict_g[group].pop(tuple((key[0], 'T')))\n",
1131 | " elif key[0] == 'T':\n",
1132 | " dcor_dict_g[group][tuple(('18', key[1]))] = dcor_dict_g[group].pop(tuple(('T', key[1])))"
1133 | ]
1134 | },
1135 | {
1136 | "cell_type": "code",
1137 | "execution_count": null,
1138 | "metadata": {
1139 | "scrolled": true
1140 | },
1141 | "outputs": [],
1142 | "source": [
1143 | "# plotting networks with weighted edges\n",
1144 | "\n",
1145 | "layout = 'circular'\n",
1146 | "\n",
1147 | "centrality_G = {} # dictionary to save centralities\n",
1148 | "degree_G = {} # dictionary to save degrees\n",
1149 | "density_G = {} # dictionary to save weighted densities\n",
1150 | "p_G = {} # auxiliary\n",
1151 | "partition_G = {} # dictionary to save clusters\n",
1152 | "\n",
1153 | "for group in cor_goals_groups_2.keys():\n",
1154 | " G_G = nx.Graph()\n",
1155 | " \n",
1156 | " for key, value in dcor_dict_g[group].items():\n",
1157 | " if value[1] <= 0.01:\n",
1158 | " w = value[0]\n",
1159 | " s = 'solid'\n",
1160 | " c = sns.color_palette('Reds', 100)[int(value[0]*100)]\n",
1161 | " elif 0.01 < value[1] <= 0.05:\n",
1162 | " w = value[0]\n",
1163 | " s = 'dashed'\n",
1164 | " c = sns.color_palette('Reds', 100)[int(value[0]*100)]\n",
1165 | " elif 0.05 < value[1] <= 0.1:\n",
1166 | " w = value[0]\n",
1167 | " s = 'dotted'\n",
1168 | " c = sns.color_palette('Reds', 100)[int(value[0]*100)]\n",
1169 | " else:\n",
1170 | " w = 0\n",
1171 | " s = 'solid'\n",
1172 | " c = 'white'\n",
1173 | " G_G.add_edge(int(key[0]), int(key[1]), style=s, weight=w, color=c, alpha=value[0])\n",
1174 | " \n",
1175 | " if layout == 'circular':\n",
1176 | " pos = nx.circular_layout(G_G)\n",
1177 | " elif layout == 'spring':\n",
1178 | " pos = nx.spring_layout(G_G)\n",
1179 | " \n",
1180 | " plt.figure(figsize=(24,16))\n",
1181 | " plt.tight_layout() \n",
1182 | " \n",
1183 | " # nodes\n",
1184 | " nx.draw_networkx_nodes(G_G, pos, node_size=1000)\n",
1185 | "\n",
1186 | " # labels\n",
1187 | " nx.draw_networkx_labels(G_G, pos, font_size=46, font_family='sans-serif')\n",
1188 | " \n",
1189 | " nodes = G_G.nodes()\n",
1190 | " edges = G_G.edges()\n",
1191 | " colors = [G_G[u][v]['color'] for u,v in edges]\n",
1192 | " weights = [G_G[u][v]['weight'] for u,v in edges]\n",
1193 | " alphas = [G_G[u][v]['alpha'] for u,v in edges]\n",
1194 | " styles = [G_G[u][v]['style'] for u,v in edges]\n",
1195 | "\n",
1196 | " nx.draw_networkx_nodes(G_G, pos, nodelist=nodes, node_color='white', node_size=1000)\n",
1197 | "\n",
1198 | " for i, edge in enumerate(edges):\n",
1199 | " pos_edge = {edge[0]: pos[edge[0]], edge[1]: pos[edge[1]]}\n",
1200 | " nx.draw_networkx_edges(G_G, pos_edge, edgelist=[edge], edge_color=colors[i], style=styles[i], width=np.multiply(weights[i],25)) #alpha=np.multiply(alphas[i],2.5))\n",
1201 | " \n",
1202 | " #nx.draw_networkx(G_G, pos, with_labels=False, edges=edges, edge_color=colors, node_color='white', node_size=1000, width=np.multiply(weights,25))\n",
1203 | "\n",
1204 | " ax=plt.gca()\n",
1205 | " fig=plt.gcf()\n",
1206 | " trans = ax.transData.transform\n",
1207 | " trans_axes = fig.transFigure.inverted().transform\n",
1208 | " imsize = 0.08 # this is the image size\n",
1209 | " plt.title('{}'.format(group), y=1.05, fontdict={'fontsize': 52})\n",
1210 | "\n",
1211 | " for node in G_G.nodes():\n",
1212 | " (x,y) = pos[node] \n",
1213 | " xx,yy = trans((x,y)) # figure coordinates\n",
1214 | " xa,ya = trans_axes((xx,yy)) # axes coordinates\n",
1215 | " a = plt.axes([xa-imsize/2.0,ya-imsize/2.0, imsize, imsize])\n",
1216 | " a.imshow(mpimg.imread('utils/images/E_SDG goals_icons-individual-rgb-{}.png'.format(node)))\n",
1217 | " a.axis('off')\n",
1218 | "\n",
1219 | "\n",
1220 | " plt.axis('off')\n",
1221 | " ax.axis('off')\n",
1222 | " \n",
1223 | " plt.savefig('distance_cor/goals/{}_{}_network_logos_main.png'.format(group, layout), format='png')\n",
1224 | "\n",
1225 | " plt.show()\n",
1226 | "\n",
1227 | " # weighted centrality\n",
1228 | " centr = nx.eigenvector_centrality(G_G, weight='weight', max_iter=100000)\n",
1229 | " centrality_G[group] = sorted((v, '{:0.2f}'.format(c)) for v, c in centr.items())\n",
1230 | " \n",
1231 | " degree_G[group] = dict(G_G.degree(weight='weight'))\n",
1232 | " \n",
1233 | " # weighted density\n",
1234 | " density_G[group] = 2 * np.sum(weights) / (len(nodes) * (len(nodes) - 1))\n",
1235 | " \n",
1236 | " # weighted clustering with Louvain algorithm\n",
1237 | " part_G = {}\n",
1238 | " modularity_G = {}\n",
1239 | " for i in range(100):\n",
1240 | " part_G[i] = community.best_partition(G_G, random_state=i)\n",
1241 | " modularity_G[i] = community.modularity(part_G[i], G_G)\n",
1242 | " \n",
1243 | " p_G[group] = part_G[max(modularity_G, key=modularity_G.get)]\n",
1244 | "\n",
1245 | " # having lists with nodes being in different clusters\n",
1246 | " partition_G[group] = {}\n",
1247 | " for com in set(p_G[group].values()) :\n",
1248 | " partition_G[group][com] = [nodes for nodes in p_G[group].keys() if p_G[group][nodes] == com]"
1249 | ]
1250 | },
1251 | {
1252 | "cell_type": "code",
1253 | "execution_count": null,
1254 | "metadata": {
1255 | "scrolled": true
1256 | },
1257 | "outputs": [],
1258 | "source": [
1259 | "# clusters\n",
1260 | "for group in cor_goals_groups_2.keys():\n",
1261 | " print(group)\n",
1262 | " print(partition_G[group])\n",
1263 | " print('-------------------------')\n",
1264 | "\n",
1265 | "g_part = open('distance_cor/goals/partition_groups.pkl', 'wb')\n",
1266 | "pickle.dump(partition_G, g_part)\n",
1267 | "g_part.close()"
1268 | ]
1269 | },
1270 | {
1271 | "cell_type": "code",
1272 | "execution_count": null,
1273 | "metadata": {
1274 | "scrolled": true
1275 | },
1276 | "outputs": [],
1277 | "source": [
1278 | "# centralities\n",
1279 | "for group in cor_goals_groups_2.keys():\n",
1280 | " print(group)\n",
1281 | " print(centrality_G[group])\n",
1282 | " print('-------------------------')\n",
1283 | "\n",
1284 | "g_cent = open('distance_cor/goals/centrality_groups.pkl', 'wb')\n",
1285 | "pickle.dump(centrality_G, g_cent)\n",
1286 | "g_cent.close()"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "code",
1291 | "execution_count": null,
1292 | "metadata": {
1293 | "scrolled": true
1294 | },
1295 | "outputs": [],
1296 | "source": [
1297 | "# degrees\n",
1298 | "for group in cor_goals_groups_2.keys():\n",
1299 | " print(group)\n",
1300 | " print(degree_G[group])\n",
1301 | " print('-------------------------')\n",
1302 | "\n",
1303 | "g_deg = open('distance_cor/goals/degree_groups.pkl', 'wb')\n",
1304 | "pickle.dump(degree_G, g_deg)\n",
1305 | "g_deg.close()"
1306 | ]
1307 | },
1308 | {
1309 | "cell_type": "code",
1310 | "execution_count": null,
1311 | "metadata": {
1312 | "scrolled": true
1313 | },
1314 | "outputs": [],
1315 | "source": [
1316 | "# densities\n",
1317 | "for group in cor_goals_groups_2.keys():\n",
1318 | " print(group)\n",
1319 | " print(density_G[group])\n",
1320 | " print('-------------------------')\n",
1321 | " \n",
1322 | "g_dens = open('distance_cor/goals/density_groups.pkl', 'wb')\n",
1323 | "pickle.dump(degree_G, g_dens)\n",
1324 | "g_dens.close()"
1325 | ]
1326 | },
1327 | {
1328 | "cell_type": "markdown",
1329 | "metadata": {},
1330 | "source": [
1331 | "### Eigenvector visualisation"
1332 | ]
1333 | },
1334 | {
1335 | "cell_type": "code",
1336 | "execution_count": null,
1337 | "metadata": {},
1338 | "outputs": [],
1339 | "source": [
1340 | "def get_image(goal):\n",
1341 | " return OffsetImage(plt.imread('utils/images/E_SDG goals_icons-individual-rgb-{}.png'.format(goal)), zoom=0.06)"
1342 | ]
1343 | },
1344 | {
1345 | "cell_type": "code",
1346 | "execution_count": null,
1347 | "metadata": {
1348 | "scrolled": true
1349 | },
1350 | "outputs": [],
1351 | "source": [
1352 | "for group in cor_goals_groups_2.keys():\n",
1353 | " # separating goals from their centralities\n",
1354 | " x = []\n",
1355 | " y = []\n",
1356 | " for cent in centrality_G[group]:\n",
1357 | " x.append(cent[0])\n",
1358 | " y.append(float(cent[1]))\n",
1359 | "\n",
1360 | " fig, ax = plt.subplots(figsize=(24,16))\n",
1361 | " #plt.tight_layout()\n",
1362 | " plt.title('{}'.format(group), y=1.05, fontdict={'fontsize': 52})\n",
1363 | " ax.scatter(x, y) \n",
1364 | " \n",
1365 | " # adding images\n",
1366 | " for x0, y0, goal in zip(x, y, list(nodes)):\n",
1367 | " ab = AnnotationBbox(get_image(goal), (x0, y0), frameon=False)\n",
1368 | " ax.add_artist(ab)\n",
1369 | "\n",
1370 | " ax.set_xticks([])\n",
1371 | " ax.set_yticklabels([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], fontsize=28)\n",
1372 | " ax.yaxis.grid()\n",
1373 | " ax.set_ylim(0, 0.75)\n",
1374 | " ax.set_ylabel('Eigenvector centrality', labelpad=24, fontdict={'fontsize': 38})\n",
1375 | " ax.set_xlabel('Variables (SDGs + climate change)', labelpad=54, fontdict={'fontsize': 38})\n",
1376 | " \n",
1377 | " plt.savefig('distance_cor/goals/{}_eigenvector_centrality.png'.format(group), format='png')\n",
1378 | " \n",
1379 | " plt.show()"
1380 | ]
1381 | },
1382 | {
1383 | "cell_type": "markdown",
1384 | "metadata": {},
1385 | "source": [
1386 | "### Cluster visualisation"
1387 | ]
1388 | },
1389 | {
1390 | "cell_type": "code",
1391 | "execution_count": null,
1392 | "metadata": {
1393 | "scrolled": true
1394 | },
1395 | "outputs": [],
1396 | "source": [
1397 | "# plotting clusters in networks with weighted edges\n",
1398 | "\n",
1399 | "from matplotlib.patches import Polygon\n",
1400 | "from matplotlib.collections import PatchCollection\n",
1401 | "\n",
1402 | "layout = 'multipartite'\n",
1403 | "\n",
1404 | "for group in cor_goals_groups_2.keys():\n",
1405 | " G_G = nx.Graph()\n",
1406 | "\n",
1407 | " for key, value in dcor_dict_g[group].items():\n",
1408 | " G_G.add_edge(int(key[0]), int(key[1]), weight=value[0], color=sns.color_palette(\"Reds\", 100)[int(np.around(value[0]*100))], alpha=value[0])\n",
1409 | " \n",
1410 | " for node in nodes:\n",
1411 | " G_G.nodes[node]['subset'] = p_G[group][node]\n",
1412 | " \n",
1413 | " if layout == 'circular':\n",
1414 | " pos = nx.circular_layout(G_G)\n",
1415 | " elif layout == 'spring':\n",
1416 | " pos = nx.spring_layout(G_G, iterations=100, seed=42)\n",
1417 | " elif layout == 'multipartite':\n",
1418 | " pos = nx.multipartite_layout(G_G)\n",
1419 | " \n",
1420 | " plt.figure(figsize=(24,16))\n",
1421 | "\n",
1422 | " # nodes\n",
1423 | " nx.draw_networkx_nodes(G_G, pos, node_size=1000)\n",
1424 | "\n",
1425 | " # labels\n",
1426 | " nx.draw_networkx_labels(G_G, pos, font_size=46, font_family='sans-serif')\n",
1427 | "\n",
1428 | " nodes = G_G.nodes()\n",
1429 | " edges = G_G.edges()\n",
1430 | " colors = [G_G[u][v]['color'] for u,v in edges]\n",
1431 | " weights = [G_G[u][v]['weight'] for u,v in edges]\n",
1432 | "\n",
1433 | " nx.draw_networkx(G_G, pos, with_labels=False, edgelist=edges, edge_color=colors, node_color='white', node_size=1000, width=np.multiply(weights,25))\n",
1434 | "\n",
1435 | " ax=plt.gca()\n",
1436 | " fig=plt.gcf()\n",
1437 | " trans = ax.transData.transform\n",
1438 | " trans_axes = fig.transFigure.inverted().transform\n",
1439 | " imsize = 0.08 # this is the image size\n",
1440 | " plt.title('{}'.format(group), y=1.05, fontdict={'fontsize': 52})\n",
1441 | "\n",
1442 | " for node in G_G.nodes():\n",
1443 | " x,y = pos[node] \n",
1444 | " xx,yy = trans((x,y)) # figure coordinates\n",
1445 | " xa,ya = trans_axes((xx,yy)) # axes coordinates\n",
1446 | " a = plt.axes([xa-imsize/2.0,ya-imsize/2.0, imsize, imsize])\n",
1447 | " a.imshow(mpimg.imread('utils/images/E_SDG goals_icons-individual-rgb-{}.png'.format(node)))\n",
1448 | " a.axis('off')\n",
1449 | " \n",
1450 | " # drawing polygon around nodes of clusters with maximum modularity\n",
1451 | " clusters = []\n",
1452 | " for com, goals in partition_G[group].items():\n",
1453 | " position = []\n",
1454 | " for goal in goals:\n",
1455 | " x,y = pos[goal]\n",
1456 | " position.append((x,y))\n",
1457 | " \n",
1458 | " positions = []\n",
1459 | " for i in range(6000):\n",
1460 | " np.random.shuffle(position)\n",
1461 | " positions.extend(position)\n",
1462 | " \n",
1463 | " # polygens\n",
1464 | " polygon = Polygon(positions, closed=False)\n",
1465 | " clusters.append(polygon)\n",
1466 | " \n",
1467 | " np.random.seed(72)\n",
1468 | " colors = 100*np.random.rand(len(clusters))\n",
1469 | " p = PatchCollection(clusters, alpha=0.4)\n",
1470 | " p.set_array(np.array(colors))\n",
1471 | " ax.add_collection(p)\n",
1472 | " \n",
1473 | " plt.axis('off')\n",
1474 | " ax.axis('off')\n",
1475 | " \n",
1476 | " plt.savefig('distance_cor/goals/{}_{}_network_logos_cluster.png'.format(group, layout), format='png')\n",
1477 | "\n",
1478 | " plt.show()"
1479 | ]
1480 | }
1495 | ],
1496 | "metadata": {
1497 | "kernelspec": {
1498 | "display_name": "Python 3.7 - Spark (local)",
1499 | "language": "python",
1500 | "name": "spark-3-python"
1501 | },
1502 | "language_info": {
1503 | "codemirror_mode": {
1504 | "name": "ipython",
1505 | "version": 3
1506 | },
1507 | "file_extension": ".py",
1508 | "mimetype": "text/x-python",
1509 | "name": "python",
1510 | "nbconvert_exporter": "python",
1511 | "pygments_lexer": "ipython3",
1512 | "version": "3.7.7"
1513 | }
1514 | },
1515 | "nbformat": 4,
1516 | "nbformat_minor": 2
1517 | }
1518 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Felix Laumann
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SDG climate change networks
2 |
3 | The website to explore the results in more detail is available [here](https://felix-laumann.github.io/SDG-networks/).
4 |
5 | The publication with this analysis is freely available at [The Lancet Planetary Health](https://www.thelancet.com/journals/lanplh/article/PIIS2542-5196(22)00070-5/fulltext).
6 |
7 | We retrieve SDG data from the [World Bank](http://datatopics.worldbank.org/sdgs/) for all SDGs except SDG 13, which we take from the [UN Statistics Division](https://unstats.un.org/sdgs/indicators/database/). Annual average temperatures at the country level are taken from the [CRU data set](https://crudata.uea.ac.uk/cru/data/hrg/cru_ts_4.04/crucy.2004161557.v4.04/countries/tmp/). The 17 SDGs and climate change, measured by annual average temperature, are treated as 18 variables whose inter-dependencies, often referred to as *interlinkages*, we want to analyse.
8 | We compute the [partial distance correlations](https://projecteuclid.org/euclid.aos/1201012979) between any two variables given any subset of the remaining variables.
9 |
10 |
11 | ## Notebooks
12 |
13 | - [1_Temperature](https://github.com/felix-laumann/SDG-dataset/blob/master/1_Temperature.ipynb) prepares the CRU data set to be appended to the SDG indicators.
14 |
15 | - [1_data_preparation](https://github.com/felix-laumann/SDG-dataset/blob/master/1_data_preparation.ipynb) splits the data into separate `csv` files per country, appends the temperature, and standardises the data.
16 |
17 | - [2_imputations_concatenating](https://github.com/felix-laumann/SDG-dataset/blob/master/2_imputations_concatenating.ipynb) imputes missing values in the data set with a weighted k-nearest-neighbour (w-kNN) algorithm, and averages and concatenates the data to target and goal level.
18 |
19 | - [3_distance_cor_continents](https://github.com/felix-laumann/SDG-dataset/blob/master/3_distance_cor_continents.ipynb) and [3_distance_cor_groups](https://github.com/felix-laumann/SDG-dataset/blob/master/3_distance_cor_groups.ipynb) compute the partial distance correlation between each unique pair of variables given any subset of the remaining variables, for continents and groups, respectively. These notebooks also include the visualisations of the networks and the eigenvector centrality figures.
20 |
21 | - [5_Additions](https://github.com/felix-laumann/SDG-networks/blob/master/5_Additions.ipynb) includes the visual exploration of dependencies.
22 |
--------------------------------------------------------------------------------
/TLPH_undirected_networks_supp_material.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/felix-laumann/SDG-networks/6ad2ec3541e345c5e9ba1eafa490317c78de445c/TLPH_undirected_networks_supp_material.pdf
--------------------------------------------------------------------------------