├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── ecomplexity ├── ComplexityData.py ├── __init__.py ├── calc_density.py ├── calc_proximity.py ├── coicog.py ├── ecomplexity.py └── proximity.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | Data/ 2 | archive/ 3 | .DS_Store 4 | ecomplexity/tests/ 5 | 6 | # Following lines created by https://www.gitignore.io/api/python 7 | 8 | ### Python ### 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | /build/ 21 | develop-eggs/ 22 | dist/ 23 | /dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | ### Python Patch ### 124 | .venv/ 125 | 126 | ### Python.VirtualEnv Stack ### 127 | # Virtualenv 128 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 129 | [Bb]in 130 | [Ii]nclude 131 | [Ll]ib 132 | [Ll]ib64 133 | [Ll]ocal 134 | [Ss]cripts 135 | pyvenv.cfg 136 | pip-selfcheck.json 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Center for International Development at Harvard University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the 
"Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Economic Complexity and Product Complexity 2 | 3 | By the Growth Lab at Harvard's Center for International Development 4 | 5 | This package is part of Harvard Growth Lab’s portfolio of software packages, digital products and interactive data visualizations. To browse our entire portfolio, please visit [growthlab.app](growthlab.app). To learn more about our research, please visit [Harvard Growth Lab’s home page](https://growthlab.cid.harvard.edu/). 6 | 7 | # About 8 | Python package to calculate economic complexity indices. 
9 | 10 | STATA implementation of the economic complexity index available at: 11 | 12 | Explore complexity and associated data using Harvard CID's Atlas tool: 13 | 14 | ## Tutorial 15 | 16 | **Installation**: 17 | At terminal: `pip install ecomplexity` 18 | 19 | If you wish to install the latest version of the package under development, you can install directly from GitHub: 20 | `pip install git+https://github.com/cid-harvard/py-ecomplexity@develop` 21 | 22 | **Usage**: 23 | 24 | ```python 25 | from ecomplexity import ecomplexity 26 | from ecomplexity import proximity 27 | 28 | # Import trade data from CID Atlas 29 | data_url = "https://intl-atlas-downloads.s3.amazonaws.com/country_hsproduct2digit_year.csv.zip" 30 | data = pd.read_csv(data_url, compression="zip", low_memory=False) 31 | data = data[['year','location_code','hs_product_code','export_value']] 32 | 33 | # Calculate complexity 34 | trade_cols = {'time':'year', 'loc':'location_code', 'prod':'hs_product_code', 'val':'export_value'} 35 | cdata = ecomplexity(data, trade_cols) 36 | 37 | # Calculate proximity matrix 38 | prox_df = proximity(data, trade_cols) 39 | ``` 40 | 41 | **Arguments**: 42 | 43 | ```text 44 | data: pandas dataframe containing production / trade data. 45 | Including variables indicating time, location, product and value 46 | cols_input: dict of column names for time, location, product and value. 47 | Example: {'time':'year', 'loc':'origin', 'prod':'hs92', 'val':'export_val'} 48 | presence_test: str for test used for presence of industry in location. 49 | One of "rca" (default), "rpop", "both", or "manual". 50 | Determines which values are used for M_cp calculations. 51 | If "manual", M_cp is taken as given from the "value" column in data 52 | val_errors_flag: {'coerce','ignore','raise'}. Passed to pd.to_numeric 53 | *default* coerce. 54 | rca_mcp_threshold: numeric indicating RCA threshold beyond which mcp is 1. 55 | *default* 1. 
56 | rpop_mcp_threshold: numeric indicating RPOP threshold beyond which mcp is 1. 57 | *default* 1. Only used if presence_test is not "rca". 58 | pop: pandas df, with time, location and corresponding population, in that order. 59 | Not required if presence_test is "rca" (default). 60 | continuous: Used to calculate product proximities, indicates whether 61 | to consider correlation of every product pair (True) or product 62 | co-occurrence (False). *default* False. 63 | asymmetric: Used to calculate product proximities, indicates whether 64 | to generate asymmetric proximity matrix (True) or symmetric (False). 65 | *default* False. 66 | knn: Number of nearest neighbors from proximity matrix to use to calculate 67 | density. Will use entire proximity matrix if None. 68 | *default* None. 69 | ``` 70 | 71 | ## FAQ 72 | 73 | - Why are ECI and PCI both normalized using ECI's mean and std. dev.? 74 | + This normalization preserves the property that ECI = (mean of PCI of products for which MCP=1) 75 | 76 | 77 | ### References 78 | 79 | - Hausmann, R., Hidalgo, C. A., Bustos, S., Coscia, M., Simoes, A., & Yıldırım, M. (2013). The Atlas of Economic Complexity: Mapping Paths to Prosperity (Part 1). Retrieved from 80 | - Hidalgo, C. A., Klinger, B., Barabasi, A.-L., & Hausmann, R. (2007). The Product Space Conditions the Development of Nations. Science, 317(5837), 482–487. 81 | -------------------------------------------------------------------------------- /ecomplexity/ComplexityData.py: -------------------------------------------------------------------------------- 1 | # Complexity calculations 2 | import numpy as np 3 | import pandas as pd 4 | import warnings 5 | import sys 6 | from functools import wraps 7 | import time 8 | import datetime 9 | 10 | 11 | class ComplexityData(object): 12 | """Calculate complexity and other related results 13 | 14 | Args: 15 | data: pandas dataframe containing production / trade data. 
def clean_data(self, val_errors_flag_input):
    """Coerce values to numeric, index the data and drop duplicate rows.

    Args:
        val_errors_flag_input: {'coerce', 'ignore', 'raise'}. Forwarded to the
            ``errors`` argument of ``pd.to_numeric``.

    Side effects:
        ``self.data`` ends up indexed by ``["time", "loc", "prod"]``, with
        missing entries of ``val`` replaced by 0 and only the first row kept
        for every duplicated index entry. A warning is issued when missing
        values or duplicates are found.
    """
    # Make sure values are numeric
    self.data["val"] = pd.to_numeric(self.data["val"], errors=val_errors_flag_input)
    self.data.set_index(["time", "loc", "prod"], inplace=True)

    if self.data["val"].isnull().values.any():
        warnings.warn("NaN value(s) present, coercing to zero(es)")
        # Assign the filled column back rather than calling
        # ``fillna(..., inplace=True)`` on the column accessor: under pandas
        # Copy-on-Write the in-place form only mutates a temporary and the
        # NaNs would survive in ``self.data``.
        self.data["val"] = self.data["val"].fillna(0)

    # Remove duplicates, keeping the first occurrence of each index entry
    dups = self.data.index.duplicated()
    if dups.any():
        warnings.warn("Duplicate values exist, keeping the first occurrence")
        self.data = self.data[~dups]
def calculate_rca(self):
    """Compute the RCA matrix for the current period and store it in ``self.rca_t``.

    ``self.data_t`` is reshaped into a 2-D array whose rows follow level 0
    of its index and whose columns follow level 1. Each cell is then divided
    by its row total (share of the column within the row) and by the
    column's share of the grand total.

    Zero totals produce ``nan`` / ``inf`` entries; the corresponding numpy
    warnings are silenced on purpose.
    """
    # assumes data_t holds a single value column over the full
    # level-0 x level-1 grid, in index order -- TODO confirm against caller
    n_rows = len(self.data_t.index.levels[0])
    n_cols = len(self.data_t.index.levels[1])
    values = self.data_t.values.reshape((n_rows, n_cols))

    with np.errstate(divide="ignore", invalid="ignore"):
        # Share of each column within its row
        row_share = values / np.nansum(values, axis=1, keepdims=True)
        # Share of each column in the grand total
        col_totals = np.nansum(values, axis=0, keepdims=True)
        col_share = col_totals / np.nansum(col_totals, axis=1, keepdims=True)
        self.rca_t = row_share / col_share
def calculate_mcp(
    self, rca_mcp_threshold_input, rpop_mcp_threshold_input, presence_test, pop, t
):
    """Derive the presence matrix ``self.mcp_t`` from RCA and/or RPOP.

    Args:
        rca_mcp_threshold_input: cutoff applied to ``self.rca_t``.
        rpop_mcp_threshold_input: cutoff applied to ``self.rpop_t``.
        presence_test: "rca", "rpop" or "both". Any other value leaves
            ``self.mcp_t`` untouched.
        pop: population data, forwarded to ``self.calculate_rpop``.
        t: time period, forwarded to ``self.calculate_rpop``.

    For "rpop" and "both", ``self.calculate_rpop(pop, t)`` is run first.
    For "both" the two 0/1 indicator arrays are added, so a cell passing
    both tests holds 2.
    NOTE(review): confirm whether a value of 2 is intended for "both".
    """

    def passes(values, cutoff):
        # nan_to_num maps NaN to 0 before the comparison
        return np.where(np.nan_to_num(values) >= cutoff, 1, 0)

    if presence_test not in ("rca", "rpop", "both"):
        return

    if presence_test != "rca":
        self.calculate_rpop(pop, t)

    if presence_test == "rca":
        self.mcp_t = passes(self.rca_t, rca_mcp_threshold_input)
    elif presence_test == "rpop":
        self.mcp_t = passes(self.rpop_t, rpop_mcp_threshold_input)
    else:
        rca_presence = passes(self.rca_t, rca_mcp_threshold_input)
        rpop_presence = passes(self.rpop_t, rpop_mcp_threshold_input)
        self.mcp_t = rca_presence + rpop_presence
def calc_density(rca_or_mcp, proximity_mat, knn=None):
    """Calculate density, as defined by Hidalgo et al. (2007)

    For location c and product p the density is the proximity-weighted
    average of ``rca_or_mcp[c, j]`` over the products j related to p:

        density[c, p] = sum_j rca_or_mcp[c, j] * phi[p, j] / sum_j phi[p, j]

    With ``knn=None`` the sum runs over every product; otherwise only over
    the ``knn`` nearest neighbours of p in the proximity matrix.

    Args:
        rca_or_mcp: numpy array of RCA (if continuous product proximities are
            used), else Mcp. Rows are locations, columns are products.
        proximity_mat: square product proximity matrix
        knn: number of nearest neighbors to consider for density calculation (optional)

    Returns:
        numpy array of the same shape as rca_or_mcp holding the density of
        each product in each location
    """
    if knn is None:
        den = np.nansum(proximity_mat, axis=1)[np.newaxis, :]
        density = rca_or_mcp @ (proximity_mat.T / den)
    else:
        # Convert proximity matrix to a distance matrix
        distance_mat = 1 - proximity_mat
        # Called without arguments, kneighbors() returns the neighbours of
        # every fitted product
        nbrs = NearestNeighbors(n_neighbors=knn, metric="precomputed").fit(distance_mat)
        distance_knn, indices_knn = nbrs.kneighbors()
        # Back from distances to proximities
        proximity_knn = 1 - distance_knn
        # Normalising constant per product: total proximity to its neighbours
        den = np.nansum(proximity_knn, axis=1)

        density = np.empty((rca_or_mcp.shape[0], indices_knn.shape[0]), dtype=float)
        # One product at a time keeps peak memory at (n_loc, knn)
        for i, neighbors in enumerate(indices_knn):
            weights = proximity_knn[i] / den[i]
            # Weight each neighbour's presence by its normalised proximity.
            # The previous code summed the unweighted presence values and
            # silently dropped the weights.
            density[:, i] = np.nansum(rca_or_mcp[:, neighbors] * weights, axis=1)
    return density
def calc_continuous_proximity(rca, ubiquity):
    """Calculate product proximity matrices - continuous

    Proximity between two products is the Pearson correlation of their
    columns in ``rca``, rescaled from [-1, 1] to [0, 1].

    Args:
        rca: numpy ndarray with rows as locations and columns as products
        ubiquity: not used by this function; accepted so the signature
            parallels the discrete variant

    Returns:
        square numpy ndarray with one row and one column per product
    """
    # corrcoef treats rows as variables, hence the transpose
    correlation = np.corrcoef(rca.T)
    return (1 + correlation) / 2
def reshape_output_to_data(cdata, t):
    """Flatten the per-period result arrays into one long dataframe.

    Per-location vectors (diversity, eci, coi) are repeated once per product,
    per-product vectors (ubiquity, pci) are tiled once per location, and the
    location x product matrices are ravelled. The result is joined
    column-wise to ``cdata.data_t`` (with a ``time`` column set to ``t``).

    Args:
        cdata: object carrying the ``*_t`` result arrays, ``data_t`` and
            ``output_list``
        t: time period written into the ``time`` column

    Returns:
        cdata, with ``output_t`` set and also appended to ``output_list``
    """
    n_loc, n_prod = cdata.mcp_t.shape[0], cdata.mcp_t.shape[1]

    def per_location(vec):
        # [a, b] -> [a, a, ..., b, b, ...]
        return np.repeat(vec, n_prod)

    def per_product(vec):
        # [x, y] -> [x, y, x, y, ...]
        return np.tile(vec, n_loc)

    columns = {
        "diversity": per_location(cdata.diversity_t),
        "ubiquity": per_product(cdata.ubiquity_t),
        "mcp": cdata.mcp_t.ravel(),
        "eci": per_location(cdata.eci_t),
        "pci": per_product(cdata.pci_t),
        "density": cdata.density_t.ravel(),
        "coi": per_location(cdata.coi_t),
        "cog": cdata.cog_t.ravel(),
    }

    # RCA is reported whenever it (or RPOP) was computed; RPOP only if present
    has_rpop = hasattr(cdata, "rpop_t")
    if has_rpop or hasattr(cdata, "rca_t"):
        columns["rca"] = cdata.rca_t.ravel()
    if has_rpop:
        columns["rpop"] = cdata.rpop_t.ravel()

    results = pd.DataFrame.from_dict(columns).reset_index(drop=True)

    cdata.data_t["time"] = t
    cdata.output_t = pd.concat([cdata.data_t.reset_index(), results], axis=1)
    cdata.output_list.append(cdata.output_t)
    return cdata
def calc_eci_pci(cdata):
    """Calculate the (un-normalized) ECI and PCI vectors for one period.

    Locations with zero diversity and products with zero ubiquity are left
    out of the eigenvector computation and receive ``nan`` in the output.

    Args:
        cdata: object exposing ``mcp_t`` (locations x products array),
            ``diversity_t`` (row sums), ``ubiquity_t`` (column sums) and
            ``t`` (period label, used in warnings only)

    Returns:
        tuple ``(eci_t, pci_t)`` of 1-D arrays with one entry per row and
        per column of ``cdata.mcp_t`` respectively. Both are all-``nan`` if
        the eigenvector computation fails.
    """
    n_loc, n_prod = cdata.mcp_t.shape
    loc_valid = cdata.diversity_t != 0
    prod_valid = cdata.ubiquity_t != 0

    # Check if diversity or ubiquity is 0, can cause problems
    if not (loc_valid.all() and prod_valid.all()):
        warnings.warn(
            f"In year {cdata.t}, diversity / ubiquity is 0 for some locs/prods"
        )

    # Extract valid elements only
    diversity_valid = cdata.diversity_t[loc_valid]
    ubiquity_valid = cdata.ubiquity_t[prod_valid]
    mcp_valid = cdata.mcp_t[loc_valid, :][:, prod_valid]

    # Row- and column-normalized presence matrices
    mcp1 = mcp_valid / diversity_valid[:, np.newaxis]
    mcp2 = mcp_valid / ubiquity_valid[np.newaxis, :]
    # Make copy of transpose to ensure contiguous array for performance reasons
    mcp2_t = mcp2.T.copy()
    # Only the product-product matrix is needed: ECI is derived from the PCI
    # eigenvector below, so the location-location product is not computed.
    Mpp = mcp2_t @ mcp1

    # Start from all-NaN outputs; valid positions are filled in on success.
    # Boolean-mask assignment handles zero, one or many excluded rows and
    # columns alike (the earlier argwhere().squeeze() loop failed when
    # exactly one was excluded).
    eci_t = np.full(n_loc, np.nan)
    pci_t = np.full(n_prod, np.nan)

    try:
        # Calculate eigenvectors; keep real parts only
        eigvals, eigvecs = np.linalg.eig(Mpp)
        eigvals = np.real(eigvals)
        eigvecs = np.real(eigvecs)
        # Get eigenvector corresponding to second largest eigenvalue
        eig_index = eigvals.argsort()[-2]
        kp = eigvecs[:, eig_index]
        kc = mcp1 @ kp

        # Adjust sign of ECI and PCI so it makes sense, as per book:
        # ECI should correlate positively with diversity
        s1 = np.sign(np.corrcoef(diversity_valid, kc)[0, 1])
        eci_t[loc_valid] = s1 * kc
        pci_t[prod_valid] = s1 * kp

    except Exception as e:
        # Deliberately best-effort: a failed year yields NaNs, not a crash
        warnings.warn(f"Unable to calculate eigenvectors for year {cdata.t}")
        print(e)
        eci_t[:] = np.nan
        pci_t[:] = np.nan

    return (eci_t, pci_t)
150 | knn: Number of nearest neighbors from proximity matrix to use to calculate 151 | density. Will use entire proximity matrix if None. 152 | *default* None. 153 | verbose: Print year being processed 154 | 155 | Returns: 156 | Pandas dataframe containing the data with the following additional columns: 157 | - diversity: k_c,0 158 | - ubiquity: k_p,0 159 | - rca: Balassa's RCA 160 | - rpop: (available if presence_test!="rca") RPOP 161 | - mcp: MCP used for complexity calculations 162 | - eci: Economic complexity index 163 | - pci: Product complexity index 164 | - density: Density of the network around each product 165 | - coi: Complexity Outlook Index 166 | - cog: Complexity Outlook Gain 167 | 168 | """ 169 | cdata = ComplexityData(data, cols_input, val_errors_flag) 170 | 171 | cdata.output_list = [] 172 | 173 | # Iterate over time stamps 174 | for t in cdata.data.index.unique("time"): 175 | if verbose: 176 | print(t) 177 | # Rectangularize df 178 | cdata.create_full_df(t) 179 | 180 | # Check if Mcp is pre-computed 181 | if presence_test != "manual": 182 | cdata.calculate_rca() 183 | cdata.calculate_mcp( 184 | rca_mcp_threshold, rpop_mcp_threshold, presence_test, pop, t 185 | ) 186 | else: 187 | cdata.calculate_manual_mcp() 188 | 189 | # Calculate diversity and ubiquity 190 | cdata.diversity_t = np.nansum(cdata.mcp_t, axis=1) 191 | cdata.ubiquity_t = np.nansum(cdata.mcp_t, axis=0) 192 | 193 | # If ANY of diversity or ubiquity is 0, warn that eci and pci will be nan 194 | if np.any(cdata.diversity_t == 0) or np.any(cdata.ubiquity_t == 0): 195 | warnings.warn( 196 | f"Year {t}: Diversity or ubiquity is 0, so ECI and PCI will be nan" 197 | ) 198 | 199 | # Calculate ECI and PCI 200 | cdata.eci_t, cdata.pci_t = calc_eci_pci(cdata) 201 | 202 | # Calculate proximity and density 203 | if continuous == False: 204 | prox_mat = calc_discrete_proximity( 205 | cdata.mcp_t, cdata.ubiquity_t, asymmetric 206 | ) 207 | cdata.density_t = calc_density( 208 | rca_or_mcp=cdata.mcp_t, 
proximity_mat=prox_mat, knn=knn 209 | ) 210 | elif continuous == True and presence_test == "rpop": 211 | prox_mat = calc_continuous_proximity(cdata.rpop_t, cdata.ubiquity_t) 212 | cdata.density_t = calc_density( 213 | rca_or_mcp=cdata.rpop_t, proximity_mat=prox_mat, knn=knn 214 | ) 215 | elif continuous == True and presence_test != "rpop": 216 | prox_mat = calc_continuous_proximity(cdata.rca_t, cdata.ubiquity_t) 217 | cdata.density_t = calc_density( 218 | rca_or_mcp=cdata.rca_t, proximity_mat=prox_mat, knn=knn 219 | ) 220 | 221 | # Calculate COI and COG 222 | cdata.coi_t, cdata.cog_t = calc_coi_cog(cdata, prox_mat) 223 | 224 | # Normalize variables as per STATA package 225 | # Normalization using ECI mean and std. dev. preserves the property that 226 | # ECI = (mean of PCI of products for which MCP=1) 227 | cdata.pci_t = (cdata.pci_t - cdata.eci_t.mean()) / cdata.eci_t.std() 228 | cdata.cog_t = cdata.cog_t / cdata.eci_t.std() 229 | cdata.eci_t = (cdata.eci_t - cdata.eci_t.mean()) / cdata.eci_t.std() 230 | 231 | cdata.coi_t = (cdata.coi_t - cdata.coi_t.mean()) / cdata.coi_t.std() 232 | 233 | # Reshape ndarrays to df 234 | cdata = reshape_output_to_data(cdata, t) 235 | 236 | cdata.output = pd.concat(cdata.output_list) 237 | cdata = conform_to_original_data(cdata, data) 238 | 239 | return cdata.output 240 | -------------------------------------------------------------------------------- /ecomplexity/proximity.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from ecomplexity.calc_proximity import calc_discrete_proximity 4 | from ecomplexity.calc_proximity import calc_continuous_proximity 5 | from ecomplexity.ComplexityData import ComplexityData 6 | 7 | 8 | def proximity( 9 | data, 10 | cols_input, 11 | presence_test="rca", 12 | val_errors_flag="coerce", 13 | rca_mcp_threshold=1, 14 | rpop_mcp_threshold=1, 15 | pop=None, 16 | continuous=False, 17 | asymmetric=False, 18 | ): 19 | 
"""Wrapper function to calculate product proximity matrices 20 | 21 | Args: 22 | data: pandas df with cols 'time','loc','prod','val' 23 | cols_input: dict of column names for time, location, product and value. 24 | Example: {'time':'year', 'loc':'origin', 'prod':'hs92', 'val':'export_val'} 25 | presence_test: str for test used for presence of industry in location. 26 | One of "rca" (default), "rpop", "both", or "manual". 27 | Determines which values are used for M_cp calculations. 28 | If "manual", M_cp is taken as given from the "value" column in data 29 | val_errors_flag: {'coerce','ignore','raise'}. Passed to pd.to_numeric 30 | *default* coerce. 31 | rca_mcp_threshold: numeric indicating RCA threshold beyond which mcp is 1. 32 | *default* 1. 33 | rpop_mcp_threshold: numeric indicating RPOP threshold beyond which mcp is 1. 34 | *default* 1. Only used if presence_test is not "rca". 35 | pop: pandas df, with time, location and corresponding population, in that order. 36 | Not required if presence_test is "rca" (default). 37 | continuous: Whether to consider correlation of every product pair (True) 38 | or product co-occurrence (False). *default* False. 39 | asymmetric: Whether to generate asymmetric proximity matrix (True) or 40 | symmetric (False). *default* False. 
41 | 42 | Returns: 43 | pandas df with proximity values for every product pair 44 | """ 45 | 46 | cdata = ComplexityData(data, cols_input, val_errors_flag) 47 | 48 | output_list = [] 49 | 50 | # Iterate over time stamps 51 | for t in cdata.data.index.unique("time"): 52 | print(t) 53 | # Rectangularize df 54 | cdata.create_full_df(t) 55 | 56 | # Check if Mcp is pre-computed 57 | if presence_test != "manual": 58 | cdata.calculate_rca() 59 | cdata.calculate_mcp( 60 | rca_mcp_threshold, rpop_mcp_threshold, presence_test, pop, t 61 | ) 62 | else: 63 | cdata.calculate_manual_mcp() 64 | 65 | # Calculate diversity and ubiquity 66 | cdata.diversity_t = np.nansum(cdata.mcp_t, axis=1) 67 | cdata.ubiquity_t = np.nansum(cdata.mcp_t, axis=0) 68 | 69 | # Calculate proximity 70 | if continuous == False: 71 | prox_mat = calc_discrete_proximity( 72 | cdata.mcp_t, cdata.ubiquity_t, asymmetric 73 | ) 74 | elif continuous == True and presence_test == "rpop": 75 | prox_mat = calc_continuous_proximity(cdata.rpop_t, cdata.ubiquity_t) 76 | elif continuous == True and presence_test != "rpop": 77 | prox_mat = calc_continuous_proximity(cdata.rca_t, cdata.ubiquity_t) 78 | 79 | # Reshape as df 80 | output_index = pd.MultiIndex.from_product( 81 | [cdata.data_t.index.levels[1], cdata.data_t.index.levels[1]], 82 | names=["prod1", "prod2"], 83 | ) 84 | output = pd.DataFrame(data={"proximity": prox_mat.ravel()}, index=output_index) 85 | output["time"] = t 86 | output_list.append(output) 87 | 88 | output = pd.concat(output_list) 89 | 90 | # Remove entries for product's proximity with itself 91 | output = output.reset_index() 92 | output.columns = ["prod1", "prod2", "proximity", "time"] 93 | output = output[["time", "prod1", "prod2", "proximity"]] 94 | output = output[output.prod1 != output.prod2] 95 | 96 | # Rename based on original product column name 97 | output = output.rename( 98 | columns={ 99 | "prod1": cdata.cols_input["prod"] + "_1", 100 | "prod2": cdata.cols_input["prod"] + "_2", 101 | 
"time": cdata.cols_input["time"], 102 | } 103 | ) 104 | 105 | return output 106 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.2 2 | numpy==1.23.5 3 | scikit-learn==1.2.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | def readme(): 5 | with open("README.md") as f: 6 | return f.read() 7 | 8 | setup(name='ecomplexity', 9 | version='0.5.2', 10 | description='Package to calculate economic complexity and associated variables', 11 | long_description=readme(), 12 | long_description_content_type='text/markdown', 13 | url='https://github.com/cid-harvard/py-ecomplexity', 14 | author='Shreyas Gadgin Matha', 15 | author_email='shreyas.gm61@gmail.com', 16 | license='MIT', 17 | packages=find_packages(), 18 | keywords="pandas python networks economics complexity", 19 | python_requires='>=3', 20 | install_requires=[ 21 | 'pandas >0.23.0', 22 | 'numpy >1.22.0' 23 | 'scikit-learn >1.0.0' 24 | ], 25 | zip_safe=False, 26 | classifiers=[ 27 | "Programming Language :: Python :: 3", 28 | "License :: OSI Approved :: MIT License", 29 | "Operating System :: OS Independent", 30 | ]) 31 | --------------------------------------------------------------------------------