├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── ecomplexity
    ├── ComplexityData.py
    ├── __init__.py
    ├── calc_density.py
    ├── calc_proximity.py
    ├── coicog.py
    ├── ecomplexity.py
    └── proximity.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | Data/
  2 | archive/
  3 | .DS_Store
  4 | ecomplexity/tests/
  5 | 
  6 | # Following lines created by https://www.gitignore.io/api/python
  7 | 
  8 | ### Python ###
  9 | # Byte-compiled / optimized / DLL files
 10 | __pycache__/
 11 | *.py[cod]
 12 | *$py.class
 13 | 
 14 | # C extensions
 15 | *.so
 16 | 
 17 | # Distribution / packaging
 18 | .Python
 19 | build/
 20 | /build/
 21 | develop-eggs/
 22 | dist/
 23 | /dist/
 24 | downloads/
 25 | eggs/
 26 | .eggs/
 27 | lib/
 28 | lib64/
 29 | parts/
 30 | sdist/
 31 | var/
 32 | wheels/
 33 | *.egg-info/
 34 | .installed.cfg
 35 | *.egg
 36 | MANIFEST
 37 | 
 38 | # PyInstaller
 39 | #  Usually these files are written by a python script from a template
 40 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 41 | *.manifest
 42 | *.spec
 43 | 
 44 | # Installer logs
 45 | pip-log.txt
 46 | pip-delete-this-directory.txt
 47 | 
 48 | # Unit test / coverage reports
 49 | htmlcov/
 50 | .tox/
 51 | .nox/
 52 | .coverage
 53 | .coverage.*
 54 | .cache
 55 | nosetests.xml
 56 | coverage.xml
 57 | *.cover
 58 | .hypothesis/
 59 | .pytest_cache/
 60 | 
 61 | # Translations
 62 | *.mo
 63 | *.pot
 64 | 
 65 | # Django stuff:
 66 | *.log
 67 | local_settings.py
 68 | db.sqlite3
 69 | 
 70 | # Flask stuff:
 71 | instance/
 72 | .webassets-cache
 73 | 
 74 | # Scrapy stuff:
 75 | .scrapy
 76 | 
 77 | # Sphinx documentation
 78 | docs/_build/
 79 | 
 80 | # PyBuilder
 81 | target/
 82 | 
 83 | # Jupyter Notebook
 84 | .ipynb_checkpoints
 85 | 
 86 | # IPython
 87 | profile_default/
 88 | ipython_config.py
 89 | 
 90 | # pyenv
 91 | .python-version
 92 | 
 93 | # celery beat schedule file
 94 | celerybeat-schedule
 95 | 
 96 | # SageMath parsed files
 97 | *.sage.py
 98 | 
 99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 | 
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 | 
112 | # Rope project settings
113 | .ropeproject
114 | 
115 | # mkdocs documentation
116 | /site
117 | 
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 | 
123 | ### Python Patch ###
124 | .venv/
125 | 
126 | ### Python.VirtualEnv Stack ###
127 | # Virtualenv
128 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
129 | [Bb]in
130 | [Ii]nclude
131 | [Ll]ib
132 | [Ll]ib64
133 | [Ll]ocal
134 | [Ss]cripts
135 | pyvenv.cfg
136 | pip-selfcheck.json
137 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2018 Center for International Development at Harvard University
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Economic Complexity and Product Complexity
 2 | 
 3 | By the Growth Lab at Harvard's Center for International Development
 4 | 
 5 | This package is part of Harvard Growth Lab’s portfolio of software packages, digital products and interactive data visualizations. To browse our entire portfolio, please visit [growthlab.app](https://growthlab.app). To learn more about our research, please visit [Harvard Growth Lab’s home page](https://growthlab.cid.harvard.edu/).
 6 | 
 7 | # About
 8 | Python package to calculate economic complexity indices.
 9 | 
10 | STATA implementation of the economic complexity index available at: <https://github.com/cid-harvard/ecomplexity>
11 | 
12 | Explore complexity and associated data using Harvard CID's Atlas tool: <http://atlas.cid.harvard.edu>
13 | 
14 | ## Tutorial
15 | 
16 | **Installation**:
17 | At terminal: `pip install ecomplexity`
18 | 
19 | If you wish to install the latest version of the package under development, you can install directly from GitHub:
20 | `pip install git+https://github.com/cid-harvard/py-ecomplexity@develop`
21 | 
22 | **Usage**:
23 | 
24 | ```python
25 | from ecomplexity import ecomplexity
26 | from ecomplexity import proximity
27 | 
28 | # Import trade data from CID Atlas
29 | data_url = "https://intl-atlas-downloads.s3.amazonaws.com/country_hsproduct2digit_year.csv.zip"
30 | data = pd.read_csv(data_url, compression="zip", low_memory=False)
31 | data = data[['year','location_code','hs_product_code','export_value']]
32 | 
33 | # Calculate complexity
34 | trade_cols = {'time':'year', 'loc':'location_code', 'prod':'hs_product_code', 'val':'export_value'}
35 | cdata = ecomplexity(data, trade_cols)
36 | 
37 | # Calculate proximity matrix
38 | prox_df = proximity(data, trade_cols)
39 | ```
40 | 
41 | **Arguments**:
42 | 
43 | ```text
44 | data: pandas dataframe containing production / trade data.
45 |     Including variables indicating time, location, product and value
46 | cols_input: dict of column names for time, location, product and value.
47 |     Example: {'time':'year', 'loc':'origin', 'prod':'hs92', 'val':'export_val'}
48 | presence_test: str for test used for presence of industry in location.
49 |     One of "rca" (default), "rpop", "both", or "manual".
50 |     Determines which values are used for M_cp calculations.
51 |     If "manual", M_cp is taken as given from the "value" column in data
52 | val_errors_flag: {'coerce','ignore','raise'}. Passed to pd.to_numeric
53 |     *default* coerce.
54 | rca_mcp_threshold: numeric RCA threshold; mcp is 1 where RCA is greater than or equal to it.
55 |     *default* 1.
56 | rpop_mcp_threshold: numeric RPOP threshold; mcp is 1 where RPOP is greater than or equal to it.
57 |     *default* 1. Only used if presence_test is not "rca".
58 | pop: pandas df, with time, location and corresponding population, in that order.
59 |     Not required if presence_test is "rca" (default).
60 | continuous: Used to calculate product proximities, indicates whether
61 |     to consider correlation of every product pair (True) or product
62 |     co-occurrence (False). *default* False.
63 | asymmetric: Used to calculate product proximities, indicates whether
64 |     to generate asymmetric proximity matrix (True) or symmetric (False).
65 |     *default* False.
66 | knn: Number of nearest neighbors from proximity matrix to use to calculate
67 |     density. Will use entire proximity matrix if None.
68 |     *default* None.
69 | ```
70 | 
71 | ## FAQ
72 | 
73 | - Why are ECI and PCI both normalized using ECI's mean and std. dev.?
74 |     + This normalization preserves the property that ECI = (mean of PCI of products for which MCP=1)
75 | 
76 | 
77 | ### References
78 | 
79 | - Hausmann, R., Hidalgo, C. A., Bustos, S., Coscia, M., Simoes, A., & Yıldırım, M. (2013). The Atlas of Economic Complexity: Mapping Paths to Prosperity (Part 1). Retrieved from <https://growthlab.cid.harvard.edu/files/growthlab/files/atlas_2013_part1.pdf>
80 | - Hidalgo, C. A., Klinger, B., Barabasi, A.-L., & Hausmann, R. (2007). The Product Space Conditions the Development of Nations. Science, 317(5837), 482–487. <http://doi.org/10.1126/science.1144581>
81 | 


--------------------------------------------------------------------------------
/ecomplexity/ComplexityData.py:
--------------------------------------------------------------------------------
  1 | # Complexity calculations
  2 | import numpy as np
  3 | import pandas as pd
  4 | import warnings
  5 | import sys
  6 | from functools import wraps
  7 | import time
  8 | import datetime
  9 | 
 10 | 
 11 | class ComplexityData(object):
 12 |     """Calculate complexity and other related results
 13 | 
 14 |     Args:
 15 |         data: pandas dataframe containing production / trade data.
 16 |             Including variables indicating time, location, product and value
 17 |         cols_input: dict of column names for time, location, product and value.
 18 |             Example: {'time':'year', 'loc':'origin', 'prod':'hs92', 'val':'export_val'}
 19 |         val_errors_flag: {'coerce','ignore','raise'}. Passed to pd.to_numeric
 20 |             *default* coerce.
 21 | 
 22 |     Attributes:
 23 |         data: clean data with standardized column names
 24 |     """
 25 | 
 26 |     def __init__(self, data, cols_input, val_errors_flag):
 27 |         self.data = data.copy()
 28 |         self.cols_input = cols_input
 29 | 
 30 |         # Standardize column names based on input
 31 |         self.rename_cols()
 32 | 
 33 |         # Clean data to handle NA's and such
 34 |         self.clean_data(val_errors_flag)
 35 | 
 36 |     def rename_cols(self):
 37 |         """Standardize column names"""
 38 |         cols_map_inv = {v: k for k, v in self.cols_input.items()}
 39 |         self.data = self.data.rename(columns=cols_map_inv)
 40 |         self.data = self.data[["time", "loc", "prod", "val"]]
 41 | 
 42 |     def clean_data(self, val_errors_flag_input):
 43 |         """Clean data to remove non-numeric values, handle NA's and duplicates"""
 44 |         # Make sure values are numeric
 45 |         self.data.val = pd.to_numeric(self.data.val, errors=val_errors_flag_input)
 46 |         self.data.set_index(["time", "loc", "prod"], inplace=True)
 47 |         if self.data.val.isnull().values.any():
 48 |             warnings.warn("NaN value(s) present, coercing to zero(es)")
 49 |             self.data.val.fillna(0, inplace=True)
 50 | 
 51 |         # Remove duplicates
 52 |         dups = self.data.index.duplicated()
 53 |         if dups.sum() > 0:
 54 |             warnings.warn("Duplicate values exist, keeping the first occurrence")
 55 |             self.data = self.data[~self.data.index.duplicated()]
 56 | 
 57 |     def create_full_df(self, t):
 58 |         """Rectangularize, but remove rows with diversity or ubiquity zero
 59 | 
 60 |         Rows with zero diversity / ubiquity lead to ZeroDivision errors and
 61 |         incorrect values during normalization
 62 |         """
 63 |         self.t = t
 64 |         self.data_t = self.data.loc[t].copy()
 65 |         # Check for zero diversity and ubiquity
 66 |         val_diversity_check = (
 67 |             self.data_t.reset_index().groupby(["loc"])["val"].sum().reset_index()
 68 |         )
 69 |         val_ubiquity_check = (
 70 |             self.data_t.reset_index().groupby(["prod"])["val"].sum().reset_index()
 71 |         )
 72 |         val_diversity_check = val_diversity_check[val_diversity_check.val != 0]
 73 |         val_ubiquity_check = val_ubiquity_check[val_ubiquity_check.val != 0]
 74 |         # Remove locations and products with zero diversity and ubiquity respectively
 75 |         self.data_t = self.data_t.reset_index()
 76 |         self.data_t = self.data_t.merge(
 77 |             val_diversity_check[["loc"]], on="loc", how="right"
 78 |         )
 79 |         self.data_t = self.data_t.merge(
 80 |             val_ubiquity_check[["prod"]], on="prod", how="right"
 81 |         )
 82 |         self.data_t.set_index(["loc", "prod"], inplace=True)
 83 |         # Create full dataframe with all combinations of locations and products
 84 |         data_index = pd.MultiIndex.from_product(
 85 |             self.data_t.index.levels, names=self.data_t.index.names
 86 |         )
 87 |         self.data_t = self.data_t.reindex(data_index, fill_value=0)
 88 | 
 89 |     def calculate_rca(self):
 90 |         """Calculate RCA"""
 91 |         # Convert data into numpy array
 92 |         loc_n_vals = len(self.data_t.index.levels[0])
 93 |         prod_n_vals = len(self.data_t.index.levels[1])
 94 |         data_np = self.data_t.values.reshape((loc_n_vals, prod_n_vals))
 95 | 
 96 |         # Calculate RCA, disable dividebyzero errors
 97 |         with np.errstate(divide="ignore", invalid="ignore"):
 98 |             num = data_np / np.nansum(data_np, axis=1)[:, np.newaxis]
 99 |             loc_total = np.nansum(data_np, axis=0)[np.newaxis, :]
100 |             world_total = np.nansum(loc_total, axis=1)[:, np.newaxis]
101 |             den = loc_total / world_total
102 |             self.rca_t = num / den
103 | 
104 |     def calculate_rpop(self, pop, t):
105 |         """Calculate RPOP"""
106 |         # After constructing df with all combinations, convert data into ndarray
107 |         loc_n_vals = len(self.data_t.index.levels[0])
108 |         prod_n_vals = len(self.data_t.index.levels[1])
109 |         data_np = self.data_t.values.reshape((loc_n_vals, prod_n_vals))
110 | 
111 |         # Read population data for selected year
112 |         pop_t = pop[pop[self.cols_input["time"]] == t].copy()
113 |         pop_t.columns = ["time", "loc", "pop"]
114 |         pop_t = pop_t.drop(columns="time")
115 | 
116 |         pop_t = pop_t.reset_index(drop=True).set_index("loc")
117 |         pop_index = self.data_t.index.unique("loc")
118 |         pop_t = pop_t.reindex(pop_index)
119 |         pop_t = pop_t.values
120 |         assert (
121 |             pop_t.shape[0] == data_np.shape[0]
122 |         ), f"Year {t}: Trade and population data have to be available for the same countries / locations"
123 | 
124 |         num = data_np / pop_t
125 |         loc_total = np.nansum(data_np, axis=0)[np.newaxis, :]
126 |         world_pop_total = np.nansum(pop_t)
127 | 
128 |         den = loc_total / world_pop_total
129 |         rpop = num / den
130 |         self.rpop_t = rpop
131 | 
132 |     def calculate_mcp(
133 |         self, rca_mcp_threshold_input, rpop_mcp_threshold_input, presence_test, pop, t
134 |     ):
135 |         """Calculate MCP based on RCA / RPOP / both"""
136 | 
137 |         def convert_to_binary(x, threshold):
138 |             x = np.nan_to_num(x)
139 |             x = np.where(x >= threshold, 1, 0)
140 |             return x
141 | 
142 |         if presence_test == "rca":
143 |             self.mcp_t = convert_to_binary(self.rca_t, rca_mcp_threshold_input)
144 | 
145 |         elif presence_test == "rpop":
146 |             self.calculate_rpop(pop, t)
147 |             self.mcp_t = convert_to_binary(self.rpop_t, rpop_mcp_threshold_input)
148 | 
149 |         elif presence_test == "both":
150 |             self.calculate_rpop(pop, t)
151 |             self.mcp_t = convert_to_binary(
152 |                 self.rca_t, rca_mcp_threshold_input
153 |             ) + convert_to_binary(self.rpop_t, rpop_mcp_threshold_input)
154 | 
155 |     def calculate_manual_mcp(self):
156 |         """If pre-computed MCP supplied, check validity and reshape"""
157 |         # Test to see if indeed MCP
158 |         if np.any(~np.isin(self.data_t.values, [0, 1])):
159 |             error_val = self.data_t.values[~np.isin(self.data_t.values, [0, 1])].flat[0]
160 |             raise ValueError(
161 |                 "Manually supplied MCP column contains values other than 0 or 1 - Val: {}".format(
162 |                     error_val
163 |                 )
164 |             )
165 | 
166 |         # Convert data into numpy array
167 |         loc_n_vals = len(self.data_t.index.levels[0])
168 |         prod_n_vals = len(self.data_t.index.levels[1])
169 |         data_np = self.data_t.values.reshape((loc_n_vals, prod_n_vals))
170 | 
171 |         self.mcp_t = data_np
172 | 


--------------------------------------------------------------------------------
/ecomplexity/__init__.py:
--------------------------------------------------------------------------------
1 | from ecomplexity.ecomplexity import ecomplexity
2 | from ecomplexity.proximity import proximity
3 | 
4 | name = "ecomplexity"
5 | 


--------------------------------------------------------------------------------
/ecomplexity/calc_density.py:
--------------------------------------------------------------------------------
 1 | # Density as defined in:
 2 | # Hidalgo, C. A., Klinger, B., Barabasi, A.-L., & Hausmann, R. (2007). The Product Space Conditions the Development of Nations. Science, 317(5837), 482–487. https://doi.org/10.1126/science.1144581
 3 | 
 4 | import pandas as pd
 5 | import numpy as np
 6 | from sklearn.neighbors import NearestNeighbors
 7 | 
 8 | 
 9 | def calc_density(rca_or_mcp, proximity_mat, knn=None):
10 |     """Calculate density, as defined by Hidalgo et. al. (2007)
11 | 
12 |     Args:
13 |         rca_or_mcp: numpy array of RCA (if continuous product proximities are
14 |             used), else Mcp
15 |         proximity_mat: product proximity matrix
16 |         knn: number of nearest neighbors to consider for density calculation (optional)
17 | 
18 |     Returns:
19 |         numpy array of same shape as proximity_mat corresponding to density of
20 |         each product
21 |     """
22 |     if knn is None:
23 |         den = np.nansum(proximity_mat, axis=1)[np.newaxis, :]
24 |         # density = rca_or_mcp @ (proximity_mat / den)
25 |         density = rca_or_mcp @ (proximity_mat.T / den)
26 |     else:
27 |         # Convert proximity matrix to a distance matrix
28 |         distance_mat = 1 - proximity_mat
29 |         # Get proximity to k nearest neighbors
30 |         nbrs = NearestNeighbors(n_neighbors=knn, metric="precomputed").fit(distance_mat)
31 |         distance_knn, indices_knn = nbrs.kneighbors()
32 |         # Get proximity
33 |         proximity_knn = 1 - distance_knn
34 |         # Calculate density
35 |         # Get denominator
36 |         den = np.nansum(proximity_knn, axis=1)
37 |         density = []
38 |         for i, row in enumerate(indices_knn):
39 |             # Use row to subset rca_or_mcp
40 |             rca_knn_p = rca_or_mcp[np.arange(rca_or_mcp.shape[0])[:, np.newaxis], row]
41 |             # Get distance_knn for this row
42 |             proximity_knn_row = proximity_knn[i]
43 |             # Divide by den
44 |             proximity_knn_row = proximity_knn_row / den[i]
45 |             # Multiply each row of rca_knn_p by proximity_knn_row
46 |             num_p = rca_knn_p * proximity_knn_row
47 |             # Sum across columns
48 |             density_p = np.nansum(rca_knn_p, axis=1)
49 |             density.append(density_p)
50 |         density = np.array(density).T
51 |     return density
52 | 


--------------------------------------------------------------------------------
/ecomplexity/calc_proximity.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | 
 5 | def calc_discrete_proximity(mcp, ubiquity, asymmetric=False):
 6 |     """Calculate product proximity matrices - discrete
 7 | 
 8 |     Args:
 9 |         mcp: numpy ndarray with rows as locations and columns as products
10 |         ubiquity: numpy array of shape=number of columns in "rca_or_mcp"
11 |         asymmetric: Whether to generate asymmetric proximity matrix (True) or
12 |             symmetric (False). *default* False.
13 | 
14 |     Returns:
15 |         pandas df with proximity values for every product pair
16 |     """
17 | 
18 |     # Calculate discrete proximity
19 |     phi = mcp.T @ mcp
20 |     phi = phi / ubiquity[np.newaxis, :]
21 | 
22 |     if asymmetric == False:
23 |         # Symmetric proximity matrix
24 |         phi = np.minimum(phi, phi.T)
25 |     elif asymmetric == True:
26 |         # Asymmetric proximity matrix
27 |         phi = phi.T
28 | 
29 |     return phi
30 | 
31 | 
def calc_continuous_proximity(rca, ubiquity):
    """Calculate product proximity matrices - continuous

    Proximity is the correlation between two products' columns, rescaled
    from [-1, 1] to [0, 1].

    Args:
        rca: numpy ndarray with rows as locations and columns as products
        ubiquity: not used by this function; present so the signature
            parallels calc_discrete_proximity

    Returns:
        numpy ndarray (products x products) with proximity values for every
        product pair
    """
    # Products are columns, so transpose to make each product one variable
    product_corr = np.corrcoef(rca.T)
    return (product_corr + 1) / 2
45 | 


--------------------------------------------------------------------------------
/ecomplexity/coicog.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | 
 5 | def calc_coi_cog(cdata, proximity_mat):
 6 |     """Calculate Complexity Outlook index
 7 | 
 8 |     Args:
 9 |         cdata: Object of ComplexityData class, with density calculated
10 |         proximity_mat: proximity matrix
11 | 
12 |     Returns:
13 |         cata: ComplexityData object with attribute coi
14 |     """
15 |     # mata coi = ((density:*(1 :- M)):*kp)*J(Npx,Npx,1)
16 |     coi = ((cdata.density_t * (1 - cdata.mcp_t)) * cdata.pci_t).sum(axis=1)
17 |     # print(coi.shape)
18 |     # mata cog = (1 :- M):*((1 :- M) * (proximity :* ((kp1d:/(proximity*J(Npx,1,1)))*J(1,Npx,1))))
19 |     cog = (1 - cdata.mcp_t) * (
20 |         (1 - cdata.mcp_t)
21 |         @ (proximity_mat * (cdata.pci_t / proximity_mat.sum(axis=1))[:, np.newaxis])
22 |     )
23 |     # print(cog.shape)
24 |     return (coi, cog)
25 | 


--------------------------------------------------------------------------------
/ecomplexity/ecomplexity.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import warnings
  4 | from ecomplexity.calc_proximity import calc_discrete_proximity
  5 | from ecomplexity.calc_proximity import calc_continuous_proximity
  6 | from ecomplexity.ComplexityData import ComplexityData
  7 | from ecomplexity.calc_density import calc_density
  8 | from ecomplexity.coicog import calc_coi_cog
  9 | 
 10 | 
 11 | def reshape_output_to_data(cdata, t):
 12 |     """Reshape output ndarrays to df"""
 13 |     diversity = (
 14 |         cdata.diversity_t[:, np.newaxis].repeat(cdata.mcp_t.shape[1], axis=1).ravel()
 15 |     )
 16 |     ubiquity = (
 17 |         cdata.ubiquity_t[np.newaxis, :].repeat(cdata.mcp_t.shape[0], axis=0).ravel()
 18 |     )
 19 |     eci = cdata.eci_t[:, np.newaxis].repeat(cdata.mcp_t.shape[1], axis=1).ravel()
 20 |     pci = cdata.pci_t[np.newaxis, :].repeat(cdata.mcp_t.shape[0], axis=0).ravel()
 21 |     coi = cdata.coi_t[:, np.newaxis].repeat(cdata.mcp_t.shape[1], axis=1).ravel()
 22 | 
 23 |     out_dict = {
 24 |         "diversity": diversity,
 25 |         "ubiquity": ubiquity,
 26 |         "mcp": cdata.mcp_t.ravel(),
 27 |         "eci": eci,
 28 |         "pci": pci,
 29 |         "density": cdata.density_t.ravel(),
 30 |         "coi": coi,
 31 |         "cog": cdata.cog_t.ravel(),
 32 |     }
 33 | 
 34 |     if hasattr(cdata, "rpop_t"):
 35 |         out_dict["rca"] = cdata.rca_t.ravel()
 36 |         out_dict["rpop"] = cdata.rpop_t.ravel()
 37 | 
 38 |     elif hasattr(cdata, "rca_t"):
 39 |         out_dict["rca"] = cdata.rca_t.ravel()
 40 | 
 41 |     output = pd.DataFrame.from_dict(out_dict).reset_index(drop=True)
 42 | 
 43 |     cdata.data_t["time"] = t
 44 |     cdata.output_t = pd.concat([cdata.data_t.reset_index(), output], axis=1)
 45 |     cdata.output_list.append(cdata.output_t)
 46 |     return cdata
 47 | 
 48 | 
 49 | def conform_to_original_data(cdata, data):
 50 |     """Reset column names and add dropped columns back"""
 51 |     cdata.output = cdata.output.rename(columns=cdata.cols_input)
 52 |     cdata.output = cdata.output.merge(
 53 |         data, how="outer", on=list(cdata.cols_input.values())
 54 |     )
 55 |     return cdata
 56 | 
 57 | 
 58 | def calc_eci_pci(cdata):
 59 |     # Check if diversity or ubiquity is 0 or nan, can cause problems
 60 |     if ((cdata.diversity_t == 0).sum() > 0) | ((cdata.ubiquity_t == 0).sum() > 0):
 61 |         warnings.warn(
 62 |             f"In year {cdata.t}, diversity / ubiquity is 0 for some locs/prods"
 63 |         )
 64 | 
 65 |     # Extract valid elements only
 66 |     cntry_mask = np.argwhere(cdata.diversity_t == 0).squeeze()
 67 |     prod_mask = np.argwhere(cdata.ubiquity_t == 0).squeeze()
 68 |     diversity_valid = cdata.diversity_t[cdata.diversity_t != 0]
 69 |     ubiquity_valid = cdata.ubiquity_t[cdata.ubiquity_t != 0]
 70 |     mcp_valid = cdata.mcp_t[cdata.diversity_t != 0, :][:, cdata.ubiquity_t != 0]
 71 | 
 72 |     # Calculate ECI and PCI eigenvectors
 73 |     mcp1 = mcp_valid / diversity_valid[:, np.newaxis]
 74 |     mcp2 = mcp_valid / ubiquity_valid[np.newaxis, :]
 75 |     # Make copy of transpose to ensure contiguous array for performance reasons
 76 |     mcp2_t = mcp2.T.copy()
 77 |     # These matrix multiplication lines are very slow
 78 |     Mcc = mcp1 @ mcp2_t
 79 |     Mpp = mcp2_t @ mcp1
 80 | 
 81 |     try:
 82 |         # Calculate eigenvectors
 83 |         eigvals, eigvecs = np.linalg.eig(Mpp)
 84 |         eigvecs = np.real(eigvecs)
 85 |         # Get eigenvector corresponding to second largest eigenvalue
 86 |         eig_index = eigvals.argsort()[-2]
 87 |         kp = eigvecs[:, eig_index]
 88 |         kc = mcp1 @ kp
 89 | 
 90 |         # Adjust sign of ECI and PCI so it makes sense, as per book
 91 |         s1 = np.sign(np.corrcoef(diversity_valid, kc)[0, 1])
 92 |         eci_t = s1 * kc
 93 |         pci_t = s1 * kp
 94 | 
 95 |         # Add back the deleted elements
 96 |         for x in cntry_mask:
 97 |             eci_t = np.insert(eci_t, x, np.nan)
 98 |         for x in prod_mask:
 99 |             pci_t = np.insert(pci_t, x, np.nan)
100 | 
101 |     except Exception as e:
102 |         warnings.warn(f"Unable to calculate eigenvectors for year {cdata.t}")
103 |         print(e)
104 |         eci_t = np.empty(cdata.mcp_t.shape[0])
105 |         pci_t = np.empty(cdata.mcp_t.shape[1])
106 |         eci_t[:] = np.nan
107 |         pci_t[:] = np.nan
108 | 
109 |     return (eci_t, pci_t)
110 | 
111 | 
def ecomplexity(
    data,
    cols_input,
    presence_test="rca",
    val_errors_flag="coerce",
    rca_mcp_threshold=1,
    rpop_mcp_threshold=1,
    pop=None,
    continuous=False,
    asymmetric=False,
    knn=None,
    verbose=True,
):
    """Run the complexity calculations for every period in the data

    Args:
        data: pandas dataframe containing production / trade data.
            Including variables indicating time, location, product and value
        cols_input: dict of column names for time, location, product and value.
            Example: {'time':'year', 'loc':'origin', 'prod':'hs92', 'val':'export_val'}
        presence_test: str for test used for presence of industry in location.
            One of "rca" (default), "rpop", "both", or "manual".
            Determines which values are used for M_cp calculations.
            If "manual", M_cp is taken as given from the "value" column in data
        val_errors_flag: {'coerce','ignore','raise'}. Passed to pd.to_numeric
            *default* coerce.
        rca_mcp_threshold: numeric RCA threshold; mcp is 1 where RCA >= it.
            *default* 1.
        rpop_mcp_threshold: numeric RPOP threshold; mcp is 1 where RPOP >= it.
            *default* 1. Only used if presence_test is not "rca".
        pop: pandas df, with time, location and corresponding population, in that order.
            Not required if presence_test is "rca", which is the default.
        continuous: Used to calculate product proximities, indicates whether
            to consider correlation of every product pair (True) or product
            co-occurrence (False). *default* False.
        asymmetric: Used to calculate product proximities, indicates whether
            to generate asymmetric proximity matrix (True) or symmetric (False).
            *default* False.
        knn: Number of nearest neighbors from proximity matrix to use to calculate
            density. Will use entire proximity matrix if None.
            *default* None.
        verbose: Print each period as it is processed

    Returns:
        Pandas dataframe containing the data with the following additional columns:
            - diversity: k_c,0
            - ubiquity: k_p,0
            - rca: Balassa's RCA
            - rpop: (available if presence_test!="rca") RPOP
            - mcp: MCP used for complexity calculations
            - eci: Economic complexity index
            - pci: Product complexity index
            - density: Density of the network around each product
            - coi: Complexity Outlook Index
            - cog: Complexity Outlook Gain

    """
    cdata = ComplexityData(data, cols_input, val_errors_flag)
    cdata.output_list = []

    # Each period is processed independently and appended to output_list
    for t in cdata.data.index.unique("time"):
        if verbose:
            print(t)

        # Rectangularize df
        cdata.create_full_df(t)

        # Either take Mcp as supplied, or derive it from RCA / RPOP
        if presence_test == "manual":
            cdata.calculate_manual_mcp()
        else:
            cdata.calculate_rca()
            cdata.calculate_mcp(
                rca_mcp_threshold, rpop_mcp_threshold, presence_test, pop, t
            )

        # Diversity: products per location; ubiquity: locations per product
        cdata.diversity_t = np.nansum(cdata.mcp_t, axis=1)
        cdata.ubiquity_t = np.nansum(cdata.mcp_t, axis=0)

        # If ANY of diversity or ubiquity is 0, warn that eci and pci will be nan
        zero_diversity = np.any(cdata.diversity_t == 0)
        if zero_diversity or np.any(cdata.ubiquity_t == 0):
            warnings.warn(
                f"Year {t}: Diversity or ubiquity is 0, so ECI and PCI will be nan"
            )

        # Calculate ECI and PCI
        cdata.eci_t, cdata.pci_t = calc_eci_pci(cdata)

        # Calculate proximity and density. The explicit comparisons with
        # False / True are kept so that non-boolean values are handled
        # exactly as before.
        if continuous == False:
            prox_mat = calc_discrete_proximity(
                cdata.mcp_t, cdata.ubiquity_t, asymmetric
            )
            cdata.density_t = calc_density(
                rca_or_mcp=cdata.mcp_t, proximity_mat=prox_mat, knn=knn
            )
        elif continuous == True:
            # Continuous proximity is based on RPOP only for the "rpop" test
            if presence_test == "rpop":
                intensity = cdata.rpop_t
            else:
                intensity = cdata.rca_t
            prox_mat = calc_continuous_proximity(intensity, cdata.ubiquity_t)
            cdata.density_t = calc_density(
                rca_or_mcp=intensity, proximity_mat=prox_mat, knn=knn
            )

        # Calculate COI and COG (uses the un-normalized PCI)
        cdata.coi_t, cdata.cog_t = calc_coi_cog(cdata, prox_mat)

        # Normalize variables as per STATA package
        # Normalization using ECI mean and std. dev. preserves the property that
        # ECI = (mean of PCI of products for which MCP=1)
        eci_mean = cdata.eci_t.mean()
        eci_std = cdata.eci_t.std()
        cdata.pci_t = (cdata.pci_t - eci_mean) / eci_std
        cdata.cog_t = cdata.cog_t / eci_std
        cdata.eci_t = (cdata.eci_t - eci_mean) / eci_std

        # COI is standardized with its own mean and std. dev.
        cdata.coi_t = (cdata.coi_t - cdata.coi_t.mean()) / cdata.coi_t.std()

        # Reshape ndarrays to df
        cdata = reshape_output_to_data(cdata, t)

    # Stack all periods and restore the caller's column names
    cdata.output = pd.concat(cdata.output_list)
    cdata = conform_to_original_data(cdata, data)

    return cdata.output
240 | 


--------------------------------------------------------------------------------
/ecomplexity/proximity.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | from ecomplexity.calc_proximity import calc_discrete_proximity
  4 | from ecomplexity.calc_proximity import calc_continuous_proximity
  5 | from ecomplexity.ComplexityData import ComplexityData
  6 | 
  7 | 
  8 | def proximity(
  9 |     data,
 10 |     cols_input,
 11 |     presence_test="rca",
 12 |     val_errors_flag="coerce",
 13 |     rca_mcp_threshold=1,
 14 |     rpop_mcp_threshold=1,
 15 |     pop=None,
 16 |     continuous=False,
 17 |     asymmetric=False,
 18 | ):
 19 |     """Wrapper function to calculate product proximity matrices
 20 | 
 21 |         Args:
 22 |             data: pandas df with cols 'time','loc','prod','val'
 23 |             cols_input: dict of column names for time, location, product and value.
 24 |                 Example: {'time':'year', 'loc':'origin', 'prod':'hs92', 'val':'export_val'}
 25 |             presence_test: str for test used for presence of industry in location.
 26 |                 One of "rca" (default), "rpop", "both", or "manual".
 27 |                 Determines which values are used for M_cp calculations.
 28 |                 If "manual", M_cp is taken as given from the "value" column in data
 29 |             val_errors_flag: {'coerce','ignore','raise'}. Passed to pd.to_numeric
 30 |                 *default* coerce.
 31 |             rca_mcp_threshold: numeric indicating RCA threshold beyond which mcp is 1.
 32 |                 *default* 1.
 33 |             rpop_mcp_threshold: numeric indicating RPOP threshold beyond which mcp is 1.
 34 |                 *default* 1. Only used if presence_test is not "rca".
 35 |             pop: pandas df, with time, location and corresponding population, in that order.
 36 |                 Not required if presence_test is "rca" (default).
 37 |             continuous: Whether to consider correlation of every product pair (True)
 38 |                 or product co-occurrence (False). *default* False.
 39 |             asymmetric: Whether to generate asymmetric proximity matrix (True) or
 40 |                 symmetric (False). *default* False.
 41 | 
 42 |         Returns:
 43 |             pandas df with proximity values for every product pair
 44 |     """
 45 | 
 46 |     cdata = ComplexityData(data, cols_input, val_errors_flag)
 47 | 
 48 |     output_list = []
 49 | 
 50 |     # Iterate over time stamps
 51 |     for t in cdata.data.index.unique("time"):
 52 |         print(t)
 53 |         # Rectangularize df
 54 |         cdata.create_full_df(t)
 55 | 
 56 |         # Check if Mcp is pre-computed
 57 |         if presence_test != "manual":
 58 |             cdata.calculate_rca()
 59 |             cdata.calculate_mcp(
 60 |                 rca_mcp_threshold, rpop_mcp_threshold, presence_test, pop, t
 61 |             )
 62 |         else:
 63 |             cdata.calculate_manual_mcp()
 64 | 
 65 |         # Calculate diversity and ubiquity
 66 |         cdata.diversity_t = np.nansum(cdata.mcp_t, axis=1)
 67 |         cdata.ubiquity_t = np.nansum(cdata.mcp_t, axis=0)
 68 | 
 69 |         # Calculate proximity
 70 |         if continuous == False:
 71 |             prox_mat = calc_discrete_proximity(
 72 |                 cdata.mcp_t, cdata.ubiquity_t, asymmetric
 73 |             )
 74 |         elif continuous == True and presence_test == "rpop":
 75 |             prox_mat = calc_continuous_proximity(cdata.rpop_t, cdata.ubiquity_t)
 76 |         elif continuous == True and presence_test != "rpop":
 77 |             prox_mat = calc_continuous_proximity(cdata.rca_t, cdata.ubiquity_t)
 78 | 
 79 |         # Reshape as df
 80 |         output_index = pd.MultiIndex.from_product(
 81 |             [cdata.data_t.index.levels[1], cdata.data_t.index.levels[1]],
 82 |             names=["prod1", "prod2"],
 83 |         )
 84 |         output = pd.DataFrame(data={"proximity": prox_mat.ravel()}, index=output_index)
 85 |         output["time"] = t
 86 |         output_list.append(output)
 87 | 
 88 |     output = pd.concat(output_list)
 89 | 
 90 |     # Remove entries for product's proximity with itself
 91 |     output = output.reset_index()
 92 |     output.columns = ["prod1", "prod2", "proximity", "time"]
 93 |     output = output[["time", "prod1", "prod2", "proximity"]]
 94 |     output = output[output.prod1 != output.prod2]
 95 | 
 96 |     # Rename based on original product column name
 97 |     output = output.rename(
 98 |         columns={
 99 |             "prod1": cdata.cols_input["prod"] + "_1",
100 |             "prod2": cdata.cols_input["prod"] + "_2",
101 |             "time": cdata.cols_input["time"],
102 |         }
103 |     )
104 | 
105 |     return output
106 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.5.2
2 | numpy==1.23.5
3 | scikit-learn==1.2.0


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
 4 | def readme():
 5 |     with open("README.md") as f:
 6 |         return f.read()
 7 | 
 8 | setup(name='ecomplexity',
 9 |       version='0.5.2',
10 |       description='Package to calculate economic complexity and associated variables',
11 |       long_description=readme(),
12 |       long_description_content_type='text/markdown',
13 |       url='https://github.com/cid-harvard/py-ecomplexity',
14 |       author='Shreyas Gadgin Matha',
15 |       author_email='shreyas.gm61@gmail.com',
16 |       license='MIT',
17 |       packages=find_packages(),
18 |       keywords="pandas python networks economics complexity",
19 |       python_requires='>=3',
20 |       install_requires=[
21 |           'pandas >0.23.0',
22 |           'numpy >1.22.0'
23 |           'scikit-learn >1.0.0'
24 |       ],
25 |       zip_safe=False,
26 |       classifiers=[
27 |           "Programming Language :: Python :: 3",
28 |           "License :: OSI Approved :: MIT License",
29 |           "Operating System :: OS Independent",
30 |       ])
31 | 


--------------------------------------------------------------------------------