├── example.xlsx ├── README.md └── dagum_gini.py /example.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ObbaLX/Dagum-Gini/HEAD/example.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dagum-Gini Project 2 | 3 | ## 项目简介 4 | 1. Dagum(1997)在文章《A new approach to the decomposition of the Gini income inequality ratio》中将传统 Gini 系数分解为三个组成部分,即**组内差距**、**组间差距**以及**超变密度**。 5 | 2. William Griffiths (2008) 指出 Dagum(1997)提出的分解 Gini 系数的新方法用于测量组间和组内的不平等贡献,其分解结果与文献中常用的传统分解方法相同。 6 | 7 | --- 8 | 9 | ## 项目输入 10 | - 数据输入参考传统的使用 Excel 宏计算 Gini 系数的数据格式。 11 | - 数据主要分为四列:`index`、`year`、`group`、`value`。 12 | - `value`:我们关注的差距(例如收入数据)。 13 | - `group`:对数据进行分组的标识。 14 | 15 | ### 注意事项 16 | 1. 数据的构建形式要与输入要求严格一致,尤其是列名。 17 | 2. 数据不能有缺失值。 18 | 3. 数据不能有负值(根据 Gini 系数的要求)。 19 | 20 | ### 示例数据 21 | | index | year | group | value | 22 | |-------|------|-------|--------| 23 | | 0 | 2011 | 1 | 0.7784 | 24 | | 1 | 2011 | 1 | 0.6208 | 25 | | 2 | 2011 | 3 | 0.4486 | 26 | | 3 | 2011 | 2 | 0.4822 | 27 | 28 | --- 29 | 30 | ## 项目输出 31 | 如果运行成功,会输出一个表格如下: 32 | 33 | | year | 总体 | 1 | 2 | 3 | 1-2 | 1-3 | 2-3 | 组内 | 组间 | 超变密度 | 34 | |------|--------|-------|-------|-------|-------|-------|-------|--------|--------|--------| 35 | | 2011 | 0.2344 | 0.2651 | 0.1914 | 0.1628 | 0.2909 | 0.2902 | 0.1828 | 28.31% | 43.63% | 28.06% | 36 | | 2012 | 0.2348 | 0.2662 | 0.1972 | 0.1569 | 0.2957 | 0.2889 | 0.1833 | 28.14% | 44.35% | 27.51% | 37 | | 2013 | 0.2396 | 0.2641 | 0.207 | 0.1507 | 0.3023 | 0.3038 | 0.1866 | 27.37% | 48.10% | 24.53% | 38 | | 2014 | 0.2365 | 0.2605 | 0.2048 | 0.1481 | 0.2998 | 0.3000 | 0.1834 | 27.33% | 49.59% | 23.08% | 39 | | 2015 | 0.2438 | 0.2753 | 0.202 | 0.1559 | 0.3144 | 0.3082 | 0.1858 | 27.36% | 48.03% | 24.61% | 40 | | 2016 | 0.255 | 0.2822 | 0.1946 | 0.1868 | 0.3185 | 0.3234 | 0.1961 | 
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 16 21:40:59 2021

@author: Codeonce

desc: Dagum Gini coefficient decomposition
"""
from itertools import combinations

import numpy as np
import pandas as pd


def _abs_diff_sum(a, b):
    """Return the sum of |x - y| over every pair with x in ``a`` and y in ``b``."""
    b = np.asarray(b, dtype=float)
    # One vectorised row at a time keeps the working memory linear in len(b).
    return sum(np.abs(x - b).sum() for x in np.asarray(a, dtype=float))


class DagumGini:
    """Dagum decomposition of the Gini coefficient for one cross-section.

    The overall Gini coefficient is split into a within-group part
    (``gini_w``), a net between-group part (``gini_nb``) and a
    transvariation part (``gini_t``); ``gini_gb`` is the gross between-group
    contribution, i.e. ``gini_nb + gini_t``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``group`` column (group label of each observation)
        and a ``value`` column (the quantity whose inequality is measured).
        Group labels may be of any hashable, sortable type.

    Attributes
    ----------
    μ : float
        Mean of ``value`` over the whole sample.
    group_num : pandas.DataFrame
        Number of observations per group (column ``value``).
    λi, μi, si : pandas.DataFrame
        Per-group population share (column ``λ``), mean (column ``μ``) and
        share of the total of ``value`` (column ``s``).
    Gini_ij : pandas.DataFrame
        Square table indexed by group label. The diagonal is filled by
        ``gini_w`` and the off-diagonal cells by ``gini_gb``.

    Notes
    -----
    The four ``gini_*`` methods may be called in any order and any number of
    times; each result is computed once and then cached on the instance.
    """

    def __init__(self, data):
        self.data = data
        values = data['value']
        # Select 'value' explicitly so extra columns in `data` cannot leak
        # into the per-group statistics.
        by_group = data.groupby('group')[['value']]

        self.μ = values.mean()  # overall sample mean
        self.group_num = by_group.count()  # observations per group
        self.λi = (self.group_num / self.group_num.sum()).rename(
            columns={'value': 'λ'})  # population share of each group
        self.μi = by_group.mean().rename(columns={'value': 'μ'})  # group means
        self.si = (by_group.sum() / values.sum()).rename(
            columns={'value': 's'})  # share of each group in the total

        labels = self.group_num.index
        self.Gini_ij = pd.DataFrame(index=labels, columns=labels, dtype=float)

        # Cached results; kept under private names so that they do not
        # shadow the public methods of the same name.
        self._gini_w = None
        self._gini_gb = None
        self._gini_nb = None

    def _group_values(self, label):
        """Return the ``value`` observations of group ``label`` as a float array."""
        mask = self.data['group'] == label
        return self.data.loc[mask, 'value'].to_numpy(dtype=float)

    def gini_w(self):
        """Within-group contribution to the overall Gini coefficient.

        Returns
        -------
        tuple
            ``(G_w, Gini_ij)`` where ``G_w`` is the sum over groups of
            ``λ_j * s_j * G_jj`` and ``Gini_ij`` has its diagonal filled
            with the within-group Gini coefficients ``G_jj``.
        """
        if self._gini_w is None:
            total = 0.0
            for g in self.group_num.index:
                x = self._group_values(g)
                n = self.group_num.at[g, 'value']
                gini_gg = _abs_diff_sum(x, x) / (2 * n ** 2 * self.μi.at[g, 'μ'])
                self.Gini_ij.at[g, g] = gini_gg
                total += self.λi.at[g, 'λ'] * self.si.at[g, 's'] * gini_gg
            self._gini_w = total
        return self._gini_w, self.Gini_ij

    def gini_gb(self):
        """Gross between-group contribution to the overall Gini coefficient.

        Returns
        -------
        tuple
            ``(G_gb, Gini_ij)`` where ``G_gb`` is the sum over unordered
            group pairs of ``(λ_j * s_i + λ_i * s_j) * G_ij`` and
            ``Gini_ij`` has its off-diagonal cells filled with the
            between-group Gini coefficients ``G_ij``.
        """
        if self._gini_gb is None:
            total = 0.0
            # G_ij is symmetric, so each unordered pair is computed once and
            # mirrored into both cells of the table.
            for i, j in combinations(self.group_num.index, 2):
                pair_sum = _abs_diff_sum(self._group_values(i),
                                         self._group_values(j))
                g_ij = pair_sum / (
                    (self.μi.at[i, 'μ'] + self.μi.at[j, 'μ'])
                    * self.group_num.at[i, 'value']
                    * self.group_num.at[j, 'value']
                )
                self.Gini_ij.at[i, j] = g_ij
                self.Gini_ij.at[j, i] = g_ij
                total += (self.λi.at[j, 'λ'] * self.si.at[i, 's']
                          + self.λi.at[i, 'λ'] * self.si.at[j, 's']) * g_ij
            self._gini_gb = total
        return self._gini_gb, self.Gini_ij

    def gini_nb(self):
        """Net between-group contribution to the overall Gini coefficient.

        Computed as the sum over unordered group pairs of
        ``λ_i * λ_j * |μ_i - μ_j|`` divided by the overall mean. Groups are
        addressed by position, so the labels need not be ``1..k``.
        """
        if self._gini_nb is None:
            shares = self.λi['λ'].to_numpy(dtype=float)
            means = self.μi['μ'].to_numpy(dtype=float)
            weighted_gaps = (np.outer(shares, shares)
                             * np.abs(np.subtract.outer(means, means)))
            # Strict upper triangle: every unordered pair counted once.
            self._gini_nb = np.triu(weighted_gaps, k=1).sum() / self.μ
        return self._gini_nb

    def gini_t(self):
        """Transvariation contribution: gross minus net between-group part."""
        gross, _ = self.gini_gb()
        return gross - self.gini_nb()


def calculate_dg(data):
    """Return the Dagum Gini decomposition of ``data`` as a table, one row per year.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``year``, ``group`` and ``value``.

    Returns
    -------
    pandas.DataFrame
        Indexed by year (in order of first appearance). Columns are the
        overall Gini (``总体``), one column per group label holding the
        within-group Gini, one ``"i-j"`` column per ordered pair of distinct
        groups holding the between-group Gini, and the percentage
        contributions of the within (``组内``), net between (``组间``) and
        transvariation (``超变密度``) parts formatted as strings.
        Numeric cells are rounded to 4 decimals.
    """
    years = list(data['year'].drop_duplicates())
    rows = []
    for year in years:
        subset = data.loc[data['year'] == year, ['group', 'value']]

        dg = DagumGini(subset)
        gini_w, _ = dg.gini_w()
        gini_gb, gini_ij = dg.gini_gb()
        gini_nb = dg.gini_nb()
        gini_t = dg.gini_t()
        gini_sum = gini_w + gini_gb

        row = {'总体': round(gini_sum, 4)}
        # Within-group Gini coefficients.
        for g in gini_ij.index:
            row[g] = round(gini_ij.loc[g, g], 4)
        # Between-group Gini coefficients.
        for i in gini_ij.index:
            for j in gini_ij.index:
                if i != j:
                    row['{}-{}'.format(i, j)] = round(gini_ij.loc[i, j], 4)
        row['组内'] = '{:.2%}'.format(gini_w / gini_sum)
        row['组间'] = '{:.2%}'.format(gini_nb / gini_sum)
        row['超变密度'] = '{:.2%}'.format(gini_t / gini_sum)
        rows.append(row)

    # Build the table in one go instead of growing it cell by cell.
    return pd.DataFrame(rows, index=years)


if __name__ == '__main__':
    data = pd.read_excel('./example.xlsx', index_col=0)
    gini_table = calculate_dg(data)