"""Synthetic cohort generator for the Causal-Transfer-Learning demo.

Produces tabular samples in which smoking and age drive blood pressure,
and blood pressure together with lifestyle covariates drives the
probability of heart disease.
"""

import numpy as np
import pandas as pd


def generate_data(n, bias_smoking=0, age=50, random_state=None):
    """Draw ``n`` synthetic individuals from the heart-disease model.

    Args:
        n: Number of rows to generate.
        bias_smoking: Additive shift of mean blood pressure for smokers.
        age: Mean age of the cohort; individual ages are drawn around it
            with a standard deviation of 5.
        random_state: ``None`` (default) draws from the global ``np.random``
            stream, so ``np.random.seed`` keeps controlling the output.
            An int seed or a ``np.random.RandomState`` instance gives an
            isolated, reproducible draw instead.

    Returns:
        A ``pandas.DataFrame`` with the columns ``smoking``, ``exercise``,
        ``age``, ``gender``, ``diet``, ``cholesterol``, ``blood_pressure``
        and ``heart_disease`` (one row per individual).
    """
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # The draw order below is part of the contract: changing it changes
    # every sample produced under a fixed seed.
    smoking = rng.binomial(1, 0.3, n)
    exercise = rng.binomial(1, 0.5, n)
    # Kept in a separate name so the cohort-mean parameter is not shadowed.
    ages = rng.normal(age, 5, n)  # mean `age`, standard deviation 5
    gender = rng.binomial(1, 0.5, n)  # 0 = female, 1 = male
    diet = rng.binomial(1, 0.4, n)  # 1 = healthy diet, 0 otherwise
    cholesterol = rng.normal(200, 30, n)  # mean 200, standard deviation 30

    # Mean blood pressure rises with smoking (by `bias_smoking`) and age.
    blood_pressure = rng.normal(120 + bias_smoking * smoking + 0.5 * ages, 10, n)

    # Linear risk score, clipped afterwards so it is a valid probability.
    p_heart_disease = (
        0.25 * (blood_pressure - 120) / 10
        + 0.2 * (1 - exercise)
        + 0.05 * gender
        + 0.05 * (1 - diet)
        + 0.02 * (cholesterol - 200) / 30
        - 0.3
    )
    p_heart_disease = np.clip(p_heart_disease, 0, 1)
    heart_disease = rng.binomial(1, p_heart_disease, n)

    return pd.DataFrame({
        'smoking': smoking,
        'exercise': exercise,
        'age': ages,
        'gender': gender,
        'diet': diet,
        'cholesterol': cholesterol,
        'blood_pressure': blood_pressure,
        'heart_disease': heart_disease,
    })
"""Causal transfer-learning demo.

Trains an MLP on a large source cohort, measures how well it transfers to
a target cohort, and estimates the smoking -> blood-pressure effect in
both cohorts so the source data can be shifted toward the target.
"""
import warnings

import numpy as np
import statsmodels.api as sm
from sklearn.neural_network import MLPClassifier

import data_generator as dg

warnings.filterwarnings('ignore')

np.random.seed(42)

# Single source of truth for the model inputs and the label column.
FEATURE_COLUMNS = ['smoking', 'exercise', 'age', 'gender', 'diet', 'cholesterol', 'blood_pressure']
LABEL_COLUMN = 'heart_disease'

# Large source cohort, a 5-row target training sample, and a larger target
# cohort that is only used for evaluation. Generation order matters because
# all three draws share the seeded global NumPy stream.
data_source = dg.generate_data(10000, bias_smoking=35, age=50)
data_target = dg.generate_data(5, bias_smoking=5, age=30)
data_target2 = dg.generate_data(1000, bias_smoking=5, age=30)


def train_model(data):
    """Fit a (50, 50) MLP classifier on the feature columns of ``data``.

    Args:
        data: DataFrame containing ``FEATURE_COLUMNS`` and ``LABEL_COLUMN``.

    Returns:
        The fitted ``MLPClassifier``.
    """
    model = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)
    model.fit(data[FEATURE_COLUMNS], data[LABEL_COLUMN])
    return model


def evaluate_model(model, data):
    """Return ``model.score`` of ``model`` on the labelled rows of ``data``."""
    return model.score(data[FEATURE_COLUMNS], data[LABEL_COLUMN])


def _fit_linear_effect(data, cause, outcome):
    """Fit ``outcome ~ const + cause`` by OLS and return the fitted results."""
    design = sm.add_constant(data[cause])
    return sm.OLS(data[outcome], design).fit()


model_source = train_model(data_source)

print("调整前源域模型在目标域的表现:", evaluate_model(model_source, data_target2))

# Effect of smoking on blood pressure in the source cohort.
bp_model_source = _fit_linear_effect(data_source, 'smoking', 'blood_pressure')
smoking_effect_on_bp_source = bp_model_source.params['smoking']

# Effect of blood pressure on heart disease in the source cohort. This is an
# OLS fit on a binary outcome (the original note mentions Logit); `disp` is
# an option of iterative optimisers and has no role in an OLS fit, so it is
# not passed here.
heart_disease_model_source = _fit_linear_effect(data_source, 'blood_pressure', 'heart_disease')
bp_effect_on_heart_disease_source = heart_disease_model_source.params['blood_pressure']

# Effect of smoking on blood pressure in the target cohort.
# NOTE(review): with only 5 target rows the smoking column can be constant,
# in which case this slope is not identifiable - confirm the estimate is
# meaningful before relying on it.
bp_model_target = _fit_linear_effect(data_target, 'smoking', 'blood_pressure')
smoking_effect_on_bp_target = bp_model_target.params['smoking']

# Adjust the model for the target cohort.
def adjust_model(data, effect_difference, effect2, age_shift=20, bp_shift=10):
    """Train an MLP on source data shifted toward the target cohort.

    Blood pressure is corrected by the difference in the smoking effect
    between cohorts and by a constant offset, ages are moved down by
    ``age_shift``, and labels are re-sampled from the risk formula applied
    to the shifted features.

    Args:
        data: Source DataFrame with the seven feature columns.
        effect_difference: Target-minus-source smoking effect on blood
            pressure; added to the blood pressure of smokers.
        effect2: Unused; kept so existing callers keep working.
        age_shift: Amount subtracted from every age (default 20).
        bp_shift: Constant subtracted from every blood pressure
            (default 10).

    Returns:
        An ``MLPClassifier`` fitted on the shifted features and the
        re-sampled labels. ``data`` itself is not modified.
    """
    feature_columns = ['smoking', 'exercise', 'age', 'gender', 'diet', 'cholesterol', 'blood_pressure']

    # Explicit copy: the columns assigned below must land on an independent
    # frame, not on something pandas may treat as a view of `data`.
    X = data[feature_columns].copy()
    X['blood_pressure'] = data['blood_pressure'] + effect_difference * data['smoking'] - bp_shift
    X['age'] = data['age'] - age_shift

    # Same linear risk score as the data generator, evaluated on the shifted
    # features and clipped so it is a valid probability.
    p_heart_disease = (
        0.25 * (X['blood_pressure'] - 120) / 10
        + 0.2 * (1 - X['exercise'])
        + 0.05 * X['gender']
        + 0.05 * (1 - X['diet'])
        + 0.02 * (X['cholesterol'] - 200) / 30
        - 0.3
    )
    p_heart_disease = np.clip(p_heart_disease, 0, 1)
    y = np.random.binomial(1, p_heart_disease, len(data))

    model = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)
    model.fit(X, y)
    return model


effect_difference = smoking_effect_on_bp_target - smoking_effect_on_bp_source

model_adjusted = adjust_model(data_source, effect_difference, bp_effect_on_heart_disease_source)

print("源域数据调整后模型在目标域的表现:", evaluate_model(model_adjusted, data_target2))

# Baseline: train directly on the tiny target sample.
model3 = train_model(data_target)

print("目标域样本训练模型在目标域的表现:", evaluate_model(model3, data_target2))