├── README.md
├── data_generator.py
└── main.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Causal-Transfer-Learning
 2 | 
 3 | ## 简介
 4 | 
 5 | Causal-Transfer-Learning 是一种因果迁移学习方法。其通过生成代表不同人群的数据集，在样本量足够的数据集中训练多层感知机分类器模型，并评估其在样本量稀疏的数据集上的表现。
 6 | 
 7 | 假设已有大量（10000个）50岁左右群体的样本，以及少量（5个）30岁左右群体的样本。通过分别估计吸烟对血压在两个人群中的影响，并据此调整源域数据，将大量50岁左右群体的样本迁移至30岁左右群体，以扩充样本量，从而更准确地预测30岁左右群体是否患有心脏病。
 8 | 
 9 | ## 环境安装
10 | 
11 | 推荐使用conda安装虚拟环境，推荐使用ubuntu系统，在命令行中运行：
12 | ```bash
13 | conda create -n ctl python=3.9
14 | conda activate ctl
15 | pip install numpy pandas scikit-learn statsmodels
16 | ```
17 | 
18 | ## 运行
19 | 运行：
20 | ```bash
21 | python main.py
22 | ```
23 | 
24 | 


--------------------------------------------------------------------------------
/data_generator.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | def generate_data(n, bias_smoking=0, age=50):
 6 |     smoking = np.random.binomial(1, 0.3, n)
 7 |     exercise = np.random.binomial(1, 0.5, n)
 8 |     age = np.random.normal(age, 5, n)  # 平均年龄50，标准差10
 9 |     gender = np.random.binomial(1, 0.5, n)  # 0为女性，1为男性
10 |     diet = np.random.binomial(1, 0.4, n)  # 健康饮食为1，否则为0
11 |     cholesterol = np.random.normal(200, 30, n)  # 均值200，标准差30
12 | 
13 |     blood_pressure = np.random.normal(120 + bias_smoking * smoking + 0.5 * age, 10, n)
14 |     # 计算心脏病发病概率，确保其在0和1之间
15 |     p_heart_disease = 0.25 * (blood_pressure - 120) / 10 + 0.2 * (1 - exercise) + 0.05 * gender + 0.05 * (1 - diet) + \
16 |                       0.02 * (cholesterol - 200) / 30 - 0.3
17 |     #print(p_heart_disease)
18 |     p_heart_disease = np.clip(p_heart_disease, 0, 1)  # 将概率限制在0和1之间
19 |     heart_disease = np.random.binomial(1, p_heart_disease, n)
20 | 
21 |     return pd.DataFrame({
22 |         'smoking': smoking,
23 |         'exercise': exercise,
24 |         'age': age,
25 |         'gender': gender,
26 |         'diet': diet,
27 |         'cholesterol': cholesterol,
28 |         'blood_pressure': blood_pressure,
29 |         'heart_disease': heart_disease
30 |     })


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.neural_network import MLPClassifier
 3 | import statsmodels.api as sm
 4 | import data_generator as dg
 5 | import warnings
 6 | warnings.filterwarnings('ignore')
 7 | 
# Seed NumPy's global RNG before any data is generated.
np.random.seed(42)


# Source domain: 10000 samples generated with age=50 and bias_smoking=35.
data_source = dg.generate_data(10000, bias_smoking=35,age=50)
# Target domain, scarce sample: only 5 rows, generated with age=30 and bias_smoking=5.
data_target = dg.generate_data(5, bias_smoking=5,age=30)
# Larger target-domain sample (1000 rows, same settings); every model below is scored on it.
data_target2 = dg.generate_data(1000, bias_smoking=5,age=30)
14 | 
def train_model(data):
    """Fit an MLP classifier that predicts ``heart_disease`` from the seven feature columns.

    Args:
        data: DataFrame holding the feature columns listed below and a
            ``heart_disease`` label column.

    Returns:
        The fitted ``MLPClassifier`` (two hidden layers of 50 units, max_iter=1000).
    """
    feature_columns = ['smoking', 'exercise', 'age', 'gender', 'diet', 'cholesterol', 'blood_pressure']
    features = data[feature_columns]
    labels = data['heart_disease']

    classifier = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)
    classifier.fit(features, labels)
    return classifier
21 | 
def evaluate_model(model, data):
    """Return ``model.score`` computed on the feature columns and labels of ``data``.

    Args:
        model: Object exposing ``score(X, y)``.
        data: DataFrame holding the seven feature columns and ``heart_disease``.

    Returns:
        Whatever ``model.score`` returns for these features and labels.
    """
    feature_columns = ['smoking', 'exercise', 'age', 'gender', 'diet', 'cholesterol', 'blood_pressure']
    return model.score(data[feature_columns], data['heart_disease'])
27 | 
# Baseline: classifier trained on the unadjusted source-domain data.
model_source = train_model(data_source)


print("调整前源域模型在目标域的表现：", evaluate_model(model_source, data_target2))


# Estimate the effect of smoking on blood pressure in the source domain:
# the slope of an OLS regression of blood_pressure on smoking plus an intercept.
X_source = sm.add_constant(data_source['smoking'])
bp_model_source = sm.OLS(data_source['blood_pressure'], X_source).fit()
smoking_effect_on_bp_source = bp_model_source.params['smoking']

# Estimate the effect of blood pressure on heart disease in the source domain.
# This is OLS on the 0/1 outcome (a linear probability model). `disp` is an
# option of iterative fits such as Logit, not of OLS.fit, so it is not passed.
X_source = sm.add_constant(data_source['blood_pressure'])
heart_disease_model_source = sm.OLS(data_source['heart_disease'], X_source).fit()
bp_effect_on_heart_disease_source = heart_disease_model_source.params['blood_pressure']

# Estimate the effect of smoking on blood pressure in the target domain,
# using only the small target sample (data_target).
X_target = sm.add_constant(data_target['smoking'])
bp_model_target = sm.OLS(data_target['blood_pressure'], X_target).fit()
smoking_effect_on_bp_target = bp_model_target.params['smoking']
48 | 
49 | # 调整目标域模型
def adjust_model(data, effect_difference, effect2, age_shift=20, bp_shift=10):
    """Train an MLP on source data whose features were shifted toward the target domain.

    Blood pressure and age are shifted, heart-disease labels are re-sampled
    from the shifted features, and a fresh classifier is fitted on the result.
    ``data`` itself is not modified.

    Args:
        data: DataFrame holding the seven feature columns used below.
        effect_difference: Added to blood pressure for rows with ``smoking == 1``
            (multiplied by the ``smoking`` column).
        effect2: Accepted for interface compatibility; not used by this function.
        age_shift: Subtracted from ``age``. Defaults to 20, the constant that
            was previously hard-coded.
        bp_shift: Subtracted from ``blood_pressure``. Defaults to 10, the
            constant that was previously hard-coded.

    Returns:
        The ``MLPClassifier`` fitted on the adjusted features and re-sampled labels.
    """
    feature_columns = ['smoking', 'exercise', 'age', 'gender', 'diet', 'cholesterol', 'blood_pressure']

    # Work on an explicit copy: assigning columns on a bare column selection
    # triggers pandas' chained-assignment warning.
    X = data[feature_columns].copy()
    X['blood_pressure'] = data['blood_pressure'] + effect_difference * data['smoking'] - bp_shift
    X['age'] = data['age'] - age_shift

    # Linear score for the heart-disease probability, computed from the
    # adjusted features and clipped to [0, 1] before sampling.
    p_heart_disease = (
        0.25 * (X['blood_pressure'] - 120) / 10
        + 0.2 * (1 - X['exercise'])
        + 0.05 * X['gender']
        + 0.05 * (1 - X['diet'])
        + 0.02 * (X['cholesterol'] - 200) / 30
        - 0.3
    )
    p_heart_disease = np.clip(p_heart_disease, 0, 1)

    # Labels are re-drawn from the adjusted probabilities; the original
    # heart_disease column is not used for training.
    y = np.random.binomial(1, p_heart_disease, len(X))

    model = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)
    model.fit(X, y)
    return model
66 | 
# Difference between the target and source estimates of smoking's effect on
# blood pressure; adjust_model adds it to the blood pressure of smokers.
effect_difference = smoking_effect_on_bp_target - smoking_effect_on_bp_source

# Train on the adjusted source data. The third argument is accepted by
# adjust_model but is not used in its body.
model_adjusted = adjust_model(data_source, effect_difference, bp_effect_on_heart_disease_source)

print("源域数据调整后模型在目标域的表现：", evaluate_model(model_adjusted, data_target2))

# Comparison: a classifier trained only on the 5-row target sample.
model3=train_model(data_target)

print("目标域样本训练模型在目标域的表现：", evaluate_model(model3, data_target2))
78 | 
79 | 


--------------------------------------------------------------------------------