├── README.md
├── sentiment.py
├── Logistic regression.py
├── Random Forest.py
└── XGB.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Predict_IPO-UnderpricingRisk
2 | 将上市公司招股说明书的情绪分析结果与常用财务指标、企业科研指标等相结合,综合使用多种分类模型(传统LR、随机森林,以及XGB、LGB等集成学习模型)对新上市公司的破发情况进行学习和预测,并筛选重要特征,由此得到一个新股破发分类器。
3 |
--------------------------------------------------------------------------------
/sentiment.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pandas as pd
4 | import numpy as np
5 | import re
6 | import jieba
7 | from jieba import analyse
8 | from collections import Counter
9 |
# Read every text file under ./data/ into a single DataFrame
# (one row per file: stock identifier, file name, full text).
file_folder = "./data/"
rows = []
# sorted() keeps the row order reproducible; os.listdir() order is arbitrary.
for file in sorted(os.listdir(file_folder)):
    file_path = os.path.join(file_folder, file)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be opened as text; skip them.
        continue
    # The context manager closes the handle even if read() raises.
    with open(file_path, encoding="utf-8") as f:
        content = f.read()
    # The first six characters of the file name are used as the stock
    # identifier (presumably the 6-digit stock code -- confirm naming scheme).
    stock = file[0:6]
    rows.append([stock, file, content])

# Build the frame once instead of concatenating inside the loop; this also
# yields a clean 0..n-1 index and well-defined columns when the folder is empty.
data = pd.DataFrame(rows, columns=["stock", "filename", "content"])

from cnsenti import Sentiment

# Sentiment counter backed by the custom positive / negative word lists.
senti = Sentiment(pos='./dict/formal_pos.txt',
                  neg='./dict/formal_neg.txt',
                  merge=False,
                  encoding='utf-8')

# Run the sentiment count once per document, then attach each reported
# statistic as its own column (addressed by name, not by position).
count_columns = ["words", "sentences", "pos", "neg"]
counts = [senti.sentiment_count(content) for content in data["content"]]
for column in count_columns:
    data[column] = [result[column] for result in counts]

# Persist only the per-stock counts; the raw text is not written out.
simple = data[["stock", "words", "sentences", "pos", "neg"]]
simple.to_csv("./dict/simple.csv")
--------------------------------------------------------------------------------
/Logistic regression.py:
--------------------------------------------------------------------------------
1 | # logistic
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import statsmodels.api as sm
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.preprocessing import StandardScaler
8 | from imblearn.over_sampling import RandomOverSampler
9 | from sklearn.linear_model import LogisticRegression
10 | from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
11 | from collections import Counter
12 |
# Load the dataset and separate the features from the label column "y".
data = pd.read_csv("D.csv")
X = data.drop(["y"], axis=1)  # features
y = data["y"]  # label

# Hold out 25% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)

# Standardize features; the scaler is fitted on the training split only so
# that no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Randomly oversample the minority class up to a 1:1 ratio.
# random_state is fixed so the reported metrics are reproducible across runs.
oversampler = RandomOverSampler(sampling_strategy=1, random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# Build and train the logistic regression model.
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_resampled, y_train_resampled)

# Predict on the (un-resampled) test set.
y_pred = logistic_model.predict(X_test_scaled)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)

# F1 score
f1 = f1_score(y_test, y_pred)
print("模型F1分数:", f1)

# AUC, computed from the predicted probability of the positive class.
y_pred_prob = logistic_model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print("模型AUC:", auc)

# Refit with statsmodels to obtain coefficient statistics.
# The resampled matrix is wrapped in a DataFrame carrying the original feature
# names so the coefficient table is labeled by feature instead of x1..xn, and
# it is kept in a separate variable so X_train_resampled is not overwritten.
# The index is copied from the resampled labels (when they are a pandas
# object) so statsmodels sees aligned endog/exog indices.
X_train_design = sm.add_constant(
    pd.DataFrame(
        X_train_resampled,
        columns=X.columns,
        index=getattr(y_train_resampled, "index", None),
    )
)  # add the intercept column
logit_model = sm.Logit(y_train_resampled, X_train_design)
result = logit_model.fit()

# Coefficient table (second table of summary2()).
coef_summary = result.summary2().tables[1]

print(coef_summary)
61 |
--------------------------------------------------------------------------------
/Random Forest.py:
--------------------------------------------------------------------------------
1 | # Random forest
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import graphviz
6 | from sklearn.preprocessing import StandardScaler
7 | from sklearn.model_selection import train_test_split
8 | from imblearn.under_sampling import RandomUnderSampler
9 | from imblearn.over_sampling import RandomOverSampler
10 | from collections import Counter
11 | from sklearn.ensemble import RandomForestClassifier
12 | from sklearn.metrics import accuracy_score
13 | from sklearn.metrics import f1_score
14 | from sklearn.metrics import roc_auc_score
15 | from sklearn.tree import export_graphviz
16 |
# Preprocessing: load the data and separate the features from the label "y".
# The "是否为科创版" column is excluded from the feature set.
data = pd.read_csv("D.csv")
X = data.drop(["y", "是否为科创版"], axis=1)
y = data["y"]

# Hold out 25% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# NOTE: no standardization step here. The earlier version fitted a
# StandardScaler on the full dataset but never used its output; the model
# below has always been trained on the raw features.

# Class distribution before resampling.
print("原始类别分布:", Counter(y_train))

# Randomly oversample the minority class up to a 1:1 ratio.
# random_state is fixed so the results are reproducible across runs.
oversampler = RandomOverSampler(sampling_strategy=1, random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
print("过采样后的类别分布:", Counter(y_train_resampled))

# Random forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=50)

# Train on the resampled training set.
random_forest_model.fit(X_train_resampled, y_train_resampled)

# Predict on the (un-resampled) test set.
y_pred = random_forest_model.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)

# Accuracy restricted to test rows whose true label is 1.
# Boolean masks are converted to NumPy so they index the prediction array
# by position.
positive_mask = (y_test == 1).to_numpy()
y_test_positive = y_test[positive_mask]
y_pred_positive = y_pred[positive_mask]
accuracy_positive = accuracy_score(y_test_positive, y_pred_positive)
print("测试集中 y=1 的预测准确率:", accuracy_positive)

# Error rate restricted to test rows whose true label is 0.
negative_mask = (y_test == 0).to_numpy()
y_test_negative = y_test[negative_mask]
y_pred_negative = y_pred[negative_mask]
error_rate_negative = 1 - accuracy_score(y_test_negative, y_pred_negative)
print("测试集中 y=0 的预测错误率:", error_rate_negative)

# F1 score
f1 = f1_score(y_test, y_pred)
print("模型F1分数:", f1)

# Feature importances, sorted from most to least important.
feature_importance = random_forest_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

# AUC, computed from the predicted probability of the positive class.
y_pred_prob = random_forest_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print("模型AUC:", auc)

# Take the first tree of the forest for visualization.
first_tree = random_forest_model.estimators_[0]

# Export the tree to DOT. feature_names is passed as a plain list because
# some scikit-learn releases reject a pandas Index for this parameter.
dot_data = export_graphviz(first_tree, out_file=None,
                           feature_names=list(X.columns),
                           class_names=["0", "1"],  # adjust to the actual class labels
                           filled=True, rounded=True, special_characters=True)

# Render the tree with Graphviz and open it in the default viewer.
# A single render(view=True) avoids the second, default-named output file
# that a separate graph.view() call would create.
graph = graphviz.Source(dot_data)
graph.render("random_forest_tree", view=True)
92 |
93 |
94 |
95 |
96 |
97 | # 模型提升曲线(未完成,直接比较模型优劣)
98 |
99 | import numpy as np
100 | import pandas as pd
101 | import matplotlib.pyplot as plt
102 | from sklearn.model_selection import train_test_split
103 | from sklearn.ensemble import RandomForestClassifier
104 | from sklearn.preprocessing import StandardScaler
105 | from sklearn.metrics import accuracy_score
106 |
# Preprocessing: load the data and separate the features from the label "y".
data = pd.read_csv("D.csv")
X = data.drop("y", axis=1)
y = data["y"]

# Split first, then standardize using training-set statistics only, so that
# no information from the test rows leaks into the preprocessing step.
# The split depends only on the number of rows and random_state, so the same
# rows land in each partition as when splitting an already-scaled matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Forest sizes to compare and the test accuracy obtained for each.
n_estimators_values = [45, 90, 150, 200, 250, 300, 350, 400, 450, 500,
                       550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
accuracy_scores = []

for n_estimators in n_estimators_values:
    # Train a fresh forest of the given size and score it on the test set.
    random_forest_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    random_forest_model.fit(X_train, y_train)
    y_pred = random_forest_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Plot test accuracy against the number of trees.
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_values, accuracy_scores, marker='o')
plt.title("Random Forest Model Boosting Curve")
plt.xlabel("Number of Estimators")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()
134 |
--------------------------------------------------------------------------------
/XGB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import xgboost as xgb\n",
12 | "from sklearn.model_selection import train_test_split\n",
13 | "from sklearn.metrics import accuracy_score\n",
14 | "from sklearn.metrics import roc_auc_score"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 6,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from imblearn.over_sampling import RandomOverSampler"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 36,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from sklearn.metrics import f1_score"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 20,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "# 从CSV文件中加载数据集\n",
42 | "data = pd.read_csv('new_data.csv')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 21,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/html": [
53 | "
\n",
54 | "\n",
67 | "
\n",
68 | " \n",
69 | " \n",
70 | " \n",
71 | " 稀释每股收益 \n",
72 | " 扣除非经常性损益后的加权平均净资产收益率 \n",
73 | " 每股经营活动产生的现金流量净额 \n",
74 | " 归属于上市公司股东的每股净资产 \n",
75 | " 资产负债率 \n",
76 | " 流动比率 \n",
77 | " 速动比率 \n",
78 | " 应收账款周转率 \n",
79 | " 存货周转率 \n",
80 | " 是否有私募或风投 \n",
81 | " 当年获得专利数量 \n",
82 | " 研发费用营业收入 \n",
83 | " 政府补助营业收入 \n",
84 | " 是否为科创版 \n",
85 | " posratio \n",
86 | " negratio \n",
87 | " 首日是否破发 \n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " \n",
92 | " 0 \n",
93 | " 0.08 \n",
94 | " 1.330000 \n",
95 | " 0.10 \n",
96 | " 3.44 \n",
97 | " 47.349998 \n",
98 | " 0.94 \n",
99 | " 0.40 \n",
100 | " 45.860001 \n",
101 | " 5.230000 \n",
102 | " 0.0 \n",
103 | " 2 \n",
104 | " 0.006433 \n",
105 | " 0.010428 \n",
106 | " 0 \n",
107 | " 0.027111 \n",
108 | " 0.006456 \n",
109 | " 0 \n",
110 | " \n",
111 | " \n",
112 | " 1 \n",
113 | " 0.80 \n",
114 | " 48.110001 \n",
115 | " 0.77 \n",
116 | " NaN \n",
117 | " NaN \n",
118 | " 1.11 \n",
119 | " 0.72 \n",
120 | " 22.790001 \n",
121 | " 19.690001 \n",
122 | " 1.0 \n",
123 | " 0 \n",
124 | " 0.000000 \n",
125 | " 0.000000 \n",
126 | " 0 \n",
127 | " 0.036460 \n",
128 | " 0.005590 \n",
129 | " 0 \n",
130 | " \n",
131 | " \n",
132 | " 2 \n",
133 | " 0.33 \n",
134 | " 27.639999 \n",
135 | " 0.13 \n",
136 | " 1.28 \n",
137 | " 77.320000 \n",
138 | " 0.21 \n",
139 | " 0.17 \n",
140 | " 22.940001 \n",
141 | " 7.990000 \n",
142 | " 1.0 \n",
143 | " 5 \n",
144 | " 0.024233 \n",
145 | " 0.000623 \n",
146 | " 0 \n",
147 | " 0.019627 \n",
148 | " 0.006191 \n",
149 | " 0 \n",
150 | " \n",
151 | " \n",
152 | " 3 \n",
153 | " 0.72 \n",
154 | " 11.460000 \n",
155 | " 1.11 \n",
156 | " 6.94 \n",
157 | " 34.750000 \n",
158 | " 0.93 \n",
159 | " 0.80 \n",
160 | " 7.660000 \n",
161 | " 19.110001 \n",
162 | " 1.0 \n",
163 | " 8 \n",
164 | " 0.036983 \n",
165 | " 0.000131 \n",
166 | " 0 \n",
167 | " 0.029616 \n",
168 | " 0.007126 \n",
169 | " 0 \n",
170 | " \n",
171 | " \n",
172 | " 4 \n",
173 | " 1.73 \n",
174 | " 30.280001 \n",
175 | " 1.76 \n",
176 | " 7.32 \n",
177 | " 24.420000 \n",
178 | " 2.89 \n",
179 | " 2.32 \n",
180 | " 6.470000 \n",
181 | " 9.860000 \n",
182 | " 1.0 \n",
183 | " 0 \n",
184 | " 0.000000 \n",
185 | " 0.000000 \n",
186 | " 0 \n",
187 | " 0.027892 \n",
188 | " 0.004673 \n",
189 | " 0 \n",
190 | " \n",
191 | " \n",
192 | "
\n",
193 | "
"
194 | ],
195 | "text/plain": [
196 | " 稀释每股收益 扣除非经常性损益后的加权平均净资产收益率 每股经营活动产生的现金流量净额 归属于上市公司股东的每股净资产 资产负债率 \n",
197 | "0 0.08 1.330000 0.10 3.44 47.349998 \\\n",
198 | "1 0.80 48.110001 0.77 NaN NaN \n",
199 | "2 0.33 27.639999 0.13 1.28 77.320000 \n",
200 | "3 0.72 11.460000 1.11 6.94 34.750000 \n",
201 | "4 1.73 30.280001 1.76 7.32 24.420000 \n",
202 | "\n",
203 | " 流动比率 速动比率 应收账款周转率 存货周转率 是否有私募或风投 当年获得专利数量 研发费用营业收入 政府补助营业收入 \n",
204 | "0 0.94 0.40 45.860001 5.230000 0.0 2 0.006433 0.010428 \\\n",
205 | "1 1.11 0.72 22.790001 19.690001 1.0 0 0.000000 0.000000 \n",
206 | "2 0.21 0.17 22.940001 7.990000 1.0 5 0.024233 0.000623 \n",
207 | "3 0.93 0.80 7.660000 19.110001 1.0 8 0.036983 0.000131 \n",
208 | "4 2.89 2.32 6.470000 9.860000 1.0 0 0.000000 0.000000 \n",
209 | "\n",
210 | " 是否为科创版 posratio negratio 首日是否破发 \n",
211 | "0 0 0.027111 0.006456 0 \n",
212 | "1 0 0.036460 0.005590 0 \n",
213 | "2 0 0.019627 0.006191 0 \n",
214 | "3 0 0.029616 0.007126 0 \n",
215 | "4 0 0.027892 0.004673 0 "
216 | ]
217 | },
218 | "execution_count": 21,
219 | "metadata": {},
220 | "output_type": "execute_result"
221 | }
222 | ],
223 | "source": [
224 | "data.head()"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 22,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "# 将特征列和目标列分开\n",
234 | "X = data.drop('首日是否破发', axis=1)\n",
235 | "y = data['首日是否破发']"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 23,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/html": [
246 | "\n",
247 | "\n",
260 | "
\n",
261 | " \n",
262 | " \n",
263 | " \n",
264 | " 稀释每股收益 \n",
265 | " 扣除非经常性损益后的加权平均净资产收益率 \n",
266 | " 每股经营活动产生的现金流量净额 \n",
267 | " 归属于上市公司股东的每股净资产 \n",
268 | " 资产负债率 \n",
269 | " 流动比率 \n",
270 | " 速动比率 \n",
271 | " 应收账款周转率 \n",
272 | " 存货周转率 \n",
273 | " 是否有私募或风投 \n",
274 | " 当年获得专利数量 \n",
275 | " 研发费用营业收入 \n",
276 | " 政府补助营业收入 \n",
277 | " 是否为科创版 \n",
278 | " posratio \n",
279 | " negratio \n",
280 | " \n",
281 | " \n",
282 | " \n",
283 | " \n",
284 | " 0 \n",
285 | " 0.08 \n",
286 | " 1.330000 \n",
287 | " 0.10 \n",
288 | " 3.44 \n",
289 | " 47.349998 \n",
290 | " 0.94 \n",
291 | " 0.40 \n",
292 | " 45.860001 \n",
293 | " 5.230000 \n",
294 | " 0.0 \n",
295 | " 2 \n",
296 | " 0.006433 \n",
297 | " 0.010428 \n",
298 | " 0 \n",
299 | " 0.027111 \n",
300 | " 0.006456 \n",
301 | " \n",
302 | " \n",
303 | " 1 \n",
304 | " 0.80 \n",
305 | " 48.110001 \n",
306 | " 0.77 \n",
307 | " NaN \n",
308 | " NaN \n",
309 | " 1.11 \n",
310 | " 0.72 \n",
311 | " 22.790001 \n",
312 | " 19.690001 \n",
313 | " 1.0 \n",
314 | " 0 \n",
315 | " 0.000000 \n",
316 | " 0.000000 \n",
317 | " 0 \n",
318 | " 0.036460 \n",
319 | " 0.005590 \n",
320 | " \n",
321 | " \n",
322 | " 2 \n",
323 | " 0.33 \n",
324 | " 27.639999 \n",
325 | " 0.13 \n",
326 | " 1.28 \n",
327 | " 77.320000 \n",
328 | " 0.21 \n",
329 | " 0.17 \n",
330 | " 22.940001 \n",
331 | " 7.990000 \n",
332 | " 1.0 \n",
333 | " 5 \n",
334 | " 0.024233 \n",
335 | " 0.000623 \n",
336 | " 0 \n",
337 | " 0.019627 \n",
338 | " 0.006191 \n",
339 | " \n",
340 | " \n",
341 | " 3 \n",
342 | " 0.72 \n",
343 | " 11.460000 \n",
344 | " 1.11 \n",
345 | " 6.94 \n",
346 | " 34.750000 \n",
347 | " 0.93 \n",
348 | " 0.80 \n",
349 | " 7.660000 \n",
350 | " 19.110001 \n",
351 | " 1.0 \n",
352 | " 8 \n",
353 | " 0.036983 \n",
354 | " 0.000131 \n",
355 | " 0 \n",
356 | " 0.029616 \n",
357 | " 0.007126 \n",
358 | " \n",
359 | " \n",
360 | " 4 \n",
361 | " 1.73 \n",
362 | " 30.280001 \n",
363 | " 1.76 \n",
364 | " 7.32 \n",
365 | " 24.420000 \n",
366 | " 2.89 \n",
367 | " 2.32 \n",
368 | " 6.470000 \n",
369 | " 9.860000 \n",
370 | " 1.0 \n",
371 | " 0 \n",
372 | " 0.000000 \n",
373 | " 0.000000 \n",
374 | " 0 \n",
375 | " 0.027892 \n",
376 | " 0.004673 \n",
377 | " \n",
378 | " \n",
379 | "
\n",
380 | "
"
381 | ],
382 | "text/plain": [
383 | " 稀释每股收益 扣除非经常性损益后的加权平均净资产收益率 每股经营活动产生的现金流量净额 归属于上市公司股东的每股净资产 资产负债率 \n",
384 | "0 0.08 1.330000 0.10 3.44 47.349998 \\\n",
385 | "1 0.80 48.110001 0.77 NaN NaN \n",
386 | "2 0.33 27.639999 0.13 1.28 77.320000 \n",
387 | "3 0.72 11.460000 1.11 6.94 34.750000 \n",
388 | "4 1.73 30.280001 1.76 7.32 24.420000 \n",
389 | "\n",
390 | " 流动比率 速动比率 应收账款周转率 存货周转率 是否有私募或风投 当年获得专利数量 研发费用营业收入 政府补助营业收入 \n",
391 | "0 0.94 0.40 45.860001 5.230000 0.0 2 0.006433 0.010428 \\\n",
392 | "1 1.11 0.72 22.790001 19.690001 1.0 0 0.000000 0.000000 \n",
393 | "2 0.21 0.17 22.940001 7.990000 1.0 5 0.024233 0.000623 \n",
394 | "3 0.93 0.80 7.660000 19.110001 1.0 8 0.036983 0.000131 \n",
395 | "4 2.89 2.32 6.470000 9.860000 1.0 0 0.000000 0.000000 \n",
396 | "\n",
397 | " 是否为科创版 posratio negratio \n",
398 | "0 0 0.027111 0.006456 \n",
399 | "1 0 0.036460 0.005590 \n",
400 | "2 0 0.019627 0.006191 \n",
401 | "3 0 0.029616 0.007126 \n",
402 | "4 0 0.027892 0.004673 "
403 | ]
404 | },
405 | "execution_count": 23,
406 | "metadata": {},
407 | "output_type": "execute_result"
408 | }
409 | ],
410 | "source": [
411 | "X.head()"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 24,
417 | "metadata": {},
418 | "outputs": [
419 | {
420 | "data": {
421 | "text/plain": [
422 | "0 0\n",
423 | "1 0\n",
424 | "2 0\n",
425 | "3 0\n",
426 | "4 0\n",
427 | "Name: 首日是否破发, dtype: int64"
428 | ]
429 | },
430 | "execution_count": 24,
431 | "metadata": {},
432 | "output_type": "execute_result"
433 | }
434 | ],
435 | "source": [
436 | "y.head()"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 25,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "# 划分训练集和测试集\n",
446 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 26,
452 | "metadata": {},
453 | "outputs": [
454 | {
455 | "data": {
456 | "text/plain": [
457 | "2177 0\n",
458 | "695 0\n",
459 | "409 0\n",
460 | "2624 1\n",
461 | "1686 0\n",
462 | " ..\n",
463 | "1863 0\n",
464 | "1330 0\n",
465 | "2213 0\n",
466 | "2055 0\n",
467 | "2267 1\n",
468 | "Name: 首日是否破发, Length: 2225, dtype: int64"
469 | ]
470 | },
471 | "execution_count": 26,
472 | "metadata": {},
473 | "output_type": "execute_result"
474 | }
475 | ],
476 | "source": [
477 | "y_train"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 65,
483 | "metadata": {},
484 | "outputs": [],
485 | "source": [
486 | "# # 创建RandomOverSampler对象\n",
487 | "# oversampler = RandomOverSampler(random_state=42)\n",
488 | "\n",
489 | "# # 对训练集进行过采样\n",
490 | "# X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 27,
496 | "metadata": {},
497 | "outputs": [],
498 | "source": [
499 | "# 创建XGBoost分类模型\n",
500 | "# model = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=6.34, random_state=40)\n",
501 | "model = xgb.XGBClassifier(objective='binary:logistic', random_state=40)"
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": 28,
507 | "metadata": {},
508 | "outputs": [
509 | {
510 | "data": {
511 | "text/html": [
512 | "XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
513 | " colsample_bylevel=None, colsample_bynode=None,\n",
514 | " colsample_bytree=None, early_stopping_rounds=None,\n",
515 | " enable_categorical=False, eval_metric=None, feature_types=None,\n",
516 | " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
517 | " interaction_constraints=None, learning_rate=None, max_bin=None,\n",
518 | " max_cat_threshold=None, max_cat_to_onehot=None,\n",
519 | " max_delta_step=None, max_depth=None, max_leaves=None,\n",
520 | " min_child_weight=None, missing=nan, monotone_constraints=None,\n",
521 | " n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
522 | " predictor=None, random_state=40, ...) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBClassifier XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
523 | " colsample_bylevel=None, colsample_bynode=None,\n",
524 | " colsample_bytree=None, early_stopping_rounds=None,\n",
525 | " enable_categorical=False, eval_metric=None, feature_types=None,\n",
526 | " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
527 | " interaction_constraints=None, learning_rate=None, max_bin=None,\n",
528 | " max_cat_threshold=None, max_cat_to_onehot=None,\n",
529 | " max_delta_step=None, max_depth=None, max_leaves=None,\n",
530 | " min_child_weight=None, missing=nan, monotone_constraints=None,\n",
531 | " n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
532 | " predictor=None, random_state=40, ...) "
533 | ],
534 | "text/plain": [
535 | "XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
536 | " colsample_bylevel=None, colsample_bynode=None,\n",
537 | " colsample_bytree=None, early_stopping_rounds=None,\n",
538 | " enable_categorical=False, eval_metric=None, feature_types=None,\n",
539 | " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
540 | " interaction_constraints=None, learning_rate=None, max_bin=None,\n",
541 | " max_cat_threshold=None, max_cat_to_onehot=None,\n",
542 | " max_delta_step=None, max_depth=None, max_leaves=None,\n",
543 | " min_child_weight=None, missing=nan, monotone_constraints=None,\n",
544 | " n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
545 | " predictor=None, random_state=40, ...)"
546 | ]
547 | },
548 | "execution_count": 28,
549 | "metadata": {},
550 | "output_type": "execute_result"
551 | }
552 | ],
553 | "source": [
554 | "# 训练模型\n",
555 | "# model.fit(X_train, y_train)\n",
556 | "model.fit(X_train, y_train)"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 29,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "# 预测\n",
566 | "y_pred = model.predict(X_test)"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": 30,
572 | "metadata": {},
573 | "outputs": [
574 | {
575 | "data": {
576 | "text/plain": [
577 | "41"
578 | ]
579 | },
580 | "execution_count": 30,
581 | "metadata": {},
582 | "output_type": "execute_result"
583 | }
584 | ],
585 | "source": [
586 | "sum(y_pred)"
587 | ]
588 | },
589 | {
590 | "cell_type": "code",
591 | "execution_count": 40,
592 | "metadata": {},
593 | "outputs": [],
594 | "source": [
595 | "y_pred = pd.DataFrame(y_pred)"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 48,
601 | "metadata": {},
602 | "outputs": [
603 | {
604 | "data": {
605 | "text/html": [
606 | "\n",
607 | "\n",
620 | "
\n",
621 | " \n",
622 | " \n",
623 | " \n",
624 | " 0 \n",
625 | " \n",
626 | " \n",
627 | " \n",
628 | " \n",
629 | " 502 \n",
630 | " 0 \n",
631 | " \n",
632 | " \n",
633 | " 590 \n",
634 | " 0 \n",
635 | " \n",
636 | " \n",
637 | " 1407 \n",
638 | " 0 \n",
639 | " \n",
640 | " \n",
641 | " 371 \n",
642 | " 0 \n",
643 | " \n",
644 | " \n",
645 | " 2692 \n",
646 | " 0 \n",
647 | " \n",
648 | " \n",
649 | " ... \n",
650 | " ... \n",
651 | " \n",
652 | " \n",
653 | " 494 \n",
654 | " 0 \n",
655 | " \n",
656 | " \n",
657 | " 1135 \n",
658 | " 0 \n",
659 | " \n",
660 | " \n",
661 | " 1621 \n",
662 | " 0 \n",
663 | " \n",
664 | " \n",
665 | " 80 \n",
666 | " 0 \n",
667 | " \n",
668 | " \n",
669 | " 646 \n",
670 | " 0 \n",
671 | " \n",
672 | " \n",
673 | "
\n",
674 | "
557 rows × 1 columns
\n",
675 | "
"
676 | ],
677 | "text/plain": [
678 | " 0\n",
679 | "502 0\n",
680 | "590 0\n",
681 | "1407 0\n",
682 | "371 0\n",
683 | "2692 0\n",
684 | "... ..\n",
685 | "494 0\n",
686 | "1135 0\n",
687 | "1621 0\n",
688 | "80 0\n",
689 | "646 0\n",
690 | "\n",
691 | "[557 rows x 1 columns]"
692 | ]
693 | },
694 | "execution_count": 48,
695 | "metadata": {},
696 | "output_type": "execute_result"
697 | }
698 | ],
699 | "source": [
700 | "y_pred"
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": 47,
706 | "metadata": {},
707 | "outputs": [],
708 | "source": [
709 | "y_pred.index = y_test.index"
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": 90,
715 | "metadata": {},
716 | "outputs": [],
717 | "source": [
718 | "np.savetxt('prediction_final.csv', y_pred, delimiter=',', fmt='%d')\n",
719 | "# np.savetxt('realw.csv', y_test, delimiter=',', fmt='%d')"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": 31,
725 | "metadata": {},
726 | "outputs": [
727 | {
728 | "data": {
729 | "text/plain": [
730 | "79"
731 | ]
732 | },
733 | "execution_count": 31,
734 | "metadata": {},
735 | "output_type": "execute_result"
736 | }
737 | ],
738 | "source": [
739 | "sum(y_test)"
740 | ]
741 | },
742 | {
743 | "cell_type": "code",
744 | "execution_count": 42,
745 | "metadata": {},
746 | "outputs": [],
747 | "source": [
748 | "y_test1 = pd.DataFrame(y_test)"
749 | ]
750 | },
751 | {
752 | "cell_type": "code",
753 | "execution_count": 46,
754 | "metadata": {},
755 | "outputs": [
756 | {
757 | "data": {
758 | "text/plain": [
759 | "Index([ 502, 590, 1407, 371, 2692, 1023, 679, 963, 2331, 252,\n",
760 | " ...\n",
761 | " 1321, 266, 2235, 269, 1752, 494, 1135, 1621, 80, 646],\n",
762 | " dtype='int64', length=557)"
763 | ]
764 | },
765 | "execution_count": 46,
766 | "metadata": {},
767 | "output_type": "execute_result"
768 | }
769 | ],
770 | "source": [
771 | "y_test1.index"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": 52,
777 | "metadata": {},
778 | "outputs": [],
779 | "source": [
780 | "df_combined = pd.concat([y_pred, y_test1], axis=1)\n",
781 | "df_combined.columns=['y_pred', 'y_test1']"
782 | ]
783 | },
784 | {
785 | "cell_type": "code",
786 | "execution_count": 53,
787 | "metadata": {},
788 | "outputs": [
789 | {
790 | "data": {
791 | "text/html": [
792 | "\n",
793 | "\n",
806 | "
\n",
807 | " \n",
808 | " \n",
809 | " \n",
810 | " y_pred \n",
811 | " y_test1 \n",
812 | " \n",
813 | " \n",
814 | " \n",
815 | " \n",
816 | " 502 \n",
817 | " 0 \n",
818 | " 0 \n",
819 | " \n",
820 | " \n",
821 | " 590 \n",
822 | " 0 \n",
823 | " 0 \n",
824 | " \n",
825 | " \n",
826 | " 1407 \n",
827 | " 0 \n",
828 | " 0 \n",
829 | " \n",
830 | " \n",
831 | " 371 \n",
832 | " 0 \n",
833 | " 0 \n",
834 | " \n",
835 | " \n",
836 | " 2692 \n",
837 | " 0 \n",
838 | " 0 \n",
839 | " \n",
840 | " \n",
841 | " ... \n",
842 | " ... \n",
843 | " ... \n",
844 | " \n",
845 | " \n",
846 | " 494 \n",
847 | " 0 \n",
848 | " 0 \n",
849 | " \n",
850 | " \n",
851 | " 1135 \n",
852 | " 0 \n",
853 | " 0 \n",
854 | " \n",
855 | " \n",
856 | " 1621 \n",
857 | " 0 \n",
858 | " 0 \n",
859 | " \n",
860 | " \n",
861 | " 80 \n",
862 | " 0 \n",
863 | " 0 \n",
864 | " \n",
865 | " \n",
866 | " 646 \n",
867 | " 0 \n",
868 | " 0 \n",
869 | " \n",
870 | " \n",
871 | "
\n",
872 | "
557 rows × 2 columns
\n",
873 | "
"
874 | ],
875 | "text/plain": [
876 | " y_pred y_test1\n",
877 | "502 0 0\n",
878 | "590 0 0\n",
879 | "1407 0 0\n",
880 | "371 0 0\n",
881 | "2692 0 0\n",
882 | "... ... ...\n",
883 | "494 0 0\n",
884 | "1135 0 0\n",
885 | "1621 0 0\n",
886 | "80 0 0\n",
887 | "646 0 0\n",
888 | "\n",
889 | "[557 rows x 2 columns]"
890 | ]
891 | },
892 | "execution_count": 53,
893 | "metadata": {},
894 | "output_type": "execute_result"
895 | }
896 | ],
897 | "source": [
898 | "df_combined"
899 | ]
900 | },
901 | {
902 | "cell_type": "code",
903 | "execution_count": 55,
904 | "metadata": {},
905 | "outputs": [
906 | {
907 | "name": "stdout",
908 | "output_type": "stream",
909 | "text": [
910 | "0.43037974683544306\n"
911 | ]
912 | }
913 | ],
914 | "source": [
915 | "count = ((df_combined['y_pred'] == 1) & (df_combined['y_test1'] == 1)).sum()\n",
916 | "\n",
917 | "print(count/sum(y_test))"
918 | ]
919 | },
920 | {
921 | "cell_type": "code",
922 | "execution_count": 34,
923 | "metadata": {},
924 | "outputs": [],
925 | "source": [
926 | "# y_pred_tol = model.predict(X)"
927 | ]
928 | },
929 | {
930 | "cell_type": "code",
931 | "execution_count": 39,
932 | "metadata": {},
933 | "outputs": [],
934 | "source": [
935 | "# np.savetxt('predictions.csv', y_pred_tol, delimiter=',', fmt='%d')"
936 | ]
937 | },
938 | {
939 | "cell_type": "code",
940 | "execution_count": 32,
941 | "metadata": {},
942 | "outputs": [],
943 | "source": [
944 | "y_pred_proba = model.predict_proba(X_test)[:, 1]"
945 | ]
946 | },
947 | {
948 | "cell_type": "code",
949 | "execution_count": 33,
950 | "metadata": {},
951 | "outputs": [
952 | {
953 | "name": "stdout",
954 | "output_type": "stream",
955 | "text": [
956 | "Accuracy: 0.9066427289048474\n"
957 | ]
958 | }
959 | ],
960 | "source": [
961 | "# 计算准确率\n",
962 | "accuracy = accuracy_score(y_test, y_pred)\n",
963 | "print(\"Accuracy:\", accuracy)"
964 | ]
965 | },
966 | {
967 | "cell_type": "code",
968 | "execution_count": 34,
969 | "metadata": {},
970 | "outputs": [
971 | {
972 | "name": "stdout",
973 | "output_type": "stream",
974 | "text": [
975 | "AUC: 0.9274932471797045\n"
976 | ]
977 | }
978 | ],
979 | "source": [
980 | "# 计算AUC\n",
981 | "auc = roc_auc_score(y_test, y_pred_proba)\n",
982 | "print(\"AUC:\", auc)"
983 | ]
984 | },
985 | {
986 | "cell_type": "code",
987 | "execution_count": 37,
988 | "metadata": {},
989 | "outputs": [
990 | {
991 | "name": "stdout",
992 | "output_type": "stream",
993 | "text": [
994 | "F1 score: 0.5666666666666667\n"
995 | ]
996 | }
997 | ],
998 | "source": [
999 | "# 计算F1分数\n",
1000 | "f1 = f1_score(y_test, y_pred)\n",
1001 | "\n",
1002 | "print(\"F1 score:\", f1)"
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "code",
1007 | "execution_count": 56,
1008 | "metadata": {},
1009 | "outputs": [
1010 | {
1011 | "name": "stdout",
1012 | "output_type": "stream",
1013 | "text": [
1014 | "稀释每股收益: 0.043412793427705765\n",
1015 | "扣除非经常性损益后的加权平均净资产收益率: 0.03483826294541359\n",
1016 | "每股经营活动产生的现金流量净额: 0.042398642748594284\n",
1017 | "归属于上市公司股东的每股净资产: 0.03426690027117729\n",
1018 | "资产负债率: 0.03857659175992012\n",
1019 | "流动比率: 0.039224375039339066\n",
1020 | "速动比率: 0.03607599809765816\n",
1021 | "应收账款周转率: 0.035372745245695114\n",
1022 | "存货周转率: 0.03474723920226097\n",
1023 | "是否有私募或风投: 0.03987099230289459\n",
1024 | "当年获得专利数量: 0.09171169251203537\n",
1025 | "研发费用营业收入: 0.05831410735845566\n",
1026 | "政府补助营业收入: 0.02760440669953823\n",
1027 | "是否为科创版: 0.37066173553466797\n",
1028 | "posratio: 0.03695209324359894\n",
1029 | "negratio: 0.03597134351730347\n"
1030 | ]
1031 | }
1032 | ],
1033 | "source": [
1034 | "# 获取特征重要性\n",
1035 | "importance = model.feature_importances_\n",
1036 | "\n",
1037 | "# 打印各个特征的重要性\n",
1038 | "for feature, importance_score in zip(X.columns, importance):\n",
1039 | " print(f\"{feature}: {importance_score}\")"
1040 | ]
1041 | }
1042 | ],
1043 | "metadata": {
1044 | "kernelspec": {
1045 | "display_name": "pytorch",
1046 | "language": "python",
1047 | "name": "python3"
1048 | },
1049 | "language_info": {
1050 | "codemirror_mode": {
1051 | "name": "ipython",
1052 | "version": 3
1053 | },
1054 | "file_extension": ".py",
1055 | "mimetype": "text/x-python",
1056 | "name": "python",
1057 | "nbconvert_exporter": "python",
1058 | "pygments_lexer": "ipython3",
1059 | "version": "3.10.9"
1060 | },
1061 | "orig_nbformat": 4
1062 | },
1063 | "nbformat": 4,
1064 | "nbformat_minor": 2
1065 | }
1066 |
--------------------------------------------------------------------------------