├── README.md
├── sentiment.py
├── Logistic regression.py
├── Random Forest.py
└── XGB.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # Predict_IPO-UnderpricingRisk
2 | 通过将对上市公司招股说明书情绪分析的结果与常用财务指标、企业科研指标等结合，综合使用多种分类模型：传统LR、随机森林、XGB、LGB集成学习模型对新上市公司破发情况进行学习和预测，筛选重要特征，并由此来得到一个新股破发分类器。
3 | 


--------------------------------------------------------------------------------
/sentiment.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import pandas as pd
 4 | import numpy as np
 5 | import re
 6 | import jieba
 7 | from jieba import analyse
 8 | from collections import Counter
 9 | 
from cnsenti import Sentiment

# Folder holding the UTF-8 text files to score. The first six characters of
# each file name are used as the stock code.
file_folder = "./data/"

# Collect one record per file and build the DataFrame once at the end;
# concatenating inside the loop re-copies every earlier row on each pass.
records = []
for file in sorted(os.listdir(file_folder)):  # sorted -> reproducible row order
    file_path = os.path.join(file_folder, file)
    if not os.path.isfile(file_path):
        continue  # sub-directories cannot be read as text
    # The context manager closes the handle even if read() raises.
    with open(file_path, encoding="utf-8") as f:
        content = f.read()
    records.append({"stock": file[0:6], "filename": file, "content": content})

# Explicit columns keep the frame well-formed when the folder is empty, so
# the column selection at the bottom no longer raises KeyError.
data = pd.DataFrame(records, columns=["stock", "filename", "content"])

# Sentiment counter configured with the custom positive/negative word lists.
senti = Sentiment(pos='./dict/formal_pos.txt',
                  neg='./dict/formal_neg.txt',
                  merge=False,
                  encoding='utf-8')

# Score every document once, then add one column per key of the result dict
# (addressed by name rather than by hard-coded column position).
count_columns = ["words", "sentences", "pos", "neg"]
counts = [senti.sentiment_count(content) for content in data["content"]]
for column in count_columns:
    data[column] = [result[column] for result in counts]

# Keep only the stock code and the four counts for the modelling scripts.
simple = data[["stock", "words", "sentences", "pos", "neg"]]
simple.to_csv("./dict/simple.csv")


--------------------------------------------------------------------------------
/Logistic regression.py:
--------------------------------------------------------------------------------
 1 | # logistic
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | import statsmodels.api as sm
 6 | from sklearn.model_selection import train_test_split
 7 | from sklearn.preprocessing import StandardScaler
 8 | from imblearn.over_sampling import RandomOverSampler
 9 | from sklearn.linear_model import LogisticRegression
10 | from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
11 | from collections import Counter
12 | 
# Load the dataset: column "y" is the label, every other column is a feature.
data = pd.read_csv("D.csv")
X = data.drop(["y"], axis=1)
y = data["y"]

# Hold out 25% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Randomly oversample the training split (ratio 1 between the classes).
# random_state is fixed so the reported metrics are reproducible run to run,
# like the seeded split and model around it.
oversampler = RandomOverSampler(sampling_strategy=1, random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# Build and train the logistic regression classifier.
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_resampled, y_train_resampled)

# Predict on the untouched (not resampled) test split.
y_pred = logistic_model.predict(X_test_scaled)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)

# F1 score
f1 = f1_score(y_test, y_pred)
print("模型F1分数:", f1)

# AUC, computed from the predicted probability of the positive class.
y_pred_prob = logistic_model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print("模型AUC:", auc)

# Refit with statsmodels to obtain coefficient statistics. The scaled data is
# a bare ndarray, so it is wrapped in a DataFrame carrying the original
# feature names; otherwise the summary rows are labelled x1, x2, ...
X_train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns)
X_train_resampled = sm.add_constant(X_train_resampled)  # add the intercept column
# Give the labels the same index as the design matrix so the two line up.
y_train_resampled = pd.Series(
    np.asarray(y_train_resampled), index=X_train_resampled.index, name="y"
)
logit_model = sm.Logit(y_train_resampled, X_train_resampled)
result = logit_model.fit()

# Coefficient table (second table of the summary).
coef_summary = result.summary2().tables[1]

print(coef_summary)
61 | 


--------------------------------------------------------------------------------
/Random Forest.py:
--------------------------------------------------------------------------------
  1 | # Random forest
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import graphviz
  6 | from sklearn.preprocessing import StandardScaler
  7 | from sklearn.model_selection import train_test_split
  8 | from imblearn.under_sampling import RandomUnderSampler
  9 | from imblearn.over_sampling import RandomOverSampler
 10 | from collections import Counter
 11 | from sklearn.ensemble import RandomForestClassifier
 12 | from sklearn.metrics import accuracy_score
 13 | from sklearn.metrics import f1_score
 14 | from sklearn.metrics import roc_auc_score
 15 | from sklearn.tree import export_graphviz
 16 | 
 17 | # 预处理
 18 | data = pd.read_csv("D.csv")
 19 | X = data.drop(["y","是否为科创版"], axis=1)
 20 | y = data["y"] 
 21 | 
 22 | # 划分训练集测试集
 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
 24 | 
 25 | # 标准化
 26 | scaler = StandardScaler()
 27 | X_scaled = scaler.fit_transform(X)
 28 | 
 29 | # 显示原始类别分布
 30 | print("原始类别分布:", Counter(y_train))
 31 | 
 32 | # 过采样
 33 | oversampler = RandomOverSampler(sampling_strategy=1)
 34 | X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
 35 | print("过采样后的类别分布:", Counter(y_train_resampled))
 36 | 
 37 | # 随机森林模型
 38 | random_forest_model = RandomForestClassifier(n_estimators=100, random_state=50)
 39 | 
 40 | # 训练模型
 41 | random_forest_model.fit(X_train_resampled, y_train_resampled)
 42 | 
 43 | # 预测测试集
 44 | y_pred = random_forest_model.predict(X_test)
 45 | 
 46 | # 准确率
 47 | accuracy = accuracy_score(y_test, y_pred)
 48 | print("模型准确率:", accuracy)
 49 | 
 50 | # 测试集中 y=1 的预测准确率
 51 | y_test_positive = y_test[y_test == 1]
 52 | y_pred_positive = y_pred[y_test == 1]
 53 | accuracy_positive = accuracy_score(y_test_positive, y_pred_positive)
 54 | print("测试集中 y=1 的预测准确率:", accuracy_positive)
 55 | 
 56 | y_test_negative = y_test[y_test == 0]
 57 | y_pred_negative = y_pred[y_test == 0]
 58 | error_rate_negative = 1 - accuracy_score(y_test_negative, y_pred_negative)
 59 | print("测试集中 y=0 的预测错误率:", error_rate_negative)
 60 | 
 61 | # F1 分数
 62 | f1 = f1_score(y_test, y_pred)
 63 | print("模型F1分数:", f1)
 64 | 
 65 | # 重要性
 66 | feature_importance = random_forest_model.feature_importances_
 67 | 
 68 | # 创建特征重要性的DataFrame
 69 | feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
 70 | feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
 71 | 
 72 | print(feature_importance_df)
 73 | 
 74 | # AUC
 75 | y_pred_prob = random_forest_model.predict_proba(X_test)[:, 1]  # 获取正类的预测概率
 76 | auc = roc_auc_score(y_test, y_pred_prob)
 77 | print("模型AUC:", auc)
 78 | 
 79 | # 获取随机森林中的第一棵树
 80 | first_tree = random_forest_model.estimators_[0]
 81 | 
 82 | # 导出树的可视化表示
 83 | dot_data = export_graphviz(first_tree, out_file=None, 
 84 |                            feature_names=X.columns, 
 85 |                            class_names=["0", "1"],  # 根据你的类别标签调整
 86 |                            filled=True, rounded=True, special_characters=True)
 87 | 
 88 | # 使用Graphviz绘制树状图
 89 | graph = graphviz.Source(dot_data)
 90 | graph.render("random_forest_tree")  # 可选：将树状图保存为文件
 91 | graph.view()  # 在默认浏览器中显示树状图
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | # 模型提升曲线(未完成，直接比较模型优劣)
 98 | 
 99 | import numpy as np
100 | import pandas as pd
101 | import matplotlib.pyplot as plt
102 | from sklearn.model_selection import train_test_split
103 | from sklearn.ensemble import RandomForestClassifier
104 | from sklearn.preprocessing import StandardScaler
105 | from sklearn.metrics import accuracy_score
106 | 
# Preprocessing: "y" is the label, every other column is a feature.
data = pd.read_csv("D.csv")
X = data.drop("y", axis=1)
y = data["y"]

# Split BEFORE scaling so the scaler only ever sees training rows. Fitting it
# on the full dataset would leak test-set statistics into preprocessing. The
# same random_state over the same number of rows gives the same row partition
# as splitting the pre-scaled matrix did.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Forest sizes to evaluate and the test accuracy measured for each.
n_estimators_values = [45,90,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
accuracy_scores = []

# Train one forest per size and record its accuracy on the test split.
for n_estimators in n_estimators_values:
    random_forest_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    random_forest_model.fit(X_train, y_train)
    y_pred = random_forest_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Plot test accuracy against the number of trees.
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_values, accuracy_scores, marker='o')
plt.title("Random Forest Model Boosting Curve")
plt.xlabel("Number of Estimators")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()
134 | 


--------------------------------------------------------------------------------
/XGB.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 5,
   6 |    "metadata": {},
   7 |    "outputs": [],
   8 |    "source": [
   9 |     "import pandas as pd\n",
  10 |     "import numpy as np\n",
  11 |     "import xgboost as xgb\n",
  12 |     "from sklearn.model_selection import train_test_split\n",
  13 |     "from sklearn.metrics import accuracy_score\n",
  14 |     "from sklearn.metrics import roc_auc_score"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "code",
  19 |    "execution_count": 6,
  20 |    "metadata": {},
  21 |    "outputs": [],
  22 |    "source": [
  23 |     "from imblearn.over_sampling import RandomOverSampler"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "code",
  28 |    "execution_count": 36,
  29 |    "metadata": {},
  30 |    "outputs": [],
  31 |    "source": [
  32 |     "from sklearn.metrics import f1_score"
  33 |    ]
  34 |   },
  35 |   {
  36 |    "cell_type": "code",
  37 |    "execution_count": 20,
  38 |    "metadata": {},
  39 |    "outputs": [],
  40 |    "source": [
  41 |     "# 从CSV文件中加载数据集\n",
  42 |     "data = pd.read_csv('new_data.csv')"
  43 |    ]
  44 |   },
  45 |   {
  46 |    "cell_type": "code",
  47 |    "execution_count": 21,
  48 |    "metadata": {},
  49 |    "outputs": [
  50 |     {
  51 |      "data": {
  52 |       "text/html": [
  53 |        "<div>\n",
  54 |        "<style scoped>\n",
  55 |        "    .dataframe tbody tr th:only-of-type {\n",
  56 |        "        vertical-align: middle;\n",
  57 |        "    }\n",
  58 |        "\n",
  59 |        "    .dataframe tbody tr th {\n",
  60 |        "        vertical-align: top;\n",
  61 |        "    }\n",
  62 |        "\n",
  63 |        "    .dataframe thead th {\n",
  64 |        "        text-align: right;\n",
  65 |        "    }\n",
  66 |        "</style>\n",
  67 |        "<table border=\"1\" class=\"dataframe\">\n",
  68 |        "  <thead>\n",
  69 |        "    <tr style=\"text-align: right;\">\n",
  70 |        "      <th></th>\n",
  71 |        "      <th>稀释每股收益</th>\n",
  72 |        "      <th>扣除非经常性损益后的加权平均净资产收益率</th>\n",
  73 |        "      <th>每股经营活动产生的现金流量净额</th>\n",
  74 |        "      <th>归属于上市公司股东的每股净资产</th>\n",
  75 |        "      <th>资产负债率</th>\n",
  76 |        "      <th>流动比率</th>\n",
  77 |        "      <th>速动比率</th>\n",
  78 |        "      <th>应收账款周转率</th>\n",
  79 |        "      <th>存货周转率</th>\n",
  80 |        "      <th>是否有私募或风投</th>\n",
  81 |        "      <th>当年获得专利数量</th>\n",
  82 |        "      <th>研发费用营业收入</th>\n",
  83 |        "      <th>政府补助营业收入</th>\n",
  84 |        "      <th>是否为科创版</th>\n",
  85 |        "      <th>posratio</th>\n",
  86 |        "      <th>negratio</th>\n",
  87 |        "      <th>首日是否破发</th>\n",
  88 |        "    </tr>\n",
  89 |        "  </thead>\n",
  90 |        "  <tbody>\n",
  91 |        "    <tr>\n",
  92 |        "      <th>0</th>\n",
  93 |        "      <td>0.08</td>\n",
  94 |        "      <td>1.330000</td>\n",
  95 |        "      <td>0.10</td>\n",
  96 |        "      <td>3.44</td>\n",
  97 |        "      <td>47.349998</td>\n",
  98 |        "      <td>0.94</td>\n",
  99 |        "      <td>0.40</td>\n",
 100 |        "      <td>45.860001</td>\n",
 101 |        "      <td>5.230000</td>\n",
 102 |        "      <td>0.0</td>\n",
 103 |        "      <td>2</td>\n",
 104 |        "      <td>0.006433</td>\n",
 105 |        "      <td>0.010428</td>\n",
 106 |        "      <td>0</td>\n",
 107 |        "      <td>0.027111</td>\n",
 108 |        "      <td>0.006456</td>\n",
 109 |        "      <td>0</td>\n",
 110 |        "    </tr>\n",
 111 |        "    <tr>\n",
 112 |        "      <th>1</th>\n",
 113 |        "      <td>0.80</td>\n",
 114 |        "      <td>48.110001</td>\n",
 115 |        "      <td>0.77</td>\n",
 116 |        "      <td>NaN</td>\n",
 117 |        "      <td>NaN</td>\n",
 118 |        "      <td>1.11</td>\n",
 119 |        "      <td>0.72</td>\n",
 120 |        "      <td>22.790001</td>\n",
 121 |        "      <td>19.690001</td>\n",
 122 |        "      <td>1.0</td>\n",
 123 |        "      <td>0</td>\n",
 124 |        "      <td>0.000000</td>\n",
 125 |        "      <td>0.000000</td>\n",
 126 |        "      <td>0</td>\n",
 127 |        "      <td>0.036460</td>\n",
 128 |        "      <td>0.005590</td>\n",
 129 |        "      <td>0</td>\n",
 130 |        "    </tr>\n",
 131 |        "    <tr>\n",
 132 |        "      <th>2</th>\n",
 133 |        "      <td>0.33</td>\n",
 134 |        "      <td>27.639999</td>\n",
 135 |        "      <td>0.13</td>\n",
 136 |        "      <td>1.28</td>\n",
 137 |        "      <td>77.320000</td>\n",
 138 |        "      <td>0.21</td>\n",
 139 |        "      <td>0.17</td>\n",
 140 |        "      <td>22.940001</td>\n",
 141 |        "      <td>7.990000</td>\n",
 142 |        "      <td>1.0</td>\n",
 143 |        "      <td>5</td>\n",
 144 |        "      <td>0.024233</td>\n",
 145 |        "      <td>0.000623</td>\n",
 146 |        "      <td>0</td>\n",
 147 |        "      <td>0.019627</td>\n",
 148 |        "      <td>0.006191</td>\n",
 149 |        "      <td>0</td>\n",
 150 |        "    </tr>\n",
 151 |        "    <tr>\n",
 152 |        "      <th>3</th>\n",
 153 |        "      <td>0.72</td>\n",
 154 |        "      <td>11.460000</td>\n",
 155 |        "      <td>1.11</td>\n",
 156 |        "      <td>6.94</td>\n",
 157 |        "      <td>34.750000</td>\n",
 158 |        "      <td>0.93</td>\n",
 159 |        "      <td>0.80</td>\n",
 160 |        "      <td>7.660000</td>\n",
 161 |        "      <td>19.110001</td>\n",
 162 |        "      <td>1.0</td>\n",
 163 |        "      <td>8</td>\n",
 164 |        "      <td>0.036983</td>\n",
 165 |        "      <td>0.000131</td>\n",
 166 |        "      <td>0</td>\n",
 167 |        "      <td>0.029616</td>\n",
 168 |        "      <td>0.007126</td>\n",
 169 |        "      <td>0</td>\n",
 170 |        "    </tr>\n",
 171 |        "    <tr>\n",
 172 |        "      <th>4</th>\n",
 173 |        "      <td>1.73</td>\n",
 174 |        "      <td>30.280001</td>\n",
 175 |        "      <td>1.76</td>\n",
 176 |        "      <td>7.32</td>\n",
 177 |        "      <td>24.420000</td>\n",
 178 |        "      <td>2.89</td>\n",
 179 |        "      <td>2.32</td>\n",
 180 |        "      <td>6.470000</td>\n",
 181 |        "      <td>9.860000</td>\n",
 182 |        "      <td>1.0</td>\n",
 183 |        "      <td>0</td>\n",
 184 |        "      <td>0.000000</td>\n",
 185 |        "      <td>0.000000</td>\n",
 186 |        "      <td>0</td>\n",
 187 |        "      <td>0.027892</td>\n",
 188 |        "      <td>0.004673</td>\n",
 189 |        "      <td>0</td>\n",
 190 |        "    </tr>\n",
 191 |        "  </tbody>\n",
 192 |        "</table>\n",
 193 |        "</div>"
 194 |       ],
 195 |       "text/plain": [
 196 |        "   稀释每股收益  扣除非经常性损益后的加权平均净资产收益率  每股经营活动产生的现金流量净额  归属于上市公司股东的每股净资产      资产负债率   \n",
 197 |        "0    0.08              1.330000             0.10             3.44  47.349998  \\\n",
 198 |        "1    0.80             48.110001             0.77              NaN        NaN   \n",
 199 |        "2    0.33             27.639999             0.13             1.28  77.320000   \n",
 200 |        "3    0.72             11.460000             1.11             6.94  34.750000   \n",
 201 |        "4    1.73             30.280001             1.76             7.32  24.420000   \n",
 202 |        "\n",
 203 |        "   流动比率  速动比率    应收账款周转率      存货周转率  是否有私募或风投  当年获得专利数量  研发费用营业收入  政府补助营业收入   \n",
 204 |        "0  0.94  0.40  45.860001   5.230000       0.0         2  0.006433  0.010428  \\\n",
 205 |        "1  1.11  0.72  22.790001  19.690001       1.0         0  0.000000  0.000000   \n",
 206 |        "2  0.21  0.17  22.940001   7.990000       1.0         5  0.024233  0.000623   \n",
 207 |        "3  0.93  0.80   7.660000  19.110001       1.0         8  0.036983  0.000131   \n",
 208 |        "4  2.89  2.32   6.470000   9.860000       1.0         0  0.000000  0.000000   \n",
 209 |        "\n",
 210 |        "   是否为科创版  posratio  negratio  首日是否破发  \n",
 211 |        "0       0  0.027111  0.006456       0  \n",
 212 |        "1       0  0.036460  0.005590       0  \n",
 213 |        "2       0  0.019627  0.006191       0  \n",
 214 |        "3       0  0.029616  0.007126       0  \n",
 215 |        "4       0  0.027892  0.004673       0  "
 216 |       ]
 217 |      },
 218 |      "execution_count": 21,
 219 |      "metadata": {},
 220 |      "output_type": "execute_result"
 221 |     }
 222 |    ],
 223 |    "source": [
 224 |     "data.head()"
 225 |    ]
 226 |   },
 227 |   {
 228 |    "cell_type": "code",
 229 |    "execution_count": 22,
 230 |    "metadata": {},
 231 |    "outputs": [],
 232 |    "source": [
 233 |     "# 将特征列和目标列分开\n",
 234 |     "X = data.drop('首日是否破发', axis=1)\n",
 235 |     "y = data['首日是否破发']"
 236 |    ]
 237 |   },
 238 |   {
 239 |    "cell_type": "code",
 240 |    "execution_count": 23,
 241 |    "metadata": {},
 242 |    "outputs": [
 243 |     {
 244 |      "data": {
 245 |       "text/html": [
 246 |        "<div>\n",
 247 |        "<style scoped>\n",
 248 |        "    .dataframe tbody tr th:only-of-type {\n",
 249 |        "        vertical-align: middle;\n",
 250 |        "    }\n",
 251 |        "\n",
 252 |        "    .dataframe tbody tr th {\n",
 253 |        "        vertical-align: top;\n",
 254 |        "    }\n",
 255 |        "\n",
 256 |        "    .dataframe thead th {\n",
 257 |        "        text-align: right;\n",
 258 |        "    }\n",
 259 |        "</style>\n",
 260 |        "<table border=\"1\" class=\"dataframe\">\n",
 261 |        "  <thead>\n",
 262 |        "    <tr style=\"text-align: right;\">\n",
 263 |        "      <th></th>\n",
 264 |        "      <th>稀释每股收益</th>\n",
 265 |        "      <th>扣除非经常性损益后的加权平均净资产收益率</th>\n",
 266 |        "      <th>每股经营活动产生的现金流量净额</th>\n",
 267 |        "      <th>归属于上市公司股东的每股净资产</th>\n",
 268 |        "      <th>资产负债率</th>\n",
 269 |        "      <th>流动比率</th>\n",
 270 |        "      <th>速动比率</th>\n",
 271 |        "      <th>应收账款周转率</th>\n",
 272 |        "      <th>存货周转率</th>\n",
 273 |        "      <th>是否有私募或风投</th>\n",
 274 |        "      <th>当年获得专利数量</th>\n",
 275 |        "      <th>研发费用营业收入</th>\n",
 276 |        "      <th>政府补助营业收入</th>\n",
 277 |        "      <th>是否为科创版</th>\n",
 278 |        "      <th>posratio</th>\n",
 279 |        "      <th>negratio</th>\n",
 280 |        "    </tr>\n",
 281 |        "  </thead>\n",
 282 |        "  <tbody>\n",
 283 |        "    <tr>\n",
 284 |        "      <th>0</th>\n",
 285 |        "      <td>0.08</td>\n",
 286 |        "      <td>1.330000</td>\n",
 287 |        "      <td>0.10</td>\n",
 288 |        "      <td>3.44</td>\n",
 289 |        "      <td>47.349998</td>\n",
 290 |        "      <td>0.94</td>\n",
 291 |        "      <td>0.40</td>\n",
 292 |        "      <td>45.860001</td>\n",
 293 |        "      <td>5.230000</td>\n",
 294 |        "      <td>0.0</td>\n",
 295 |        "      <td>2</td>\n",
 296 |        "      <td>0.006433</td>\n",
 297 |        "      <td>0.010428</td>\n",
 298 |        "      <td>0</td>\n",
 299 |        "      <td>0.027111</td>\n",
 300 |        "      <td>0.006456</td>\n",
 301 |        "    </tr>\n",
 302 |        "    <tr>\n",
 303 |        "      <th>1</th>\n",
 304 |        "      <td>0.80</td>\n",
 305 |        "      <td>48.110001</td>\n",
 306 |        "      <td>0.77</td>\n",
 307 |        "      <td>NaN</td>\n",
 308 |        "      <td>NaN</td>\n",
 309 |        "      <td>1.11</td>\n",
 310 |        "      <td>0.72</td>\n",
 311 |        "      <td>22.790001</td>\n",
 312 |        "      <td>19.690001</td>\n",
 313 |        "      <td>1.0</td>\n",
 314 |        "      <td>0</td>\n",
 315 |        "      <td>0.000000</td>\n",
 316 |        "      <td>0.000000</td>\n",
 317 |        "      <td>0</td>\n",
 318 |        "      <td>0.036460</td>\n",
 319 |        "      <td>0.005590</td>\n",
 320 |        "    </tr>\n",
 321 |        "    <tr>\n",
 322 |        "      <th>2</th>\n",
 323 |        "      <td>0.33</td>\n",
 324 |        "      <td>27.639999</td>\n",
 325 |        "      <td>0.13</td>\n",
 326 |        "      <td>1.28</td>\n",
 327 |        "      <td>77.320000</td>\n",
 328 |        "      <td>0.21</td>\n",
 329 |        "      <td>0.17</td>\n",
 330 |        "      <td>22.940001</td>\n",
 331 |        "      <td>7.990000</td>\n",
 332 |        "      <td>1.0</td>\n",
 333 |        "      <td>5</td>\n",
 334 |        "      <td>0.024233</td>\n",
 335 |        "      <td>0.000623</td>\n",
 336 |        "      <td>0</td>\n",
 337 |        "      <td>0.019627</td>\n",
 338 |        "      <td>0.006191</td>\n",
 339 |        "    </tr>\n",
 340 |        "    <tr>\n",
 341 |        "      <th>3</th>\n",
 342 |        "      <td>0.72</td>\n",
 343 |        "      <td>11.460000</td>\n",
 344 |        "      <td>1.11</td>\n",
 345 |        "      <td>6.94</td>\n",
 346 |        "      <td>34.750000</td>\n",
 347 |        "      <td>0.93</td>\n",
 348 |        "      <td>0.80</td>\n",
 349 |        "      <td>7.660000</td>\n",
 350 |        "      <td>19.110001</td>\n",
 351 |        "      <td>1.0</td>\n",
 352 |        "      <td>8</td>\n",
 353 |        "      <td>0.036983</td>\n",
 354 |        "      <td>0.000131</td>\n",
 355 |        "      <td>0</td>\n",
 356 |        "      <td>0.029616</td>\n",
 357 |        "      <td>0.007126</td>\n",
 358 |        "    </tr>\n",
 359 |        "    <tr>\n",
 360 |        "      <th>4</th>\n",
 361 |        "      <td>1.73</td>\n",
 362 |        "      <td>30.280001</td>\n",
 363 |        "      <td>1.76</td>\n",
 364 |        "      <td>7.32</td>\n",
 365 |        "      <td>24.420000</td>\n",
 366 |        "      <td>2.89</td>\n",
 367 |        "      <td>2.32</td>\n",
 368 |        "      <td>6.470000</td>\n",
 369 |        "      <td>9.860000</td>\n",
 370 |        "      <td>1.0</td>\n",
 371 |        "      <td>0</td>\n",
 372 |        "      <td>0.000000</td>\n",
 373 |        "      <td>0.000000</td>\n",
 374 |        "      <td>0</td>\n",
 375 |        "      <td>0.027892</td>\n",
 376 |        "      <td>0.004673</td>\n",
 377 |        "    </tr>\n",
 378 |        "  </tbody>\n",
 379 |        "</table>\n",
 380 |        "</div>"
 381 |       ],
 382 |       "text/plain": [
 383 |        "   稀释每股收益  扣除非经常性损益后的加权平均净资产收益率  每股经营活动产生的现金流量净额  归属于上市公司股东的每股净资产      资产负债率   \n",
 384 |        "0    0.08              1.330000             0.10             3.44  47.349998  \\\n",
 385 |        "1    0.80             48.110001             0.77              NaN        NaN   \n",
 386 |        "2    0.33             27.639999             0.13             1.28  77.320000   \n",
 387 |        "3    0.72             11.460000             1.11             6.94  34.750000   \n",
 388 |        "4    1.73             30.280001             1.76             7.32  24.420000   \n",
 389 |        "\n",
 390 |        "   流动比率  速动比率    应收账款周转率      存货周转率  是否有私募或风投  当年获得专利数量  研发费用营业收入  政府补助营业收入   \n",
 391 |        "0  0.94  0.40  45.860001   5.230000       0.0         2  0.006433  0.010428  \\\n",
 392 |        "1  1.11  0.72  22.790001  19.690001       1.0         0  0.000000  0.000000   \n",
 393 |        "2  0.21  0.17  22.940001   7.990000       1.0         5  0.024233  0.000623   \n",
 394 |        "3  0.93  0.80   7.660000  19.110001       1.0         8  0.036983  0.000131   \n",
 395 |        "4  2.89  2.32   6.470000   9.860000       1.0         0  0.000000  0.000000   \n",
 396 |        "\n",
 397 |        "   是否为科创版  posratio  negratio  \n",
 398 |        "0       0  0.027111  0.006456  \n",
 399 |        "1       0  0.036460  0.005590  \n",
 400 |        "2       0  0.019627  0.006191  \n",
 401 |        "3       0  0.029616  0.007126  \n",
 402 |        "4       0  0.027892  0.004673  "
 403 |       ]
 404 |      },
 405 |      "execution_count": 23,
 406 |      "metadata": {},
 407 |      "output_type": "execute_result"
 408 |     }
 409 |    ],
 410 |    "source": [
 411 |     "X.head()"
 412 |    ]
 413 |   },
 414 |   {
 415 |    "cell_type": "code",
 416 |    "execution_count": 24,
 417 |    "metadata": {},
 418 |    "outputs": [
 419 |     {
 420 |      "data": {
 421 |       "text/plain": [
 422 |        "0    0\n",
 423 |        "1    0\n",
 424 |        "2    0\n",
 425 |        "3    0\n",
 426 |        "4    0\n",
 427 |        "Name: 首日是否破发, dtype: int64"
 428 |       ]
 429 |      },
 430 |      "execution_count": 24,
 431 |      "metadata": {},
 432 |      "output_type": "execute_result"
 433 |     }
 434 |    ],
 435 |    "source": [
 436 |     "y.head()"
 437 |    ]
 438 |   },
 439 |   {
 440 |    "cell_type": "code",
 441 |    "execution_count": 25,
 442 |    "metadata": {},
 443 |    "outputs": [],
 444 |    "source": [
 445 |     "# 划分训练集和测试集\n",
 446 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)"
 447 |    ]
 448 |   },
 449 |   {
 450 |    "cell_type": "code",
 451 |    "execution_count": 26,
 452 |    "metadata": {},
 453 |    "outputs": [
 454 |     {
 455 |      "data": {
 456 |       "text/plain": [
 457 |        "2177    0\n",
 458 |        "695     0\n",
 459 |        "409     0\n",
 460 |        "2624    1\n",
 461 |        "1686    0\n",
 462 |        "       ..\n",
 463 |        "1863    0\n",
 464 |        "1330    0\n",
 465 |        "2213    0\n",
 466 |        "2055    0\n",
 467 |        "2267    1\n",
 468 |        "Name: 首日是否破发, Length: 2225, dtype: int64"
 469 |       ]
 470 |      },
 471 |      "execution_count": 26,
 472 |      "metadata": {},
 473 |      "output_type": "execute_result"
 474 |     }
 475 |    ],
 476 |    "source": [
 477 |     "y_train"
 478 |    ]
 479 |   },
 480 |   {
 481 |    "cell_type": "code",
 482 |    "execution_count": 65,
 483 |    "metadata": {},
 484 |    "outputs": [],
 485 |    "source": [
 486 |     "# # 创建RandomOverSampler对象\n",
 487 |     "# oversampler = RandomOverSampler(random_state=42)\n",
 488 |     "\n",
 489 |     "# # 对训练集进行过采样\n",
 490 |     "# X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)"
 491 |    ]
 492 |   },
 493 |   {
 494 |    "cell_type": "code",
 495 |    "execution_count": 27,
 496 |    "metadata": {},
 497 |    "outputs": [],
 498 |    "source": [
 499 |     "# 创建XGBoost回归模型\n",
 500 |     "# model = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=6.34, random_state=40)\n",
 501 |     "model = xgb.XGBClassifier(objective='binary:logistic', random_state=40)"
 502 |    ]
 503 |   },
 504 |   {
 505 |    "cell_type": "code",
 506 |    "execution_count": 28,
 507 |    "metadata": {},
 508 |    "outputs": [
 509 |     {
 510 |      "data": {
 511 |       "text/html": [
 512 |        "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
 513 |        "              colsample_bylevel=None, colsample_bynode=None,\n",
 514 |        "              colsample_bytree=None, early_stopping_rounds=None,\n",
 515 |        "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
 516 |        "              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
 517 |        "              interaction_constraints=None, learning_rate=None, max_bin=None,\n",
 518 |        "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
 519 |        "              max_delta_step=None, max_depth=None, max_leaves=None,\n",
 520 |        "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
 521 |        "              n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
 522 |        "              predictor=None, random_state=40, ...)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">XGBClassifier</label><div class=\"sk-toggleable__content\"><pre>XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
 523 |        "              colsample_bylevel=None, colsample_bynode=None,\n",
 524 |        "              colsample_bytree=None, early_stopping_rounds=None,\n",
 525 |        "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
 526 |        "              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
 527 |        "              interaction_constraints=None, learning_rate=None, max_bin=None,\n",
 528 |        "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
 529 |        "              max_delta_step=None, max_depth=None, max_leaves=None,\n",
 530 |        "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
 531 |        "              n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
 532 |        "              predictor=None, random_state=40, ...)</pre></div></div></div></div></div>"
 533 |       ],
 534 |       "text/plain": [
 535 |        "XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
 536 |        "              colsample_bylevel=None, colsample_bynode=None,\n",
 537 |        "              colsample_bytree=None, early_stopping_rounds=None,\n",
 538 |        "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
 539 |        "              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
 540 |        "              interaction_constraints=None, learning_rate=None, max_bin=None,\n",
 541 |        "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
 542 |        "              max_delta_step=None, max_depth=None, max_leaves=None,\n",
 543 |        "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
 544 |        "              n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
 545 |        "              predictor=None, random_state=40, ...)"
 546 |       ]
 547 |      },
 548 |      "execution_count": 28,
 549 |      "metadata": {},
 550 |      "output_type": "execute_result"
 551 |     }
 552 |    ],
 553 |    "source": [
 554 |     "# Train the model\n",
 555 |     "# model.fit(X_train, y_train)\n",
 556 |     "model.fit(X_train, y_train)"
 557 |    ]
 558 |   },
 559 |   {
 560 |    "cell_type": "code",
 561 |    "execution_count": 29,
 562 |    "metadata": {},
 563 |    "outputs": [],
 564 |    "source": [
 565 |     "# Predict labels for the test set\n",
 566 |     "y_pred = model.predict(X_test)"
 567 |    ]
 568 |   },
 569 |   {
 570 |    "cell_type": "code",
 571 |    "execution_count": 30,
 572 |    "metadata": {},
 573 |    "outputs": [
 574 |     {
 575 |      "data": {
 576 |       "text/plain": [
 577 |        "41"
 578 |       ]
 579 |      },
 580 |      "execution_count": 30,
 581 |      "metadata": {},
 582 |      "output_type": "execute_result"
 583 |     }
 584 |    ],
 585 |    "source": [
 586 |     "sum(y_pred)"
 587 |    ]
 588 |   },
 589 |   {
 590 |    "cell_type": "code",
 591 |    "execution_count": 40,
 592 |    "metadata": {},
 593 |    "outputs": [],
 594 |    "source": [
 595 |     "y_pred = pd.DataFrame(y_pred)"
 596 |    ]
 597 |   },
 598 |   {
 599 |    "cell_type": "code",
 600 |    "execution_count": 48,
 601 |    "metadata": {},
 602 |    "outputs": [
 603 |     {
 604 |      "data": {
 605 |       "text/html": [
 606 |        "<div>\n",
 607 |        "<style scoped>\n",
 608 |        "    .dataframe tbody tr th:only-of-type {\n",
 609 |        "        vertical-align: middle;\n",
 610 |        "    }\n",
 611 |        "\n",
 612 |        "    .dataframe tbody tr th {\n",
 613 |        "        vertical-align: top;\n",
 614 |        "    }\n",
 615 |        "\n",
 616 |        "    .dataframe thead th {\n",
 617 |        "        text-align: right;\n",
 618 |        "    }\n",
 619 |        "</style>\n",
 620 |        "<table border=\"1\" class=\"dataframe\">\n",
 621 |        "  <thead>\n",
 622 |        "    <tr style=\"text-align: right;\">\n",
 623 |        "      <th></th>\n",
 624 |        "      <th>0</th>\n",
 625 |        "    </tr>\n",
 626 |        "  </thead>\n",
 627 |        "  <tbody>\n",
 628 |        "    <tr>\n",
 629 |        "      <th>502</th>\n",
 630 |        "      <td>0</td>\n",
 631 |        "    </tr>\n",
 632 |        "    <tr>\n",
 633 |        "      <th>590</th>\n",
 634 |        "      <td>0</td>\n",
 635 |        "    </tr>\n",
 636 |        "    <tr>\n",
 637 |        "      <th>1407</th>\n",
 638 |        "      <td>0</td>\n",
 639 |        "    </tr>\n",
 640 |        "    <tr>\n",
 641 |        "      <th>371</th>\n",
 642 |        "      <td>0</td>\n",
 643 |        "    </tr>\n",
 644 |        "    <tr>\n",
 645 |        "      <th>2692</th>\n",
 646 |        "      <td>0</td>\n",
 647 |        "    </tr>\n",
 648 |        "    <tr>\n",
 649 |        "      <th>...</th>\n",
 650 |        "      <td>...</td>\n",
 651 |        "    </tr>\n",
 652 |        "    <tr>\n",
 653 |        "      <th>494</th>\n",
 654 |        "      <td>0</td>\n",
 655 |        "    </tr>\n",
 656 |        "    <tr>\n",
 657 |        "      <th>1135</th>\n",
 658 |        "      <td>0</td>\n",
 659 |        "    </tr>\n",
 660 |        "    <tr>\n",
 661 |        "      <th>1621</th>\n",
 662 |        "      <td>0</td>\n",
 663 |        "    </tr>\n",
 664 |        "    <tr>\n",
 665 |        "      <th>80</th>\n",
 666 |        "      <td>0</td>\n",
 667 |        "    </tr>\n",
 668 |        "    <tr>\n",
 669 |        "      <th>646</th>\n",
 670 |        "      <td>0</td>\n",
 671 |        "    </tr>\n",
 672 |        "  </tbody>\n",
 673 |        "</table>\n",
 674 |        "<p>557 rows × 1 columns</p>\n",
 675 |        "</div>"
 676 |       ],
 677 |       "text/plain": [
 678 |        "      0\n",
 679 |        "502   0\n",
 680 |        "590   0\n",
 681 |        "1407  0\n",
 682 |        "371   0\n",
 683 |        "2692  0\n",
 684 |        "...  ..\n",
 685 |        "494   0\n",
 686 |        "1135  0\n",
 687 |        "1621  0\n",
 688 |        "80    0\n",
 689 |        "646   0\n",
 690 |        "\n",
 691 |        "[557 rows x 1 columns]"
 692 |       ]
 693 |      },
 694 |      "execution_count": 48,
 695 |      "metadata": {},
 696 |      "output_type": "execute_result"
 697 |     }
 698 |    ],
 699 |    "source": [
 700 |     "y_pred"
 701 |    ]
 702 |   },
 703 |   {
 704 |    "cell_type": "code",
 705 |    "execution_count": 47,
 706 |    "metadata": {},
 707 |    "outputs": [],
 708 |    "source": [
 709 |     "y_pred.index = y_test.index"
 710 |    ]
 711 |   },
 712 |   {
 713 |    "cell_type": "code",
 714 |    "execution_count": 90,
 715 |    "metadata": {},
 716 |    "outputs": [],
 717 |    "source": [
 718 |     "np.savetxt('prediction_final.csv', y_pred, delimiter=',', fmt='%d')\n",
 719 |     "# np.savetxt('realw.csv', y_test, delimiter=',', fmt='%d')"
 720 |    ]
 721 |   },
 722 |   {
 723 |    "cell_type": "code",
 724 |    "execution_count": 31,
 725 |    "metadata": {},
 726 |    "outputs": [
 727 |     {
 728 |      "data": {
 729 |       "text/plain": [
 730 |        "79"
 731 |       ]
 732 |      },
 733 |      "execution_count": 31,
 734 |      "metadata": {},
 735 |      "output_type": "execute_result"
 736 |     }
 737 |    ],
 738 |    "source": [
 739 |     "sum(y_test)"
 740 |    ]
 741 |   },
 742 |   {
 743 |    "cell_type": "code",
 744 |    "execution_count": 42,
 745 |    "metadata": {},
 746 |    "outputs": [],
 747 |    "source": [
 748 |     "y_test1 = pd.DataFrame(y_test)"
 749 |    ]
 750 |   },
 751 |   {
 752 |    "cell_type": "code",
 753 |    "execution_count": 46,
 754 |    "metadata": {},
 755 |    "outputs": [
 756 |     {
 757 |      "data": {
 758 |       "text/plain": [
 759 |        "Index([ 502,  590, 1407,  371, 2692, 1023,  679,  963, 2331,  252,\n",
 760 |        "       ...\n",
 761 |        "       1321,  266, 2235,  269, 1752,  494, 1135, 1621,   80,  646],\n",
 762 |        "      dtype='int64', length=557)"
 763 |       ]
 764 |      },
 765 |      "execution_count": 46,
 766 |      "metadata": {},
 767 |      "output_type": "execute_result"
 768 |     }
 769 |    ],
 770 |    "source": [
 771 |     "y_test1.index"
 772 |    ]
 773 |   },
 774 |   {
 775 |    "cell_type": "code",
 776 |    "execution_count": 52,
 777 |    "metadata": {},
 778 |    "outputs": [],
 779 |    "source": [
 780 |     "df_combined = pd.concat([y_pred, y_test1], axis=1)\n",
 781 |     "df_combined.columns=['y_pred', 'y_test1']"
 782 |    ]
 783 |   },
 784 |   {
 785 |    "cell_type": "code",
 786 |    "execution_count": 53,
 787 |    "metadata": {},
 788 |    "outputs": [
 789 |     {
 790 |      "data": {
 791 |       "text/html": [
 792 |        "<div>\n",
 793 |        "<style scoped>\n",
 794 |        "    .dataframe tbody tr th:only-of-type {\n",
 795 |        "        vertical-align: middle;\n",
 796 |        "    }\n",
 797 |        "\n",
 798 |        "    .dataframe tbody tr th {\n",
 799 |        "        vertical-align: top;\n",
 800 |        "    }\n",
 801 |        "\n",
 802 |        "    .dataframe thead th {\n",
 803 |        "        text-align: right;\n",
 804 |        "    }\n",
 805 |        "</style>\n",
 806 |        "<table border=\"1\" class=\"dataframe\">\n",
 807 |        "  <thead>\n",
 808 |        "    <tr style=\"text-align: right;\">\n",
 809 |        "      <th></th>\n",
 810 |        "      <th>y_pred</th>\n",
 811 |        "      <th>y_test1</th>\n",
 812 |        "    </tr>\n",
 813 |        "  </thead>\n",
 814 |        "  <tbody>\n",
 815 |        "    <tr>\n",
 816 |        "      <th>502</th>\n",
 817 |        "      <td>0</td>\n",
 818 |        "      <td>0</td>\n",
 819 |        "    </tr>\n",
 820 |        "    <tr>\n",
 821 |        "      <th>590</th>\n",
 822 |        "      <td>0</td>\n",
 823 |        "      <td>0</td>\n",
 824 |        "    </tr>\n",
 825 |        "    <tr>\n",
 826 |        "      <th>1407</th>\n",
 827 |        "      <td>0</td>\n",
 828 |        "      <td>0</td>\n",
 829 |        "    </tr>\n",
 830 |        "    <tr>\n",
 831 |        "      <th>371</th>\n",
 832 |        "      <td>0</td>\n",
 833 |        "      <td>0</td>\n",
 834 |        "    </tr>\n",
 835 |        "    <tr>\n",
 836 |        "      <th>2692</th>\n",
 837 |        "      <td>0</td>\n",
 838 |        "      <td>0</td>\n",
 839 |        "    </tr>\n",
 840 |        "    <tr>\n",
 841 |        "      <th>...</th>\n",
 842 |        "      <td>...</td>\n",
 843 |        "      <td>...</td>\n",
 844 |        "    </tr>\n",
 845 |        "    <tr>\n",
 846 |        "      <th>494</th>\n",
 847 |        "      <td>0</td>\n",
 848 |        "      <td>0</td>\n",
 849 |        "    </tr>\n",
 850 |        "    <tr>\n",
 851 |        "      <th>1135</th>\n",
 852 |        "      <td>0</td>\n",
 853 |        "      <td>0</td>\n",
 854 |        "    </tr>\n",
 855 |        "    <tr>\n",
 856 |        "      <th>1621</th>\n",
 857 |        "      <td>0</td>\n",
 858 |        "      <td>0</td>\n",
 859 |        "    </tr>\n",
 860 |        "    <tr>\n",
 861 |        "      <th>80</th>\n",
 862 |        "      <td>0</td>\n",
 863 |        "      <td>0</td>\n",
 864 |        "    </tr>\n",
 865 |        "    <tr>\n",
 866 |        "      <th>646</th>\n",
 867 |        "      <td>0</td>\n",
 868 |        "      <td>0</td>\n",
 869 |        "    </tr>\n",
 870 |        "  </tbody>\n",
 871 |        "</table>\n",
 872 |        "<p>557 rows × 2 columns</p>\n",
 873 |        "</div>"
 874 |       ],
 875 |       "text/plain": [
 876 |        "      y_pred  y_test1\n",
 877 |        "502        0        0\n",
 878 |        "590        0        0\n",
 879 |        "1407       0        0\n",
 880 |        "371        0        0\n",
 881 |        "2692       0        0\n",
 882 |        "...      ...      ...\n",
 883 |        "494        0        0\n",
 884 |        "1135       0        0\n",
 885 |        "1621       0        0\n",
 886 |        "80         0        0\n",
 887 |        "646        0        0\n",
 888 |        "\n",
 889 |        "[557 rows x 2 columns]"
 890 |       ]
 891 |      },
 892 |      "execution_count": 53,
 893 |      "metadata": {},
 894 |      "output_type": "execute_result"
 895 |     }
 896 |    ],
 897 |    "source": [
 898 |     "df_combined"
 899 |    ]
 900 |   },
 901 |   {
 902 |    "cell_type": "code",
 903 |    "execution_count": 55,
 904 |    "metadata": {},
 905 |    "outputs": [
 906 |     {
 907 |      "name": "stdout",
 908 |      "output_type": "stream",
 909 |      "text": [
 910 |       "0.43037974683544306\n"
 911 |      ]
 912 |     }
 913 |    ],
 914 |    "source": [
 915 |     "count = ((df_combined['y_pred'] == 1) & (df_combined['y_test1'] == 1)).sum()\n",
 916 |     "\n",
 917 |     "print(count/sum(y_test))"
 918 |    ]
 919 |   },
 920 |   {
 921 |    "cell_type": "code",
 922 |    "execution_count": 34,
 923 |    "metadata": {},
 924 |    "outputs": [],
 925 |    "source": [
 926 |     "# y_pred_tol = model.predict(X)"
 927 |    ]
 928 |   },
 929 |   {
 930 |    "cell_type": "code",
 931 |    "execution_count": 39,
 932 |    "metadata": {},
 933 |    "outputs": [],
 934 |    "source": [
 935 |     "# np.savetxt('predictions.csv', y_pred_tol, delimiter=',', fmt='%d')"
 936 |    ]
 937 |   },
 938 |   {
 939 |    "cell_type": "code",
 940 |    "execution_count": 32,
 941 |    "metadata": {},
 942 |    "outputs": [],
 943 |    "source": [
 944 |     "y_pred_proba = model.predict_proba(X_test)[:, 1]"
 945 |    ]
 946 |   },
 947 |   {
 948 |    "cell_type": "code",
 949 |    "execution_count": 33,
 950 |    "metadata": {},
 951 |    "outputs": [
 952 |     {
 953 |      "name": "stdout",
 954 |      "output_type": "stream",
 955 |      "text": [
 956 |       "Accuracy: 0.9066427289048474\n"
 957 |      ]
 958 |     }
 959 |    ],
 960 |    "source": [
 961 |     "# Compute accuracy\n",
 962 |     "accuracy = accuracy_score(y_test, y_pred)\n",
 963 |     "print(\"Accuracy:\", accuracy)"
 964 |    ]
 965 |   },
 966 |   {
 967 |    "cell_type": "code",
 968 |    "execution_count": 34,
 969 |    "metadata": {},
 970 |    "outputs": [
 971 |     {
 972 |      "name": "stdout",
 973 |      "output_type": "stream",
 974 |      "text": [
 975 |       "AUC: 0.9274932471797045\n"
 976 |      ]
 977 |     }
 978 |    ],
 979 |    "source": [
 980 |     "# Compute AUC\n",
 981 |     "auc = roc_auc_score(y_test, y_pred_proba)\n",
 982 |     "print(\"AUC:\", auc)"
 983 |    ]
 984 |   },
 985 |   {
 986 |    "cell_type": "code",
 987 |    "execution_count": 37,
 988 |    "metadata": {},
 989 |    "outputs": [
 990 |     {
 991 |      "name": "stdout",
 992 |      "output_type": "stream",
 993 |      "text": [
 994 |       "F1 score: 0.5666666666666667\n"
 995 |      ]
 996 |     }
 997 |    ],
 998 |    "source": [
 999 |     "# Compute the F1 score\n",
1000 |     "f1 = f1_score(y_test, y_pred)\n",
1001 |     "\n",
1002 |     "print(\"F1 score:\", f1)"
1003 |    ]
1004 |   },
1005 |   {
1006 |    "cell_type": "code",
1007 |    "execution_count": 56,
1008 |    "metadata": {},
1009 |    "outputs": [
1010 |     {
1011 |      "name": "stdout",
1012 |      "output_type": "stream",
1013 |      "text": [
1014 |       "稀释每股收益: 0.043412793427705765\n",
1015 |       "扣除非经常性损益后的加权平均净资产收益率: 0.03483826294541359\n",
1016 |       "每股经营活动产生的现金流量净额: 0.042398642748594284\n",
1017 |       "归属于上市公司股东的每股净资产: 0.03426690027117729\n",
1018 |       "资产负债率: 0.03857659175992012\n",
1019 |       "流动比率: 0.039224375039339066\n",
1020 |       "速动比率: 0.03607599809765816\n",
1021 |       "应收账款周转率: 0.035372745245695114\n",
1022 |       "存货周转率: 0.03474723920226097\n",
1023 |       "是否有私募或风投: 0.03987099230289459\n",
1024 |       "当年获得专利数量: 0.09171169251203537\n",
1025 |       "研发费用营业收入: 0.05831410735845566\n",
1026 |       "政府补助营业收入: 0.02760440669953823\n",
1027 |       "是否为科创版: 0.37066173553466797\n",
1028 |       "posratio: 0.03695209324359894\n",
1029 |       "negratio: 0.03597134351730347\n"
1030 |      ]
1031 |     }
1032 |    ],
1033 |    "source": [
1034 |     "# Get the feature importances\n",
1035 |     "importance = model.feature_importances_\n",
1036 |     "\n",
1037 |     "# Print the importance of each feature\n",
1038 |     "for feature, importance_score in zip(X.columns, importance):\n",
1039 |     "    print(f\"{feature}: {importance_score}\")"
1040 |    ]
1041 |   }
1042 |  ],
1043 |  "metadata": {
1044 |   "kernelspec": {
1045 |    "display_name": "pytorch",
1046 |    "language": "python",
1047 |    "name": "python3"
1048 |   },
1049 |   "language_info": {
1050 |    "codemirror_mode": {
1051 |     "name": "ipython",
1052 |     "version": 3
1053 |    },
1054 |    "file_extension": ".py",
1055 |    "mimetype": "text/x-python",
1056 |    "name": "python",
1057 |    "nbconvert_exporter": "python",
1058 |    "pygments_lexer": "ipython3",
1059 |    "version": "3.10.9"
1060 |   },
1061 |   "orig_nbformat": 4
1062 |  },
1063 |  "nbformat": 4,
1064 |  "nbformat_minor": 2
1065 | }
1066 | 


--------------------------------------------------------------------------------