├── .gitattributes ├── .gitignore ├── 1.txt ├── LICENSE ├── README.md ├── params ├── bart │ ├── config.json │ ├── merges.txt │ ├── tokenizer.json │ └── vocab.json ├── pegasus │ ├── config.json │ ├── special_tokens_map.json │ ├── spiece.model │ ├── tokenizer.json │ └── tokenizer_config.json ├── t5-base │ ├── config.json │ ├── spiece.model │ └── tokenizer.json ├── t5-large │ ├── config.json │ ├── spiece.model │ └── tokenizer.json └── t5-small │ ├── config.json │ ├── spiece.model │ └── tokenizer.json ├── requirements.txt ├── score.png └── source ├── __pycache__ ├── models.cpython-37.pyc ├── pretrained_models.cpython-37.pyc ├── settings.cpython-37.pyc ├── submodels.cpython-37.pyc └── utils.cpython-37.pyc ├── go.py ├── models.py ├── pretrained_models.py ├── settings.py ├── temp.py └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /dataset/ 2 | *.bin -------------------------------------------------------------------------------- /1.txt: -------------------------------------------------------------------------------- 1 | PegasusForConditionalGeneration( 2 | (model): PegasusModel( 3 | (shared): Embedding(96103, 1024, padding_idx=0) 4 | (encoder): PegasusEncoder( 5 | (embed_tokens): Embedding(96103, 1024, padding_idx=0) 6 | (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024) 7 | (layers): ModuleList( 8 | (0): PegasusEncoderLayer( 9 | (self_attn): PegasusAttention( 10 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 11 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 12 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 13 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 14 | ) 15 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 16 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 17 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 18 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 19 | ) 20 | (1): PegasusEncoderLayer( 21 | (self_attn): PegasusAttention( 22 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 23 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 24 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 25 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 26 | ) 27 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 28 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 29 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 30 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 31 | ) 32 | (2): PegasusEncoderLayer( 33 | (self_attn): PegasusAttention( 34 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 35 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 36 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 37 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 38 | ) 39 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 40 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 41 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 42 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 43 | ) 44 | (3): PegasusEncoderLayer( 45 | (self_attn): PegasusAttention( 46 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 47 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 48 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 49 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 50 | ) 51 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 52 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 53 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 54 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 55 | ) 56 | (4): PegasusEncoderLayer( 57 | (self_attn): PegasusAttention( 58 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 59 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 60 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 61 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 62 | ) 63 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 64 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 65 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 66 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 67 | ) 68 | (5): PegasusEncoderLayer( 69 | (self_attn): PegasusAttention( 70 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 71 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 72 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 73 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 74 | ) 75 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 76 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 77 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 78 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 79 | ) 80 | (6): PegasusEncoderLayer( 81 | (self_attn): PegasusAttention( 82 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 83 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 84 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 85 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 86 | ) 87 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 88 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 89 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 90 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 91 | ) 92 | (7): PegasusEncoderLayer( 93 | (self_attn): PegasusAttention( 94 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 95 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 96 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 97 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 98 | ) 99 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 100 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 101 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 102 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 103 | ) 104 | (8): PegasusEncoderLayer( 105 | (self_attn): PegasusAttention( 106 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 107 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 108 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 109 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 110 | ) 111 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 112 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 113 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 114 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 115 | ) 116 | (9): PegasusEncoderLayer( 117 | (self_attn): PegasusAttention( 118 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 119 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 120 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 121 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 122 | ) 123 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 124 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 125 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 126 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 127 | ) 128 | (10): PegasusEncoderLayer( 129 | (self_attn): PegasusAttention( 130 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 131 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 132 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 133 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 134 | ) 135 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 136 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 137 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 138 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 139 | ) 140 | (11): PegasusEncoderLayer( 141 | (self_attn): PegasusAttention( 142 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 143 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 144 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 145 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 146 | ) 147 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 148 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 149 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 150 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 151 | ) 152 | (12): PegasusEncoderLayer( 153 | (self_attn): PegasusAttention( 154 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 155 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 156 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 157 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 158 | ) 159 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 160 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 161 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 162 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 163 | ) 164 | (13): PegasusEncoderLayer( 165 | (self_attn): PegasusAttention( 166 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 167 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 168 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 169 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 170 | ) 171 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 172 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 173 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 174 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 175 | ) 176 | (14): PegasusEncoderLayer( 177 | (self_attn): PegasusAttention( 178 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 179 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 180 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 181 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 182 | ) 183 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 184 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 185 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 186 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 187 | ) 188 | (15): PegasusEncoderLayer( 189 | (self_attn): PegasusAttention( 190 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 191 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 192 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 193 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 194 | ) 195 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 196 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 197 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 198 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 199 | ) 200 | ) 201 | (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 202 | ) 203 | (decoder): PegasusDecoder( 204 | (embed_tokens): Embedding(96103, 1024, padding_idx=0) 205 | (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024) 206 | (layers): ModuleList( 207 | (0): PegasusDecoderLayer( 208 | (self_attn): PegasusAttention( 209 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 210 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 211 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 212 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 213 | ) 214 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 215 | (encoder_attn): PegasusAttention( 216 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 217 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 218 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 219 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 220 | ) 221 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 222 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 223 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 224 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 225 | ) 226 | (1): PegasusDecoderLayer( 227 | (self_attn): PegasusAttention( 228 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 229 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 230 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 231 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 232 | ) 233 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 234 | (encoder_attn): PegasusAttention( 235 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 236 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 237 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 238 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 239 | ) 240 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 241 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 242 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 243 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 244 | ) 245 | (2): PegasusDecoderLayer( 246 | (self_attn): PegasusAttention( 247 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 248 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 249 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 250 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 251 | ) 252 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 253 | (encoder_attn): PegasusAttention( 254 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 255 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 256 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 257 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 258 | ) 259 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 260 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 261 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 262 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 263 | ) 264 | (3): PegasusDecoderLayer( 265 | (self_attn): PegasusAttention( 266 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 267 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 268 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 269 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 270 | ) 271 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 272 | (encoder_attn): PegasusAttention( 273 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 274 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 275 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 276 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 277 | ) 278 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 279 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 280 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 281 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 282 | ) 283 | (4): PegasusDecoderLayer( 284 | (self_attn): PegasusAttention( 285 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 286 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 287 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 288 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 289 | ) 290 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 291 | (encoder_attn): PegasusAttention( 292 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 293 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 294 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 295 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 296 | ) 297 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 298 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 299 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 300 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 301 | ) 302 | (5): PegasusDecoderLayer( 303 | (self_attn): PegasusAttention( 304 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 305 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 306 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 307 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 308 | ) 309 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 310 | (encoder_attn): PegasusAttention( 311 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 312 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 313 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 314 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 315 | ) 316 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 317 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 318 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 319 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 320 | ) 321 | (6): PegasusDecoderLayer( 322 | (self_attn): PegasusAttention( 323 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 324 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 325 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 326 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 327 | ) 328 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 329 | (encoder_attn): PegasusAttention( 330 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 331 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 332 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 333 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 334 | ) 335 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 336 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 337 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 338 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 339 | ) 340 | (7): PegasusDecoderLayer( 341 | (self_attn): PegasusAttention( 342 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 343 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 344 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 345 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 346 | ) 347 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 348 | (encoder_attn): PegasusAttention( 349 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 350 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 351 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 352 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 353 | ) 354 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 355 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 356 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 357 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 358 | ) 359 | (8): PegasusDecoderLayer( 360 | (self_attn): PegasusAttention( 361 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 362 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 363 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 364 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 365 | ) 366 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 367 | (encoder_attn): PegasusAttention( 368 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 369 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 370 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 371 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 372 | ) 373 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 374 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 375 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 376 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 377 | ) 378 | (9): PegasusDecoderLayer( 379 | (self_attn): PegasusAttention( 380 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 381 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 382 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 383 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 384 | ) 385 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 386 | (encoder_attn): PegasusAttention( 387 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 388 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 389 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 390 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 391 | ) 392 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 393 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 394 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 395 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 396 | ) 397 | (10): PegasusDecoderLayer( 398 | (self_attn): PegasusAttention( 399 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 400 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 401 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 402 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 403 | ) 404 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 405 | (encoder_attn): PegasusAttention( 406 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 407 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 408 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 409 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 410 | ) 411 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 412 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 413 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 414 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 415 | ) 416 | (11): PegasusDecoderLayer( 417 | (self_attn): PegasusAttention( 418 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 419 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 420 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 421 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 422 | ) 423 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 424 | (encoder_attn): PegasusAttention( 425 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 426 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 427 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 428 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 429 | ) 430 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 431 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 432 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 433 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 434 | ) 435 | (12): PegasusDecoderLayer( 436 | (self_attn): PegasusAttention( 437 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 438 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 439 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 440 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 441 | ) 442 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 443 | (encoder_attn): PegasusAttention( 444 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 445 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 446 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 447 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 448 | ) 449 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 450 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 451 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 452 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 453 | ) 454 | (13): PegasusDecoderLayer( 455 | (self_attn): PegasusAttention( 456 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 457 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 458 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 459 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 460 | ) 461 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 462 | (encoder_attn): PegasusAttention( 463 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 464 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 465 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 466 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 467 | ) 468 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 469 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 470 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 471 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 472 | ) 473 | (14): PegasusDecoderLayer( 474 | (self_attn): PegasusAttention( 475 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 476 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 477 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 478 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 479 | ) 480 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 481 | (encoder_attn): PegasusAttention( 482 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 483 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 484 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 485 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 486 | ) 487 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 488 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 489 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 490 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 491 | ) 492 | (15): PegasusDecoderLayer( 493 | (self_attn): PegasusAttention( 494 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 495 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 496 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 497 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 498 | ) 499 | (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 500 | (encoder_attn): PegasusAttention( 501 | (k_proj): Linear(in_features=1024, out_features=1024, bias=True) 502 | (v_proj): Linear(in_features=1024, out_features=1024, bias=True) 503 | (q_proj): Linear(in_features=1024, out_features=1024, bias=True) 504 | (out_proj): Linear(in_features=1024, out_features=1024, bias=True) 505 | ) 506 | (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 507 | (fc1): Linear(in_features=1024, out_features=4096, bias=True) 508 | (fc2): Linear(in_features=4096, out_features=1024, bias=True) 509 | (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 510 | ) 511 | ) 512 | (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) 513 | ) 514 | ) 515 | (lm_head): Linear(in_features=1024, out_features=96103, bias=False) 516 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TextSum 2 | ## 0 使用说明 3 | 1. 项目相关依赖已写入requirements.txt文件 `pip install -r requirements.txt` 4 | 2. 项目使用了`transformers`提供的预训练模型,相关模型、配置文件、词典文件等于[此处](https://huggingface.co/models)下载 5 | 3. 运行项目前,于/source/settings.py中修改路径设置为本地实际绝对路径 6 | 4. 项目结构:
7 | TextSum
8 | --dataset(数据集、词典、词频表)
9 | --params(预训练模型、模型参数保存文件)
10 | --source(源代码)
11 | ----go.py(主控函数)
12 | ----pretrained_models.py(预训练模型)
13 | ----models.py(自定义模型)
14 | ----settings.py(项目设置)
15 | ----utils.py(工具函数)
16 | 5. `python go.py` 运行项目,可选命令行参数如下: 17 | ``` 18 | -h, --help show this help message and exit 19 | -p, --preprocess 预处理数据 20 | -b, --build 建立词频表 21 | -m, --make 建立词典 22 | -t 模型名, --train 训练 23 | 24 | -f 模型名, --fine_tune 微调 25 | 26 | -g 模型名 参数路径, --gen 生成submission 27 | 28 | ``` 29 | ## 1 数据处理 30 | 本项目数据处理共分为部分:数据清洗与划分、词典生成、张量转换 31 | + 数据清洗与划分 32 | + 使用正则表达式清洗原始数据,去除文本中与任务无关的信息 33 | + 从原始训练集中划分出验证集 34 | + 将原始CSV文件转换为逐条文本的JSON文件 35 | + 词典生成 36 | 统计数据集中出现过的所有单词的词频,取一定数目的高频词生成字典 37 | + 张量转换 38 | 读取预处理完毕的json文件,进一步处理后将文本数据集转换为成batch的Tensor 39 | ## 2 模型结构 40 | 本项目使用`pytorch`实现了模型基础结构、自定义损失函数、优化器以及模型训练、验证过程; 41 | 本项目还使用`transformers`提供的预训练模型(bart、t5、pegasus)及函数接口实现了模型的微调与推断 42 | 以下给出部分模型的网络结构 43 | 1. GRU编码器-解码器架构网络结构如下: 44 | ```python 45 | EncoderDecoder( 46 | (encoder): GruEncoder( 47 | (embdding): Embedding(10004, 512) 48 | (rnn): GRU(512, 256, num_layers=2) 49 | ) 50 | (decoder): GruDecoder( 51 | (embdding): Embedding(10004, 512) 52 | (rnn): GRU(768, 256, num_layers=2) 53 | (dense): Linear(in_features=256, out_features=10004, bias=True) 54 | ) 55 | ) 56 | ``` 57 | 2. t5(small) 58 | ```python 59 | T5ForConditionalGeneration( 60 | (shared): Embedding(32128, 512) 61 | (encoder): T5Stack( 62 | (embed_tokens): Embedding(32128, 512) 63 | (block): ModuleList( 64 | (0): T5Block( 65 | (layer): ModuleList( 66 | (0): T5LayerSelfAttention( 67 | (SelfAttention): T5Attention( 68 | (q): Linear(in_features=512, out_features=512, bias=False) 69 | (k): Linear(in_features=512, out_features=512, bias=False) 70 | (v): Linear(in_features=512, out_features=512, bias=False) 71 | (o): Linear(in_features=512, out_features=512, bias=False) 72 | (relative_attention_bias): Embedding(32, 8) 73 | ) 74 | (layer_norm): T5LayerNorm() 75 | (dropout): Dropout(p=0.1, inplace=False) 76 | ) 77 | (1): T5LayerFF( 78 | (DenseReluDense): T5DenseReluDense( 79 | (wi): Linear(in_features=512, out_features=2048, bias=False) 80 | (wo): Linear(in_features=2048, out_features=512, bias=False) 81 | (dropout): Dropout(p=0.1, inplace=False) 82 | ) 83 | (layer_norm): T5LayerNorm() 84 | (dropout): Dropout(p=0.1, inplace=False) 85 | ) 86 | ) 87 | ) 88 | (1): T5Block( 89 | (layer): ModuleList( 90 | (0): T5LayerSelfAttention( 91 | (SelfAttention): T5Attention( 92 | (q): Linear(in_features=512, out_features=512, bias=False) 93 | (k): Linear(in_features=512, out_features=512, bias=False) 94 | (v): Linear(in_features=512, out_features=512, bias=False) 95 | (o): Linear(in_features=512, out_features=512, bias=False) 96 | ) 97 | (layer_norm): T5LayerNorm() 98 | (dropout): Dropout(p=0.1, inplace=False) 99 | ) 100 | (1): T5LayerFF( 101 | (DenseReluDense): T5DenseReluDense( 102 | (wi): Linear(in_features=512, out_features=2048, bias=False) 103 | (wo): Linear(in_features=2048, out_features=512, bias=False) 104 | (dropout): Dropout(p=0.1, inplace=False) 105 | ) 106 | (layer_norm): T5LayerNorm() 107 | (dropout): Dropout(p=0.1, inplace=False) 108 | ) 109 | ) 110 | ) 111 | (2): T5Block( 112 | (layer): ModuleList( 113 | (0): T5LayerSelfAttention( 114 | (SelfAttention): T5Attention( 115 | (q): Linear(in_features=512, out_features=512, bias=False) 116 | (k): Linear(in_features=512, out_features=512, bias=False) 117 | (v): Linear(in_features=512, out_features=512, bias=False) 118 | (o): Linear(in_features=512, out_features=512, bias=False) 119 | ) 120 | (layer_norm): T5LayerNorm() 121 | (dropout): Dropout(p=0.1, inplace=False) 122 | ) 123 | (1): T5LayerFF( 124 | (DenseReluDense): T5DenseReluDense( 125 | (wi): Linear(in_features=512, out_features=2048, bias=False) 126 | (wo): Linear(in_features=2048, out_features=512, bias=False) 127 | (dropout): Dropout(p=0.1, inplace=False) 128 | ) 129 | (layer_norm): T5LayerNorm() 130 | (dropout): Dropout(p=0.1, inplace=False) 131 | ) 132 | ) 133 | ) 134 | (3): T5Block( 135 | (layer): ModuleList( 136 | (0): T5LayerSelfAttention( 137 | (SelfAttention): T5Attention( 138 | (q): Linear(in_features=512, out_features=512, bias=False) 139 | (k): Linear(in_features=512, out_features=512, bias=False) 140 | (v): Linear(in_features=512, out_features=512, bias=False) 141 | (o): Linear(in_features=512, out_features=512, bias=False) 142 | ) 143 | (layer_norm): T5LayerNorm() 144 | (dropout): Dropout(p=0.1, inplace=False) 145 | ) 146 | (1): T5LayerFF( 147 | (DenseReluDense): T5DenseReluDense( 148 | (wi): Linear(in_features=512, out_features=2048, bias=False) 149 | (wo): Linear(in_features=2048, out_features=512, bias=False) 150 | (dropout): Dropout(p=0.1, inplace=False) 151 | ) 152 | (layer_norm): T5LayerNorm() 153 | (dropout): Dropout(p=0.1, inplace=False) 154 | ) 155 | ) 156 | ) 157 | (4): T5Block( 158 | (layer): ModuleList( 159 | (0): T5LayerSelfAttention( 160 | (SelfAttention): T5Attention( 161 | (q): Linear(in_features=512, out_features=512, bias=False) 162 | (k): Linear(in_features=512, out_features=512, bias=False) 163 | (v): Linear(in_features=512, out_features=512, bias=False) 164 | (o): Linear(in_features=512, out_features=512, bias=False) 165 | ) 166 | (layer_norm): T5LayerNorm() 167 | (dropout): Dropout(p=0.1, inplace=False) 168 | ) 169 | (1): T5LayerFF( 170 | (DenseReluDense): T5DenseReluDense( 171 | (wi): Linear(in_features=512, out_features=2048, bias=False) 172 | (wo): Linear(in_features=2048, out_features=512, bias=False) 173 | (dropout): Dropout(p=0.1, inplace=False) 174 | ) 175 | (layer_norm): T5LayerNorm() 176 | (dropout): Dropout(p=0.1, inplace=False) 177 | ) 178 | ) 179 | ) 180 | (5): T5Block( 181 | (layer): ModuleList( 182 | (0): T5LayerSelfAttention( 183 | (SelfAttention): T5Attention( 184 | (q): Linear(in_features=512, out_features=512, bias=False) 185 | (k): Linear(in_features=512, out_features=512, bias=False) 186 | (v): Linear(in_features=512, out_features=512, bias=False) 187 | (o): Linear(in_features=512, out_features=512, bias=False) 188 | ) 189 | (layer_norm): T5LayerNorm() 190 | (dropout): Dropout(p=0.1, inplace=False) 191 | ) 192 | (1): T5LayerFF( 193 | (DenseReluDense): T5DenseReluDense( 194 | (wi): Linear(in_features=512, out_features=2048, bias=False) 195 | (wo): Linear(in_features=2048, out_features=512, bias=False) 196 | (dropout): Dropout(p=0.1, inplace=False) 197 | ) 198 | (layer_norm): T5LayerNorm() 199 | (dropout): Dropout(p=0.1, inplace=False) 200 | ) 201 | ) 202 | ) 203 | ) 204 | (final_layer_norm): T5LayerNorm() 205 | (dropout): Dropout(p=0.1, inplace=False) 206 | ) 207 | (decoder): T5Stack( 208 | (embed_tokens): Embedding(32128, 512) 209 | (block): ModuleList( 210 | (0): T5Block( 211 | (layer): ModuleList( 212 | (0): T5LayerSelfAttention( 213 | (SelfAttention): T5Attention( 214 | (q): Linear(in_features=512, out_features=512, bias=False) 215 | (k): Linear(in_features=512, out_features=512, bias=False) 216 | (v): Linear(in_features=512, out_features=512, bias=False) 217 | (o): Linear(in_features=512, out_features=512, bias=False) 218 | (relative_attention_bias): Embedding(32, 8) 219 | ) 220 | (layer_norm): T5LayerNorm() 221 | (dropout): Dropout(p=0.1, inplace=False) 222 | ) 223 | (1): T5LayerCrossAttention( 224 | (EncDecAttention): T5Attention( 225 | (q): Linear(in_features=512, out_features=512, bias=False) 226 | (k): Linear(in_features=512, out_features=512, bias=False) 227 | (v): Linear(in_features=512, out_features=512, bias=False) 228 | (o): Linear(in_features=512, out_features=512, bias=False) 229 | ) 230 | (layer_norm): T5LayerNorm() 231 | (dropout): Dropout(p=0.1, inplace=False) 232 | ) 233 | (2): T5LayerFF( 234 | (DenseReluDense): T5DenseReluDense( 235 | (wi): Linear(in_features=512, out_features=2048, bias=False) 236 | (wo): Linear(in_features=2048, out_features=512, bias=False) 237 | (dropout): Dropout(p=0.1, inplace=False) 238 | ) 239 | (layer_norm): T5LayerNorm() 240 | (dropout): Dropout(p=0.1, inplace=False) 241 | ) 242 | ) 243 | ) 244 | (1): T5Block( 245 | (layer): ModuleList( 246 | (0): T5LayerSelfAttention( 247 | (SelfAttention): T5Attention( 248 | (q): Linear(in_features=512, out_features=512, bias=False) 249 | (k): Linear(in_features=512, out_features=512, bias=False) 250 | (v): Linear(in_features=512, out_features=512, bias=False) 251 | (o): Linear(in_features=512, out_features=512, bias=False) 252 | ) 253 | (layer_norm): T5LayerNorm() 254 | (dropout): Dropout(p=0.1, inplace=False) 255 | ) 256 | (1): T5LayerCrossAttention( 257 | (EncDecAttention): T5Attention( 258 | (q): Linear(in_features=512, out_features=512, bias=False) 259 | (k): Linear(in_features=512, out_features=512, bias=False) 260 | (v): Linear(in_features=512, out_features=512, bias=False) 261 | (o): Linear(in_features=512, out_features=512, bias=False) 262 | ) 263 | (layer_norm): T5LayerNorm() 264 | (dropout): Dropout(p=0.1, inplace=False) 265 | ) 266 | (2): T5LayerFF( 267 | (DenseReluDense): T5DenseReluDense( 268 | (wi): Linear(in_features=512, out_features=2048, bias=False) 269 | (wo): Linear(in_features=2048, out_features=512, bias=False) 270 | (dropout): Dropout(p=0.1, inplace=False) 271 | ) 272 | (layer_norm): T5LayerNorm() 273 | (dropout): Dropout(p=0.1, inplace=False) 274 | ) 275 | ) 276 | ) 277 | (2): T5Block( 278 | (layer): ModuleList( 279 | (0): T5LayerSelfAttention( 280 | (SelfAttention): T5Attention( 281 | (q): Linear(in_features=512, out_features=512, bias=False) 282 | (k): Linear(in_features=512, out_features=512, bias=False) 283 | (v): Linear(in_features=512, out_features=512, bias=False) 284 | (o): Linear(in_features=512, out_features=512, bias=False) 285 | ) 286 | (layer_norm): T5LayerNorm() 287 | (dropout): Dropout(p=0.1, inplace=False) 288 | ) 289 | (1): T5LayerCrossAttention( 290 | (EncDecAttention): T5Attention( 291 | (q): Linear(in_features=512, out_features=512, bias=False) 292 | (k): Linear(in_features=512, out_features=512, bias=False) 293 | (v): Linear(in_features=512, out_features=512, bias=False) 294 | (o): Linear(in_features=512, out_features=512, bias=False) 295 | ) 296 | (layer_norm): T5LayerNorm() 297 | (dropout): Dropout(p=0.1, inplace=False) 298 | ) 299 | (2): T5LayerFF( 300 | (DenseReluDense): T5DenseReluDense( 301 | (wi): Linear(in_features=512, out_features=2048, bias=False) 302 | (wo): Linear(in_features=2048, out_features=512, bias=False) 303 | (dropout): Dropout(p=0.1, inplace=False) 304 | ) 305 | (layer_norm): T5LayerNorm() 306 | (dropout): Dropout(p=0.1, inplace=False) 307 | ) 308 | ) 309 | ) 310 | (3): T5Block( 311 | (layer): ModuleList( 312 | (0): T5LayerSelfAttention( 313 | (SelfAttention): T5Attention( 314 | (q): Linear(in_features=512, out_features=512, bias=False) 315 | (k): Linear(in_features=512, out_features=512, bias=False) 316 | (v): Linear(in_features=512, out_features=512, bias=False) 317 | (o): Linear(in_features=512, out_features=512, bias=False) 318 | ) 319 | (layer_norm): T5LayerNorm() 320 | (dropout): Dropout(p=0.1, inplace=False) 321 | ) 322 | (1): T5LayerCrossAttention( 323 | (EncDecAttention): T5Attention( 324 | (q): Linear(in_features=512, out_features=512, bias=False) 325 | (k): Linear(in_features=512, out_features=512, bias=False) 326 | (v): Linear(in_features=512, out_features=512, bias=False) 327 | (o): Linear(in_features=512, out_features=512, bias=False) 328 | ) 329 | (layer_norm): T5LayerNorm() 330 | (dropout): Dropout(p=0.1, inplace=False) 331 | ) 332 | (2): T5LayerFF( 333 | (DenseReluDense): T5DenseReluDense( 334 | (wi): Linear(in_features=512, out_features=2048, bias=False) 335 | (wo): Linear(in_features=2048, out_features=512, bias=False) 336 | (dropout): Dropout(p=0.1, inplace=False) 337 | ) 338 | (layer_norm): T5LayerNorm() 339 | (dropout): Dropout(p=0.1, inplace=False) 340 | ) 341 | ) 342 | ) 343 | (4): T5Block( 344 | (layer): ModuleList( 345 | (0): T5LayerSelfAttention( 346 | (SelfAttention): T5Attention( 347 | (q): Linear(in_features=512, out_features=512, bias=False) 348 | (k): Linear(in_features=512, out_features=512, bias=False) 349 | (v): Linear(in_features=512, out_features=512, bias=False) 350 | (o): Linear(in_features=512, out_features=512, bias=False) 351 | ) 352 | (layer_norm): T5LayerNorm() 353 | (dropout): Dropout(p=0.1, inplace=False) 354 | ) 355 | (1): T5LayerCrossAttention( 356 | (EncDecAttention): T5Attention( 357 | (q): Linear(in_features=512, out_features=512, bias=False) 358 | (k): Linear(in_features=512, out_features=512, bias=False) 359 | (v): Linear(in_features=512, out_features=512, bias=False) 360 | (o): Linear(in_features=512, out_features=512, bias=False) 361 | ) 362 | (layer_norm): T5LayerNorm() 363 | (dropout): Dropout(p=0.1, inplace=False) 364 | ) 365 | (2): T5LayerFF( 366 | (DenseReluDense): T5DenseReluDense( 367 | (wi): Linear(in_features=512, out_features=2048, bias=False) 368 | (wo): Linear(in_features=2048, out_features=512, bias=False) 369 | (dropout): Dropout(p=0.1, inplace=False) 370 | ) 371 | (layer_norm): T5LayerNorm() 372 | (dropout): Dropout(p=0.1, inplace=False) 373 | ) 374 | ) 375 | ) 376 | (5): T5Block( 377 | (layer): ModuleList( 378 | (0): T5LayerSelfAttention( 379 | (SelfAttention): T5Attention( 380 | (q): Linear(in_features=512, out_features=512, bias=False) 381 | (k): Linear(in_features=512, out_features=512, bias=False) 382 | (v): Linear(in_features=512, out_features=512, bias=False) 383 | (o): Linear(in_features=512, out_features=512, bias=False) 384 | ) 385 | (layer_norm): T5LayerNorm() 386 | (dropout): Dropout(p=0.1, inplace=False) 387 | ) 388 | (1): T5LayerCrossAttention( 389 | (EncDecAttention): T5Attention( 390 | (q): Linear(in_features=512, out_features=512, bias=False) 391 | (k): Linear(in_features=512, out_features=512, bias=False) 392 | (v): Linear(in_features=512, out_features=512, bias=False) 393 | (o): Linear(in_features=512, out_features=512, bias=False) 394 | ) 395 | (layer_norm): T5LayerNorm() 396 | (dropout): Dropout(p=0.1, inplace=False) 397 | ) 398 | (2): T5LayerFF( 399 | (DenseReluDense): T5DenseReluDense( 400 | (wi): Linear(in_features=512, out_features=2048, bias=False) 401 | (wo): Linear(in_features=2048, out_features=512, bias=False) 402 | (dropout): Dropout(p=0.1, inplace=False) 403 | ) 404 | (layer_norm): T5LayerNorm() 405 | (dropout): Dropout(p=0.1, inplace=False) 406 | ) 407 | ) 408 | ) 409 | ) 410 | (final_layer_norm): T5LayerNorm() 411 | (dropout): Dropout(p=0.1, inplace=False) 412 | ) 413 | (lm_head): Linear(in_features=512, out_features=32128, bias=False) 414 | ) 415 | ``` 416 | 417 | ## 3 最终成绩 418 | 本项目最终成绩为0.32107609 419 | ![](score.png) 420 | 参数设置如下: 421 | + 模型:bart-large-cnn 422 | + 搜索束个数:2 423 | + 最大序列长度:1024 424 | + 激活函数:gelu 425 | + 预测序列最短长度:30 426 | + 预测序列最长长度:590 427 | + 是否允许提前停止(预测出``即停止):是 428 | -------------------------------------------------------------------------------- /params/bart/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_num_labels": 3, 3 | "activation_dropout": 0.0, 4 | "activation_function": "gelu", 5 | "add_final_layer_norm": false, 6 | "architectures": [ 7 | "BartForConditionalGeneration" 8 | ], 9 | "attention_dropout": 0.0, 10 | "bos_token_id": 0, 11 | "classif_dropout": 0.0, 12 | "classifier_dropout": 0.0, 13 | "d_model": 1024, 14 | "decoder_attention_heads": 16, 15 | "decoder_ffn_dim": 4096, 16 | "decoder_layerdrop": 0.0, 17 | "decoder_layers": 12, 18 | "decoder_start_token_id": 2, 19 | "dropout": 0.1, 20 | "early_stopping": true, 21 | "encoder_attention_heads": 16, 22 | "encoder_ffn_dim": 4096, 23 | "encoder_layerdrop": 0.0, 24 | "encoder_layers": 12, 25 | "eos_token_id": 2, 26 | "force_bos_token_to_be_generated": true, 27 | "forced_bos_token_id": 0, 28 | "forced_eos_token_id": 2, 29 | "gradient_checkpointing": false, 30 | "id2label": { 31 | "0": "LABEL_0", 32 | "1": "LABEL_1", 33 | "2": "LABEL_2" 34 | }, 35 | "init_std": 0.02, 36 | "is_encoder_decoder": true, 37 | "label2id": { 38 | "LABEL_0": 0, 39 | "LABEL_1": 1, 40 | "LABEL_2": 2 41 | }, 42 | "length_penalty": 2.0, 43 | "max_length": 142, 44 | "max_position_embeddings": 1024, 45 | "min_length": 56, 46 | "model_type": "bart", 47 | "no_repeat_ngram_size": 3, 48 | "normalize_before": false, 49 | "num_beams": 4, 50 | "num_hidden_layers": 12, 51 | "output_past": true, 52 | "pad_token_id": 1, 53 | "prefix": " ", 54 | "scale_embedding": false, 55 | "task_specific_params": { 56 | "summarization": { 57 | "early_stopping": true, 58 | "length_penalty": 2.0, 59 | "max_length": 142, 60 | "min_length": 56, 61 | "no_repeat_ngram_size": 3, 62 | "num_beams": 4 63 | } 64 | }, 65 | "transformers_version": "4.7.0.dev0", 66 | "use_cache": true, 67 | "vocab_size": 50264 68 | } 69 | -------------------------------------------------------------------------------- /params/pegasus/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "./", 3 | "activation_dropout": 0.1, 4 | "activation_function": "relu", 5 | "add_bias_logits": false, 6 | "add_final_layer_norm": true, 7 | "architectures": [ 8 | "PegasusForConditionalGeneration" 9 | ], 10 | "attention_dropout": 0.1, 11 | "bos_token_id": 0, 12 | "classif_dropout": 0.0, 13 | "classifier_dropout": 0.0, 14 | "d_model": 1024, 15 | "decoder_attention_heads": 16, 16 | "decoder_ffn_dim": 4096, 17 | "decoder_layerdrop": 0.0, 18 | "decoder_layers": 16, 19 | "decoder_start_token_id": 0, 20 | "do_blenderbot_90_layernorm": false, 21 | "dropout": 0.1, 22 | "encoder_attention_heads": 16, 23 | "encoder_ffn_dim": 4096, 24 | "encoder_layerdrop": 0.0, 25 | "encoder_layers": 16, 26 | "eos_token_id": 1, 27 | "extra_pos_embeddings": 0, 28 | "force_bos_token_to_be_generated": false, 29 | "forced_eos_token_id": 1, 30 | "gradient_checkpointing": false, 31 | "id2label": { 32 | "0": "LABEL_0", 33 | "1": "LABEL_1", 34 | "2": "LABEL_2" 35 | }, 36 | "init_std": 0.02, 37 | "is_encoder_decoder": true, 38 | "label2id": { 39 | "LABEL_0": 0, 40 | "LABEL_1": 1, 41 | "LABEL_2": 2 42 | }, 43 | "length_penalty": 0.6, 44 | "max_length": 64, 45 | "max_position_embeddings": 512, 46 | "model_type": "pegasus", 47 | "normalize_before": true, 48 | "normalize_embedding": false, 49 | "num_beams": 8, 50 | "num_hidden_layers": 16, 51 | "pad_token_id": 0, 52 | "scale_embedding": true, 53 | "static_position_embeddings": true, 54 | "transformers_version": "4.11.0.dev0", 55 | "use_cache": true, 56 | "vocab_size": 96103 57 | } 58 | -------------------------------------------------------------------------------- /params/pegasus/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"eos_token": "", "unk_token": "", "pad_token": ""} -------------------------------------------------------------------------------- /params/pegasus/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/pegasus/spiece.model -------------------------------------------------------------------------------- /params/pegasus/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"model_max_length": 512, "special_tokens_map_file": null, "full_tokenizer_file": null} -------------------------------------------------------------------------------- /params/t5-base/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5WithLMHeadModel" 4 | ], 5 | "d_ff": 3072, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dropout_rate": 0.1, 10 | "eos_token_id": 1, 11 | "initializer_factor": 1.0, 12 | "is_encoder_decoder": true, 13 | "layer_norm_epsilon": 1e-06, 14 | "model_type": "t5", 15 | "n_positions": 512, 16 | "num_heads": 12, 17 | "num_layers": 12, 18 | "output_past": true, 19 | "pad_token_id": 0, 20 | "relative_attention_num_buckets": 32, 21 | "task_specific_params": { 22 | "summarization": { 23 | "early_stopping": true, 24 | "length_penalty": 2.0, 25 | "max_length": 200, 26 | "min_length": 30, 27 | "no_repeat_ngram_size": 3, 28 | "num_beams": 4, 29 | "prefix": "summarize: " 30 | }, 31 | "translation_en_to_de": { 32 | "early_stopping": true, 33 | "max_length": 300, 34 | "num_beams": 4, 35 | "prefix": "translate English to German: " 36 | }, 37 | "translation_en_to_fr": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to French: " 42 | }, 43 | "translation_en_to_ro": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to Romanian: " 48 | } 49 | }, 50 | "vocab_size": 32128 51 | } 52 | -------------------------------------------------------------------------------- /params/t5-base/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/t5-base/spiece.model -------------------------------------------------------------------------------- /params/t5-large/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5WithLMHeadModel" 4 | ], 5 | "d_ff": 4096, 6 | "d_kv": 64, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dropout_rate": 0.1, 10 | "eos_token_id": 1, 11 | "initializer_factor": 1.0, 12 | "is_encoder_decoder": true, 13 | "layer_norm_epsilon": 1e-06, 14 | "model_type": "t5", 15 | "n_positions": 512, 16 | "num_heads": 16, 17 | "num_layers": 24, 18 | "output_past": true, 19 | "pad_token_id": 0, 20 | "relative_attention_num_buckets": 32, 21 | "task_specific_params": { 22 | "summarization": { 23 | "early_stopping": true, 24 | "length_penalty": 2.0, 25 | "max_length": 200, 26 | "min_length": 30, 27 | "no_repeat_ngram_size": 3, 28 | "num_beams": 4, 29 | "prefix": "summarize: " 30 | }, 31 | "translation_en_to_de": { 32 | "early_stopping": true, 33 | "max_length": 300, 34 | "num_beams": 4, 35 | "prefix": "translate English to German: " 36 | }, 37 | "translation_en_to_fr": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to French: " 42 | }, 43 | "translation_en_to_ro": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to Romanian: " 48 | } 49 | }, 50 | "vocab_size": 32128 51 | } 52 | -------------------------------------------------------------------------------- /params/t5-large/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/t5-large/spiece.model -------------------------------------------------------------------------------- /params/t5-small/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5WithLMHeadModel" 4 | ], 5 | "d_ff": 2048, 6 | "d_kv": 64, 7 | "d_model": 512, 8 | "decoder_start_token_id": 0, 9 | "dropout_rate": 0.1, 10 | "eos_token_id": 1, 11 | "initializer_factor": 1.0, 12 | "is_encoder_decoder": true, 13 | "layer_norm_epsilon": 1e-06, 14 | "model_type": "t5", 15 | "n_positions": 512, 16 | "num_heads": 8, 17 | "num_layers": 6, 18 | "output_past": true, 19 | "pad_token_id": 0, 20 | "relative_attention_num_buckets": 32, 21 | "task_specific_params": { 22 | "summarization": { 23 | "early_stopping": true, 24 | "length_penalty": 2.0, 25 | "max_length": 450, 26 | "min_length": 30, 27 | "no_repeat_ngram_size": 3, 28 | "num_beams": 4, 29 | "prefix": "summarize: " 30 | }, 31 | "translation_en_to_de": { 32 | "early_stopping": true, 33 | "max_length": 300, 34 | "num_beams": 4, 35 | "prefix": "translate English to German: " 36 | }, 37 | "translation_en_to_fr": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to French: " 42 | }, 43 | "translation_en_to_ro": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to Romanian: " 48 | } 49 | }, 50 | "vocab_size": 32128 51 | } -------------------------------------------------------------------------------- /params/t5-small/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/t5-small/spiece.model -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | tqdm 3 | transformers 4 | pandas 5 | rouge -------------------------------------------------------------------------------- /score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/score.png -------------------------------------------------------------------------------- /source/__pycache__/models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/models.cpython-37.pyc -------------------------------------------------------------------------------- /source/__pycache__/pretrained_models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/pretrained_models.cpython-37.pyc -------------------------------------------------------------------------------- /source/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /source/__pycache__/submodels.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/submodels.cpython-37.pyc -------------------------------------------------------------------------------- /source/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /source/go.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import utils 3 | from models import GetModel 4 | import pretrained_models as pm 5 | 6 | parser=argparse.ArgumentParser() 7 | parser.add_argument("-p","--preprocess",help="预处理数据",action="store_true") 8 | parser.add_argument("-b","--build",help="建立词频表",action="store_true") 9 | parser.add_argument("-m","--make",help="建立词典",action="store_true") 10 | parser.add_argument("-t","--train",help="训练",type=str) 11 | parser.add_argument("-f","--fine_tune",help="微调",type=str) 12 | parser.add_argument("-g","--gen",help="生成submission",nargs=2,type=str) 13 | 14 | 15 | args=parser.parse_args() 16 | 17 | def main(): 18 | if(args.preprocess): 19 | print("--------------开始数据预处理--------------") 20 | try: 21 | utils.Preprocess() 22 | except Exception as e: 23 | print(e) 24 | print("--------------数据预处理完毕--------------") 25 | exit(0) 26 | if(args.build): 27 | print("--------------开始建立词频表--------------") 28 | try: 29 | utils.BuildVocabCounter() 30 | except Exception as e: 31 | print(e) 32 | print("--------------词频表建立完毕--------------") 33 | exit(0) 34 | if(args.make): 35 | print("--------------开始建立字典--------------") 36 | try: 37 | utils.MakeVocab() 38 | except Exception as e: 39 | print(e) 40 | print("--------------字典建立完毕--------------") 41 | exit(0) 42 | if(args.train): 43 | 44 | try: 45 | net=GetModel(args.train) 46 | print("--------------开始训练模型--------------") 47 | utils.Train(net) 48 | print("--------------模型训练完毕--------------") 49 | except Exception as e: 50 | print(e) 51 | exit(0) 52 | 53 | if(args.fine_tune): 54 | try: 55 | net,tkz=pm.GetPModel(args.fine_tune) 56 | print("--------------开始微调--------------") 57 | pm.FineTune(net,tkz) 58 | print("--------------微调完毕--------------") 59 | except Exception as e: 60 | print(e) 61 | exit(0) 62 | if(args.gen): 63 | 64 | net,param_path=args.gen 65 | 66 | if(param_path=="x"): 67 | param_path=None 68 | try: 69 | print("--------------开始生成submission--------------") 70 | if(net=="gru"): 71 | net=GetModel(net) 72 | utils.GenSubmisson(net,param_path) 73 | else: 74 | net,tkz=pm.GetPModel(net) 75 | pm.GenSub(net,tkz,param_path) 76 | 77 | print("--------------submission生成完毕--------------") 78 | except Exception as e: 79 | print(e) 80 | exit(0) 81 | 82 | 83 | 84 | print(r""" 85 | ___________ __ _________ .__ 86 | \__ ___/___ ___ ____/ |_ / _____/__ __ _____ _____ _____ _______|__|_______ ___________ 87 | | |_/ __ \\ \/ /\ __\ \_____ \| | \/ \ / \\__ \\_ __ \ \___ // __ \_ __ \ 88 | | |\ ___/ > < | | / \ | / Y Y \ Y Y \/ __ \| | \/ |/ /\ ___/| | \/ 89 | |____| \___ >__/\_ \ |__| /_______ /____/|__|_| /__|_| (____ /__| |__/_____ \\___ >__| 90 | \/ \/ \/ \/ \/ \/ \/ \/ 91 | """) 92 | print("-h, --help show help message and exit") 93 | 94 | if __name__=='__main__': 95 | main() -------------------------------------------------------------------------------- /source/models.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from torch.tensor import Tensor 4 | from settings import * 5 | import utils 6 | 7 | class MaskedSoftmaxCELoss(nn.CrossEntropyLoss): 8 | """带遮蔽的softmax交叉熵损失函数""" 9 | 10 | def _sequence_mask(self, X, valid_len, value=0): 11 | """ 在序列中屏蔽不相关的项。 12 | 接收valid_len是多个有效长度组成的一维tensor,如[1,2]代表第一个序列有效长度为1,第二个序列有效长度为2 13 | """ 14 | 15 | maxlen = X.size(1) 16 | mask = torch.arange((maxlen), dtype=torch.float32, 17 | device=X.device)[None, :] < valid_len[:, None] 18 | X[~mask] = value 19 | # 有效长度以外的元素都被置零,不改变原始shape 20 | return X 21 | 22 | def forward(self, pred, label, valid_len): 23 | # 不用看标签中的padding的损失 24 | weights = torch.ones_like(label) 25 | weights = self._sequence_mask(weights, valid_len) 26 | self.reduction = 'none' 27 | unweighted_loss = super().forward(pred.permute(0, 2, 1), label) 28 | 29 | # 把整个序列的loss取平均,最后输出的shape是(batch_size) 30 | weighted_loss = (unweighted_loss * weights).mean(dim=1) 31 | return weighted_loss 32 | 33 | 34 | class Encoder(nn.Module): 35 | '''编码器接口''' 36 | def __init__(self, **kwargs): 37 | super(Encoder,self).__init__(**kwargs) 38 | 39 | def forward(self,X,*args): 40 | raise NotImplementedError 41 | 42 | class Decoder(nn.Module): 43 | '''编码器接口''' 44 | def __init__(self, **kwargs): 45 | super(Decoder,self).__init__(**kwargs) 46 | 47 | # 接收编码器的输出,作为当前步的先验状态 48 | def init_state(self,enc_outputs,*args): 49 | raise NotImplementedError 50 | # state和解码器输入共同作为输入 51 | # 在一次序列训练中,初始state为编码器输入,之后会不断自我更新 52 | def forward(self,X,state): 53 | raise NotImplementedError 54 | 55 | class EncoderDecoder(nn.Module): 56 | '''编码器解码器架构基类''' 57 | def __init__(self, encoder:Encoder,decoder:Decoder,**kwargs): 58 | super(EncoderDecoder,self).__init__(**kwargs) 59 | self.encoder=encoder 60 | self.decoder=decoder 61 | 62 | def forward(self,enc_X,dec_X,*args): 63 | enc_outputs=self.encoder(enc_X,*args) 64 | dec_state=self.decoder.init_state(enc_outputs) 65 | 66 | return self.decoder(dec_X,dec_state) 67 | 68 | 69 | ################################## RNN(效果太差了) 70 | class GruEncoder(Encoder): 71 | def __init__(self,in_dim,emb_dim,hidden_size,num_layers,dropout=0,**kwargs): 72 | super(GruEncoder,self).__init__(**kwargs) 73 | self.embdding=nn.Embedding(in_dim,emb_dim) 74 | self.rnn=nn.GRU(emb_dim,hidden_size,num_layers,dropout=dropout) 75 | 76 | def forward(self,X:Tensor,*args): 77 | X=self.embdding(X) 78 | # 更改数据维度为seq_len,batch_size,features 79 | X=X.permute(1,0,2) 80 | output,state=self.rnn(X) 81 | # shape分别为: 82 | # (seq_len,batch_size,hidden_size) 83 | # (num_layers,batch_size,hidden_size) 84 | return output,state 85 | 86 | class GruDecoder(Decoder): 87 | def __init__(self,in_dim,emb_dim,hidden_size,num_layers,dropout=0,**kwargs): 88 | super(GruDecoder,self).__init__(**kwargs) 89 | self.embdding=nn.Embedding(in_dim,emb_dim) 90 | self.rnn=nn.GRU(emb_dim+hidden_size,hidden_size,num_layers,dropout=dropout) 91 | self.dense=nn.Linear(hidden_size,VOCAB_SIZE+4) 92 | 93 | def init_state(self, enc_outputs, *args): 94 | # 取enc的state 95 | return enc_outputs[1] 96 | 97 | def forward(self,X:Tensor,state:Tensor): 98 | X=self.embdding(X).permute(1,0,2) 99 | # 取最后时刻的最后一层 100 | context=state[-1].repeat(X.shape[0],1,1) 101 | 102 | # 虽然state在h0已经传过来了,但是还是把state拼一下,拼到了特征的维度,问题不大 103 | X_and_context=torch.cat((X,context),2) 104 | output,state=self.rnn(X_and_context,hx=state) 105 | output=self.dense(output).permute(1,0,2) 106 | # shape分别为: 107 | # (batch_size,seq_len,hidden_size) 108 | # (num_layers,batch_size,hidden_size) 109 | return output,state 110 | 111 | def GetTextSum_GRU(): 112 | return EncoderDecoder( 113 | GruEncoder(VOCAB_SIZE+4,512,256,2), 114 | GruDecoder(VOCAB_SIZE+4,512,256,2) 115 | ) 116 | ################################## 117 | 118 | 119 | 120 | def GetModel(name:str): 121 | name=name.lower() 122 | if(name=="gru"): 123 | return GetTextSum_GRU().to(DEVICE) 124 | 125 | else: 126 | raise Exception("该模型未实现!") 127 | 128 | if __name__=='__main__': 129 | # encoder=GruEncoder(VOCAB_SIZE+4,512,256,2) 130 | # decoder=GruDecoder(VOCAB_SIZE+4,512,256,2) 131 | # for enc_X,dec_X,y in utils.train_iter: 132 | # print(enc_X[0].shape) 133 | # enc_out=encoder(enc_X[0]) 134 | 135 | # state=decoder.init_state(enc_out) 136 | # output,state=decoder(dec_X[0],state) 137 | # print(output.shape) 138 | # loss_f=MaskedSoftmaxCELoss() 139 | # l=loss_f(output,y[0],y[1]) 140 | # print(l) 141 | 142 | # break 143 | 144 | net=GetTextSum_GRU() 145 | 146 | 147 | with open("1.txt","w+") as f: 148 | f.write(str(net)) 149 | -------------------------------------------------------------------------------- /source/pretrained_models.py: -------------------------------------------------------------------------------- 1 | # 使用预训练模型 2 | from transformers import PegasusTokenizer,PegasusForConditionalGeneration 3 | from transformers import T5Tokenizer, T5ForConditionalGeneration,AdamW 4 | from transformers import BartTokenizer,BartForConditionalGeneration 5 | from settings import * 6 | from utils import GetRouge,CountFiles 7 | import os 8 | from torch.utils.data.dataset import TensorDataset 9 | from torch.utils.data.dataloader import DataLoader 10 | from torch.nn.modules.module import Module 11 | 12 | current_model="" 13 | 14 | 15 | 16 | def ToTensor(texts,summaries,tokenizer): 17 | task_prefix="summarize: " 18 | encoding = tokenizer([task_prefix + sequence for sequence in texts], 19 | padding='longest', 20 | max_length=SOURCE_THRESHOLD, 21 | truncation=True, 22 | return_tensors="pt") 23 | input_ids, attention_mask = encoding.input_ids, encoding.attention_mask 24 | 25 | target_encoding = tokenizer(summaries, 26 | padding='longest', 27 | max_length=SUMMARY_THRESHOLD, 28 | truncation=True) 29 | labels = target_encoding.input_ids 30 | labels = [(i if i != tokenizer.pad_token_id else -100) for i in labels] 31 | labels = torch.tensor(labels) 32 | 33 | return TensorDataset(input_ids,attention_mask,labels) 34 | 35 | def FineTune(net:Module,tokenizer): 36 | '''微调''' 37 | 38 | tset_texts=[] 39 | tset_summaries=[] 40 | vset_texts=[] 41 | vset_summaries=[] 42 | tset_len=CountFiles(DATA_DIR+"new_train") 43 | vset_len=CountFiles(DATA_DIR+"new_val") 44 | for i in range(tset_len): 45 | text,summary=ReadJson(i,DATA_DIR+"new_train") 46 | tset_texts.append(text) 47 | tset_summaries.append(summary) 48 | for i in range(vset_len): 49 | text,summary=ReadJson(i,DATA_DIR+"new_val") 50 | vset_texts.append(text) 51 | vset_summaries.append(summary) 52 | print("训练数据已读入内存...") 53 | 54 | train_iter=DataLoader( 55 | ToTensor(tset_texts,tset_summaries,tokenizer), 56 | batch_size=BATCH_SZIE, 57 | shuffle=True, 58 | num_workers=4 59 | ) 60 | val_iter=DataLoader( 61 | ToTensor(vset_texts,vset_summaries,tokenizer), 62 | batch_size=BATCH_SZIE, 63 | shuffle=False, 64 | num_workers=4 65 | ) 66 | 67 | print("minibatch已生成...") 68 | 69 | print("开始训练模型...") 70 | opt=AdamW(net.parameters()) 71 | from tqdm import tqdm 72 | import time 73 | min_loss=10 74 | for epoch in range(EPOCHS): 75 | train_loss=[] 76 | val_loss=[] 77 | net.train() 78 | for batch in tqdm(train_iter): 79 | input_ids,attention_mask,labels=[x.to(DEVICE) for x in batch] 80 | l = net(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss 81 | l.backward() 82 | opt.step() 83 | opt.zero_grad() 84 | with torch.no_grad(): 85 | train_loss.append(l.item()) 86 | 87 | torch.cuda.empty_cache() 88 | net.eval() 89 | with torch.no_grad(): 90 | for batch in tqdm(val_iter): 91 | input_ids,attention_mask,labels=[x.to(DEVICE) for x in batch] 92 | l = net(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss 93 | val_loss.append(l.item()) 94 | 95 | if(sum(val_loss)数字 13 | WORD_IDX_PATH="D:/2021UCAS/AdvancedAI/TextSum/dataset/word2idx.pkl" 14 | # 数字->单词 15 | IDX_WORD_PATH="D:/2021UCAS/AdvancedAI/TextSum/dataset/idx2word.pkl" 16 | 17 | #------------------ 词典设置 ------------------# 18 | # 特殊符号 19 | PAD_WORD = '' 20 | UNK_WORD = '' 21 | BOS_WORD = '' 22 | EOS_WORD = '' 23 | PAD_NUM = 0 24 | UNK_NUM = 1 25 | BOS_NUM = 2 26 | EOS_NUM = 3 27 | # 词典大小(拉满就不会出现UNK),注意输入至网络时要加4(还有四个特殊字符) 28 | VOCAB_SIZE=10000 29 | # 最长原文序列长度 30 | MAX_SOURCE_LEN=2193 31 | # 最长摘要序列长度 32 | MAX_SUMMARY_LEN=587 33 | 34 | # 限定序列长度(长于此长度做切割,短于此长度做padding) 35 | SOURCE_THRESHOLD=1800 36 | SUMMARY_THRESHOLD=550 37 | # 读取数据时的标志 38 | TRAIN_FALG=0 39 | VAL_FALG=1 40 | TEST_FALG=2 41 | # 数据清理规则 42 | # 顺序莫变! 43 | PATTERNS_ONCE=[ 44 | "by .*? published :.*?\. \| \..*? [0-9]+ \. ", 45 | "by \. .*? \. ", 46 | "-lrb- cnn -rrb- -- ", 47 | "\t(.*?-lrb- .*? -rrb- -- )", 48 | ] 49 | PATTERNS_ANY=[ 50 | "``|''" 51 | ] 52 | 53 | #------------------ 其他设置 ------------------# 54 | DEVICE=torch.device("cuda:0") 55 | EPOCHS=10 56 | BATCH_SZIE=28 57 | 58 | 59 | #------------------ 预训练模型设置 ------------------# 60 | 61 | # 搜索束个数 62 | NUM_BEAMS=1 63 | # 预测序列最大长度 64 | MAX_LEN=590 65 | # 预测序列最小长度 66 | MIN_LEN=30 67 | -------------------------------------------------------------------------------- /source/temp.py: -------------------------------------------------------------------------------- 1 | # import re 2 | # s1="3 by . daily mail reporter . published : . 15:34 est , 13 july 2012 . | . updated : . 01:33 est , 16 july 2012 . kelsey grammer 's wife kayte has given birth to their first child together . the boss actor , 57 , and his 32-year-old spouse -- who were expecting twins -- are ` thrilled ' after welcoming a ` healthy baby girl ' weighing 6lbs 2oz into the world this morning in los angeles , and they have named her faith evangeline elisa grammer . but the couple revealed they tragically lost their unborn son shortly after announcing kayte was pregnant with twins . joy and heartache : kelsey grammer and kayte walsh , pictured in chicago esterday , have welcomed a baby girl , but also revealed they lost a twin boy during the pregnancy . in a personal note , they said : ` early . this morning kayte gave birth to faith evangeline elisa grammer . we . are thrilled . she was 6lbs 2oz when she entered the world at 1am on the . 13th of july in the year 2012 . mother and child are in excellent . health . ' ` we were ecstatic earlier this year , . when we announced that kayte was carrying twins . tragically we lost the . little boy shortly thereafter . this was not something we cared to make . known publicly at the time . ' ` it was unspeakably painful and we . know that people will understand our desire to keep the news private . then , as we know they will respect our privacy in this matter now . a . glorious birth with a lingering sadness is ours today . ` we choose to celebrate the life that has been given us ' : the pair released an emotional statement today . ` healthy baby girl ' : they have named the baby , who weighs 6lbs 2oz , faith evangeline elisa grammer . ` we choose to celebrate the life that . has been given us . we proudly introduce our faith to the world today . looking forward to the days ahead and the children yet to come . ' the couple -- who got married in . february 2011 and renewed their vows in june -- previously lost a child . when kayte suffered a miscarriage in 2010 . kelsey already has four kids , . spencer , 28 , and greer , 19 , from previous relationships and 10-year-old . mason and jude , seven , with ex-wife camille donatacci . the couple went public with their romance just weeks after he split from the real housewives of beverly hills star . ex wife : kelsey with real housewives star camille and their children jude and mason in 2008 . kayte gave birth to a ` healthy baby girl ' named faith evangeline elisa this morning . couple reveal ` unspeakable ' pain at losing twin boy during pregnancy . celebrating a ` glorious birth ' with ` lingering sadness '" 3 | # s2="2 by . daily mail reporter . published : . 00:04 est , 14 july 2012 . | . updated : . 01:30 est , 16 july 2012 . sylvester stallone was said to have almost collapsed with grief on learning of the death of his son yesterday . the body of sage stallone , 36 , was found by his housekeeper at his los angeles home . prescription drugs were reportedly found nearby but police said it was too early to say whether they were the cause of his death . tragedy : sylvester stallone 's son sage was found dead this afternoon in his los angeles apartment after a suspected drug overdose . he was 36 , pictured here in 2006 in hollywood . a source close to stallone said : . ` when he heard the news , sly was shocked , short of breath and almost . collapsed . he just went quiet before sobbing uncontrollably . he is a . wreck at the moment . ' sage 's aunt melanie hart told the mail on sunday : ` people are speculating that it was suicide but we really have no idea . ' there were unconfirmed reports that . sage , whose mother is stallone 's first wife sasha czack , had been dead . for four days before his body was found . a source told radaronline that medics . arrived on the scene at 3.05 pm this afternoon and spent around 25 . minutes trying to revive sage before his death was pronounced at the . scene . his body was taken straight to the coroner 's office - and the insider claims no suicide note was found . ' i suspect he had been dead for quite a while when he was discovered , ' the source told the website . ` usually medics will be at the scene . for around 45 minutes but they were out of there within half an . hour . ` there were a number of prescription bottles found at the scene but it did not appear to be suicide and no note was found . ' pronounced dead at the scene : the coroner 's van was spotted at sage 's home in los angeles along with news crews . unresponsive : the filmmaker 's body was taken straight to the coroner 's office - and not to the hospital . a 9-1-1 call was placed shortly . before 3pm and the caller said sage was n't breathing and indicated it . could be a drug overdose , radar reports . an autopsy is scheduled to take place in the next 48 hours . shortly after news of sage 's death , a . spokesman released a statement on behalf of his action hero father , 66 , . who was at the comic con film convention in san diego yesterday . ` sylvester stallone is devastated and . grief-stricken over the sudden loss of his son , ' the actor 's . spokesperson michelle bega said in the statement . ` his compassion and thoughts are with sage 's mother , sasha . ' sudden death : the body of the 36-year-old sage stallone was brought out to the coroner 's van in los angeles . devastated : sly 's agent released a statement saying he was ` grief-striken ' at the loss of his son . mystery : an autopsy is scheduled to take place in the next 48 hours to determine the cause of death . earlier : sly was at comic com yesterday evening . red carpet smiles : sage pictured in 1996 at the hollywood premiere of daylight with his father sylvester and his now-wife jennifer flavin . double act : sage appeared alongside his father in the 1990 movie rocky v , playing the role of rocky 's son robert balboa . ` he was a very talented and wonderful young man . his loss will be felt forever . ' police said they found the younger . stallone in the home while responding to a ` welfare check ' , however . sage 's lawyer george braunstein said he was found by a housekeeper . friends and acquaintances had become concerned because they had n't heard from sage in the past day . braunstein said the death came as a shock , telling the new york post this afternoon : ` he was in good spirits , and working . on all kinds of projects . ` he was planning on getting married . i am just devastated . he was an extremely wonderful , loving guy . this is a tragedy . ' before the heartbreak : stallone was pictured yesterday with arnold schwarzenegger at the comic con film convention in san diego . sage moonblood stallone was the . oldest of sylvester stallone 's children and co-starred with his father . in two films . he was the first of two sons stallone had with first wife . sasha czack . he made his acting debut in 1990 's . rocky v - he played his stallone 's onscreen son - and also appeared with . his father in 1996 's daylight . hand in hand : sylvester pictured back in 1982 with his first wife sasha czack , sage 's mother . also in 1996 , sage stallone and . veteran film editor bob murawski co-founded grindhouse releasing , a . company dedicated to preserving and promoting the b-movies and . exploitation films of the 1970s and 80s . he also directed the 2006 short vic , which screened at the palm springs film festival . braunstein said sage had frequent requests to work on films . ` he was a full of life filmmaker with . his whole future ahead of him , ' he said . ` he was just very up and . enthusiastic and positive . ' i think it was probably some sort of accident , ' he said of the death . braunstein added that sage stallone greatly admired his father but was working hard to make his own name in the film industry . ` he was very proud of his father and proud to be his father 's son , ' he said . stallone 's split from sage 's mother czack in 1985 after 11 years together . they also have a another son . seargeoh , 32 , who is autistic . stallone went on to wed model and actress brigitte . nielsen in beverly hills but they split just two . years later in a very public divorce . he married third wife , jennifer . flavin , in 1997 after an eight-year on-again , off-again relationship and . they have three daughters : sophia rose , 15 , sistine rose , 14 , and . scarlet rose , 10 . sage , who was raised by his mother following his parents ' divorce , felt distant from his father growing up , a theme which hit home as they were filming rocky v together . big boots to fill : sage said he always worried about living up to his father 's success , seen here together again in rocky v . ` when i was screaming , `` you never spent time with me ! you never spent time with my mother ! '' - that was true , ' he told people magazine in 1996 . ` i was looking into my father 's face and really saying that . ' but it proved a turning point for the father and son , who went on to form a close bond and they acted again together in the 1996 film daylight . ` between takes , sly and sage would roll around in the dirt like two puppies , ' the director rob cohen observed at the time . sage certainly felt the pressure of growing up with such a famous father and would worry that he would never match his success . ` i tell him , `` as long as you give it your best , that 's all that matters , '' his mother sasha said in that same year . sage went on to pursue a career behind the camera and shunned the wild hollywood party scene , preferring to watch horror zombie films instead . ` people call me a hermit , ' he said while promoting the film . ` but i 'm happy . ' star ` devastated and grief-stricken ' over sudden loss of his eldest child . sage played the 66-year-old 's onscreen son in rocky v . an autopsy is scheduled to take place in the next 48 hours after filmmaker was found next to prescription drugs ." 4 | # s3="1 -lrb- cnn -rrb- -- to resolve america 's ongoing , bruising battle over the debt and deficit , house republican paul ryan and senate democrat patty murray announced a deal on december 10 to halt spending cuts -- mostly in defense -- and lock in a two-year budget agreement to avoid another government shutdown on january 15 . but in eagerly seeking agreement with the republicans who shut the government down in october , democrats risk hurting the economy 's fragile recovery by accepting too much budget austerity embedded in the newly adopted budget . president obama and the democrats won big over the republicans in october 's budget fight . instead of pressing their advantage , democrats took tax increases for the rich off the table , agreed to cut federal pensions and did not get unemployment benefits extended . the democrats basically threw away their political gains . the deal repeals less than half of the sequestration cuts planned for 2014 . if obama and congress continue their shortsighted obsession with austerity and budget cuts , they ignore the big economic lesson from the past several years : austerity hurts prosperity . the congressional budget office estimated that repealing the entire 2013-2014 spending cuts would increase gross domestic product by $ 113 billion and create 900,000 additional jobs next year . the october 2013 government shutdown took another $ 24 billion out of the gdp . unemployment remains stuck around 7 % . though the deal reduces a bit of fiscal uncertainty , it hardly affected the u.s. growth forecasts for big banks , despite bank economists citing some pessimism because of `` austerity shock '' from spending cuts and `` uncertainty shock '' from washington 's continued fiscal battles . republicans bargain for more cuts and fewer taxes , but cutting military spending makes them nervous , so they attack social security and medicare . the wall street-affiliated democratic group third way is helping . it launched an attack on sen. elizabeth warren , d-massachusetts , and others who rightly refuse to cut social security as part of a long-term budget solution . we all know that republicans like to defend the wealthy and slash government . but why does austerity , especially cuts to old-age programs , have credibility with obama and other democrats ? advocates of `` grand bargains , '' cutting programs to balance the budget , wrongly presume the budget is a fixed quantity . they imagine it like a fixed pie . programs for the young , like education , must be paid for by cutting other programs , like social security . but their belief that a dollar taken from the old will be spent on the young is not only divisive , mean and fierce -- it is wrong . in his december 6 speech on inequality , obama talked about the sky-high and stubborn child poverty rate : more than 24 % . but cutting social security and medicare will only destabilize the economy and increase the elderly poverty rate . in many countries , programs for elderly people are not traded off against help for the young . when support for old-age programs increases , so does spending on children . advanced democratic countries ' spending on the elderly is positively correlated with education spending . one analysis shows that a 10 % increase in spending on education is correlated with a 7.3 % increase in spending on pensions . the congressional budget office warns that long-term deficits can hurt the economy . want to reduce the debt and deficit ? tax the wealthy , which wo n't hurt the economy . economists emmanuel saez and thomas piketty estimate that raising the tax rate for the top 1 % as high as 80 % would generate far more revenue . sen. tom harkin , d-iowa , and rep. peter defazio , d-oregon , propose a transactions tax -- a three-penny charge on every $ 100 traded in the stock market , which the congressional budget office estimates would raise $ 352 billion over 10 years . this small tax would also reduce stock churning by speculators , creating a nice secondary benefit . want to find even more savings ? sen. harry reid , d-nevada , wisely put tax loopholes that cost the treasury almost a trillion dollars per year on the table . for example , reid called for eliminating the small , but noxious , tax break for buying yachts and the $ 17 billion break that comes from taxing private equity , real estate and hedge fund profits as `` carried interest '' rather than at the ordinary income rate of 39.6 % instead of the capital gains rate of 20 % . there is one piece of good news : the deficit is coming down , from 9.2 % when obama took office to 4.1 % of gdp in 2017 . faster economic growth would shrink the deficit more rapidly . in contrast , further spending cuts will slow the economy and deficit reduction along with it . so , this is no time for obama to accept a lower budget path , or to consider cuts in social security and medicare . the small budget deficit reductions in this deal -- less than one-half of 1 % of the total debt or $ 23 billion -- would almost pay for extended unemployment benefits for one year at $ 25 billion . democrats are flinching under continued pressure from republicans playing out their long game as they ready for another bitter fight when the debt limit is reached next spring . but the president and the democrats have a winning economic and political strategy : raise revenues and keep social security and medicare strong . do n't throw october 's hard-won victory away ; it wo n't help the elderly , it wo n't help children , and it wo n't help the economy . the opinions expressed in this commentary are solely those of rick mcgahey and teresa ghilarducci . democrats and republicans reach a deal on the budget . rick mcgahey , teresa ghilarducci : austerity in budget will hurt our economy . they say president obama should not make cuts to programs like social security . mcgahey , ghilarducci : taxing the wealthy would generate revenue and cut deficit ." 5 | # s4="123 new york -lrb- cnn -rrb- -- preliminary tests indicate ricin was found in letters sent this past weekend to new york mayor michael bloomberg , new york deputy police commissioner paul browne said wednesday . browne said the letters to bloomberg contained a threat to the mayor and mentioned the debate on gun laws . `` the letter obviously , referred to our anti-gun efforts but there 's 12,000 people -lrb- who -rrb- are going to get killed this year with guns and 19,000 that are going to commit suicide with guns , and we 're not going to walk away from those efforts , '' bloomberg said . one letter addressed to the mayor 's office was opened at the city government 's mail facility , browne said . the suspicious material found in the two letters was a `` pink-orange oily substance , '' he said , adding that it was the second of two tests that showed what appeared to be trace amounts of ricin . what is ricin ? the substance is being tested at the national bioforensic analysis center in maryland , with conclusive results expected by friday . some of the emergency services workers who touched the letter friday were examined after they showed minor intestinal symptoms of ricin exposure on saturday , browne said . the symptoms have since subsided . civilian workers showed no symptoms , browne said in a statement . `` we take a lot of security measures as you know , '' bloomberg said . `` the men and women that open the mail for example ... they are well trained . '' the second letter to the mayor was opened by mark glaze , director of mayors against illegal guns -- founded and co-chaired by bloomberg -- in washington on sunday . browne 's statement appeared to indicate glaze showed no symptoms . a spokeswoman for the organization declined to comment wednesday . opinion : ricin - almost never deadly . both letters were postmarked in shreveport , louisiana , on may 20 , the american postal workers union said on its website . bloomberg is an outspoken critic of current gun laws . in march , he said nationwide background checks on all gun sales would save lives . `` we know that 's true , because in states that already require background checks on private sales , the rate of women murdered by an intimate partner armed with a gun is 38 % lower than in states that do n't have such background checks , '' he said . fbi spokesman jim margolin told cnn the agency is working to determine from where the letters were sent and who sent them . if inhaled , injected or ingested , less than a pinpoint of ricin can kill a person within 36 to 48 hours because of the failure of the respiratory and circulatory systems . there is no known antidote for the toxin , which is derived from castor beans . it has been included in letters in the past few months sent to president barack obama and other officials . in april , letters were sent to obama ; sen. roger wicker , r-mississippi ; and sadie holland , a judge in lee county , mississippi . james everett dutschke of tupelo , mississippi , has been charged with possession and use of a biological agent in connection with the case . last week , fbi agents arrested matthew ryan buquet after a grand jury charged him with mailing threatening communication to a senior judge in the u.s. district court for the eastern district of washington state . the fbi said in a statement that tests -- conducted by that agency and the spokane regional health district -- showed that a suspicious substance found with the letter was `` active ricin toxin . '' there are no indications the cases are connected . man , 37 , arrested in probe of washington state ricin-laced letters . cnn 's deborah feyerick , jason kessler , lawrence crook iii , carol cratty and mary snow contributed to this report . new : suspicious substance was oily , new york police official says . new : postal union says letters were postmarked in shreveport , louisiana . letters were addressed to bloomberg , one went to an organization he founded . ricin is a toxin that can kill in a matter of days . " 6 | # s5="975 london , england -lrb- cnn -rrb- -- human rights and freedom of the press in china , the detention of terrorist suspects by the united states and russia 's treatment of political dissent are the focus of scrutiny in amnesty international 's annual report , released wednesday , which looks at the state of human rights around the world . amnesty international protestors outside the us supreme court in january dressed as guantanamo bay detainees . the 398-page report comes 60 years after the united nations adopted the universal declaration of human rights , and amnesty says governments still need to act on their promises . `` the biggest threat to the future of human rights is the absence of a shared vision and collective leadership , '' the organization said in a statement . irene khan , amnesty 's secretary-general , said that in particular , `` the human-rights flash points in darfur , zimbabwe , gaza , iraq and myanmar demand immediate attention . '' the report , the group said , `` reveals a world riven by inequality , scarred by discrimination and distorted by political repression . '' according to its count , people are tortured or subject to other ill treatment in at least 81 countries , face unfair trials in at least 54 countries and are not allowed to express themselves freely in at least 77 countries . of the 150 countries and regions listed in the report , amnesty paid particular attention to china , the host of this summer 's olympic games . the group said growing numbers of human rights activists were imprisoned or harassed in china in 2007 , with ethnic and religious minorities -- including tibetans , falun gong practitioners and christians -- repressed or persecuted . death penalty statistics in china are difficult to assess , amnesty said , but based on public reports , the group estimated that at least 470 people were executed in 2007 . amnesty also noted the repression of free speech in china and said censorship of the internet and other media intensified last year . `` the chinese authorities maintained efforts to tightly control the flow of information , '' the report said . `` they decided what topics and news stories could be published , and media outlets were sometimes required to respond within minutes to government directives . the authorities continued to block web sites and to filter internet content based on specified words and topics . '' around 30 journalists and at least 50 others are known to be in prison for posting their views online , amnesty said . amnesty also criticized the death penalty in the united states , where 42 people were executed last year . it noted new jersey 's decision in december to abolish the death penalty made it the first u.s. state in more than 40 years to do away with executions . as it has in previous annual reports , amnesty criticized the detention of hundreds of foreign nationals at the u.s. naval base at guantanamo bay , cuba . `` the usa must close guantanamo detention camp and secret detention centers , prosecute the detainees under fair trial standards or release them , and unequivocally reject the use of torture and ill-treatment , '' amnesty said . the group noted that guantanamo detainees are held indefinitely , most of them without charge and without recourse to u.s. courts . most detainees there are held in isolation in maximum-security facilities , heightening concerns for their physical and mental health , amnesty said . in fact , more is written on the united states than any other country listed in the report . asked about that at a press conference tuesday , khan said , `` we certainly devote a lot of time to sudan , to china , to zimbabwe and other countries . but we look to the u.s. to provide leadership around the world . governments around the world look to the united states as a role model for their own behavior . '' in a lengthy section on iraq , amnesty noted that thousands of civilians , including children , were killed or injured in ongoing sectarian violence during 2007 . `` all sides involved in the fighting committed gross human rights violations , some of which amounted to war crimes and crimes against humanity , '' the report said . abductions , torture and murder , with bodies left in the street , occur daily , and the violence has caused 2 million iraqis to flee to syria , jordan and elsewhere , amnesty said . u.s. forces held some 25,000 detainees `` without charge or trial , '' the group said , and 33 people were executed , `` some after grossly unfair trials . '' in afghanistan , conflict and insecurity aggravated by drought and floods contributed to `` large-scale displacement '' of people throughout the year . `` at least 6,500 people were estimated to have been killed in the context of the conflict , '' the report said . `` violations of international humanitarian and human rights law were committed with impunity by all parties , including afghan and international security forces and insurgent groups . '' russia must show greater tolerance for political dissent , amnesty said . `` the russian authorities were increasingly intolerant of dissent or criticism , branding it ` unpatriotic , ' '' the report said . `` a crackdown on civil and political rights was evident throughout the year and in particular during the run-up to the state duma -lsb- parliament -rsb- elections in december . '' the european court of human rights ruled that russia was responsible for enforced disappearances , torture and extrajudicial executions in 15 judgments relating to the recent conflict in chechnya , amnesty said . there were fewer reported disappearances in the chechen republic in 2007 than in previous years , amnesty said , but continued human rights violations made people reluctant to report abuses . the report also criticized human rights conditions in iran , gaza and myanmar . human rights conditions in zimbabwe continued to decline in 2007 , the report said , `` with an increase in organized violence and torture and restrictions on the rights to freedom of association , assembly and expression . '' members of the main opposition party , the mdc , along with other human rights defenders , were arrested , and many were tortured while in custody , amnesty said . some 4 million people required food aid because of the nation 's deteriorating economy , and victims of forced evictions in 2005 continued to live in `` deplorable conditions '' while president robert mugabe 's government failed to remedy their situation . `` human rights problems are not isolated tragedies , but are like viruses that can infect and spread rapidly , endangering all of us , '' khan said . `` governments today must show the same degree of vision , courage and commitment that led the united nations to adopt the universal declaration of human rights 60 years ago . ''" 7 | # s6="6737 by . eleanor crooks , press association . maria sharapova reached her third successive french open final by battling past eugenie bouchard . sharapova maintained her remarkable record in three-set matches by winning an 18th consecutive deciding set on clay in a 4-6 , 7-5 , 6-2 victory . the russian won her first title at roland garros in 2012 before losing to serena williams 12 months ago . on form : maria sharapova fought back from a set down to overcome a stiff challenge from eugenie bouchard . rising star : eugenie bouchard , 20 , was playing in her second consecutive grand slam semi-final . bouchard , . who was playing in her second straight grand slam semi-final , had lost . comfortably to sharapova in the second round last year and demonstrated . again the huge strides she has made . she possesses the same steely-eyed determination as sharapova and her mental strength is remarkable for a 20-year-old . the . canadian said after beating angelique kerber in the fourth round that . she did not have a best friend in tennis , adding : ' i do n't think the . tennis tour is the place to have friends . for me , it 's all competition . ' it . was a sentence that could well have been written by sharapova so it was . no surprise that this was not a match for the faint-hearted . bouchard . has improved significantly since making the last four at the australian . open in january , hitting the ball a lot more aggressively , and it was . she who struck first with a break for 2-1 . pumped up : sharapova celebrates as she comes back from a set down to seal her place in the final . sharapova . fought back to level at 4-4 but bouchard forged ahead again immediately . and held to take the set , saving a break point with the gutsiest of . backhand winners onto the line . sharapova . had recovered from a set down in both her last two matches against sam . stosur and garbine muguruza and set about doing the same , moving into a . 5-2 lead . but . the russian 's serve , never something to be relied upon , was having an . off day and , serving for the set , she twice double-faulted on set point . rising star : bouchard gets down low to play a forehand as she takes the first set over sharapova . there was also a second-serve ace on a break point for good measure but on her third chance bouchard pounced . the . 20-year-old was unable to resist when sharapova broke again at 5-5 , . though , and this time the seventh seed clinched the set when bouchard . netted a forehand . bouchard . had never lost a grand slam match in which she had won the first set . before but the sense was sharapova 's prowess in deciding sets would be . the crucial factor . scene of success : sharapova will play in her third consecutive french open final on saturday . the russian moved ahead at 3-1 , and for the first time bouchard was making bad mistakes on the big points . she . held for 4-2 , saving two break points , but in the next game missed a . routine forehand and a volley as sharapova moved to within one game of . victory . bouchard . fought on , saving four match points in terrific style , but there was . nothing she could do when a sharapova forehand fizzed off the baseline . after two hours and 27 minutes . french kiss : sharapova acknowledges the roland garros crowd after semi-final victory . sharapova fought back from a set down to beat canadian bouchard . the 2012 champion won 4-6 , 7-5 , 6-2 in two hours and 27 minutes . sharapova will play simona halep in saturday 's final at roland garros . " 8 | # s7="0 editor 's note : in our behind the scenes series , cnn correspondents share their experiences in covering news and analyze the stories behind the events . here , soledad o'brien takes users inside a jail where many of the inmates are mentally ill . an inmate housed on the `` forgotten floor , '' where many mentally ill inmates are housed in miami before trial . miami , florida -lrb- cnn -rrb- -- the ninth floor of the miami-dade pretrial detention facility is dubbed the `` forgotten floor . '' here , inmates with the most severe mental illnesses are incarcerated until they 're ready to appear in court . most often , they face drug charges or charges of assaulting an officer -- charges that judge steven leifman says are usually `` avoidable felonies . '' he says the arrests often result from confrontations with police . mentally ill people often wo n't do what they 're told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid , delusional , and less likely to follow directions , according to leifman . so , they end up on the ninth floor severely mentally disturbed , but not getting any real help because they 're in jail . we toured the jail with leifman . he is well known in miami as an advocate for justice and the mentally ill . even though we were not exactly welcomed with open arms by the guards , we were given permission to shoot videotape and tour the floor . go inside the ` forgotten floor ' '' at first , it 's hard to determine where the people are . the prisoners are wearing sleeveless robes . imagine cutting holes for arms and feet in a heavy wool sleeping bag -- that 's kind of what they look like . they 're designed to keep the mentally ill patients from injuring themselves . that 's also why they have no shoes , laces or mattresses . leifman says about one-third of all people in miami-dade county jails are mentally ill . so , he says , the sheer volume is overwhelming the system , and the result is what we see on the ninth floor . of course , it is a jail , so it 's not supposed to be warm and comforting , but the lights glare , the cells are tiny and it 's loud . we see two , sometimes three men -- sometimes in the robes , sometimes naked , lying or sitting in their cells . `` i am the son of the president . you need to get me out of here ! '' one man shouts at me . he is absolutely serious , convinced that help is on the way -- if only he could reach the white house . leifman tells me that these prisoner-patients will often circulate through the system , occasionally stabilizing in a mental hospital , only to return to jail to face their charges . it 's brutally unjust , in his mind , and he has become a strong advocate for changing things in miami . over a meal later , we talk about how things got this way for mental patients . leifman says 200 years ago people were considered `` lunatics '' and they were locked up in jails even if they had no charges against them . they were just considered unfit to be in society . over the years , he says , there was some public outcry , and the mentally ill were moved out of jails and into hospitals . but leifman says many of these mental hospitals were so horrible they were shut down . where did the patients go ? nowhere . the streets . they became , in many cases , the homeless , he says . they never got treatment . leifman says in 1955 there were more than half a million people in state mental hospitals , and today that number has been reduced 90 percent , and 40,000 to 50,000 people are in mental hospitals . the judge says he 's working to change this . starting in 2008 , many inmates who would otherwise have been brought to the `` forgotten floor '' will instead be sent to a new mental health facility -- the first step on a journey toward long-term treatment , not just punishment . leifman says it 's not the complete answer , but it 's a start . leifman says the best part is that it 's a win-win solution . the patients win , the families are relieved , and the state saves money by simply not cycling these prisoners through again and again . and , for leifman , justice is served . e-mail to a friend ." 9 | # s8="21 lagos , nigeria -lrb- reuters -rrb- -- nigeria 's television survival show has been" 10 | # pat1="by .*? published :.*?\. \| \..*? [0-9]+ \. " 11 | # pat2="-lrb- cnn -rrb- -- " 12 | # pat3="\t(.*?-lrb- .*? -rrb- -- )" 13 | # pat4="``|''" 14 | # pat5="by \. .*? \. " 15 | # res=re.sub(pat1,"",s4,1) 16 | # res=re.sub(pat2,"",res,1) 17 | # res=re.sub(pat3,"",res,1) 18 | # res=re.sub(pat4,"",res) 19 | # res=re.sub(pat5,"",res,1) 20 | # print(res) 21 | # import torch 22 | # from torch import nn 23 | 24 | # transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12) 25 | # src = torch.rand((10, 32, 512)) 26 | # tgt = torch.rand((20, 32, 512)) 27 | # out = transformer_model(src, tgt) 28 | # print(out.shape) 29 | 30 | # l=[1,2,3,4] 31 | 32 | # print([i if i!=4 else 5 for i in l]) 33 | 34 | 35 | # import utils 36 | 37 | # a=["hello hello","hi world"] 38 | # b=["hello hello","world"] 39 | 40 | # utils.GetRouge(a,b) 41 | 42 | # import torch 43 | # from transformers import T5Tokenizer, T5Model 44 | 45 | # tokenizer=T5Tokenizer.from_pretrained('t5-small') 46 | # text = ['Hello world!', 'Hello python!'] 47 | # inputs = tokenizer(text, return_tensors='pt', padding=True) 48 | # print(inputs) 49 | 50 | from transformers import T5Tokenizer, T5ForConditionalGeneration,AdamW 51 | import torch 52 | from settings import * 53 | 54 | tokenizer = T5Tokenizer.from_pretrained(PARAM_DIR+"t5-small") 55 | model = T5ForConditionalGeneration.from_pretrained(PARAM_DIR+"t5-small") 56 | 57 | # the following 2 hyperparameters are task-specific 58 | max_source_length = 512 59 | max_target_length = 128 60 | 61 | # Suppose we have the following 2 training examples: 62 | input_sequence_1 = "Welcome to NYC" 63 | output_sequence_1 = "Bienvenue à NYC" 64 | 65 | input_sequence_2 = "HuggingFace is a company as e dd" 66 | output_sequence_2 = "HuggingFace est une entreprise" 67 | 68 | # encode the inputs 69 | task_prefix = "translate English to French: " 70 | input_sequences = [input_sequence_1, input_sequence_2] 71 | encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 72 | padding='longest', 73 | max_length=max_source_length, 74 | truncation=True, 75 | return_tensors="pt") 76 | input_ids, attention_mask = encoding.input_ids, encoding.attention_mask 77 | 78 | # encode the targets 79 | target_encoding = tokenizer([output_sequence_1, output_sequence_2], 80 | padding='longest', 81 | max_length=max_target_length, 82 | truncation=True) 83 | labels = target_encoding.input_ids 84 | 85 | # replace padding token id's of the labels by -100 86 | labels = [ 87 | [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels 88 | ] 89 | labels = torch.tensor(labels) 90 | loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss 91 | loss.backward() 92 | opt=AdamW(model.parameters()) 93 | opt.step() 94 | 95 | print(loss) -------------------------------------------------------------------------------- /source/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | from torch import nn 4 | from torch import optim 5 | from torch.nn.modules.module import Module 6 | from tqdm.std import tqdm 7 | from settings import * 8 | import json 9 | import pickle as pkl 10 | import re 11 | from torch.utils.data.dataset import Dataset 12 | from torch.utils.data.dataloader import DataLoader 13 | import torch 14 | from rouge import Rouge 15 | import models 16 | 17 | ############################### Just run for one time! ############################### 18 | def Preprocess(train_path=DATA_DIR+"train_dataset.csv",test_path=DATA_DIR+"test_dataset.csv"): 19 | ''' 20 | 清理数据、划分验证集后重新保存至新文件 21 | ''' 22 | 23 | # 数据清洗 24 | def _cleanData(data): 25 | print("数据清洗开始=========================================") 26 | 27 | clean_data=[] 28 | for i,d in tqdm(enumerate(data)): 29 | res=d 30 | for pat in PATTERNS_ONCE: 31 | #################################之后修改 32 | if("\t" in pat): 33 | res=re.sub(pat,"\t",res,1) 34 | else: 35 | res=re.sub(pat,"",res,1) 36 | #################################### 37 | for pat in PATTERNS_ANY: 38 | res=re.sub(pat,"",res) 39 | 40 | clean_data.append(res) 41 | 42 | print("数据清洗完毕=========================================") 43 | return clean_data 44 | 45 | # 将处理后的数据保存为json文件 46 | def _save2Json(data,mode): 47 | 48 | 49 | if mode==2: 50 | 51 | for i in range(len(data)): 52 | source=data[i].split('\t')[1].strip('\n') 53 | if source!='': 54 | dict_data={"text":source,"summary":'no summary'}#测试集没有参考摘要 55 | 56 | with open(new_test_path+str(i)+'.json','w+',encoding='utf-8') as f: 57 | f.write(json.dumps(dict_data,ensure_ascii=False)) 58 | 59 | 60 | else: 61 | 62 | for i in range(len(data)): 63 | 64 | if len(data[i].split('\t'))==3: 65 | source_seg=data[i].split("\t")[1] 66 | traget_seg=data[i].split("\t")[2].strip('\n') 67 | 68 | 69 | if source_seg and traget_seg !='': 70 | dict_data={"text":source_seg,"summary":traget_seg} 71 | path=new_train_path 72 | if mode==1: 73 | path= new_val_path 74 | with open(path+str(i)+'.json','w+',encoding='utf-8') as f: 75 | f.write(json.dumps(dict_data,ensure_ascii=False)) 76 | 77 | 78 | 79 | with open(train_path,'r',encoding='utf-8') as f: 80 | train_data_all=f.readlines() 81 | 82 | with open(test_path,'r',encoding='utf-8') as f: 83 | test_data=f.readlines() 84 | 85 | # 数据清洗 86 | train_data_all=_cleanData(train_data_all) 87 | test_data=_cleanData(test_data) 88 | 89 | # with open("./1.csv",'w',encoding='utf-8') as f: 90 | # f.writelines(train_data_all) 91 | # with open("./2.csv",'w',encoding='utf-8') as f: 92 | # f.writelines(test_data) 93 | # random.shuffle(train_data_all) 94 | 95 | # 设置新文件路径 96 | new_train_path=os.path.join(DATA_DIR,"new_train/") 97 | new_val_path=os.path.join(DATA_DIR,"new_val/") 98 | new_test_path=os.path.join(DATA_DIR,"new_test/") 99 | 100 | if not os.path.exists(new_train_path): 101 | os.makedirs(new_train_path) 102 | 103 | if not os.path.exists(new_val_path): 104 | os.makedirs(new_val_path) 105 | 106 | if not os.path.exists(new_test_path): 107 | os.makedirs(new_test_path) 108 | 109 | train_data=train_data_all[:8000] #把训练集重新划分为训练子集和验证子集,保证验证集上loss最小的模型,预测测试集 110 | val_data=train_data_all[8000:] 111 | 112 | _save2Json(train_data,TRAIN_FALG) 113 | _save2Json(val_data,VAL_FALG) 114 | _save2Json(test_data,TEST_FALG) 115 | 116 | 117 | def CountFiles(path): 118 | ''' 119 | 计算目标文件夹json文件数目 120 | ''' 121 | matcher = re.compile(r'[0-9]+\.json') 122 | match = lambda name: bool(matcher.match(name)) 123 | names = os.listdir(path) 124 | n_data = len(list(filter(match, names))) 125 | return n_data 126 | 127 | def BuildVocabCounter(data_dir=DATA_DIR): 128 | ''' 129 | 统计所有词汇,建立词频表 130 | ''' 131 | from collections import Counter 132 | 133 | def GetTokens(path): 134 | n_data=CountFiles(path) 135 | summary_words=[] 136 | source_words=[] 137 | for i in range(n_data): 138 | js_data=json.load(open(os.path.join(path,f"{i}.json"),encoding="utf-8")) 139 | summary=''.join(js_data['summary']).strip() 140 | summary_words.extend(summary.strip().split(' ')) 141 | 142 | source=''.join(js_data['text']).strip() 143 | source_words.extend(source.strip().split(' ')) 144 | 145 | return source_words+summary_words 146 | 147 | # print(_count_data(data_dir+"new_train")) 148 | vocab_counter=Counter() 149 | vocab_counter.update(t for t in GetTokens(data_dir+"new_train") if t !="") 150 | vocab_counter.update(t for t in GetTokens(data_dir+"new_val") if t !="") 151 | vocab_counter.update(t for t in GetTokens(data_dir+"new_test") if t !="") 152 | # print(vocab_counter.values()) 153 | 154 | with open(VOCAB_PATH,"wb") as f: 155 | pkl.dump(vocab_counter,f) 156 | 157 | def MakeVocab(vocab_size=VOCAB_SIZE): 158 | ''' 159 | 建立词典,通过vocab_size设置字典大小,将常用词设置到字典即可,其他生僻词汇用''表示 160 | ''' 161 | with open(VOCAB_PATH,"rb") as f: 162 | wc=pkl.load(f) 163 | word2idx, idx2word = {}, {} 164 | word2idx[PAD_WORD] = 0 165 | word2idx[UNK_WORD] = 1 166 | word2idx[BOS_WORD] = 2 167 | word2idx[EOS_WORD] = 3 168 | for i, (w, _) in enumerate(wc.most_common(vocab_size), 4): 169 | word2idx[w] = i 170 | for w, i in word2idx.items(): 171 | idx2word[i] = w 172 | 173 | with open(WORD_IDX_PATH,"wb") as f: 174 | pkl.dump(word2idx,f) 175 | with open(IDX_WORD_PATH,"wb") as f: 176 | pkl.dump(idx2word,f) 177 | 178 | def GetNumOfLongestSeq(): 179 | ''' 180 | 找到最长的seq长度,用于padding 181 | ''' 182 | 183 | def _findInFolders(path,length): 184 | max_len=0 185 | for i in range(length): 186 | js_data=json.load(open(os.path.join(path,f"{i}.json"),encoding="utf-8")) 187 | l_data=js_data["summary"].split(" ") 188 | l=len(l_data) 189 | if(max_lenthreshold): 254 | if(EOS_NUM in line): 255 | line[threshold-1]=EOS_NUM 256 | return line[:threshold],threshold 257 | return line + [PAD_NUM] * (threshold - len(line)),p_len 258 | 259 | def ReadJson2List(dir,i,label=False): 260 | '''读取单个json文件(一个样本),并按空格分割转换成列表''' 261 | 262 | js_data=json.load(open(os.path.join(dir,f"{i}.json"),encoding="utf-8")) 263 | if label: 264 | return js_data["summary"].split(" ") 265 | return js_data["text"].split(" ") 266 | 267 | 268 | def GetRouge(pred,label): 269 | '''获取ROUGR-L值''' 270 | rouge=Rouge() 271 | rouge_score = rouge.get_scores(pred, label) 272 | rouge_L_f1 = 0 273 | rouge_L_p = 0 274 | rouge_L_r = 0 275 | for d in rouge_score: 276 | rouge_L_f1 += d["rouge-l"]["f"] 277 | rouge_L_p += d["rouge-l"]["p"] 278 | rouge_L_r += d["rouge-l"]["r"] 279 | 280 | return (rouge_L_f1 / len(rouge_score)) 281 | 282 | print("rouge_f1:%.2f" % (rouge_L_f1 / len(rouge_score))) 283 | print("rouge_p:%.2f" % (rouge_L_p / len(rouge_score))) 284 | print("rouge_r:%.2f" % (rouge_L_r / len(rouge_score))) 285 | 286 | 287 | # 将数据转换为成batch的Tensor,win平台有bug,多进程不能写在函数里 288 | with open(WORD_IDX_PATH,"rb") as f: 289 | w2i=pkl.load(f) 290 | train_iter=DataLoader(TextDataset(TRAIN_FALG,w2i),shuffle=True,batch_size=BATCH_SZIE,num_workers=8) 291 | val_iter=DataLoader(TextDataset(VAL_FALG,w2i),shuffle=False,batch_size=BATCH_SZIE,num_workers=4) 292 | test_iter=DataLoader(TextDataset(TEST_FALG,w2i),shuffle=False,batch_size=1) 293 | 294 | def Train(net:Module,lr=0.01): 295 | """训练序列到序列模型。""" 296 | from tqdm import tqdm 297 | 298 | def xavier_init_weights(m): 299 | if type(m) == nn.Linear: 300 | nn.init.xavier_uniform_(m.weight) 301 | if type(m) == nn.GRU: 302 | for param in m._flat_weights_names: 303 | if "weight" in param: 304 | nn.init.xavier_uniform_(m._parameters[param]) 305 | 306 | net.apply(xavier_init_weights) 307 | net.to(DEVICE) 308 | optimizer = optim.Adam(net.parameters(), lr=lr) 309 | loss = models.MaskedSoftmaxCELoss() 310 | 311 | # 验证集loss降到10000以下时开始保存每轮更低的参数 312 | min_loss=10000 313 | for epoch in range(EPOCHS): 314 | train_loss=[] 315 | val_loss=[] 316 | 317 | net.train() 318 | for batch in tqdm(train_iter): 319 | (enc_X, enc_x_l), (dec_x, dec_x_l), (y,y_l) = [(x[0].to(DEVICE),x[1].to(DEVICE)) for x in batch] 320 | 321 | 322 | pred, _ = net(enc_X, dec_x, enc_x_l) 323 | l = loss(pred, y, y_l).sum() 324 | l.backward() 325 | 326 | optimizer.step() 327 | optimizer.zero_grad() 328 | 329 | with torch.no_grad(): 330 | train_loss.append(l.item()) 331 | 332 | # 释放显存 333 | torch.cuda.empty_cache() 334 | 335 | net.eval() 336 | with torch.no_grad(): 337 | for batch in tqdm(val_iter): 338 | (enc_X, enc_x_l), (dec_x, dec_x_l), (y,y_l) = [(x[0].to(DEVICE),x[1].to(DEVICE)) for x in batch] 339 | pred, _ = net(enc_X, dec_x, enc_x_l) 340 | l = loss(pred, y, y_l).sum() 341 | val_loss.append(l.item()) 342 | 343 | # 保存模型参数,秒级时间戳保证唯一性 344 | if(sum(val_loss)