├── .gitattributes
├── .gitignore
├── 1.txt
├── LICENSE
├── README.md
├── params
    ├── bart
    │   ├── config.json
    │   ├── merges.txt
    │   ├── tokenizer.json
    │   └── vocab.json
    ├── pegasus
    │   ├── config.json
    │   ├── special_tokens_map.json
    │   ├── spiece.model
    │   ├── tokenizer.json
    │   └── tokenizer_config.json
    ├── t5-base
    │   ├── config.json
    │   ├── spiece.model
    │   └── tokenizer.json
    ├── t5-large
    │   ├── config.json
    │   ├── spiece.model
    │   └── tokenizer.json
    └── t5-small
    │   ├── config.json
    │   ├── spiece.model
    │   └── tokenizer.json
├── requirements.txt
├── score.png
└── source
    ├── __pycache__
        ├── models.cpython-37.pyc
        ├── pretrained_models.cpython-37.pyc
        ├── settings.cpython-37.pyc
        ├── submodels.cpython-37.pyc
        └── utils.cpython-37.pyc
    ├── go.py
    ├── models.py
    ├── pretrained_models.py
    ├── settings.py
    ├── temp.py
    └── utils.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /dataset/
2 | *.bin


--------------------------------------------------------------------------------
/1.txt:
--------------------------------------------------------------------------------
  1 | PegasusForConditionalGeneration(
  2 |   (model): PegasusModel(
  3 |     (shared): Embedding(96103, 1024, padding_idx=0)
  4 |     (encoder): PegasusEncoder(
  5 |       (embed_tokens): Embedding(96103, 1024, padding_idx=0)
  6 |       (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
  7 |       (layers): ModuleList(
  8 |         (0): PegasusEncoderLayer(
  9 |           (self_attn): PegasusAttention(
 10 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 11 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 12 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 13 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 14 |           )
 15 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 16 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 17 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 18 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 19 |         )
 20 |         (1): PegasusEncoderLayer(
 21 |           (self_attn): PegasusAttention(
 22 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 23 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 24 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 25 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 26 |           )
 27 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 28 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 29 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 30 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 31 |         )
 32 |         (2): PegasusEncoderLayer(
 33 |           (self_attn): PegasusAttention(
 34 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 35 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 36 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 37 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 38 |           )
 39 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 40 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 41 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 42 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 43 |         )
 44 |         (3): PegasusEncoderLayer(
 45 |           (self_attn): PegasusAttention(
 46 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 47 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 48 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 49 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 50 |           )
 51 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 52 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 53 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 54 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 55 |         )
 56 |         (4): PegasusEncoderLayer(
 57 |           (self_attn): PegasusAttention(
 58 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 59 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 60 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 61 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 62 |           )
 63 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 64 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 65 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 66 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 67 |         )
 68 |         (5): PegasusEncoderLayer(
 69 |           (self_attn): PegasusAttention(
 70 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 71 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 72 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 73 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 74 |           )
 75 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 76 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 77 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 78 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 79 |         )
 80 |         (6): PegasusEncoderLayer(
 81 |           (self_attn): PegasusAttention(
 82 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 83 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 84 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 85 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 86 |           )
 87 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 88 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 89 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 90 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
 91 |         )
 92 |         (7): PegasusEncoderLayer(
 93 |           (self_attn): PegasusAttention(
 94 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
 95 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
 96 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
 97 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
 98 |           )
 99 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
100 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
101 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
102 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
103 |         )
104 |         (8): PegasusEncoderLayer(
105 |           (self_attn): PegasusAttention(
106 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
107 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
108 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
109 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
110 |           )
111 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
112 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
113 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
114 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
115 |         )
116 |         (9): PegasusEncoderLayer(
117 |           (self_attn): PegasusAttention(
118 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
119 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
120 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
121 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
122 |           )
123 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
124 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
125 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
126 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
127 |         )
128 |         (10): PegasusEncoderLayer(
129 |           (self_attn): PegasusAttention(
130 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
131 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
132 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
133 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
134 |           )
135 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
136 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
137 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
138 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
139 |         )
140 |         (11): PegasusEncoderLayer(
141 |           (self_attn): PegasusAttention(
142 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
143 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
144 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
145 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
146 |           )
147 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
148 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
149 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
150 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
151 |         )
152 |         (12): PegasusEncoderLayer(
153 |           (self_attn): PegasusAttention(
154 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
155 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
156 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
157 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
158 |           )
159 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
160 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
161 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
162 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
163 |         )
164 |         (13): PegasusEncoderLayer(
165 |           (self_attn): PegasusAttention(
166 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
167 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
168 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
169 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
170 |           )
171 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
172 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
173 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
174 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
175 |         )
176 |         (14): PegasusEncoderLayer(
177 |           (self_attn): PegasusAttention(
178 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
179 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
180 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
181 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
182 |           )
183 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
184 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
185 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
186 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
187 |         )
188 |         (15): PegasusEncoderLayer(
189 |           (self_attn): PegasusAttention(
190 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
191 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
192 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
193 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
194 |           )
195 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
196 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
197 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
198 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
199 |         )
200 |       )
201 |       (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
202 |     )
203 |     (decoder): PegasusDecoder(
204 |       (embed_tokens): Embedding(96103, 1024, padding_idx=0)
205 |       (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
206 |       (layers): ModuleList(
207 |         (0): PegasusDecoderLayer(
208 |           (self_attn): PegasusAttention(
209 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
210 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
211 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
212 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
213 |           )
214 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
215 |           (encoder_attn): PegasusAttention(
216 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
217 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
218 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
219 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
220 |           )
221 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
222 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
223 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
224 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
225 |         )
226 |         (1): PegasusDecoderLayer(
227 |           (self_attn): PegasusAttention(
228 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
229 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
230 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
231 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
232 |           )
233 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
234 |           (encoder_attn): PegasusAttention(
235 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
236 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
237 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
238 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
239 |           )
240 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
241 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
242 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
243 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
244 |         )
245 |         (2): PegasusDecoderLayer(
246 |           (self_attn): PegasusAttention(
247 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
248 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
249 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
250 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
251 |           )
252 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
253 |           (encoder_attn): PegasusAttention(
254 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
255 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
256 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
257 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
258 |           )
259 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
260 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
261 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
262 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
263 |         )
264 |         (3): PegasusDecoderLayer(
265 |           (self_attn): PegasusAttention(
266 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
267 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
268 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
269 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
270 |           )
271 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
272 |           (encoder_attn): PegasusAttention(
273 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
274 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
275 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
276 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
277 |           )
278 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
279 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
280 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
281 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
282 |         )
283 |         (4): PegasusDecoderLayer(
284 |           (self_attn): PegasusAttention(
285 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
286 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
287 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
288 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
289 |           )
290 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
291 |           (encoder_attn): PegasusAttention(
292 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
293 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
294 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
295 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
296 |           )
297 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
298 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
299 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
300 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
301 |         )
302 |         (5): PegasusDecoderLayer(
303 |           (self_attn): PegasusAttention(
304 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
305 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
306 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
307 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
308 |           )
309 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
310 |           (encoder_attn): PegasusAttention(
311 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
312 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
313 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
314 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
315 |           )
316 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
317 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
318 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
319 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
320 |         )
321 |         (6): PegasusDecoderLayer(
322 |           (self_attn): PegasusAttention(
323 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
324 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
325 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
326 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
327 |           )
328 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
329 |           (encoder_attn): PegasusAttention(
330 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
331 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
332 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
333 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
334 |           )
335 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
336 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
337 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
338 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
339 |         )
340 |         (7): PegasusDecoderLayer(
341 |           (self_attn): PegasusAttention(
342 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
343 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
344 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
345 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
346 |           )
347 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
348 |           (encoder_attn): PegasusAttention(
349 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
350 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
351 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
352 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
353 |           )
354 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
355 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
356 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
357 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
358 |         )
359 |         (8): PegasusDecoderLayer(
360 |           (self_attn): PegasusAttention(
361 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
362 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
363 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
364 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
365 |           )
366 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
367 |           (encoder_attn): PegasusAttention(
368 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
369 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
370 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
371 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
372 |           )
373 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
374 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
375 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
376 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
377 |         )
378 |         (9): PegasusDecoderLayer(
379 |           (self_attn): PegasusAttention(
380 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
381 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
382 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
383 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
384 |           )
385 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
386 |           (encoder_attn): PegasusAttention(
387 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
388 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
389 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
390 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
391 |           )
392 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
393 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
394 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
395 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
396 |         )
397 |         (10): PegasusDecoderLayer(
398 |           (self_attn): PegasusAttention(
399 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
400 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
401 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
402 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
403 |           )
404 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
405 |           (encoder_attn): PegasusAttention(
406 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
407 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
408 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
409 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
410 |           )
411 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
412 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
413 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
414 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
415 |         )
416 |         (11): PegasusDecoderLayer(
417 |           (self_attn): PegasusAttention(
418 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
419 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
420 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
421 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
422 |           )
423 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
424 |           (encoder_attn): PegasusAttention(
425 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
426 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
427 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
428 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
429 |           )
430 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
431 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
432 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
433 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
434 |         )
435 |         (12): PegasusDecoderLayer(
436 |           (self_attn): PegasusAttention(
437 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
438 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
439 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
440 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
441 |           )
442 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
443 |           (encoder_attn): PegasusAttention(
444 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
445 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
446 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
447 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
448 |           )
449 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
450 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
451 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
452 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
453 |         )
454 |         (13): PegasusDecoderLayer(
455 |           (self_attn): PegasusAttention(
456 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
457 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
458 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
459 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
460 |           )
461 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
462 |           (encoder_attn): PegasusAttention(
463 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
464 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
465 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
466 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
467 |           )
468 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
469 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
470 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
471 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
472 |         )
473 |         (14): PegasusDecoderLayer(
474 |           (self_attn): PegasusAttention(
475 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
476 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
477 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
478 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
479 |           )
480 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
481 |           (encoder_attn): PegasusAttention(
482 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
483 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
484 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
485 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
486 |           )
487 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
488 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
489 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
490 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
491 |         )
492 |         (15): PegasusDecoderLayer(
493 |           (self_attn): PegasusAttention(
494 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
495 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
496 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
497 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
498 |           )
499 |           (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
500 |           (encoder_attn): PegasusAttention(
501 |             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
502 |             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
503 |             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
504 |             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
505 |           )
506 |           (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
507 |           (fc1): Linear(in_features=1024, out_features=4096, bias=True)
508 |           (fc2): Linear(in_features=4096, out_features=1024, bias=True)
509 |           (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
510 |         )
511 |       )
512 |       (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
513 |     )
514 |   )
515 |   (lm_head): Linear(in_features=1024, out_features=96103, bias=False)
516 | )


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # TextSum
  2 | ## 0 使用说明
  3 | 1. 项目相关依赖已写入requirements.txt文件 `pip install -r requirements.txt`
  4 | 2. 项目使用了`transformers`提供的预训练模型，相关模型、配置文件、词典文件等于[此处](https://huggingface.co/models)下载
  5 | 3. 运行项目前，于/source/settings.py中修改路径设置为本地实际绝对路径
  6 | 4. 项目结构：<br/>
  7 | TextSum<br/>
  8 | --dataset（数据集、词典、词频表）<br/>
  9 | --params（预训练模型、模型参数保存文件）<br/>
 10 | --source（源代码）<br/>
 11 | ----go.py（主控函数）<br/>
 12 | ----pretrained_models.py（预训练模型）<br/>
 13 | ----models.py（自定义模型）<br/>
 14 | ----settings.py（项目设置）<br/>
 15 | ----utils.py（工具函数）<br/>
 16 | 5. `python go.py` 运行项目，可选命令行参数如下：
 17 |     ```
 18 |     -h, --help            show this help message and exit
 19 |     -p, --preprocess      预处理数据
 20 |     -b, --build           建立词频表
 21 |     -m, --make            建立词典
 22 |     -t 模型名, --train           训练
 23 |                             
 24 |     -f 模型名, --fine_tune       微调
 25 |                             
 26 |     -g 模型名 参数路径, --gen             生成submission
 27 |                             
 28 |     ```
 29 | ## 1 数据处理
 30 | 本项目数据处理共分为部分：数据清洗与划分、词典生成、张量转换
 31 | + 数据清洗与划分
 32 |   + 使用正则表达式清洗原始数据，去除文本中与任务无关的信息
 33 |   + 从原始训练集中划分出验证集
 34 |   + 将原始CSV文件转换为逐条文本的JSON文件
 35 | + 词典生成  
 36 | 统计数据集中出现过的所有单词的词频，取一定数目的高频词生成字典
 37 | + 张量转换  
 38 | 读取预处理完毕的json文件，进一步处理后将文本数据集转换为成batch的Tensor
 39 | ## 2 模型结构
 40 | 本项目使用`pytorch`实现了模型基础结构、自定义损失函数、优化器以及模型训练、验证过程；  
 41 | 本项目还使用`transformers`提供的预训练模型（bart、t5、pegasus）及函数接口实现了模型的微调与推断  
 42 | 以下给出部分模型的网络结构
 43 | 1. GRU编码器-解码器架构网络结构如下：
 44 |     ```python
 45 |     EncoderDecoder(
 46 |     (encoder): GruEncoder(
 47 |         (embdding): Embedding(10004, 512)
 48 |         (rnn): GRU(512, 256, num_layers=2)
 49 |     )
 50 |     (decoder): GruDecoder(
 51 |         (embdding): Embedding(10004, 512)
 52 |         (rnn): GRU(768, 256, num_layers=2)
 53 |         (dense): Linear(in_features=256, out_features=10004, bias=True)
 54 |     )
 55 |     )
 56 |     ```
 57 | 2. t5(small)
 58 |     ```python
 59 |     T5ForConditionalGeneration(
 60 |         (shared): Embedding(32128, 512)
 61 |         (encoder): T5Stack(
 62 |             (embed_tokens): Embedding(32128, 512)
 63 |             (block): ModuleList(
 64 |             (0): T5Block(
 65 |                 (layer): ModuleList(
 66 |                 (0): T5LayerSelfAttention(
 67 |                     (SelfAttention): T5Attention(
 68 |                     (q): Linear(in_features=512, out_features=512, bias=False)
 69 |                     (k): Linear(in_features=512, out_features=512, bias=False)
 70 |                     (v): Linear(in_features=512, out_features=512, bias=False)
 71 |                     (o): Linear(in_features=512, out_features=512, bias=False)
 72 |                     (relative_attention_bias): Embedding(32, 8)
 73 |                     )
 74 |                     (layer_norm): T5LayerNorm()
 75 |                     (dropout): Dropout(p=0.1, inplace=False)
 76 |                 )
 77 |                 (1): T5LayerFF(
 78 |                     (DenseReluDense): T5DenseReluDense(
 79 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
 80 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
 81 |                     (dropout): Dropout(p=0.1, inplace=False)
 82 |                     )
 83 |                     (layer_norm): T5LayerNorm()
 84 |                     (dropout): Dropout(p=0.1, inplace=False)
 85 |                 )
 86 |                 )
 87 |             )
 88 |             (1): T5Block(
 89 |                 (layer): ModuleList(
 90 |                 (0): T5LayerSelfAttention(
 91 |                     (SelfAttention): T5Attention(
 92 |                     (q): Linear(in_features=512, out_features=512, bias=False)
 93 |                     (k): Linear(in_features=512, out_features=512, bias=False)
 94 |                     (v): Linear(in_features=512, out_features=512, bias=False)
 95 |                     (o): Linear(in_features=512, out_features=512, bias=False)
 96 |                     )
 97 |                     (layer_norm): T5LayerNorm()
 98 |                     (dropout): Dropout(p=0.1, inplace=False)
 99 |                 )
100 |                 (1): T5LayerFF(
101 |                     (DenseReluDense): T5DenseReluDense(
102 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
103 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
104 |                     (dropout): Dropout(p=0.1, inplace=False)
105 |                     )
106 |                     (layer_norm): T5LayerNorm()
107 |                     (dropout): Dropout(p=0.1, inplace=False)
108 |                 )
109 |                 )
110 |             )
111 |             (2): T5Block(
112 |                 (layer): ModuleList(
113 |                 (0): T5LayerSelfAttention(
114 |                     (SelfAttention): T5Attention(
115 |                     (q): Linear(in_features=512, out_features=512, bias=False)
116 |                     (k): Linear(in_features=512, out_features=512, bias=False)
117 |                     (v): Linear(in_features=512, out_features=512, bias=False)
118 |                     (o): Linear(in_features=512, out_features=512, bias=False)
119 |                     )
120 |                     (layer_norm): T5LayerNorm()
121 |                     (dropout): Dropout(p=0.1, inplace=False)
122 |                 )
123 |                 (1): T5LayerFF(
124 |                     (DenseReluDense): T5DenseReluDense(
125 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
126 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
127 |                     (dropout): Dropout(p=0.1, inplace=False)
128 |                     )
129 |                     (layer_norm): T5LayerNorm()
130 |                     (dropout): Dropout(p=0.1, inplace=False)
131 |                 )
132 |                 )
133 |             )
134 |             (3): T5Block(
135 |                 (layer): ModuleList(
136 |                 (0): T5LayerSelfAttention(
137 |                     (SelfAttention): T5Attention(
138 |                     (q): Linear(in_features=512, out_features=512, bias=False)
139 |                     (k): Linear(in_features=512, out_features=512, bias=False)
140 |                     (v): Linear(in_features=512, out_features=512, bias=False)
141 |                     (o): Linear(in_features=512, out_features=512, bias=False)
142 |                     )
143 |                     (layer_norm): T5LayerNorm()
144 |                     (dropout): Dropout(p=0.1, inplace=False)
145 |                 )
146 |                 (1): T5LayerFF(
147 |                     (DenseReluDense): T5DenseReluDense(
148 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
149 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
150 |                     (dropout): Dropout(p=0.1, inplace=False)
151 |                     )
152 |                     (layer_norm): T5LayerNorm()
153 |                     (dropout): Dropout(p=0.1, inplace=False)
154 |                 )
155 |                 )
156 |             )
157 |             (4): T5Block(
158 |                 (layer): ModuleList(
159 |                 (0): T5LayerSelfAttention(
160 |                     (SelfAttention): T5Attention(
161 |                     (q): Linear(in_features=512, out_features=512, bias=False)
162 |                     (k): Linear(in_features=512, out_features=512, bias=False)
163 |                     (v): Linear(in_features=512, out_features=512, bias=False)
164 |                     (o): Linear(in_features=512, out_features=512, bias=False)
165 |                     )
166 |                     (layer_norm): T5LayerNorm()
167 |                     (dropout): Dropout(p=0.1, inplace=False)
168 |                 )
169 |                 (1): T5LayerFF(
170 |                     (DenseReluDense): T5DenseReluDense(
171 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
172 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
173 |                     (dropout): Dropout(p=0.1, inplace=False)
174 |                     )
175 |                     (layer_norm): T5LayerNorm()
176 |                     (dropout): Dropout(p=0.1, inplace=False)
177 |                 )
178 |                 )
179 |             )
180 |             (5): T5Block(
181 |                 (layer): ModuleList(
182 |                 (0): T5LayerSelfAttention(
183 |                     (SelfAttention): T5Attention(
184 |                     (q): Linear(in_features=512, out_features=512, bias=False)
185 |                     (k): Linear(in_features=512, out_features=512, bias=False)
186 |                     (v): Linear(in_features=512, out_features=512, bias=False)
187 |                     (o): Linear(in_features=512, out_features=512, bias=False)
188 |                     )
189 |                     (layer_norm): T5LayerNorm()
190 |                     (dropout): Dropout(p=0.1, inplace=False)
191 |                 )
192 |                 (1): T5LayerFF(
193 |                     (DenseReluDense): T5DenseReluDense(
194 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
195 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
196 |                     (dropout): Dropout(p=0.1, inplace=False)
197 |                     )
198 |                     (layer_norm): T5LayerNorm()
199 |                     (dropout): Dropout(p=0.1, inplace=False)
200 |                 )
201 |                 )
202 |             )
203 |             )
204 |             (final_layer_norm): T5LayerNorm()
205 |             (dropout): Dropout(p=0.1, inplace=False)
206 |         )
207 |         (decoder): T5Stack(
208 |             (embed_tokens): Embedding(32128, 512)
209 |             (block): ModuleList(
210 |             (0): T5Block(
211 |                 (layer): ModuleList(
212 |                 (0): T5LayerSelfAttention(
213 |                     (SelfAttention): T5Attention(
214 |                     (q): Linear(in_features=512, out_features=512, bias=False)
215 |                     (k): Linear(in_features=512, out_features=512, bias=False)
216 |                     (v): Linear(in_features=512, out_features=512, bias=False)
217 |                     (o): Linear(in_features=512, out_features=512, bias=False)
218 |                     (relative_attention_bias): Embedding(32, 8)
219 |                     )
220 |                     (layer_norm): T5LayerNorm()
221 |                     (dropout): Dropout(p=0.1, inplace=False)
222 |                 )
223 |                 (1): T5LayerCrossAttention(
224 |                     (EncDecAttention): T5Attention(
225 |                     (q): Linear(in_features=512, out_features=512, bias=False)
226 |                     (k): Linear(in_features=512, out_features=512, bias=False)
227 |                     (v): Linear(in_features=512, out_features=512, bias=False)
228 |                     (o): Linear(in_features=512, out_features=512, bias=False)
229 |                     )
230 |                     (layer_norm): T5LayerNorm()
231 |                     (dropout): Dropout(p=0.1, inplace=False)
232 |                 )
233 |                 (2): T5LayerFF(
234 |                     (DenseReluDense): T5DenseReluDense(
235 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
236 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
237 |                     (dropout): Dropout(p=0.1, inplace=False)
238 |                     )
239 |                     (layer_norm): T5LayerNorm()
240 |                     (dropout): Dropout(p=0.1, inplace=False)
241 |                 )
242 |                 )
243 |             )
244 |             (1): T5Block(
245 |                 (layer): ModuleList(
246 |                 (0): T5LayerSelfAttention(
247 |                     (SelfAttention): T5Attention(
248 |                     (q): Linear(in_features=512, out_features=512, bias=False)
249 |                     (k): Linear(in_features=512, out_features=512, bias=False)
250 |                     (v): Linear(in_features=512, out_features=512, bias=False)
251 |                     (o): Linear(in_features=512, out_features=512, bias=False)
252 |                     )
253 |                     (layer_norm): T5LayerNorm()
254 |                     (dropout): Dropout(p=0.1, inplace=False)
255 |                 )
256 |                 (1): T5LayerCrossAttention(
257 |                     (EncDecAttention): T5Attention(
258 |                     (q): Linear(in_features=512, out_features=512, bias=False)
259 |                     (k): Linear(in_features=512, out_features=512, bias=False)
260 |                     (v): Linear(in_features=512, out_features=512, bias=False)
261 |                     (o): Linear(in_features=512, out_features=512, bias=False)
262 |                     )
263 |                     (layer_norm): T5LayerNorm()
264 |                     (dropout): Dropout(p=0.1, inplace=False)
265 |                 )
266 |                 (2): T5LayerFF(
267 |                     (DenseReluDense): T5DenseReluDense(
268 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
269 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
270 |                     (dropout): Dropout(p=0.1, inplace=False)
271 |                     )
272 |                     (layer_norm): T5LayerNorm()
273 |                     (dropout): Dropout(p=0.1, inplace=False)
274 |                 )
275 |                 )
276 |             )
277 |             (2): T5Block(
278 |                 (layer): ModuleList(
279 |                 (0): T5LayerSelfAttention(
280 |                     (SelfAttention): T5Attention(
281 |                     (q): Linear(in_features=512, out_features=512, bias=False)
282 |                     (k): Linear(in_features=512, out_features=512, bias=False)
283 |                     (v): Linear(in_features=512, out_features=512, bias=False)
284 |                     (o): Linear(in_features=512, out_features=512, bias=False)
285 |                     )
286 |                     (layer_norm): T5LayerNorm()
287 |                     (dropout): Dropout(p=0.1, inplace=False)
288 |                 )
289 |                 (1): T5LayerCrossAttention(
290 |                     (EncDecAttention): T5Attention(
291 |                     (q): Linear(in_features=512, out_features=512, bias=False)
292 |                     (k): Linear(in_features=512, out_features=512, bias=False)
293 |                     (v): Linear(in_features=512, out_features=512, bias=False)
294 |                     (o): Linear(in_features=512, out_features=512, bias=False)
295 |                     )
296 |                     (layer_norm): T5LayerNorm()
297 |                     (dropout): Dropout(p=0.1, inplace=False)
298 |                 )
299 |                 (2): T5LayerFF(
300 |                     (DenseReluDense): T5DenseReluDense(
301 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
302 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
303 |                     (dropout): Dropout(p=0.1, inplace=False)
304 |                     )
305 |                     (layer_norm): T5LayerNorm()
306 |                     (dropout): Dropout(p=0.1, inplace=False)
307 |                 )
308 |                 )
309 |             )
310 |             (3): T5Block(
311 |                 (layer): ModuleList(
312 |                 (0): T5LayerSelfAttention(
313 |                     (SelfAttention): T5Attention(
314 |                     (q): Linear(in_features=512, out_features=512, bias=False)
315 |                     (k): Linear(in_features=512, out_features=512, bias=False)
316 |                     (v): Linear(in_features=512, out_features=512, bias=False)
317 |                     (o): Linear(in_features=512, out_features=512, bias=False)
318 |                     )
319 |                     (layer_norm): T5LayerNorm()
320 |                     (dropout): Dropout(p=0.1, inplace=False)
321 |                 )
322 |                 (1): T5LayerCrossAttention(
323 |                     (EncDecAttention): T5Attention(
324 |                     (q): Linear(in_features=512, out_features=512, bias=False)
325 |                     (k): Linear(in_features=512, out_features=512, bias=False)
326 |                     (v): Linear(in_features=512, out_features=512, bias=False)
327 |                     (o): Linear(in_features=512, out_features=512, bias=False)
328 |                     )
329 |                     (layer_norm): T5LayerNorm()
330 |                     (dropout): Dropout(p=0.1, inplace=False)
331 |                 )
332 |                 (2): T5LayerFF(
333 |                     (DenseReluDense): T5DenseReluDense(
334 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
335 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
336 |                     (dropout): Dropout(p=0.1, inplace=False)
337 |                     )
338 |                     (layer_norm): T5LayerNorm()
339 |                     (dropout): Dropout(p=0.1, inplace=False)
340 |                 )
341 |                 )
342 |             )
343 |             (4): T5Block(
344 |                 (layer): ModuleList(
345 |                 (0): T5LayerSelfAttention(
346 |                     (SelfAttention): T5Attention(
347 |                     (q): Linear(in_features=512, out_features=512, bias=False)
348 |                     (k): Linear(in_features=512, out_features=512, bias=False)
349 |                     (v): Linear(in_features=512, out_features=512, bias=False)
350 |                     (o): Linear(in_features=512, out_features=512, bias=False)
351 |                     )
352 |                     (layer_norm): T5LayerNorm()
353 |                     (dropout): Dropout(p=0.1, inplace=False)
354 |                 )
355 |                 (1): T5LayerCrossAttention(
356 |                     (EncDecAttention): T5Attention(
357 |                     (q): Linear(in_features=512, out_features=512, bias=False)
358 |                     (k): Linear(in_features=512, out_features=512, bias=False)
359 |                     (v): Linear(in_features=512, out_features=512, bias=False)
360 |                     (o): Linear(in_features=512, out_features=512, bias=False)
361 |                     )
362 |                     (layer_norm): T5LayerNorm()
363 |                     (dropout): Dropout(p=0.1, inplace=False)
364 |                 )
365 |                 (2): T5LayerFF(
366 |                     (DenseReluDense): T5DenseReluDense(
367 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
368 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
369 |                     (dropout): Dropout(p=0.1, inplace=False)
370 |                     )
371 |                     (layer_norm): T5LayerNorm()
372 |                     (dropout): Dropout(p=0.1, inplace=False)
373 |                 )
374 |                 )
375 |             )
376 |             (5): T5Block(
377 |                 (layer): ModuleList(
378 |                 (0): T5LayerSelfAttention(
379 |                     (SelfAttention): T5Attention(
380 |                     (q): Linear(in_features=512, out_features=512, bias=False)
381 |                     (k): Linear(in_features=512, out_features=512, bias=False)
382 |                     (v): Linear(in_features=512, out_features=512, bias=False)
383 |                     (o): Linear(in_features=512, out_features=512, bias=False)
384 |                     )
385 |                     (layer_norm): T5LayerNorm()
386 |                     (dropout): Dropout(p=0.1, inplace=False)
387 |                 )
388 |                 (1): T5LayerCrossAttention(
389 |                     (EncDecAttention): T5Attention(
390 |                     (q): Linear(in_features=512, out_features=512, bias=False)
391 |                     (k): Linear(in_features=512, out_features=512, bias=False)
392 |                     (v): Linear(in_features=512, out_features=512, bias=False)
393 |                     (o): Linear(in_features=512, out_features=512, bias=False)
394 |                     )
395 |                     (layer_norm): T5LayerNorm()
396 |                     (dropout): Dropout(p=0.1, inplace=False)
397 |                 )
398 |                 (2): T5LayerFF(
399 |                     (DenseReluDense): T5DenseReluDense(
400 |                     (wi): Linear(in_features=512, out_features=2048, bias=False)
401 |                     (wo): Linear(in_features=2048, out_features=512, bias=False)
402 |                     (dropout): Dropout(p=0.1, inplace=False)
403 |                     )
404 |                     (layer_norm): T5LayerNorm()
405 |                     (dropout): Dropout(p=0.1, inplace=False)
406 |                 )
407 |                 )
408 |             )
409 |             )
410 |             (final_layer_norm): T5LayerNorm()
411 |             (dropout): Dropout(p=0.1, inplace=False)
412 |         )
413 |         (lm_head): Linear(in_features=512, out_features=32128, bias=False)
414 |         )
415 |     ```
416 | 
417 | ## 3 最终成绩
418 | 本项目最终成绩为0.32107609  
419 |     ![](score.png)  
420 | 参数设置如下：  
421 | + 模型：bart-large-cnn
422 | + 搜索束个数：2
423 | + 最大序列长度：1024
424 | + 激活函数：gelu
425 | + 预测序列最短长度：30
426 | + 预测序列最长长度：590
427 | + 是否允许提前停止（预测出`<EOS>`即停止）：是
428 | 


--------------------------------------------------------------------------------
/params/bart/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "_num_labels": 3,
 3 |   "activation_dropout": 0.0,
 4 |   "activation_function": "gelu",
 5 |   "add_final_layer_norm": false,
 6 |   "architectures": [
 7 |     "BartForConditionalGeneration"
 8 |   ],
 9 |   "attention_dropout": 0.0,
10 |   "bos_token_id": 0,
11 |   "classif_dropout": 0.0,
12 |   "classifier_dropout": 0.0,
13 |   "d_model": 1024,
14 |   "decoder_attention_heads": 16,
15 |   "decoder_ffn_dim": 4096,
16 |   "decoder_layerdrop": 0.0,
17 |   "decoder_layers": 12,
18 |   "decoder_start_token_id": 2,
19 |   "dropout": 0.1,
20 |   "early_stopping": true,
21 |   "encoder_attention_heads": 16,
22 |   "encoder_ffn_dim": 4096,
23 |   "encoder_layerdrop": 0.0,
24 |   "encoder_layers": 12,
25 |   "eos_token_id": 2,
26 |   "force_bos_token_to_be_generated": true,
27 |   "forced_bos_token_id": 0,
28 |   "forced_eos_token_id": 2,
29 |   "gradient_checkpointing": false,
30 |   "id2label": {
31 |     "0": "LABEL_0",
32 |     "1": "LABEL_1",
33 |     "2": "LABEL_2"
34 |   },
35 |   "init_std": 0.02,
36 |   "is_encoder_decoder": true,
37 |   "label2id": {
38 |     "LABEL_0": 0,
39 |     "LABEL_1": 1,
40 |     "LABEL_2": 2
41 |   },
42 |   "length_penalty": 2.0,
43 |   "max_length": 142,
44 |   "max_position_embeddings": 1024,
45 |   "min_length": 56,
46 |   "model_type": "bart",
47 |   "no_repeat_ngram_size": 3,
48 |   "normalize_before": false,
49 |   "num_beams": 4,
50 |   "num_hidden_layers": 12,
51 |   "output_past": true,
52 |   "pad_token_id": 1,
53 |   "prefix": " ",
54 |   "scale_embedding": false,
55 |   "task_specific_params": {
56 |     "summarization": {
57 |       "early_stopping": true,
58 |       "length_penalty": 2.0,
59 |       "max_length": 142,
60 |       "min_length": 56,
61 |       "no_repeat_ngram_size": 3,
62 |       "num_beams": 4
63 |     }
64 |   },
65 |   "transformers_version": "4.7.0.dev0",
66 |   "use_cache": true,
67 |   "vocab_size": 50264
68 | }
69 | 


--------------------------------------------------------------------------------
/params/pegasus/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "_name_or_path": "./",
 3 |   "activation_dropout": 0.1,
 4 |   "activation_function": "relu",
 5 |   "add_bias_logits": false,
 6 |   "add_final_layer_norm": true,
 7 |   "architectures": [
 8 |     "PegasusForConditionalGeneration"
 9 |   ],
10 |   "attention_dropout": 0.1,
11 |   "bos_token_id": 0,
12 |   "classif_dropout": 0.0,
13 |   "classifier_dropout": 0.0,
14 |   "d_model": 1024,
15 |   "decoder_attention_heads": 16,
16 |   "decoder_ffn_dim": 4096,
17 |   "decoder_layerdrop": 0.0,
18 |   "decoder_layers": 16,
19 |   "decoder_start_token_id": 0,
20 |   "do_blenderbot_90_layernorm": false,
21 |   "dropout": 0.1,
22 |   "encoder_attention_heads": 16,
23 |   "encoder_ffn_dim": 4096,
24 |   "encoder_layerdrop": 0.0,
25 |   "encoder_layers": 16,
26 |   "eos_token_id": 1,
27 |   "extra_pos_embeddings": 0,
28 |   "force_bos_token_to_be_generated": false,
29 |   "forced_eos_token_id": 1,
30 |   "gradient_checkpointing": false,
31 |   "id2label": {
32 |     "0": "LABEL_0",
33 |     "1": "LABEL_1",
34 |     "2": "LABEL_2"
35 |   },
36 |   "init_std": 0.02,
37 |   "is_encoder_decoder": true,
38 |   "label2id": {
39 |     "LABEL_0": 0,
40 |     "LABEL_1": 1,
41 |     "LABEL_2": 2
42 |   },
43 |   "length_penalty": 0.6,
44 |   "max_length": 64,
45 |   "max_position_embeddings": 512,
46 |   "model_type": "pegasus",
47 |   "normalize_before": true,
48 |   "normalize_embedding": false,
49 |   "num_beams": 8,
50 |   "num_hidden_layers": 16,
51 |   "pad_token_id": 0,
52 |   "scale_embedding": true,
53 |   "static_position_embeddings": true,
54 |   "transformers_version": "4.11.0.dev0",
55 |   "use_cache": true,
56 |   "vocab_size": 96103
57 | }
58 | 


--------------------------------------------------------------------------------
/params/pegasus/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}


--------------------------------------------------------------------------------
/params/pegasus/spiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/pegasus/spiece.model


--------------------------------------------------------------------------------
/params/pegasus/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {"model_max_length": 512, "special_tokens_map_file": null, "full_tokenizer_file": null}


--------------------------------------------------------------------------------
/params/t5-base/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "architectures": [
 3 |     "T5WithLMHeadModel"
 4 |   ],
 5 |   "d_ff": 3072,
 6 |   "d_kv": 64,
 7 |   "d_model": 768,
 8 |   "decoder_start_token_id": 0,
 9 |   "dropout_rate": 0.1,
10 |   "eos_token_id": 1,
11 |   "initializer_factor": 1.0,
12 |   "is_encoder_decoder": true,
13 |   "layer_norm_epsilon": 1e-06,
14 |   "model_type": "t5",
15 |   "n_positions": 512,
16 |   "num_heads": 12,
17 |   "num_layers": 12,
18 |   "output_past": true,
19 |   "pad_token_id": 0,
20 |   "relative_attention_num_buckets": 32,
21 |   "task_specific_params": {
22 |     "summarization": {
23 |       "early_stopping": true,
24 |       "length_penalty": 2.0,
25 |       "max_length": 200,
26 |       "min_length": 30,
27 |       "no_repeat_ngram_size": 3,
28 |       "num_beams": 4,
29 |       "prefix": "summarize: "
30 |     },
31 |     "translation_en_to_de": {
32 |       "early_stopping": true,
33 |       "max_length": 300,
34 |       "num_beams": 4,
35 |       "prefix": "translate English to German: "
36 |     },
37 |     "translation_en_to_fr": {
38 |       "early_stopping": true,
39 |       "max_length": 300,
40 |       "num_beams": 4,
41 |       "prefix": "translate English to French: "
42 |     },
43 |     "translation_en_to_ro": {
44 |       "early_stopping": true,
45 |       "max_length": 300,
46 |       "num_beams": 4,
47 |       "prefix": "translate English to Romanian: "
48 |     }
49 |   },
50 |   "vocab_size": 32128
51 | }
52 | 


--------------------------------------------------------------------------------
/params/t5-base/spiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/t5-base/spiece.model


--------------------------------------------------------------------------------
/params/t5-large/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "architectures": [
 3 |     "T5WithLMHeadModel"
 4 |   ],
 5 |   "d_ff": 4096,
 6 |   "d_kv": 64,
 7 |   "d_model": 1024,
 8 |   "decoder_start_token_id": 0,
 9 |   "dropout_rate": 0.1,
10 |   "eos_token_id": 1,
11 |   "initializer_factor": 1.0,
12 |   "is_encoder_decoder": true,
13 |   "layer_norm_epsilon": 1e-06,
14 |   "model_type": "t5",
15 |   "n_positions": 512,
16 |   "num_heads": 16,
17 |   "num_layers": 24,
18 |   "output_past": true,
19 |   "pad_token_id": 0,
20 |   "relative_attention_num_buckets": 32,
21 |   "task_specific_params": {
22 |     "summarization": {
23 |       "early_stopping": true,
24 |       "length_penalty": 2.0,
25 |       "max_length": 200,
26 |       "min_length": 30,
27 |       "no_repeat_ngram_size": 3,
28 |       "num_beams": 4,
29 |       "prefix": "summarize: "
30 |     },
31 |     "translation_en_to_de": {
32 |       "early_stopping": true,
33 |       "max_length": 300,
34 |       "num_beams": 4,
35 |       "prefix": "translate English to German: "
36 |     },
37 |     "translation_en_to_fr": {
38 |       "early_stopping": true,
39 |       "max_length": 300,
40 |       "num_beams": 4,
41 |       "prefix": "translate English to French: "
42 |     },
43 |     "translation_en_to_ro": {
44 |       "early_stopping": true,
45 |       "max_length": 300,
46 |       "num_beams": 4,
47 |       "prefix": "translate English to Romanian: "
48 |     }
49 |   },
50 |   "vocab_size": 32128
51 | }
52 | 


--------------------------------------------------------------------------------
/params/t5-large/spiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/t5-large/spiece.model


--------------------------------------------------------------------------------
/params/t5-small/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "architectures": [
 3 |     "T5WithLMHeadModel"
 4 |   ],
 5 |   "d_ff": 2048,
 6 |   "d_kv": 64,
 7 |   "d_model": 512,
 8 |   "decoder_start_token_id": 0,
 9 |   "dropout_rate": 0.1,
10 |   "eos_token_id": 1,
11 |   "initializer_factor": 1.0,
12 |   "is_encoder_decoder": true,
13 |   "layer_norm_epsilon": 1e-06,
14 |   "model_type": "t5",
15 |   "n_positions": 512,
16 |   "num_heads": 8,
17 |   "num_layers": 6,
18 |   "output_past": true,
19 |   "pad_token_id": 0,
20 |   "relative_attention_num_buckets": 32,
21 |   "task_specific_params": {
22 |     "summarization": {
23 |       "early_stopping": true,
24 |       "length_penalty": 2.0,
25 |       "max_length": 450,
26 |       "min_length": 30,
27 |       "no_repeat_ngram_size": 3,
28 |       "num_beams": 4,
29 |       "prefix": "summarize: "
30 |     },
31 |     "translation_en_to_de": {
32 |       "early_stopping": true,
33 |       "max_length": 300,
34 |       "num_beams": 4,
35 |       "prefix": "translate English to German: "
36 |     },
37 |     "translation_en_to_fr": {
38 |       "early_stopping": true,
39 |       "max_length": 300,
40 |       "num_beams": 4,
41 |       "prefix": "translate English to French: "
42 |     },
43 |     "translation_en_to_ro": {
44 |       "early_stopping": true,
45 |       "max_length": 300,
46 |       "num_beams": 4,
47 |       "prefix": "translate English to Romanian: "
48 |     }
49 |   },
50 |   "vocab_size": 32128
51 | }


--------------------------------------------------------------------------------
/params/t5-small/spiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/params/t5-small/spiece.model


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | tqdm
3 | transformers
4 | pandas
5 | rouge


--------------------------------------------------------------------------------
/score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/score.png


--------------------------------------------------------------------------------
/source/__pycache__/models.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/models.cpython-37.pyc


--------------------------------------------------------------------------------
/source/__pycache__/pretrained_models.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/pretrained_models.cpython-37.pyc


--------------------------------------------------------------------------------
/source/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/settings.cpython-37.pyc


--------------------------------------------------------------------------------
/source/__pycache__/submodels.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/submodels.cpython-37.pyc


--------------------------------------------------------------------------------
/source/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyu2018/TextSum/20bbd5aec1051c59c880a931aa5eba6c3e55ebed/source/__pycache__/utils.cpython-37.pyc


--------------------------------------------------------------------------------
/source/go.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import utils
 3 | from models import GetModel
 4 | import pretrained_models as pm
 5 | 
 6 | parser=argparse.ArgumentParser()
 7 | parser.add_argument("-p","--preprocess",help="预处理数据",action="store_true")
 8 | parser.add_argument("-b","--build",help="建立词频表",action="store_true")
 9 | parser.add_argument("-m","--make",help="建立词典",action="store_true")
10 | parser.add_argument("-t","--train",help="训练",type=str)
11 | parser.add_argument("-f","--fine_tune",help="微调",type=str)
12 | parser.add_argument("-g","--gen",help="生成submission",nargs=2,type=str)
13 | 
14 | 
15 | args=parser.parse_args()
16 | 
17 | def main():
18 |     if(args.preprocess):
19 |         print("--------------开始数据预处理--------------")
20 |         try:
21 |             utils.Preprocess()
22 |         except Exception as e:
23 |             print(e)
24 |         print("--------------数据预处理完毕--------------")
25 |         exit(0)
26 |     if(args.build):
27 |         print("--------------开始建立词频表--------------")
28 |         try:
29 |             utils.BuildVocabCounter()
30 |         except Exception as e:
31 |             print(e)
32 |         print("--------------词频表建立完毕--------------")
33 |         exit(0)
34 |     if(args.make):
35 |         print("--------------开始建立字典--------------")
36 |         try:
37 |             utils.MakeVocab()
38 |         except Exception as e:
39 |             print(e)
40 |         print("--------------字典建立完毕--------------")
41 |         exit(0)
42 |     if(args.train):
43 |         
44 |         try:
45 |             net=GetModel(args.train)
46 |             print("--------------开始训练模型--------------")
47 |             utils.Train(net)
48 |             print("--------------模型训练完毕--------------")
49 |         except Exception as e:
50 |             print(e)
51 |         exit(0)
52 |     
53 |     if(args.fine_tune):
54 |         try:
55 |             net,tkz=pm.GetPModel(args.fine_tune)
56 |             print("--------------开始微调--------------")
57 |             pm.FineTune(net,tkz)
58 |             print("--------------微调完毕--------------")
59 |         except Exception as e:
60 |             print(e)
61 |         exit(0)
62 |     if(args.gen):
63 |         
64 |         net,param_path=args.gen
65 |         
66 |         if(param_path=="x"):
67 |             param_path=None
68 |         try:
69 |             print("--------------开始生成submission--------------")
70 |             if(net=="gru"):
71 |                 net=GetModel(net)
72 |                 utils.GenSubmisson(net,param_path)
73 |             else:
74 |                 net,tkz=pm.GetPModel(net)
75 |                 pm.GenSub(net,tkz,param_path)
76 |             
77 |             print("--------------submission生成完毕--------------")
78 |         except Exception as e:
79 |             print(e)
80 |         exit(0)
81 | 
82 | 
83 | 
84 |     print(r"""
85 | ___________              __      _________                                  .__                     
86 | \__    ___/___ ___  ____/  |_   /   _____/__ __  _____   _____ _____ _______|__|_______ ___________ 
87 |   |    |_/ __ \\  \/  /\   __\  \_____  \|  |  \/     \ /     \\__  \\_  __ \  \___   // __ \_  __ \
88 |   |    |\  ___/ >    <  |  |    /        \  |  /  Y Y  \  Y Y  \/ __ \|  | \/  |/    /\  ___/|  | \/
89 |   |____| \___  >__/\_ \ |__|   /_______  /____/|__|_|  /__|_|  (____  /__|  |__/_____ \\___  >__|   
90 |              \/      \/                \/            \/      \/     \/               \/    \/       
91 | """)
92 |     print("-h, --help  show help message and exit")
93 | 
94 | if __name__=='__main__':
95 |     main()


--------------------------------------------------------------------------------
/source/models.py:
--------------------------------------------------------------------------------
  1 | from torch import nn
  2 | import torch
  3 | from torch.tensor import Tensor
  4 | from settings import *
  5 | import utils
  6 | 
  7 | class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
  8 |     """带遮蔽的softmax交叉熵损失函数"""
  9 | 
 10 |     def _sequence_mask(self, X, valid_len, value=0):
 11 |         """ 在序列中屏蔽不相关的项。
 12 |             接收valid_len是多个有效长度组成的一维tensor，如[1，2]代表第一个序列有效长度为1，第二个序列有效长度为2
 13 |         """
 14 |         
 15 |         maxlen = X.size(1)
 16 |         mask = torch.arange((maxlen), dtype=torch.float32,
 17 |                             device=X.device)[None, :] < valid_len[:, None]
 18 |         X[~mask] = value
 19 |         # 有效长度以外的元素都被置零，不改变原始shape
 20 |         return X
 21 | 
 22 |     def forward(self, pred, label, valid_len):
 23 |         # 不用看标签中的padding的损失
 24 |         weights = torch.ones_like(label)
 25 |         weights = self._sequence_mask(weights, valid_len)
 26 |         self.reduction = 'none'
 27 |         unweighted_loss = super().forward(pred.permute(0, 2, 1), label)
 28 | 
 29 |         # 把整个序列的loss取平均，最后输出的shape是(batch_size)
 30 |         weighted_loss = (unweighted_loss * weights).mean(dim=1)
 31 |         return weighted_loss
 32 | 
 33 | 
 34 | class Encoder(nn.Module):
 35 |     '''编码器接口'''
 36 |     def  __init__(self, **kwargs):
 37 |         super(Encoder,self).__init__(**kwargs)
 38 |     
 39 |     def forward(self,X,*args):
 40 |         raise NotImplementedError
 41 | 
 42 | class Decoder(nn.Module):
 43 |     '''编码器接口'''
 44 |     def  __init__(self, **kwargs):
 45 |         super(Decoder,self).__init__(**kwargs)
 46 |     
 47 |     # 接收编码器的输出，作为当前步的先验状态
 48 |     def init_state(self,enc_outputs,*args):
 49 |         raise NotImplementedError
 50 |     # state和解码器输入共同作为输入
 51 |     # 在一次序列训练中，初始state为编码器输入，之后会不断自我更新
 52 |     def forward(self,X,state):
 53 |         raise NotImplementedError
 54 | 
 55 | class EncoderDecoder(nn.Module):
 56 |     '''编码器解码器架构基类'''
 57 |     def  __init__(self, encoder:Encoder,decoder:Decoder,**kwargs):
 58 |         super(EncoderDecoder,self).__init__(**kwargs)
 59 |         self.encoder=encoder
 60 |         self.decoder=decoder
 61 | 
 62 |     def forward(self,enc_X,dec_X,*args):
 63 |         enc_outputs=self.encoder(enc_X,*args)
 64 |         dec_state=self.decoder.init_state(enc_outputs)
 65 | 
 66 |         return self.decoder(dec_X,dec_state)
 67 | 
 68 | 
 69 | ################################## RNN（效果太差了）
 70 | class GruEncoder(Encoder):
 71 |     def __init__(self,in_dim,emb_dim,hidden_size,num_layers,dropout=0,**kwargs):
 72 |         super(GruEncoder,self).__init__(**kwargs)
 73 |         self.embdding=nn.Embedding(in_dim,emb_dim)
 74 |         self.rnn=nn.GRU(emb_dim,hidden_size,num_layers,dropout=dropout)
 75 | 
 76 |     def forward(self,X:Tensor,*args):
 77 |         X=self.embdding(X)
 78 |         # 更改数据维度为seq_len,batch_size,features
 79 |         X=X.permute(1,0,2)
 80 |         output,state=self.rnn(X)
 81 |         # shape分别为：
 82 |         # (seq_len,batch_size,hidden_size)
 83 |         # (num_layers,batch_size,hidden_size)
 84 |         return output,state
 85 | 
 86 | class GruDecoder(Decoder):
 87 |     def __init__(self,in_dim,emb_dim,hidden_size,num_layers,dropout=0,**kwargs):
 88 |         super(GruDecoder,self).__init__(**kwargs)
 89 |         self.embdding=nn.Embedding(in_dim,emb_dim)
 90 |         self.rnn=nn.GRU(emb_dim+hidden_size,hidden_size,num_layers,dropout=dropout)
 91 |         self.dense=nn.Linear(hidden_size,VOCAB_SIZE+4)
 92 | 
 93 |     def init_state(self, enc_outputs, *args):
 94 |         # 取enc的state
 95 |         return enc_outputs[1]
 96 | 
 97 |     def forward(self,X:Tensor,state:Tensor):
 98 |         X=self.embdding(X).permute(1,0,2)
 99 |         # 取最后时刻的最后一层
100 |         context=state[-1].repeat(X.shape[0],1,1)
101 |         
102 |         # 虽然state在h0已经传过来了，但是还是把state拼一下,拼到了特征的维度，问题不大
103 |         X_and_context=torch.cat((X,context),2)
104 |         output,state=self.rnn(X_and_context,hx=state)
105 |         output=self.dense(output).permute(1,0,2)
106 |         # shape分别为：
107 |         # (batch_size,seq_len,hidden_size)
108 |         # (num_layers,batch_size,hidden_size)
109 |         return output,state
110 | 
111 | def GetTextSum_GRU():
112 |     return EncoderDecoder(
113 |         GruEncoder(VOCAB_SIZE+4,512,256,2),
114 |         GruDecoder(VOCAB_SIZE+4,512,256,2)
115 |     )
116 | ##################################
117 | 
118 | 
119 | 
120 | def GetModel(name:str):
121 |     name=name.lower()
122 |     if(name=="gru"):
123 |         return GetTextSum_GRU().to(DEVICE)
124 |     
125 |     else:
126 |         raise Exception("该模型未实现！")
127 | 
128 | if __name__=='__main__':
129 |     # encoder=GruEncoder(VOCAB_SIZE+4,512,256,2)
130 |     # decoder=GruDecoder(VOCAB_SIZE+4,512,256,2)
131 |     # for enc_X,dec_X,y in utils.train_iter:
132 |     #     print(enc_X[0].shape)
133 |     #     enc_out=encoder(enc_X[0])
134 |         
135 |     #     state=decoder.init_state(enc_out)
136 |     #     output,state=decoder(dec_X[0],state)
137 |     #     print(output.shape)
138 |     #     loss_f=MaskedSoftmaxCELoss()
139 |     #     l=loss_f(output,y[0],y[1])
140 |     #     print(l)
141 |         
142 |     #     break
143 |         
144 |     net=GetTextSum_GRU()
145 |     
146 | 
147 |     with open("1.txt","w+") as f:
148 |         f.write(str(net))
149 | 


--------------------------------------------------------------------------------
/source/pretrained_models.py:
--------------------------------------------------------------------------------
  1 | # 使用预训练模型
  2 | from transformers import PegasusTokenizer,PegasusForConditionalGeneration
  3 | from transformers import T5Tokenizer, T5ForConditionalGeneration,AdamW
  4 | from transformers import BartTokenizer,BartForConditionalGeneration
  5 | from settings import *
  6 | from utils import GetRouge,CountFiles
  7 | import os
  8 | from torch.utils.data.dataset import TensorDataset
  9 | from torch.utils.data.dataloader import DataLoader
 10 | from torch.nn.modules.module import Module
 11 | 
 12 | current_model=""
 13 | 
 14 | 
 15 | 
 16 | def ToTensor(texts,summaries,tokenizer):
 17 |     task_prefix="summarize: "
 18 |     encoding = tokenizer([task_prefix + sequence for sequence in texts], 
 19 |                     padding='longest', 
 20 |                     max_length=SOURCE_THRESHOLD, 
 21 |                     truncation=True, 
 22 |                     return_tensors="pt")
 23 |     input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
 24 | 
 25 |     target_encoding = tokenizer(summaries, 
 26 |                         padding='longest', 
 27 |                         max_length=SUMMARY_THRESHOLD, 
 28 |                         truncation=True)
 29 |     labels = target_encoding.input_ids
 30 |     labels = [(i if i != tokenizer.pad_token_id else -100) for i in labels]
 31 |     labels = torch.tensor(labels)
 32 | 
 33 |     return TensorDataset(input_ids,attention_mask,labels)
 34 | 
 35 | def FineTune(net:Module,tokenizer):
 36 |     '''微调'''
 37 |     
 38 |     tset_texts=[]
 39 |     tset_summaries=[]
 40 |     vset_texts=[]
 41 |     vset_summaries=[]
 42 |     tset_len=CountFiles(DATA_DIR+"new_train")
 43 |     vset_len=CountFiles(DATA_DIR+"new_val")
 44 |     for i in range(tset_len):
 45 |         text,summary=ReadJson(i,DATA_DIR+"new_train")
 46 |         tset_texts.append(text)
 47 |         tset_summaries.append(summary)
 48 |     for i in range(vset_len):
 49 |         text,summary=ReadJson(i,DATA_DIR+"new_val")
 50 |         vset_texts.append(text)
 51 |         vset_summaries.append(summary)
 52 |     print("训练数据已读入内存...")    
 53 | 
 54 |     train_iter=DataLoader(
 55 |         ToTensor(tset_texts,tset_summaries,tokenizer),
 56 |         batch_size=BATCH_SZIE,
 57 |         shuffle=True,
 58 |         num_workers=4
 59 |         )
 60 |     val_iter=DataLoader(
 61 |         ToTensor(vset_texts,vset_summaries,tokenizer),
 62 |         batch_size=BATCH_SZIE,
 63 |         shuffle=False,
 64 |         num_workers=4
 65 |         )
 66 | 
 67 |     print("minibatch已生成...") 
 68 |        
 69 |     print("开始训练模型...")    
 70 |     opt=AdamW(net.parameters())
 71 |     from tqdm import tqdm
 72 |     import time
 73 |     min_loss=10
 74 |     for epoch in range(EPOCHS):
 75 |         train_loss=[]
 76 |         val_loss=[]
 77 |         net.train()
 78 |         for batch in tqdm(train_iter):
 79 |             input_ids,attention_mask,labels=[x.to(DEVICE) for x in batch]
 80 |             l = net(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
 81 |             l.backward()
 82 |             opt.step()       
 83 |             opt.zero_grad()
 84 |             with torch.no_grad():
 85 |                 train_loss.append(l.item())
 86 |         
 87 |         torch.cuda.empty_cache()
 88 |         net.eval()
 89 |         with torch.no_grad():
 90 |             for batch in tqdm(val_iter):
 91 |                 input_ids,attention_mask,labels=[x.to(DEVICE) for x in batch]
 92 |                 l = net(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
 93 |                 val_loss.append(l.item())
 94 |         
 95 |         if(sum(val_loss)<min_loss):
 96 |             min_loss=sum(val_loss)
 97 |             torch.save(net.state_dict(),PARAM_DIR+str(int(time.time()))+"_GRU.param")
 98 |             print(f"saved net with val_loss:{min_loss}")    
 99 |         
100 |         print(f"{epoch+1}: train_loss:{sum(train_loss)};val_loss:{sum(val_loss)}")
101 | 
102 | def TestOneSeq(net,tokenizer,text, target=None):
103 |     '''生成单个样本的摘要'''
104 |     torch.cuda.empty_cache()
105 |     net.eval()
106 |     
107 |     text = str(text).replace('\n', '')
108 |     input_tokenized = tokenizer.encode(
109 |         text,
110 |         truncation=True, 
111 |         return_tensors="pt",
112 |         max_length=SOURCE_THRESHOLD
113 |         ).to(DEVICE)
114 |     
115 |     if(current_model=="t5"):
116 |         summary_task = torch.tensor([[21603, 10]]).to(DEVICE)
117 |         input_tokenized = torch.cat([summary_task, input_tokenized], dim=-1).to(DEVICE)
118 |     
119 |     summary_ids = net.generate(input_tokenized,
120 |                                     num_beams=NUM_BEAMS,
121 |                                     no_repeat_ngram_size=3,
122 |                                     min_length=MIN_LEN,
123 |                                     max_length=MAX_LEN,
124 |                                     early_stopping=True)
125 |     output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
126 |     score=-1
127 |     if(target!=None):
128 |         score=GetRouge(output[0],target)
129 |     return output[0],score
130 | 
131 | # t5
132 | def GetTextSum_T5(name):
133 |     tokenizer=T5Tokenizer.from_pretrained(PARAM_DIR+name)
134 |     net=T5ForConditionalGeneration.from_pretrained(PARAM_DIR+name)
135 |     print(f"{name} 加载完毕")
136 |     return net.to(DEVICE),tokenizer
137 | 
138 | # bart
139 | def GetTextSum_BART():
140 |     tokenizer=BartTokenizer.from_pretrained(PARAM_DIR+"bart", output_past=True)
141 |     net=BartForConditionalGeneration.from_pretrained(PARAM_DIR+"bart", output_past=True)
142 |     print("bart 加载完毕")
143 |     return (net.to(DEVICE),tokenizer)
144 | 
145 | def GetTextSum_Pegasus():
146 |     tokenizer=PegasusTokenizer.from_pretrained(PARAM_DIR+"pegasus")
147 |     net=PegasusForConditionalGeneration.from_pretrained(PARAM_DIR+"pegasus")
148 |     print("pegasus 加载完毕")
149 |     return (net.to(DEVICE),tokenizer)
150 | 
151 | def GetPModel(name:str):
152 |     global current_model
153 |     name=name.lower()
154 |     print("正在加载模型")
155 |     if("t5" in name):
156 |         current_model="t5"
157 |         return GetTextSum_T5(name)
158 |     elif(name=="bart"):
159 |         return GetTextSum_BART()
160 |     elif(name=="pegasus"):
161 |         current_model="pegasus"
162 |         return GetTextSum_Pegasus()
163 |     else:
164 |         raise Exception("该模型未实现！")
165 |     
166 | def ReadJson(i,dir,test=False):
167 |     '''读取单个json文件（一个样本）'''
168 |     import json
169 | 
170 |     js_data=json.load(open(os.path.join(dir,f"{i}.json"),encoding="utf-8"))
171 |     if test:
172 |         return js_data["text"]
173 |     return js_data["text"],js_data["summary"]
174 | 
175 | def GenSub(net,tokenizer,param_path=None):
176 |     '''生成submission.csv'''
177 |     import csv
178 |     from tqdm import tqdm
179 |     
180 |     if(param_path!=None):
181 |         net.load_state_dict(torch.load(param_path))
182 |     res=[]
183 |     for i in tqdm(range(1000)):
184 |         text=ReadJson(i,DATA_DIR+"new_test",True)
185 |         summary=TestOneSeq(net,tokenizer,text)[0]
186 |         res.append([str(i),summary])
187 |     
188 |     with open(os.path.join(DATA_DIR, 'submission.csv'),'w+',newline="",encoding='utf-8') as csvfile:
189 |         writer=csv.writer(csvfile,delimiter="\t")   
190 |         writer.writerows(res)
191 | 
192 | 
193 | if __name__=='__main__':
194 |     net,tokenizer=GetPModel("pegasus")
195 |     # res=tokenizer(
196 |     #     ["hello world","hi"], 
197 |     #     return_tensors="pt",
198 |     #     padding='longest', 
199 |     #     max_length=MAX_LEN, 
200 |     #     truncation=True,
201 |     #     )
202 |     # print(res)
203 |     
204 |     # print(TestOneSeq(
205 |     #     net,tokenizer,
206 |     # "one-third of phone users would definitely upgrade to a facebook phone - and 73 % think the phone is a ` good idea ' . news of the phone emerged this week , with sources claiming that facebook had hired ex-apple engineers to work on an ` official ' facebook phone . facebook has made several ventures into the mobile market before in partnership with manufacturers such as htc and inq - but a new phone made by ex-apple engineers is rumoured to be in production . the previous ` facebook phone ' - inq 's cloud touch - puts all of your newsfeeds , pictures and other information on a well thought-out homescreen centred around facebook . it 's not the first facebook phone to hit . the market -- the social network giant has previously partnered with inq . and htc to produce facebook-oriented handsets , including phones with a . built-in ` like ' button . details of the proposed phone are scant , but facebook is already making moves into the mobile space with a series of high-profile app acquisitions . after its $ 1 billion purchase of instagram , the social network bought location-based social app glancee and photo-sharing app lightbox . facebook 's smartphone apps have also seen constant and large-scale redesigns , with adverts more prominent with the news feed . the handset is rumoured to be set for a 2013 release . it could be a major hit -- a flash poll of 968 people conducted by myvouchercodes found that 32 % of phone users would upgrade as soon as it became available . the key to its success could be porting apps to mobile -- something facebook is already doing . separate camera and chat apps already separate off some site functions , and third-party apps will shortly be available via a facebook app store . of those polled , 57 % hoped that it would be cheaper than an iphone -- presumably supported by facebook 's advertising . those polled were then asked why they would choose to purchase a facebook phone , if and when one became available , and were asked to select all reasons that applied to them from a list of possible answers . would you ` upgrade ' to a facebook phone ? would you ` upgrade ' to a facebook phone ? now share your opinion . the top five reasons were as follows : . 44 % of people liked the idea of having their mobile phone synced with their facebook account , whilst 41 % said they wanted to be able to use facebook apps on their smartphone . mark pearson , chairman of myvouchercodes.co.uk , said , ` it will be quite exciting to see the first facebook phone when it 's released next year . '",
207 |     # "poll of 968 phone users in uk .   32 % said they would definitely upgrade to a facebook phone .   users hope it might be cheaper than iphone . "
208 |     # ))
209 |     # GenSub(net,tokenizer)
210 |     
211 |     # opt=AdamW(net.parameters())
212 |     # opt.step()
213 | 
214 |     # FineTune(net,tokenizer)
215 |     
216 |     with open("1.txt","w+") as f:
217 |         f.write(str(net))
218 | 


--------------------------------------------------------------------------------
/source/settings.py:
--------------------------------------------------------------------------------
 1 | 
 2 | #------------------ 路径设置 ------------------#
 3 | # 数据集目录
 4 | import torch
 5 | 
 6 | 
 7 | DATA_DIR="D:/2021UCAS/AdvancedAI/TextSum/dataset/"
 8 | # 模型参数/预训练模型目录
 9 | PARAM_DIR="D:/2021UCAS/AdvancedAI/TextSum/params/"
10 | # 词频表地址
11 | VOCAB_PATH="D:/2021UCAS/AdvancedAI/TextSum/dataset/vocab_cnt.pkl"
12 | # 单词->数字
13 | WORD_IDX_PATH="D:/2021UCAS/AdvancedAI/TextSum/dataset/word2idx.pkl"
14 | # 数字->单词
15 | IDX_WORD_PATH="D:/2021UCAS/AdvancedAI/TextSum/dataset/idx2word.pkl"
16 | 
17 | #------------------ 词典设置 ------------------#
18 | # 特殊符号
19 | PAD_WORD = '<pad>'
20 | UNK_WORD = '<unk>'
21 | BOS_WORD = '<bos>'
22 | EOS_WORD = '<eos>'
23 | PAD_NUM = 0
24 | UNK_NUM = 1
25 | BOS_NUM = 2
26 | EOS_NUM = 3
27 | # 词典大小(拉满就不会出现UNK),注意输入至网络时要加4（还有四个特殊字符）
28 | VOCAB_SIZE=10000
29 | # 最长原文序列长度
30 | MAX_SOURCE_LEN=2193
31 | # 最长摘要序列长度
32 | MAX_SUMMARY_LEN=587
33 | 
34 | # 限定序列长度（长于此长度做切割，短于此长度做padding）
35 | SOURCE_THRESHOLD=1800
36 | SUMMARY_THRESHOLD=550
37 | # 读取数据时的标志
38 | TRAIN_FALG=0
39 | VAL_FALG=1
40 | TEST_FALG=2
41 | # 数据清理规则
42 | # 顺序莫变！
43 | PATTERNS_ONCE=[
44 |     "by .*? published :.*?\. \| \..*? [0-9]+ \. ",
45 |     "by \. .*? \. ",
46 |     "-lrb- cnn -rrb- -- ",
47 |     "\t(.*?-lrb- .*? -rrb- -- )",
48 |     ]
49 | PATTERNS_ANY=[
50 |     "``|''"
51 |     ]
52 | 
53 | #------------------ 其他设置 ------------------#
54 | DEVICE=torch.device("cuda:0")
55 | EPOCHS=10
56 | BATCH_SZIE=28
57 | 
58 | 
59 | #------------------ 预训练模型设置 ------------------#
60 | 
61 | # 搜索束个数
62 | NUM_BEAMS=1
63 | # 预测序列最大长度
64 | MAX_LEN=590
65 | # 预测序列最小长度
66 | MIN_LEN=30
67 | 


--------------------------------------------------------------------------------
/source/temp.py:
--------------------------------------------------------------------------------
 1 | # import re
 2 | # s1="3   by . daily mail reporter . published : . 15:34 est , 13 july 2012 . | . updated : . 01:33 est , 16 july 2012 . kelsey grammer 's wife kayte has given birth to their first child together . the boss actor , 57 , and his 32-year-old spouse -- who were expecting twins -- are ` thrilled ' after welcoming a ` healthy baby girl ' weighing 6lbs 2oz into the world this morning in los angeles , and they have named her faith evangeline elisa grammer . but the couple revealed they tragically lost their unborn son shortly after announcing kayte was pregnant with twins . joy and heartache : kelsey grammer and kayte walsh , pictured in chicago esterday , have welcomed a baby girl , but also revealed they lost a twin boy during the pregnancy . in a personal note , they said : ` early . this morning kayte gave birth to faith evangeline elisa grammer . we . are thrilled . she was 6lbs 2oz when she entered the world at 1am on the . 13th of july in the year 2012 . mother and child are in excellent . health . ' ` we were ecstatic earlier this year , . when we announced that kayte was carrying twins . tragically we lost the . little boy shortly thereafter . this was not something we cared to make . known publicly at the time . ' ` it was unspeakably painful and we . know that people will understand our desire to keep the news private . then , as we know they will respect our privacy in this matter now . a . glorious birth with a lingering sadness is ours today . ` we choose to celebrate the life that has been given us ' : the pair released an emotional statement today . ` healthy baby girl ' : they have named the baby , who weighs 6lbs 2oz , faith evangeline elisa grammer . ` we choose to celebrate the life that . has been given us . we proudly introduce our faith to the world today . looking forward to the days ahead and the children yet to come . ' the couple -- who got married in . february 2011 and renewed their vows in june -- previously lost a child . when kayte suffered a miscarriage in 2010 . kelsey already has four kids , . spencer , 28 , and greer , 19 , from previous relationships and 10-year-old . mason and jude , seven , with ex-wife camille donatacci . the couple went public with their romance just weeks after he split from the real housewives of beverly hills star . ex wife : kelsey with real housewives star camille and their children jude and mason in 2008 .	 kayte gave birth to a ` healthy baby girl ' named faith evangeline elisa this morning .   couple reveal ` unspeakable ' pain at losing twin boy during pregnancy .   celebrating a ` glorious birth ' with ` lingering sadness '"
 3 | # s2="2   by . daily mail reporter . published : . 00:04 est , 14 july 2012 . | . updated : . 01:30 est , 16 july 2012 . sylvester stallone was said to have almost collapsed with grief on learning of the death of his son yesterday . the body of sage stallone , 36 , was found by his housekeeper at his los angeles home . prescription drugs were reportedly found nearby but police said it was too early to say whether they were the cause of his death . tragedy : sylvester stallone 's son sage was found dead this afternoon in his los angeles apartment after a suspected drug overdose . he was 36 , pictured here in 2006 in hollywood . a source close to stallone said : . ` when he heard the news , sly was shocked , short of breath and almost . collapsed . he just went quiet before sobbing uncontrollably . he is a . wreck at the moment . ' sage 's aunt melanie hart told the mail on sunday : ` people are speculating that it was suicide but we really have no idea . ' there were unconfirmed reports that . sage , whose mother is stallone 's first wife sasha czack , had been dead . for four days before his body was found . a source told radaronline that medics . arrived on the scene at 3.05 pm this afternoon and spent around 25 . minutes trying to revive sage before his death was pronounced at the . scene . his body was taken straight to the coroner 's office - and the insider claims no suicide note was found . ' i suspect he had been dead for quite a while when he was discovered , ' the source told the website . ` usually medics will be at the scene . for around 45 minutes but they were out of there within half an . hour . ` there were a number of prescription bottles found at the scene but it did not appear to be suicide and no note was found . ' pronounced dead at the scene : the coroner 's van was spotted at sage 's home in los angeles along with news crews . unresponsive : the filmmaker 's body was taken straight to the coroner 's office - and not to the hospital . a 9-1-1 call was placed shortly . before 3pm and the caller said sage was n't breathing and indicated it . could be a drug overdose , radar reports . an autopsy is scheduled to take place in the next 48 hours . shortly after news of sage 's death , a . spokesman released a statement on behalf of his action hero father , 66 , . who was at the comic con film convention in san diego yesterday . ` sylvester stallone is devastated and . grief-stricken over the sudden loss of his son , ' the actor 's . spokesperson michelle bega said in the statement . ` his compassion and thoughts are with sage 's mother , sasha . ' sudden death : the body of the 36-year-old sage stallone was brought out to the coroner 's van in los angeles . devastated : sly 's agent released a statement saying he was ` grief-striken ' at the loss of his son . mystery : an autopsy is scheduled to take place in the next 48 hours to determine the cause of death . earlier : sly was at comic com yesterday evening . red carpet smiles : sage pictured in 1996 at the hollywood premiere of daylight with his father sylvester and his now-wife jennifer flavin . double act : sage appeared alongside his father in the 1990 movie rocky v , playing the role of rocky 's son robert balboa . ` he was a very talented and wonderful young man . his loss will be felt forever . ' police said they found the younger . stallone in the home while responding to a ` welfare check ' , however . sage 's lawyer george braunstein said he was found by a housekeeper . friends and acquaintances had become concerned because they had n't heard from sage in the past day . braunstein said the death came as a shock , telling the new york post this afternoon : ` he was in good spirits , and working . on all kinds of projects . ` he was planning on getting married . i am just devastated . he was an extremely wonderful , loving guy . this is a tragedy . ' before the heartbreak : stallone was pictured yesterday with arnold schwarzenegger at the comic con film convention in san diego . sage moonblood stallone was the . oldest of sylvester stallone 's children and co-starred with his father . in two films . he was the first of two sons stallone had with first wife . sasha czack . he made his acting debut in 1990 's . rocky v - he played his stallone 's onscreen son - and also appeared with . his father in 1996 's daylight . hand in hand : sylvester pictured back in 1982 with his first wife sasha czack , sage 's mother . also in 1996 , sage stallone and . veteran film editor bob murawski co-founded grindhouse releasing , a . company dedicated to preserving and promoting the b-movies and . exploitation films of the 1970s and 80s . he also directed the 2006 short vic , which screened at the palm springs film festival . braunstein said sage had frequent requests to work on films . ` he was a full of life filmmaker with . his whole future ahead of him , ' he said . ` he was just very up and . enthusiastic and positive . ' i think it was probably some sort of accident , ' he said of the death . braunstein added that sage stallone greatly admired his father but was working hard to make his own name in the film industry . ` he was very proud of his father and proud to be his father 's son , ' he said . stallone 's split from sage 's mother czack in 1985 after 11 years together . they also have a another son . seargeoh , 32 , who is autistic . stallone went on to wed model and actress brigitte . nielsen in beverly hills but they split just two . years later in a very public divorce . he married third wife , jennifer . flavin , in 1997 after an eight-year on-again , off-again relationship and . they have three daughters : sophia rose , 15 , sistine rose , 14 , and . scarlet rose , 10 . sage , who was raised by his mother following his parents ' divorce , felt distant from his father growing up , a theme which hit home as they were filming rocky v together . big boots to fill : sage said he always worried about living up to his father 's success , seen here together again in rocky v . ` when i was screaming , `` you never spent time with me ! you never spent time with my mother ! '' - that was true , ' he told people magazine in 1996 . ` i was looking into my father 's face and really saying that . ' but it proved a turning point for the father and son , who went on to form a close bond and they acted again together in the 1996 film daylight . ` between takes , sly and sage would roll around in the dirt like two puppies , ' the director rob cohen observed at the time . sage certainly felt the pressure of growing up with such a famous father and would worry that he would never match his success . ` i tell him , `` as long as you give it your best , that 's all that matters , '' his mother sasha said in that same year . sage went on to pursue a career behind the camera and shunned the wild hollywood party scene , preferring to watch horror zombie films instead . ` people call me a hermit , ' he said while promoting the film . ` but i 'm happy . '	 star ` devastated and grief-stricken ' over sudden loss of his eldest child .   sage played the 66-year-old 's onscreen son in rocky v .   an autopsy is scheduled to take place in the next 48 hours after filmmaker was found next to prescription drugs ."
 4 | # s3="1   -lrb- cnn -rrb- -- to resolve america 's ongoing , bruising battle over the debt and deficit , house republican paul ryan and senate democrat patty murray announced a deal on december 10 to halt spending cuts -- mostly in defense -- and lock in a two-year budget agreement to avoid another government shutdown on january 15 . but in eagerly seeking agreement with the republicans who shut the government down in october , democrats risk hurting the economy 's fragile recovery by accepting too much budget austerity embedded in the newly adopted budget . president obama and the democrats won big over the republicans in october 's budget fight . instead of pressing their advantage , democrats took tax increases for the rich off the table , agreed to cut federal pensions and did not get unemployment benefits extended . the democrats basically threw away their political gains . the deal repeals less than half of the sequestration cuts planned for 2014 . if obama and congress continue their shortsighted obsession with austerity and budget cuts , they ignore the big economic lesson from the past several years : austerity hurts prosperity . the congressional budget office estimated that repealing the entire 2013-2014 spending cuts would increase gross domestic product by $ 113 billion and create 900,000 additional jobs next year . the october 2013 government shutdown took another $ 24 billion out of the gdp . unemployment remains stuck around 7 % . though the deal reduces a bit of fiscal uncertainty , it hardly affected the u.s. growth forecasts for big banks , despite bank economists citing some pessimism because of `` austerity shock '' from spending cuts and `` uncertainty shock '' from washington 's continued fiscal battles . republicans bargain for more cuts and fewer taxes , but cutting military spending makes them nervous , so they attack social security and medicare . the wall street-affiliated democratic group third way is helping . it launched an attack on sen. elizabeth warren , d-massachusetts , and others who rightly refuse to cut social security as part of a long-term budget solution . we all know that republicans like to defend the wealthy and slash government . but why does austerity , especially cuts to old-age programs , have credibility with obama and other democrats ? advocates of `` grand bargains , '' cutting programs to balance the budget , wrongly presume the budget is a fixed quantity . they imagine it like a fixed pie . programs for the young , like education , must be paid for by cutting other programs , like social security . but their belief that a dollar taken from the old will be spent on the young is not only divisive , mean and fierce -- it is wrong . in his december 6 speech on inequality , obama talked about the sky-high and stubborn child poverty rate : more than 24 % . but cutting social security and medicare will only destabilize the economy and increase the elderly poverty rate . in many countries , programs for elderly people are not traded off against help for the young . when support for old-age programs increases , so does spending on children . advanced democratic countries ' spending on the elderly is positively correlated with education spending . one analysis shows that a 10 % increase in spending on education is correlated with a 7.3 % increase in spending on pensions . the congressional budget office warns that long-term deficits can hurt the economy . want to reduce the debt and deficit ? tax the wealthy , which wo n't hurt the economy . economists emmanuel saez and thomas piketty estimate that raising the tax rate for the top 1 % as high as 80 % would generate far more revenue . sen. tom harkin , d-iowa , and rep. peter defazio , d-oregon , propose a transactions tax -- a three-penny charge on every $ 100 traded in the stock market , which the congressional budget office estimates would raise $ 352 billion over 10 years . this small tax would also reduce stock churning by speculators , creating a nice secondary benefit . want to find even more savings ? sen. harry reid , d-nevada , wisely put tax loopholes that cost the treasury almost a trillion dollars per year on the table . for example , reid called for eliminating the small , but noxious , tax break for buying yachts and the $ 17 billion break that comes from taxing private equity , real estate and hedge fund profits as `` carried interest '' rather than at the ordinary income rate of 39.6 % instead of the capital gains rate of 20 % . there is one piece of good news : the deficit is coming down , from 9.2 % when obama took office to 4.1 % of gdp in 2017 . faster economic growth would shrink the deficit more rapidly . in contrast , further spending cuts will slow the economy and deficit reduction along with it . so , this is no time for obama to accept a lower budget path , or to consider cuts in social security and medicare . the small budget deficit reductions in this deal -- less than one-half of 1 % of the total debt or $ 23 billion -- would almost pay for extended unemployment benefits for one year at $ 25 billion . democrats are flinching under continued pressure from republicans playing out their long game as they ready for another bitter fight when the debt limit is reached next spring . but the president and the democrats have a winning economic and political strategy : raise revenues and keep social security and medicare strong . do n't throw october 's hard-won victory away ; it wo n't help the elderly , it wo n't help children , and it wo n't help the economy . the opinions expressed in this commentary are solely those of rick mcgahey and teresa ghilarducci .	 democrats and republicans reach a deal on the budget .   rick mcgahey , teresa ghilarducci : austerity in budget will hurt our economy .   they say president obama should not make cuts to programs like social security .   mcgahey , ghilarducci : taxing the wealthy would generate revenue and cut deficit ."
 5 | # s4="123 new york -lrb- cnn -rrb- -- preliminary tests indicate ricin was found in letters sent this past weekend to new york mayor michael bloomberg , new york deputy police commissioner paul browne said wednesday . browne said the letters to bloomberg contained a threat to the mayor and mentioned the debate on gun laws . `` the letter obviously , referred to our anti-gun efforts but there 's 12,000 people -lrb- who -rrb- are going to get killed this year with guns and 19,000 that are going to commit suicide with guns , and we 're not going to walk away from those efforts , '' bloomberg said . one letter addressed to the mayor 's office was opened at the city government 's mail facility , browne said . the suspicious material found in the two letters was a `` pink-orange oily substance , '' he said , adding that it was the second of two tests that showed what appeared to be trace amounts of ricin . what is ricin ? the substance is being tested at the national bioforensic analysis center in maryland , with conclusive results expected by friday . some of the emergency services workers who touched the letter friday were examined after they showed minor intestinal symptoms of ricin exposure on saturday , browne said . the symptoms have since subsided . civilian workers showed no symptoms , browne said in a statement . `` we take a lot of security measures as you know , '' bloomberg said . `` the men and women that open the mail for example ... they are well trained . '' the second letter to the mayor was opened by mark glaze , director of mayors against illegal guns -- founded and co-chaired by bloomberg -- in washington on sunday . browne 's statement appeared to indicate glaze showed no symptoms . a spokeswoman for the organization declined to comment wednesday . opinion : ricin - almost never deadly . both letters were postmarked in shreveport , louisiana , on may 20 , the american postal workers union said on its website . bloomberg is an outspoken critic of current gun laws . in march , he said nationwide background checks on all gun sales would save lives . `` we know that 's true , because in states that already require background checks on private sales , the rate of women murdered by an intimate partner armed with a gun is 38 % lower than in states that do n't have such background checks , '' he said . fbi spokesman jim margolin told cnn the agency is working to determine from where the letters were sent and who sent them . if inhaled , injected or ingested , less than a pinpoint of ricin can kill a person within 36 to 48 hours because of the failure of the respiratory and circulatory systems . there is no known antidote for the toxin , which is derived from castor beans . it has been included in letters in the past few months sent to president barack obama and other officials . in april , letters were sent to obama ; sen. roger wicker , r-mississippi ; and sadie holland , a judge in lee county , mississippi . james everett dutschke of tupelo , mississippi , has been charged with possession and use of a biological agent in connection with the case . last week , fbi agents arrested matthew ryan buquet after a grand jury charged him with mailing threatening communication to a senior judge in the u.s. district court for the eastern district of washington state . the fbi said in a statement that tests -- conducted by that agency and the spokane regional health district -- showed that a suspicious substance found with the letter was `` active ricin toxin . '' there are no indications the cases are connected . man , 37 , arrested in probe of washington state ricin-laced letters . cnn 's deborah feyerick , jason kessler , lawrence crook iii , carol cratty and mary snow contributed to this report .	 new : suspicious substance was oily , new york police official says .   new : postal union says letters were postmarked in shreveport , louisiana .   letters were addressed to bloomberg , one went to an organization he founded .   ricin is a toxin that can kill in a matter of days . "
 6 | # s5="975 london , england -lrb- cnn -rrb- -- human rights and freedom of the press in china , the detention of terrorist suspects by the united states and russia 's treatment of political dissent are the focus of scrutiny in amnesty international 's annual report , released wednesday , which looks at the state of human rights around the world . amnesty international protestors outside the us supreme court in january dressed as guantanamo bay detainees . the 398-page report comes 60 years after the united nations adopted the universal declaration of human rights , and amnesty says governments still need to act on their promises . `` the biggest threat to the future of human rights is the absence of a shared vision and collective leadership , '' the organization said in a statement . irene khan , amnesty 's secretary-general , said that in particular , `` the human-rights flash points in darfur , zimbabwe , gaza , iraq and myanmar demand immediate attention . '' the report , the group said , `` reveals a world riven by inequality , scarred by discrimination and distorted by political repression . '' according to its count , people are tortured or subject to other ill treatment in at least 81 countries , face unfair trials in at least 54 countries and are not allowed to express themselves freely in at least 77 countries . of the 150 countries and regions listed in the report , amnesty paid particular attention to china , the host of this summer 's olympic games . the group said growing numbers of human rights activists were imprisoned or harassed in china in 2007 , with ethnic and religious minorities -- including tibetans , falun gong practitioners and christians -- repressed or persecuted . death penalty statistics in china are difficult to assess , amnesty said , but based on public reports , the group estimated that at least 470 people were executed in 2007 . amnesty also noted the repression of free speech in china and said censorship of the internet and other media intensified last year . `` the chinese authorities maintained efforts to tightly control the flow of information , '' the report said . `` they decided what topics and news stories could be published , and media outlets were sometimes required to respond within minutes to government directives . the authorities continued to block web sites and to filter internet content based on specified words and topics . '' around 30 journalists and at least 50 others are known to be in prison for posting their views online , amnesty said . amnesty also criticized the death penalty in the united states , where 42 people were executed last year . it noted new jersey 's decision in december to abolish the death penalty made it the first u.s. state in more than 40 years to do away with executions . as it has in previous annual reports , amnesty criticized the detention of hundreds of foreign nationals at the u.s. naval base at guantanamo bay , cuba . `` the usa must close guantanamo detention camp and secret detention centers , prosecute the detainees under fair trial standards or release them , and unequivocally reject the use of torture and ill-treatment , '' amnesty said . the group noted that guantanamo detainees are held indefinitely , most of them without charge and without recourse to u.s. courts . most detainees there are held in isolation in maximum-security facilities , heightening concerns for their physical and mental health , amnesty said . in fact , more is written on the united states than any other country listed in the report . asked about that at a press conference tuesday , khan said , `` we certainly devote a lot of time to sudan , to china , to zimbabwe and other countries . but we look to the u.s. to provide leadership around the world . governments around the world look to the united states as a role model for their own behavior . '' in a lengthy section on iraq , amnesty noted that thousands of civilians , including children , were killed or injured in ongoing sectarian violence during 2007 . `` all sides involved in the fighting committed gross human rights violations , some of which amounted to war crimes and crimes against humanity , '' the report said . abductions , torture and murder , with bodies left in the street , occur daily , and the violence has caused 2 million iraqis to flee to syria , jordan and elsewhere , amnesty said . u.s. forces held some 25,000 detainees `` without charge or trial , '' the group said , and 33 people were executed , `` some after grossly unfair trials . '' in afghanistan , conflict and insecurity aggravated by drought and floods contributed to `` large-scale displacement '' of people throughout the year . `` at least 6,500 people were estimated to have been killed in the context of the conflict , '' the report said . `` violations of international humanitarian and human rights law were committed with impunity by all parties , including afghan and international security forces and insurgent groups . '' russia must show greater tolerance for political dissent , amnesty said . `` the russian authorities were increasingly intolerant of dissent or criticism , branding it ` unpatriotic , ' '' the report said . `` a crackdown on civil and political rights was evident throughout the year and in particular during the run-up to the state duma -lsb- parliament -rsb- elections in december . '' the european court of human rights ruled that russia was responsible for enforced disappearances , torture and extrajudicial executions in 15 judgments relating to the recent conflict in chechnya , amnesty said . there were fewer reported disappearances in the chechen republic in 2007 than in previous years , amnesty said , but continued human rights violations made people reluctant to report abuses . the report also criticized human rights conditions in iran , gaza and myanmar . human rights conditions in zimbabwe continued to decline in 2007 , the report said , `` with an increase in organized violence and torture and restrictions on the rights to freedom of association , assembly and expression . '' members of the main opposition party , the mdc , along with other human rights defenders , were arrested , and many were tortured while in custody , amnesty said . some 4 million people required food aid because of the nation 's deteriorating economy , and victims of forced evictions in 2005 continued to live in `` deplorable conditions '' while president robert mugabe 's government failed to remedy their situation . `` human rights problems are not isolated tragedies , but are like viruses that can infect and spread rapidly , endangering all of us , '' khan said . `` governments today must show the same degree of vision , courage and commitment that led the united nations to adopt the universal declaration of human rights 60 years ago . ''"
 7 | # s6="6737	by . eleanor crooks , press association . maria sharapova reached her third successive french open final by battling past eugenie bouchard . sharapova maintained her remarkable record in three-set matches by winning an 18th consecutive deciding set on clay in a 4-6 , 7-5 , 6-2 victory . the russian won her first title at roland garros in 2012 before losing to serena williams 12 months ago . on form : maria sharapova fought back from a set down to overcome a stiff challenge from eugenie bouchard . rising star : eugenie bouchard , 20 , was playing in her second consecutive grand slam semi-final . bouchard , . who was playing in her second straight grand slam semi-final , had lost . comfortably to sharapova in the second round last year and demonstrated . again the huge strides she has made . she possesses the same steely-eyed determination as sharapova and her mental strength is remarkable for a 20-year-old . the . canadian said after beating angelique kerber in the fourth round that . she did not have a best friend in tennis , adding : ' i do n't think the . tennis tour is the place to have friends . for me , it 's all competition . ' it . was a sentence that could well have been written by sharapova so it was . no surprise that this was not a match for the faint-hearted . bouchard . has improved significantly since making the last four at the australian . open in january , hitting the ball a lot more aggressively , and it was . she who struck first with a break for 2-1 . pumped up : sharapova celebrates as she comes back from a set down to seal her place in the final . sharapova . fought back to level at 4-4 but bouchard forged ahead again immediately . and held to take the set , saving a break point with the gutsiest of . backhand winners onto the line . sharapova . had recovered from a set down in both her last two matches against sam . stosur and garbine muguruza and set about doing the same , moving into a . 5-2 lead . but . the russian 's serve , never something to be relied upon , was having an . off day and , serving for the set , she twice double-faulted on set point . rising star : bouchard gets down low to play a forehand as she takes the first set over sharapova . there was also a second-serve ace on a break point for good measure but on her third chance bouchard pounced . the . 20-year-old was unable to resist when sharapova broke again at 5-5 , . though , and this time the seventh seed clinched the set when bouchard . netted a forehand . bouchard . had never lost a grand slam match in which she had won the first set . before but the sense was sharapova 's prowess in deciding sets would be . the crucial factor . scene of success : sharapova will play in her third consecutive french open final on saturday . the russian moved ahead at 3-1 , and for the first time bouchard was making bad mistakes on the big points . she . held for 4-2 , saving two break points , but in the next game missed a . routine forehand and a volley as sharapova moved to within one game of . victory . bouchard . fought on , saving four match points in terrific style , but there was . nothing she could do when a sharapova forehand fizzed off the baseline . after two hours and 27 minutes . french kiss : sharapova acknowledges the roland garros crowd after semi-final victory .	 sharapova fought back from a set down to beat canadian bouchard .   the 2012 champion won 4-6 , 7-5 , 6-2 in two hours and 27 minutes .   sharapova will play simona halep in saturday 's final at roland garros . "
 8 | # s7="0	editor 's note : in our behind the scenes series , cnn correspondents share their experiences in covering news and analyze the stories behind the events . here , soledad o'brien takes users inside a jail where many of the inmates are mentally ill . an inmate housed on the `` forgotten floor , '' where many mentally ill inmates are housed in miami before trial . miami , florida -lrb- cnn -rrb- -- the ninth floor of the miami-dade pretrial detention facility is dubbed the `` forgotten floor . '' here , inmates with the most severe mental illnesses are incarcerated until they 're ready to appear in court . most often , they face drug charges or charges of assaulting an officer -- charges that judge steven leifman says are usually `` avoidable felonies . '' he says the arrests often result from confrontations with police . mentally ill people often wo n't do what they 're told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid , delusional , and less likely to follow directions , according to leifman . so , they end up on the ninth floor severely mentally disturbed , but not getting any real help because they 're in jail . we toured the jail with leifman . he is well known in miami as an advocate for justice and the mentally ill . even though we were not exactly welcomed with open arms by the guards , we were given permission to shoot videotape and tour the floor . go inside the ` forgotten floor ' '' at first , it 's hard to determine where the people are . the prisoners are wearing sleeveless robes . imagine cutting holes for arms and feet in a heavy wool sleeping bag -- that 's kind of what they look like . they 're designed to keep the mentally ill patients from injuring themselves . that 's also why they have no shoes , laces or mattresses . leifman says about one-third of all people in miami-dade county jails are mentally ill . so , he says , the sheer volume is overwhelming the system , and the result is what we see on the ninth floor . of course , it is a jail , so it 's not supposed to be warm and comforting , but the lights glare , the cells are tiny and it 's loud . we see two , sometimes three men -- sometimes in the robes , sometimes naked , lying or sitting in their cells . `` i am the son of the president . you need to get me out of here ! '' one man shouts at me . he is absolutely serious , convinced that help is on the way -- if only he could reach the white house . leifman tells me that these prisoner-patients will often circulate through the system , occasionally stabilizing in a mental hospital , only to return to jail to face their charges . it 's brutally unjust , in his mind , and he has become a strong advocate for changing things in miami . over a meal later , we talk about how things got this way for mental patients . leifman says 200 years ago people were considered `` lunatics '' and they were locked up in jails even if they had no charges against them . they were just considered unfit to be in society . over the years , he says , there was some public outcry , and the mentally ill were moved out of jails and into hospitals . but leifman says many of these mental hospitals were so horrible they were shut down . where did the patients go ? nowhere . the streets . they became , in many cases , the homeless , he says . they never got treatment . leifman says in 1955 there were more than half a million people in state mental hospitals , and today that number has been reduced 90 percent , and 40,000 to 50,000 people are in mental hospitals . the judge says he 's working to change this . starting in 2008 , many inmates who would otherwise have been brought to the `` forgotten floor '' will instead be sent to a new mental health facility -- the first step on a journey toward long-term treatment , not just punishment . leifman says it 's not the complete answer , but it 's a start . leifman says the best part is that it 's a win-win solution . the patients win , the families are relieved , and the state saves money by simply not cycling these prisoners through again and again . and , for leifman , justice is served . e-mail to a friend ."
 9 | # s8="21  lagos , nigeria -lrb- reuters -rrb- -- nigeria 's television survival show has been"
10 | # pat1="by .*? published :.*?\. \| \..*? [0-9]+ \. "
11 | # pat2="-lrb- cnn -rrb- -- "
12 | # pat3="\t(.*?-lrb- .*? -rrb- -- )"
13 | # pat4="``|''"
14 | # pat5="by \. .*? \. "
15 | # res=re.sub(pat1,"",s4,1)
16 | # res=re.sub(pat2,"",res,1)
17 | # res=re.sub(pat3,"",res,1)
18 | # res=re.sub(pat4,"",res)
19 | # res=re.sub(pat5,"",res,1)
20 | # print(res)
21 | # import torch
22 | # from torch import nn
23 | 
24 | # transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
25 | # src = torch.rand((10, 32, 512))
26 | # tgt = torch.rand((20, 32, 512))
27 | # out = transformer_model(src, tgt)
28 | # print(out.shape)
29 | 
30 | # l=[1,2,3,4]
31 | 
32 | # print([i if i!=4 else 5 for i in l])
33 | 
34 | 
35 | # import utils
36 | 
37 | # a=["hello hello","hi world"]
38 | # b=["hello hello","world"]
39 | 
40 | # utils.GetRouge(a,b)
41 | 
42 | # import torch
43 | # from transformers import T5Tokenizer, T5Model
44 | 
45 | # tokenizer=T5Tokenizer.from_pretrained('t5-small')
46 | # text = ['Hello world!', 'Hello python!']
47 | # inputs = tokenizer(text, return_tensors='pt', padding=True)
48 | # print(inputs)
49 | 
50 | from transformers import T5Tokenizer, T5ForConditionalGeneration,AdamW
51 | import torch
52 | from settings import *
53 | 
54 | tokenizer = T5Tokenizer.from_pretrained(PARAM_DIR+"t5-small")
55 | model = T5ForConditionalGeneration.from_pretrained(PARAM_DIR+"t5-small")
56 | 
57 | # the following 2 hyperparameters are task-specific
58 | max_source_length = 512
59 | max_target_length = 128
60 | 
61 | # Suppose we have the following 2 training examples:
62 | input_sequence_1 = "Welcome to NYC"
63 | output_sequence_1 = "Bienvenue à NYC"
64 | 
65 | input_sequence_2 = "HuggingFace is a company as e dd"
66 | output_sequence_2 = "HuggingFace est une entreprise"
67 | 
68 | # encode the inputs
69 | task_prefix = "translate English to French: "
70 | input_sequences = [input_sequence_1, input_sequence_2]
71 | encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
72 |                      padding='longest', 
73 |                      max_length=max_source_length, 
74 |                      truncation=True, 
75 |                      return_tensors="pt")
76 | input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
77 | 
78 | # encode the targets
79 | target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
80 |                             padding='longest', 
81 |                             max_length=max_target_length, 
82 |                             truncation=True)
83 | labels = target_encoding.input_ids
84 | 
85 | # replace padding token id's of the labels by -100
86 | labels = [
87 |            [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels
88 | ] 
89 | labels = torch.tensor(labels)
90 | loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
91 | loss.backward()
92 | opt=AdamW(model.parameters())
93 | opt.step()
94 | 
95 | print(loss)


--------------------------------------------------------------------------------
/source/utils.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import os
  3 | from torch import nn
  4 | from torch import optim
  5 | from torch.nn.modules.module import Module
  6 | from tqdm.std import tqdm
  7 | from settings import *
  8 | import json
  9 | import pickle as pkl
 10 | import re
 11 | from torch.utils.data.dataset import Dataset
 12 | from torch.utils.data.dataloader import DataLoader
 13 | import torch
 14 | from rouge import Rouge
 15 | import models
 16 | 
 17 | ############################### Just run for one time! ###############################
 18 | def Preprocess(train_path=DATA_DIR+"train_dataset.csv",test_path=DATA_DIR+"test_dataset.csv"):
 19 |     '''
 20 |     清理数据、划分验证集后重新保存至新文件
 21 |     '''
 22 |     
 23 |     # 数据清洗
 24 |     def _cleanData(data):
 25 |         print("数据清洗开始=========================================")
 26 |         
 27 |         clean_data=[]
 28 |         for i,d in tqdm(enumerate(data)):
 29 |             res=d
 30 |             for pat in PATTERNS_ONCE:
 31 |                 #################################之后修改
 32 |                 if("\t" in pat):
 33 |                     res=re.sub(pat,"\t",res,1)
 34 |                 else:
 35 |                     res=re.sub(pat,"",res,1)
 36 |                 ####################################
 37 |             for pat in PATTERNS_ANY:
 38 |                 res=re.sub(pat,"",res)
 39 |             
 40 |             clean_data.append(res)
 41 | 
 42 |         print("数据清洗完毕=========================================")
 43 |         return clean_data
 44 |     
 45 |     # 将处理后的数据保存为json文件
 46 |     def _save2Json(data,mode):
 47 |         
 48 |         
 49 |         if mode==2:
 50 |             
 51 |             for i in range(len(data)): 
 52 |                 source=data[i].split('\t')[1].strip('\n')
 53 |                 if source!='': 
 54 |                     dict_data={"text":source,"summary":'no summary'}#测试集没有参考摘要
 55 |                     
 56 |                     with open(new_test_path+str(i)+'.json','w+',encoding='utf-8') as f:
 57 |                         f.write(json.dumps(dict_data,ensure_ascii=False))
 58 |                     
 59 |         
 60 |         else:
 61 |             
 62 |             for i in range(len(data)):
 63 |                 
 64 |                 if len(data[i].split('\t'))==3:
 65 |                     source_seg=data[i].split("\t")[1]
 66 |                     traget_seg=data[i].split("\t")[2].strip('\n')
 67 |                     
 68 |                     
 69 |                     if source_seg and traget_seg !='':
 70 |                         dict_data={"text":source_seg,"summary":traget_seg}
 71 |                         path=new_train_path
 72 |                         if mode==1:
 73 |                             path= new_val_path  
 74 |                         with open(path+str(i)+'.json','w+',encoding='utf-8') as f:
 75 |                             f.write(json.dumps(dict_data,ensure_ascii=False)) 
 76 |                         
 77 | 
 78 |     
 79 |     with open(train_path,'r',encoding='utf-8') as f:
 80 |         train_data_all=f.readlines()
 81 | 
 82 |     with open(test_path,'r',encoding='utf-8') as f:
 83 |         test_data=f.readlines()
 84 |     
 85 |     # 数据清洗
 86 |     train_data_all=_cleanData(train_data_all)
 87 |     test_data=_cleanData(test_data)
 88 | 
 89 |     # with open("./1.csv",'w',encoding='utf-8') as f:
 90 |     #     f.writelines(train_data_all)
 91 |     # with open("./2.csv",'w',encoding='utf-8') as f:
 92 |     #     f.writelines(test_data)
 93 |     # random.shuffle(train_data_all)
 94 |     
 95 |     # 设置新文件路径
 96 |     new_train_path=os.path.join(DATA_DIR,"new_train/")
 97 |     new_val_path=os.path.join(DATA_DIR,"new_val/")
 98 |     new_test_path=os.path.join(DATA_DIR,"new_test/")
 99 | 
100 |     if not os.path.exists(new_train_path):
101 |         os.makedirs(new_train_path)
102 | 
103 |     if not os.path.exists(new_val_path):
104 |         os.makedirs(new_val_path)
105 | 
106 |     if not os.path.exists(new_test_path):
107 |         os.makedirs(new_test_path)
108 | 
109 |     train_data=train_data_all[:8000] #把训练集重新划分为训练子集和验证子集，保证验证集上loss最小的模型，预测测试集
110 |     val_data=train_data_all[8000:]
111 | 
112 |     _save2Json(train_data,TRAIN_FALG)
113 |     _save2Json(val_data,VAL_FALG)
114 |     _save2Json(test_data,TEST_FALG)
115 |     
116 | 
117 | def CountFiles(path):
118 |     '''
119 |     计算目标文件夹json文件数目
120 |     '''
121 |     matcher = re.compile(r'[0-9]+\.json')
122 |     match = lambda name: bool(matcher.match(name))
123 |     names = os.listdir(path)
124 |     n_data = len(list(filter(match, names)))
125 |     return n_data
126 | 
127 | def BuildVocabCounter(data_dir=DATA_DIR):
128 |     '''
129 |     统计所有词汇，建立词频表
130 |     '''
131 |     from collections import Counter
132 |     
133 |     def GetTokens(path):
134 |         n_data=CountFiles(path)
135 |         summary_words=[]
136 |         source_words=[]
137 |         for i in range(n_data):
138 |             js_data=json.load(open(os.path.join(path,f"{i}.json"),encoding="utf-8"))
139 |             summary=''.join(js_data['summary']).strip()
140 |             summary_words.extend(summary.strip().split(' '))
141 |             
142 |             source=''.join(js_data['text']).strip()
143 |             source_words.extend(source.strip().split(' '))
144 | 
145 |         return source_words+summary_words
146 | 
147 |     # print(_count_data(data_dir+"new_train"))
148 |     vocab_counter=Counter()
149 |     vocab_counter.update(t for t in GetTokens(data_dir+"new_train") if t !="")
150 |     vocab_counter.update(t for t in GetTokens(data_dir+"new_val") if t !="")
151 |     vocab_counter.update(t for t in GetTokens(data_dir+"new_test") if t !="")
152 |     # print(vocab_counter.values())
153 | 
154 |     with open(VOCAB_PATH,"wb") as f:
155 |         pkl.dump(vocab_counter,f)
156 | 
157 | def MakeVocab(vocab_size=VOCAB_SIZE):
158 |     '''
159 |     建立词典，通过vocab_size设置字典大小，将常用词设置到字典即可，其他生僻词汇用'<unk>'表示
160 |     '''
161 |     with open(VOCAB_PATH,"rb") as f:
162 |         wc=pkl.load(f)
163 |     word2idx, idx2word = {}, {}
164 |     word2idx[PAD_WORD] = 0
165 |     word2idx[UNK_WORD] = 1
166 |     word2idx[BOS_WORD] = 2
167 |     word2idx[EOS_WORD] = 3
168 |     for i, (w, _) in enumerate(wc.most_common(vocab_size), 4):
169 |         word2idx[w] = i
170 |     for w, i in word2idx.items():
171 |         idx2word[i] = w
172 |     
173 |     with open(WORD_IDX_PATH,"wb") as f:
174 |         pkl.dump(word2idx,f)
175 |     with open(IDX_WORD_PATH,"wb") as f:
176 |         pkl.dump(idx2word,f)
177 | 
178 | def GetNumOfLongestSeq():
179 |     '''
180 |     找到最长的seq长度，用于padding
181 |     '''
182 |     
183 |     def _findInFolders(path,length):
184 |         max_len=0
185 |         for i in range(length):
186 |             js_data=json.load(open(os.path.join(path,f"{i}.json"),encoding="utf-8"))
187 |             l_data=js_data["summary"].split(" ")
188 |             l=len(l_data)
189 |             if(max_len<len(l_data)):
190 |                 max_len=l
191 |         return max_len
192 |     
193 |     train_path=os.path.join(DATA_DIR,"new_train/")
194 |     val_path=os.path.join(DATA_DIR,"new_val/")
195 |     test_path=os.path.join(DATA_DIR,"new_test/")
196 | 
197 |     train_length=CountFiles(train_path)
198 |     val_length=CountFiles(val_path)
199 |     test_length=CountFiles(test_path)
200 |     
201 |     return max(
202 |         _findInFolders(train_path,train_length),
203 |         _findInFolders(val_path,val_length),
204 |         _findInFolders(test_path,test_length))
205 |     
206 | 
207 | 
208 | ############################### - - ###############################
209 | class TextDataset(Dataset):
210 |     '''生成TensorDataset'''
211 |     def __init__(self,flag,word2id:dict):
212 |         self.word2id=word2id
213 |         self.path=DATA_DIR
214 |         self.flag=flag
215 |         if(flag==TRAIN_FALG):
216 |             self.path+="new_train"
217 |         elif(flag==VAL_FALG):
218 |             self.path+="new_val"
219 |         elif(flag==TEST_FALG):
220 |             self.path+="new_test"
221 |         else:
222 |             raise Exception(f"No this flag:{flag}")
223 |     
224 |     def __len__(self):
225 |         return CountFiles(self.path)
226 | 
227 |     def __getitem__(self, index):
228 |         source=ReadJson2List(self.path,index)
229 |         summary=ReadJson2List(self.path,index,True)
230 |         # 处理summary中奇怪的问题
231 |         summary=[i for i in summary if (i!='' and i!=' ')]
232 |         # print(summary)
233 |         enc_x=[self.word2id[word] if word in self.word2id.keys() else UNK_NUM for word in source]
234 |         #padding
235 |         enc_x,enc_x_l=PaddingSeq(enc_x,SOURCE_THRESHOLD) 
236 |         
237 |         if(self.flag!=TEST_FALG):
238 |             dec_x=[self.word2id[word] if word in self.word2id.keys() else UNK_NUM for word in summary]
239 |             # decoder输入前面加上BOS、decoder的label最后加上EOS
240 |             y=list(dec_x);y.append(EOS_NUM)
241 |             y,y_l=PaddingSeq(y,SUMMARY_THRESHOLD)
242 | 
243 |             dec_x.insert(0,BOS_NUM)
244 |             dec_x,dec_x_l=PaddingSeq(dec_x,SUMMARY_THRESHOLD)
245 |         if(self.flag==TEST_FALG):
246 |             return (torch.LongTensor(enc_x),enc_x_l)
247 |         # 返回值依次为：编码器输入，编码器输入有效长度，解码器输入，解码器输入有效长度，标签，标签有效长度
248 |         return (torch.LongTensor(enc_x),enc_x_l),(torch.LongTensor(dec_x),dec_x_l),(torch.LongTensor(y),y_l)
249 | 
250 | def PaddingSeq(line,threshold):
251 |     """填充文本序列，直接填充转换完的index列表"""
252 |     p_len=len(line)
253 |     if(p_len>threshold):
254 |         if(EOS_NUM in line):
255 |             line[threshold-1]=EOS_NUM
256 |         return line[:threshold],threshold
257 |     return line + [PAD_NUM] * (threshold - len(line)),p_len
258 | 
259 | def ReadJson2List(dir,i,label=False):
260 |     '''读取单个json文件（一个样本），并按空格分割转换成列表'''
261 |     
262 |     js_data=json.load(open(os.path.join(dir,f"{i}.json"),encoding="utf-8"))
263 |     if label:
264 |         return js_data["summary"].split(" ")
265 |     return js_data["text"].split(" ")
266 | 
267 | 
268 | def GetRouge(pred,label):
269 |     '''获取ROUGR-L值'''
270 |     rouge=Rouge()
271 |     rouge_score = rouge.get_scores(pred, label)
272 |     rouge_L_f1 = 0
273 |     rouge_L_p = 0
274 |     rouge_L_r = 0
275 |     for d in rouge_score:
276 |         rouge_L_f1 += d["rouge-l"]["f"]
277 |         rouge_L_p += d["rouge-l"]["p"]
278 |         rouge_L_r += d["rouge-l"]["r"]
279 |     
280 |     return (rouge_L_f1 / len(rouge_score))
281 |     
282 |     print("rouge_f1:%.2f" % (rouge_L_f1 / len(rouge_score)))
283 |     print("rouge_p:%.2f" % (rouge_L_p / len(rouge_score)))
284 |     print("rouge_r:%.2f" % (rouge_L_r / len(rouge_score)))
285 | 
286 | 
287 | # 将数据转换为成batch的Tensor，win平台有bug，多进程不能写在函数里
288 | with open(WORD_IDX_PATH,"rb") as f:
289 |         w2i=pkl.load(f)
290 | train_iter=DataLoader(TextDataset(TRAIN_FALG,w2i),shuffle=True,batch_size=BATCH_SZIE,num_workers=8)
291 | val_iter=DataLoader(TextDataset(VAL_FALG,w2i),shuffle=False,batch_size=BATCH_SZIE,num_workers=4)
292 | test_iter=DataLoader(TextDataset(TEST_FALG,w2i),shuffle=False,batch_size=1)
293 | 
294 | def Train(net:Module,lr=0.01):
295 |     """训练序列到序列模型。"""
296 |     from tqdm import tqdm
297 | 
298 |     def xavier_init_weights(m):
299 |         if type(m) == nn.Linear:
300 |             nn.init.xavier_uniform_(m.weight)
301 |         if type(m) == nn.GRU:
302 |             for param in m._flat_weights_names:
303 |                 if "weight" in param:
304 |                     nn.init.xavier_uniform_(m._parameters[param])
305 | 
306 |     net.apply(xavier_init_weights)
307 |     net.to(DEVICE)
308 |     optimizer = optim.Adam(net.parameters(), lr=lr)
309 |     loss = models.MaskedSoftmaxCELoss()
310 |     
311 |     # 验证集loss降到10000以下时开始保存每轮更低的参数
312 |     min_loss=10000
313 |     for epoch in range(EPOCHS):
314 |         train_loss=[]
315 |         val_loss=[]
316 | 
317 |         net.train()
318 |         for batch in tqdm(train_iter):
319 |             (enc_X, enc_x_l), (dec_x, dec_x_l), (y,y_l) = [(x[0].to(DEVICE),x[1].to(DEVICE)) for x in batch]
320 |             
321 |             
322 |             pred, _ = net(enc_X, dec_x, enc_x_l)
323 |             l = loss(pred, y, y_l).sum()
324 |             l.backward()
325 |             
326 |             optimizer.step()       
327 |             optimizer.zero_grad()
328 | 
329 |             with torch.no_grad():
330 |                 train_loss.append(l.item())
331 |             
332 |         # 释放显存
333 |         torch.cuda.empty_cache()
334 | 
335 |         net.eval()
336 |         with torch.no_grad():
337 |             for batch in tqdm(val_iter):
338 |                 (enc_X, enc_x_l), (dec_x, dec_x_l), (y,y_l) = [(x[0].to(DEVICE),x[1].to(DEVICE)) for x in batch]
339 |                 pred, _ = net(enc_X, dec_x, enc_x_l)
340 |                 l = loss(pred, y, y_l).sum()
341 |                 val_loss.append(l.item())
342 | 
343 |         # 保存模型参数，秒级时间戳保证唯一性
344 |         if(sum(val_loss)<min_loss):
345 |             min_loss=sum(val_loss)
346 |             torch.save(net.state_dict(),PARAM_DIR+str(int(time.time()))+"_GRU.param")
347 |             print(f"saved net with val_loss:{min_loss}")
348 |         print(f"{epoch+1}: train_loss:{sum(train_loss)};val_loss:{sum(val_loss)}")
349 | 
350 | 
351 | def TestOneSeq(source:str,net:Module,param_path,max_steps=100,label=None):
352 |     '''测试单个文本，生成摘要'''
353 |     with open(WORD_IDX_PATH,"rb") as f:
354 |         w2i=pkl.load(f)
355 |     with open(IDX_WORD_PATH,"rb") as f:
356 |         i2w=pkl.load(f)
357 |     
358 |     words_list=source.strip().lower().split(" ")
359 |     words_list=[i for i in words_list if (i!='' and i!=' ')]
360 |     # print(words_list)
361 |     id_list=[w2i[word] if word in w2i.keys() else UNK_NUM for word in words_list]
362 |     id_list.append(EOS_NUM)
363 |     print(id_list)
364 |     enc_X,enc_X_l=PaddingSeq(id_list,SOURCE_THRESHOLD)
365 |     enc_X=torch.unsqueeze(torch.tensor(enc_X, dtype=torch.long, device=DEVICE),dim=0)
366 |     enc_X_l=torch.tensor(enc_X_l, dtype=torch.long, device=DEVICE)
367 | 
368 |     enc_outputs = net.encoder(enc_X, enc_X_l)
369 |     dec_state = net.decoder.init_state(enc_outputs, enc_X_l)
370 |     
371 |     dec_X = torch.unsqueeze(
372 |         torch.tensor([BOS_NUM], dtype=torch.long, device=DEVICE),
373 |         dim=0)
374 | 
375 |     net.load_state_dict(torch.load(param_path))
376 |     net.eval()
377 |     output_seq, attention_weight_seq = [], []
378 |     for _ in range(max_steps):
379 |         Y, dec_state = net.decoder(dec_X, dec_state)
380 |         dec_X = Y.argmax(dim=2)
381 |         pred = dec_X.squeeze(dim=0).type(torch.int32).item()
382 |         
383 |         if pred == EOS_NUM:
384 |             break
385 |         output_seq.append(pred)
386 |     pred_seq=' '.join([i2w[i] for i in output_seq])
387 |     score=None
388 |     if(label!=None):
389 |         score=GetRouge(pred_seq,label)
390 |     return pred_seq, score
391 | 
392 |     
393 | 
394 | def GenSubmisson(net,param_path,max_steps=100):
395 |     '''依据测试集，生成submission文件'''
396 |     import csv
397 |     with open(IDX_WORD_PATH,"rb") as f:
398 |         i2w=pkl.load(f)
399 |     
400 |     net.load_state_dict(torch.load(param_path))
401 |     net.eval()
402 |     res=[]
403 |     count=0
404 |     for enc_X,enc_X_l in tqdm(test_iter):
405 |         enc_X,enc_X_l=enc_X.to(DEVICE),enc_X_l.to(DEVICE)
406 | 
407 |         dec_X = torch.unsqueeze(
408 |             torch.tensor([BOS_NUM], dtype=torch.long, device=DEVICE),dim=0
409 |             )
410 |         enc_outputs = net.encoder(enc_X, enc_X_l)
411 |         dec_state = net.decoder.init_state(enc_outputs, enc_X_l)
412 |         
413 |         output_seq, attention_weight_seq = [], []
414 |         for _ in range(max_steps):
415 |             Y, dec_state = net.decoder(dec_X, dec_state)
416 |             dec_X = Y.argmax(dim=2)
417 |             pred = dec_X.squeeze(dim=0).type(torch.int32).item()
418 |             if pred == EOS_NUM:
419 |                 break
420 |             output_seq.append(pred)
421 |         pred_seq=' '.join([i2w[i] for i in output_seq])
422 |         res.append([str(count),pred_seq])
423 |         count+=1
424 |     
425 |     with open(os.path.join(DATA_DIR, 'submission.csv'),'w+',newline="",encoding='utf-8') as csvfile:
426 |         writer=csv.writer(csvfile,delimiter="\t")   
427 |         writer.writerows(res)
428 | 
429 | 
430 | if __name__=='__main__':
431 |     Preprocess()
432 |     # BuildVocabCounter()
433 |     # MakeVocab(VOCAB_SIZE)
434 |     
435 |     # with open(WORD_IDX_PATH,"rb") as f:
436 |     #     a=pkl.load(f)
437 |     # with open(IDX_WORD_PATH,"rb") as f:
438 |     #     b=pkl.load(f)
439 |     
440 |     # print(a)
441 |     # print(b)
442 |     # print(ReadJson2List(os.path.join(DATA_DIR,"new_test/"),0,True))
443 |     # with open(WORD_IDX_PATH,"rb") as f:
444 |     #     w2i=pkl.load(f)
445 |     # # print(w2i['a'])
446 |     # a=TextDataset(VAL_FALG,w2i)
447 |     # x=a.__getitem__(1)
448 |     
449 |     # print(x)
450 |     # train_iter=DataLoader(TextDataset(VAL_FALG,w2i),shuffle=True,batch_size=128,num_workers=4)
451 |     # Train(models.GetTextSum_GRU(),0.01)
452 |     # GenSubmisson()
453 |     
454 |     # print(
455 |     #     TestOneSeq(
456 |     #     "one-third of phone users would definitely upgrade to a facebook phone - and 73 % think the phone is a ` good idea ' . news of the phone emerged this week , with sources claiming that facebook had hired ex-apple engineers to work on an ` official ' facebook phone . facebook has made several ventures into the mobile market before in partnership with manufacturers such as htc and inq - but a new phone made by ex-apple engineers is rumoured to be in production . the previous ` facebook phone ' - inq 's cloud touch - puts all of your newsfeeds , pictures and other information on a well thought-out homescreen centred around facebook . it 's not the first facebook phone to hit . the market -- the social network giant has previously partnered with inq . and htc to produce facebook-oriented handsets , including phones with a . built-in ` like ' button . details of the proposed phone are scant , but facebook is already making moves into the mobile space with a series of high-profile app acquisitions . after its $ 1 billion purchase of instagram , the social network bought location-based social app glancee and photo-sharing app lightbox . facebook 's smartphone apps have also seen constant and large-scale redesigns , with adverts more prominent with the news feed . the handset is rumoured to be set for a 2013 release . it could be a major hit -- a flash poll of 968 people conducted by myvouchercodes found that 32 % of phone users would upgrade as soon as it became available . the key to its success could be porting apps to mobile -- something facebook is already doing . separate camera and chat apps already separate off some site functions , and third-party apps will shortly be available via a facebook app store . of those polled , 57 % hoped that it would be cheaper than an iphone -- presumably supported by facebook 's advertising . those polled were then asked why they would choose to purchase a facebook phone , if and when one became available , and were asked to select all reasons that applied to them from a list of possible answers . would you ` upgrade ' to a facebook phone ? would you ` upgrade ' to a facebook phone ? now share your opinion . the top five reasons were as follows : . 44 % of people liked the idea of having their mobile phone synced with their facebook account , whilst 41 % said they wanted to be able to use facebook apps on their smartphone . mark pearson , chairman of myvouchercodes.co.uk , said , ` it will be quite exciting to see the first facebook phone when it 's released next year . '",
457 |     #     models.GetTextSum_GRU().to(DEVICE),
458 |     #     os.path.join(PARAM_DIR,"1638704899_GRU.param"),
459 |     #     label=" poll of 968 phone users in uk .   32 % said they would definitely upgrade to a facebook phone .   users hope it might be cheaper than iphone . "
460 |     # )
461 |     # )
462 | 
463 |     # GenSubmisson(
464 |     #     models.GetTextSum_GRU().to(DEVICE),
465 |     #     os.path.join(PARAM_DIR,"1638704899_GRU.param")
466 |     #     )
467 |     


--------------------------------------------------------------------------------