├── requirements.txt
├── tune.sh
├── download_long_Big_Patent_data.py
├── loading_scripts
│   ├── longformer_for_pegasus.py
│   └── pegasus_to_4k.py
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | pip==21.1.2
2 | torch==1.8.1
3 | tensorflow==2.5.0
4 | transformers==3.5.1
5 | wheel==0.36.2
6 | datasets==1.7.0
7 | fairscale==0.3.7
8 |
--------------------------------------------------------------------------------
/tune.sh:
--------------------------------------------------------------------------------
1 | python -m torch.distributed.launch \
2 | --nproc_per_node=8 transformers/examples/seq2seq/run_summarization.py \
3 | --model_name_or_path Pegasus_4k/ \
4 | --do_train \
5 | --do_eval \
6 | --train_file train.json \
7 | --validation_file val.json \
8 | --output_dir saved/ \
9 | --per_device_train_batch_size=2 \
10 | --per_device_eval_batch_size=2 \
11 | --overwrite_output_dir \
12 | --predict_with_generate \
13 | --sharded_ddp zero_dp_3 \
14 | --save_steps 1000 \
15 | --save_total_limit 10 \
16 | --num_train_epochs 90 \
17 | --evaluation_strategy epoch \
18 | --fp16
19 |
--------------------------------------------------------------------------------
/download_long_Big_Patent_data.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | import random
3 | import json
4 |
5 | dataset = load_dataset("big_patent")
6 |
7 | text_sizes = []
8 | summary_sizes = []
9 | subset = "train"
10 |
11 | files = [
12 |     "train.json",
13 |     "val.json",
14 |     "test.json",
15 | ]
16 |
17 | fps = {file: open(file, "wt") for file in files}
18 |
19 | num_rows = dataset[subset].num_rows
20 | num_train = 0
21 | num_test = 0
22 | num_val = 0
23 |
24 | for i in range(num_rows):
25 |     text = dataset[subset][i]['description']
26 |     text_length = len(text.split(" "))
27 |     if text_length > 4000:
28 |         r = random.random()
29 |         if 0.9 < r and num_test < 2000:
30 |             group = "test"
31 |             num_test += 1
32 |         elif 0.8 < r < 0.9 and num_val < 2000:
33 |             group = "val"
34 |             num_val += 1
35 |         else:
36 |             group = "train"
37 |             num_train += 1
38 |
39 |         summary = dataset[subset][i]['abstract']
40 |         summary_size = len(summary.split(" "))
41 |
42 |         json_obj = {"text": text, "summary": summary}
43 |
44 |         json.dump(
45 |             json_obj,
46 |             fps[group + ".json"],
47 |             sort_keys=False,
48 |             indent=None,
49 |             ensure_ascii=False,
50 |         )
51 |         fps[group + ".json"].write("\n")
52 |
53 |         text_sizes.append(text_length)
54 |         summary_sizes.append(summary_size)
55 |
56 | for fp in fps.values():
57 |     fp.close()
58 |
--------------------------------------------------------------------------------
/loading_scripts/longformer_for_pegasus.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple, Dict
2 | from torch import nn, Tensor
3 | from longformer.longformer import LongformerSelfAttention
4 | #from transformers.modeling_bart import BartConfig, BartForConditionalGeneration
5 |
6 | #from transformers.modeling_pegasus import PegasusTokenizer, PegasusForConditionalGeneration
7 |
8 | from transformers import PegasusTokenizer, PegasusForConditionalGeneration
9 | from transformers.configuration_pegasus import PegasusConfig
10 |
11 |
12 |
13 |
14 | class LongformerEncoderDecoderForConditionalGeneration(PegasusForConditionalGeneration):
15 |     def __init__(self, config):
16 |         super().__init__(config)
17 |         for i, layer in enumerate(self.model.encoder.layers):
18 |             layer.self_attn = LongformerSelfAttentionForPegasus(config, layer_id=i)
19 |
20 |
21 |
22 | class LongformerEncoderDecoderConfig(PegasusConfig):
23 |     def __init__(self, attention_window: List[int] = None, attention_dilation: List[int] = None,
24 |                  autoregressive: bool = False, attention_mode: str = 'sliding_chunks',
25 |                  gradient_checkpointing: bool = False, **kwargs):
26 |         """
27 |         Args:
28 |             attention_window: list of attention window sizes of length = number of layers.
29 |                 window size = number of attention locations on each side.
30 |                 For an effective window size of 512, use `attention_window=[256]*num_layers`
31 |                 which is 256 on each side.
32 |             attention_dilation: list of attention dilations of length = number of layers.
33 |                 attention dilation of `1` means no dilation.
34 |             autoregressive: do autoregressive attention or attend to both sides
35 |             attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for the TVM implementation of Longformer
36 |                 self-attention, 'sliding_chunks' for another implementation of Longformer self-attention
37 |         """
38 |         super().__init__(**kwargs)
39 |         self.attention_window = attention_window
40 |         self.attention_dilation = attention_dilation
41 |         self.autoregressive = autoregressive
42 |         self.attention_mode = attention_mode
43 |         self.gradient_checkpointing = gradient_checkpointing
44 |         assert self.attention_mode == 'sliding_chunks'
45 |         # assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2']
46 |
47 |
48 | class LongformerSelfAttentionForPegasus(nn.Module):
49 |     def __init__(self, config, layer_id):
50 |         super().__init__()
51 |         self.embed_dim = config.d_model
52 |         self.longformer_self_attn = LongformerSelfAttention(config, layer_id=layer_id)
53 |         self.output = nn.Linear(self.embed_dim, self.embed_dim)
54 |
55 |     def forward(
56 |         self,
57 |         query,
58 |         key: Optional[Tensor],
59 |         key_padding_mask: Optional[Tensor] = None,
60 |         layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
61 |         attn_mask: Optional[Tensor] = None,
62 |         need_weights=False,
63 |         output_attentions=False,
64 |     ) -> Tuple[Tensor, Optional[Tensor]]:
65 |
66 |         tgt_len, bsz, embed_dim = query.size()
67 |         assert embed_dim == self.embed_dim
68 |         assert list(query.size()) == [tgt_len, bsz, embed_dim]
69 |         assert attn_mask is None
70 |
71 |         outputs = self.longformer_self_attn(
72 |             query.transpose(0, 1),  # LongformerSelfAttention expects (bsz, seqlen, embd_dim)
73 |             attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1,
74 |             head_mask=None,
75 |             encoder_hidden_states=None,
76 |             encoder_attention_mask=None,
77 |             output_attentions=output_attentions,
78 |         )
79 |
80 |         attn_output = self.output(outputs[0].transpose(0, 1))
81 |
82 |         return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None)
83 |
84 |
--------------------------------------------------------------------------------
/loading_scripts/pegasus_to_4k.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import copy
5 | import torch #ADDED
6 | import tensorflow as tf
7 |
8 | #from longformer.longformer_encoder_decoder import LongformerSelfAttentionForBart
9 | #from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig
10 | #from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration
11 |
12 | from longformer_for_pegasus import LongformerSelfAttentionForPegasus
13 | from longformer_for_pegasus import LongformerEncoderDecoderConfig
14 | from longformer_for_pegasus import LongformerEncoderDecoderForConditionalGeneration
15 |
16 |
17 | from transformers.modeling_longformer import LongformerSelfAttention
18 | from transformers import PegasusTokenizer
19 | from transformers import PegasusForConditionalGeneration
20 |
21 | import logging
22 | logger = logging.getLogger(__name__)
23 | logging.basicConfig(level=logging.INFO)
24 |
25 |
26 |
27 | def create_long_model(save_model_to, attention_window, max_pos):
28 |     # base_model = 'google/pegasus-xsum'
29 |     # base_model = 'sshleifer/student-pegasus-xsum-6-6'
30 |     base_model = 'sshleifer/distill-pegasus-cnn-16-4'
31 |     model = PegasusForConditionalGeneration.from_pretrained(base_model)
32 |
33 |     tokenizer = PegasusTokenizer.from_pretrained(base_model, model_max_length=max_pos)
34 |     config = LongformerEncoderDecoderConfig.from_pretrained(base_model)
35 |
36 |
37 |
38 |     model.config = config
39 |
40 |     config.attention_probs_dropout_prob = config.attention_dropout
41 |     config.architectures = ['LongformerEncoderDecoderForConditionalGeneration', ]
42 |
43 |     # extend position embeddings
44 |     tokenizer.model_max_length = max_pos
45 |     tokenizer.init_kwargs['model_max_length'] = max_pos
46 |     current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape
47 |     assert current_max_pos == config.max_position_embeddings
48 |
49 |     config.max_encoder_position_embeddings = max_pos
50 |     config.max_decoder_position_embeddings = config.max_position_embeddings  # 512
51 |     print("max_encoder_position_embeddings: ", config.max_encoder_position_embeddings)
52 |     print("max_decoder_position_embeddings: ", config.max_decoder_position_embeddings)
53 |
54 |     del config.max_position_embeddings
55 |
56 |     assert max_pos > current_max_pos
57 |
58 |
59 |     # allocate a larger position embedding matrix
60 |     new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(max_pos, embed_size)
61 |     # copy position embeddings over and over to initialize the new position embeddings
62 |
63 |     # k = 0
64 |     # step = current_max_pos
65 |     k = 0
66 |     step = current_max_pos - k
67 |     while k < max_pos - 1:
68 |         new_encoder_pos_embed[k:(k + step)] = model.model.encoder.embed_positions.weight[:]
69 |         k += step
70 |
71 |     model.model.encoder.embed_positions = torch.nn.Embedding.from_pretrained(new_encoder_pos_embed)
72 |
73 |     print(model.model.encoder.layers)
74 |
75 |
76 |     config.attention_window = [attention_window] * config.num_hidden_layers
77 |     config.attention_dilation = [1] * config.num_hidden_layers
78 |
79 |     # replace each encoder layer's self-attention with LongformerSelfAttentionForPegasus
80 |     for i, layer in enumerate(model.model.encoder.layers):
81 |         longformer_self_attn_for_pegasus = LongformerSelfAttentionForPegasus(config, layer_id=i)
82 |
83 |         longformer_self_attn_for_pegasus.longformer_self_attn.query = layer.self_attn.q_proj
84 |         longformer_self_attn_for_pegasus.longformer_self_attn.key = layer.self_attn.k_proj
85 |         longformer_self_attn_for_pegasus.longformer_self_attn.value = layer.self_attn.v_proj
86 |
87 |         longformer_self_attn_for_pegasus.longformer_self_attn.query_global = copy.deepcopy(layer.self_attn.q_proj)
88 |         longformer_self_attn_for_pegasus.longformer_self_attn.key_global = copy.deepcopy(layer.self_attn.k_proj)
89 |         longformer_self_attn_for_pegasus.longformer_self_attn.value_global = copy.deepcopy(layer.self_attn.v_proj)
90 |
91 |         longformer_self_attn_for_pegasus.output = layer.self_attn.out_proj
92 |         layer.self_attn = longformer_self_attn_for_pegasus
93 |
94 |     print("OK")
95 |
96 |     logger.info(f'saving model to {save_model_to}')
97 |     model.save_pretrained(save_model_to)
98 |     tokenizer.save_pretrained(save_model_to)
99 |     return model, tokenizer
100 |
101 |
102 | def main():
103 |     model, tokenizer = create_long_model(save_model_to="Pegasus_4k/", attention_window=512, max_pos=4096)
104 |
105 |
106 | if __name__ == "__main__":
107 |     main()
108 |
109 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pegasus_with_Longformer_summarization
2 |
3 | ## Description
4 |
5 | Pegasus is a large Transformer-based encoder-decoder model with a new pre-training objective tailored to abstractive summarization. More specifically, the pre-training objective, called "Gap Sentence Generation" (GSG), consists of masking important sentences in a document and training the model to generate these gap sentences.
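As a rough illustration of GSG (a toy sketch only, not the actual PEGASUS preprocessing; sentence splitting, the importance scores, and the `<mask_1>` token name are assumed here):

```python
# Toy Gap Sentence Generation: mask the "important" sentences of a document
# and use them as the generation target.
def make_gsg_example(sentences, important_idx, mask_token="<mask_1>"):
    inputs = " ".join(mask_token if i in important_idx else s
                      for i, s in enumerate(sentences))              # encoder input with gaps
    target = " ".join(sentences[i] for i in sorted(important_idx))   # decoder target
    return inputs, target

doc = ["The patent describes a new battery chemistry.",
       "The key claim is a solid electrolyte that doubles energy density.",
       "Prior work used liquid electrolytes."]
print(make_gsg_example(doc, important_idx={1}))
```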
6 |
7 | The Longformer, on the other hand, is a Transformer that replaces the full self-attention mechanism (whose cost grows quadratically with sequence length) with a novel attention mechanism that scales linearly with the input sequence length. Consequently, Longformer can process sequences of up to 4,096 tokens (8 times longer than BERT, which is limited to 512 tokens).
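To make the scaling concrete, a back-of-the-envelope comparison of how many query-key pairs are scored per layer for a 4,096-token input (the window value below is purely illustrative):

```python
n = 4096   # input length in tokens
w = 512    # positions each token attends to in a sliding window (illustrative value)

full_attention_pairs = n * n     # quadratic in n: 16,777,216
window_attention_pairs = n * w   # linear in n for a fixed window: 2,097,152

print(full_attention_pairs // window_attention_pairs)  # 8x fewer pairs, and the gap grows with n
```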
8 |
9 | This project plugs Longformer's attention mechanism into Pegasus in order to perform abstractive summarization on long documents. The conversion is done in loading_scripts/pegasus_to_4k.py, which enables Pegasus to process input sequences of up to 4,096 tokens (rather than 512). Note that the `max_pos` parameter can be changed to accept even longer sequences (e.g. `max_pos=16384`), as sketched below. The converted Pegasus model is then fine-tuned on the BigPatent dataset. To assess the model's performance on long documents, the training examples are filtered so that every document description is longer than 4,000 words.
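For example, re-running the conversion for a 16k-token encoder could look roughly like this (a minimal sketch run from the repo root, assuming the pinned requirements plus the `longformer` package are installed; the output directory name is illustrative):

```python
import sys
sys.path.insert(0, "loading_scripts")  # so pegasus_to_4k can import longformer_for_pegasus

from pegasus_to_4k import create_long_model

# Same call as in pegasus_to_4k.py's main(), but with a longer max_pos.
model, tokenizer = create_long_model(
    save_model_to="Pegasus_16k/",  # illustrative output directory
    attention_window=512,          # per-layer Longformer attention window, as in the 4k setup
    max_pos=16384,                 # new maximum number of encoder position embeddings
)
```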
10 |
11 | This project was built using HuggingFace's Transformers library. The model is fine-tuned with fairscale's sharded data parallelism (the `--sharded_ddp zero_dp_3` option in `tune.sh`) and parallel batch processing on a cluster of 8 GPUs.
12 |
13 | ## How to run the project
14 | To run this project, clone the repo and execute the following commands:
15 |
16 | 1) `cd Pegasus_with_Longformer_summarization`
17 | 2) `pip install -r requirements.txt`
18 | 3) `pip install git+https://github.com/allenai/longformer.git`
19 | 4) `pip install tokenizers==0.10.3`
20 | 5) Comment out the `SAVE_STATE_WARNING` import (`from torch.optim.lr_scheduler import SAVE_STATE_WARNING`) in `lib/python3.7/site-packages/transformers/trainer_pt_utils.py`
21 | 6) Add `with torch.no_grad():` above `out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))` in `lib/python3.7/site-packages/transformers/modeling_bart.py` and indent the assignment under it (steps 5 and 6 are sketched as a diff after this list)
22 | 7) `python loading_scripts/pegasus_to_4k.py`
23 | 8) `git clone -b v4.5.1-release https://github.com/huggingface/transformers`
24 | 9) `cd transformers`
25 | 10) `pip install -e .`
26 | 11) `cd .. ; python download_long_Big_Patent_data.py`
27 | 12) `bash tune.sh`
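For reference, the manual patches in steps 5 and 6 amount to something like the following (an illustrative diff against the installed transformers 3.5.1 files; surrounding code is elided and exact line positions may differ):

```diff
--- lib/python3.7/site-packages/transformers/trainer_pt_utils.py    (step 5)
-from torch.optim.lr_scheduler import SAVE_STATE_WARNING
+# from torch.optim.lr_scheduler import SAVE_STATE_WARNING

--- lib/python3.7/site-packages/transformers/modeling_bart.py       (step 6)
-out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+with torch.no_grad():
+    out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
```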
28 |
29 | ## Citation
30 |
31 | @article{DBLP:journals/corr/abs-1910-03771,
32 | author = {Thomas Wolf and
33 | Lysandre Debut and
34 | Victor Sanh and
35 | Julien Chaumond and
36 | Clement Delangue and
37 | Anthony Moi and
38 | Pierric Cistac and
39 | Tim Rault and
40 | Rémi Louf and
41 | Morgan Funtowicz and
42 | Jamie Brew},
43 | title = {HuggingFace's Transformers: State-of-the-art Natural Language Processing},
44 | journal = {CoRR},
45 | volume = {abs/1910.03771},
46 | year = {2019},
47 | url = {http://arxiv.org/abs/1910.03771},
48 | archivePrefix = {arXiv},
49 | eprint = {1910.03771},
50 | timestamp = {Tue, 02 Jun 2020 12:49:01 +0200},
51 | biburl = {https://dblp.org/rec/journals/corr/abs-1910-03771.bib},
52 | bibsource = {dblp computer science bibliography, https://dblp.org}
53 | }
54 |
55 | @article{DBLP:journals/corr/abs-2004-05150,
56 | author = {Iz Beltagy and
57 | Matthew E. Peters and
58 | Arman Cohan},
59 | title = {Longformer: The Long-Document Transformer},
60 | journal = {CoRR},
61 | volume = {abs/2004.05150},
62 | year = {2020},
63 | url = {https://arxiv.org/abs/2004.05150},
64 | archivePrefix = {arXiv},
65 | eprint = {2004.05150},
66 | timestamp = {Tue, 14 Apr 2020 16:40:34 +0200},
67 | biburl = {https://dblp.org/rec/journals/corr/abs-2004-05150.bib},
68 | bibsource = {dblp computer science bibliography, https://dblp.org}
69 | }
70 |
71 | @article{DBLP:journals/corr/abs-1912-08777,
72 | author = {Jingqing Zhang and
73 | Yao Zhao and
74 | Mohammad Saleh and
75 | Peter J. Liu},
76 | title = {{PEGASUS:} Pre-training with Extracted Gap-sentences for Abstractive
77 | Summarization},
78 | journal = {CoRR},
79 | volume = {abs/1912.08777},
80 | year = {2019},
81 | url = {http://arxiv.org/abs/1912.08777},
82 | archivePrefix = {arXiv},
83 | eprint = {1912.08777},
84 | timestamp = {Fri, 03 Jan 2020 16:10:45 +0100},
85 | biburl = {https://dblp.org/rec/journals/corr/abs-1912-08777.bib},
86 | bibsource = {dblp computer science bibliography, https://dblp.org}
87 | }
88 |
--------------------------------------------------------------------------------