├── requirements.txt
├── tune.sh
├── download_long_Big_Patent_data.py
├── loading_scripts
│   ├── longformer_for_pegasus.py
│   └── pegasus_to_4k.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
pip==21.1.2
torch==1.8.1
tensorflow==2.5.0
transformers==3.5.1
wheel==0.36.2
datasets==1.7.0
fairscale==0.3.7
--------------------------------------------------------------------------------
/tune.sh:
--------------------------------------------------------------------------------
python -m torch.distributed.launch \
    --nproc_per_node=8 transformers/examples/seq2seq/run_summarization.py \
    --model_name_or_path Pegasus_4k/ \
    --do_train \
    --do_eval \
    --train_file train.json \
    --validation_file val.json \
    --output_dir saved/ \
    --per_device_train_batch_size=2 \
    --per_device_eval_batch_size=2 \
    --overwrite_output_dir \
    --predict_with_generate \
    --sharded_ddp zero_dp_3 \
    --save_steps 1000 \
    --save_total_limit 10 \
    --num_train_epochs 90 \
    --evaluation_strategy epoch \
    --fp16
--------------------------------------------------------------------------------
/download_long_Big_Patent_data.py:
--------------------------------------------------------------------------------
from datasets import load_dataset
import random
import json

dataset = load_dataset("big_patent")

text_sizes = []
summary_sizes = []
subset = "train"

files = [
    "train.json",
    "val.json",
    "test.json",
]

fps = {file: open(file, "wt") for file in files}

num_rows = dataset[subset].num_rows
num_train = 0
num_test = 0
num_val = 0

for i in range(num_rows):
    text = dataset[subset][i]['description']
    text_length = len(text.split(" "))
    # Keep only long documents (more than 4,000 whitespace-delimited tokens).
    if text_length > 4000:
        # Roughly 80/10/10 train/val/test split, with val and test capped at 2,000 examples each.
        r = random.random()
        if 0.9 < r and num_test < 2000:
            group = "test"
            num_test += 1
        elif 0.8 < r < 0.9 and num_val < 2000:
            group = "val"
            num_val += 1
        else:
            group = "train"
            num_train += 1

        summary = dataset[subset][i]['abstract']
        summary_size = len(summary.split(" "))

        json_obj = {"text": text, "summary": summary}

        # One JSON object per line (JSON Lines), as expected by run_summarization.py.
        json.dump(
            json_obj,
            fps[group + ".json"],
            sort_keys=False,
            indent=None,
            ensure_ascii=False,
        )
        fps[group + ".json"].write("\n")

        text_sizes.append(text_length)
        summary_sizes.append(summary_size)

for fp in fps.values():
    fp.close()
--------------------------------------------------------------------------------
/loading_scripts/longformer_for_pegasus.py:
--------------------------------------------------------------------------------
from typing import List, Optional, Tuple, Dict
from torch import nn, Tensor
from longformer.longformer import LongformerSelfAttention
# from transformers.modeling_bart import BartConfig, BartForConditionalGeneration
# from transformers.modeling_pegasus import PegasusTokenizer, PegasusForConditionalGeneration

from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from transformers.configuration_pegasus import PegasusConfig


class LongformerEncoderDecoderForConditionalGeneration(PegasusForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        # Replace each encoder layer's self-attention with Longformer self-attention.
        for i, layer in enumerate(self.model.encoder.layers):
            layer.self_attn = LongformerSelfAttentionForPegasus(config, layer_id=i)


class LongformerEncoderDecoderConfig(PegasusConfig):
    def __init__(self, attention_window: List[int] = None, attention_dilation: List[int] = None,
                 autoregressive: bool = False, attention_mode: str = 'sliding_chunks',
                 gradient_checkpointing: bool = False, **kwargs):
        """
        Args:
            attention_window: list of attention window sizes of length = number of layers.
                window size = number of attention locations on each side.
                For an effective window size of 512, use `attention_window=[256]*num_layers`
                which is 256 on each side.
            attention_dilation: list of attention dilations of length = number of layers.
                attention dilation of `1` means no dilation.
            autoregressive: do autoregressive attention or attend to both sides
            attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for the TVM implementation
                of Longformer self-attention, 'sliding_chunks' for another implementation of
                Longformer self-attention
        """
        super().__init__(**kwargs)
        self.attention_window = attention_window
        self.attention_dilation = attention_dilation
        self.autoregressive = autoregressive
        self.attention_mode = attention_mode
        self.gradient_checkpointing = gradient_checkpointing
        assert self.attention_mode == 'sliding_chunks'
        # assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2']


class LongformerSelfAttentionForPegasus(nn.Module):
    def __init__(self, config, layer_id):
        super().__init__()
        self.embed_dim = config.d_model
        self.longformer_self_attn = LongformerSelfAttention(config, layer_id=layer_id)
        self.output = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
        attn_mask: Optional[Tensor] = None,
        need_weights=False,
        output_attentions=False,
    ) -> Tuple[Tensor, Optional[Tensor]]:

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert attn_mask is None

        outputs = self.longformer_self_attn(
            query.transpose(0, 1),  # LongformerSelfAttention expects (bsz, seqlen, embed_dim)
            attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1,
            head_mask=None,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=output_attentions,
        )

        attn_output = self.output(outputs[0].transpose(0, 1))

        return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None)
--------------------------------------------------------------------------------
/loading_scripts/pegasus_to_4k.py:
--------------------------------------------------------------------------------
import argparse
import logging
import os
import copy
import torch  # ADDED
import tensorflow as tf

# from longformer.longformer_encoder_decoder import LongformerSelfAttentionForBart
# from longformer.longformer_encoder_decoder import LongformerEncoderDecoderConfig
# from longformer.longformer_encoder_decoder import LongformerEncoderDecoderForConditionalGeneration

from longformer_for_pegasus import LongformerSelfAttentionForPegasus
from longformer_for_pegasus import LongformerEncoderDecoderConfig
from longformer_for_pegasus import LongformerEncoderDecoderForConditionalGeneration

from transformers.modeling_longformer import LongformerSelfAttention
from transformers import PegasusTokenizer
from transformers import PegasusForConditionalGeneration

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def create_long_model(save_model_to, attention_window, max_pos):
    # base_model = 'google/pegasus-xsum'
    # base_model = 'sshleifer/student-pegasus-xsum-6-6'
    base_model = 'sshleifer/distill-pegasus-cnn-16-4'
    model = PegasusForConditionalGeneration.from_pretrained(base_model)

    tokenizer = PegasusTokenizer.from_pretrained(base_model, model_max_length=max_pos)
    config = LongformerEncoderDecoderConfig.from_pretrained(base_model)

    model.config = config

    config.attention_probs_dropout_prob = config.attention_dropout
    config.architectures = ['LongformerEncoderDecoderForConditionalGeneration', ]

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape
    assert current_max_pos == config.max_position_embeddings

    config.max_encoder_position_embeddings = max_pos
    config.max_decoder_position_embeddings = config.max_position_embeddings  # 512
    print("max_encoder_position_embeddings: ", config.max_encoder_position_embeddings)
    print("max_decoder_position_embeddings: ", config.max_decoder_position_embeddings)

    del config.max_position_embeddings

    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(max_pos, embed_size)
    # copy the original position embeddings over and over to initialize the new position embeddings
    # k = 0
    # step = current_max_pos
    k = 0
    step = current_max_pos - k
    while k < max_pos - 1:
        new_encoder_pos_embed[k:(k + step)] = model.model.encoder.embed_positions.weight[:]
        k += step

    model.model.encoder.embed_positions = torch.nn.Embedding.from_pretrained(new_encoder_pos_embed)

    print(model.model.encoder.layers)

    config.attention_window = [attention_window] * config.num_hidden_layers
    config.attention_dilation = [1] * config.num_hidden_layers

    # replace each encoder layer's self-attention with `LongformerSelfAttentionForPegasus`,
    # reusing the original query/key/value projections (and copies of them for global attention)
    for i, layer in enumerate(model.model.encoder.layers):
        longformer_self_attn_for_pegasus = LongformerSelfAttentionForPegasus(config, layer_id=i)

        longformer_self_attn_for_pegasus.longformer_self_attn.query = layer.self_attn.q_proj
        longformer_self_attn_for_pegasus.longformer_self_attn.key = layer.self_attn.k_proj
        longformer_self_attn_for_pegasus.longformer_self_attn.value = layer.self_attn.v_proj

        longformer_self_attn_for_pegasus.longformer_self_attn.query_global = copy.deepcopy(layer.self_attn.q_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.key_global = copy.deepcopy(layer.self_attn.k_proj)
        longformer_self_attn_for_pegasus.longformer_self_attn.value_global = copy.deepcopy(layer.self_attn.v_proj)

        longformer_self_attn_for_pegasus.output = layer.self_attn.out_proj
        layer.self_attn = longformer_self_attn_for_pegasus

    print("OK")

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer


def main():
    model, tokenizer = create_long_model(save_model_to="Pegasus_4k/", attention_window=512, max_pos=4096)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pegasus_with_Longformer_summarization

## Description

Pegasus is a large Transformer-based encoder-decoder model with a pre-training objective tailored to abstractive summarization. This objective, called Gap Sentence Generation (GSG), masks important sentences in a document and trains the model to generate these gap-sentences.

Longformer, in turn, is a Transformer that replaces the full attention mechanism (quadratic in the sequence length) with an attention mechanism that scales linearly with the input length. As a result, Longformer can process sequences up to 4,096 tokens long (8 times longer than BERT, which is limited to 512 tokens).

This project plugs Longformer's attention mechanism into Pegasus in order to perform abstractive summarization on long documents. The conversion is done in loading_scripts/pegasus_to_4k.py, which lets Pegasus process sequences of up to 4,096 tokens (rather than 512). The `max_pos` parameter can be increased to accept even longer sequences (e.g. `max_pos=16384`). The converted Pegasus model is then fine-tuned on the BigPatent dataset; to assess the model's performance on long documents, only examples whose description is longer than 4,000 whitespace-delimited tokens are kept (see download_long_Big_Patent_data.py).

This project was built with HuggingFace's Transformers library. The model is trained with sharded data parallelism (fairscale's ZeRO stage 3, enabled via `--sharded_ddp zero_dp_3` in tune.sh) across a cluster of 8 GPUs.

## How to run the project
To run this project, clone the repo and execute the following commands:

1) `cd Pegasus_with_Longformer_summarization`
2) `pip install -r requirements.txt`
3) `pip install git+https://github.com/allenai/longformer.git`
4) `pip install tokenizers==0.10.3`
5) Comment out the line `from torch.optim.lr_scheduler import SAVE_STATE_WARNING` in lib/python3.7/site-packages/transformers/trainer_pt_utils.py (this import fails with torch 1.8).
6) Add `with torch.no_grad():` above `out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))` in lib/python3.7/site-packages/transformers/modeling_bart.py, and indent the assignment(s) below it into the block.
7) `python loading_scripts/pegasus_to_4k.py`
8) `git clone -b v4.5.1-release https://github.com/huggingface/transformers`
9) `cd transformers`
10) `pip install -e .` (this switches the installed transformers to 4.5.1, which provides the `examples/seq2seq/run_summarization.py` script used by tune.sh; the conversion in step 7 relies on the 3.5.1 pin from requirements.txt)
11) `cd .. ; python download_long_Big_Patent_data.py`
12) `bash tune.sh`

Once fine-tuning finishes, the checkpoint in `saved/` can be used for inference; a minimal sketch is included at the end of this document.

## Citation

@article{DBLP:journals/corr/abs-1910-03771,
  author        = {Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and
                   Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and
                   Rémi Louf and Morgan Funtowicz and Jamie Brew},
  title         = {HuggingFace's Transformers: State-of-the-art Natural Language Processing},
  journal       = {CoRR},
  volume        = {abs/1910.03771},
  year          = {2019},
  url           = {http://arxiv.org/abs/1910.03771},
  archivePrefix = {arXiv},
  eprint        = {1910.03771},
  timestamp     = {Tue, 02 Jun 2020 12:49:01 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1910-03771.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-2004-05150,
  author        = {Iz Beltagy and Matthew E. Peters and Arman Cohan},
  title         = {Longformer: The Long-Document Transformer},
  journal       = {CoRR},
  volume        = {abs/2004.05150},
  year          = {2020},
  url           = {https://arxiv.org/abs/2004.05150},
  archivePrefix = {arXiv},
  eprint        = {2004.05150},
  timestamp     = {Tue, 14 Apr 2020 16:40:34 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2004-05150.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

@article{DBLP:journals/corr/abs-1912-08777,
  author        = {Jingqing Zhang and Yao Zhao and Mohammad Saleh and Peter J. Liu},
  title         = {{PEGASUS:} Pre-training with Extracted Gap-sentences for Abstractive Summarization},
  journal       = {CoRR},
  volume        = {abs/1912.08777},
  year          = {2019},
  url           = {http://arxiv.org/abs/1912.08777},
  archivePrefix = {arXiv},
  eprint        = {1912.08777},
  timestamp     = {Fri, 03 Jan 2020 16:10:45 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1912-08777.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
--------------------------------------------------------------------------------
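The repository stops at fine-tuning, so here is a minimal inference sketch for the fine-tuned model referenced above. It assumes `tune.sh` has finished and wrote the model and tokenizer to `saved/` (its `--output_dir`); the input file name, beam size, and summary length below are illustrative choices, not values taken from the project.

```python
# Minimal inference sketch (not part of the original repository files).
# Assumes transformers 4.5.1 and a checkpoint produced by tune.sh in saved/.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "saved/"  # assumed output_dir from tune.sh
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Any long input document; the file name here is a placeholder.
document = open("long_patent_description.txt").read()

# Truncate to the 4,096-token limit of the converted encoder.
inputs = tokenizer(document, truncation=True, max_length=4096, return_tensors="pt")

# Beam-search generation; the generation settings are illustrative and can be tuned.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    num_beams=4,
    max_length=256,
)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```

The same pattern works with the converted-but-not-yet-fine-tuned checkpoint in `Pegasus_4k/`, though summary quality before fine-tuning on BigPatent is expected to be lower.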