├── tests ├── __init__.py ├── model │ ├── __init__.py │ ├── test_summarizer.py │ ├── test_summarizer_attention.py │ ├── test_summarizer_transformer.py │ └── test_summarizer_bert.py ├── callbacks │ ├── __init__.py │ ├── test_validation_callback.py │ ├── test_evaluation_callback.py │ └── test_model_checkpointing_callback.py ├── evaluation │ ├── __init__.py │ └── test_bleu_scorer.py ├── resources │ ├── __init__.py │ ├── small_glove.txt │ └── trainer_test_config.yaml ├── preprocessing │ ├── __init__.py │ ├── test_bert_preprocessor.py │ ├── test_keras_tokenizer.py │ ├── test_vectorizer.py │ ├── test_bert_vectorizer.py │ ├── test_preprocessor.py │ ├── test_dataset_generator.py │ └── test_bucket_generator.py ├── test_embeddings.py ├── test_bert_training.py ├── test_trainer.py └── test_training.py ├── headliner ├── utils │ ├── __init__.py │ └── logger.py ├── __init__.py ├── evaluation │ ├── __init__.py │ ├── scorer.py │ └── bleu_scorer.py ├── model │ ├── __init__.py │ ├── basic_model.py │ ├── summarizer.py │ ├── attention_model.py │ ├── transformer_model.py │ ├── transformer_util.py │ ├── bert_model.py │ ├── basic_summarizer.py │ ├── attention_summarizer.py │ ├── transformer_summarizer.py │ └── bert_summarizer.py ├── callbacks │ ├── __init__.py │ ├── tensorboard_callback.py │ ├── validation_callback.py │ ├── model_checkpoint_callback.py │ └── evaluation_callback.py ├── preprocessing │ ├── __init__.py │ ├── keras_tokenizer.py │ ├── tokenizer.py │ ├── bert_preprocessor.py │ ├── dataset_generator.py │ ├── preprocessor.py │ ├── vectorizer.py │ ├── bert_vectorizer.py │ └── bucket_generator.py ├── losses.py ├── embeddings.py └── trainer.py ├── .gitattributes ├── setup.cfg ├── figures ├── seq2seq.jpg └── headline_generator.png ├── run_training.sh ├── mkdocs ├── docs │ ├── img │ │ ├── favicon.ico │ │ └── logo.svg │ └── examples │ │ ├── nmt_example.md │ │ ├── advanced_nmt_example.md │ │ └── bert_example.md ├── run_docs.sh ├── build_docs.sh ├── README.md ├── mkdocs.yml └── autogen.py ├── pypi.sh ├── .bumpversion.cfg ├── config └── trainer_config.yaml ├── .travis.yml ├── LICENSE ├── CONTRIBUTING.md ├── azure-pipelines.yml ├── setup.py ├── .gitignore └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /headliner/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/resources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
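The tree above sketches the package architecture: `headliner/preprocessing` turns raw text pairs into padded index sequences, `headliner/model` holds the summarizer implementations, `headliner/callbacks` and `headliner/evaluation` hook into training, and `headliner/trainer.py` wires everything together. Below is a minimal usage sketch of how these modules are typically combined; it mirrors `mkdocs/docs/examples/nmt_example.md` further down in this dump, and the toy data plus the small layer sizes are illustrative only, not taken from the repository.

```python
# Minimal sketch tying together the modules listed in the tree above.
# API calls follow the NMT example shipped in mkdocs/docs/examples/;
# the toy sentence pairs and small layer sizes are illustrative.
from headliner.model.attention_summarizer import AttentionSummarizer
from headliner.trainer import Trainer

# (input, target) pairs; repeated so the batches are not empty.
train = [('I like apples.', 'Ich mag Äpfel.'),
         ('I like pears.', 'Ich mag Birnen.')] * 32

summarizer = AttentionSummarizer(lstm_size=64, embedding_size=32)
trainer = Trainer(batch_size=8, steps_per_epoch=10, max_output_len=10)
trainer.train(summarizer, train, num_epochs=1)

print(summarizer.predict('I like apples.'))
```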
/.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python -------------------------------------------------------------------------------- /headliner/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.2' 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /tests/resources/small_glove.txt: -------------------------------------------------------------------------------- 1 | a 1 2 3 2 | b 4 5 6 -------------------------------------------------------------------------------- /headliner/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .bleu_scorer import BleuScorer 2 | -------------------------------------------------------------------------------- /figures/seq2seq.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-media/headliner/HEAD/figures/seq2seq.jpg -------------------------------------------------------------------------------- /run_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=. 3 | python3 headliner/training_runner.py 4 | -------------------------------------------------------------------------------- /mkdocs/docs/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-media/headliner/HEAD/mkdocs/docs/img/favicon.ico -------------------------------------------------------------------------------- /pypi.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | rm -rf dist/ 4 | python setup.py sdist bdist_wheel 5 | twine upload dist/* -------------------------------------------------------------------------------- /figures/headline_generator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-media/headliner/HEAD/figures/headline_generator.png -------------------------------------------------------------------------------- /headliner/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .attention_summarizer import AttentionSummarizer 2 | from .basic_summarizer import BasicSummarizer 3 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.0.2 3 | commit = False 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | 8 | [bumpversion:file:headliner/__init__.py] 9 | 10 | -------------------------------------------------------------------------------- /headliner/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation_callback import EvaluationCallback 2 | from .model_checkpoint_callback import ModelCheckpointCallback 3 | from .validation_callback import ValidationCallback 4 | -------------------------------------------------------------------------------- 
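The `tests/resources/small_glove.txt` fixture above uses the plain GloVe text format (a token followed by its vector components), which is what `headliner/embeddings.py` further down consumes via `read_embedding`. A short sketch of loading that fixture, assuming the repository root as the working directory; the token index is made up for illustration.

```python
from headliner.embeddings import read_embedding, embedding_to_matrix

# Load the 3-dimensional toy vectors from the test fixture above.
glove = read_embedding('tests/resources/small_glove.txt', vector_dim=3)
print(glove['a'])  # array([1., 2., 3.], dtype=float32)

# Build an embedding-layer weight matrix; tokens missing from the file
# (here 'c') are initialized with random values in (-1, 1).
token_index = {'a': 1, 'b': 2, 'c': 3}
matrix = embedding_to_matrix(glove, token_index, embedding_dim=3)
print(matrix.shape)  # (4, 3), row 0 is reserved for padding
```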
/headliner/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .bucket_generator import BucketGenerator 2 | from .dataset_generator import DatasetGenerator 3 | from .preprocessor import Preprocessor 4 | from .vectorizer import Vectorizer 5 | -------------------------------------------------------------------------------- /mkdocs/run_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cp ../README.md docs/index.md 4 | cp ../CONTRIBUTING.md docs/CONTRIBUTING.md 5 | cp ../LICENSE docs/LICENSE.md 6 | cp -R ../figures docs/ 7 | python autogen.py 8 | mkdocs serve -------------------------------------------------------------------------------- /mkdocs/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cp ../README.md docs/index.md 4 | cp ../CONTRIBUTING.md docs/CONTRIBUTING.md 5 | cp ../LICENSE docs/LICENSE.md 6 | cp -R ../figures docs/ 7 | python autogen.py 8 | mkdir ../docs 9 | mkdocs build -c -d ../docs/ -------------------------------------------------------------------------------- /mkdocs/README.md: -------------------------------------------------------------------------------- 1 | # Headliner Documentation 2 | 3 | ## Building the documentation 4 | - Install MkDocs: `pip install mkdocs mkdocs-material` 5 | - Serve MkDocs: `mkdocs serve` and then go to `http://127.0.0.1:8000/` to view it 6 | - Run `python autogen.py` to auto-generate the code documentation -------------------------------------------------------------------------------- /config/trainer_config.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 16 2 | max_vocab_size: 200000 3 | glove_path: null 4 | steps_per_epoch: 500 5 | tensorboard_dir: '/tmp/training' 6 | model_save_path: '/tmp/summarizer' 7 | bucketing_buffer_size_batches: 10000 8 | bucketing_batches_to_bucket: 100 9 | steps_to_log: 10 10 | logging_level: 'info' -------------------------------------------------------------------------------- /headliner/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def masked_crossentropy(targets: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: 5 | crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) 6 | mask = tf.math.logical_not(tf.math.equal(targets, 0)) 7 | mask = tf.cast(mask, dtype=tf.int64) 8 | loss = crossentropy(targets, logits, sample_weight=mask) 9 | return loss 10 | -------------------------------------------------------------------------------- /tests/resources/trainer_test_config.yaml: -------------------------------------------------------------------------------- 1 | max_input_len: 10 2 | max_output_len: 9 3 | batch_size: 1 4 | max_vocab_size_encoder: 7 5 | max_vocab_size_decoder: 6 6 | embedding_path_encoder: 'glove.txt' 7 | embedding_path_decoder: null 8 | steps_per_epoch: 4 9 | tensorboard_dir: 'tensor_dir' 10 | model_save_path: 'model_save_path' 11 | use_bucketing: True 12 | shuffle_buffer_size: 1000 13 | bucketing_buffer_size_batches: 5 14 | bucketing_batches_to_bucket: 6 15 | steps_to_log: 7 16 | logging_level: 'debug' 17 | -------------------------------------------------------------------------------- /headliner/evaluation/scorer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import 
Dict, Union 3 | 4 | import numpy as np 5 | 6 | 7 | class Scorer(abc.ABC): 8 | 9 | def __call__(self, prediction: [Dict[str, Union[str, np.array]]]) -> float: 10 | """ 11 | Evaluates prediction. 12 | 13 | Args: 14 | prediction: Dictionary providing all information about a model prediction such as 15 | output string, logits etc. 16 | 17 | Returns: Prediction score as float. 18 | """ 19 | 20 | raise NotImplementedError() 21 | -------------------------------------------------------------------------------- /headliner/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def get_logger(name: str) -> logging.Logger: 6 | logger = logging.getLogger(name) 7 | logger.setLevel(logging.DEBUG) 8 | logger.propagate = False 9 | 10 | if not logger.handlers: 11 | # stream handler ensures that logging events are passed to stdout 12 | ch = logging.StreamHandler(sys.stdout) 13 | ch.setLevel(logging.INFO) 14 | ch_formatter = logging.Formatter('%(message)s') 15 | ch.setFormatter(ch_formatter) 16 | logger.addHandler(ch) 17 | 18 | return logger 19 | -------------------------------------------------------------------------------- /tests/preprocessing/test_bert_preprocessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from spacy.lang.en import English 4 | from headliner.preprocessing.bert_preprocessor import BertPreprocessor 5 | 6 | 7 | class TestBertPreprocessor(unittest.TestCase): 8 | 9 | def test_preprocessing(self): 10 | nlp = English() 11 | preprocessor = BertPreprocessor(nlp=nlp) 12 | data = ('I love my dog. He is the best. He eats and poops.', 'Me and my dog.') 13 | data_preprocessed = preprocessor(data) 14 | self.assertEqual(('[CLS] I love my dog. [SEP] [CLS] He is the best. [SEP] [CLS] He eats and poops. [SEP]', 15 | '[CLS] Me and my dog. [SEP]'), data_preprocessed) 16 | -------------------------------------------------------------------------------- /tests/preprocessing/test_keras_tokenizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 3 | 4 | 5 | class TestKerasTokenizer(unittest.TestCase): 6 | 7 | def test_keras_tokenizer(self): 8 | tokenizer = KerasTokenizer(filters='', lower=False, oov_token='') 9 | tokenizer.fit(['a b c d']) 10 | encoded = tokenizer.encode('a b e') 11 | self.assertEqual([2, 3, 1], encoded) 12 | decoded = tokenizer.decode(encoded) 13 | self.assertEqual('a b ', decoded) 14 | self.assertEqual(5, tokenizer.vocab_size) 15 | self.assertEqual({'a', 'b', 'c', 'd', ''}, tokenizer.token_index.keys()) 16 | self.assertEqual({1, 2, 3, 4, 5}, set(tokenizer.token_index.values())) 17 | 18 | -------------------------------------------------------------------------------- /headliner/callbacks/tensorboard_callback.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class TensorboardCallback(tf.keras.callbacks.Callback): 5 | """ 6 | Callback for validation loss. 7 | """ 8 | 9 | def __init__(self, 10 | log_dir: str) -> None: 11 | """ 12 | Initializes the Callback. 13 | 14 | Args: 15 | log_dir: Tensorboard log directory to write to. 
16 | """ 17 | 18 | super().__init__() 19 | self.summary_writer = tf.summary.create_file_writer(log_dir) 20 | 21 | def on_epoch_end(self, batch, logs=None) -> None: 22 | if logs is not None: 23 | for key, val in logs.items(): 24 | with self.summary_writer.as_default(): 25 | tf.summary.scalar(key, val, step=batch) 26 | -------------------------------------------------------------------------------- /headliner/preprocessing/keras_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Iterable, Dict 2 | 3 | from keras_preprocessing.text import Tokenizer as KTokenizer 4 | 5 | from headliner.preprocessing.tokenizer import Tokenizer 6 | 7 | 8 | class KerasTokenizer(Tokenizer): 9 | 10 | def __init__(self, **kwargs): 11 | self._keras_tokenizer = KTokenizer(**kwargs) 12 | 13 | def encode(self, text: str) -> List[int]: 14 | return self._keras_tokenizer.texts_to_sequences([text])[0] 15 | 16 | def decode(self, sequence: List[int]) -> str: 17 | return self._keras_tokenizer.sequences_to_texts([sequence])[0] 18 | 19 | @property 20 | def vocab_size(self) -> int: 21 | return len(self._keras_tokenizer.word_index) 22 | 23 | def fit(self, texts: Iterable[str]): 24 | self._keras_tokenizer.fit_on_texts(texts) 25 | 26 | @property 27 | def token_index(self) -> Dict[str, int]: 28 | return self._keras_tokenizer.word_index 29 | -------------------------------------------------------------------------------- /headliner/preprocessing/tokenizer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import abstractmethod 3 | from typing import List 4 | 5 | 6 | class Tokenizer(abc.ABC): 7 | """ 8 | Encodes text to sequences and decodes sequences to text. 9 | """ 10 | 11 | @abstractmethod 12 | def encode(self, text: str) -> List[int]: 13 | """ 14 | Encodes a given string into a sequence of indices. 15 | 16 | Args: 17 | text: Text to encode. 18 | 19 | Returns: Encoded sequence. 20 | """ 21 | pass 22 | 23 | @abstractmethod 24 | def decode(self, sequence: List[int]) -> str: 25 | """ 26 | Decodees a given sequence into a text. 27 | 28 | Args: 29 | sequence: Sequence to decode. 30 | 31 | Returns: Decoded text. 32 | 33 | """ 34 | pass 35 | 36 | @property 37 | @abstractmethod 38 | def vocab_size(self) -> int: 39 | """ 40 | Size of token vocab. 
41 | """ 42 | pass 43 | -------------------------------------------------------------------------------- /mkdocs/docs/img/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_embeddings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import numpy as np 5 | from numpy import array 6 | from numpy.testing import assert_array_equal 7 | 8 | from headliner.embeddings import read_embedding, embedding_to_matrix 9 | 10 | 11 | class TestEmbeddings(unittest.TestCase): 12 | 13 | def test_read_embedding(self): 14 | current_dir = os.path.dirname(os.path.abspath(__file__)) 15 | file_path = os.path.join(current_dir, 'resources/small_glove.txt') 16 | glove = read_embedding(file_path, vector_dim=3) 17 | assert_array_equal(array([1, 2, 3]), glove['a']) 18 | 19 | def test_embedding_to_matrix(self): 20 | embedding = {'a': np.array(2), 'b': np.array(3), 'c': np.array(4)} 21 | token_index = {'a': 1, 'b': 2, 'd': 3} 22 | matrix = embedding_to_matrix(embedding, token_index, 1) 23 | np.testing.assert_array_equal(matrix[1], np.array(2)) 24 | np.testing.assert_array_equal(matrix[2], np.array(3)) 25 | # random values for zero index and tokens not in embedding 26 | self.assertTrue(-1 < float(matrix[0]) < 1) 27 | self.assertTrue(-1 < float(matrix[3]) < 1) 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.6 4 | install: 5 | - pip install -e ".[tests, docs]" 6 | script: 7 | - pytest -vs --cov=headliner --cov-report xml --show-capture=no --disable-pytest-warnings tests/ 8 | - cd mkdocs && sh build_docs.sh 9 | deploy: 10 | provider: pages 11 | skip_cleanup: true 12 | github_token: "$GITHUB_TOKEN" 13 | local-dir: docs/ 14 | on: 15 | branch: master 16 | target_branch: gh-pages 17 | env: 18 | global: 19 | secure: gGBl1jMUnIgCmJFbjhF0fafCL5uAtIw5HlWRIb7BeZg+hTZ0ERFvxMXQzqpGSr1teFBrudaquF4tNa1JUlha70orWgplykvSFdf5Dq7GtuthSshPj9JcfHqJhZgEpABL4ed/U4t8M/aRZ2wERY0HSlvm2gGV4ghhmG8Sce+XCG3c2dPF4BeIKYK6KlHcQoMDVy6wjQIgBe2GXdm+VzoRUWqKoV59a+tXHgJb4sWL4VueSwOOZaTW2zQNeuB8NPfc4J36O90BrHaiw2XHoEVVA9zd9nntrhYs+G8C9oDQuEJZzq2mOsvmu4QQgdzkf7VIkzs7FNGEHL3TO+ecQvBaaA6eBnnCN0NoDJ723pkcROZQQHaN+AN04JeZBqv155hsXKRNZAM3NOLI9AZxIB6j+bz6KwdWzUt5jasjIvYPq2qiJ7nNzrIntoVaIC0sX2C0cJJdHsHk3Mh4TZDP7iJB8tGUmzN0z5ZvJR3rvZuHbEcihOXqs/Y6rY1mBA4dGeXcu2mIA+7ard7zGxbDnRRqzdJdmQdJRlSeFWSeAjRzJzkS9ebpPPH1eTqsRYtE3XnSDsiyiQEaxeCRvyvSH9582GO2ztiZwUVv8HhmSchiviXf48yv9PoyKkl/yuIg6vR+Jx+bIh/2CiJfzE1Ku07WnUkJl2tlmCF85ABgJcMwKCM= 20 | after_success: 21 | - codecov -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | COPYRIGHT 2 | 3 | Copyright (c) 2019 Axel Springer AI. All rights reserved. 
4 | 5 | LICENSE 6 | 7 | The MIT License (MIT) 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. -------------------------------------------------------------------------------- /tests/callbacks/test_validation_callback.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import Mock 3 | 4 | from headliner.callbacks.validation_callback import ValidationCallback 5 | 6 | 7 | class TestValidationCallback(unittest.TestCase): 8 | 9 | def test_on_epoch_end(self): 10 | mock_summarizer = Mock() 11 | mock_summarizer.new_train_step.return_value = lambda input_seq, output_seq: 0.5 12 | mock_scorer_a, mock_scorer_b = Mock(), Mock() 13 | mock_scorer_a.return_value = 1 14 | mock_scorer_b.return_value = 2 15 | val_dataset_mock = Mock() 16 | val_dataset_mock.take.return_value = [(1, 2), (1, 2)] 17 | loss_function_mock = Mock() 18 | validation_callback = ValidationCallback(summarizer=mock_summarizer, 19 | val_dataset=val_dataset_mock, 20 | loss_function=loss_function_mock, 21 | batch_size=1) 22 | logs = {} 23 | validation_callback.on_epoch_end(0, logs=logs) 24 | self.assertEqual({'loss_val'}, logs.keys()) 25 | self.assertAlmostEqual(0.5, logs['loss_val'], places=10) 26 | -------------------------------------------------------------------------------- /tests/preprocessing/test_vectorizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 4 | from headliner.preprocessing.vectorizer import Vectorizer 5 | 6 | 7 | class TestVectorizer(unittest.TestCase): 8 | 9 | def test_vectorize(self): 10 | data = [('a b c', 'd')] 11 | tokenizer_encoder = KerasTokenizer() 12 | tokenizer_decoder = KerasTokenizer() 13 | tokenizer_encoder.fit([data[0][0]]) 14 | tokenizer_decoder.fit([data[0][1]]) 15 | vectorizer = Vectorizer(tokenizer_encoder, 16 | tokenizer_decoder, 17 | max_input_len=5, 18 | max_output_len=3) 19 | data_vectorized = [vectorizer(d) for d in data] 20 | self.assertEqual([([1, 2, 3], [1, 0, 0])], data_vectorized) 21 | 22 | data = [('a b c', 'd d d d')] 23 | data_vectorized = [vectorizer(d) for d in data] 24 | self.assertEqual([([1, 2, 3], [1, 1, 1])], data_vectorized) 25 | 26 | data = [('a a b b b b b', 'd d d d')] 27 | data_vectorized = [vectorizer(d) for d in data] 28 | self.assertEqual([([1, 1, 2, 2, 2], [1, 1, 1])], data_vectorized) 29 | 
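The vectorizer test above exercises truncation and padding in isolation; in practice the preprocessing pieces are chained: a `Preprocessor` normalizes each raw text pair, a `KerasTokenizer` is fitted on the preprocessed corpus, and a `Vectorizer` wraps the fitted tokenizers. The sketch below mirrors the setup used in `tests/model/test_summarizer.py` further down; the sample sentences, length limits, and the `oov_token` value are illustrative.

```python
from headliner.preprocessing.preprocessor import Preprocessor
from headliner.preprocessing.keras_tokenizer import KerasTokenizer
from headliner.preprocessing.vectorizer import Vectorizer

data = [('a longer input text', 'a short target')]

# Normalize and attach start/end tokens.
preprocessor = Preprocessor()
data_preprocessed = [preprocessor(d) for d in data]

# Fit one tokenizer on both sides of the preprocessed pairs (oov_token is illustrative).
tokenizer = KerasTokenizer(oov_token='<unk>')
tokenizer.fit([' '.join(pair) for pair in data_preprocessed])

# Truncate inputs to 10 tokens, truncate/pad targets to 5 tokens.
vectorizer = Vectorizer(tokenizer, tokenizer, max_input_len=10, max_output_len=5)
data_vectorized = [vectorizer(d) for d in data_preprocessed]
```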
-------------------------------------------------------------------------------- /headliner/preprocessing/bert_preprocessor.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | from spacy.pipeline.pipes import Language 4 | 5 | 6 | class BertPreprocessor: 7 | 8 | def __init__(self, 9 | nlp: Language): 10 | """ 11 | Initializes the preprocessor. 12 | 13 | Args: 14 | nlp: Spacy natural language processing pipeline. 15 | """ 16 | self.nlp = nlp 17 | pipe = self.nlp.create_pipe('sentencizer') 18 | self.nlp.add_pipe(pipe) 19 | self.start_token = '[CLS]' 20 | self.end_token = '[SEP]' 21 | 22 | def __call__(self, data: Tuple[str, str]) -> Tuple[str, str]: 23 | """ Splits input text into sentences and adds start and end token to each sentence. """ 24 | text_encoder, text_decoder = data[0], data[1] 25 | doc = self.nlp(text_encoder) 26 | sentences = [self._process_sentence(s) for s in doc.sents] 27 | text_encoder = ' '.join(sentences) 28 | text_decoder = self.start_token + ' ' + text_decoder + ' ' + self.end_token 29 | return text_encoder, text_decoder 30 | 31 | def _process_sentence(self, sentence): 32 | return self.start_token + ' ' + sentence.string.strip() + ' ' + self.end_token 33 | -------------------------------------------------------------------------------- /tests/evaluation/test_bleu_scorer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from headliner.evaluation.bleu_scorer import BleuScorer 4 | 5 | 6 | class TestBleuScorer(unittest.TestCase): 7 | 8 | def test_score(self): 9 | bleu_scorer = BleuScorer(tokens_to_ignore={'', ''}) 10 | 11 | text_preprocessed = ('', 'this is a test') 12 | pred = 'this is a test' 13 | score = bleu_scorer({'preprocessed_text': text_preprocessed, 'predicted_text': pred}) 14 | self.assertAlmostEqual(1, score, 5) 15 | 16 | text_preprocessed = ('', 'this is a test') 17 | pred = 'this is a test ' 18 | score = bleu_scorer({'preprocessed_text': text_preprocessed, 'predicted_text': pred}) 19 | self.assertAlmostEqual(1, score, 5) 20 | 21 | text_preprocessed = ('', 'it is a guide to action which ensures that the military ' 22 | 'always obeys the commands of the party') 23 | pred = 'it is a guide to action that ensures that the military will ' \ 24 | 'forever heed Party commands ' 25 | score = bleu_scorer({'preprocessed_text': text_preprocessed, 'predicted_text': pred}) 26 | self.assertAlmostEqual(0.4138, score, 3) 27 | -------------------------------------------------------------------------------- /tests/callbacks/test_evaluation_callback.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import Mock 3 | 4 | from headliner.callbacks.evaluation_callback import EvaluationCallback 5 | 6 | 7 | class TestEvaluationCallback(unittest.TestCase): 8 | 9 | def test_on_epoch_end(self): 10 | mock_summarizer = Mock() 11 | mock_summarizer.predict_vectors.return_value = None 12 | mock_scorer_a, mock_scorer_b = Mock(), Mock() 13 | mock_scorer_a.return_value = 1 14 | mock_scorer_b.return_value = 2 15 | val_data = [('a', 'b'), ('c', 'd')] 16 | evaluation_callback = EvaluationCallback(summarizer=mock_summarizer, 17 | scorers={'mock_score_a': mock_scorer_a, 18 | 'mock_score_b': mock_scorer_b}, 19 | val_data=val_data, 20 | print_num_examples=0) 21 | logs = {} 22 | evaluation_callback.on_epoch_end(0, logs=logs) 23 | self.assertEqual({'mock_score_a', 'mock_score_b'}, logs.keys()) 24 | 
self.assertAlmostEqual(1.0, logs['mock_score_a'], places=10) 25 | self.assertAlmostEqual(2.0, logs['mock_score_b'], places=10) 26 | -------------------------------------------------------------------------------- /headliner/evaluation/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union 2 | 3 | import numpy as np 4 | from nltk.translate.bleu_score import sentence_bleu 5 | 6 | from headliner.evaluation.scorer import Scorer 7 | 8 | 9 | class BleuScorer(Scorer): 10 | """ 11 | Provides BLEU score for a model prediction. 12 | """ 13 | 14 | def __init__(self, tokens_to_ignore=None, weights=(0.25, 0.25, 0.25, 0.25)) -> None: 15 | """ 16 | Initializes the scorer. 17 | 18 | Args: 19 | tokens_to_ignore: Tokens to be removed before comparing input and output text. 20 | weights: Custom weights for 1,2,3,4 grams, e.g. (1, 0, 0, 0) will only measure 1-gram overlaps. 21 | """ 22 | self.tokens_to_exclude = tokens_to_ignore or [] 23 | self.weights = weights 24 | 25 | def __call__(self, prediction: [Dict[str, Union[str, np.array]]]) -> float: 26 | tokens_predicted = prediction['predicted_text'].split() 27 | tokens_output = prediction['preprocessed_text'][1].split() 28 | tokens_predicted_filtered = [t for t in tokens_predicted if t not in self.tokens_to_exclude] 29 | tokens_output_filtered = [t for t in tokens_output if t not in self.tokens_to_exclude] 30 | return sentence_bleu([tokens_output_filtered], tokens_predicted_filtered, weights=self.weights) 31 | -------------------------------------------------------------------------------- /tests/preprocessing/test_bert_vectorizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from transformers import BertTokenizer 4 | 5 | from headliner.preprocessing.bert_vectorizer import BertVectorizer 6 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 7 | 8 | 9 | class TestBertVectorizer(unittest.TestCase): 10 | 11 | def test_vectorize(self): 12 | data = ('[CLS] I love my dog. [SEP] [CLS] He is the best. [SEP]', '[CLS] Dog. [SEP]') 13 | tokenizer_encoder = BertTokenizer.from_pretrained('bert-base-uncased') 14 | tokenizer_decoder = KerasTokenizer() 15 | tokenizer_decoder.fit([data[1]]) 16 | vectorizer = BertVectorizer(tokenizer_encoder, 17 | tokenizer_decoder, 18 | max_input_len=50, 19 | max_output_len=3) 20 | 21 | data_vectorized = vectorizer(data) 22 | expected = ([101, 1045, 2293, 2026, 3899, 1012, 102, 101, 2002, 2003, 1996, 2190, 1012, 102], 23 | [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 2, 3]) 24 | self.assertEqual(expected, data_vectorized) 25 | input_decoded = vectorizer.decode_input(expected[0]) 26 | expected = '[CLS] i love my dog. [SEP] [CLS] he is the best. [SEP]' 27 | self.assertEqual(expected, input_decoded) 28 | 29 | 30 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | We welcome any contributions whether it's, 4 | 5 | - Submitting feedback 6 | - Fixing bugs 7 | - Or implementing a new feature. 8 | 9 | Please read this guide before making any contributions. 10 | 11 | #### Submit Feedback 12 | The feedback should be submitted by creating an issue at [GitHub issues](https://github.com/as-ideas/headliner/issues). 13 | Select the related template (bug report, feature request, or custom) and add the corresponding labels. 
14 | 15 | #### Fix Bugs: 16 | You may look through the [GitHub issues](https://github.com/as-ideas/headliner/issues) for bugs. 17 | 18 | #### Implement Features 19 | You may look through the [GitHub issues](https://github.com/as-ideas/headliner/issues) for feature requests. 20 | 21 | ## Pull Requests (PR) 22 | 1. Fork the repository and a create a new branch from the master branch. 23 | 2. For bug fixes, add new tests and for new features please add changes to the documentation. 24 | 3. Do a PR from your new branch to our `master` branch of the original Headliner repo. 25 | 26 | ## Documentation 27 | - Make sure any new function or class you introduce has proper docstrings. 28 | 29 | ## Testing 30 | - We use [pytest](https://docs.pytest.org/en/latest/) for our testing. Make sure to write tests for any new feature and/or bug fixes. 31 | 32 | ## Main Contributor List 33 | We maintain a list of main contributors to appreciate all the contributions. -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | batch: true 3 | branches: 4 | include: 5 | - '*' 6 | paths: 7 | exclude: 8 | - '*.md' 9 | 10 | pr: 11 | paths: 12 | exclude: 13 | - '*.md' 14 | 15 | jobs: 16 | - job: 'Test' 17 | strategy: 18 | matrix: 19 | Python36Linux: 20 | imageName: 'ubuntu-16.04' 21 | python.version: '3.6' 22 | Python36Windows: 23 | imageName: 'vs2017-win2016' 24 | python.version: '3.6' 25 | Python36Mac: 26 | imageName: 'macos-10.13' 27 | python.version: '3.6' 28 | Python37Linux: 29 | imageName: 'ubuntu-16.04' 30 | python.version: '3.7' 31 | Python37Windows: 32 | imageName: 'vs2017-win2016' 33 | python.version: '3.7' 34 | Python37Mac: 35 | imageName: 'macos-10.13' 36 | python.version: '3.7' 37 | maxParallel: 4 38 | pool: 39 | vmImage: $(imageName) 40 | 41 | steps: 42 | - task: UsePythonVersion@0 43 | inputs: 44 | versionSpec: '$(python.version)' 45 | displayName: 'Use Python $(python.version)' 46 | 47 | - script: | 48 | python -m pip install --upgrade pip 49 | pip install -e ".[tests, docs]" 50 | displayName: 'Install dependencies' 51 | - script: | 52 | pip install pytest pytest-azurepipelines 53 | pytest -vs --show-capture=no tests 54 | displayName: 'pytest' -------------------------------------------------------------------------------- /tests/preprocessing/test_preprocessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from headliner.preprocessing.preprocessor import Preprocessor 4 | 5 | 6 | class TestPreprocessor(unittest.TestCase): 7 | 8 | def test_preprocessing(self): 9 | preprocessor = Preprocessor() 10 | data = (('First text!', 'first head'), ('2-nd täxt', 'Second head')) 11 | data_preprocessed = [preprocessor(d) for d in data] 12 | self.assertEqual((' first text ! 
', ' first head '), data_preprocessed[0]) 13 | self.assertEqual((' #-nd täxt ', ' second head '), data_preprocessed[1]) 14 | 15 | preprocessor = Preprocessor(start_token='', 16 | end_token='', 17 | lower_case=True, 18 | hash_numbers=False) 19 | data_preprocessed = [preprocessor(d) for d in data] 20 | self.assertEqual((' 2-nd täxt ', ' second head '), data_preprocessed[1]) 21 | 22 | preprocessor = Preprocessor(start_token='', 23 | end_token='', 24 | lower_case=False, 25 | hash_numbers=True) 26 | data_preprocessed = [preprocessor(d) for d in data] 27 | self.assertEqual((' #-nd täxt ', ' Second head '), data_preprocessed[1]) 28 | 29 | -------------------------------------------------------------------------------- /tests/callbacks/test_model_checkpointing_callback.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import unittest 4 | import os 5 | from unittest.mock import Mock, call 6 | 7 | from headliner.callbacks.model_checkpoint_callback import ModelCheckpointCallback 8 | 9 | 10 | class TestModelCheckpointCallback(unittest.TestCase): 11 | 12 | def setUp(self) -> None: 13 | self.temp_dir = tempfile.mkdtemp(prefix='TestModelCheckpointingCallback') 14 | 15 | def tearDown(self) -> None: 16 | shutil.rmtree(self.temp_dir, ignore_errors=True) 17 | 18 | def test_on_epoch_end(self): 19 | mock_summarizer = Mock() 20 | model_save_path = os.path.join(self.temp_dir, 'summarizer_save') 21 | model_checkoint_callback = ModelCheckpointCallback(file_path=model_save_path, 22 | summarizer=mock_summarizer, 23 | monitor='loss_val', 24 | mode='min') 25 | 26 | logs = {'loss_val': 10} 27 | model_checkoint_callback.on_epoch_end(0, logs=logs) 28 | mock_summarizer.save.assert_called_with(model_save_path) 29 | logs = {'loss_val': 20} 30 | model_checkoint_callback.on_epoch_end(0, logs=logs) 31 | mock_summarizer.save.assert_called_with(model_save_path) 32 | logs = {'loss_val': 5} 33 | model_checkoint_callback.on_epoch_end(0, logs=logs) 34 | mock_summarizer.save.assert_has_calls([call(model_save_path), call(model_save_path)]) 35 | -------------------------------------------------------------------------------- /headliner/preprocessing/dataset_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Callable 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class DatasetGenerator: 7 | 8 | def __init__(self, 9 | batch_size: int, 10 | shuffle_buffer_size=None, 11 | rank=2): 12 | self.batch_size = batch_size 13 | self.shuffle_buffer_size = shuffle_buffer_size 14 | if rank == 2: 15 | self.tensor_types = (tf.int32, tf.int32) 16 | self.tensor_shapes = (tf.TensorShape([None]), tf.TensorShape([None])) 17 | elif rank == 3: 18 | self.tensor_types = (tf.int32, tf.int32, tf.int32) 19 | self.tensor_shapes = (tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None])) 20 | else: 21 | raise ValueError('Rank must be either 2 or 3, but was: {}'.format(rank)) 22 | 23 | def __call__(self, data_generator_func: Callable[..., Iterable]) -> tf.data.Dataset: 24 | """ 25 | Initializes a dataset generator. 26 | 27 | Args: 28 | data_generator_func: Callable that returns an iterable over the data to be batched, e.g. lambda: [1, 2, 3]. 
29 | """ 30 | 31 | dataset = tf.data.Dataset.from_generator(data_generator_func, 32 | self.tensor_types, 33 | self.tensor_shapes) 34 | if self.shuffle_buffer_size is not None: 35 | dataset = dataset.shuffle(self.shuffle_buffer_size) 36 | dataset = dataset.padded_batch(batch_size=self.batch_size, 37 | padded_shapes=self.tensor_shapes, 38 | drop_remainder=True) 39 | return dataset 40 | -------------------------------------------------------------------------------- /tests/preprocessing/test_dataset_generator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from headliner.preprocessing.dataset_generator import DatasetGenerator 6 | 7 | 8 | class TestDatasetGenerator(unittest.TestCase): 9 | 10 | def test_generate_dataset(self): 11 | data = [([1, 1], [2, 2]), ([1, 1, 1], [2, 2])] 12 | batch_generator = DatasetGenerator(batch_size=1) 13 | 14 | # batch size = 1 15 | batches_iter = iter(batch_generator(lambda: data)) 16 | batches = next(batches_iter) 17 | print(batches[0].numpy().tolist()) 18 | expected = [[[1, 1]], [[2, 2]]] 19 | np.testing.assert_array_equal(expected[0], batches[0].numpy().tolist()) 20 | np.testing.assert_array_equal(expected[1], batches[1].numpy().tolist()) 21 | 22 | # batch size = 2 23 | batch_generator = DatasetGenerator(batch_size=2) 24 | batches_iter = iter(batch_generator(lambda: data)) 25 | batches = next(batches_iter) 26 | expected = [[[1, 1, 0], [1, 1, 1]], [[2, 2], [2, 2]]] 27 | np.testing.assert_array_equal(expected[0], batches[0].numpy().tolist()) 28 | np.testing.assert_array_equal(expected[1], batches[1].numpy().tolist()) 29 | 30 | # batch size = 2, rank = 3 31 | data = [([1, 1], [0, 1], [2, 2]), 32 | ([1, 1, 1], [1, 1, 1], [3, 3, 3])] 33 | batch_generator = DatasetGenerator(batch_size=2, rank=3) 34 | batches_iter = iter(batch_generator(lambda: data)) 35 | batches = next(batches_iter) 36 | expected = [[[1, 1, 0], [1, 1, 1]], [[0, 1, 0], [1, 1, 1]], [[2, 2, 0], [3, 3, 3]]] 37 | np.testing.assert_array_equal(expected[0], batches[0].numpy().tolist()) 38 | np.testing.assert_array_equal(expected[1], batches[1].numpy().tolist()) 39 | -------------------------------------------------------------------------------- /mkdocs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Headliner 2 | site_author: Axel Springer AI 3 | 4 | nav: 5 | - Home: index.md 6 | - Examples: 7 | - Neural Machine Translation: examples/nmt_example.md 8 | - Advanced Neural Machine Translation: examples/advanced_nmt_example.md 9 | - BERT Machine Translation: examples/bert_example.md 10 | - Documentation: 11 | - Callbacks: 12 | - ValidationCallback: callbacks/validation_callback.md 13 | - EvaluationCallback: callbacks/evaluation_callback.md 14 | - ModelCheckpointCallback: callbacks/model_checkpoint_callback.md 15 | - Evaluation: 16 | - BleuScorer: evaluation/bleu_scorer.md 17 | - Model: 18 | - Summarizer: model/summarizer.md 19 | - SummarizerAttention: model/summarizer_attention.md 20 | - SummarizerTransformer: model/summarizer_transformer.md 21 | - Preprocessing: 22 | - BucketGenerator: preprocessing/bucket_generator.md 23 | - DatasetGenerator: preprocessing/dataset_generator.md 24 | - Preprocessor: preprocessing/preprocessor.md 25 | - Vectorizer: preprocessing/vectorizer.md 26 | - Tokenizer: preprocessing/tokenizer.md 27 | - Trainer: trainer.md 28 | - Losses: losses.md 29 | - Embeddings: embeddings.md 30 | - Contribution: CONTRIBUTING.md 31 | - License: 
LICENSE.md 32 | 33 | theme: 34 | name: 'material' 35 | palette: 36 | primary: 'black' 37 | accent: 'indigo' 38 | logo: 'img/logo.svg' 39 | favicon: 'img/favicon.ico' 40 | 41 | repo_name: 'as-ideas/headliner' 42 | repo_url: 'https://github.com/as-ideas/headliner' 43 | 44 | google_analytics: 45 | - 'UA-137434942-5' 46 | - 'auto' 47 | 48 | markdown_extensions: 49 | - codehilite -------------------------------------------------------------------------------- /mkdocs/docs/examples/nmt_example.md: -------------------------------------------------------------------------------- 1 | # Neural Machine Translation Example 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/as-ideas/headliner/blob/master/notebooks/Neural_Machine_Translation_Example.ipynb) 4 | 5 | ### Install TensorFlow and also our package via PyPI 6 | ```bash 7 | pip install tensorflow-gpu==2.0.0 8 | pip install headliner 9 | ``` 10 | 11 | ### Download the German-English sentence pairs 12 | ```bash 13 | wget http://www.manythings.org/anki/deu-eng.zip 14 | unzip deu-eng.zip 15 | ``` 16 | 17 | ### Create the dataset but only take a subset for faster training 18 | ```python 19 | import io 20 | 21 | def create_dataset(path, num_examples): 22 | lines = io.open(path, encoding='UTF-8').read().strip().split('\n') 23 | word_pairs = [[w for w in l.split('\t')[:2]] for l in lines[:num_examples]] 24 | return zip(*word_pairs) 25 | 26 | eng, ger, meta = create_dataset('deu.txt', 30000) 27 | data = list(zip(eng, ger)) 28 | ``` 29 | 30 | ### Split the dataset into train and test 31 | ```python 32 | from sklearn.model_selection import train_test_split 33 | 34 | train, test = train_test_split(data, test_size=100) 35 | ``` 36 | 37 | ### Define the model and train it 38 | ```python 39 | from headliner.trainer import Trainer 40 | from headliner.model.attention_summarizer import AttentionSummarizer 41 | 42 | summarizer = AttentionSummarizer(lstm_size=1024, embedding_size=256) 43 | trainer = Trainer(batch_size=64, 44 | steps_per_epoch=100, 45 | steps_to_log=20, 46 | max_output_len=10, 47 | model_save_path='/tmp/summarizer') 48 | trainer.train(summarizer, train, num_epochs=10, val_data=test) 49 | ``` 50 | 51 | ### Do some prediction 52 | ```python 53 | summarizer.predict('How are you?') 54 | ``` -------------------------------------------------------------------------------- /headliner/callbacks/validation_callback.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import tensorflow as tf 4 | 5 | from headliner.model.summarizer import Summarizer 6 | 7 | 8 | class ValidationCallback(tf.keras.callbacks.Callback): 9 | """ 10 | Callback for validation loss. 11 | """ 12 | 13 | def __init__(self, 14 | summarizer: Summarizer, 15 | val_dataset: tf.data.Dataset, 16 | loss_function: Callable[[tf.Tensor, tf.Tensor], tf.Tensor], 17 | batch_size: int) -> None: 18 | """ 19 | Initializes the Callback. 20 | 21 | Args: 22 | summarizer: Summarizer to validate. 23 | val_dataset: Validation dataset to validate the model on. 24 | loss_function: Loss function to apply to calculate the validation score. 25 | batch_size: Batch size of the validation dataset, needed for initializing the model. 
26 | """ 27 | 28 | super().__init__() 29 | self.batch_size = batch_size 30 | self.summarizer = summarizer 31 | self.loss_function = loss_function 32 | self.val_dataset = val_dataset 33 | self.train_step = summarizer.new_train_step(self.loss_function, 34 | self.batch_size, 35 | apply_gradients=False) 36 | 37 | def on_epoch_end(self, batch, logs=None) -> None: 38 | if logs is None: 39 | logs = {} 40 | val_loss, count_batches_val = 0, 0 41 | for batch in self.val_dataset.take(-1): 42 | val_loss_batch = self.train_step(*batch) 43 | val_loss += val_loss_batch 44 | count_batches_val += 1 45 | if count_batches_val == 0: 46 | raise ValueError('Tried to validate on empty validation dataset, possibly due to batch size ' 47 | 'exceeding validation data size.') 48 | logs['loss_val'] = float(val_loss / count_batches_val) 49 | -------------------------------------------------------------------------------- /headliner/embeddings.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | 5 | 6 | def read_embedding(file_path: str, vector_dim: int) -> Dict[str, np.array]: 7 | """ 8 | Reads an embedding file in glove format into a dictionary mapping tokens to vectors. 9 | """ 10 | 11 | glove = {} 12 | with open(file_path, encoding='utf-8') as f: 13 | for line in f: 14 | values = line.split() 15 | # handle weird whitespaces in tokens 16 | if len(values[1:]) > vector_dim: 17 | wordcount = len(values) - vector_dim 18 | vec = np.asarray(values[wordcount:], dtype='float32') 19 | else: 20 | vec = np.asarray(values[1:], dtype='float32') 21 | token = values[0] 22 | glove[token] = vec 23 | return glove 24 | 25 | 26 | def embedding_to_matrix(embedding: Dict[str, np.array], 27 | token_index: Dict[str, int], 28 | embedding_dim: int) -> np.array: 29 | """ 30 | Converts an embedding dictionary into a weights matrix used to initialize an embedding layer. 31 | It ensures that all tokens in the token_index dictionare are mapped to a row, even those that are 32 | not contained in the provided embedding dictionary. Unknown tokens are initialized with a random 33 | vector with entries between -1 and 1. 34 | 35 | Args: 36 | embedding: dictionary mapping tokens to embedding vectors 37 | token_index: dictionary mapping tokens to indices that are fed into the embedding layer 38 | embedding_dim: size of the embedding vectors 39 | 40 | Returns: embedding weights as numpy array 41 | """ 42 | np.random.seed(42) 43 | embedding_matrix = 2. * np.random.rand(len(token_index) + 1, embedding_dim) - 1. 44 | for token, index in token_index.items(): 45 | embedding_vec = embedding.get(token) 46 | if embedding_vec is not None: 47 | embedding_matrix[index] = embedding_vec 48 | return embedding_matrix 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | long_description = ''' 4 | Headliner is a sequence modeling library that eases the training and 5 | **in particular, the deployment of custom sequence models** for both researchers and developers. 6 | You can very easily deploy your models in a few lines of code. It was originally 7 | built for our own research to generate headlines from news articles. 8 | That's why we chose the name, Headliner. 
Although this library was created internally to 9 | generate headlines, you can also use it for other tasks like machine translations, 10 | text summarization and many more. 11 | 12 | Read the documentation at: https://as-ideas.github.io/headliner/ 13 | 14 | Headliner is compatible with Python 3.6+ and is distributed under the MIT license. 15 | ''' 16 | 17 | setup( 18 | name='headliner', 19 | version='1.0.2', 20 | author='Christian Schäfer', 21 | author_email='c.schaefer.home@gmail.com', 22 | description='Easy training and deployment of seq2seq models.', 23 | long_description=long_description, 24 | license='MIT', 25 | install_requires=['scikit-learn', 'nltk', 'pyyaml', 'transformers>=2.2.2', 'spacy>=2.2.2'], 26 | extras_require={ 27 | 'tests': ['pytest', 'pytest-cov', 'codecov', 'tensorflow==2.0.0'], 28 | 'docs': ['mkdocs', 'mkdocs-material'], 29 | 'dev': ['bumpversion'] 30 | }, 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | 'Intended Audience :: Developers', 34 | 'Intended Audience :: Education', 35 | 'Intended Audience :: Science/Research', 36 | 'License :: OSI Approved :: MIT License', 37 | 'Operating System :: POSIX :: Linux', 38 | 'Operating System :: MacOS :: MacOS X', 39 | 'Operating System :: Microsoft :: Windows', 40 | 'Programming Language :: Python :: 3', 41 | 'Programming Language :: Python :: 3.6', 42 | 'Programming Language :: Python :: 3.7', 43 | 'Topic :: Scientific/Engineering', 44 | 'Topic :: Software Development :: Libraries', 45 | 'Topic :: Software Development :: Libraries :: Python Modules', 46 | ], 47 | packages=find_packages(exclude=('tests',)), 48 | ) 49 | -------------------------------------------------------------------------------- /headliner/callbacks/model_checkpoint_callback.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from headliner.model.summarizer import Summarizer 4 | 5 | 6 | class ModelCheckpointCallback(tf.keras.callbacks.Callback): 7 | """ 8 | Callback for checkpointing summarizer models. 9 | """ 10 | 11 | def __init__(self, 12 | file_path: str, 13 | summarizer: Summarizer, 14 | monitor='loss_val', 15 | mode='min') -> None: 16 | 17 | """ 18 | Initializes the Callback. 19 | 20 | Args: 21 | file_path: Path for saving the model (a directory). If existing, the model will be overwritten. 22 | summarizer: Summarizer to checkpoint. 23 | monitor: Name of the score monitor for improvements. 24 | mode: If set to 'min' a decrease of the monitored score is seen as an improvement, otherwise an increase. 
25 | """ 26 | 27 | super().__init__() 28 | self.file_path = file_path 29 | self.summarizer = summarizer 30 | self.monitor = monitor 31 | self.mode = mode 32 | self.best_score = None 33 | 34 | def on_epoch_end(self, batch, logs=None) -> None: 35 | if logs is None: 36 | logs = {} 37 | if self.file_path is None: 38 | return 39 | score = logs[self.monitor] 40 | score_is_better = False 41 | if self.best_score is None: 42 | score_is_better = True 43 | else: 44 | if self.mode == 'min' and score < self.best_score: 45 | score_is_better = True 46 | if self.mode == 'max' and score > self.best_score: 47 | score_is_better = True 48 | if score_is_better: 49 | print('{score_name} improved from {prev} to {current}, ' 50 | 'saving summarizer to {path}'.format(score_name=self.monitor, 51 | prev=self.best_score, 52 | current=score, 53 | path=self.file_path)) 54 | self.best_score = score 55 | self.summarizer.save(self.file_path) 56 | else: 57 | print('{score_name} did not improve.'.format(score_name=self.monitor)) 58 | -------------------------------------------------------------------------------- /headliner/callbacks/evaluation_callback.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Callable, Iterable, Tuple 2 | 3 | import tensorflow as tf 4 | 5 | from headliner.model.summarizer import Summarizer 6 | from headliner.utils.logger import get_logger 7 | 8 | 9 | class EvaluationCallback(tf.keras.callbacks.Callback): 10 | """ 11 | Callback for custom scoring methods. 12 | """ 13 | 14 | def __init__(self, 15 | summarizer: Summarizer, 16 | scorers: Dict[str, Callable[[Dict], float]], 17 | val_data: Iterable[Tuple[str, str]], 18 | print_num_examples=5) -> None: 19 | """ 20 | Initializes the Callback. 21 | 22 | Args: 23 | summarizer: Summarizer that predicts over the validation data. 24 | scorers: Dictionary of {scorer_name: scorer}, where each scorer maps a prediction to a score. 25 | val_data: Raw validation data to predict on. 26 | print_num_examples: Number of prediction examples to output for eyeballing the prediction quality. 27 | """ 28 | 29 | super().__init__() 30 | self.summarizer = summarizer 31 | self.scorers = scorers 32 | self.val_data = val_data 33 | self.logger = get_logger(__name__) 34 | self.print_num_examples = print_num_examples 35 | 36 | def on_epoch_end(self, batch, logs=None) -> None: 37 | if logs is None: 38 | logs = {} 39 | val_scores = {score_name: 0. 
for score_name in self.scorers.keys()} 40 | count_val = 0 41 | for d in self.val_data: 42 | count_val += 1 43 | input_text, target_text = d 44 | prediction = self.summarizer.predict_vectors(input_text, target_text) 45 | if count_val <= self.print_num_examples: 46 | self.logger.info('\n(input) {} \n(target) {} \n(prediction) {}\n'.format( 47 | prediction['preprocessed_text'][0], 48 | prediction['preprocessed_text'][1], 49 | prediction['predicted_text'] 50 | )) 51 | elif len(self.scorers) == 0: 52 | break 53 | for score_name, scorer in self.scorers.items(): 54 | score = scorer(prediction) 55 | val_scores[score_name] += score 56 | for score_name, score in val_scores.items(): 57 | logs[score_name] = float(score / count_val) 58 | -------------------------------------------------------------------------------- /tests/preprocessing/test_bucket_generator.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | import unittest 4 | 5 | from headliner.preprocessing.bucket_generator import BucketGenerator 6 | 7 | 8 | class TestBucketGenerator(unittest.TestCase): 9 | 10 | def test_generate_batches_nonrandom(self): 11 | data = [[i] * i for i in range(10, 0, -1)] 12 | bucket_generator = BucketGenerator(lambda e: len(e), 13 | batch_size=2, 14 | buffer_size_batches=100, 15 | shuffle=False) 16 | buckets_gen = bucket_generator(data) 17 | result = [el for el in buckets_gen] 18 | expected = [[i] * i for i in range(1, 11)] 19 | self.assertEqual(expected, result) 20 | 21 | def test_generate_batches_random(self): 22 | data = [[i] * i for i in range(100, 0, -1)] 23 | random.shuffle(data) 24 | bucket_generator = BucketGenerator(lambda e: len(e), 25 | batch_size=2, 26 | buffer_size_batches=100, 27 | batches_to_bucket=10, 28 | shuffle=True, 29 | seed=42) 30 | buckets_gen = bucket_generator(data) 31 | result = [el for el in buckets_gen] 32 | 33 | # check whether all elements are returned 34 | expected_elements = list(itertools.chain.from_iterable(data)) 35 | expected_elements.sort() 36 | result_elements = list(itertools.chain.from_iterable(result)) 37 | result_elements.sort() 38 | self.assertEqual(expected_elements, result_elements) 39 | 40 | # check whether sequences of similar length are bucketed together 41 | # -> compare sum of length difference within batches against non-bucketed data 42 | raw_total_length_diff = 0 43 | result_total_length_diff = 0 44 | for i in range(0, len(data), 2): 45 | first_seq_raw, second_seq_raw = data[i:i + 2] 46 | first_seq_bucket, second_seq_bucket = result[i:i + 2] 47 | raw_total_length_diff += abs(len(second_seq_raw) - len(first_seq_raw)) 48 | result_total_length_diff += abs(len(second_seq_bucket) - len(first_seq_bucket)) 49 | self.assertTrue(result_total_length_diff < raw_total_length_diff / 4) 50 | -------------------------------------------------------------------------------- /headliner/preprocessing/preprocessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Tuple 3 | 4 | 5 | class Preprocessor: 6 | 7 | def __init__(self, 8 | start_token='', 9 | end_token='', 10 | punctuation_pattern='([!.?,])', 11 | filter_pattern='(["#$%&()*+/:;<=>@[\\]^_`{|}~\t\n])', 12 | add_input_start_end=True, 13 | lower_case=True, 14 | hash_numbers=True): 15 | """ 16 | Initializes the preprocessor. 17 | 18 | Args: 19 | start_token: Unique start token to be inserted at the beginning of the target text. 
20 | end_token: Unique end token to be attached at the end of a target text. 21 | punctuation_pattern: Regex pattern for punktuation that is splitted from the tokens. 22 | filter_pattern: Regex pattern for characters to be removed from the text. 23 | add_input_start_end: Whether to add start and end token to input sequence. 24 | lower_case: Whether to perform lower casing. 25 | hash_numbers: Whether to replace numbers by a #. 26 | """ 27 | self.start_token = start_token 28 | self.end_token = end_token 29 | self.punctuation_pattern = punctuation_pattern 30 | self.filter_pattern = filter_pattern 31 | self.add_input_start_end = add_input_start_end 32 | self.lower_case = lower_case 33 | self.hash_numbers = hash_numbers 34 | 35 | def __call__(self, data: Tuple[str, str]) -> Tuple[str, str]: 36 | """ Performs regex logic for string cleansing and attaches start and end tokens to the text. """ 37 | text_encoder, text_decoder = self._normalize_string(data[0]), self._normalize_string(data[1]) 38 | if self.add_input_start_end: 39 | text_encoder = self.start_token + ' ' + text_encoder + ' ' + self.end_token 40 | text_decoder = self.start_token + ' ' + text_decoder + ' ' + self.end_token 41 | return text_encoder, text_decoder 42 | 43 | def _normalize_string(self, s: str) -> str: 44 | if self.lower_case: 45 | s = s.lower() 46 | if self.filter_pattern is not None: 47 | s = re.sub(self.filter_pattern, '', s) 48 | if self.hash_numbers: 49 | s = re.sub(r'\d+', '#', s) 50 | if self.punctuation_pattern is not None: 51 | s = re.sub(self.punctuation_pattern, r' \1', s) 52 | s = re.sub(r'\s+', r' ', s) 53 | return s 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python 3 | # Edit at https://www.gitignore.io/?templates=python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # pipenv 75 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 76 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 77 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 78 | # install all needed dependencies. 
79 | #Pipfile.lock 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | .spyproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # Mr Developer 95 | .mr.developer.cfg 96 | .project 97 | .pydevproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | .dmypy.json 105 | dmypy.json 106 | 107 | # Pyre type checker 108 | .pyre/ 109 | 110 | # End of https://www.gitignore.io/api/python 111 | 112 | # Custom 113 | .DS_Store 114 | .idea/ 115 | .venv/ 116 | .venv/* 117 | /docs 118 | mkdocs/docs/figures/ 119 | mkdocs/docs/callbacks/ 120 | mkdocs/docs/evaluation/ 121 | mkdocs/docs/model/ 122 | mkdocs/docs/preprocessing/ 123 | mkdocs/docs/utils/ 124 | mkdocs/docs/*.md -------------------------------------------------------------------------------- /headliner/model/basic_model.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class Encoder(tf.keras.Model): 7 | 8 | def __init__(self, 9 | embedding_shape: Tuple[int, int], 10 | lstm_size=50, 11 | embedding_weights=None, 12 | embedding_trainable=True) -> None: 13 | super(Encoder, self).__init__() 14 | vocab_size, vec_dim = embedding_shape 15 | weights = None if embedding_weights is None else [embedding_weights] 16 | self.embedding = tf.keras.layers.Embedding(vocab_size, 17 | vec_dim, 18 | weights=weights, 19 | trainable=embedding_trainable) 20 | self.lstm = tf.keras.layers.LSTM(lstm_size, return_sequences=True, return_state=True, go_backwards=True) 21 | self.lstm_size = lstm_size 22 | 23 | def call(self, 24 | sequence: tf.Tensor, 25 | states: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: 26 | embed = self.embedding(sequence) 27 | output, state_h, state_c = self.lstm(embed, initial_state=states) 28 | return output, state_h, state_c 29 | 30 | def init_states(self, batch_size: int) -> Tuple[tf.Tensor, tf.Tensor]: 31 | return tf.zeros([batch_size, self.lstm_size]), tf.zeros([batch_size, self.lstm_size]) 32 | 33 | 34 | class Decoder(tf.keras.Model): 35 | 36 | def __init__(self, 37 | embedding_shape: Tuple[int, int], 38 | lstm_size=50, 39 | embedding_weights=None, 40 | embedding_trainable=True) -> None: 41 | super(Decoder, self).__init__() 42 | self.lstm_size = lstm_size 43 | vocab_size, vec_dim = embedding_shape 44 | weights = None if embedding_weights is None else [embedding_weights] 45 | self.embedding = tf.keras.layers.Embedding(vocab_size, 46 | vec_dim, 47 | weights=weights, 48 | trainable=embedding_trainable) 49 | self.lstm = tf.keras.layers.LSTM(lstm_size, return_sequences=True, return_state=True) 50 | self.dense = tf.keras.layers.Dense(vocab_size) 51 | 52 | def call(self, sequence: tf.Tensor, state: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: 53 | embed = self.embedding(sequence) 54 | lstm_out, state_h, state_c = self.lstm(embed, state) 55 | logits = self.dense(lstm_out) 56 | return logits, state_h, state_c 57 | -------------------------------------------------------------------------------- /headliner/preprocessing/vectorizer.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | 3 | from headliner.preprocessing.tokenizer import Tokenizer 4 | 5 | 6 | class Vectorizer: 7 | """ 8 | Transforms tuples of text into tuples of vector sequences. 
9 | """ 10 | 11 | def __init__(self, 12 | tokenizer_encoder: Tokenizer, 13 | tokenizer_decoder: Tokenizer, 14 | max_input_len=None, 15 | max_output_len=None) -> None: 16 | """ 17 | Initializes the vectorizer. 18 | 19 | Args: 20 | tokenizer_encoder: Tokenizer that encodes the input text. 21 | tokenizer_decoder: Tokenizer that encodes the target text. 22 | max_input_len (optional): Maximum length of input sequence, 23 | longer sequences will be truncated. 24 | max_output_len (optional): Maximum length of target sequence, 25 | longer sequences will be truncated and shorter sequences 26 | will be padded to max len. 27 | """ 28 | self.encoding_dim = tokenizer_encoder.vocab_size + 1 29 | self.decoding_dim = tokenizer_decoder.vocab_size + 1 30 | self.max_input_len = max_input_len 31 | self.max_output_len = max_output_len 32 | self._tokenizer_encoder = tokenizer_encoder 33 | self._tokenizer_decoder = tokenizer_decoder 34 | 35 | def __call__(self, data: Tuple[str, str]) -> Tuple[List[int], List[int]]: 36 | """ 37 | Encodes preprocessed strings into sequences of one-hot indices. 38 | """ 39 | text_encoder, text_decoder = data[0], data[1] 40 | vec_encoder = self._tokenizer_encoder.encode(text_encoder) 41 | vec_decoder = self._tokenizer_decoder.encode(text_decoder) 42 | if self.max_input_len is not None: 43 | if len(vec_encoder) > self.max_input_len: 44 | vec_encoder = vec_encoder[:self.max_input_len - 1] + [vec_encoder[-1]] 45 | if self.max_output_len is not None: 46 | if len(vec_decoder) > self.max_output_len: 47 | vec_decoder = vec_decoder[:self.max_output_len - 1] + [vec_decoder[-1]] 48 | else: 49 | vec_decoder = vec_decoder + [0] * (self.max_output_len - len(vec_decoder)) 50 | 51 | return vec_encoder, vec_decoder 52 | 53 | def encode_input(self, text: str) -> List[int]: 54 | return self._tokenizer_encoder.encode(text) 55 | 56 | def encode_output(self, text: str) -> List[int]: 57 | return self._tokenizer_decoder.encode(text) 58 | 59 | def decode_input(self, sequence: List[int]) -> str: 60 | return self._tokenizer_encoder.decode(sequence) 61 | 62 | def decode_output(self, sequence: List[int]) -> str: 63 | return self._tokenizer_decoder.decode(sequence) 64 | -------------------------------------------------------------------------------- /tests/model/test_summarizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | from headliner.losses import masked_crossentropy 11 | from headliner.model.basic_summarizer import BasicSummarizer 12 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 13 | from headliner.preprocessing.preprocessor import Preprocessor 14 | from headliner.preprocessing.vectorizer import Vectorizer 15 | 16 | 17 | class TestSummarizer(unittest.TestCase): 18 | 19 | def setUp(self) -> None: 20 | np.random.seed(42) 21 | tf.random.set_seed(42) 22 | self.temp_dir = tempfile.mkdtemp(prefix='TestSummarizerTmp') 23 | 24 | def tearDown(self) -> None: 25 | shutil.rmtree(self.temp_dir, ignore_errors=True) 26 | 27 | def test_serde_happy_path(self) -> None: 28 | preprocessor = Preprocessor() 29 | tokenizer = KerasTokenizer(oov_token='') 30 | tokenizer.fit(['a b c {} {}'.format( 31 | preprocessor.start_token, preprocessor.end_token)]) 32 | vectorizer = Vectorizer(tokenizer, tokenizer) 33 | summarizer = BasicSummarizer(lstm_size=10, 34 | max_prediction_len=10, 35 | embedding_decoder_trainable=False, 36 | 
embedding_size=10) 37 | summarizer.init_model(preprocessor=preprocessor, 38 | vectorizer=vectorizer) 39 | 40 | # we need at least a train step to init the weights 41 | train_step = summarizer.new_train_step(masked_crossentropy, batch_size=1, apply_gradients=True) 42 | train_seq = tf.convert_to_tensor(np.array([[1, 1, 1]]), dtype=tf.int32) 43 | train_step(train_seq, train_seq) 44 | 45 | save_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path') 46 | summarizer.save(save_dir) 47 | summarizer_loaded = BasicSummarizer.load(save_dir) 48 | self.assertEqual(10, summarizer_loaded.lstm_size) 49 | self.assertEqual(10, summarizer_loaded.max_prediction_len) 50 | self.assertIsNotNone(summarizer_loaded.preprocessor) 51 | self.assertIsNotNone(summarizer_loaded.vectorizer) 52 | self.assertIsNotNone(summarizer_loaded.encoder) 53 | self.assertIsNotNone(summarizer_loaded.decoder) 54 | self.assertTrue(summarizer_loaded.encoder.embedding.trainable) 55 | self.assertFalse(summarizer_loaded.decoder.embedding.trainable) 56 | self.assertIsNotNone(summarizer_loaded.optimizer) 57 | 58 | pred = summarizer.predict_vectors('a c', '') 59 | pred_loaded = summarizer_loaded.predict_vectors('a c', '') 60 | np.testing.assert_almost_equal( 61 | pred['logits'], pred_loaded['logits'], decimal=6) -------------------------------------------------------------------------------- /tests/model/test_summarizer_attention.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from keras_preprocessing.text import Tokenizer 9 | 10 | from headliner.losses import masked_crossentropy 11 | from headliner.model.attention_summarizer import AttentionSummarizer 12 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 13 | from headliner.preprocessing.preprocessor import Preprocessor 14 | from headliner.preprocessing.vectorizer import Vectorizer 15 | 16 | 17 | class TestSummarizerAttention(unittest.TestCase): 18 | 19 | def setUp(self) -> None: 20 | np.random.seed(42) 21 | tf.random.set_seed(42) 22 | self.temp_dir = tempfile.mkdtemp(prefix='TestSummarizerAttentionTmp') 23 | 24 | def tearDown(self) -> None: 25 | shutil.rmtree(self.temp_dir, ignore_errors=True) 26 | 27 | def test_serde_happy_path(self) -> None: 28 | preprocessor = Preprocessor() 29 | tokenizer = KerasTokenizer(oov_token='') 30 | tokenizer.fit(['a b c {} {}'.format( 31 | preprocessor.start_token, preprocessor.end_token)]) 32 | vectorizer = Vectorizer(tokenizer, tokenizer) 33 | summarizer = AttentionSummarizer(lstm_size=10, 34 | max_prediction_len=10, 35 | embedding_size=10, 36 | embedding_encoder_trainable=False) 37 | summarizer.init_model(preprocessor=preprocessor, 38 | vectorizer=vectorizer) 39 | 40 | # we need at least a train step to init the weights 41 | train_step = summarizer.new_train_step(masked_crossentropy, batch_size=1, apply_gradients=True) 42 | train_seq = tf.convert_to_tensor(np.array([[1, 1, 1]]), dtype=tf.int32) 43 | train_step(train_seq, train_seq) 44 | 45 | save_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path') 46 | summarizer.save(save_dir) 47 | summarizer_loaded = AttentionSummarizer.load(save_dir) 48 | self.assertEqual(10, summarizer_loaded.lstm_size) 49 | self.assertEqual(10, summarizer_loaded.max_prediction_len) 50 | self.assertIsNotNone(summarizer_loaded.preprocessor) 51 | self.assertIsNotNone(summarizer_loaded.vectorizer) 52 | 
self.assertIsNotNone(summarizer_loaded.encoder) 53 | self.assertIsNotNone(summarizer_loaded.decoder) 54 | self.assertFalse(summarizer_loaded.encoder.embedding.trainable) 55 | self.assertTrue(summarizer_loaded.decoder.embedding.trainable) 56 | self.assertIsNotNone(summarizer_loaded.optimizer) 57 | 58 | pred = summarizer.predict_vectors('a c', '') 59 | pred_loaded = summarizer_loaded.predict_vectors('a c', '') 60 | np.testing.assert_almost_equal(pred['logits'], pred_loaded['logits'], decimal=6) 61 | -------------------------------------------------------------------------------- /tests/model/test_summarizer_transformer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from headliner.losses import masked_crossentropy 10 | from headliner.model.transformer_summarizer import TransformerSummarizer 11 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 12 | from headliner.preprocessing.preprocessor import Preprocessor 13 | from headliner.preprocessing.vectorizer import Vectorizer 14 | 15 | 16 | class TestSummarizerTransformer(unittest.TestCase): 17 | 18 | def setUp(self) -> None: 19 | np.random.seed(42) 20 | tf.random.set_seed(42) 21 | self.temp_dir = tempfile.mkdtemp(prefix='TestSummarizerTransformerTmp') 22 | 23 | def tearDown(self) -> None: 24 | shutil.rmtree(self.temp_dir, ignore_errors=True) 25 | 26 | def test_serde_happy_path(self) -> None: 27 | preprocessor = Preprocessor() 28 | tokenizer = KerasTokenizer(oov_token='') 29 | tokenizer.fit(['a b c {} {}'.format( 30 | preprocessor.start_token, preprocessor.end_token)]) 31 | vectorizer = Vectorizer(tokenizer, tokenizer) 32 | summarizer = TransformerSummarizer(num_layers=1, 33 | num_heads=2, 34 | max_prediction_len=3, 35 | embedding_size=10, 36 | embedding_encoder_trainable=False) 37 | summarizer.init_model(preprocessor=preprocessor, 38 | vectorizer=vectorizer) 39 | 40 | # we need at least a train step to init the weights 41 | train_step = summarizer.new_train_step(masked_crossentropy, batch_size=1, apply_gradients=True) 42 | train_seq = tf.convert_to_tensor(np.array([[1, 1, 1]]), dtype=tf.int32) 43 | train_step(train_seq, train_seq) 44 | 45 | save_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path') 46 | summarizer.save(save_dir) 47 | summarizer_loaded = TransformerSummarizer.load(save_dir) 48 | self.assertEqual(1, summarizer_loaded.num_layers) 49 | self.assertEqual(2, summarizer_loaded.num_heads) 50 | self.assertEqual(3, summarizer_loaded.max_prediction_len) 51 | self.assertEqual(10, summarizer_loaded.embedding_size) 52 | self.assertIsNotNone(summarizer_loaded.preprocessor) 53 | self.assertIsNotNone(summarizer_loaded.vectorizer) 54 | self.assertIsNotNone(summarizer_loaded.transformer) 55 | self.assertFalse(summarizer_loaded.transformer.encoder.embedding.trainable) 56 | self.assertTrue(summarizer_loaded.transformer.decoder.embedding.trainable) 57 | self.assertIsNotNone(summarizer_loaded.optimizer) 58 | 59 | pred = summarizer.predict_vectors('a c', '') 60 | pred_loaded = summarizer_loaded.predict_vectors('a c', '') 61 | np.testing.assert_almost_equal(pred['logits'], pred_loaded['logits'], decimal=6) 62 | -------------------------------------------------------------------------------- /headliner/preprocessing/bert_vectorizer.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | 3 | 
from transformers import BertTokenizer 4 | 5 | from headliner.preprocessing.tokenizer import Tokenizer 6 | 7 | 8 | class BertVectorizer: 9 | """ 10 | Transforms tuples of text into tuples of vector sequences. 11 | """ 12 | 13 | def __init__(self, 14 | tokenizer_encoder: BertTokenizer, 15 | tokenizer_decoder: Tokenizer, 16 | max_input_len=512, 17 | max_output_len=None) -> None: 18 | """ 19 | Initializes the vectorizer. 20 | 21 | Args: 22 | tokenizer_encoder: Tokenizer that encodes the input text. 23 | tokenizer_decoder: Tokenizer that encodes the target text. 24 | max_input_len (optional): Maximum length of input sequence, 25 | longer sequences will be truncated. 26 | max_output_len (optional): Maximum length of target sequence, 27 | longer sequences will be truncated and shorter sequences 28 | will be padded to max len. 29 | """ 30 | self.encoding_dim = tokenizer_encoder.vocab_size + 1 31 | self.decoding_dim = tokenizer_decoder.vocab_size + 1 32 | self.max_input_len = max_input_len 33 | self.max_output_len = max_output_len 34 | self._tokenizer_encoder = tokenizer_encoder 35 | self._tokenizer_decoder = tokenizer_decoder 36 | 37 | def __call__(self, data: Tuple[str, str]) -> Tuple[List[int], List[int], List[int]]: 38 | """ 39 | Encodes preprocessed strings into sequences of one-hot indices. 40 | """ 41 | text_encoder, text_decoder = data[0], data[1] 42 | sentences = text_encoder.split('[SEP]')[:-1] 43 | vec_encoder = [] 44 | sentence_ids = [] 45 | for i, sent in enumerate(sentences): 46 | sent = sent + '[SEP]' 47 | vec = self._tokenizer_encoder.encode(sent, add_special_tokens=False) 48 | if len(vec_encoder) + len(vec) < self.max_input_len: 49 | vec_encoder.extend(vec) 50 | ids = [i % 2] * len(vec) 51 | sentence_ids.extend(ids) 52 | 53 | vec_decoder = self._tokenizer_decoder.encode(text_decoder) 54 | if self.max_output_len is not None: 55 | if len(vec_decoder) > self.max_output_len: 56 | vec_decoder = vec_decoder[:self.max_output_len - 1] + [vec_decoder[-1]] 57 | else: 58 | vec_decoder = vec_decoder + [0] * (self.max_output_len - len(vec_decoder)) 59 | 60 | return vec_encoder, sentence_ids, vec_decoder 61 | 62 | def encode_input(self, text: str) -> List[int]: 63 | return self._tokenizer_encoder.encode(text, add_special_tokens=False) 64 | 65 | def encode_output(self, text: str) -> List[int]: 66 | return self._tokenizer_decoder.encode(text) 67 | 68 | def decode_input(self, sequence: List[int]) -> str: 69 | return self._tokenizer_encoder.decode(sequence) 70 | 71 | def decode_output(self, sequence: List[int]) -> str: 72 | return self._tokenizer_decoder.decode(sequence) 73 | -------------------------------------------------------------------------------- /tests/model/test_summarizer_bert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from transformers import BertTokenizer 9 | 10 | from headliner.losses import masked_crossentropy 11 | from headliner.model.bert_summarizer import BertSummarizer 12 | from headliner.preprocessing.bert_vectorizer import BertVectorizer 13 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 14 | from headliner.preprocessing.preprocessor import Preprocessor 15 | 16 | 17 | class TestSummarizerBert(unittest.TestCase): 18 | 19 | def setUp(self) -> None: 20 | np.random.seed(42) 21 | tf.random.set_seed(42) 22 | self.temp_dir = tempfile.mkdtemp(prefix='TestSummarizerBertTmp') 23 | 24 | def 
tearDown(self) -> None: 25 | shutil.rmtree(self.temp_dir, ignore_errors=True) 26 | 27 | def test_serde_happy_path(self) -> None: 28 | preprocessor = Preprocessor(start_token='[CLS]', end_token='[SEP]') 29 | 30 | tokenizer_encoder = BertTokenizer.from_pretrained('bert-base-uncased') 31 | tokenizer_decoder = KerasTokenizer(oov_token='') 32 | tokenizer_decoder.fit(['a b c {} {}'.format( 33 | preprocessor.start_token, preprocessor.end_token)]) 34 | vectorizer = BertVectorizer(tokenizer_encoder, tokenizer_decoder) 35 | summarizer = BertSummarizer(num_layers_encoder=1, 36 | num_layers_decoder=1, 37 | bert_embedding_encoder='bert-base-uncased', 38 | num_heads=2, 39 | max_prediction_len=3, 40 | embedding_size_encoder=768, 41 | embedding_size_decoder=10, 42 | embedding_encoder_trainable=False) 43 | summarizer.init_model(preprocessor=preprocessor, 44 | vectorizer=vectorizer) 45 | 46 | # we need at least a train step to init the weights 47 | train_step = summarizer.new_train_step(masked_crossentropy, batch_size=1, apply_gradients=True) 48 | train_seq = tf.convert_to_tensor(np.array([[1, 1, 1]]), dtype=tf.int32) 49 | train_step(train_seq, train_seq, train_seq) 50 | 51 | save_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path') 52 | summarizer.save(save_dir) 53 | summarizer_loaded = BertSummarizer.load(save_dir) 54 | self.assertEqual(1, summarizer_loaded.num_layers_encoder) 55 | self.assertEqual(1, summarizer_loaded.num_layers_decoder) 56 | self.assertEqual(2, summarizer_loaded.num_heads) 57 | self.assertEqual(3, summarizer_loaded.max_prediction_len) 58 | self.assertEqual(768, summarizer_loaded.embedding_size_encoder) 59 | self.assertEqual(10, summarizer_loaded.embedding_size_decoder) 60 | self.assertIsNotNone(summarizer_loaded.preprocessor) 61 | self.assertIsNotNone(summarizer_loaded.vectorizer) 62 | self.assertIsNotNone(summarizer_loaded.transformer) 63 | self.assertFalse(summarizer_loaded.transformer.encoder.embedding.trainable) 64 | self.assertTrue(summarizer_loaded.transformer.decoder.embedding.trainable) 65 | self.assertIsNotNone(summarizer_loaded.optimizer_encoder) 66 | self.assertIsNotNone(summarizer_loaded.optimizer_decoder) 67 | 68 | pred = summarizer.predict_vectors('a c', '') 69 | pred_loaded = summarizer_loaded.predict_vectors('a c', '') 70 | np.testing.assert_almost_equal(pred['logits'], pred_loaded['logits'], decimal=6) 71 | -------------------------------------------------------------------------------- /headliner/model/summarizer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import abstractmethod 3 | from typing import Callable, Dict, Union 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from headliner.preprocessing import Preprocessor, Vectorizer 9 | 10 | 11 | class Summarizer(abc.ABC): 12 | 13 | def __init__(self): 14 | self.vectorizer: Union[Vectorizer, None] = None 15 | self.preprocessor: Union[Preprocessor, None] = None 16 | self.embedding_size: Union[int, None] = None 17 | 18 | @abstractmethod 19 | def init_model(self, 20 | preprocessor: Preprocessor, 21 | vectorizer: Vectorizer, 22 | embedding_weights_encoder=None, 23 | embedding_weights_decoder=None) -> None: 24 | """ 25 | Initializes the model and provides necessary information for compilation. 26 | 27 | Args: 28 | preprocessor: Preprocessor object that preprocesses text for training and prediction. 29 | vectorizer: Vectorizer object that performs vectorization of the text. 
30 | embedding_weights_encoder (optional): Matrix to initialize the encoder embedding. 31 | embedding_weights_decoder (optional): Matrix to initialize the decoder embedding. 32 | """ 33 | 34 | pass 35 | 36 | @abstractmethod 37 | def predict(self, text: str) -> str: 38 | """ 39 | Predicts summary of an input text. 40 | """ 41 | 42 | pass 43 | 44 | @abstractmethod 45 | def predict_vectors(self, input_text: str, target_text: str) -> Dict[str, Union[str, np.array]]: 46 | """ 47 | Predicts summary of an input text and outputs information needed for evaluation: 48 | output logits, input tokens, output tokens, predicted tokens, preprocessed text, 49 | attention alignment. 50 | 51 | Args: 52 | input_text: Text used as input for prediction. 53 | target_text: Text used for evaluation. 54 | 55 | Returns: Dictionary with prediction information such as 56 | preprocessed_text, logits, alignment, predicted_sequence, predicted_text. 57 | 58 | """ 59 | 60 | pass 61 | 62 | @abstractmethod 63 | def new_train_step(self, 64 | loss_function: Callable[[tf.Tensor], tf.Tensor], 65 | batch_size: int, 66 | apply_gradients=True) -> Callable[[tf.Tensor, tf.Tensor], float]: 67 | """ 68 | Initializes the train_step function to train the model on batches of data. 69 | 70 | Args: 71 | loss_function: Loss function to perform backprop on. 72 | batch_size: Batch size to use for training. 73 | apply_gradients: Whether to apply the gradients, i.e. 74 | False if you want to validate the model on test data. 75 | 76 | Returns: Train step function that is applied to a batch and returns the loss. 77 | 78 | """ 79 | 80 | pass 81 | 82 | @abstractmethod 83 | def save(self, out_path: str) -> None: 84 | """ 85 | Saves the model to a file. 86 | 87 | Args: 88 | out_path: Path to directory for saving the model. 89 | 90 | """ 91 | 92 | pass 93 | 94 | @staticmethod 95 | @abstractmethod 96 | def load(in_path: str): 97 | """ 98 | Loads the model from a file. 99 | 100 | Args: 101 | in_path: Path to the model directory. 102 | 103 | Returns: Instance of the loaded summarizer. 
104 | 105 | """ 106 | 107 | pass 108 | -------------------------------------------------------------------------------- /tests/test_bert_training.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from spacy.lang.en import English 6 | from transformers import BertTokenizer 7 | 8 | from headliner.losses import masked_crossentropy 9 | from headliner.model.bert_summarizer import BertSummarizer 10 | from headliner.preprocessing.bert_preprocessor import BertPreprocessor 11 | from headliner.preprocessing.bert_vectorizer import BertVectorizer 12 | from headliner.preprocessing.dataset_generator import DatasetGenerator 13 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 14 | 15 | 16 | class TestBertTraining(unittest.TestCase): 17 | 18 | def setUp(self) -> None: 19 | tf.random.set_seed(42) 20 | np.random.seed(42) 21 | data = [('I love dogs.', 'Dogs.'), 22 | ('I love cats.', 'Cats.')] 23 | tokenizer_encoder = BertTokenizer.from_pretrained('bert-base-uncased') 24 | self.preprocessor = BertPreprocessor(nlp=English()) 25 | data_prep = [self.preprocessor(d) for d in data] 26 | tokenizer_decoder = KerasTokenizer(lower=False, filters='') 27 | tokenizer_decoder.fit([d[1] for d in data_prep]) 28 | self.vectorizer = BertVectorizer(tokenizer_encoder=tokenizer_encoder, 29 | tokenizer_decoder=tokenizer_decoder, 30 | max_output_len=10) 31 | batch_generator = DatasetGenerator(2, rank=3) 32 | data_vecs = [self.vectorizer(d) for d in data_prep] 33 | self.dataset = batch_generator(lambda: data_vecs) 34 | self.loss_func = masked_crossentropy 35 | 36 | def test_training_summarizer_bert(self): 37 | bert_summarizer = BertSummarizer(num_heads=4, 38 | num_layers_encoder=0, 39 | num_layers_decoder=1, 40 | feed_forward_dim=20, 41 | embedding_size_encoder=768, 42 | embedding_size_decoder=64, 43 | bert_embedding_encoder='bert-base-uncased', 44 | embedding_encoder_trainable=True, 45 | dropout_rate=0, 46 | max_prediction_len=10) 47 | bert_summarizer.optimizer_encoder = tf.keras.optimizers.Adam(learning_rate=3e-5) 48 | bert_summarizer.optimizer_decoder = tf.keras.optimizers.Adam(learning_rate=1e-4) 49 | bert_summarizer.init_model(preprocessor=self.preprocessor, 50 | vectorizer=self.vectorizer, 51 | embedding_weights_encoder=None, 52 | embedding_weights_decoder=None) 53 | 54 | loss_bert = 0 55 | train_step = bert_summarizer.new_train_step(loss_function=self.loss_func, 56 | batch_size=2) 57 | for e in range(0, 10): 58 | for token_ids, sent_ids, target_ids in self.dataset.take(-1): 59 | loss_bert = train_step(token_ids, sent_ids, target_ids) 60 | print(str(loss_bert)) 61 | 62 | self.assertAlmostEqual(0.07848279923200607, float(loss_bert), 6) 63 | model_output = bert_summarizer.predict_vectors('I love dogs.', '') 64 | expected_first_logits = np.array([-1.881355, -1.493431, 1.358053, 3.050439, 0.636483]) 65 | np.testing.assert_allclose(expected_first_logits, model_output['logits'][0], atol=1e-3) 66 | self.assertEqual('[CLS] I love dogs. [SEP]', model_output['preprocessed_text'][0]) 67 | self.assertEqual('Dogs. 
[SEP]', model_output['predicted_text']) 68 | -------------------------------------------------------------------------------- /headliner/model/attention_model.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class Encoder(tf.keras.Model): 7 | 8 | def __init__(self, 9 | embedding_shape: Tuple[int, int], 10 | lstm_size=50, 11 | embedding_trainable=True, 12 | embedding_weights=None) -> None: 13 | super(Encoder, self).__init__() 14 | vocab_size, vec_dim = embedding_shape 15 | weights = None if embedding_weights is None else [embedding_weights] 16 | self.embedding = tf.keras.layers.Embedding(vocab_size, 17 | vec_dim, 18 | weights=weights, 19 | trainable=embedding_trainable) 20 | self.lstm = tf.keras.layers.LSTM(lstm_size, 21 | return_sequences=True, 22 | return_state=True) 23 | self.lstm_size = lstm_size 24 | 25 | def call(self, 26 | sequence: tf.Tensor, 27 | states: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: 28 | embed = self.embedding(sequence) 29 | output, state_h, state_c = self.lstm(embed, initial_state=states) 30 | return output, state_h, state_c 31 | 32 | def init_states(self, batch_size: int) -> Tuple[tf.Tensor, tf.Tensor]: 33 | return tf.zeros([batch_size, self.lstm_size]), \ 34 | tf.zeros([batch_size, self.lstm_size]) 35 | 36 | 37 | class LuongAttention(tf.keras.Model): 38 | 39 | def __init__(self, rnn_size): 40 | super(LuongAttention, self).__init__() 41 | self.wa = tf.keras.layers.Dense(rnn_size) 42 | 43 | def call(self, decoder_output, encoder_output): 44 | score = tf.matmul(decoder_output, self.wa(encoder_output), transpose_b=True) 45 | alignment = tf.nn.softmax(score, axis=2) 46 | context = tf.matmul(alignment, encoder_output) 47 | return context, alignment 48 | 49 | 50 | class Decoder(tf.keras.Model): 51 | 52 | def __init__(self, 53 | embedding_shape: Tuple[int, int], 54 | lstm_size=50, 55 | embedding_trainable=True, 56 | embedding_weights=None) -> None: 57 | super(Decoder, self).__init__() 58 | self.lstm_size = lstm_size 59 | vocab_size, vec_dim = embedding_shape 60 | weights = None if embedding_weights is None else [embedding_weights] 61 | self.embedding = tf.keras.layers.Embedding(vocab_size, vec_dim, 62 | weights=weights, 63 | trainable=embedding_trainable) 64 | self.lstm_size = lstm_size 65 | self.attention = LuongAttention(lstm_size) 66 | self.lstm = tf.keras.layers.LSTM(lstm_size, 67 | return_sequences=True, 68 | return_state=True) 69 | self.wc = tf.keras.layers.Dense(lstm_size, activation='tanh') 70 | self.ws = tf.keras.layers.Dense(vocab_size) 71 | 72 | def call(self, sequence, state, encoder_output): 73 | embed = self.embedding(sequence) 74 | lstm_out, state_h, state_c = self.lstm(embed, initial_state=state) 75 | context, alignment = self.attention(lstm_out, encoder_output) 76 | lstm_out = tf.concat([tf.squeeze(context, 1), tf.squeeze(lstm_out, 1)], 1) 77 | lstm_out = self.wc(lstm_out) 78 | logits = self.ws(lstm_out) 79 | return logits, state_h, state_c, alignment 80 | 81 | def init_states(self, batch_size): 82 | return (tf.zeros([batch_size, self.lstm_size]), 83 | tf.zeros([batch_size, self.lstm_size])) 84 | -------------------------------------------------------------------------------- /headliner/preprocessing/bucket_generator.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | from typing import Iterable, Callable, Iterator, List 3 | 4 | 5 | class BucketGenerator: 6 | """ 7 | 
Performs bucketing of elements in a dataset by length. 8 | """ 9 | 10 | def __init__(self, 11 | element_length_function: Callable[..., int], 12 | batch_size: int, 13 | buffer_size_batches=1000, 14 | batches_to_bucket=10, 15 | shuffle=True, 16 | seed=None) -> None: 17 | """ 18 | Initializes the BucketGenerator. 19 | 20 | Args: 21 | element_length_function: Element_length_function: function from element in the dataset to int that 22 | determines the length of the element. 23 | batch_size: The size of the batches to bucket the sequences into 24 | buffer_size_batches: buffer_size_batches: number of batches to keep in internal memory. 25 | batches_to_bucket: Number of batches in buffer to use for bucketing. 26 | If set to buffer_size_batches, the resulting batches will be deterministic. 27 | shuffle: Whether to shuffle elements across batches and the resulting buckets. 28 | seed: Seed for shuffling. 29 | """ 30 | 31 | self.sequence_length_function = element_length_function 32 | self.batch_size = batch_size 33 | self.buffer_size_batches = buffer_size_batches 34 | self.batches_to_shuffle = batches_to_bucket 35 | self.shuffle = shuffle 36 | self.random = Random(seed) 37 | 38 | def __call__(self, data: Iterable) -> Iterable: 39 | """ 40 | Returns iterable of data with elements ordered by bucketed sequence lengths, e.g for batch size = 2 the 41 | transformation could look like this: 42 | [1], [3, 3, 3], [1], [4, 4, 4, 4] -> [1], [1], [3, 3, 3], [4, 4, 4, 4] 43 | """ 44 | data_iter = iter(data) 45 | bucket_gen = self._generate_buckets(data_iter) 46 | return bucket_gen 47 | 48 | def _generate_buckets(self, data_iter: Iterator) -> List[List]: 49 | buffered_data = self._fetch_buffered_data(data_iter) 50 | while len(buffered_data) > 0: 51 | buckets = self._to_buckets(buffered_data) 52 | for bucket in buckets: 53 | for element in bucket: 54 | yield element 55 | buffered_data = self._fetch_buffered_data(data_iter) 56 | del buckets 57 | 58 | def _to_buckets(self, buffered_data: List) -> List[List]: 59 | self._shuffle_if_required(buffered_data) 60 | buffered_data = self._sort_blocks(buffered_data) 61 | buckets = [] 62 | for i in range(0, len(buffered_data), self.batch_size): 63 | bucket = buffered_data[i:i + self.batch_size] 64 | if len(bucket) == self.batch_size: 65 | buckets.append(bucket) 66 | self._shuffle_if_required(buckets) 67 | return buckets 68 | 69 | def _sort_blocks(self, buffered_data: List) -> List: 70 | block_size = self.batches_to_shuffle * self.batch_size 71 | buffered_data_sorted = [] 72 | for i in range(0, len(buffered_data), block_size): 73 | sorted_block = buffered_data[i:i + block_size] 74 | sorted_block.sort(key=self.sequence_length_function) 75 | buffered_data_sorted.extend(sorted_block) 76 | return buffered_data_sorted 77 | 78 | def _fetch_buffered_data(self, data_iter: Iterator): 79 | buffered_data = [] 80 | for _ in range(self.buffer_size_batches * self.batch_size): 81 | try: 82 | buffered_data.append(next(data_iter)) 83 | except StopIteration: 84 | pass 85 | return buffered_data 86 | 87 | def _shuffle_if_required(self, list_to_shuffle): 88 | if self.shuffle: 89 | self.random.shuffle(list_to_shuffle) 90 | -------------------------------------------------------------------------------- /mkdocs/docs/examples/advanced_nmt_example.md: -------------------------------------------------------------------------------- 1 | # Advanced Neural Machine Translation Example 2 | 3 | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/as-ideas/headliner/blob/master/notebooks/Advanced_Neural_Machine_Translation_Example.ipynb) 4 | 5 | ### Upgrade grpcio which is needed by tensorboard 2.0.2 6 | ```bash 7 | !pip install --upgrade grpcio 8 | ``` 9 | 10 | ### Install TensorFlow and also our package via PyPI 11 | ```bash 12 | pip install tensorflow-gpu==2.0.0 13 | pip install headliner 14 | ``` 15 | 16 | ### Download the German-English sentence pairs 17 | ```bash 18 | wget http://www.manythings.org/anki/deu-eng.zip 19 | unzip deu-eng.zip 20 | ``` 21 | 22 | ### Create the dataset but only take a subset for faster training 23 | ```python 24 | import io 25 | 26 | def create_dataset(path, num_examples): 27 | lines = io.open(path, encoding='UTF-8').read().strip().split('\n') 28 | word_pairs = [[w for w in l.split('\t')[:2]] for l in lines[:num_examples]] 29 | return zip(*word_pairs) 30 | 31 | eng, ger = create_dataset('deu.txt', 30000) 32 | data = list(zip(eng, ger)) 33 | ``` 34 | 35 | ### Split the dataset into train and test 36 | ```python 37 | from sklearn.model_selection import train_test_split 38 | 39 | train, test = train_test_split(data, test_size=100) 40 | ``` 41 | 42 | ### Define custom preprocessing 43 | ```python 44 | from headliner.preprocessing import Preprocessor 45 | 46 | preprocessor = Preprocessor(lower_case=True) 47 | train_prep = [preprocessor(t) for t in train] 48 | ``` 49 | 50 | ### Fit custom tokenizers for input and target 51 | ```python 52 | from tensorflow_datasets.core.features.text import SubwordTextEncoder 53 | from headliner.preprocessing import Vectorizer 54 | 55 | inputs_prep = [t[0] for t in train_prep] 56 | targets_prep = [t[1] for t in train_prep] 57 | tokenizer_input = SubwordTextEncoder.build_from_corpus( 58 | inputs_prep, target_vocab_size=2**13, 59 | reserved_tokens=[preprocessor.start_token, preprocessor.end_token]) 60 | tokenizer_target = SubwordTextEncoder.build_from_corpus( 61 | targets_prep, target_vocab_size=2**13, 62 | reserved_tokens=[preprocessor.start_token, preprocessor.end_token]) 63 | 64 | vectorizer = Vectorizer(tokenizer_input, tokenizer_target) 65 | 'vocab size input {}, target {}'.format( 66 | vectorizer.encoding_dim, vectorizer.decoding_dim) 67 | ``` 68 | 69 | ### Start tensorboard 70 | ``` 71 | %load_ext tensorboard 72 | %tensorboard --logdir /tmp/transformer_tensorboard 73 | ``` 74 | 75 | ### Define the model and train it 76 | ```python 77 | from headliner.model.transformer_summarizer import TransformerSummarizer 78 | from headliner.trainer import Trainer 79 | 80 | summarizer = TransformerSummarizer(num_heads=4, 81 | feed_forward_dim=1024, 82 | num_layers=1, 83 | embedding_size=256, 84 | dropout_rate=0.1, 85 | max_prediction_len=50) 86 | summarizer.init_model(preprocessor, vectorizer) 87 | trainer = Trainer(steps_per_epoch=250, 88 | batch_size=32, 89 | model_save_path='/tmp/transformer_summarizer', 90 | tensorboard_dir='/tmp/transformer_tensorboard', 91 | steps_to_log=50) 92 | trainer.train(summarizer, train, num_epochs=10, val_data=test) 93 | ``` 94 | 95 | ### Load best model and do some prediction 96 | ```python 97 | best_summarizer = TransformerSummarizer.load('/tmp/transformer_summarizer') 98 | best_summarizer.predict('Do you like robots?') 99 | ``` 100 | 101 | ### Plot attention alignment for a prediction 102 | ```python 103 | import tensorflow as tf 104 | import matplotlib.pyplot as plt 105 | 106 | def plot_attention_weights(summarizer, pred_vectors,
layer_name): 107 | fig = plt.figure(figsize=(16, 8)) 108 | input_text, _ = pred_vectors['preprocessed_text'] 109 | input_sequence = summarizer.vectorizer.encode_input(input_text) 110 | pred_sequence = pred_vectors['predicted_sequence'] 111 | attention = tf.squeeze(pred_vectors['attention_weights'][layer_name]) 112 | for head in range(attention.shape[0]): 113 | ax = fig.add_subplot(1, 2, head + 1) 114 | ax.matshow(attention[head][:-1, :], cmap='viridis') 115 | fontdict = {'fontsize': 10} 116 | ax.set_xticks(range(len(input_sequence))) 117 | ax.set_yticks(range(len(pred_sequence))) 118 | ax.set_ylim(len(pred_sequence) - 1.5, -0.5) 119 | ax.set_xticklabels( 120 | [summarizer.vectorizer.decode_input([i]) for i in input_sequence], 121 | fontdict=fontdict, 122 | rotation=90) 123 | ax.set_yticklabels([summarizer.vectorizer.decode_output([i]) 124 | for i in pred_sequence], fontdict=fontdict) 125 | ax.set_xlabel('Head {}'.format(head + 1)) 126 | plt.tight_layout() 127 | plt.show() 128 | 129 | pred_vectors = best_summarizer.predict_vectors( 130 | 'Tom ran out of the burning house.', '') 131 | plot_attention_weights(best_summarizer, pred_vectors, 'decoder_layer1_block2') 132 | ``` 133 | 134 | ### Continue training to improve the model and check the BLEU score 135 | ```python 136 | from headliner.evaluation import BleuScorer 137 | 138 | bleu_scorer = BleuScorer(tokens_to_ignore=[preprocessor.start_token, 139 | preprocessor.end_token]) 140 | trainer.train(best_summarizer, 141 | train, 142 | num_epochs=30, 143 | val_data=test, 144 | scorers={'bleu': bleu_scorer}) 145 | ``` 146 | 147 | -------------------------------------------------------------------------------- /mkdocs/docs/examples/bert_example.md: -------------------------------------------------------------------------------- 1 | # BERT Neural Machine Translation Example 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/as-ideas/headliner/blob/master/notebooks/BERT_Translation_Example.ipynb) 4 | 5 | ### Upgrade grpcio which is needed by tensorboard 2.0.2 6 | ```bash 7 | !pip install --upgrade grpcio 8 | ``` 9 | 10 | ### Install TensorFlow and also our package via PyPI 11 | ```bash 12 | pip install tensorflow-gpu==2.0.0 13 | pip install headliner 14 | ``` 15 | 16 | ### Download the German-English sentence pairs 17 | ```bash 18 | wget http://www.manythings.org/anki/deu-eng.zip 19 | unzip deu-eng.zip 20 | ``` 21 | 22 | ### Create the dataset but only take a subset for faster training 23 | ```python 24 | import io 25 | 26 | def create_dataset(path, num_examples): 27 | lines = io.open(path, encoding='UTF-8').read().strip().split('\n') 28 | word_pairs = [[w for w in l.split('\t')[:2]] for l in lines[:num_examples]] 29 | return zip(*word_pairs) 30 | 31 | eng, ger = create_dataset('deu.txt', 200000) 32 | data = list(zip(eng, ger)) 33 | ``` 34 | 35 | ### Split the dataset into train and test 36 | ```python 37 | from sklearn.model_selection import train_test_split 38 | 39 | train, test = train_test_split(data, test_size=100) 40 | ``` 41 | 42 | ### Define custom preprocessing 43 | ```python 44 | from headliner.preprocessing.bert_preprocessor import BertPreprocessor 45 | from spacy.lang.en import English 46 | 47 | preprocessor = BertPreprocessor(nlp=English()) 48 | train_prep = [preprocessor(t) for t in train] 49 | train_prep[:5] 50 | ``` 51 | 52 | ### Create custom tokenizers for input and target 53 | ```python 54 | from tensorflow_datasets.core.features.text import
SubwordTextEncoder 55 | from transformers import BertTokenizer 56 | from headliner.preprocessing.bert_vectorizer import BertVectorizer 57 | 58 | targets_prep = [t[1] for t in train_prep] 59 | tokenizer_input = BertTokenizer.from_pretrained('bert-base-uncased') 60 | tokenizer_target = SubwordTextEncoder.build_from_corpus( 61 | targets_prep, target_vocab_size=2**13, 62 | reserved_tokens=[preprocessor.start_token, preprocessor.end_token]) 63 | 64 | vectorizer = BertVectorizer(tokenizer_input, tokenizer_target) 65 | 'vocab size input {}, target {}'.format( 66 | vectorizer.encoding_dim, vectorizer.decoding_dim) 67 | ``` 68 | 69 | ### Start tensorboard 70 | ``` 71 | %load_ext tensorboard 72 | %tensorboard --logdir /tmp/bert_tensorboard 73 | ``` 74 | 75 | ### Define the model and train it 76 | ```python 77 | # Define the model and train it 78 | # You need to be quite patient, since the model has a lot of params 79 | import tensorflow as tf 80 | from headliner.model.bert_summarizer import BertSummarizer 81 | from headliner.trainer import Trainer 82 | 83 | summarizer = BertSummarizer(num_heads=8, 84 | feed_forward_dim=1024, 85 | num_layers_encoder=0, 86 | num_layers_decoder=4, 87 | bert_embedding_encoder='bert-base-uncased', 88 | embedding_encoder_trainable=False, 89 | embedding_size_encoder=768, 90 | embedding_size_decoder=768, 91 | dropout_rate=0, 92 | max_prediction_len=50) 93 | # Adjust learning rates of encoder and decoder optimizer schedules 94 | # You may want to try different learning rates and observe the loss 95 | summarizer.optimizer_decoder = BertSummarizer.new_optimizer_decoder( 96 | learning_rate_start=2e-2 97 | ) 98 | summarizer.optimizer_encoder = BertSummarizer.new_optimizer_encoder( 99 | learning_rate_start=5e-4 100 | ) 101 | summarizer.init_model(preprocessor, vectorizer) 102 | trainer = Trainer(steps_per_epoch=5000, 103 | batch_size=16, 104 | model_save_path='/tmp/bert_summarizer', 105 | tensorboard_dir='/tmp/bert_tensorboard', 106 | steps_to_log=10) 107 | trainer.train(summarizer, train, num_epochs=200, val_data=test) 108 | ``` 109 | 110 | ### Load best model and do some prediction 111 | ```python 112 | best_summarizer = BertSummarizer.load('/tmp/bert_summarizer') 113 | best_summarizer.predict('Do you like robots?') 114 | ``` 115 | 116 | ### Plot attention alignment for a prediction 117 | ```python 118 | import tensorflow as tf 119 | import matplotlib.pyplot as plt 120 | 121 | def plot_attention_weights(summarizer, pred_vectors, layer_name): 122 | fig = plt.figure(figsize=(16, 8)) 123 | input_text, _ = pred_vectors['preprocessed_text'] 124 | input_sequence = summarizer.vectorizer.encode_input(input_text) 125 | pred_sequence = pred_vectors['predicted_sequence'] 126 | attention = tf.squeeze(pred_vectors['attention_weights'][layer_name]) 127 | for head in range(attention.shape[0]): 128 | ax = fig.add_subplot(1, 2, head + 1) 129 | ax.matshow(attention[head][:-1, :], cmap='viridis') 130 | fontdict = {'fontsize': 10} 131 | ax.set_xticks(range(len(input_sequence))) 132 | ax.set_yticks(range(len(pred_sequence))) 133 | ax.set_ylim(len(pred_sequence) - 1.5, -0.5) 134 | ax.set_xticklabels( 135 | [summarizer.vectorizer.decode_input([i]) for i in input_sequence], 136 | fontdict=fontdict, 137 | rotation=90) 138 | ax.set_yticklabels([summarizer.vectorizer.decode_output([i]) 139 | for i in pred_sequence], fontdict=fontdict) 140 | ax.set_xlabel('Head {}'.format(head + 1)) 141 | plt.tight_layout() 142 | plt.show() 143 | 144 | pred_vectors = best_summarizer.predict_vectors( 145 | 'Tom ran out of
the burning house.', '') 146 | plot_attention_weights(best_summarizer, pred_vectors, 'decoder_layer4_block2') 147 | ``` 148 | 149 | -------------------------------------------------------------------------------- /tests/test_trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import logging 3 | import os 4 | import unittest 5 | import numpy as np 6 | from tensorflow.python.keras.callbacks import Callback 7 | from headliner.model import AttentionSummarizer 8 | from headliner.preprocessing.preprocessor import Preprocessor 9 | from headliner.trainer import Trainer 10 | 11 | 12 | class TestTrainer(unittest.TestCase): 13 | 14 | def setUp(self) -> None: 15 | tf.random.set_seed(42) 16 | np.random.seed(42) 17 | 18 | def test_init_from_config(self) -> None: 19 | current_dir = os.path.dirname(os.path.abspath(__file__)) 20 | config_path = os.path.join(current_dir, 'resources/trainer_test_config.yaml') 21 | trainer = Trainer.from_config(config_path) 22 | self.assertEqual(10, trainer.max_input_len) 23 | self.assertEqual(9, trainer.max_output_len) 24 | self.assertEqual(1, trainer.batch_size) 25 | self.assertEqual(7, trainer.max_vocab_size_encoder) 26 | self.assertEqual(6, trainer.max_vocab_size_decoder) 27 | self.assertEqual('glove.txt', trainer.embedding_path_encoder) 28 | self.assertEqual(None, trainer.embedding_path_decoder) 29 | self.assertEqual(4, trainer.steps_per_epoch) 30 | self.assertEqual('tensor_dir', trainer.tensorboard_dir) 31 | self.assertEqual('model_save_path', trainer.model_save_path) 32 | self.assertTrue(trainer.use_bucketing) 33 | self.assertIsNone(trainer.shuffle_buffer_size) 34 | self.assertEqual(5, trainer.bucketing_buffer_size_batches) 35 | self.assertEqual(6, trainer.bucketing_batches_to_bucket) 36 | self.assertEqual(7, trainer.steps_to_log) 37 | self.assertEqual(logging.DEBUG, trainer.logger.level) 38 | 39 | def test_init(self) -> None: 40 | preprocessor = Preprocessor(start_token='', lower_case=False, hash_numbers=False) 41 | trainer = Trainer(max_output_len=9, 42 | batch_size=1, 43 | max_vocab_size_encoder=2, 44 | max_vocab_size_decoder=3, 45 | embedding_path_encoder='glove.txt', 46 | steps_per_epoch=4, 47 | tensorboard_dir='tensor_dir', 48 | model_save_path='model_save_path', 49 | shuffle_buffer_size=10, 50 | bucketing_buffer_size_batches=5, 51 | bucketing_batches_to_bucket=6, 52 | steps_to_log=7, 53 | logging_level=logging.DEBUG, 54 | preprocessor=preprocessor) 55 | 56 | self.assertEqual(1, trainer.batch_size) 57 | self.assertEqual(2, trainer.max_vocab_size_encoder) 58 | self.assertEqual(3, trainer.max_vocab_size_decoder) 59 | self.assertEqual('glove.txt', trainer.embedding_path_encoder) 60 | self.assertIsNone(trainer.embedding_path_decoder) 61 | self.assertEqual(4, trainer.steps_per_epoch) 62 | self.assertEqual('tensor_dir', trainer.tensorboard_dir) 63 | self.assertEqual('model_save_path', trainer.model_save_path) 64 | self.assertFalse(trainer.use_bucketing) 65 | self.assertEqual(10, trainer.shuffle_buffer_size) 66 | self.assertEqual(5, trainer.bucketing_buffer_size_batches) 67 | self.assertEqual(6, trainer.bucketing_batches_to_bucket) 68 | self.assertEqual(7, trainer.steps_to_log) 69 | self.assertEqual(9, trainer.max_output_len) 70 | self.assertEqual(logging.DEBUG, trainer.logger.level) 71 | self.assertEqual('', trainer.preprocessor.start_token) 72 | self.assertEqual(False, trainer.preprocessor.lower_case) 73 | self.assertEqual(False, trainer.preprocessor.hash_numbers) 74 | 75 | def 
test_init_model(self) -> None: 76 | logging.basicConfig(level=logging.INFO) 77 | data = [('a b', 'a'), ('a b c', 'b')] 78 | summarizer = AttentionSummarizer(lstm_size=16, 79 | embedding_size=10) 80 | trainer = Trainer(batch_size=2, 81 | steps_per_epoch=10, 82 | max_vocab_size_encoder=10, 83 | max_vocab_size_decoder=10, 84 | model_save_path=None, 85 | max_input_len=5, 86 | max_output_len=3) 87 | trainer.train(summarizer, data, num_epochs=1) 88 | # encoding dim and decoding dim are num unique tokens + 4 (pad, start, end, oov) 89 | self.assertIsNotNone(summarizer.vectorizer) 90 | self.assertEqual(7, summarizer.vectorizer.encoding_dim) 91 | self.assertEqual(6, summarizer.vectorizer.decoding_dim) 92 | self.assertEqual(5, summarizer.vectorizer.max_input_len) 93 | self.assertEqual(3, summarizer.vectorizer.max_output_len) 94 | 95 | def test_train(self) -> None: 96 | 97 | class LogCallback(Callback): 98 | 99 | def __init__(self): 100 | super().__init__() 101 | 102 | def on_epoch_end(self, epoch, logs=None): 103 | self.logs = logs 104 | 105 | data = [('a b', 'a'), ('a b c', 'b')] 106 | 107 | summarizer = AttentionSummarizer(lstm_size=16, 108 | embedding_size=10) 109 | log_callback = LogCallback() 110 | trainer = Trainer(batch_size=2, 111 | steps_per_epoch=10, 112 | max_vocab_size_encoder=10, 113 | max_vocab_size_decoder=10, 114 | model_save_path=None, 115 | max_output_len=3) 116 | 117 | trainer.train(summarizer, 118 | data, 119 | num_epochs=2, 120 | callbacks=[log_callback]) 121 | 122 | logs = log_callback.logs 123 | self.assertAlmostEqual(1.7135955810546875, logs['loss'], 6) 124 | -------------------------------------------------------------------------------- /tests/test_training.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from headliner.losses import masked_crossentropy 7 | from headliner.model.basic_summarizer import BasicSummarizer 8 | from headliner.model.attention_summarizer import AttentionSummarizer 9 | from headliner.model.transformer_summarizer import TransformerSummarizer 10 | from headliner.preprocessing.dataset_generator import DatasetGenerator 11 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 12 | from headliner.preprocessing.preprocessor import Preprocessor 13 | from headliner.preprocessing.vectorizer import Vectorizer 14 | 15 | 16 | class TestTraining(unittest.TestCase): 17 | 18 | def setUp(self) -> None: 19 | tf.random.set_seed(42) 20 | np.random.seed(42) 21 | self.data = [('a b', 'c'), ('a b c', 'd')] 22 | tokenizer_encoder = KerasTokenizer(lower=False, filters='') 23 | tokenizer_decoder = KerasTokenizer(lower=False, filters='') 24 | tokenizer_encoder.fit(['a b c ']) 25 | tokenizer_decoder.fit(['c d ']) 26 | self.vectorizer = Vectorizer(tokenizer_encoder=tokenizer_encoder, 27 | tokenizer_decoder=tokenizer_decoder, 28 | max_output_len=3) 29 | self.preprocessor = Preprocessor() 30 | batch_generator = DatasetGenerator(2) 31 | data_prep = [self.preprocessor(d) for d in self.data] 32 | data_vecs = [self.vectorizer(d) for d in data_prep] 33 | self.dataset = batch_generator(lambda: data_vecs) 34 | self.loss_func = masked_crossentropy 35 | 36 | def test_training_summarizer_attention(self) -> None: 37 | attention_summarizer = AttentionSummarizer(lstm_size=10, 38 | embedding_size=10) 39 | attention_summarizer.init_model(preprocessor=self.preprocessor, 40 | vectorizer=self.vectorizer, 41 | embedding_weights_encoder=None, 42 | 
embedding_weights_decoder=None) 43 | loss_attention = 0 44 | train_step = attention_summarizer.new_train_step(loss_function=self.loss_func, 45 | batch_size=2) 46 | for _ in range(10): 47 | for source_seq, target_seq in self.dataset.take(-1): 48 | loss_attention = train_step(source_seq, target_seq) 49 | print(str(loss_attention)) 50 | 51 | self.assertAlmostEqual(1.577033519744873, float(loss_attention), 5) 52 | output_attention = attention_summarizer.predict_vectors('a c', '') 53 | expected_first_logits = np.array([-0.077805, 0.012667, 0.021359, -0.04872, 0.014989]) 54 | np.testing.assert_allclose(expected_first_logits, output_attention['logits'][0], atol=1e-6) 55 | self.assertEqual(' a c ', output_attention['preprocessed_text'][0]) 56 | self.assertEqual('d ', output_attention['predicted_text']) 57 | 58 | def test_training_summarizer_basic(self) -> None: 59 | basic_summarizer = BasicSummarizer(lstm_size=10, 60 | embedding_size=10) 61 | basic_summarizer.init_model(preprocessor=self.preprocessor, 62 | vectorizer=self.vectorizer, 63 | embedding_weights_encoder=None, 64 | embedding_weights_decoder=None) 65 | loss = 0 66 | train_step = basic_summarizer.new_train_step(loss_function=self.loss_func, 67 | batch_size=2) 68 | for e in range(0, 10): 69 | for source_seq, target_seq in self.dataset.take(-1): 70 | loss = train_step(source_seq, target_seq) 71 | 72 | self.assertAlmostEqual(1.5850255489349365, float(loss), 5) 73 | output = basic_summarizer.predict_vectors('a c', '') 74 | expected_first_logits = np.array([-0.00621 , 0.007277, 0.015851, -0.034298, 0.044253]) 75 | np.testing.assert_allclose(expected_first_logits, output['logits'][0], atol=1e-6) 76 | self.assertEqual(' a c ', output['preprocessed_text'][0]) 77 | self.assertEqual('', output['predicted_text']) 78 | 79 | def test_training_summarizer_transformer(self): 80 | transformer_summarizer = TransformerSummarizer(num_heads=1, 81 | num_layers=1, 82 | feed_forward_dim=20, 83 | embedding_size=10, 84 | dropout_rate=0, 85 | max_prediction_len=3) 86 | transformer_summarizer.init_model(preprocessor=self.preprocessor, 87 | vectorizer=self.vectorizer, 88 | embedding_weights_encoder=None, 89 | embedding_weights_decoder=None) 90 | loss_transformer = 0 91 | train_step = transformer_summarizer.new_train_step(loss_function=self.loss_func, 92 | batch_size=2) 93 | for e in range(0, 10): 94 | for source_seq, target_seq in self.dataset.take(-1): 95 | loss_transformer = train_step(source_seq, target_seq) 96 | print(str(loss_transformer)) 97 | 98 | self.assertAlmostEqual(1.3421446084976196, float(loss_transformer), 5) 99 | output_transformer = transformer_summarizer.predict_vectors('a c', '') 100 | expected_first_logits = np.array([-0.514366, 1.416978, -0.679771, -0.488442, -0.022602]) 101 | np.testing.assert_allclose(expected_first_logits, output_transformer['logits'][0], atol=1e-6) 102 | self.assertEqual(' a c ', output_transformer['preprocessed_text'][0]) 103 | self.assertEqual('c c c', output_transformer['predicted_text']) 104 | -------------------------------------------------------------------------------- /headliner/model/transformer_model.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | from headliner.model.transformer_util import * 4 | 5 | 6 | class Encoder(tf.keras.layers.Layer): 7 | 8 | def __init__(self, 9 | num_layers: int, 10 | num_heads: int, 11 | feed_forward_dim: int, 12 | embedding_shape: Tuple[int, int], 13 | embedding_trainable=True, 14 | embedding_weights=None, 15 | 
dropout_rate=0.1, 16 | max_seq_len=10000) -> None: 17 | super(Encoder, self).__init__() 18 | 19 | self.num_layers = num_layers 20 | vocab_size, vec_dim = embedding_shape 21 | weights = None if embedding_weights is None else [embedding_weights] 22 | self.embedding_size = vec_dim 23 | self.embedding = tf.keras.layers.Embedding(vocab_size, 24 | vec_dim, 25 | weights=weights, 26 | trainable=embedding_trainable) 27 | self.pos_encoding = positional_encoding(max_seq_len, self.embedding_size) 28 | self.enc_layers = [EncoderLayer(vec_dim, num_heads, feed_forward_dim, dropout_rate) 29 | for _ in range(num_layers)] 30 | self.dropout = tf.keras.layers.Dropout(dropout_rate) 31 | 32 | def call(self, x, training, mask): 33 | seq_len = tf.shape(x)[1] 34 | x = self.embedding(x) 35 | x *= tf.math.sqrt(tf.cast(self.embedding_size, tf.float32)) 36 | x += self.pos_encoding[:, :seq_len, :] 37 | x = self.dropout(x, training=training) 38 | for i in range(self.num_layers): 39 | x = self.enc_layers[i](x, training, mask) 40 | return x 41 | 42 | 43 | class Decoder(tf.keras.layers.Layer): 44 | 45 | def __init__(self, 46 | num_layers: int, 47 | num_heads: int, 48 | feed_forward_dim: int, 49 | embedding_shape: Tuple[int, int], 50 | embedding_trainable=True, 51 | embedding_weights=None, 52 | dropout_rate=0.1, 53 | max_seq_len=10000) -> None: 54 | super(Decoder, self).__init__() 55 | 56 | self.num_layers = num_layers 57 | vocab_size, vec_dim = embedding_shape 58 | weights = None if embedding_weights is None else [embedding_weights] 59 | self.embedding_size = vec_dim 60 | self.embedding = tf.keras.layers.Embedding(vocab_size, 61 | vec_dim, 62 | weights=weights, 63 | trainable=embedding_trainable) 64 | self.pos_encoding = positional_encoding(max_seq_len, vec_dim) 65 | self.dec_layers = [DecoderLayer(vec_dim, num_heads, feed_forward_dim, dropout_rate) 66 | for _ in range(num_layers)] 67 | self.dropout = tf.keras.layers.Dropout(dropout_rate) 68 | 69 | def call(self, 70 | x, 71 | enc_output, 72 | training, 73 | look_ahead_mask, 74 | padding_mask): 75 | seq_len = tf.shape(x)[1] 76 | attention_weights = {} 77 | 78 | x = self.embedding(x) 79 | x *= tf.math.sqrt(tf.cast(self.embedding_size, tf.float32)) 80 | x += self.pos_encoding[:, :seq_len, :] 81 | x = self.dropout(x, training=training) 82 | 83 | for i in range(self.num_layers): 84 | x, block1, block2 = self.dec_layers[i](x, enc_output, training, 85 | look_ahead_mask, padding_mask) 86 | attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1 87 | attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2 88 | 89 | return x, attention_weights 90 | 91 | 92 | class Transformer(tf.keras.Model): 93 | 94 | def __init__(self, 95 | num_layers: int, 96 | num_heads: int, 97 | feed_forward_dim: int, 98 | embedding_shape_encoder: Tuple[int, int], 99 | embedding_shape_decoder: Tuple[int, int], 100 | embedding_encoder_trainable=True, 101 | embedding_decoder_trainable=True, 102 | embedding_weights_encoder=None, 103 | embedding_weights_decoder=None, 104 | dropout_rate=0.1, 105 | max_sequence_len=10000) -> None: 106 | super(Transformer, self).__init__() 107 | 108 | self.encoder = Encoder(num_layers, 109 | num_heads, 110 | feed_forward_dim, 111 | embedding_shape_encoder, 112 | embedding_trainable=embedding_encoder_trainable, 113 | embedding_weights=embedding_weights_encoder, 114 | dropout_rate=dropout_rate, 115 | max_seq_len=max_sequence_len) 116 | 117 | self.decoder = Decoder(num_layers, 118 | num_heads, 119 | feed_forward_dim, 120 | embedding_shape_decoder, 121 | 
embedding_trainable=embedding_decoder_trainable, 122 | embedding_weights=embedding_weights_decoder, 123 | dropout_rate=dropout_rate, 124 | max_seq_len=max_sequence_len) 125 | 126 | self.final_layer = tf.keras.layers.Dense(embedding_shape_decoder[0]) 127 | 128 | def call(self, inp, tar, training, enc_padding_mask, 129 | look_ahead_mask, dec_padding_mask): 130 | enc_output = self.encoder(inp, training, enc_padding_mask) 131 | 132 | dec_output, attention_weights = self.decoder( 133 | tar, enc_output, training, look_ahead_mask, dec_padding_mask) 134 | final_output = self.final_layer(dec_output) # (batch_size, tar_seq_len, target_vocab_size) 135 | return final_output, attention_weights 136 | -------------------------------------------------------------------------------- /headliner/model/transformer_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def get_angles(pos, i, embedding_size): 6 | angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(embedding_size)) 7 | return pos * angle_rates 8 | 9 | 10 | def positional_encoding(max_len, embedding_size): 11 | angle_rads = get_angles(np.arange(max_len)[:, np.newaxis], 12 | np.arange(embedding_size)[np.newaxis, :], 13 | embedding_size) 14 | 15 | angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2]) 16 | angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2]) 17 | pos_encoding = angle_rads[np.newaxis, ...] 18 | return tf.cast(pos_encoding, dtype=tf.float32) 19 | 20 | 21 | def create_padding_mask(seq): 22 | seq = tf.cast(tf.math.equal(seq, 0), tf.float32) 23 | return seq[:, tf.newaxis, tf.newaxis, :] 24 | 25 | 26 | def create_look_ahead_mask(size): 27 | mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0) 28 | return mask 29 | 30 | 31 | def scaled_dot_product_attention(q, k, v, mask): 32 | matmul_qk = tf.matmul(q, k, transpose_b=True) 33 | dk = tf.cast(tf.shape(k)[-1], tf.float32) 34 | scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) 35 | if mask is not None: 36 | scaled_attention_logits += (mask * -1e9) 37 | attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) 38 | output = tf.matmul(attention_weights, v) 39 | 40 | return output, attention_weights 41 | 42 | 43 | def point_wise_feed_forward_network(embedding_size, feed_forward_dim): 44 | return tf.keras.Sequential([ 45 | tf.keras.layers.Dense(feed_forward_dim, activation='relu'), 46 | tf.keras.layers.Dense(embedding_size) 47 | ]) 48 | 49 | 50 | def create_masks(inp, tar): 51 | enc_padding_mask = create_padding_mask(inp) 52 | dec_padding_mask = create_padding_mask(inp) 53 | look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1]) 54 | dec_target_padding_mask = create_padding_mask(tar) 55 | combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask) 56 | return enc_padding_mask, combined_mask, dec_padding_mask 57 | 58 | 59 | class MultiHeadAttention(tf.keras.layers.Layer): 60 | 61 | def __init__(self, 62 | embedding_size: int, 63 | num_heads: int) -> None: 64 | super(MultiHeadAttention, self).__init__() 65 | self.num_heads = num_heads 66 | self.embedding_size = embedding_size 67 | assert embedding_size % self.num_heads == 0 68 | self.depth = embedding_size // self.num_heads 69 | self.wq = tf.keras.layers.Dense(embedding_size) 70 | self.wk = tf.keras.layers.Dense(embedding_size) 71 | self.wv = tf.keras.layers.Dense(embedding_size) 72 | self.dense = tf.keras.layers.Dense(embedding_size) 73 | 74 | def split_heads(self, x, batch_size): 75 | x = tf.reshape(x, 
(batch_size, -1, self.num_heads, self.depth)) 76 | return tf.transpose(x, perm=[0, 2, 1, 3]) 77 | 78 | def call(self, v, k, q, mask): 79 | batch_size = tf.shape(q)[0] 80 | q = self.wq(q) 81 | k = self.wk(k) 82 | v = self.wv(v) 83 | q = self.split_heads(q, batch_size) 84 | k = self.split_heads(k, batch_size) 85 | v = self.split_heads(v, batch_size) 86 | scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask) 87 | scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3]) 88 | concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embedding_size)) 89 | output = self.dense(concat_attention) 90 | return output, attention_weights 91 | 92 | 93 | class EncoderLayer(tf.keras.layers.Layer): 94 | 95 | def __init__(self, 96 | embedding_size: int, 97 | num_heads: int, 98 | feed_forward_dim: int, 99 | dropout_rate=0.1) -> None: 100 | super(EncoderLayer, self).__init__() 101 | self.mha = MultiHeadAttention(embedding_size, num_heads) 102 | self.ffn = point_wise_feed_forward_network(embedding_size, feed_forward_dim) 103 | self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 104 | self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 105 | self.dropout1 = tf.keras.layers.Dropout(dropout_rate) 106 | self.dropout2 = tf.keras.layers.Dropout(dropout_rate) 107 | 108 | def call(self, x, training, mask): 109 | attn_output, _ = self.mha(x, x, x, mask) # (batch_size, input_seq_len, embedding_size) 110 | attn_output = self.dropout1(attn_output, training=training) 111 | out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, embedding_size) 112 | ffn_output = self.ffn(out1) # (batch_size, input_seq_len, embedding_size) 113 | ffn_output = self.dropout2(ffn_output, training=training) 114 | out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, embedding_size) 115 | return out2 116 | 117 | 118 | class DecoderLayer(tf.keras.layers.Layer): 119 | 120 | def __init__(self, 121 | embedding_size: int, 122 | num_heads: int, 123 | feed_forward_dim: int, 124 | dropout_rate=0.1) -> None: 125 | super(DecoderLayer, self).__init__() 126 | self.mha1 = MultiHeadAttention(embedding_size, num_heads) 127 | self.mha2 = MultiHeadAttention(embedding_size, num_heads) 128 | self.ffn = point_wise_feed_forward_network(embedding_size, feed_forward_dim) 129 | self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 130 | self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 131 | self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 132 | self.dropout1 = tf.keras.layers.Dropout(dropout_rate) 133 | self.dropout2 = tf.keras.layers.Dropout(dropout_rate) 134 | self.dropout3 = tf.keras.layers.Dropout(dropout_rate) 135 | 136 | def call(self, 137 | x, 138 | enc_output, 139 | training, 140 | look_ahead_mask, 141 | padding_mask): 142 | attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask) 143 | attn1 = self.dropout1(attn1, training=training) 144 | out1 = self.layernorm1(attn1 + x) 145 | attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask) 146 | attn2 = self.dropout2(attn2, training=training) 147 | out2 = self.layernorm2(attn2 + out1) 148 | ffn_output = self.ffn(out2) 149 | ffn_output = self.dropout3(ffn_output, training=training) 150 | out3 = self.layernorm3(ffn_output + out2) 151 | 152 | return out3, attn_weights_block1, attn_weights_block2 153 | -------------------------------------------------------------------------------- /headliner/model/bert_model.py: 
-------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | from transformers import TFBertModel 4 | 5 | from headliner.model.transformer_util import * 6 | 7 | 8 | class Encoder(tf.keras.Model): 9 | 10 | def __init__(self, 11 | num_layers: int, 12 | num_heads: int, 13 | feed_forward_dim: int, 14 | embedding_shape: Tuple[int, int], 15 | bert_embedding_name=None, 16 | embedding_trainable=True, 17 | embedding_weights=None, 18 | dropout_rate=0.1, 19 | max_seq_len=10000) -> None: 20 | super(Encoder, self).__init__() 21 | 22 | self.num_layers = num_layers 23 | vocab_size, vec_dim = embedding_shape 24 | self.embedding_size = vec_dim 25 | self.bert_embedding_name = bert_embedding_name 26 | if bert_embedding_name is not None: 27 | self.embedding = TFBertModel.from_pretrained(bert_embedding_name) 28 | self.embedding.trainable = embedding_trainable 29 | else: 30 | weights = None if embedding_weights is None else [embedding_weights] 31 | self.embedding = tf.keras.layers.Embedding(vocab_size, 32 | vec_dim, 33 | weights=weights, 34 | trainable=embedding_trainable) 35 | self.pos_encoding = positional_encoding(max_seq_len, self.embedding_size) 36 | self.enc_layers = [EncoderLayer(vec_dim, num_heads, feed_forward_dim, dropout_rate) 37 | for _ in range(num_layers)] 38 | self.dropout = tf.keras.layers.Dropout(dropout_rate) 39 | 40 | def call(self, x, sent_ids, training, mask): 41 | seq_len = tf.shape(x)[1] 42 | # make attention mask consumable for huggingface transformes 43 | # 1 for non-masked tokens, 0 for masked tokens 44 | mask = mask[:, 0, 0, :] 45 | attention_mask = tf.cast(tf.math.equal(mask, 0), tf.int32) 46 | if self.bert_embedding_name is not None: 47 | x = {'input_ids': x, 48 | 'token_type_ids': sent_ids, 49 | 'attention_mask': attention_mask} 50 | x = self.embedding(x, training=training)[0] 51 | else: 52 | x = self.embedding(x) 53 | x *= tf.math.sqrt(tf.cast(self.embedding_size, tf.float32)) 54 | x += self.pos_encoding[:, :seq_len, :] 55 | x = self.dropout(x, training=training) 56 | for i in range(self.num_layers): 57 | x = self.enc_layers[i](x, training, mask) 58 | return x 59 | 60 | 61 | class Decoder(tf.keras.Model): 62 | 63 | def __init__(self, 64 | num_layers: int, 65 | num_heads: int, 66 | feed_forward_dim: int, 67 | embedding_shape: Tuple[int, int], 68 | embedding_trainable=True, 69 | embedding_weights=None, 70 | dropout_rate=0.1, 71 | max_seq_len=10000) -> None: 72 | super(Decoder, self).__init__() 73 | 74 | self.num_layers = num_layers 75 | vocab_size, vec_dim = embedding_shape 76 | self.embedding_size = vec_dim 77 | weights = None if embedding_weights is None else [embedding_weights] 78 | self.embedding = tf.keras.layers.Embedding(vocab_size, 79 | vec_dim, 80 | weights=weights, 81 | trainable=embedding_trainable) 82 | self.pos_encoding = positional_encoding(max_seq_len, vec_dim) 83 | self.dec_layers = [DecoderLayer(vec_dim, num_heads, feed_forward_dim, dropout_rate) 84 | for _ in range(num_layers)] 85 | self.dropout = tf.keras.layers.Dropout(dropout_rate) 86 | self.final_layer = tf.keras.layers.Dense(embedding_shape[0]) 87 | 88 | def call(self, 89 | x, 90 | enc_output, 91 | training, 92 | look_ahead_mask, 93 | padding_mask): 94 | seq_len = tf.shape(x)[1] 95 | attention_weights = {} 96 | 97 | x = self.embedding(x) 98 | x *= tf.math.sqrt(tf.cast(self.embedding_size, tf.float32)) 99 | x += self.pos_encoding[:, :seq_len, :] 100 | x = self.dropout(x, training=training) 101 | for i in range(self.num_layers): 102 | x, block1, block2 = 
self.dec_layers[i](x, enc_output, training, 103 | look_ahead_mask, padding_mask) 104 | attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1 105 | attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2 106 | x = self.final_layer(x) 107 | return x, attention_weights 108 | 109 | 110 | class Transformer(tf.keras.Model): 111 | 112 | def __init__(self, 113 | num_layers_encoder: int, 114 | num_layers_decoder: int, 115 | num_heads: int, 116 | feed_forward_dim: int, 117 | embedding_shape_encoder: Tuple[int, int], 118 | embedding_shape_decoder: Tuple[int, int], 119 | bert_embedding_encoder=None, 120 | embedding_encoder_trainable=True, 121 | embedding_decoder_trainable=True, 122 | embedding_weights_encoder=None, 123 | embedding_weights_decoder=None, 124 | dropout_rate=0.1, 125 | max_seq_len=10000) -> None: 126 | super(Transformer, self).__init__() 127 | 128 | self.encoder = Encoder(num_layers_encoder, 129 | num_heads, 130 | feed_forward_dim, 131 | embedding_shape_encoder, 132 | bert_embedding_name=bert_embedding_encoder, 133 | embedding_trainable=embedding_encoder_trainable, 134 | embedding_weights=embedding_weights_encoder, 135 | dropout_rate=dropout_rate, 136 | max_seq_len=max_seq_len) 137 | 138 | self.decoder = Decoder(num_layers_decoder, 139 | num_heads, 140 | feed_forward_dim, 141 | embedding_shape_decoder, 142 | embedding_trainable=embedding_decoder_trainable, 143 | embedding_weights=embedding_weights_decoder, 144 | dropout_rate=dropout_rate, 145 | max_seq_len=max_seq_len) 146 | 147 | def call(self, 148 | inp, 149 | sent_ids, 150 | tar, 151 | training, 152 | enc_padding_mask, 153 | look_ahead_mask, 154 | dec_padding_mask): 155 | enc_output = self.encoder(inp, sent_ids, training, enc_padding_mask) 156 | dec_output, attention_weights = self.decoder( 157 | tar, enc_output, training, look_ahead_mask, dec_padding_mask) 158 | return dec_output, attention_weights 159 | -------------------------------------------------------------------------------- /headliner/model/basic_summarizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Callable, Dict, Union 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from headliner.model.basic_model import Encoder, Decoder 9 | from headliner.model.summarizer import Summarizer 10 | from headliner.preprocessing.preprocessor import Preprocessor 11 | from headliner.preprocessing.vectorizer import Vectorizer 12 | 13 | 14 | class BasicSummarizer(Summarizer): 15 | 16 | def __init__(self, lstm_size=50, max_prediction_len=20, embedding_size=50, embedding_encoder_trainable=True, 17 | embedding_decoder_trainable=True): 18 | 19 | super().__init__() 20 | self.lstm_size = lstm_size 21 | self.max_prediction_len = max_prediction_len 22 | self.embedding_size = embedding_size 23 | self.embedding_encoder_trainable = embedding_encoder_trainable 24 | self.embedding_decoder_trainable = embedding_decoder_trainable 25 | self.optimizer = BasicSummarizer._new_optimizer() 26 | self.encoder = None 27 | self.decoder = None 28 | self.embedding_shape_in = None 29 | self.embedding_shape_out = None 30 | 31 | def init_model(self, 32 | preprocessor: Preprocessor, 33 | vectorizer: Vectorizer, 34 | embedding_weights_encoder=None, 35 | embedding_weights_decoder=None) -> None: 36 | self.preprocessor = preprocessor 37 | self.vectorizer = vectorizer 38 | self.embedding_shape_in = (self.vectorizer.encoding_dim, self.embedding_size) 39 | self.embedding_shape_out = 
(self.vectorizer.decoding_dim, self.embedding_size) 40 | self.encoder = Encoder(self.embedding_shape_in, 41 | self.lstm_size, 42 | embedding_trainable=self.embedding_encoder_trainable, 43 | embedding_weights=embedding_weights_encoder) 44 | self.decoder = Decoder(self.embedding_shape_out, 45 | self.lstm_size, 46 | embedding_trainable=self.embedding_decoder_trainable, 47 | embedding_weights=embedding_weights_decoder) 48 | self.encoder.compile(optimizer=self.optimizer) 49 | self.decoder.compile(optimizer=self.optimizer) 50 | 51 | def __getstate__(self): 52 | """ Prevents pickle from serializing encoder and decoder. """ 53 | state = self.__dict__.copy() 54 | del state['encoder'] 55 | del state['decoder'] 56 | del state['optimizer'] 57 | return state 58 | 59 | def predict(self, text: str) -> str: 60 | return self.predict_vectors(text, '')['predicted_text'] 61 | 62 | def predict_vectors(self, input_text: str, target_text: str) -> Dict[str, Union[str, np.array]]: 63 | text_preprocessed = self.preprocessor((input_text, target_text)) 64 | en_inputs, _ = self.vectorizer(text_preprocessed) 65 | en_initial_states = self.encoder.init_states(1) 66 | en_outputs = self.encoder(tf.constant([en_inputs]), en_initial_states) 67 | start_end_seq = self.vectorizer.encode_output( 68 | ' '.join([self.preprocessor.start_token, self.preprocessor.end_token])) 69 | de_start_index, de_end_index = start_end_seq[:1], start_end_seq[-1:] 70 | de_input = tf.constant([de_start_index]) 71 | de_state_h, de_state_c = en_outputs[1:] 72 | output = {'preprocessed_text': text_preprocessed, 73 | 'logits': [], 74 | 'alignment': [], 75 | 'predicted_sequence': []} 76 | for _ in range(self.max_prediction_len): 77 | de_output, de_state_h, de_state_c = self.decoder(de_input, (de_state_h, de_state_c)) 78 | de_input = tf.argmax(de_output, -1) 79 | pred_token_index = de_input.numpy()[0][0] 80 | if pred_token_index != 0: 81 | output['logits'].append(np.squeeze(de_output.numpy())) 82 | output['predicted_sequence'].append(pred_token_index) 83 | if pred_token_index == de_end_index: 84 | break 85 | output['predicted_text'] = self.vectorizer.decode_output(output['predicted_sequence']) 86 | return output 87 | 88 | def new_train_step(self, 89 | loss_function: Callable[[tf.Tensor], tf.Tensor], 90 | batch_size: int, 91 | apply_gradients=True) -> Callable[[tf.Tensor, tf.Tensor], float]: 92 | 93 | train_step_signature = [ 94 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 95 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 96 | ] 97 | encoder = self.encoder 98 | decoder = self.decoder 99 | optimizer = self.optimizer 100 | 101 | @tf.function(input_signature=train_step_signature) 102 | def train_step(source_seq: tf.Tensor, 103 | target_seq: tf.Tensor) -> float: 104 | en_initial_states = self.encoder.init_states(source_seq.get_shape()[0]) 105 | with tf.GradientTape() as tape: 106 | en_outputs = encoder(source_seq, en_initial_states) 107 | en_states = en_outputs[1:] 108 | de_states = en_states 109 | de_outputs = decoder(target_seq[:, :-1], de_states) 110 | logits = de_outputs[0] 111 | loss = loss_function(target_seq[:, 1:], logits) 112 | if apply_gradients is True: 113 | variables = encoder.trainable_variables + decoder.trainable_variables 114 | gradients = tape.gradient(loss, variables) 115 | optimizer.apply_gradients(zip(gradients, variables)) 116 | return float(loss) 117 | 118 | return train_step 119 | 120 | def save(self, out_path): 121 | if not os.path.exists(out_path): 122 | os.mkdir(out_path) 123 | summarizer_path = 
os.path.join(out_path, 'summarizer.pkl') 124 | encoder_path = os.path.join(out_path, 'encoder') 125 | decoder_path = os.path.join(out_path, 'decoder') 126 | with open(summarizer_path, 'wb+') as handle: 127 | pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL) 128 | self.encoder.save_weights(encoder_path, save_format='tf') 129 | self.decoder.save_weights(decoder_path, save_format='tf') 130 | 131 | @staticmethod 132 | def load(in_path): 133 | summarizer_path = os.path.join(in_path, 'summarizer.pkl') 134 | encoder_path = os.path.join(in_path, 'encoder') 135 | decoder_path = os.path.join(in_path, 'decoder') 136 | with open(summarizer_path, 'rb') as handle: 137 | summarizer = pickle.load(handle) 138 | summarizer.encoder = Encoder(summarizer.embedding_shape_in, 139 | summarizer.lstm_size, 140 | embedding_trainable=summarizer.embedding_encoder_trainable) 141 | summarizer.decoder = Decoder(summarizer.embedding_shape_out, 142 | summarizer.lstm_size, 143 | embedding_trainable=summarizer.embedding_decoder_trainable) 144 | optimizer = BasicSummarizer._new_optimizer() 145 | summarizer.encoder.compile(optimizer=optimizer) 146 | summarizer.decoder.compile(optimizer=optimizer) 147 | summarizer.encoder.load_weights(encoder_path) 148 | summarizer.decoder.load_weights(decoder_path) 149 | summarizer.optimizer = summarizer.encoder.optimizer 150 | return summarizer 151 | 152 | @staticmethod 153 | def _new_optimizer() -> tf.keras.optimizers.Optimizer: 154 | return tf.keras.optimizers.Adam() 155 | -------------------------------------------------------------------------------- /mkdocs/autogen.py: -------------------------------------------------------------------------------- 1 | # Heavily borrowed from the Auto-Keras project: 2 | # https://github.com/jhfjhfj1/autokeras/blob/master/mkdocs/autogen.py 3 | 4 | import ast 5 | import os 6 | import re 7 | 8 | 9 | def delete_space(parts, start, end): 10 | if start > end or end >= len(parts): 11 | return None 12 | count = 0 13 | while count < len(parts[start]): 14 | if parts[start][count] == ' ': 15 | count += 1 16 | else: 17 | break 18 | return '\n'.join(y for y in [x[count:] for x in parts[start : end + 1] if len(x) > count]) 19 | 20 | 21 | def change_args_to_dict(string): 22 | if string is None: 23 | return None 24 | ans = [] 25 | strings = string.split('\n') 26 | ind = 1 27 | start = 0 28 | while ind <= len(strings): 29 | if ind < len(strings) and strings[ind].startswith(" "): 30 | ind += 1 31 | else: 32 | if start < ind: 33 | ans.append('\n'.join(strings[start:ind])) 34 | start = ind 35 | ind += 1 36 | d = {} 37 | for line in ans: 38 | if ":" in line and len(line) > 0: 39 | lines = line.split(":") 40 | d[lines[0]] = lines[1].strip() 41 | return d 42 | 43 | 44 | def remove_next_line(comments): 45 | for x in comments: 46 | if comments[x] is not None and '\n' in comments[x]: 47 | comments[x] = ' '.join(comments[x].split('\n')) 48 | return comments 49 | 50 | 51 | def skip_space_line(parts, ind): 52 | while ind < len(parts): 53 | if re.match(r'^\s*$', parts[ind]): 54 | ind += 1 55 | else: 56 | break 57 | return ind 58 | 59 | 60 | # check if comment is None or len(comment) == 0 return {} 61 | def parse_func_string(comment): 62 | if comment is None or len(comment) == 0: 63 | return {} 64 | comments = {} 65 | paras = ('Args', 'Attributes', 'Returns', 'Raises') 66 | comment_parts = [ 67 | 'short_description', 68 | 'long_description', 69 | 'Args', 70 | 'Attributes', 71 | 'Returns', 72 | 'Raises', 73 | ] 74 | for x in comment_parts: 75 | comments[x] = None 76 | 77 | 
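    # Descriptive note (added): the parser below assumes Google-style docstrings. The first
    # blank line ends the short description, everything up to the first section header becomes
    # the long description, and the remaining lines are grouped under the headers listed in
    # `paras` ('Args', 'Attributes', 'Returns', 'Raises'); Args/Attributes/Returns are converted
    # to dicts by change_args_to_dict, while Raises is kept as plain text.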
parts = re.split(r'\n', comment) 78 | ind = 1 79 | while ind < len(parts): 80 | if re.match(r'^\s*$', parts[ind]): 81 | break 82 | else: 83 | ind += 1 84 | 85 | comments['short_description'] = '\n'.join( 86 | ['\n'.join(re.split('\n\s+', x.strip())) for x in parts[0:ind]] 87 | ).strip(':\n\t ') 88 | ind = skip_space_line(parts, ind) 89 | 90 | start = ind 91 | while ind < len(parts): 92 | if parts[ind].strip().startswith(paras): 93 | break 94 | else: 95 | ind += 1 96 | long_description = '\n'.join( 97 | ['\n'.join(re.split('\n\s+', x.strip())) for x in parts[start:ind]] 98 | ).strip(':\n\t ') 99 | comments['long_description'] = long_description 100 | 101 | ind = skip_space_line(paras, ind) 102 | while ind < len(parts): 103 | if parts[ind].strip().startswith(paras): 104 | start = ind 105 | start_with = parts[ind].strip() 106 | ind += 1 107 | while ind < len(parts): 108 | if parts[ind].strip().startswith(paras): 109 | break 110 | else: 111 | ind += 1 112 | part = delete_space(parts, start + 1, ind - 1) 113 | if start_with.startswith(paras[0]): 114 | comments[paras[0]] = change_args_to_dict(part) 115 | elif start_with.startswith(paras[1]): 116 | comments[paras[1]] = change_args_to_dict(part) 117 | elif start_with.startswith(paras[2]): 118 | comments[paras[2]] = change_args_to_dict(part) 119 | elif start_with.startswith(paras[3]): 120 | comments[paras[3]] = part 121 | ind = skip_space_line(parts, ind) 122 | else: 123 | ind += 1 124 | 125 | remove_next_line(comments) 126 | return comments 127 | 128 | 129 | def md_parse_line_break(comment): 130 | comment = comment.replace(' ', '\n\n') 131 | return comment.replace(' - ', '\n\n- ') 132 | 133 | 134 | def to_md(comment_dict): 135 | doc = '' 136 | if 'short_description' in comment_dict: 137 | doc += comment_dict['short_description'] 138 | doc += '\n\n' 139 | 140 | if 'long_description' in comment_dict: 141 | doc += md_parse_line_break(comment_dict['long_description']) 142 | doc += '\n' 143 | 144 | if 'Args' in comment_dict and comment_dict['Args'] is not None: 145 | doc += '##### Args\n' 146 | for arg, des in comment_dict['Args'].items(): 147 | doc += '* **' + arg + '**: ' + des + '\n\n' 148 | 149 | if 'Attributes' in comment_dict and comment_dict['Attributes'] is not None: 150 | doc += '##### Attributes\n' 151 | for arg, des in comment_dict['Attributes'].items(): 152 | doc += '* **' + arg + '**: ' + des + '\n\n' 153 | 154 | if 'Returns' in comment_dict and comment_dict['Returns'] is not None: 155 | doc += '##### Returns\n' 156 | if isinstance(comment_dict['Returns'], str): 157 | doc += comment_dict['Returns'] 158 | doc += '\n' 159 | else: 160 | for arg, des in comment_dict['Returns'].items(): 161 | doc += '* **' + arg + '**: ' + des + '\n\n' 162 | return doc 163 | 164 | 165 | def parse_func_args(function): 166 | args = [a.arg for a in function.args.args if a.arg != 'self'] 167 | kwargs = [] 168 | if function.args.kwarg: 169 | kwargs = ['**' + function.args.kwarg.arg] 170 | 171 | return '(' + ', '.join(args + kwargs) + ')' 172 | 173 | 174 | def get_func_comments(function_definitions): 175 | doc = '' 176 | for f in function_definitions: 177 | temp_str = to_md(parse_func_string(ast.get_docstring(f))) 178 | doc += ''.join( 179 | [ 180 | '### ', 181 | f.name.replace('_', '\\_'), 182 | '\n', 183 | '```python', 184 | '\n', 185 | 'def ', 186 | f.name, 187 | parse_func_args(f), 188 | '\n', 189 | '```', 190 | '\n', 191 | temp_str, 192 | '\n', 193 | ] 194 | ) 195 | 196 | return doc 197 | 198 | 199 | def get_comments_str(file_name): 200 | with open(file_name) as 
fd: 201 | file_contents = fd.read() 202 | module = ast.parse(file_contents) 203 | 204 | function_definitions = [node for node in module.body if isinstance(node, ast.FunctionDef)] 205 | 206 | doc = get_func_comments(function_definitions) 207 | 208 | class_definitions = [node for node in module.body if isinstance(node, ast.ClassDef)] 209 | for class_def in class_definitions: 210 | temp_str = to_md(parse_func_string(ast.get_docstring(class_def))) 211 | 212 | # excludes private methods (start with '_') 213 | method_definitions = [ 214 | node 215 | for node in class_def.body 216 | if isinstance(node, ast.FunctionDef) and (node.name[0] != '_' or node.name[:2] == '__') 217 | ] 218 | 219 | temp_str += get_func_comments(method_definitions) 220 | doc += '## class ' + class_def.name + '\n' + temp_str 221 | return doc 222 | 223 | 224 | def extract_comments(directory): 225 | for parent, dir_names, file_names in os.walk(directory): 226 | for file_name in file_names: 227 | if os.path.splitext(file_name)[1] == '.py' and file_name != '__init__.py': 228 | # with open 229 | doc = get_comments_str(os.path.join(parent, file_name)) 230 | directory = os.path.join('docs', parent.replace('../headliner/', '')) 231 | if not os.path.exists(directory): 232 | os.makedirs(directory) 233 | 234 | output_file = open(os.path.join(directory, file_name[:-3] + '.md'), 'w') 235 | output_file.write(doc) 236 | output_file.close() 237 | 238 | 239 | extract_comments('../headliner/') -------------------------------------------------------------------------------- /headliner/model/attention_summarizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Callable, Dict, Union 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from headliner.model.attention_model import Encoder, Decoder 9 | from headliner.model.summarizer import Summarizer 10 | from headliner.preprocessing.preprocessor import Preprocessor 11 | from headliner.preprocessing.vectorizer import Vectorizer 12 | 13 | 14 | class AttentionSummarizer(Summarizer): 15 | 16 | def __init__(self, lstm_size=50, max_prediction_len=20, embedding_size=50, embedding_encoder_trainable=True, 17 | embedding_decoder_trainable=True): 18 | super().__init__() 19 | self.lstm_size = lstm_size 20 | self.max_prediction_len = max_prediction_len 21 | self.embedding_size = embedding_size 22 | self.embedding_encoder_trainable = embedding_encoder_trainable 23 | self.embedding_decoder_trainable = embedding_decoder_trainable 24 | self.optimizer = AttentionSummarizer._new_optimizer() 25 | self.encoder = None 26 | self.decoder = None 27 | self.embedding_shape_in = None 28 | self.embedding_shape_out = None 29 | 30 | def init_model(self, 31 | preprocessor: Preprocessor, 32 | vectorizer: Vectorizer, 33 | embedding_weights_encoder=None, 34 | embedding_weights_decoder=None) -> None: 35 | self.preprocessor = preprocessor 36 | self.vectorizer = vectorizer 37 | self.embedding_shape_in = (self.vectorizer.encoding_dim, self.embedding_size) 38 | self.embedding_shape_out = (self.vectorizer.decoding_dim, self.embedding_size) 39 | self.encoder = Encoder(self.embedding_shape_in, 40 | self.lstm_size, 41 | embedding_trainable=self.embedding_encoder_trainable, 42 | embedding_weights=embedding_weights_encoder) 43 | self.decoder = Decoder(self.embedding_shape_out, 44 | self.lstm_size, 45 | embedding_trainable=self.embedding_decoder_trainable, 46 | embedding_weights=embedding_weights_decoder) 47 | 
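        # Descriptive note (added): encoder and decoder share a single Adam optimizer; their
        # weights are updated jointly in the train step created by new_train_step below.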
self.encoder.compile(optimizer=self.optimizer) 48 | self.decoder.compile(optimizer=self.optimizer) 49 | 50 | def __getstate__(self): 51 | """ Prevents pickle from serializing encoder and decoder """ 52 | state = self.__dict__.copy() 53 | del state['encoder'] 54 | del state['decoder'] 55 | del state['optimizer'] 56 | return state 57 | 58 | def predict(self, text: str) -> str: 59 | return self.predict_vectors(text, '')['predicted_text'] 60 | 61 | def predict_vectors(self, input_text: str, target_text: str) -> Dict[str, Union[str, np.array]]: 62 | text_preprocessed = self.preprocessor((input_text, target_text)) 63 | en_inputs, _ = self.vectorizer(text_preprocessed) 64 | en_initial_states = self.encoder.init_states(1) 65 | en_outputs = self.encoder(tf.constant([en_inputs]), en_initial_states) 66 | start_end_seq = self.vectorizer.encode_output( 67 | ' '.join([self.preprocessor.start_token, self.preprocessor.end_token])) 68 | de_start_index, de_end_index = start_end_seq[:1], start_end_seq[-1:] 69 | de_input = tf.constant([de_start_index]) 70 | de_state_h, de_state_c = en_outputs[1:] 71 | output = {'preprocessed_text': text_preprocessed, 72 | 'logits': [], 73 | 'alignment': [], 74 | 'predicted_sequence': []} 75 | for _ in range(self.max_prediction_len): 76 | de_output, de_state_h, de_state_c, alignment = self.decoder(de_input, (de_state_h, de_state_c), 77 | en_outputs[0]) 78 | de_input = tf.expand_dims(tf.argmax(de_output, -1), 0) 79 | pred_token_index = de_input.numpy()[0][0] 80 | if pred_token_index != 0: 81 | output['logits'].append(np.squeeze(de_output.numpy())) 82 | output['alignment'].append(np.squeeze(alignment.numpy())) 83 | output['predicted_sequence'].append(pred_token_index) 84 | if pred_token_index == de_end_index: 85 | break 86 | output['predicted_text'] = self.vectorizer.decode_output(output['predicted_sequence']) 87 | return output 88 | 89 | def new_train_step(self, 90 | loss_function: Callable[[tf.Tensor], tf.Tensor], 91 | batch_size: int, 92 | apply_gradients=True) -> Callable[[tf.Tensor, tf.Tensor], float]: 93 | 94 | train_step_signature = [ 95 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 96 | tf.TensorSpec(shape=(batch_size, self.vectorizer.max_output_len), dtype=tf.int32), 97 | ] 98 | encoder = self.encoder 99 | decoder = self.decoder 100 | optimizer = self.optimizer 101 | 102 | def train_step(source_seq, target_seq): 103 | loss = 0 104 | en_initial_states = encoder.init_states(batch_size) 105 | with tf.GradientTape() as tape: 106 | en_outputs = encoder(source_seq, en_initial_states) 107 | en_states = en_outputs[1:] 108 | de_state_h, de_state_c = en_states 109 | for i in range(target_seq.shape[1] - 1): 110 | decoder_in = tf.expand_dims(target_seq[:, i], 1) 111 | logit, de_state_h, de_state_c, _ = decoder( 112 | decoder_in, (de_state_h, de_state_c), en_outputs[0]) 113 | loss += loss_function(target_seq[:, i + 1], logit) 114 | if apply_gradients is True: 115 | variables = encoder.trainable_variables + decoder.trainable_variables 116 | gradients = tape.gradient(loss, variables) 117 | optimizer.apply_gradients(zip(gradients, variables)) 118 | return loss / (target_seq.shape[1] - 1) 119 | 120 | if self.vectorizer.max_output_len is not None: 121 | return tf.function(train_step, input_signature=train_step_signature) 122 | else: 123 | return train_step 124 | 125 | def save(self, out_path: str) -> None: 126 | if not os.path.exists(out_path): 127 | os.mkdir(out_path) 128 | summarizer_path = os.path.join(out_path, 'summarizer.pkl') 129 | encoder_path = 
os.path.join(out_path, 'encoder') 130 | decoder_path = os.path.join(out_path, 'decoder') 131 | with open(summarizer_path, 'wb+') as handle: 132 | pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL) 133 | self.encoder.save_weights(encoder_path, save_format='tf') 134 | self.decoder.save_weights(decoder_path, save_format='tf') 135 | 136 | @staticmethod 137 | def load(in_path: str): 138 | summarizer_path = os.path.join(in_path, 'summarizer.pkl') 139 | encoder_path = os.path.join(in_path, 'encoder') 140 | decoder_path = os.path.join(in_path, 'decoder') 141 | with open(summarizer_path, 'rb') as handle: 142 | summarizer = pickle.load(handle) 143 | summarizer.encoder = Encoder(summarizer.embedding_shape_in, 144 | summarizer.lstm_size, 145 | embedding_trainable=summarizer.embedding_encoder_trainable) 146 | summarizer.decoder = Decoder(summarizer.embedding_shape_out, 147 | summarizer.lstm_size, 148 | embedding_trainable=summarizer.embedding_decoder_trainable) 149 | optimizer = AttentionSummarizer._new_optimizer() 150 | summarizer.encoder.compile(optimizer=optimizer) 151 | summarizer.decoder.compile(optimizer=optimizer) 152 | summarizer.encoder.load_weights(encoder_path) 153 | summarizer.decoder.load_weights(decoder_path) 154 | summarizer.optimizer = summarizer.encoder.optimizer 155 | return summarizer 156 | 157 | @staticmethod 158 | def _new_optimizer() -> tf.keras.optimizers.Optimizer: 159 | return tf.keras.optimizers.Adam() 160 | -------------------------------------------------------------------------------- /headliner/model/transformer_summarizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Callable 4 | from typing import Dict, Union 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from headliner.model.summarizer import Summarizer 10 | from headliner.model.transformer_model import Transformer, create_masks 11 | from headliner.preprocessing.preprocessor import Preprocessor 12 | from headliner.preprocessing.vectorizer import Vectorizer 13 | 14 | 15 | class TransformerSummarizer(Summarizer): 16 | 17 | def __init__(self, 18 | max_prediction_len=20, 19 | num_layers=1, 20 | num_heads=2, 21 | feed_forward_dim=512, 22 | dropout_rate=0, 23 | embedding_size=128, 24 | embedding_encoder_trainable=True, 25 | embedding_decoder_trainable=True, 26 | max_sequence_len=10000): 27 | 28 | super().__init__() 29 | self.max_prediction_len = max_prediction_len 30 | self.embedding_size = embedding_size 31 | self.num_layers = num_layers 32 | self.num_heads = num_heads 33 | self.dropout_rate = dropout_rate 34 | self.feed_forward_dim = feed_forward_dim 35 | self.embedding_encoder_trainable = embedding_encoder_trainable 36 | self.embedding_decoder_trainable = embedding_decoder_trainable 37 | self.optimizer = TransformerSummarizer.new_optimizer() 38 | self.transformer = None 39 | self.embedding_shape_in = None 40 | self.embedding_shape_out = None 41 | self.max_sequence_len = max_sequence_len 42 | 43 | def __getstate__(self): 44 | """ Prevents pickle from serializing the transformer and optimizer """ 45 | state = self.__dict__.copy() 46 | del state['transformer'] 47 | del state['optimizer'] 48 | return state 49 | 50 | def init_model(self, 51 | preprocessor: Preprocessor, 52 | vectorizer: Vectorizer, 53 | embedding_weights_encoder=None, 54 | embedding_weights_decoder=None) -> None: 55 | self.preprocessor = preprocessor 56 | self.vectorizer = vectorizer 57 | self.embedding_shape_in = (self.vectorizer.encoding_dim, 
self.embedding_size) 58 | self.embedding_shape_out = (self.vectorizer.decoding_dim, self.embedding_size) 59 | self.transformer = Transformer(num_layers=self.num_layers, 60 | num_heads=self.num_heads, 61 | feed_forward_dim=self.feed_forward_dim, 62 | embedding_shape_encoder=self.embedding_shape_in, 63 | embedding_shape_decoder=self.embedding_shape_out, 64 | embedding_encoder_trainable=self.embedding_encoder_trainable, 65 | embedding_decoder_trainable=self.embedding_decoder_trainable, 66 | embedding_weights_encoder=embedding_weights_encoder, 67 | embedding_weights_decoder=embedding_weights_decoder, 68 | dropout_rate=self.dropout_rate, 69 | max_sequence_len=self.max_sequence_len) 70 | self.transformer.compile(optimizer=self.optimizer) 71 | 72 | def new_train_step(self, 73 | loss_function: Callable[[tf.Tensor], tf.Tensor], 74 | batch_size: int, 75 | apply_gradients=True): 76 | 77 | transformer = self.transformer 78 | optimizer = self.optimizer 79 | 80 | train_step_signature = [ 81 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 82 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 83 | ] 84 | 85 | @tf.function(input_signature=train_step_signature) 86 | def train_step(inp, tar): 87 | tar_inp = tar[:, :-1] 88 | tar_real = tar[:, 1:] 89 | enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp) 90 | with tf.GradientTape() as tape: 91 | predictions, _ = transformer(inp, tar_inp, 92 | True, 93 | enc_padding_mask, 94 | combined_mask, 95 | dec_padding_mask) 96 | loss = loss_function(tar_real, predictions) 97 | if apply_gradients: 98 | gradients = tape.gradient(loss, transformer.trainable_variables) 99 | optimizer.apply_gradients(zip(gradients, transformer.trainable_variables)) 100 | return loss 101 | 102 | return train_step 103 | 104 | def predict(self, text: str) -> str: 105 | return self.predict_vectors(text, '')['predicted_text'] 106 | 107 | def predict_vectors(self, input_text: str, target_text: str) -> Dict[str, Union[str, np.array]]: 108 | text_preprocessed = self.preprocessor((input_text, target_text)) 109 | en_inputs, _ = self.vectorizer(text_preprocessed) 110 | en_inputs = tf.expand_dims(en_inputs, 0) 111 | start_end_seq = self.vectorizer.encode_output( 112 | ' '.join([self.preprocessor.start_token, self.preprocessor.end_token])) 113 | de_start_index, de_end_index = start_end_seq[:1], start_end_seq[-1:] 114 | decoder_output = tf.expand_dims(de_start_index, 0) 115 | output = {'preprocessed_text': text_preprocessed, 116 | 'logits': [], 117 | 'attention_weights': [], 118 | 'predicted_sequence': []} 119 | for _ in range(self.max_prediction_len): 120 | enc_padding_mask, combined_mask, dec_padding_mask = create_masks( 121 | en_inputs, decoder_output) 122 | predictions, attention_weights = self.transformer(en_inputs, 123 | decoder_output, 124 | False, 125 | enc_padding_mask, 126 | combined_mask, 127 | dec_padding_mask) 128 | 129 | predictions = predictions[:, -1:, :] 130 | pred_token_index = tf.cast(tf.argmax(predictions, axis=-1), tf.int32) 131 | decoder_output = tf.concat([decoder_output, pred_token_index], axis=-1) 132 | if pred_token_index != 0: 133 | output['logits'].append(np.squeeze(predictions.numpy())) 134 | output['attention_weights'] = attention_weights 135 | output['predicted_sequence'].append(int(pred_token_index)) 136 | if pred_token_index == de_end_index: 137 | break 138 | output['predicted_text'] = self.vectorizer.decode_output(output['predicted_sequence']) 139 | return output 140 | 141 | def save(self, out_path: str) -> None: 142 | if not 
os.path.exists(out_path): 143 | os.mkdir(out_path) 144 | summarizer_path = os.path.join(out_path, 'summarizer.pkl') 145 | transformer_path = os.path.join(out_path, 'transformer') 146 | with open(summarizer_path, 'wb+') as handle: 147 | pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL) 148 | self.transformer.save_weights(transformer_path, save_format='tf') 149 | 150 | @staticmethod 151 | def load(in_path: str): 152 | summarizer_path = os.path.join(in_path, 'summarizer.pkl') 153 | transformer_path = os.path.join(in_path, 'transformer') 154 | with open(summarizer_path, 'rb') as handle: 155 | summarizer = pickle.load(handle) 156 | summarizer.transformer = Transformer(num_layers=summarizer.num_layers, 157 | num_heads=summarizer.num_heads, 158 | feed_forward_dim=summarizer.feed_forward_dim, 159 | embedding_shape_encoder=summarizer.embedding_shape_in, 160 | embedding_shape_decoder=summarizer.embedding_shape_out, 161 | embedding_encoder_trainable=summarizer.embedding_encoder_trainable, 162 | embedding_decoder_trainable=summarizer.embedding_decoder_trainable, 163 | dropout_rate=summarizer.dropout_rate) 164 | optimizer = TransformerSummarizer.new_optimizer() 165 | summarizer.transformer.compile(optimizer=optimizer) 166 | summarizer.transformer.load_weights(transformer_path) 167 | summarizer.optimizer = summarizer.transformer.optimizer 168 | return summarizer 169 | 170 | @staticmethod 171 | def new_optimizer() -> tf.keras.optimizers.Optimizer: 172 | return tf.keras.optimizers.Adam() 173 | -------------------------------------------------------------------------------- /headliner/model/bert_summarizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Callable 4 | from typing import Dict, Union 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from headliner.model.bert_model import Transformer, create_masks 10 | from headliner.model.summarizer import Summarizer 11 | from headliner.preprocessing.bert_vectorizer import BertVectorizer 12 | from headliner.preprocessing.preprocessor import Preprocessor 13 | from headliner.utils.logger import get_logger 14 | 15 | 16 | class BertSummarizer(Summarizer): 17 | 18 | def __init__(self, 19 | max_prediction_len=20, 20 | num_layers_encoder=1, 21 | num_layers_decoder=1, 22 | num_heads=2, 23 | feed_forward_dim=512, 24 | dropout_rate=0, 25 | embedding_size_encoder=768, 26 | embedding_size_decoder=64, 27 | bert_embedding_encoder=None, 28 | bert_embedding_decoder=None, 29 | embedding_encoder_trainable=True, 30 | embedding_decoder_trainable=True, 31 | max_sequence_len=10000): 32 | 33 | super().__init__() 34 | self.max_prediction_len = max_prediction_len 35 | self.embedding_size_encoder = embedding_size_encoder 36 | self.embedding_size_decoder = embedding_size_decoder 37 | self.num_layers_encoder = num_layers_encoder 38 | self.num_layers_decoder = num_layers_decoder 39 | self.num_heads = num_heads 40 | self.dropout_rate = dropout_rate 41 | self.feed_forward_dim = feed_forward_dim 42 | self.embedding_encoder_trainable = embedding_encoder_trainable 43 | self.embedding_decoder_trainable = embedding_decoder_trainable 44 | self.bert_embedding_encoder = bert_embedding_encoder 45 | self.bert_embedding_decoder = bert_embedding_decoder 46 | self.optimizer_encoder = BertSummarizer.new_optimizer_encoder() 47 | self.optimizer_decoder = BertSummarizer.new_optimizer_decoder() 48 | self.transformer = None 49 | self.embedding_shape_in = None 50 | self.embedding_shape_out = None 
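        # Descriptive note (added): encoder and decoder use separate optimizers with their own
        # warmup schedules (see new_optimizer_encoder / new_optimizer_decoder below), so the
        # pre-trained BERT encoder is fine-tuned with a much smaller peak learning rate than
        # the freshly initialized decoder.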
51 | self.max_sequence_len = max_sequence_len 52 | self.logger = get_logger(__name__) 53 | 54 | def __getstate__(self): 55 | """ Prevents pickle from serializing the transformer and optimizer """ 56 | state = self.__dict__.copy() 57 | del state['transformer'] 58 | del state['logger'] 59 | del state['optimizer_encoder'] 60 | del state['optimizer_decoder'] 61 | return state 62 | 63 | def init_model(self, 64 | preprocessor: Preprocessor, 65 | vectorizer: BertVectorizer, 66 | embedding_weights_encoder=None, 67 | embedding_weights_decoder=None 68 | ) -> None: 69 | self.preprocessor = preprocessor 70 | self.vectorizer = vectorizer 71 | self.embedding_shape_in = (self.vectorizer.encoding_dim, self.embedding_size_encoder) 72 | self.embedding_shape_out = (self.vectorizer.decoding_dim, self.embedding_size_decoder) 73 | self.transformer = Transformer(num_layers_encoder=self.num_layers_encoder, 74 | num_layers_decoder=self.num_layers_decoder, 75 | num_heads=self.num_heads, 76 | feed_forward_dim=self.feed_forward_dim, 77 | embedding_shape_encoder=self.embedding_shape_in, 78 | embedding_shape_decoder=self.embedding_shape_out, 79 | bert_embedding_encoder=self.bert_embedding_encoder, 80 | embedding_encoder_trainable=self.embedding_encoder_trainable, 81 | embedding_decoder_trainable=self.embedding_decoder_trainable, 82 | embedding_weights_encoder=embedding_weights_encoder, 83 | embedding_weights_decoder=embedding_weights_decoder, 84 | dropout_rate=self.dropout_rate, 85 | max_seq_len=self.max_sequence_len) 86 | self.transformer.encoder.compile(optimizer=self.optimizer_encoder) 87 | self.transformer.decoder.compile(optimizer=self.optimizer_decoder) 88 | 89 | def new_train_step(self, 90 | loss_function: Callable[[tf.Tensor], tf.Tensor], 91 | batch_size: int, 92 | apply_gradients=True): 93 | 94 | transformer = self.transformer 95 | optimizer_encoder = self.optimizer_encoder 96 | optimizer_decoder = self.optimizer_decoder 97 | 98 | train_step_signature = [ 99 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 100 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 101 | tf.TensorSpec(shape=(batch_size, None), dtype=tf.int32), 102 | ] 103 | 104 | @tf.function(input_signature=train_step_signature) 105 | def train_step(inp, sent_ids, tar): 106 | tar_inp = tar[:, :-1] 107 | tar_real = tar[:, 1:] 108 | enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp) 109 | with tf.GradientTape(persistent=True) as tape: 110 | predictions, _ = transformer(inp, 111 | sent_ids, 112 | tar_inp, 113 | True, 114 | enc_padding_mask, 115 | combined_mask, 116 | dec_padding_mask) 117 | loss = loss_function(tar_real, predictions) 118 | if apply_gradients: 119 | gradients_decoder = tape.gradient(loss, transformer.decoder.trainable_variables) 120 | gradients_encoder = tape.gradient(loss, transformer.encoder.trainable_variables) 121 | optimizer_decoder.apply_gradients(zip(gradients_decoder, transformer.decoder.trainable_variables)) 122 | optimizer_encoder.apply_gradients(zip(gradients_encoder, transformer.encoder.trainable_variables)) 123 | 124 | return loss 125 | 126 | return train_step 127 | 128 | def predict(self, text: str) -> str: 129 | return self.predict_vectors(text, '')['predicted_text'] 130 | 131 | def predict_vectors(self, input_text: str, target_text: str) -> Dict[str, Union[str, np.array]]: 132 | text_preprocessed = self.preprocessor((input_text, target_text)) 133 | en_inputs, sent_ids, _ = self.vectorizer(text_preprocessed) 134 | en_inputs = tf.expand_dims(en_inputs, 0) 135 | sent_ids = 
tf.expand_dims(sent_ids, 0) 136 | start_end_seq = self.vectorizer.encode_output( 137 | ' '.join([self.preprocessor.start_token, self.preprocessor.end_token])) 138 | de_start_index, de_end_index = start_end_seq[:1], start_end_seq[-1:] 139 | decoder_output = tf.expand_dims(de_start_index, 0) 140 | output = {'preprocessed_text': text_preprocessed, 141 | 'logits': [], 142 | 'attention_weights': [], 143 | 'predicted_sequence': []} 144 | for _ in range(self.max_prediction_len): 145 | enc_padding_mask, combined_mask, dec_padding_mask = create_masks( 146 | en_inputs, decoder_output) 147 | predictions, attention_weights = self.transformer(en_inputs, 148 | sent_ids, 149 | decoder_output, 150 | False, 151 | enc_padding_mask, 152 | combined_mask, 153 | dec_padding_mask) 154 | 155 | predictions = predictions[:, -1:, :] 156 | pred_token_index = tf.cast(tf.argmax(predictions, axis=-1), tf.int32) 157 | decoder_output = tf.concat([decoder_output, pred_token_index], axis=-1) 158 | if pred_token_index != 0: 159 | output['logits'].append(np.squeeze(predictions.numpy())) 160 | output['attention_weights'] = attention_weights 161 | output['predicted_sequence'].append(int(pred_token_index)) 162 | if pred_token_index == de_end_index: 163 | break 164 | output['predicted_text'] = self.vectorizer.decode_output(output['predicted_sequence']) 165 | return output 166 | 167 | def save(self, out_path: str) -> None: 168 | if not os.path.exists(out_path): 169 | os.mkdir(out_path) 170 | summarizer_path = os.path.join(out_path, 'summarizer.pkl') 171 | encoder_path = os.path.join(out_path, 'encoder') 172 | decoder_path = os.path.join(out_path, 'decoder') 173 | with open(summarizer_path, 'wb+') as handle: 174 | pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL) 175 | self.transformer.encoder.save_weights(encoder_path, save_format='tf') 176 | self.transformer.decoder.save_weights(decoder_path, save_format='tf') 177 | 178 | @staticmethod 179 | def load(in_path: str): 180 | summarizer_path = os.path.join(in_path, 'summarizer.pkl') 181 | encoder_path = os.path.join(in_path, 'encoder') 182 | decoder_path = os.path.join(in_path, 'decoder') 183 | with open(summarizer_path, 'rb') as handle: 184 | summarizer = pickle.load(handle) 185 | summarizer.logger = get_logger(__name__) 186 | summarizer.transformer = Transformer(num_layers_encoder=summarizer.num_layers_encoder, 187 | num_layers_decoder=summarizer.num_layers_decoder, 188 | num_heads=summarizer.num_heads, 189 | feed_forward_dim=summarizer.feed_forward_dim, 190 | embedding_shape_encoder=summarizer.embedding_shape_in, 191 | embedding_shape_decoder=summarizer.embedding_shape_out, 192 | bert_embedding_encoder=summarizer.bert_embedding_encoder, 193 | embedding_encoder_trainable=summarizer.embedding_encoder_trainable, 194 | embedding_decoder_trainable=summarizer.embedding_decoder_trainable, 195 | dropout_rate=summarizer.dropout_rate) 196 | optimizer_encoder = BertSummarizer.new_optimizer_encoder() 197 | optimizer_decoder = BertSummarizer.new_optimizer_decoder() 198 | summarizer.transformer.encoder.compile(optimizer=optimizer_encoder) 199 | summarizer.transformer.decoder.compile(optimizer=optimizer_decoder) 200 | summarizer.transformer.encoder.load_weights(encoder_path) 201 | summarizer.transformer.decoder.load_weights(decoder_path) 202 | summarizer.optimizer_encoder = summarizer.transformer.encoder.optimizer 203 | summarizer.optimizer_decoder = summarizer.transformer.decoder.optimizer 204 | return summarizer 205 | 206 | @staticmethod 207 | def 
new_optimizer_decoder(learning_rate_start=0.02, warmup_steps=10000) -> tf.keras.optimizers.Optimizer: 208 | learning_rate = CustomSchedule(warmup_steps=warmup_steps, learning_rate_start=learning_rate_start) 209 | optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.999, ) 210 | return optimizer 211 | 212 | @staticmethod 213 | def new_optimizer_encoder(learning_rate_start=5e-4, warmup_steps=20000) -> tf.keras.optimizers.Optimizer: 214 | learning_rate = CustomSchedule(warmup_steps=warmup_steps, learning_rate_start=learning_rate_start) 215 | optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.999, ) 216 | return optimizer 217 | 218 | 219 | class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): 220 | 221 | def __init__(self, warmup_steps=10000, learning_rate_start=1e-1): 222 | super(CustomSchedule, self).__init__() 223 | self.warmup_steps = warmup_steps 224 | self.learning_rate_start = learning_rate_start 225 | 226 | def __call__(self, step): 227 | arg1 = step ** -0.5 228 | arg2 = step * (self.warmup_steps ** -1.5) 229 | return self.learning_rate_start * tf.math.minimum(arg1, arg2) 230 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Headliner 2 | 3 | [![Build Status](https://dev.azure.com/axelspringerai/Public/_apis/build/status/as-ideas.headliner?branchName=master)](https://dev.azure.com/axelspringerai/Public/_build/latest?definitionId=2&branchName=master) 4 | [![Build Status](https://travis-ci.org/as-ideas/headliner.svg?branch=master)](https://travis-ci.org/as-ideas/headliner) 5 | [![Docs](https://img.shields.io/badge/docs-online-brightgreen)](https://as-ideas.github.io/headliner/) 6 | [![codecov](https://codecov.io/gh/as-ideas/headliner/branch/master/graph/badge.svg)](https://codecov.io/gh/as-ideas/headliner) 7 | [![PyPI Version](https://img.shields.io/pypi/v/headliner)](https://pypi.org/project/headliner/) 8 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/as-ideas/headliner/blob/master/LICENSE) 9 | 10 | Headliner is a sequence modeling library that eases the training and **in particular, the deployment of custom sequence models** 11 | for both researchers and developers. You can very easily deploy your models in a few lines of code. It was originally 12 | built for our own research to generate headlines from [Welt news articles](https://www.welt.de/) (see figure 1). That's why we chose the name, Headliner. 13 | 14 |

18 | Figure 1: One example from our Welt.de headline generator.

20 | 
21 | ## Update 21.01.2020
22 | The library now supports fine-tuning pre-trained BERT models with
23 | custom preprocessing as in [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf)!
24 | 
25 | Check out
26 | [this](https://colab.research.google.com/github/as-ideas/headliner/blob/master/notebooks/BERT_Translation_Example.ipynb)
27 | tutorial on Colab!
28 | 
29 | ## 🧠 Internals
30 | We use sequence-to-sequence (seq2seq) under the hood,
31 | an encoder-decoder framework (see figure 2). We provide a very simple interface to train
32 | and deploy seq2seq models. Although this library was created internally to
33 | generate headlines, you can also use it for **other tasks like machine translation,
34 | text summarization and many more.**
35 | 
36 | 

40 | Figure 2: Encoder-decoder sequence-to-sequence model.
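
To give a quick flavour of that interface, here is a minimal sketch on toy data (the same `Trainer` and `TransformerSummarizer` API is covered in full in the Usage section below):

```python
from headliner.trainer import Trainer
from headliner.model.transformer_summarizer import TransformerSummarizer

# any text-to-text pairs work, e.g. a toy translation-style task
data = [('How are you?', 'Wie geht es dir?'),
        ('I love you.', 'Ich liebe dich.')]

summarizer = TransformerSummarizer(embedding_size=64, max_prediction_len=20)
trainer = Trainer(batch_size=2, steps_per_epoch=100)
trainer.train(summarizer, data, num_epochs=2)
print(summarizer.predict('How are you?'))
```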

42 | 
43 | ### Why Headliner?
44 | 
45 | You may ask: why another seq2seq library? There are a couple of them out there already.
46 | For example, Facebook has [fairseq](https://github.com/pytorch/fairseq), Google has [seq2seq](https://github.com/google/seq2seq)
47 | and there is also [OpenNMT](http://opennmt.net/).
48 | Although those libraries are great, they have a few drawbacks for our use case, e.g. the former doesn't focus much on production,
49 | whereas the Google one is not actively maintained. OpenNMT was the closest one to match our requirements, i.e.
50 | it has a strong focus on production. However, we didn't like that their workflow
51 | (preparing data, training and evaluation) is mainly done via the command line.
52 | They do expose a well-defined API as well, but the complexity there is still too high, with too much custom code
53 | (see their [minimal transformer training example](https://github.com/OpenNMT/OpenNMT-tf/blob/master/examples/library/minimal_transformer_training.py)).
54 | 
55 | Therefore, we built this library for ourselves with the following goals in mind:
56 | 
57 | * Easy-to-use API for training and deployment (only a few lines of code)
58 | * Uses TensorFlow 2.0 with all its new features (`tf.function`, `tf.keras.layers` etc.)
59 | * Modular classes: text preprocessing, modeling, evaluation
60 | * Extensible for different encoder-decoder models
61 | * Works on large text data
62 | 
63 | For more details on the library, read the documentation at: [https://as-ideas.github.io/headliner/](https://as-ideas.github.io/headliner/)
64 | 
65 | Headliner is compatible with Python 3.6 and is distributed under the MIT license.
66 | 
67 | ## ⚙️ Installation
68 | > ⚠️ Before installing Headliner, you need to install TensorFlow as we use this as our deep learning framework. For more
69 | > details on how to install it, have a look at the [TensorFlow installation instructions](https://www.tensorflow.org/install/).
70 | 
71 | Then you can install Headliner itself. There are two ways to install Headliner:
72 | 
73 | * Install Headliner from PyPI (recommended):
74 | 
75 | ```bash
76 | pip install headliner
77 | ```
78 | 
79 | * Install Headliner from the GitHub source:
80 | 
81 | ```bash
82 | git clone https://github.com/as-ideas/headliner.git
83 | cd headliner
84 | python setup.py install
85 | ```
86 | 
87 | ## 📖 Usage
88 | 
89 | ### Training
90 | For training, you need to import one of our provided models or create your own custom one.
Then you need to 91 | create the dataset, a `tuple` of input-output sequences, and then train it: 92 | 93 | ```python 94 | from headliner.trainer import Trainer 95 | from headliner.model.transformer_summarizer import TransformerSummarizer 96 | 97 | data = [('You are the stars, earth and sky for me!', 'I love you.'), 98 | ('You are great, but I have other plans.', 'I like you.')] 99 | 100 | summarizer = TransformerSummarizer(embedding_size=64, max_prediction_len=20) 101 | trainer = Trainer(batch_size=2, steps_per_epoch=100) 102 | trainer.train(summarizer, data, num_epochs=2) 103 | summarizer.save('/tmp/summarizer') 104 | ``` 105 | 106 | ### Prediction 107 | The prediction can be done in a few lines of code: 108 | 109 | ```python 110 | from headliner.model.transformer_summarizer import TransformerSummarizer 111 | 112 | summarizer = TransformerSummarizer.load('/tmp/summarizer') 113 | summarizer.predict('You are the stars, earth and sky for me!') 114 | ``` 115 | 116 | ### Models 117 | Currently available models include a basic encoder-decoder, 118 | an encoder-decoder with Luong attention, the transformer and 119 | a transformer on top of a pre-trained BERT-model: 120 | 121 | ```python 122 | from headliner.model.basic_summarizer import BasicSummarizer 123 | from headliner.model.attention_summarizer import AttentionSummarizer 124 | from headliner.model.transformer_summarizer import TransformerSummarizer 125 | from headliner.model.bert_summarizer import BertSummarizer 126 | 127 | basic_summarizer = BasicSummarizer() 128 | attention_summarizer = AttentionSummarizer() 129 | transformer_summarizer = TransformerSummarizer() 130 | bert_summarizer = BertSummarizer() 131 | ``` 132 | 133 | ### Advanced training 134 | Training using a validation split and model checkpointing: 135 | 136 | ```python 137 | from headliner.model.transformer_summarizer import TransformerSummarizer 138 | from headliner.trainer import Trainer 139 | 140 | train_data = [('You are the stars, earth and sky for me!', 'I love you.'), 141 | ('You are great, but I have other plans.', 'I like you.')] 142 | val_data = [('You are great, but I have other plans.', 'I like you.')] 143 | 144 | summarizer = TransformerSummarizer(num_heads=1, 145 | feed_forward_dim=512, 146 | num_layers=1, 147 | embedding_size=64, 148 | max_prediction_len=50) 149 | trainer = Trainer(batch_size=8, 150 | steps_per_epoch=50, 151 | max_vocab_size_encoder=10000, 152 | max_vocab_size_decoder=10000, 153 | tensorboard_dir='/tmp/tensorboard', 154 | model_save_path='/tmp/summarizer') 155 | 156 | trainer.train(summarizer, train_data, val_data=val_data, num_epochs=3) 157 | ``` 158 | 159 | ### Advanced prediction 160 | Prediction information such as attention weights and logits can be accessed via predict_vectors returning a dictionary: 161 | 162 | ```python 163 | from headliner.model.transformer_summarizer import TransformerSummarizer 164 | 165 | summarizer = TransformerSummarizer.load('/tmp/summarizer') 166 | summarizer.predict_vectors('You are the stars, earth and sky for me!') 167 | ``` 168 | 169 | ### Resume training 170 | A previously trained summarizer can be loaded and then retrained. In this case the data preprocessing and vectorization is loaded from the model. 
171 | 172 | ```python 173 | train_data = [('Some new training data.', 'New data.')] * 10 174 | 175 | summarizer_loaded = TransformerSummarizer.load('/tmp/summarizer') 176 | trainer = Trainer(batch_size=2) 177 | trainer.train(summarizer_loaded, train_data) 178 | summarizer_loaded.save('/tmp/summarizer_retrained') 179 | ``` 180 | 181 | ### Use pretrained GloVe embeddings 182 | Embeddings in GloVe format can be injected into the trainer as follows. Optionally, set the embeddings to non-trainable. 183 | 184 | ```python 185 | trainer = Trainer(embedding_path_encoder='/tmp/embedding_encoder.txt', 186 | embedding_path_decoder='/tmp/embedding_decoder.txt') 187 | 188 | # make sure the embedding size matches the embedding size of the files 189 | summarizer = TransformerSummarizer(embedding_size=64, 190 | embedding_encoder_trainable=False, 191 | embedding_decoder_trainable=False) 192 | ``` 193 | 194 | ### Custom preprocessing 195 | A model can be initialized with custom preprocessing and tokenization: 196 | 197 | ```python 198 | from headliner.preprocessing.preprocessor import Preprocessor 199 | 200 | train_data = [('Some inputs.', 'Some outputs.')] * 10 201 | 202 | preprocessor = Preprocessor(filter_pattern='', 203 | lower_case=True, 204 | hash_numbers=False) 205 | train_prep = [preprocessor(t) for t in train_data] 206 | inputs_prep = [t[0] for t in train_prep] 207 | targets_prep = [t[1] for t in train_prep] 208 | 209 | # Build tf subword tokenizers. Other custom tokenizers can be implemented 210 | # by subclassing headliner.preprocessing.Tokenizer 211 | from tensorflow_datasets.core.features.text import SubwordTextEncoder 212 | tokenizer_input = SubwordTextEncoder.build_from_corpus( 213 | inputs_prep, target_vocab_size=2**13, reserved_tokens=[preprocessor.start_token, preprocessor.end_token]) 214 | tokenizer_target = SubwordTextEncoder.build_from_corpus( 215 | targets_prep, target_vocab_size=2**13, reserved_tokens=[preprocessor.start_token, preprocessor.end_token]) 216 | from headliner.preprocessing.vectorizer import Vectorizer 217 | vectorizer = Vectorizer(tokenizer_input, tokenizer_target) 218 | summarizer = TransformerSummarizer(embedding_size=64, max_prediction_len=50) 219 | summarizer.init_model(preprocessor, vectorizer) 220 | 221 | trainer = Trainer(batch_size=2) 222 | trainer.train(summarizer, train_data, num_epochs=3) 223 | ``` 224 | 225 | 226 | ### Use pre-trained BERT embeddings 227 | Pre-trained BERT models can be included as follows. 228 | Be aware that pre-trained BERT models are expensive to train and require custom preprocessing!
229 | 230 | ```python 231 | from headliner.preprocessing.bert_preprocessor import BertPreprocessor 232 | from spacy.lang.en import English 233 | 234 | train_data = [('Some inputs.', 'Some outputs.')] * 10 235 | 236 | # use BERT-specific start and end tokens 237 | preprocessor = BertPreprocessor(nlp=English()) 238 | train_prep = [preprocessor(t) for t in train_data] 239 | targets_prep = [t[1] for t in train_prep] 240 | 241 | from headliner.preprocessing.bert_vectorizer import BertVectorizer 242 | from tensorflow_datasets.core.features.text import SubwordTextEncoder 243 | from transformers import BertTokenizer 244 | from headliner.model.bert_summarizer import BertSummarizer 245 | 246 | # Use a pre-trained BERT embedding and BERT tokenizer for the encoder 247 | tokenizer_input = BertTokenizer.from_pretrained('bert-base-uncased') 248 | tokenizer_target = SubwordTextEncoder.build_from_corpus( 249 | targets_prep, target_vocab_size=2**13, reserved_tokens=[preprocessor.start_token, preprocessor.end_token]) 250 | 251 | vectorizer = BertVectorizer(tokenizer_input, tokenizer_target) 252 | summarizer = BertSummarizer(num_heads=2, 253 | feed_forward_dim=512, 254 | num_layers_encoder=0, 255 | num_layers_decoder=4, 256 | bert_embedding_encoder='bert-base-uncased', 257 | embedding_size_encoder=768, 258 | embedding_size_decoder=768, 259 | dropout_rate=0.1, 260 | max_prediction_len=50) 261 | summarizer.init_model(preprocessor, vectorizer) 262 | 263 | trainer = Trainer(batch_size=2) 264 | trainer.train(summarizer, train_data, num_epochs=3) 265 | ``` 266 | 267 | 268 | ### Training on large datasets 269 | Large datasets can be handled by using an iterator: 270 | 271 | ```python 272 | def read_data_iteratively(): 273 |     return (('Some inputs.', 'Some outputs.') for _ in range(1000)) 274 | 275 | class DataIterator: 276 |     def __iter__(self): 277 |         return read_data_iteratively() 278 | 279 | data_iter = DataIterator() 280 | 281 | summarizer = TransformerSummarizer(embedding_size=10, max_prediction_len=20) 282 | trainer = Trainer(batch_size=16, steps_per_epoch=1000) 283 | trainer.train(summarizer, data_iter, num_epochs=3) 284 | ``` 285 | 286 | ## 🤝 Contribute 287 | We welcome all kinds of contributions, such as new models, new examples and more. 288 | See the [Contribution](CONTRIBUTING.md) guide for more details. 289 | 290 | ## 📝 Cite this work 291 | Please cite Headliner in your publications if it was useful for your research. Here is an example BibTeX entry: 292 | ```BibTeX 293 | @misc{axelspringerai2019headliners, 294 | title={Headliner}, 295 | author={Christian Schäfer and Dat Tran}, 296 | year={2019}, 297 | howpublished={\url{https://github.com/as-ideas/headliner}}, 298 | } 299 | ``` 300 | 301 | ## 🏗 Maintainers 302 | * Christian Schäfer, github: [cschaefer26](https://github.com/cschaefer26) 303 | * Dat Tran, github: [datitran](https://github.com/datitran) 304 | 305 | ## © Copyright 306 | 307 | See [LICENSE](LICENSE) for details.
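### Creating a Trainer from a config file

The `headliner/trainer.py` source listed below also provides `Trainer.from_config`, which builds a trainer from a YAML file instead of keyword arguments. The following is only a minimal sketch: the keys mirror what `from_config` reads, but the values and the file path are illustrative and are not taken from the repository's shipped `config/trainer_config.yaml`.

```yaml
# illustrative values only - adjust to your setup
batch_size: 16
max_vocab_size_encoder: 200000
max_vocab_size_decoder: 200000
embedding_path_encoder: null
embedding_path_decoder: null
steps_per_epoch: 500
tensorboard_dir: /tmp/tensorboard
model_save_path: /tmp/summarizer
use_bucketing: False
shuffle_buffer_size: 100000
bucketing_buffer_size_batches: 10000
bucketing_batches_to_bucket: 100
steps_to_log: 10
logging_level: info
max_input_len: null
max_output_len: null
```

```python
from headliner.trainer import Trainer

# assumes the YAML above was saved to /tmp/trainer_config.yaml;
# keyword arguments not covered by the file can still be passed explicitly
trainer = Trainer.from_config('/tmp/trainer_config.yaml', num_print_predictions=3)
```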
308 | 309 | ## References 310 | 311 | [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) 312 | 313 | [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025) 314 | 315 | ## Acknowledgements 316 | 317 | https://www.tensorflow.org/tutorials/text/transformer 318 | 319 | https://github.com/huggingface/transformers 320 | 321 | https://machinetalk.org/2019/03/29/neural-machine-translation-with-attention-mechanism/ 322 | 323 | -------------------------------------------------------------------------------- /headliner/trainer.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | import tempfile 5 | from collections import Counter 6 | from typing import Tuple, List, Iterable, Callable, Dict, Union 7 | 8 | import tensorflow as tf 9 | import yaml 10 | 11 | from headliner.callbacks.evaluation_callback import EvaluationCallback 12 | from headliner.callbacks.model_checkpoint_callback import ModelCheckpointCallback 13 | from headliner.callbacks.tensorboard_callback import TensorboardCallback 14 | from headliner.callbacks.validation_callback import ValidationCallback 15 | from headliner.embeddings import read_embedding, embedding_to_matrix 16 | from headliner.evaluation.scorer import Scorer 17 | from headliner.losses import masked_crossentropy 18 | from headliner.model.bert_summarizer import BertSummarizer 19 | from headliner.model.summarizer import Summarizer 20 | from headliner.preprocessing.bucket_generator import BucketGenerator 21 | from headliner.preprocessing.dataset_generator import DatasetGenerator 22 | from headliner.preprocessing.keras_tokenizer import KerasTokenizer 23 | from headliner.preprocessing.preprocessor import Preprocessor 24 | from headliner.preprocessing.vectorizer import Vectorizer 25 | from headliner.utils.logger import get_logger 26 | 27 | START_TOKEN = '<start>'  # token literals assumed ('<start>', '<end>', '<unk>'); the angle-bracket strings were stripped from this listing 28 | END_TOKEN = '<end>' 29 | OOV_TOKEN = '<unk>' 30 | 31 | 32 | class Trainer: 33 | 34 | def __init__(self, 35 | max_input_len=None, 36 | max_output_len=None, 37 | batch_size=16, 38 | max_vocab_size_encoder=200000, 39 | max_vocab_size_decoder=200000, 40 | embedding_path_encoder=None, 41 | embedding_path_decoder=None, 42 | steps_per_epoch=500, 43 | tensorboard_dir=None, 44 | model_save_path=None, 45 | shuffle_buffer_size=100000, 46 | use_bucketing=False, 47 | bucketing_buffer_size_batches=10000, 48 | bucketing_batches_to_bucket=100, 49 | logging_level=logging.INFO, 50 | num_print_predictions=5, 51 | steps_to_log=10, 52 | preprocessor: Union[Preprocessor, None] = None) -> None: 53 | """ 54 | Initializes the trainer. 55 | 56 | Args: 57 | max_input_len (optional): Maximum length of input sequences, longer sequences will be truncated. 58 | max_output_len (optional): Maximum length of output sequences, longer sequences will be truncated. 59 | batch_size: Size of mini-batches for stochastic gradient descent. 60 | max_vocab_size_encoder: Maximum number of unique tokens to consider for encoder embeddings. 61 | max_vocab_size_decoder: Maximum number of unique tokens to consider for decoder embeddings. 62 | embedding_path_encoder: Path to embedding file for the encoder. 63 | embedding_path_decoder: Path to embedding file for the decoder. 64 | steps_per_epoch: Number of steps to train until callbacks are invoked. 65 | tensorboard_dir: Directory for saving tensorboard logs. 66 | model_save_path: Directory for saving the best model.
67 | shuffle_buffer_size: Size of the buffer for shuffling the files before batching. 68 | use_bucketing: Whether to bucket the sequences by length to reduce the amount of padding. 69 | bucketing_buffer_size_batches: Number of batches to buffer when bucketing sequences. 70 | bucketing_batches_to_bucket: Number of buffered batches from which sequences are collected for bucketing. 71 | logging_level: Level of logging to use, e.g. logging.INFO or logging.DEBUG. 72 | num_print_predictions: Number of sample predictions to print in each evaluation. 73 | steps_to_log: Number of steps to wait for logging output. 74 | preprocessor (optional): custom preprocessor, if None a standard preprocessor will be created. 75 | """ 76 | 77 | self.max_input_len = max_input_len 78 | self.max_output_len = max_output_len 79 | self.batch_size = batch_size 80 | self.max_vocab_size_encoder = max_vocab_size_encoder 81 | self.max_vocab_size_decoder = max_vocab_size_decoder 82 | self.bucketing_buffer_size_batches = bucketing_buffer_size_batches 83 | self.bucketing_batches_to_bucket = bucketing_batches_to_bucket 84 | self.embedding_path_encoder = embedding_path_encoder 85 | self.embedding_path_decoder = embedding_path_decoder 86 | self.steps_per_epoch = steps_per_epoch 87 | self.tensorboard_dir = tensorboard_dir 88 | self.model_save_path = model_save_path 89 | self.loss_function = masked_crossentropy 90 | self.use_bucketing = use_bucketing 91 | self.shuffle_buffer_size = None if use_bucketing else shuffle_buffer_size 92 | 93 | self.bucket_generator = None 94 | if use_bucketing: 95 | self.bucket_generator = BucketGenerator(element_length_function=lambda vecs: len(vecs[0]), 96 | batch_size=self.batch_size, 97 | buffer_size_batches=self.bucketing_buffer_size_batches, 98 | batches_to_bucket=self.bucketing_batches_to_bucket, 99 | shuffle=True, 100 | seed=42) 101 | self.logger = get_logger(__name__) 102 | self.logger.setLevel(logging_level) 103 | self.num_print_predictions = num_print_predictions 104 | self.steps_to_log = steps_to_log 105 | self.preprocessor = preprocessor or Preprocessor(start_token=START_TOKEN, end_token=END_TOKEN) 106 | 107 | @classmethod 108 | def from_config(cls, file_path, **kwargs): 109 | with open(file_path, 'r', encoding='utf-8') as f: 110 | cfg = yaml.load(f) 111 | batch_size = cfg['batch_size'] 112 | max_vocab_size_encoder = cfg['max_vocab_size_encoder'] 113 | max_vocab_size_decoder = cfg['max_vocab_size_decoder'] 114 | glove_path_encoder = cfg['embedding_path_encoder'] 115 | glove_path_decoder = cfg['embedding_path_decoder'] 116 | steps_per_epoch = cfg['steps_per_epoch'] 117 | tensorboard_dir = cfg['tensorboard_dir'] 118 | model_save_path = cfg['model_save_path'] 119 | use_bucketing = cfg['use_bucketing'] 120 | shuffle_buffer_size = cfg['shuffle_buffer_size'] 121 | bucketing_buffer_size_batches = cfg['bucketing_buffer_size_batches'] 122 | bucketing_batches_to_bucket = cfg['bucketing_batches_to_bucket'] 123 | steps_to_log = cfg['steps_to_log'] 124 | logging_level = logging.INFO 125 | logging_level_string = cfg['logging_level'] 126 | max_input_len = cfg['max_input_len'] 127 | max_output_len = cfg['max_output_len'] 128 | if logging_level_string == 'debug': 129 | logging_level = logging.DEBUG 130 | elif logging_level_string == 'error': 131 | logging_level = logging.ERROR 132 | return Trainer(batch_size=batch_size, 133 | max_vocab_size_encoder=max_vocab_size_encoder, 134 | max_vocab_size_decoder=max_vocab_size_decoder, 135 | embedding_path_encoder=glove_path_encoder, 136 | 
embedding_path_decoder=glove_path_decoder, 137 | steps_per_epoch=steps_per_epoch, 138 | tensorboard_dir=tensorboard_dir, 139 | model_save_path=model_save_path, 140 | use_bucketing=use_bucketing, 141 | shuffle_buffer_size=shuffle_buffer_size, 142 | bucketing_buffer_size_batches=bucketing_buffer_size_batches, 143 | bucketing_batches_to_bucket=bucketing_batches_to_bucket, 144 | logging_level=logging_level, 145 | steps_to_log=steps_to_log, 146 | max_input_len=max_input_len, 147 | max_output_len=max_output_len, 148 | **kwargs) 149 | 150 | def train(self, 151 | summarizer: Summarizer, 152 | train_data: Iterable[Tuple[str, str]], 153 | val_data: Iterable[Tuple[str, str]] = None, 154 | num_epochs=2500, 155 | scorers: Dict[str, Scorer] = None, 156 | callbacks: List[tf.keras.callbacks.Callback] = None) -> None: 157 | """ 158 | Trains a summarizer or resumes training of a previously initialized summarizer. 159 | 160 | Args: 161 | summarizer: Model to train, can be either a freshly created model or a loaded model. 162 | train_data: Data to train the model on. 163 | val_data (optional): Validation data. 164 | num_epochs: Number of epochs to train. 165 | scorers (optional): Dictionary with {score_name, scorer} to add validation scores to the logs. 166 | callbacks (optional): Additional custom callbacks. 167 | """ 168 | if summarizer.preprocessor is None or summarizer.vectorizer is None: 169 | self.logger.info('training a bare model, preprocessing data to init model...') 170 | self._init_model(summarizer, train_data) 171 | else: 172 | self.logger.info('training an already initialized model...') 173 | vectorize_train = self._vectorize_data(preprocessor=summarizer.preprocessor, 174 | vectorizer=summarizer.vectorizer, 175 | bucket_generator=self.bucket_generator) 176 | vectorize_val = self._vectorize_data(preprocessor=summarizer.preprocessor, 177 | vectorizer=summarizer.vectorizer, 178 | bucket_generator=None) 179 | train_gen, val_gen = self._create_dataset_generators(summarizer) 180 | train_dataset = train_gen(lambda: vectorize_train(train_data)) 181 | val_dataset = val_gen(lambda: vectorize_val(val_data)) 182 | 183 | train_callbacks = callbacks or [] 184 | if val_data is not None: 185 | train_callbacks.extend([ 186 | EvaluationCallback(summarizer=summarizer, 187 | scorers=scorers or {}, 188 | val_data=val_data, 189 | print_num_examples=self.num_print_predictions), 190 | ValidationCallback(summarizer=summarizer, 191 | val_dataset=val_dataset, 192 | loss_function=self.loss_function, 193 | batch_size=self.batch_size), 194 | ]) 195 | loss_monitor = 'loss_val' if val_data is not None else 'loss' 196 | train_callbacks.append( 197 | ModelCheckpointCallback(file_path=self.model_save_path, 198 | summarizer=summarizer, 199 | monitor=loss_monitor, 200 | mode='min')) 201 | 202 | if self.tensorboard_dir is not None: 203 | tb_callback = TensorboardCallback(log_dir=self.tensorboard_dir) 204 | train_callbacks.append(tb_callback) 205 | logs = {} 206 | epoch_count, batch_count, train_losses = 0, 0, [] 207 | train_step = summarizer.new_train_step(self.loss_function, 208 | self.batch_size, 209 | apply_gradients=True) 210 | while epoch_count < num_epochs: 211 | for train_batch in train_dataset.take(-1): 212 | batch_count += 1 213 | current_loss = train_step(*train_batch) 214 | train_losses.append(current_loss) 215 | logs['loss'] = float(sum(train_losses)) / len(train_losses) 216 | if batch_count % self.steps_to_log == 0: 217 | self.logger.info('epoch {epoch}, batch {batch}, ' 218 | 'logs: {logs}'.format(epoch=epoch_count, 219 
| batch=batch_count, 220 | logs=logs)) 221 | if batch_count % self.steps_per_epoch == 0: 222 | train_losses.clear() 223 | for callback in train_callbacks: 224 | callback.on_epoch_end(epoch_count, logs=logs) 225 | epoch_count += 1 226 | if epoch_count >= num_epochs: 227 | break 228 | 229 | self.logger.info('finished iterating over dataset, total batches: {}'.format(batch_count)) 230 | if batch_count == 0: 231 | raise ValueError('Iterating over the dataset yielded zero batches!') 232 | 233 | def _init_model(self, 234 | summarizer: Summarizer, 235 | train_data: Iterable[Tuple[str, str]]) -> None: 236 | 237 | tokenizer_encoder, tokenizer_decoder = self._create_tokenizers(train_data) 238 | self.logger.info('vocab encoder: {vocab_enc}, vocab decoder: {vocab_dec}'.format( 239 | vocab_enc=tokenizer_encoder.vocab_size, vocab_dec=tokenizer_decoder.vocab_size)) 240 | vectorizer = Vectorizer(tokenizer_encoder, 241 | tokenizer_decoder, 242 | max_input_len=self.max_input_len, 243 | max_output_len=self.max_output_len) 244 | embedding_weights_encoder, embedding_weights_decoder = None, None 245 | 246 | if self.embedding_path_encoder is not None: 247 | self.logger.info('loading encoder embedding from {}'.format(self.embedding_path_encoder)) 248 | embedding = read_embedding(self.embedding_path_encoder, summarizer.embedding_size) 249 | embedding_weights_encoder = embedding_to_matrix(embedding=embedding, 250 | token_index=tokenizer_encoder.token_index, 251 | embedding_dim=summarizer.embedding_size) 252 | if self.embedding_path_decoder is not None: 253 | self.logger.info('loading decoder embedding from {}'.format(self.embedding_path_decoder)) 254 | embedding = read_embedding(self.embedding_path_decoder, summarizer.embedding_size) 255 | embedding_weights_decoder = embedding_to_matrix(embedding=embedding, 256 | token_index=tokenizer_decoder.token_index, 257 | embedding_dim=summarizer.embedding_size) 258 | summarizer.init_model(preprocessor=self.preprocessor, 259 | vectorizer=vectorizer, 260 | embedding_weights_encoder=embedding_weights_encoder, 261 | embedding_weights_decoder=embedding_weights_decoder) 262 | 263 | def _vectorize_data(self, 264 | preprocessor: Preprocessor, 265 | vectorizer: Vectorizer, 266 | bucket_generator: BucketGenerator = None) \ 267 | -> Callable[[Iterable[Tuple[str, str]]], 268 | Iterable[Tuple[List[int], List[int]]]]: 269 | 270 | def vectorize(raw_data: Iterable[Tuple[str, str]]): 271 | data_preprocessed = (preprocessor(d) for d in raw_data) 272 | data_vectorized = (vectorizer(d) for d in data_preprocessed) 273 | if bucket_generator is None: 274 | return data_vectorized 275 | else: 276 | return bucket_generator(data_vectorized) 277 | 278 | return vectorize 279 | 280 | def _create_tokenizers(self, 281 | train_data: Iterable[Tuple[str, str]] 282 | ) -> Tuple[KerasTokenizer, KerasTokenizer]: 283 | 284 | self.logger.info('fitting tokenizers...') 285 | counter_encoder = Counter() 286 | counter_decoder = Counter() 287 | train_preprocessed = (self.preprocessor(d) for d in train_data) 288 | for text_encoder, text_decoder in train_preprocessed: 289 | counter_encoder.update(text_encoder.split()) 290 | counter_decoder.update(text_decoder.split()) 291 | tokens_encoder = {token_count[0] for token_count 292 | in counter_encoder.most_common(self.max_vocab_size_encoder)} 293 | tokens_decoder = {token_count[0] for token_count 294 | in counter_decoder.most_common(self.max_vocab_size_decoder)} 295 | tokens_encoder.update({self.preprocessor.start_token, self.preprocessor.end_token}) 296 | 
tokens_decoder.update({self.preprocessor.start_token, self.preprocessor.end_token}) 297 | tokenizer_encoder = KerasTokenizer(oov_token=OOV_TOKEN, lower=False, filters='') 298 | tokenizer_decoder = KerasTokenizer(oov_token=OOV_TOKEN, lower=False, filters='') 299 | tokenizer_encoder.fit(sorted(list(tokens_encoder))) 300 | tokenizer_decoder.fit(sorted(list(tokens_decoder))) 301 | return tokenizer_encoder, tokenizer_decoder 302 | 303 | def _create_dataset_generators(self, summarizer): 304 | data_rank = 3 if isinstance(summarizer, BertSummarizer) else 2 305 | train_gen = DatasetGenerator(batch_size=self.batch_size, 306 | shuffle_buffer_size=self.shuffle_buffer_size, 307 | rank=data_rank) 308 | val_gen = DatasetGenerator(batch_size=self.batch_size, 309 | shuffle_buffer_size=None, 310 | rank=data_rank) 311 | return train_gen, val_gen 312 | --------------------------------------------------------------------------------