")
37 |
38 | def testGetXUnkToken(self):
39 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
40 | result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[], topP=0.5)
41 | self.assertFalse(result)
42 | result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[], freqK=10)
43 | self.assertFalse(result)
44 | result = tfkit.utility.tok.get_freqK_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'],
45 | freqK=1)
46 | self.assertTrue(len(result) > 0)
47 | result = tfkit.utility.tok.get_topP_unk_token(tokenizer, file_paths=[self.DATASET_DIR + '/unk_tok.csv'],
48 | topP=0.9)
49 | self.assertTrue(len(result) > 0)
50 |
51 | def testHandleExceed(self):
52 | tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_tiny')
53 | seq = " ".join([str(_) for _ in range(100)])
54 | maxlen = 50
55 | for mode in ['noop', 'remove', 'slide', 'start_slice', 'end_slice']:
56 | rlt, _ = tfkit.utility.tok.handle_exceed(tokenizer, seq, maxlen, mode=mode)
57 | if mode == 'remove':
58 | self.assertTrue(len(rlt) == 0)
59 | if mode == 'slide':
60 | self.assertTrue(len(rlt) > 1)
61 | for i in rlt:
62 | print(i)
63 | if mode != 'noop':
64 | self.assertTrue(len(i) == 50)
65 |
--------------------------------------------------------------------------------
/tfkit/task/qa/preprocessor.py:
--------------------------------------------------------------------------------
1 | import nlp2
2 | import tfkit.utility.tok as tok
3 | import torch
4 | from tfkit.utility.data_filereader import get_qa_data_from_file
5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
6 |
7 |
8 | class Preprocessor(GeneralNLPPreprocessor):
9 | def read_file_to_data(self, path):
10 | return get_qa_data_from_file(path)
11 |
12 | def preprocess_component_prepare_input(self, item):
13 | mapping_index = []
14 | pos = 1 # cls as start 0
15 | input_text_list = nlp2.split_sentence_to_array(item['input'])
16 | for i in input_text_list:
17 | for _ in range(len(self.tokenizer.tokenize(i))):
18 |                 if _ < 1:  # record only the first subword of each character
19 | mapping_index.append({'char': i, 'pos': pos})
20 | pos += 1
21 | item['mapping_index'] = mapping_index
22 | return item
23 |
24 | def preprocess_component_convert_to_id(self, item, **param_dict):
25 | input_text, target = item['input'], item.get('target', None)
26 | tokenized_input = [tok.tok_begin(self.tokenizer)] + input_text + [tok.tok_sep(self.tokenizer)]
27 | input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input)
28 | start_index = item['input_index'][0]
29 | end_index = item['input_index'][1]
30 | if target:
31 | item['target'] = [0, 0]
32 | target_start, target_end = target
33 | ori_start = target_start = int(target_start)
34 | ori_end = target_end = int(target_end)
35 | ori_ans = tokenized_input[ori_start:ori_end]
36 | target_start -= start_index
37 | target_end -= start_index
38 | # print("target_start", self.parameters['maxlen'],item['mapping_index'][target_start]['pos'],ori_end)
39 | # if item['mapping_index'][target_start]['pos'] > ori_end or target_start < 0 \
40 | # or target_start > self.parameters['maxlen'] \
41 | # or target_end >= self.parameters['maxlen'] - 2:
42 | # target_start = 0
43 | # target_end = 0
44 | # else:
45 | for map_pos, map_tok in enumerate(item['mapping_index'][start_index:]):
46 | if start_index < map_tok['pos'] <= end_index:
47 | length = len(self.tokenizer.tokenize(map_tok['char']))
48 | if map_pos < ori_start:
49 | target_start += length - 1
50 | if map_pos < ori_end:
51 | target_end += length - 1
52 | item['target'] = [target_start + 1, target_end + 1] # cls +1
53 |
54 | item['input'] = input_id
55 | item['mask'] = [1] * len(input_id)
56 | item['raw_input'] = tokenized_input
57 | yield item
58 |
59 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
60 | row_dict = {
61 | 'input': item['input'],
62 | 'mask': item['mask']
63 | }
64 | if 'target' in item:
65 | row_dict['target'] = item['target']
66 | return {key: torch.tensor(value) for key, value in row_dict.items()}
67 |
--------------------------------------------------------------------------------
/tfkit/task/once/preprocessor.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility.tok as tok
2 | from tfkit.utility.data_filereader import get_gen_data_from_file
3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
4 |
5 |
6 | class Preprocessor(GeneralNLPPreprocessor):
7 | def read_file_to_data(self, path):
8 | return get_gen_data_from_file(path)
9 |
10 | def set_global_parameters(self):
11 | self.tokenize_target = True
12 |
13 | def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict):
14 | likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
15 | tokenized_input, tokenized_target, n_target = item['input'], item.get('target', None), item.get('ntarget', None)
16 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
17 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)}
18 | if "neg" in likelihood:
19 | # formatting neg data in csv
20 | if n_target is None:
21 | ntext_arr = [
22 | tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)]
23 | elif tok.tok_sep(self.tokenizer) in n_target:
24 | ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))]
25 | else:
26 | ntext_arr = [n_target.strip()]
27 | for neg_text in ntext_arr:
28 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
29 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
30 |                    'ntarget': self.tokenizer.encode(neg_text)}  # neg_text is a raw string; encode to ids as in the seq2seq preprocessor
31 |
32 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
33 | tok_pad = tok.tok_pad_id(tokenizer)
34 | tok_bos = tok.tok_begin_id(tokenizer)
35 | tok_sep = tok.tok_sep_id(tokenizer)
36 | tok_mask = tok.tok_mask_id(tokenizer)
37 |
38 | row_dict = {}
39 | t_input_id = item['input']
40 | encoder_mask_id = [1] * (len(t_input_id))
41 | encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id)))
42 | target_start = len(t_input_id)
43 | target_end = maxlen
44 | target_length = target_end - target_start
45 | t_input_id.extend([tok_pad] * (maxlen - len(t_input_id)))
46 | if 'target' in item and item['target'] is not None:
47 | target = item['target'] + [tok_sep]
48 | target.extend([-1] * (maxlen - len(target)))
49 | row_dict['target'] = target
50 | row_dict['ntarget'] = [-1] * maxlen
51 |         if 'ntarget' in item and len(item['ntarget']) > 0:  # ntarget is a list of token ids here
52 | tokenized_ntarget_id = item['ntarget']
53 | tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
54 | if len(tokenized_ntarget_id) <= maxlen:
55 | row_dict['ntarget'] = tokenized_ntarget_id
56 |
57 | input_length = min(maxlen, target_start * 3)
58 | row_dict['input'] = t_input_id
59 | row_dict['mask'] = encoder_mask_id
60 | row_dict['start'] = target_start
61 | row_dict['end'] = maxlen
62 | row_dict['input_length'] = input_length
63 | row_dict['target_length'] = target_length
64 | return row_dict
65 |
--------------------------------------------------------------------------------
/tfkit/utility/base_model.py:
--------------------------------------------------------------------------------
1 | """Base model class for all TFKit tasks."""
2 |
3 | from abc import ABC, abstractmethod
4 | from typing import Any, Callable, Dict, Optional, Union
5 |
6 | import torch
7 | from torch import nn
8 | from transformers import PreTrainedModel, PreTrainedTokenizer
9 |
10 |
11 | class BaseTFKitModel(nn.Module, ABC):
12 | """Base class for all TFKit task models.
13 |
14 | Provides common functionality for all TFKit models including:
15 | - Consistent initialization patterns
16 | - Predictor setup
17 | - Cache management
18 | - Utility methods for model dimensions
19 | """
20 |
21 | def __init__(self, tokenizer: PreTrainedTokenizer, pretrained: PreTrainedModel,
22 | maxlen: int = 512, **kwargs) -> None:
23 | """Initialize the base model.
24 |
25 | Args:
26 | tokenizer: The tokenizer for text processing
27 | pretrained: The pretrained transformer model
28 | maxlen: Maximum sequence length
29 | **kwargs: Additional arguments passed to subclasses
30 | """
31 | super().__init__()
32 | self.tokenizer = tokenizer
33 | self.pretrained = pretrained
34 | self.maxlen = maxlen
35 |         self.vocab_size = max(pretrained.config.vocab_size, len(tokenizer))
36 |
37 | # Initialize predictor - to be implemented by subclasses
38 | self.predictor: Optional[Any] = None
39 | self.predict: Optional[Callable] = None
40 |
41 | def _setup_predictor(self, predictor_class: type, preprocessor_class: type) -> None:
42 | """Setup predictor and prediction method.
43 |
44 | Args:
45 | predictor_class: The predictor class to instantiate
46 | preprocessor_class: The preprocessor class to use with the predictor
47 | """
48 | predictor = predictor_class(self, preprocessor_class)
49 | self.predictor = predictor
50 | self.predict = predictor.predict
51 |
52 | def clean_cache(self) -> None:
53 | """Clean model cache - default implementation."""
54 | if hasattr(self, 'encoder_outputs'):
55 | self.encoder_outputs = None
56 | if hasattr(self, 'past_key_values'):
57 | self.past_key_values = None
58 |
59 | @abstractmethod
60 | def forward(self, batch_data: Dict[str, Any], eval: bool = False,
61 | **kwargs) -> Union[torch.Tensor, Dict[str, Any]]:
62 | """Forward pass - must be implemented by subclasses.
63 |
64 | Args:
65 | batch_data: Dictionary containing batch data
66 | eval: Whether in evaluation mode
67 | **kwargs: Additional arguments
68 |
69 | Returns:
70 | Loss tensor during training or results dictionary during evaluation
71 | """
72 | pass
73 |
74 | def get_hidden_size(self) -> int:
75 | """Get the hidden size of the pretrained model.
76 |
77 | Returns:
78 | Hidden size dimension
79 | """
80 | return self.pretrained.config.hidden_size
81 |
82 | def get_vocab_size(self) -> int:
83 | """Get the vocabulary size.
84 |
85 | Returns:
86 | Vocabulary size
87 | """
88 | return self.vocab_size
89 |
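90 | 
91 | # Illustrative sketch (hypothetical, not used by any task): a concrete model
92 | # built on BaseTFKitModel needs only a task head, an optional predictor, and a
93 | # ``forward`` implementation; compare tfkit/task/once/model.py.
94 | class _ExampleTaskModel(BaseTFKitModel):
95 |     def __init__(self, tokenizer: PreTrainedTokenizer, pretrained: PreTrainedModel,
96 |                  maxlen: int = 512, **kwargs) -> None:
97 |         super().__init__(tokenizer, pretrained, maxlen, **kwargs)
98 |         # Project hidden states to vocabulary logits.
99 |         self.head = nn.Linear(self.get_hidden_size(), self.get_vocab_size())
100 | 
101 |     def forward(self, batch_data, eval=False, **kwargs):
102 |         hidden = self.pretrained(torch.as_tensor(batch_data['input']),
103 |                                  attention_mask=torch.as_tensor(batch_data['mask']))[0]
104 |         return self.head(hidden)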
--------------------------------------------------------------------------------
/tfkit/task/once/model.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn.functional import softmax
6 |
7 | from tfkit.task.once import Preprocessor
8 | from tfkit.utility.base_model import BaseTFKitModel
9 | from tfkit.utility.loss import *
10 | from tfkit.utility.predictor import NonAutoRegressivePredictor
11 | from tfkit.utility.tok import *
12 |
13 |
14 | class Model(BaseTFKitModel):
15 | """Once generation model for non-autoregressive text generation."""
16 |
17 | def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None, **kwargs):
18 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
19 | self.model = nn.Linear(self.get_hidden_size(), self.get_vocab_size())
20 | self._setup_predictor(NonAutoRegressivePredictor, Preprocessor)
21 |
22 | def forward(self, batch_data, eval=False, max_return=1, **kwargs):
23 | inputs = batch_data['input']
24 | masks = batch_data['mask']
25 | starts = batch_data['start']
26 | ends = batch_data['end']
27 | tokens_tensor = torch.as_tensor(inputs)
28 | mask_tensors = torch.as_tensor(masks)
29 |
30 | output = self.pretrained(tokens_tensor, attention_mask=mask_tensors)
31 | sequence_output = output[0]
32 | prediction_scores = self.model(sequence_output)
33 |
34 | if eval:
35 | result_dict = {
36 | 'max_item': [],
37 | 'label_prob': defaultdict(list),
38 | 'prob_list': []
39 | }
40 | start = batch_data['start'][0]
41 | stop = False
42 |             topK_ids = [[] for _ in range(max_return)]  # independent lists; [[]] * n would alias one list
43 | topK_probs = [1] * max_return
44 | while start < self.maxlen and not stop:
45 | softmax_score = softmax(prediction_scores[0][start], dim=0)
46 | max_item_id = torch.argmax(softmax_score, -1).item()
47 | max_item_prob = softmax_score[max_item_id].item()
48 | if max_return > 1:
49 | topK = torch.topk(softmax_score, max_return)
50 | for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())):
51 | topK_ids[k].append(tid)
52 | topK_probs[k] *= prob
53 | else:
54 | topK_ids[0].append(max_item_id)
55 | topK_probs[0] *= max_item_prob
56 |
57 | if tok_sep_id(self.tokenizer) == max_item_id:
58 | stop = True
59 | start += 1
60 | result_dict['prob_list'] = topK_probs
61 | result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in
62 | zip(topK_ids, topK_probs)]
63 | result_dict['max_item'] = [i[0] for i in result_dict['label_prob']]
64 | outputs = result_dict
65 | else:
66 | targets = batch_data['target']
67 | negative_targets = batch_data['ntarget']
68 | loss_tensors = torch.as_tensor(targets)
69 | negativeloss_tensors = torch.as_tensor(negative_targets)
70 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token
71 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
72 | loss_tensors.view(-1))
73 | if not torch.all(negativeloss_tensors.eq(-1)).item():
74 | negative_loss_fct = NegativeCElLoss()
75 | negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size),
76 | negativeloss_tensors.view(-1))
77 | masked_lm_loss += negative_loss
78 | outputs = masked_lm_loss
79 |
80 | return outputs
81 |
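82 | # Illustrative note: in eval mode decoding is non-autoregressive -- the encoder
83 | # runs once, and positions from ``start`` onward are read off the same logits
84 | # until a separator token is produced. A hypothetical call:
85 | #
86 | #     result = model(batch, eval=True, max_return=3)
87 | #     result['max_item']  # best decoded string for each of the 3 candidates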
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Pytest configuration and fixtures for TFKit testing."""
2 |
3 | import os
4 | import tempfile
5 | from typing import Dict, List, Any
6 |
7 | import pytest
8 | import torch
9 | from transformers import AutoTokenizer, AutoModel
10 |
11 | from tfkit.utility.constants import DEFAULT_MAXLEN, DEFAULT_BATCH_SIZE
12 |
13 |
14 | @pytest.fixture
15 | def mock_tokenizer():
16 | """Create a mock tokenizer for testing."""
17 | return AutoTokenizer.from_pretrained('bert-base-uncased')
18 |
19 |
20 | @pytest.fixture
21 | def mock_pretrained():
22 | """Create a mock pretrained model for testing."""
23 | return AutoModel.from_pretrained('bert-base-uncased')
24 |
25 |
26 | @pytest.fixture
27 | def mock_batch_data():
28 | """Create mock batch data for testing."""
29 | return {
30 | 'input': torch.randint(0, 1000, (2, 10)),
31 | 'mask': torch.ones(2, 10),
32 | 'target': torch.randint(0, 2, (2, 1)),
33 | 'task': [b'test_task', b'test_task']
34 | }
35 |
36 |
37 | @pytest.fixture
38 | def mock_tasks_detail():
39 | """Create mock tasks detail for classification testing."""
40 | return {
41 | 'test_task': ['label1', 'label2', 'label3']
42 | }
43 |
44 |
45 | @pytest.fixture
46 | def temp_dir():
47 | """Create a temporary directory for testing."""
48 | with tempfile.TemporaryDirectory() as tmp_dir:
49 | yield tmp_dir
50 |
51 |
52 | @pytest.fixture
53 | def sample_training_args():
54 | """Create sample training arguments for testing."""
55 | return {
56 | 'batch': DEFAULT_BATCH_SIZE,
57 | 'lr': [5e-5],
58 | 'epoch': 2,
59 | 'maxlen': DEFAULT_MAXLEN,
60 | 'grad_accum': 1,
61 | 'task': ['clas'],
62 | 'config': 'bert-base-uncased',
63 | 'train': ['dummy_train.csv'],
64 | 'test': ['dummy_test.csv'],
65 | 'savedir': 'test_checkpoints',
66 | 'seed': 42,
67 | 'worker': 1,
68 | 'no_eval': True
69 | }
70 |
71 |
72 | @pytest.fixture
73 | def mock_csv_data():
74 | """Create mock CSV data for testing."""
75 | return """input,target
76 | "This is a test sentence",label1
77 | "Another test sentence",label2
78 | "Third test sentence",label1
79 | """
80 |
81 |
82 | class MockLogger:
83 | """Mock logger for testing."""
84 |
85 | def __init__(self):
86 | self.logs = []
87 | self.metrics = []
88 |
89 | def write_log(self, message: str) -> None:
90 | self.logs.append(message)
91 |
92 | def write_metric(self, name: str, value: Any, step: int) -> None:
93 | self.metrics.append((name, value, step))
94 |
95 | def write_config(self, config: Dict[str, Any]) -> None:
96 | self.logs.append(f"Config: {config}")
97 |
98 |
99 | @pytest.fixture
100 | def mock_logger():
101 | """Create a mock logger for testing."""
102 | return MockLogger()
103 |
104 |
105 | class MockAccelerator:
106 | """Mock accelerator for testing."""
107 |
108 | def __init__(self):
109 | self.state = type('State', (), {'backend': None})()
110 |
111 | def prepare(self, *args):
112 | if len(args) == 1:
113 | return args[0]
114 | return args
115 |
116 | def backward(self, loss):
117 | loss.backward()
118 |
119 | def print(self, *args, **kwargs):
120 | print(*args, **kwargs)
121 |
122 | def wait_for_everyone(self):
123 | pass
124 |
125 | def get_state_dict(self, model):
126 | return model.state_dict()
127 |
128 |
129 | @pytest.fixture
130 | def mock_accelerator():
131 | """Create a mock accelerator for testing."""
132 | return MockAccelerator()
133 |
134 |
135 | @pytest.fixture(autouse=True)
136 | def set_test_environment():
137 | """Set up test environment variables."""
138 | os.environ['TOKENIZERS_PARALLELISM'] = 'false'
139 | os.environ['OMP_NUM_THREADS'] = '1'
140 | yield
141 | # Cleanup is automatic
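142 | 
143 | 
144 | # Illustrative usage (hypothetical test, not part of the suite): pytest injects
145 | # the fixtures above by argument name, e.g.
146 | #
147 | #     def test_model_dimensions(mock_tokenizer, mock_pretrained):
148 | #         assert mock_pretrained.config.hidden_size > 0
149 | #         assert len(mock_tokenizer) > 0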
--------------------------------------------------------------------------------
/tfkit/test/test_zeval.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import tfkit
4 | from tfkit.test import *
5 |
6 |
7 | class TestEval(unittest.TestCase):
8 |
9 | def testHelp(self):
10 | result = os.system('tfkit-eval -h')
11 | self.assertTrue(result == 0)
12 |
13 | def test_parser(self):
14 | parser, _ = tfkit.eval.parse_eval_args(
15 | ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print'])
16 | print(parser)
17 | self.assertTrue(parser.get('model') == ['once'])
18 |
19 | eval_parser, model_parser = tfkit.eval.parse_eval_args(
20 | ['--model', 'once', '--metric', 'emf1', '--valid', 'test.csv', '--print', '--decodenum', '2'])
21 | self.assertTrue(eval_parser.get('model') == ['once'])
22 | self.assertTrue(model_parser.get('decodenum') == '2')
23 |
24 | def testEvalGen(self):
25 | tfkit.eval.main(
26 | ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
27 | result = os.system(
28 | 'tfkit-eval --model ' + ONCE_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
29 | self.assertTrue(result == 0)
30 |
31 | def testEvalGenOnce(self):
32 | tfkit.eval.main(
33 | ['--model', ONCE_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
34 | result = os.system(
35 | 'tfkit-eval --model ' + ONCE_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
36 | self.assertTrue(result == 0)
37 |
38 | def testEvalGenOnceCTC(self):
39 | tfkit.eval.main(
40 | ['--model', ONCECTC_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
41 | result = os.system(
42 | 'tfkit-eval --model ' + ONCECTC_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
43 | self.assertTrue(result == 0)
44 |
45 | def testEvalSeq2Seq(self):
46 | tfkit.eval.main(
47 | ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print',
48 | '--decodenum', '2'])
49 | tfkit.eval.main(
50 | ['--model', SEQ2SEQ_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
51 | result = os.system(
52 | 'tfkit-eval --model ' + SEQ2SEQ_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
53 | self.assertTrue(result == 0)
54 |
55 | def testEvalCLM(self):
56 | tfkit.eval.main(
57 | ['--model', CLM_MODEL_PATH, '--valid', GEN_DATASET, '--metric', 'emf1', '--print'])
58 | result = os.system(
59 | 'tfkit-eval --model ' + CLM_MODEL_PATH + ' --valid ' + GEN_DATASET + ' --metric emf1 --print')
60 | self.assertTrue(result == 0)
61 |
62 | def testEvalAddedTokenModel(self):
63 | result = os.system(
64 | 'tfkit-eval --model ' + ADDTOKFILE_MODEL_PATH + ' --valid ' + ADDTOK_DATASET + ' --metric emf1 --print')
65 | self.assertTrue(result == 0)
66 |
67 | def testEvalClassify(self):
68 | tfkit.eval.main(
69 | ['--model', CLAS_MODEL_PATH, '--valid', CLAS_DATASET, '--metric', 'clas', '--print'])
70 | result = os.system(
71 | 'tfkit-eval --model ' + CLAS_MODEL_PATH + ' --valid ' + CLAS_DATASET + ' --metric clas --print')
72 | self.assertTrue(result == 0)
73 |
74 | # def testEvalQA(self):
75 | # tfkit.eval.main(
76 | # ['--model', QA_MODEL_PATH, '--valid', QA_DATASET, '--metric', 'emf1', '--print'])
77 | # result = os.system(
78 | # 'tfkit-eval --model ' + QA_MODEL_PATH + ' --valid ' + QA_DATASET + ' --metric emf1 --print')
79 | # self.assertTrue(result == 0)
80 | #
81 | # def testEvalTag(self):
82 | # tfkit.eval.main(
83 | # ['--model', TAG_MODEL_PATH, '--valid', TAG_DATASET, '--metric', 'clas', '--print'])
84 | # result = os.system(
85 | # 'tfkit-eval --model ' + TAG_MODEL_PATH + ' --valid ' + TAG_DATASET + ' --metric clas --print')
86 | # self.assertTrue(result == 0)
--------------------------------------------------------------------------------
/tfkit/task/tag/preprocessor.py:
--------------------------------------------------------------------------------
1 | import tfkit.utility.tok as tok
2 | from tfkit.utility.data_filereader import get_tag_data_from_file
3 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
4 |
5 | get_data_from_file = get_tag_data_from_file
6 |
7 |
8 | class Preprocessor(GeneralNLPPreprocessor):
9 |
10 | def read_file_to_data(self, path):
11 | return get_tag_data_from_file(path)
12 |
13 | def preprocess(self, item, **param_dict):
14 | input_text, target = item['input'], item.get('target', None)
15 | separator = param_dict.get('separator', ' ')
16 | word_token_mapping = []
17 | token_word_mapping = []
18 | pos = 0
19 |
20 | for word_i, word in enumerate(input_text.split(separator)):
21 | tokenize_word = self.tokenizer.tokenize(word)
22 | for _ in range(len(tokenize_word)):
23 | if _ < 1: # only record first token (one word one record)
24 | word_token_mapping.append({'char': word, 'pos': pos, 'len': len(tokenize_word)})
25 | token_word_mapping.append({'tok': tokenize_word[_], 'word': word, 'pos': len(word_token_mapping) - 1})
26 | pos += 1
27 |
28 | t_input_list, t_pos_list = tok.handle_exceed(self.tokenizer, input_text, self.parameters['maxlen'] - 2,
29 | mode=self.parameters.get('handle_exceed'),
30 | keep_after_sep=False)
31 | preprocessed_data = []
32 | for t_input, t_pos in zip(t_input_list, t_pos_list): # -1 for cls
33 | # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
34 | row_dict = dict()
35 | tokenized_input = [tok.tok_begin(self.tokenizer)] + t_input
36 | input_id = self.tokenizer.convert_tokens_to_ids(tokenized_input)
37 |
38 | if target is not None:
39 | target_token = []
40 | for input_word, target_label in zip(word_token_mapping, target.split(separator)):
41 | if t_pos[0] <= input_word['pos'] < t_pos[1]:
42 | for _ in range(input_word['len']):
43 | target_token += [target_label]
44 |
45 | target_id = [target_token[0]] + target_token
46 |
47 | if len(input_id) != len(target_id):
48 |                 print(list(zip(input_text.split(separator), target.split(separator))))
49 | print(self.tokenizer.decode(input_id))
50 | print(input_id)
51 | print(target_id)
52 | print("input target len not equal ", len(input_id), len(target_id))
53 | continue
54 | row_dict['target'] = target_id
55 |
56 | row_dict['input'] = input_id
57 | row_dict['word_token_mapping'] = word_token_mapping
58 | row_dict['token_word_mapping'] = token_word_mapping
59 | row_dict['end'] = len(input_id)
60 | row_dict['pos'] = t_pos
61 | preprocessed_data.append(row_dict)
62 | return preprocessed_data
63 |
64 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
65 | labels = item['task_dict']
66 | print("item['input']",len(item['input']))
67 | mask_id = [1] * len(item['input'])
68 | mask_id.extend([0] * (maxlen - len(mask_id)))
69 | item['input'].extend([0] * (self.parameters['maxlen'] - len(item['input'])))
70 | row_dict = {
71 | 'input': item['input'],
72 | 'mask': mask_id,
73 | 'pos': item['pos'],
74 | }
75 | # 'token_word_mapping': item['token_word_mapping']
76 | if 'target' in item:
77 | print(labels['tag'])
78 | target_id = [labels['tag'].index(i) for i in item['target']]
79 | if "O" in labels['tag']:
80 | target_id = [labels['tag'].index("O")] + target_id
81 | else:
82 | target_id = [target_id[0]] + target_id
83 | target_id.extend([0] * (self.parameters['maxlen'] - len(target_id)))
84 | row_dict['target'] = target_id
85 |
86 | return row_dict
87 |
--------------------------------------------------------------------------------
/tfkit/task/clm/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn.functional import softmax
4 |
5 | from tfkit.task.clm import Preprocessor
6 | from tfkit.utility.base_model import BaseTFKitModel
7 | from tfkit.utility.predictor import AutoRegressivePredictor
8 |
9 |
10 | class Model(BaseTFKitModel):
11 | """Causal Language Model for text generation."""
12 |
13 | def __init__(self, tokenizer, pretrained, maxlen=512, **kwargs):
14 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
15 | self.model = self._resolve_output_head()
16 | self.uses_pretrained_head = self.model is not None
17 | if not self.uses_pretrained_head:
18 | self.model = nn.Linear(self.get_hidden_size(), self.get_vocab_size())
19 |
20 | self._setup_predictor(AutoRegressivePredictor, Preprocessor)
21 |
22 | def _resolve_output_head(self):
23 | """Return the pretrained language modeling head if available."""
24 |
25 | if hasattr(self.pretrained, "get_output_embeddings"):
26 | output_embeddings = self.pretrained.get_output_embeddings()
27 | if output_embeddings is not None:
28 | return output_embeddings
29 | if hasattr(self.pretrained, "lm_head"):
30 | return self.pretrained.lm_head
31 | if hasattr(self.pretrained, "cls"):
32 | return self.pretrained.cls
33 | return None
34 |
35 | def forward(self, batch_data, eval=False, beamsearch=False, max_return=1, **kwargs):
36 | inputs = batch_data['input']
37 | masks = batch_data['mask']
38 | tokens_tensor = torch.as_tensor(inputs)
39 | mask_tensors = torch.as_tensor(masks)
40 | model_kwargs = {
41 | 'attention_mask': mask_tensors,
42 | 'return_dict': True,
43 | }
44 | if eval:
45 | model_kwargs['use_cache'] = False
46 |
47 | if eval:
48 | outputs = self.pretrained(tokens_tensor, **model_kwargs)
49 | prediction_scores = outputs['logits'] if 'logits' in outputs else outputs[0]
50 | else:
51 | targets = batch_data['target']
52 | loss_tensors = torch.as_tensor(targets)
53 |
54 | if self.uses_pretrained_head:
55 | labels = loss_tensors.clone().long()
56 | labels[labels == -1] = -100
57 | model_kwargs['labels'] = labels
58 | outputs = self.pretrained(tokens_tensor, **model_kwargs)
59 | prediction_scores = outputs['logits'] if 'logits' in outputs else outputs[0]
60 | masked_lm_loss = outputs['loss']
61 | else:
62 | loss_tensors = loss_tensors.long()
63 | outputs = self.pretrained(tokens_tensor, **model_kwargs)
64 | hidden_states = outputs['last_hidden_state'] if 'last_hidden_state' in outputs else outputs[0]
65 | prediction_scores = self.model(hidden_states)
66 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token
67 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
68 | loss_tensors.view(-1))
69 |
70 | if eval:
71 | result_dict = {}
72 | start = batch_data['start'][0]
73 | softmax_score = softmax(prediction_scores[0][start], dim=-1).flatten()
74 | max_item_id = torch.argmax(softmax_score, -1).item()
75 | max_item_prob = softmax_score[max_item_id].item()
76 | result_dict['max_item'] = (self.tokenizer.convert_ids_to_tokens(max_item_id), max_item_prob)
77 | if max_return > 1:
78 | topK = torch.topk(softmax_score, max_return)
79 | prob_result = [(self.tokenizer.convert_ids_to_tokens(tid), prob) for prob, tid in
80 | zip(topK.values.data.tolist(), topK.indices.data.tolist())]
81 |                 result_dict['prob_list'] = topK.values.data.tolist()  # probabilities of the top-k tokens
82 | result_dict['label_prob'] = prob_result
83 | outputs = result_dict
84 | else:
85 | outputs = masked_lm_loss
86 | return outputs
87 |
--------------------------------------------------------------------------------
/tfkit/utility/tok.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import nlp2
4 | from tqdm import tqdm
5 | from transformers import AutoTokenizer
6 |
7 | UNIVERSAL_SEP = "///"
8 |
9 |
10 | def tok_begin(tokenizer):
11 | if tokenizer.special_tokens_map.get('bos_token') is not None:
12 | return tokenizer.special_tokens_map.get('bos_token')
13 | elif tokenizer.special_tokens_map.get('cls_token') is not None:
14 | return tokenizer.special_tokens_map.get('cls_token')
15 | return 'cls'
16 |
17 |
18 | def tok_begin_id(tokenizer):
19 | return tokenizer.convert_tokens_to_ids(tok_begin(tokenizer))
20 |
21 |
22 | def tok_sep(tokenizer):
23 | if tokenizer.special_tokens_map.get('sep_token') is not None:
24 | return tokenizer.special_tokens_map.get('sep_token')
25 | elif tokenizer.special_tokens_map.get('eos_token') is not None:
26 | return tokenizer.special_tokens_map.get('eos_token')
27 | return 'sep'
28 |
29 |
30 | def tok_sep_id(tokenizer):
31 | return tokenizer.convert_tokens_to_ids(tok_sep(tokenizer))
32 |
33 |
34 | def tok_mask(tokenizer):
35 | if tokenizer.special_tokens_map.get('mask_token'):
36 | return tokenizer.special_tokens_map.get('mask_token')
37 | return 'msk'
38 |
39 |
40 | def tok_mask_id(tokenizer):
41 | return tokenizer.convert_tokens_to_ids(tok_mask(tokenizer))
42 |
43 |
44 | def tok_pad(tokenizer):
45 | if tokenizer.special_tokens_map.get('pad_token'):
46 | return tokenizer.special_tokens_map.get('pad_token')
47 | return 'pad'
48 |
49 |
50 | def tok_pad_id(tokenizer):
51 | return tokenizer.convert_tokens_to_ids(tok_pad(tokenizer))
52 |
53 |
54 | def get_all_tok_from_config(config):
55 | tokenizer = AutoTokenizer.from_pretrained(config)
56 | return list(tokenizer.get_vocab().keys())
57 |
58 |
59 | def handle_exceed(tokenizer, seq, maxlen, mode=['noop', 'remove', 'slide', 'start_slice', 'end_slice'],
60 | keep_after_sep=True):
61 | if isinstance(seq, list):
62 | return seq, [[len(seq)]]
63 | mode = mode[0] if isinstance(mode, list) else mode
64 | sep_tok = tok_sep(tokenizer)
65 | sep_split = seq.split(sep_tok)
66 |     ext_seq = ([sep_tok] + tokenizer.tokenize(sep_tok.join(sep_split[1:]))
67 |                if len(sep_split) > 1 and keep_after_sep else [])
68 | t_seq = tokenizer.tokenize(sep_split[0])
69 | if mode == 'noop':
70 | return [t_seq + ext_seq], [[0, len(t_seq + ext_seq)]]
71 | if mode == 'remove':
72 | if len(t_seq + ext_seq) <= maxlen:
73 | return [t_seq + ext_seq], [[0, len(t_seq + ext_seq)]]
74 | else:
75 | return [], [[0, 0]]
76 | if mode == 'slide':
77 | return nlp2.sliding_windows(t_seq, maxlen - len(ext_seq), append_seq=ext_seq)
78 | if mode == 'start_slice':
79 | slices = t_seq[:maxlen - len(ext_seq)]
80 | slices.extend(ext_seq)
81 | return [slices], [[0, maxlen - len(ext_seq)]]
82 | if mode == 'end_slice':
83 | start_pos = len(t_seq) + len(ext_seq) - maxlen
84 | slices = t_seq[start_pos:]
85 | slices.extend(ext_seq)
86 | return [slices], [[max(0, start_pos), len(t_seq)]]
87 |
88 |
89 | def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
90 | unk_count_dict = OrderedDict()
91 | for path in file_paths:
92 | for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
93 | for tok in nlp2.split_sentence_to_array(input_sent):
94 |                 if tokenizer.unk_token in tokenizer.tokenize(tok):
95 | unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
96 |     top_range = int((len(unk_count_dict) + 1) * topP)  # keep the top-P fraction of collected tokens
97 | return list(unk_count_dict.keys())[:top_range]
98 |
99 |
100 | def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
101 | unk_count_dict = OrderedDict()
102 | for path in file_paths:
103 | for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
104 | for tok in nlp2.split_sentence_to_array(input_sent):
105 |                 if tokenizer.unk_token in tokenizer.tokenize(tok):
106 | unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
107 | return [key for key, value in unk_count_dict.items() if value >= freqK]
108 |
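109 | # Illustrative usage of ``handle_exceed`` (mirrors the unit tests): for a text
110 | # that tokenizes to 100 tokens with maxlen=50, 'remove' yields no segments,
111 | # 'slide' yields several overlapping 50-token windows, and 'start_slice' /
112 | # 'end_slice' keep the first / last 50 tokens:
113 | #
114 | #     segments, positions = handle_exceed(tokenizer, long_text, 50, mode='slide')
115 | #     for segment in segments:
116 | #         assert len(segment) == 50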
--------------------------------------------------------------------------------
/tfkit/task/tag/model.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | from typing import Dict, List, Any, Optional
3 |
4 | import torch
5 | from torch import nn
6 | from torch.nn.functional import softmax
7 |
8 | from tfkit.task.tag import Preprocessor
9 | from tfkit.utility.base_model import BaseTFKitModel
10 | from tfkit.utility.constants import DEFAULT_MAXLEN
11 | from tfkit.utility.loss import FocalLoss
12 | from tfkit.utility.predictor import TaggingPredictor
13 |
14 |
15 | class Model(BaseTFKitModel):
16 | """Sequence tagging model for token classification tasks."""
17 |
18 | def __init__(self, tokenizer, pretrained, tasks_detail: Dict[str, List[str]],
19 | maxlen: int = DEFAULT_MAXLEN, dropout: float = 0.2, **kwargs):
20 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
21 |
22 | # Initialize tagging-specific components
23 | self.labels = list(tasks_detail.values())[0]
24 | self.dropout = nn.Dropout(dropout)
25 | self.tagger = nn.Linear(self.get_hidden_size(), len(self.labels))
26 | self.loss_fct = FocalLoss()
27 |
28 | self._setup_predictor(TaggingPredictor, Preprocessor)
29 |
30 | def forward(self, batch_data, eval=False, separator=" ", **kwargs):
31 | inputs = batch_data["input"]
32 | masks = batch_data["mask"]
33 |
34 | bert_output = self.compute_bert_output(inputs, masks)
35 |
36 | if eval:
37 | outputs = self.compute_eval_output(batch_data, bert_output)
38 | else:
39 | outputs = self.compute_loss_output(batch_data, bert_output)
40 |
41 | return outputs
42 |
43 | def compute_bert_output(self, inputs, masks):
44 | token_tensor = torch.as_tensor(inputs, dtype=torch.long)
45 | mask_tensors = torch.as_tensor(masks)
46 | bert_output = self.pretrained(token_tensor, attention_mask=mask_tensors)
47 | res = bert_output[0]
48 | pooled_output = self.dropout(res)
49 | reshaped_logits = self.tagger(pooled_output)
50 |
51 | return reshaped_logits
52 |
53 | def compute_eval_output(self, batch_data, reshaped_logits):
54 | result_dict = {
55 | 'label_prob_all': [],
56 | 'label_map': []
57 | }
58 |
59 | ilogit = softmax(reshaped_logits[0], dim=1)
60 | result_labels = ilogit.data.tolist()
61 | start, end = batch_data['pos'][0]
62 | token_word_mapping = batch_data['token_word_mapping']
63 |
64 |         for pos, logit_prob in enumerate(result_labels[1:]):  # skip [CLS] at position 0
65 | if start + pos >= len(token_word_mapping):
66 | break
67 |
68 | word, pos = self.compute_word_pos(token_word_mapping, start, pos)
69 | self.update_result_dict(result_dict, logit_prob, word, pos)
70 |
71 | result_dict['token_word_mapping'] = token_word_mapping[start:end]
72 |
73 | return result_dict
74 |
75 | @staticmethod
76 | def compute_word_pos(token_word_mapping, start, pos):
77 | word = token_word_mapping[start + pos]['word']
78 | pos = token_word_mapping[start + pos]['pos']
79 |
80 | return word, pos
81 |
82 | def update_result_dict(self, result_dict, logit_prob, word, pos):
83 | if len(result_dict['label_map']) > pos:
84 | self.update_existing_result(result_dict, logit_prob, word, pos)
85 | else:
86 | self.append_new_result(result_dict, logit_prob, word)
87 |
88 | def update_existing_result(self, result_dict, logit_prob, word, pos):
89 |         O = Counter(result_dict['label_prob_all'][-1][word])  # probabilities accumulated for this word so far
90 |         N = Counter(dict(zip(self.labels, logit_prob)))  # probabilities from the current subword
91 |         mean_prob = {k: v / 2 for k, v in (O + N).items()}  # element-wise average of the two
92 | result_dict['label_prob_all'][-1] = {word: mean_prob}
93 | result_dict['label_map'][-1] = {
94 | word: max(mean_prob, key=mean_prob.get)}
95 |
96 | def append_new_result(self, result_dict, logit_prob, word):
97 | max_index = logit_prob.index(max(logit_prob))
98 | result_dict['label_map'].append({word: self.labels[max_index]})
99 | result_dict['label_prob_all'].append({word: dict(zip(self.labels, logit_prob))})
100 |
101 | def compute_loss_output(self, batch_data, reshaped_logits):
102 | targets = batch_data["target"]
103 | target_tensor = torch.as_tensor(targets, dtype=torch.long)
104 | loss = self.loss_fct(reshaped_logits.view(-1, len(self.labels)), target_tensor.view(-1))
105 |
106 | return loss
107 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
28 | ## What is it
29 | TFKit is a toolkit mainly for language generation.
30 | It applies transformer models to many tasks within a single all-in-one framework.
31 | Switching tasks usually requires only a small change of config.
32 |
33 | ## Supported Tasks
34 | With transformer models such as BERT/ALBERT/T5/BART:
35 | | Category | Task |
36 | |-|-|
37 | | Text Generation | :memo: seq2seq language model |
38 | | Text Generation | :pen: causal language model |
39 | | Text Generation | :printer: once generation model / once generation model with ctc loss |
40 | | Text Generation | :pencil: onebyone generation model |
41 |
42 | # Getting Started
43 | Learn more from the [document](https://voidful.github.io/TFkit/).
44 |
45 | ## How To Use
46 |
47 | ### Step 0: Install
48 | Install the development branch directly from GitHub:
49 | ```bash
50 | pip install git+https://github.com/voidful/TFkit.git@refactor-dataset
51 | ```
52 |
53 | ### Step 1: Prepare a dataset in CSV format
54 | [Task format](https://voidful.tech/TFkit/tasks/)
55 | ```
56 | input, target
57 | ```
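58 | For example, a minimal classification dataset could look like this (an illustrative sample, mirroring the project's test fixtures):
59 | ```csv
60 | input,target
61 | "This is a test sentence",label1
62 | "Another test sentence",label2
63 | ```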
58 |
59 | ### Step 2: Train model
60 | ```bash
61 | tfkit-train \
62 | --task clas \
63 | --config xlm-roberta-base \
64 | --train training_data.csv \
65 | --test testing_data.csv \
66 | --lr 4e-5 \
67 | --maxlen 384 \
68 | --epoch 10 \
69 | --savedir roberta_sentiment_classifier
70 | ```
71 |
72 | ### Step 3: Evaluate
73 | ```bash
74 | tfkit-eval \
75 | --model roberta_sentiment_classifier/1.pt \
76 | --metric clas \
77 | --valid testing_data.csv
78 | ```
79 |
80 | ## Advanced features
81 |
82 | Multi-task training
83 |
84 | ```bash
85 | tfkit-train \
86 | --task clas clas \
87 | --config xlm-roberta-base \
88 | --train training_data_taskA.csv training_data_taskB.csv \
89 | --test testing_data_taskA.csv testing_data_taskB.csv \
90 | --lr 4e-5 \
91 | --maxlen 384 \
92 | --epoch 10 \
93 | --savedir roberta_sentiment_classifier_multi_task
94 | ```
95 |
96 |
97 | ## Unmaintained tasks
98 | Due to time constraints, the following tasks are temporarily unsupported:
99 | | Category | Task |
100 | |-|-|
101 | | Classification | :label: multi-class and multi-label classification |
102 | | Question Answering | :page_with_curl: extractive qa |
103 | | Question Answering | :radio_button: multiple-choice qa |
104 | | Tagging | :eye_speech_bubble: sequence level tagging / sequence level with crf |
105 | | Self-supervised Learning | :diving_mask: masked language model |
106 |
107 | ## Supplement
108 | - [transformers models list](https://huggingface.co/models): find any pretrained model here
109 | - [nlprep](https://github.com/voidful/NLPrep): download and preprocess data in one line
110 | - [nlp2go](https://github.com/voidful/nlp2go): create a demo API as quickly as possible
111 |
112 |
113 | ## Contributing
114 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
115 |
116 | ## License
117 |
118 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
119 |
120 | ## Icons reference
121 | Icons modified from Freepik, www.flaticon.com
122 | Icons modified from Nikita Golubev, www.flaticon.com
123 |
--------------------------------------------------------------------------------
/tfkit/task/clas/model.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Any
2 |
3 | import torch
4 | from torch import nn, softmax, sigmoid
5 |
6 | from tfkit.task.clas import Preprocessor
7 | from tfkit.utility.base_model import BaseTFKitModel
8 | from tfkit.utility.constants import DEFAULT_MAXLEN, DEFAULT_DROPOUT
9 | from tfkit.utility.loss import FocalLoss, BCEFocalLoss
10 | from tfkit.utility.predictor import ClassificationPredictor
11 |
12 |
13 | class Model(BaseTFKitModel):
14 | """Multi-class and multi-label classification model."""
15 |
16 | def __init__(self, tokenizer, pretrained, tasks_detail: Dict[str, List[str]],
17 | maxlen: int = DEFAULT_MAXLEN, dropout: float = DEFAULT_DROPOUT, **kwargs):
18 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
19 |
20 | # Initialize classification-specific components
21 | self.dropout = nn.Dropout(dropout)
22 | self.loss_fct = FocalLoss()
23 | self.loss_fct_mt = BCEFocalLoss()
24 |
25 | # Setup multi-task classification heads
26 | self.tasks = dict()
27 | self.tasks_detail = tasks_detail
28 | self.classifier_list = nn.ModuleList()
29 | for task, labels in tasks_detail.items():
30 | self.classifier_list.append(nn.Linear(self.get_hidden_size(), len(labels)))
31 | self.tasks[task] = len(self.classifier_list) - 1
32 |
33 | self._setup_predictor(ClassificationPredictor, Preprocessor)
34 |
35 | def get_all_task(self):
36 | """
37 | list all classification task
38 | :return: tasks list
39 | """
40 | return list(self.tasks.keys())
41 |
42 | def mean_pooling(self, model_output, attention_mask):
43 | """
44 |         Mean pooling that takes the attention mask into account for correct averaging.
45 |         Adapted from https://github.com/UKPLab/sentence-transformers
46 |         (modified: negative mask values are clamped to 0).
47 |         :param model_output: token embeddings, shape (batch, seq_len, hidden)
48 |         :param attention_mask: attention mask, shape (batch, seq_len)
49 |         :return: mean-pooled embeddings, shape (batch, hidden)
50 | """
51 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
52 | input_mask_expanded[input_mask_expanded < 0] = 0
53 | sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)
54 | sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
55 | return sum_embeddings / sum_mask
56 |
57 | def forward(self, batch_data, eval=False, **kwargs):
58 |         # convert input to correct data type
59 | tasks = batch_data['task']
60 | tasks = [bytes(t).decode(encoding="utf-8", errors="ignore") for t in tasks]
61 | inputs = torch.as_tensor(batch_data['input'])
62 | targets = torch.as_tensor(batch_data['target'])
63 | masks = torch.as_tensor(batch_data['mask'])
64 | # define model output
65 | result_dict = {
66 | 'max_item': [],
67 | 'prob_list': [],
68 | 'label_prob': []
69 | }
70 |
71 | result_logits = []
72 | result_labels = []
73 | for p, zin in enumerate(zip(tasks, inputs, masks)):
74 | task, input, mask = zin
75 | task_id = self.tasks[task]
76 | task_labels = self.tasks_detail[task]
77 | output = self.pretrained(input.unsqueeze(0), mask.unsqueeze(0))[0]
78 | pooled_output = self.dropout(self.mean_pooling(output, mask.unsqueeze(0)))
79 | classifier_output = self.classifier_list[task_id](pooled_output)
80 |             reshaped_logit = classifier_output.view(-1, len(task_labels))  # shape: (1, num_labels)
81 | result_logits.append(reshaped_logit)
82 | if not eval:
83 | target = targets[p]
84 | result_labels.append(target)
85 | else:
86 | if 'multi_label' in task:
87 | reshaped_logit = sigmoid(reshaped_logit)
88 | else:
89 | reshaped_logit = softmax(reshaped_logit, dim=1)
90 | logit_prob = reshaped_logit[0].data.tolist()
91 | logit_label = dict(zip(task_labels, logit_prob))
92 | result_dict['label_prob'].append({task: logit_label})
93 | if 'multi_label' in task:
94 | result_dict['max_item'].append({task: [k for k, v in logit_label.items() if v > 0.5]})
95 | else:
96 | result_dict['max_item'].append({task: [task_labels[logit_prob.index(max(logit_prob))]]})
97 |
98 | if eval:
99 | outputs = result_dict
100 | else:
101 | loss = 0
102 | for logit, labels, task in zip(result_logits, result_labels, tasks):
103 | if 'multi_label' in task:
104 | loss += self.loss_fct_mt(logit, labels.type_as(logit))
105 | else:
106 | loss += self.loss_fct(logit, labels)
107 | outputs = loss
108 |
109 | return outputs
110 |
--------------------------------------------------------------------------------
/tests/test_model_loader.py:
--------------------------------------------------------------------------------
1 | from types import SimpleNamespace
2 | from unittest.mock import MagicMock
3 |
4 | import pytest
5 |
6 | from tfkit.utility import model as model_utils
7 | from tfkit.utility.model import load_pretrained_model, load_pretrained_tokenizer
8 |
9 |
10 | def _make_config(**overrides):
11 | defaults = {
12 | "is_encoder_decoder": False,
13 | "architectures": [],
14 | "is_decoder": False,
15 | }
16 | defaults.update(overrides)
17 | return SimpleNamespace(**defaults)
18 |
19 |
20 | def test_load_pretrained_model_prefers_seq2seq(monkeypatch):
21 | config = _make_config(is_encoder_decoder=True)
22 |
23 | auto_config = MagicMock()
24 | auto_config.from_pretrained.return_value = config
25 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
26 |
27 | seq2seq_loader = MagicMock()
28 | seq2seq_instance = object()
29 | seq2seq_loader.from_pretrained.return_value = seq2seq_instance
30 | monkeypatch.setattr(model_utils, "AutoModelForSeq2SeqLM", seq2seq_loader)
31 |
32 | causal_loader = MagicMock()
33 | monkeypatch.setattr(model_utils, "AutoModelForCausalLM", causal_loader)
34 |
35 | base_loader = MagicMock()
36 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
37 |
38 | result = load_pretrained_model("mock-model", ["seq2seq"]) # type: ignore[arg-type]
39 |
40 | assert result is seq2seq_instance
41 | seq2seq_loader.from_pretrained.assert_called_once()
42 | causal_loader.from_pretrained.assert_not_called()
43 | base_loader.from_pretrained.assert_not_called()
44 |
45 |
46 | def test_load_pretrained_model_prefers_causal(monkeypatch):
47 | config = _make_config(architectures=["CustomForCausalLM"])
48 |
49 | auto_config = MagicMock()
50 | auto_config.from_pretrained.return_value = config
51 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
52 |
53 | seq2seq_loader = MagicMock()
54 | monkeypatch.setattr(model_utils, "AutoModelForSeq2SeqLM", seq2seq_loader)
55 |
56 | causal_loader = MagicMock()
57 | causal_instance = object()
58 | causal_loader.from_pretrained.return_value = causal_instance
59 | monkeypatch.setattr(model_utils, "AutoModelForCausalLM", causal_loader)
60 |
61 | base_loader = MagicMock()
62 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
63 |
64 | result = load_pretrained_model("mock-model", ["clm"]) # type: ignore[arg-type]
65 |
66 | assert result is causal_instance
67 | causal_loader.from_pretrained.assert_called_once()
68 | base_loader.from_pretrained.assert_not_called()
69 |
70 |
71 | def test_load_pretrained_model_causal_fallback(monkeypatch):
72 | config = _make_config(architectures=["CustomForCausalLM"])
73 |
74 | auto_config = MagicMock()
75 | auto_config.from_pretrained.return_value = config
76 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
77 |
78 | seq2seq_loader = MagicMock()
79 | monkeypatch.setattr(model_utils, "AutoModelForSeq2SeqLM", seq2seq_loader)
80 |
81 | causal_loader = MagicMock()
82 | causal_loader.from_pretrained.side_effect = ValueError("missing head")
83 | monkeypatch.setattr(model_utils, "AutoModelForCausalLM", causal_loader)
84 |
85 | base_loader = MagicMock()
86 | base_instance = object()
87 | base_loader.from_pretrained.return_value = base_instance
88 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
89 |
90 | result = load_pretrained_model("mock-model", ["clm"]) # type: ignore[arg-type]
91 |
92 | assert result is base_instance
93 | base_loader.from_pretrained.assert_called_once()
94 | assert config.is_decoder is True
95 |
96 |
97 | def test_load_pretrained_model_trust_remote_code_env(monkeypatch):
98 | monkeypatch.setenv("TFKIT_TRUST_REMOTE_CODE", "false")
99 |
100 | config = _make_config()
101 | auto_config = MagicMock()
102 | auto_config.from_pretrained.return_value = config
103 | monkeypatch.setattr(model_utils, "AutoConfig", auto_config)
104 |
105 | base_loader = MagicMock()
106 | base_instance = object()
107 | base_loader.from_pretrained.return_value = base_instance
108 | monkeypatch.setattr(model_utils, "AutoModel", base_loader)
109 |
110 | result = load_pretrained_model("mock-model", ["clas"]) # type: ignore[arg-type]
111 |
112 | assert result is base_instance
113 | auto_config.from_pretrained.assert_called_once_with(
114 | "mock-model", trust_remote_code=False
115 | )
116 | base_loader.from_pretrained.assert_called_once()
117 | _, kwargs = base_loader.from_pretrained.call_args
118 | assert kwargs.get("trust_remote_code") is False
119 |
120 |
121 | def test_load_pretrained_tokenizer_respects_env(monkeypatch):
122 | monkeypatch.setenv("TFKIT_TRUST_REMOTE_CODE", "0")
123 |
124 | tokenizer_loader = MagicMock()
125 | monkeypatch.setattr(model_utils, "AutoTokenizer", tokenizer_loader)
126 |
127 | load_pretrained_tokenizer("mock-tokenizer")
128 |
129 | tokenizer_loader.from_pretrained.assert_called_once_with(
130 | "mock-tokenizer", trust_remote_code=False
131 | )
132 |
--------------------------------------------------------------------------------
/tests/test_task_generation.py:
--------------------------------------------------------------------------------
1 | from types import SimpleNamespace
2 |
3 | import torch
4 | from torch import nn
5 |
6 | from tfkit.task.clm.model import Model as CLMModel
7 | from tfkit.task.seq2seq.model import Model as Seq2SeqModel
8 |
9 |
10 | class DummyTokenizer:
11 | def __init__(self, vocab_size):
12 | self.vocab_size = vocab_size
13 |
14 | def __len__(self):
15 | return self.vocab_size
16 |
17 | def convert_ids_to_tokens(self, idx):
18 | return f"token-{idx}"
19 |
20 |
21 | class DummyCausalPretrained(nn.Module):
22 | def __init__(self):
23 | super().__init__()
24 | self.config = SimpleNamespace(vocab_size=5, hidden_size=4)
25 | self.output_layer = nn.Linear(self.config.hidden_size, self.config.vocab_size)
26 | self.last_kwargs = None
27 |
28 | def get_output_embeddings(self):
29 | return self.output_layer
30 |
31 | def forward(self, input_ids, attention_mask=None, return_dict=True, **kwargs):
32 | self.last_kwargs = kwargs
33 | batch_size, seq_len = input_ids.shape
34 | logits = torch.zeros(batch_size, seq_len, self.config.vocab_size)
35 | outputs = {
36 | "logits": logits,
37 | "last_hidden_state": torch.zeros(batch_size, seq_len, self.config.hidden_size),
38 | }
39 | if "labels" in kwargs:
40 | outputs["loss"] = torch.tensor(0.0)
41 | return outputs
42 |
43 |
44 | class DummyEncoderPretrained(nn.Module):
45 | def __init__(self):
46 | super().__init__()
47 | self.config = SimpleNamespace(vocab_size=5, hidden_size=4)
48 | self.last_kwargs = None
49 |
50 | def get_output_embeddings(self):
51 | return None
52 |
53 | def forward(self, input_ids, attention_mask=None, return_dict=True, **kwargs):
54 | self.last_kwargs = kwargs
55 | batch_size, seq_len = input_ids.shape
56 | hidden = torch.zeros(batch_size, seq_len, self.config.hidden_size)
57 | return {"last_hidden_state": hidden}
58 |
59 |
60 | class DummySeq2SeqPretrained(nn.Module):
61 | def __init__(self):
62 | super().__init__()
63 | self.config = SimpleNamespace(vocab_size=3, hidden_size=4)
64 | self.decoder = nn.Module()
65 | self.output_layer = nn.Linear(self.config.hidden_size, self.config.vocab_size)
66 |
67 | def get_output_embeddings(self):
68 | return self.output_layer
69 |
70 | def forward(
71 | self,
72 | input_ids=None,
73 | attention_mask=None,
74 | decoder_input_ids=None,
75 | decoder_attention_mask=None,
76 | output_hidden_states=False,
77 | use_cache=False,
78 | return_dict=True,
79 | **kwargs,
80 | ):
81 | batch_size, seq_len = decoder_input_ids.shape
82 | hidden = torch.zeros(batch_size, seq_len, self.config.hidden_size)
83 | outputs = {
84 | "last_hidden_state": hidden,
85 | "decoder_hidden_states": (hidden,),
86 | }
87 | return outputs
88 |
89 |
90 | def test_clm_model_uses_pretrained_head_for_loss():
91 | tokenizer = DummyTokenizer(vocab_size=5)
92 | pretrained = DummyCausalPretrained()
93 | model = CLMModel(tokenizer=tokenizer, pretrained=pretrained)
94 |
95 | batch = {
96 | "input": torch.zeros((1, 2), dtype=torch.long),
97 | "mask": torch.ones((1, 2), dtype=torch.long),
98 | "target": torch.tensor([[0, -1]]),
99 | }
100 |
101 | loss = model.forward(batch, eval=False)
102 | assert torch.is_tensor(loss)
103 | assert "labels" in pretrained.last_kwargs
104 | assert pretrained.last_kwargs["labels"].tolist() == [[0, -100]]
105 |
106 | eval_batch = {
107 | **batch,
108 | "start": [0],
109 | }
110 | result = model.forward(eval_batch, eval=True)
111 | assert isinstance(result, dict)
112 | assert "max_item" in result
113 |
114 |
115 | def test_clm_model_falls_back_to_linear_head():
116 | tokenizer = DummyTokenizer(vocab_size=5)
117 | pretrained = DummyEncoderPretrained()
118 | model = CLMModel(tokenizer=tokenizer, pretrained=pretrained)
119 |
120 | batch = {
121 | "input": torch.zeros((1, 2), dtype=torch.long),
122 | "mask": torch.ones((1, 2), dtype=torch.long),
123 | "target": torch.tensor([[0, -1]]),
124 | }
125 |
126 | loss = model.forward(batch, eval=False)
127 | assert torch.is_tensor(loss)
128 | assert pretrained.last_kwargs == {}
129 |
130 |
131 | def test_seq2seq_model_uses_pretrained_output_head():
132 | tokenizer = DummyTokenizer(vocab_size=3)
133 | pretrained = DummySeq2SeqPretrained()
134 | model = Seq2SeqModel(tokenizer=tokenizer, pretrained=pretrained)
135 |
136 | batch = {
137 | "input": torch.zeros((1, 1), dtype=torch.long),
138 | "prev": torch.zeros((1, 1), dtype=torch.long),
139 | "encoder_mask": torch.ones((1, 1), dtype=torch.long),
140 | "decoder_mask": torch.ones((1, 1), dtype=torch.long),
141 | "target": torch.zeros((1, 1), dtype=torch.long),
142 | "ntarget": torch.full((1, 1), -1),
143 | }
144 |
145 | loss = model.forward(batch, eval=False)
146 | assert torch.is_tensor(loss)
147 | assert model.model is pretrained.output_layer
148 |
--------------------------------------------------------------------------------
/tfkit/task/seq2seq/preprocessor.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import tfkit.utility.tok as tok
4 | from tfkit.utility.data_filereader import get_gen_data_from_file
5 | from tfkit.utility.data_processor import GeneralNLPPreprocessor
6 |
7 |
8 | class Preprocessor(GeneralNLPPreprocessor):
9 | def read_file_to_data(self, path):
10 | return get_gen_data_from_file(path)
11 |
12 | def set_global_parameters(self):
13 | self.tokenize_target = True
14 |
15 | def preprocess_component_convert_to_id(self, item, likelihood=['none', 'pos', 'neg', 'both'], **param_dict):
16 | likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
17 | tokenized_input, tokenized_target, n_target, b_target = item['input'], \
18 | item.get('target', None), \
19 | item.get('ntarget', None), \
20 | item.get('btarget', None)
21 | previous = item.get("previous", [])
22 | if tokenized_target is None:
23 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
24 | 'previous': self.tokenizer.convert_tokens_to_ids(previous)}
25 | elif b_target and len(b_target) > 0:
26 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
27 | 'previous': self.tokenizer.convert_tokens_to_ids(previous),
28 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
29 | 'btarget': self.tokenizer.encode(b_target)}
30 | else:
31 | if "neg" in likelihood or 'both' in likelihood:
32 | # formatting neg data in csv
33 | if n_target is None:
34 | ntext_arr = [
35 | tok.tok_sep(self.tokenizer) + self.tokenizer.convert_tokens_to_string(tokenized_target)]
36 | elif tok.tok_sep(self.tokenizer) in n_target:
37 | ntext_arr = [ntext.strip() for ntext in n_target.split(tok.tok_sep(self.tokenizer))]
38 | else:
39 | ntext_arr = [n_target.strip()]
40 | for neg_text in ntext_arr:
41 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
42 | 'previous': self.tokenizer.convert_tokens_to_ids(previous),
43 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
44 | 'ntarget': self.tokenizer.encode(neg_text)}
45 | else:
46 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
47 | 'previous': self.tokenizer.convert_tokens_to_ids(previous),
48 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target)}
49 |
50 | # whole sentence masking
51 | if 'pos' in likelihood:
52 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
53 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
54 | 'previous': self.tokenizer.convert_tokens_to_ids(
55 | [tok.tok_mask(self.tokenizer)] * len(tokenized_target))}
56 | elif 'both' in likelihood:
57 | for neg_text in ntext_arr:
58 | yield {'input': self.tokenizer.convert_tokens_to_ids(tokenized_input),
59 | 'target': self.tokenizer.convert_tokens_to_ids(tokenized_target),
60 | 'previous': self.tokenizer.convert_tokens_to_ids(
61 | [tok.tok_mask(self.tokenizer)] * len(tokenized_target)),
62 | 'ntarget': self.tokenizer.encode(neg_text)}
63 |
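64 |     # Note added for clarity (not original source): likelihood='neg'/'both' emits extra
65 |     # samples whose 'ntarget' ids are penalised (e.g. by NegativeCElLoss) during training,
66 |     # while 'pos'/'both' emit a sample whose previous tokens are all mask tokens
67 |     # (whole-sentence masking).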
64 | def postprocess(self, item, tokenizer, maxlen, **kwargs):
65 | t_input_id, previous = item['input'], item['previous']
66 | row_dict = {}
67 | if 'target' in item:
68 | target = item['target']
69 |             tokenized_target_id = []
70 |             if len(previous) == len(target):  # whole-sentence-mask sample ('pos'/'both' likelihood)
71 |                 tokenized_prev_id = [self.tok_mask_id] * maxlen
72 |             else:  # teacher forcing: decoder input is the target shifted right behind a sep token
73 |                 tokenized_prev_id = [self.tok_sep_id] + target
74 |             tokenized_target_id.extend(target + [self.tok_sep_id])
75 | row_dict['target'] = tokenized_target_id
76 | row_dict['target_pad'] = [-1]
77 | row_dict['prev'] = tokenized_prev_id
78 | row_dict['ntarget'] = [-1] * maxlen
79 | if 'ntarget' in item and len(item['ntarget']) > 0:
80 | tokenized_ntarget_id = item['ntarget']
81 | if len(tokenized_ntarget_id) <= maxlen:
82 | row_dict['ntarget'] = tokenized_ntarget_id
83 | if 'btarget' in item and len(item['btarget']) > 0:
84 | row_dict['btarget'] = tokenizer.encode(item['btarget'])
85 | else:
86 | tokenized_prev_id = [self.tok_sep_id]
87 | tokenized_prev_id.extend(previous)
88 | row_dict['prev'] = tokenized_prev_id
89 |
90 | row_dict['input'] = t_input_id
91 | row_dict['encoder_mask'] = [1] * len(t_input_id)
92 | row_dict['decoder_mask'] = [1] * len(tokenized_prev_id)
93 | return {key: torch.tensor(value) for key, value in row_dict.items()}
94 |
--------------------------------------------------------------------------------
/tfkit/utility/dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | from collections import defaultdict
3 | from random import choice
4 |
5 | import joblib
6 | import nlp2
7 | from torch.utils import data
8 | from tqdm.contrib.concurrent import process_map
9 |
10 | from tfkit.utility.constants import CACHE_EXTENSION
11 |
12 | try:
13 | from datasets import load_dataset
14 | except Exception: # pragma: no cover - optional dependency
15 | load_dataset = None
16 |
17 |
18 | def get_dataset(file_path, task_class, tokenizer, parameter):
19 | panel = nlp2.Panel()
20 | # all_arg = nlp2.function_get_all_arg_with_value(task_class.preprocessor.prepare_convert_to_id)
21 | # if parameter.get('panel'):
22 | # print("Operation panel for data preprocessing.")
23 | # for missarg in nlp2.function_check_missing_arg(task_class.preprocessor,
24 | # parameter):
25 | # panel.add_element(k=missarg, v=all_arg[missarg], msg=missarg, default=all_arg[missarg])
26 | # filled_arg = panel.get_result_dict()
27 | # parameter.update(filled_arg)
28 | if load_dataset is not None and not os.path.isfile(file_path):
29 | try:
30 | hf_ds = load_dataset(file_path, split=parameter.get('split', 'train'))
31 | return HFDataset(hf_ds, tokenizer=tokenizer,
32 | preprocessor=task_class.Preprocessor,
33 | preprocessing_arg=parameter)
34 | except Exception:
35 | pass
36 | ds = TFKitDataset(fpath=file_path, tokenizer=tokenizer,
37 | preprocessor=task_class.Preprocessor,
38 | preprocessing_arg=parameter)
39 | return ds
40 |
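41 | # Usage sketch (illustrative only; the file name and parameters are assumptions):
42 | #     ds = get_dataset('train.csv', task_class, tokenizer, {'maxlen': 512})
43 | #     loader = data.DataLoader(ds, batch_size=8)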
41 |
42 | class TFKitDataset(data.Dataset):
43 |     def __init__(self, fpath, tokenizer, preprocessor, preprocessing_arg=None):
44 |         preprocessing_arg = preprocessing_arg or {}  # avoid a shared mutable default, as HFDataset already does
45 |         cache_path = fpath + "_" + tokenizer.name_or_path.replace("/", "_") + CACHE_EXTENSION
45 | self.task_dict = {}
46 | self.preprocessor = preprocessor(tokenizer, kwargs=preprocessing_arg)
47 | self.tokenizer = tokenizer
48 | if os.path.isfile(cache_path) and preprocessing_arg.get('cache', False):
49 | with open(cache_path, "rb") as fo:
50 | outdata = joblib.load(fo)
51 | sample = outdata['sample']
52 | length = outdata['length']
53 | self.task_dict = outdata['task']
54 | else:
55 | print(f"Start preprocessing...")
56 | sample = defaultdict(list)
57 | length = 0
58 | get_data_item = self.preprocessor.read_file_to_data(fpath)
59 | while True:
60 | try:
61 | for items in process_map(self.preprocessor.preprocess, next(get_data_item),
62 | chunksize=1000):
63 | for i in items:
64 | length += 1
65 | for k, v in i.items():
66 | sample[k].append(v)
67 | print(f"loaded {length} data.")
68 | except StopIteration as e:
69 | tasks = e.value
70 | break
71 | self.task_dict = tasks
72 | print(f"There are {length} datas after preprocessing.")
73 | if preprocessing_arg.get('cache', False):
74 | with open(cache_path, 'wb') as fo:
75 | outdata = {'sample': sample, 'task': self.task_dict, 'length': length}
76 | joblib.dump(outdata, fo)
77 | self.length = length
78 | self.sample = sample
79 | self.task = self.task_dict
80 |
81 |     def increase_with_sampling(self, total):
82 |         for _ in range(total - self.length):
83 |             idx = choice(range(self.length))  # pick one whole example so the fields stay aligned
84 |             for key in self.sample.keys():
85 |                 self.sample[key].append(self.sample[key][idx])
86 |         self.length = max(self.length, total)
85 |
86 | def __len__(self):
87 | return self.length
88 |
89 | def __getitem__(self, idx):
90 | return self.preprocessor.postprocess(
91 | {**{'task_dict': self.task_dict}, **{key: self.sample[key][idx] for key in self.sample.keys()}},
92 | self.tokenizer,
93 | maxlen=self.preprocessor.parameters['maxlen'])
94 |
95 |
96 | class HFDataset(data.Dataset):
97 | """Dataset wrapper for the HuggingFace datasets library."""
98 |
99 | def __init__(self, hf_dataset, tokenizer, preprocessor, preprocessing_arg=None):
100 | preprocessing_arg = preprocessing_arg or {}
101 | self.task_dict = {}
102 | self.sample = defaultdict(list)
103 | self.preprocessor = preprocessor(tokenizer, kwargs=preprocessing_arg)
104 | self.tokenizer = tokenizer
105 |
106 | print("Start preprocessing with HuggingFace dataset...")
107 | length = 0
108 | for raw_item in hf_dataset:
109 | for items in self.preprocessor.preprocess(raw_item):
110 | length += 1
111 | for k, v in items.items():
112 | self.sample[k].append(v)
113 | self.length = length
114 | self.task = self.task_dict
115 |
116 | def __len__(self):
117 | return self.length
118 |
119 | def __getitem__(self, idx):
120 | return self.preprocessor.postprocess(
121 | {**{'task_dict': self.task_dict}, **{key: self.sample[key][idx] for key in self.sample.keys()}},
122 | self.tokenizer,
123 | maxlen=self.preprocessor.parameters['maxlen'])
124 |
--------------------------------------------------------------------------------
/tfkit/test/utility/test_utility_loss.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import torch
5 | from torch import nn
6 | from torch.autograd import Variable
7 |
8 | dir_path = os.path.dirname(os.path.realpath(__file__))
9 | sys.path.append(os.path.abspath(os.path.join(dir_path, os.pardir)))
10 |
11 | import unittest
12 | import tfkit
13 |
14 |
15 | class TestLoss(unittest.TestCase):
16 | outputs = Variable(torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]]), requires_grad=False)
17 | targets = Variable(torch.Tensor([1, 1]).long(), requires_grad=False)
18 | alln_targets = Variable(torch.Tensor([-1, -1]).long(), requires_grad=False)
19 | onen_targets = Variable(torch.Tensor([1, -1]).long(), requires_grad=False)
20 |
21 | def testLabelSmoothingCrossEntropy(self):
22 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
23 | targets = torch.Tensor([1, 1]).long()
24 | alln_targets = torch.Tensor([0, -1]).long()
25 | onen_targets = torch.Tensor([1, -1]).long()
26 |
27 | criterion = nn.CrossEntropyLoss(ignore_index=-1)
28 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3, ignore_index=-1)
29 |
30 | self.assertTrue(criterion(outputs, targets).item() <
31 | custom_criterion(outputs, targets).item())
32 | self.assertTrue(criterion(outputs, onen_targets).item() <
33 | custom_criterion(outputs, onen_targets).item())
34 |
35 | criterion = nn.CrossEntropyLoss()
36 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3)
37 | self.assertTrue(criterion(outputs, targets).item() <
38 | custom_criterion(outputs, targets).item())
39 |
40 | custom_criterion = tfkit.utility.loss.LabelSmoothingLoss(3, reduction='none')
41 | print(custom_criterion(self.outputs, self.targets))
42 | self.assertTrue(list(custom_criterion(self.outputs, self.targets).shape) == [2])
43 |
44 | def testDiceLoss(self):
45 | custom_criterion = tfkit.utility.loss.DiceLoss(ignore_index=-1)
46 | self.assertTrue(0.8 < custom_criterion(self.outputs, self.targets).item() < 1)
47 | self.assertTrue(0.99 < custom_criterion(self.outputs, self.alln_targets).item() <= 1)
48 | self.assertTrue(0.8 < custom_criterion(self.outputs, self.onen_targets).item() < 1)
49 |
50 | custom_criterion = tfkit.utility.loss.DiceLoss(reduction='none')
51 | print(custom_criterion(self.outputs, self.targets))
52 | self.assertTrue(list(custom_criterion(self.outputs, self.targets).shape) == [2])
53 |
54 | def testLossDrop(self):
55 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
56 | targets = torch.Tensor([1, 1]).long()
57 | norm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
58 | loss_fct = nn.CrossEntropyLoss(reduction='none', ignore_index=-1) # -1 index = padding token
59 | masked_lm_loss = loss_fct(outputs, targets)
60 | masked_lm_loss = masked_lm_loss.view(-1, len(targets)) # view by batch size
61 | masked_lm_loss = masked_lm_loss.sum(dim=0)
62 | masked_lm_loss = masked_lm_loss.mean()
63 |         print(masked_lm_loss.mean(), norm_loss_fct(outputs, targets).mean())
64 |         # sum-then-mean over the per-token losses equals the default mean reduction
65 |         self.assertAlmostEqual(masked_lm_loss.item(), norm_loss_fct(outputs, targets).item(), places=5)
64 |
65 | def testBCEFocalLoss(self):
66 | outputs = torch.Tensor([[0, 1, 0], [0.2, 0, 0]])
67 | targets = torch.Tensor([[0, 1, 0], [1, 0, 0]])
68 | criterion = nn.BCELoss()
69 | custom_criterion = tfkit.utility.loss.BCEFocalLoss()
70 | self.assertTrue(criterion(outputs, targets).item() >
71 | custom_criterion(outputs, targets).item())
72 |
73 | def testNegativeCElLoss(self):
74 | outputs = torch.Tensor([[0.00000000000009, 5, 0.5], [0.00000000000000000001, 69, 9]])
75 | targets = torch.Tensor([1, 1]).long()
76 | alln_targets = torch.Tensor([-1, -1]).long()
77 | onen_targets = torch.Tensor([1, -1]).long()
78 |
79 | criterion = nn.CrossEntropyLoss(ignore_index=-1)
80 | custom_criterion = tfkit.utility.loss.NegativeCElLoss()
81 | self.assertTrue(
82 |             criterion(outputs, targets).item() < custom_criterion(outputs, targets).item())
83 | self.assertTrue(criterion(outputs, onen_targets).item() < custom_criterion(outputs, onen_targets).item())
84 |
85 | def testFocalLoss(self):
86 | criterion = nn.CrossEntropyLoss(ignore_index=-1)
87 | custom_criterion = tfkit.utility.loss.FocalLoss(gamma=0)
88 | self.assertAlmostEqual(criterion(self.outputs, self.targets).item(),
89 | custom_criterion(self.outputs, self.targets).item())
90 | self.assertAlmostEqual(criterion(self.outputs, self.alln_targets).item(),
91 | custom_criterion(self.outputs, self.alln_targets).item())
92 | self.assertAlmostEqual(criterion(self.outputs, self.onen_targets).item(),
93 | custom_criterion(self.outputs, self.onen_targets).item())
94 |
95 | custom_criterion = tfkit.utility.loss.FocalLoss(gamma=1)
96 | self.assertTrue(criterion(self.outputs, self.targets) > custom_criterion(self.outputs, self.targets))
97 | self.assertTrue(criterion(self.outputs, self.alln_targets).item() - custom_criterion(self.outputs,
98 | self.alln_targets).item() < 1)
99 | self.assertTrue(criterion(self.outputs, self.onen_targets) > custom_criterion(self.outputs, self.onen_targets))
100 |
--------------------------------------------------------------------------------
/tfkit/utility/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 |
6 |
7 | class BCEFocalLoss(nn.Module):
8 | def __init__(self, gamma=2):
9 | super(BCEFocalLoss, self).__init__()
10 | self.gamma = gamma
11 |
12 | def forward(self, input, target):
13 | BCE_loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
14 | pt = torch.exp(-BCE_loss) # prevents nans when probability 0
15 | focal_loss = (1 - pt) ** self.gamma * BCE_loss
16 | return focal_loss.mean()
17 |
18 |
19 | class FocalLoss(nn.Module):
20 | def __init__(self, gamma=2, ignore_index=-1):
21 | super(FocalLoss, self).__init__()
22 | self.gamma = gamma
23 | self.softmax = nn.Softmax(dim=1)
24 | self.nll = nn.NLLLoss(ignore_index=ignore_index)
25 |
26 | def forward(self, input, target):
27 | softmax = self.softmax(input)
28 | logpt = torch.log(softmax)
29 |         pt = logpt.detach().exp()  # stop gradients through the modulating factor
30 | return self.nll((1 - pt) ** self.gamma * logpt, target)
31 |
32 |
33 | class SeqCTCLoss(nn.Module):
34 | def __init__(self, blank_index):
35 | super(SeqCTCLoss, self).__init__()
36 | self.blank_index = blank_index
37 |
38 | def forward(self, logits, input_lengths, targets, target_lengths):
39 | # lengths : (batch_size, )
40 | # log_logits : (T, batch_size, n_class), this kind of shape is required for ctc_loss
41 | # log_logits = logits + (logit_mask.unsqueeze(-1) + 1e-45).log()
42 | log_logits = logits.log_softmax(-1).transpose(0, 1)
43 | loss = F.ctc_loss(log_logits,
44 | targets,
45 | input_lengths,
46 | target_lengths,
47 | blank=self.blank_index,
48 | reduction='mean',
49 | zero_infinity=True)
50 | return loss
51 |
52 |
53 | class SelfKDLoss(nn.Module):
54 |
55 |     def __init__(self, alpha=0.1, temperature=2, ignore_index=-1):
56 | super(SelfKDLoss, self).__init__()
57 | self.alpha = alpha
58 | self.temperature = temperature
59 | self.ignore_index = ignore_index
60 |
61 |     def forward(self, outputs, teacher_outputs, labels):
62 |         soft_loss = nn.KLDivLoss()(F.log_softmax(outputs / self.temperature, dim=-1),
63 |                                    F.softmax(teacher_outputs / self.temperature, dim=-1))
64 |         hard_loss = F.cross_entropy(outputs, labels, ignore_index=self.ignore_index)
65 |         return (soft_loss * self.alpha * self.temperature * self.temperature
66 |                 + hard_loss * (1. - self.alpha))
67 |
68 |
69 | class DiceLoss(nn.Module):
70 | """From 'Dice Loss for Data-imbalanced NLP Tasks'"""
71 |
72 | def __init__(self, ignore_index=None, reduction='mean'):
73 | super(DiceLoss, self).__init__()
74 | self.ignore_index = ignore_index
75 | self.reduction = reduction
76 |
77 |     def forward(self, y_pred, y_true):
78 |         y_pred = torch.softmax(y_pred, dim=1)
79 |         if self.ignore_index is not None:
80 |             mask = y_true == self.ignore_index  # was hard-coded to -1
81 |             filtered_target = y_true.clone()  # avoid mutating the caller's tensor in place
82 |             filtered_target[mask] = 0
83 |             y_true = filtered_target
84 |             mask = mask.unsqueeze(1).expand(y_pred.data.size())
85 |             y_pred[mask] = 0
86 |         pred_prob = torch.gather(y_pred, dim=1, index=y_true.unsqueeze(1))
87 | dsc_i = 1 - ((1 - pred_prob) * pred_prob) / ((1 - pred_prob) * pred_prob + 1)
88 | if self.reduction == 'mean':
89 | return dsc_i.mean()
90 | else:
91 | return dsc_i.view(-1)
92 |
93 |
94 | class NegativeCElLoss(nn.Module):
95 | def __init__(self, ignore_index=-1, reduction='mean'):
96 | super(NegativeCElLoss, self).__init__()
97 | self.softmax = nn.Softmax(dim=1)
98 | self.alpha = 1
99 | self.nll = nn.NLLLoss(ignore_index=ignore_index, reduction=reduction)
100 |
101 | def forward(self, input, target):
102 | nsoftmax = self.softmax(input)
103 | nsoftmax = torch.clamp((1.0 - nsoftmax), min=1e-32)
104 | return self.nll(torch.log(nsoftmax) * self.alpha, target)
105 |
106 |
107 | class LabelSmoothingLoss(nn.Module):
108 | def __init__(self, classes, smoothing=0.1, dim=-1, ignore_index=None, reduction='mean'):
109 | super(LabelSmoothingLoss, self).__init__()
110 | self.confidence = 1.0 - smoothing
111 | self.smoothing = smoothing
112 | self.cls = classes
113 | self.dim = dim
114 | self.reduction = reduction
115 | self.ignore_index = ignore_index
116 |
117 | def forward(self, pred, target):
118 | pred = pred.log_softmax(dim=self.dim)
119 | with torch.no_grad():
120 | true_dist = torch.zeros_like(pred)
121 | true_dist.fill_(self.smoothing / (self.cls - 1))
122 | if self.ignore_index is not None:
123 |                 mask = target == self.ignore_index
124 | filtered_target = target.clone()
125 | filtered_target[mask] = 0
126 | true_dist.scatter_(1, filtered_target.unsqueeze(1), self.confidence)
127 | mask = mask.unsqueeze(1).expand(pred.data.size())
128 | true_dist[mask] = 0
129 | else:
130 | true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
131 | if self.reduction == 'mean':
132 | return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
133 | else:
134 | return torch.sum(-true_dist * pred, dim=self.dim)
135 |
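136 | # Usage sketch (illustrative, not part of the original source):
137 | #     criterion = LabelSmoothingLoss(classes=3, smoothing=0.1, ignore_index=-1)
138 | #     loss = criterion(logits, labels)  # logits: (batch, 3); labels: (batch,), -1 ignored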
--------------------------------------------------------------------------------
/tfkit/task/oncectc/model.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn.functional import softmax
6 |
7 | from tfkit.task.once import Preprocessor
8 | from tfkit.utility.base_model import BaseTFKitModel
9 | from tfkit.utility.constants import BLANK_TOKEN
10 | from tfkit.utility.loss import NegativeCElLoss
11 | from tfkit.utility.loss import SeqCTCLoss
12 | from tfkit.utility.predictor import NonAutoRegressivePredictor
13 | from tfkit.utility.tok import *
14 |
15 |
16 | class Model(BaseTFKitModel):
17 | """Once generation model with CTC loss for non-autoregressive text generation."""
18 |
19 | def __init__(self, tokenizer, pretrained, maxlen=512, tasks_detail=None, **kwargs):
20 | super().__init__(tokenizer, pretrained, maxlen, **kwargs)
21 |
22 | # Setup CTC-specific components
23 | self.blank_token = BLANK_TOKEN
24 | self.tokenizer.add_tokens(self.blank_token)
25 | self.pretrained.resize_token_embeddings(len(tokenizer))
26 | self.blank_index = self.tokenizer.convert_tokens_to_ids([self.blank_token])[0]
27 | self.loss = SeqCTCLoss(blank_index=self.blank_index)
28 |
29 | # Update vocab size after adding tokens
30 |         self.vocab_size = max(self.pretrained.config.vocab_size, len(self.tokenizer))
31 | self.model = nn.Linear(self.get_hidden_size(), self.vocab_size)
32 |
33 | self._setup_predictor(NonAutoRegressivePredictor, Preprocessor)
34 |
35 | def forward(self, batch_data, eval=False, max_return=1, **kwargs):
36 | inputs = batch_data['input']
37 | masks = batch_data['mask']
38 | starts = batch_data['start']
39 | ends = batch_data['end']
40 | tokens_tensor = torch.as_tensor(inputs)
41 | mask_tensors = torch.as_tensor(masks)
42 |
43 | output = self.pretrained(tokens_tensor, attention_mask=mask_tensors)
44 | sequence_output = output[0]
45 | prediction_scores = self.model(sequence_output)
46 | batch_size = list(tokens_tensor.shape)[0]
47 | prediction_scores = prediction_scores.view(batch_size, -1, self.vocab_size)
48 |
49 | if eval:
50 | result_dict = {
51 | 'max_item': [],
52 | 'label_prob': defaultdict(list),
53 | 'prob_list': []
54 | }
55 | start = batch_data['start'][0]
56 |             topK_ids = [[] for _ in range(max_return)]  # independent lists; [[]] * n would alias a single list
57 | topK_probs = [1] * max_return
58 |
59 | pscore = prediction_scores.detach().cpu()
60 | predicted_indexs = pscore.argmax(2).tolist()[0]
61 | predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indexs)
62 |             # CTC greedy decoding: collapse repeated frames, drop blank/pad tokens, stop at sep
63 |             for pos, (predicted_index, predicted_token) in enumerate(zip(predicted_indexs, predicted_tokens)):
64 |                 if pos > 0 and predicted_index == predicted_indexs[pos - 1]:
65 |                     continue
66 | if predicted_token == self.blank_token:
67 | continue
68 | if predicted_token == tok_pad(self.tokenizer):
69 | continue
70 | if predicted_token == tok_sep(self.tokenizer):
71 | break
72 |
73 | softmax_score = softmax(prediction_scores[0][pos], dim=0)
74 | max_item_id = torch.argmax(softmax_score, -1).item()
75 | max_item_prob = softmax_score[max_item_id].item()
76 | if max_return > 1:
77 | topK = torch.topk(softmax_score, max_return)
78 | for k, (prob, tid) in enumerate(zip(topK.values.data.tolist(), topK.indices.data.tolist())):
79 | topK_ids[k].append(tid)
80 | topK_probs[k] *= prob
81 | else:
82 | topK_ids[0].append(max_item_id)
83 | topK_probs[0] *= max_item_prob
84 | start += 1
85 |
86 | result_dict['prob_list'] = topK_probs
87 | result_dict['label_prob'] = [[self.tokenizer.decode(ids), prob] for ids, prob in
88 | zip(topK_ids, topK_probs)]
89 | result_dict['max_item'] = [i[0] for i in result_dict['label_prob']]
90 | outputs = result_dict
91 | else:
92 | targets = batch_data['target']
93 | negative_targets = batch_data['ntarget']
94 | input_lengths = batch_data['input_length']
95 | target_lengths = batch_data['target_length']
96 |
97 | target_tensors = torch.as_tensor(targets)
98 | input_length_tensors = torch.as_tensor(input_lengths)
99 | target_length_tensors = torch.as_tensor(target_lengths)
100 |
101 |             loss_tensors = torch.as_tensor(targets)  # same targets, reused for the auxiliary cross-entropy term
102 | negativeloss_tensors = torch.as_tensor(negative_targets)
103 | ctc_lm_loss = self.loss(prediction_scores,
104 | input_length_tensors,
105 | target_tensors.view(batch_size, -1),
106 | target_length_tensors)
107 |
108 | loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # -1 index = padding token
109 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.vocab_size),
110 | loss_tensors.view(-1))
111 | if not torch.all(negativeloss_tensors.eq(-1)).item():
112 | negative_loss_fct = NegativeCElLoss()
113 | negative_loss = negative_loss_fct(prediction_scores.view(-1, self.vocab_size),
114 | negativeloss_tensors.view(-1))
115 | masked_lm_loss += negative_loss
116 | outputs = ctc_lm_loss + masked_lm_loss
117 |
118 | return outputs
119 |
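120 | # Usage sketch (illustrative, not part of the original source):
121 | #     loss = model(batch, eval=False)                 # scalar training loss (CTC + CE)
122 | #     result = model(batch, eval=True, max_return=3)  # dict with 'max_item', 'label_prob', 'prob_list'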
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
22 | ## Getting started
23 |
24 | ### Installing via pip
25 | ```bash
26 | pip install tfkit
27 | ```
28 |
29 | * You can use tfkit for model training and evaluation with `tfkit-train` and `tfkit-eval`.
30 |
31 | ### Running TFKit on the task you want
32 |
33 | ### First step - prepare your dataset
34 | The key to combining different tasks is that they all share the same data format.
35 |
36 | **notice**
37 |
38 | * All data is in csv format - tfkit uses **csv** for every task; normally there are two columns, where the first column is the model input and the second column is the model output.
39 | * Plain text with no tokenization - there is no need to tokenize text before training or to recompute any tokenization; tfkit handles it for you.
40 | * No header row is needed.
41 |
42 | For example, a sentiment classification dataset will look like:
43 | ```csv
44 | how dare you,negative
45 | ```
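46 |
47 | A text generation dataset follows the same two-column convention - input first, expected output second. The summarization pair below is illustrative only:
48 | ```csv
49 | "a very long article about the risks of cloning ...","cloning carries serious risks"
50 | ```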
46 |
47 | !!! hint
48 |     For details and example formats of the different tasks, check [here](tasks/)
49 |
50 | !!! hint
51 |     nlprep is a tool for data splitting/preprocessing/augmentation; it can help you create ready-to-train data for tfkit, check [here](https://github.com/voidful/NLPrep)
52 |
53 | ### Second step - model training
54 |
55 | Use `tfkit-train` for model training.
56 |
57 | Before training a model, there are a few things you need to decide:
58 |
59 | - `--task` which task should handle this dataset? check [here](models/) for details of the available models.
60 | - `--config` which pretrained model do you want to use? search [https://huggingface.co/models](https://huggingface.co/models) for available pretrained models.
61 | - `--train` and `--test` paths of the training and testing datasets, in csv format.
62 | - `--savedir` model saving directory; defaults to the 'checkpoints' folder.
63 |
64 | You can leave the rest at the default configuration, or use `tfkit-train -h` to see more options.
65 |
66 | An example of training a sentiment classifier:
67 | ```bash
68 | tfkit-train \
69 | --task clas \
70 | --config xlm-roberta-base \
71 | --train training_data.csv \
72 | --test testing_data.csv \
73 | --lr 4e-5 \
74 | --maxlen 384 \
75 | --epoch 10 \
76 | --savedir roberta_sentiment_classifier
77 | ```
78 |
79 | ### Third step - model evaluation
80 |
81 | Use `tfkit-eval` for model evaluation.
82 | - `--model` the saved model's path.
83 | - `--metric` the evaluation metric, e.g. emf1, nlg (BLEU/ROUGE), clas (confusion matrix).
84 | - `--valid` the validation data, also in csv format.
85 | - `--panel` an input panel for model-specific parameters.
86 |
87 | For more configuration details, use `tfkit-eval -h`. An example invocation is shown below.
88 |
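89 | A hypothetical invocation, evaluating the sentiment classifier trained above (the checkpoint path, metric, and data file are illustrative assumptions, not fixed names):
90 | ```bash
91 | tfkit-eval \
92 |   --model roberta_sentiment_classifier/10.pt \
93 |   --metric clas \
94 |   --valid validation_data.csv
95 | ```
96 |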
89 | After evaluation, the result is printed to your console, and three reports are generated for debugging.
90 | - `*_score.csv` the overall score; a copy of the console result.
91 | - `*each_data_score.csv` the score of each example, with 3 columns `predicted,targets,score`, ranked from lowest to highest.
92 | - `*predicted.csv` a csv file with 3 columns `input,predicted,targets`.
93 |
94 | !!! hint
95 |     nlp2go is a demonstration tool with CLI and RESTful interfaces; check [here](https://github.com/voidful/nlp2go)
96 |
97 | ### Example
98 | #### Use distilbert to train NER Model
99 | ```bash
100 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
101 | tfkit-train --batch 10 --epoch 3 --lr 5e-6 --train ./clner_row/train --test ./clner_row/test --maxlen 512 --task tag --config distilbert-base-multilingual-cased
102 | nlp2go --task ./checkpoints/3.pt --cli
103 | ```
104 |
105 | #### Use Albert to train a DRCD Model
106 | ```bash
107 | nlprep --dataset qa_zh --outdir ./zhqa/
108 | tfkit-train --maxlen 512 --savedir ./drcd_qa_model/ --train ./zhqa/drcd-train --test ./zhqa/drcd-test --task qa --config voidful/albert_chinese_small --cache
109 | nlp2go --task ./drcd_qa_model/3.pt --cli
110 | ```
111 |
112 | #### Use Albert to train both DRCD and NER Models
113 | ```bash
114 | nlprep --dataset tag_clner --outdir ./clner_row --util s2t
115 | nlprep --dataset qa_zh --outdir ./zhqa/
116 | tfkit-train --maxlen 300 --savedir ./mt-qaner --train ./clner_row/train ./zhqa/drcd-train --test ./clner_row/test ./zhqa/drcd-test --task tag qa --config voidful/albert_chinese_small
117 | nlp2go --task ./mt-qaner/3.pt --cli
118 | ```
119 |
120 | **You can also try tfkit in [Google Colab](https://colab.research.google.com/drive/1hqaTKxd3VtX2XkvjiO0FMtY-rTZX30MJ?usp=sharing)**
121 |
122 | ## Contributing
123 | Thanks for your interest. There are many ways to contribute to this project. Get started [here](https://github.com/voidful/tfkit/blob/master/CONTRIBUTING.md).
124 |
125 | ## License
126 | 
127 |
128 | * [License](https://github.com/voidful/tfkit/blob/master/LICENSE)
129 |
130 | ## Icons reference
131 | Icons modified from Freepik, www.flaticon.com
132 | Icons modified from Nikita Golubev, www.flaticon.com
133 |
--------------------------------------------------------------------------------
/demo_data/mcq.csv:
--------------------------------------------------------------------------------
1 | "I 'm sure many of you have seen Star Wars , Jurassic Park , Multiplicity , or many of the other movies that describe cloning . Most of what you see in these movies is false . What you do n't know is that cloning could be dangerous , to the clone and to our society as a whole . I think human cloning is wrong mainly for four reasons . What about identity ? Humans are promised the right to their own personalities . What would happen if we ignore those rights by giving them someone else 's genetic identity ? True , Cloning may prevent people from possessing their identities . Also , these is a large power struggle here . Cloning means a degree of power and controls over another person 's physical identity and that ignores their rights and their only personalities . The person doing the cloning would have more power than any parent would have . Cloning would also deal with killing embryos . You might not have known , but Dolly , the sheep that was cloned in 1996 , was one of over 200 sheep embryos and hers was the only embryo that survived . The rest died or were thrown away . Imagine if the failure rate was that high when we started to clone humans . cloning means running the risk of wasting too much effort Cloning someone , at this present time , would be extremely dangerous to the birth mother and the clone . In studies done on cows , 4 out of 12 birth mothers died . There is a very high failure rate , which is shown in the cloning of Dolly . Even if you had a few good embryos , failures have been noticeable in animal tests . So , should we work ahead in the world of cloning ? I say no . the risks are greater than the benefits . It 's dangerous to the clone and to the birth mother . We would be killing human lives in the process . It would also be a violation of the clone 's right to its own genetic identity and personality .