├── NER ├── metrics │ ├── __init__.py │ ├── functional │ │ ├── __init__.py │ │ └── query_span_f1.py │ └── query_span_f1.py ├── models │ ├── __init__.py │ ├── query_ner_config.py │ ├── classifier.py │ └── bert_query_ner.py ├── ner2mrc │ ├── __init__.py │ ├── queries │ │ ├── zh_msra.json │ │ └── genia.json │ ├── download.md │ ├── genia2mrc.py │ └── msra2mrc.py ├── utils │ ├── __init__.py │ ├── convert_tf2torch.sh │ ├── radom_seed.py │ ├── get_parser.py │ └── bmes_decode.py ├── datasets │ ├── __init__.py │ ├── truncate_dataset.py │ ├── collate_functions.py │ ├── mrc_ner_dataset.py │ ├── compute_acc.py │ ├── compute_acc_linux.py │ └── doc-paragraph-sentence-id │ │ ├── mrc-ner.test-id │ │ └── mrc-ner.dev-id ├── loss │ ├── __init__.py │ ├── adaptive_dice_loss.py │ └── dice_loss.py ├── requirements.txt ├── scripts │ └── reproduce │ │ ├── zh_msra.sh │ │ ├── ace04.sh │ │ └── ace05.sh ├── parameters ├── evaluate.py ├── README.md └── trainer.py ├── RE ├── requirements.txt ├── data │ └── relation2id.txt ├── README.md ├── process_data.py ├── train_GRU.py ├── network.py ├── initial.py └── test_GRU.py ├── data └── annotation-guidelines.pdf ├── .gitignore ├── README.md └── LICENSE /NER/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NER/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NER/ner2mrc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NER/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NER/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NER/metrics/functional/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NER/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .dice_loss import DiceLoss 2 | -------------------------------------------------------------------------------- /RE/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.4 2 | scikit_learn==0.23.2 3 | jieba==0.42.1 4 | -------------------------------------------------------------------------------- /NER/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | pytorch-lightning==0.9.0 3 | tokenizers 4 | transformers 5 | -------------------------------------------------------------------------------- /data/annotation-guidelines.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/changdejie/diaKG-code/HEAD/data/annotation-guidelines.pdf -------------------------------------------------------------------------------- /NER/ner2mrc/queries/zh_msra.json: -------------------------------------------------------------------------------- 1 | { 2 | "NR": "人名和虚构的人物形象", 3 | "NS": "按照地理位置划分的国家,城市,乡镇,大洲", 4 | 
"NT": "组织包括公司,政府党派,学校,政府,新闻机构" 5 | } -------------------------------------------------------------------------------- /NER/models/query_ner_config.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | from transformers import BertConfig 5 | 6 | 7 | class BertQueryNerConfig(BertConfig): 8 | def __init__(self, **kwargs): 9 | super(BertQueryNerConfig, self).__init__(**kwargs) 10 | self.mrc_dropout = kwargs.get("mrc_dropout", 0.1) 11 | -------------------------------------------------------------------------------- /NER/utils/convert_tf2torch.sh: -------------------------------------------------------------------------------- 1 | # convert tf model to pytorch format 2 | 3 | export BERT_BASE_DIR=/mnt/mrc/wwm_uncased_L-24_H-1024_A-16 4 | 5 | transformers-cli convert --model_type bert \ 6 | --tf_checkpoint $BERT_BASE_DIR/model.ckpt \ 7 | --config $BERT_BASE_DIR/config.json \ 8 | --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin 9 | -------------------------------------------------------------------------------- /NER/ner2mrc/queries/genia.json: -------------------------------------------------------------------------------- 1 | { 2 | "DNA": "deoxyribonucleic acid", 3 | "RNA": "ribonucleic acid", 4 | "cell_line": "cell line", 5 | "cell_type": "cell type", 6 | "protein": "protein entities are limited to nitrogenous organic compounds and are parts of all living organisms, as structural components of body tissues such as muscle, hair, collagen and as enzymes and antibodies." 7 | } 8 | -------------------------------------------------------------------------------- /RE/data/relation2id.txt: -------------------------------------------------------------------------------- 1 | Rel_Method_Drug 0 2 | Rel_Test_items_Disease 1 3 | Rel_Anatomy_Disease 2 4 | Rel_Drug_Disease 3 5 | Rel_SideEff_Disease 4 6 | Rel_Treatment_Disease 5 7 | Rel_Pathogenesis_Disease 6 8 | Rel_Frequency_Drug 7 9 | Rel_Test_Disease 8 10 | Rel_Operation_Disese 9 11 | Rel_Symptom_Disease 10 12 | Rel_Type_Disease 11 13 | Rel_Amount_Drug 12 14 | Rel_SideEff_Drug 13 15 | Rel_Reason_Disease 14 16 | Rel_Duration_Drug 15 -------------------------------------------------------------------------------- /NER/utils/radom_seed.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import numpy as np 5 | import torch 6 | 7 | 8 | def set_random_seed(seed: int): 9 | """set seeds for reproducibility""" 10 | np.random.seed(seed) 11 | torch.manual_seed(seed) 12 | torch.backends.cudnn.deterministic = True 13 | torch.backends.cudnn.benchmark = False 14 | 15 | 16 | if __name__ == '__main__': 17 | # without this line, x would be different in every execution. 
18 | set_random_seed(0) 19 | 20 | x = np.random.random() 21 | print(x) 22 | -------------------------------------------------------------------------------- /NER/ner2mrc/download.md: -------------------------------------------------------------------------------- 1 | ## Download Processed MRC-NER Datasets 2 | ZH: 3 | - [MSRA](https://drive.google.com/file/d/1bAoSJfT1IBdpbQWSrZPjQPPbAsDGlN2D/view?usp=sharing) 4 | - [OntoNotes4](https://drive.google.com/file/d/1CRVgZJDDGuj0O1NLK5DgujQBTLKyMR-g/view?usp=sharing) 5 | 6 | EN: 7 | - [CoNLL03](https://drive.google.com/file/d/1COt5bSHgwfl3oIZ6sCBVAenJKlfy3LI_/view?usp=sharing) 8 | - [ACE2004](https://drive.google.com/file/d/1zxLjecKK7CeLjxvPa-9QU9xsRJTVI5vb/view?usp=sharing) 9 | - [ACE2005](https://drive.google.com/file/d/1yxfwlrBmYIECqL_4K5xRve-pfBeIt58z/view?usp=sharing) 10 | -------------------------------------------------------------------------------- /NER/datasets/truncate_dataset.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from torch.utils.data import Dataset 4 | 5 | 6 | class TruncateDataset(Dataset): 7 | """Truncate dataset to certain num""" 8 | def __init__(self, dataset: Dataset, max_num: int = 100): 9 | self.dataset = dataset 10 | self.max_num = min(max_num, len(self.dataset)) 11 | 12 | def __len__(self): 13 | return self.max_num 14 | 15 | def __getitem__(self, item): 16 | return self.dataset[item] 17 | 18 | def __getattr__(self, item): 19 | """other dataset func""" 20 | return getattr(self.dataset, item) 21 | -------------------------------------------------------------------------------- /NER/metrics/query_span_f1.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | from pytorch_lightning.metrics.metric import TensorMetric 5 | from .functional.query_span_f1 import query_span_f1 6 | 7 | 8 | class QuerySpanF1(TensorMetric): 9 | """ 10 | Query Span F1 11 | Args: 12 | flat: is flat-ner 13 | """ 14 | def __init__(self, reduce_group=None, reduce_op=None, flat=False): 15 | super(QuerySpanF1, self).__init__(name="query_span_f1", 16 | reduce_group=reduce_group, 17 | reduce_op=reduce_op) 18 | self.flat = flat 19 | 20 | def forward(self, start_preds, end_preds, match_logits, start_label_mask, end_label_mask, match_labels): 21 | return query_span_f1(start_preds, end_preds, match_logits, start_label_mask, end_label_mask, match_labels, 22 | flat=self.flat) 23 | -------------------------------------------------------------------------------- /NER/scripts/reproduce/zh_msra.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="$PWD" 2 | export TOKENIZERS_PARALLELISM=false 3 | DATA_DIR="/mnt/mrc/zh_msra" 4 | BERT_DIR="/mnt/mrc/chinese_roberta_wwm_large_ext_pytorch" 5 | SPAN_WEIGHT=0.1 6 | DROPOUT=0.2 7 | LR=8e-6 8 | MAXLEN=128 9 | 10 | OUTPUT_DIR="/mnt/mrc/train_logs/zh_msra/zh_msra_bertlarge_lr${LR}20200913_dropout${DROPOUT}_bsz16_maxlen${MAXLEN}" 11 | 12 | mkdir -p $OUTPUT_DIR 13 | 14 | python trainer.py \ 15 | --chinese \ 16 | --data_dir $DATA_DIR \ 17 | --bert_config_dir $BERT_DIR \ 18 | --max_length $MAXLEN \ 19 | --batch_size 4 \ 20 | --gpus="0,1,2,3" \ 21 | --precision=16 \ 22 | --progress_bar_refresh_rate 1 \ 23 | --lr ${LR} \ 24 | --distributed_backend=ddp \ 25 | --val_check_interval 0.5 \ 26 | --accumulate_grad_batches 1 \ 27 | --default_root_dir $OUTPUT_DIR \ 28 | --mrc_dropout $DROPOUT \ 29 | --max_epochs 20 \ 30 | --weight_span $SPAN_WEIGHT \ 31 | 
--span_loss_candidates "pred_and_gold" 32 | -------------------------------------------------------------------------------- /NER/scripts/reproduce/ace04.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="$PWD" 2 | DATA_DIR="/mnt/mrc/ace2004" 3 | BERT_DIR="/mnt/mrc/bert-large-uncased" 4 | 5 | BERT_DROPOUT=0.1 6 | MRC_DROPOUT=0.3 7 | LR=3e-5 8 | SPAN_WEIGHT=0.1 9 | WARMUP=0 10 | MAXLEN=128 11 | MAXNORM=1.0 12 | 13 | OUTPUT_DIR="/mnt/mrc/train_logs/ace2004/ace2004_20200915reproduce_lr${LR}_drop${MRC_DROPOUT}_norm${MAXNORM}_bsz32_hard_span_weight${SPAN_WEIGHT}_warmup${WARMUP}_maxlen${MAXLEN}_newtrunc_debug" 14 | mkdir -p $OUTPUT_DIR 15 | python trainer.py \ 16 | --data_dir $DATA_DIR \ 17 | --bert_config_dir $BERT_DIR \ 18 | --max_length $MAXLEN \ 19 | --batch_size 4 \ 20 | --gpus="0,1,2,3" \ 21 | --precision=16 \ 22 | --progress_bar_refresh_rate 1 \ 23 | --lr $LR \ 24 | --distributed_backend=ddp \ 25 | --val_check_interval 0.5 \ 26 | --accumulate_grad_batches 2 \ 27 | --default_root_dir $OUTPUT_DIR \ 28 | --mrc_dropout $MRC_DROPOUT \ 29 | --bert_dropout $BERT_DROPOUT \ 30 | --max_epochs 20 \ 31 | --span_loss_candidates "pred_and_gold" \ 32 | --weight_span $SPAN_WEIGHT \ 33 | --warmup_steps $WARMUP \ 34 | --max_length $MAXLEN \ 35 | --gradient_clip_val $MAXNORM 36 | -------------------------------------------------------------------------------- /NER/scripts/reproduce/ace05.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="$PWD" 2 | DATA_DIR="/mnt/mrc/ace2005" 3 | BERT_DIR="/mnt/mrc/wwm_uncased_L-24_H-1024_A-16" 4 | 5 | BERT_DROPOUT=0.1 6 | MRC_DROPOUT=0.4 7 | LR=1e-5 8 | SPAN_WEIGHT=0.1 9 | WARMUP=0 10 | MAXLEN=128 11 | MAXNORM=1.0 12 | 13 | OUTPUT_DIR="/mnt/mrc/train_logs/ace2005/ace2005_20200917_wwmlarge_sgd_warm${WARMUP}lr${LR}_drop${MRC_DROPOUT}_norm${MAXNORM}_bsz32_gold_span_weight${SPAN_WEIGHT}_warmup${WARMUP}_maxlen${MAXLEN}" 14 | mkdir -p $OUTPUT_DIR 15 | 16 | python trainer.py \ 17 | --data_dir $DATA_DIR \ 18 | --bert_config_dir $BERT_DIR \ 19 | --max_length $MAXLEN \ 20 | --batch_size 8 \ 21 | --gpus="0,1,2,3" \ 22 | --precision=16 \ 23 | --progress_bar_refresh_rate 1 \ 24 | --lr $LR \ 25 | --distributed_backend=ddp \ 26 | --val_check_interval 0.25 \ 27 | --accumulate_grad_batches 1 \ 28 | --default_root_dir $OUTPUT_DIR \ 29 | --mrc_dropout $MRC_DROPOUT \ 30 | --bert_dropout $BERT_DROPOUT \ 31 | --max_epochs 20 \ 32 | --span_loss_candidates "pred_and_gold" \ 33 | --weight_span $SPAN_WEIGHT \ 34 | --warmup_steps $WARMUP \ 35 | --max_length $MAXLEN \ 36 | --gradient_clip_val $MAXNORM \ 37 | --optimizer "adamw" 38 | -------------------------------------------------------------------------------- /NER/models/classifier.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | 7 | 8 | class SingleLinearClassifier(nn.Module): 9 | def __init__(self, hidden_size, num_label): 10 | super(SingleLinearClassifier, self).__init__() 11 | self.num_label = num_label 12 | self.classifier = nn.Linear(hidden_size, num_label) 13 | 14 | def forward(self, input_features): 15 | features_output = self.classifier(input_features) 16 | return features_output 17 | 18 | 19 | class MultiNonLinearClassifier(nn.Module): 20 | def __init__(self, hidden_size, num_label, dropout_rate): 21 | super(MultiNonLinearClassifier, self).__init__() 22 | self.num_label = num_label 23 | 
self.classifier1 = nn.Linear(hidden_size, hidden_size) 24 | self.classifier2 = nn.Linear(hidden_size, num_label) 25 | self.dropout = nn.Dropout(dropout_rate) 26 | 27 | def forward(self, input_features): 28 | features_output1 = self.classifier1(input_features) 29 | # features_output1 = F.relu(features_output1) 30 | features_output1 = F.gelu(features_output1) 31 | features_output1 = self.dropout(features_output1) 32 | features_output2 = self.classifier2(features_output1) 33 | return features_output2 34 | -------------------------------------------------------------------------------- /NER/utils/get_parser.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import argparse 5 | 6 | 7 | def get_parser() -> argparse.ArgumentParser: 8 | """ 9 | return basic arg parser 10 | """ 11 | parser = argparse.ArgumentParser(description="Training") 12 | 13 | parser.add_argument("--data_dir", type=str, required=False, default="E:\\data\\nested",help="data dir") 14 | parser.add_argument("--bert_config_dir", type=str, required=False, default="E:\data\chinese_roberta_wwm_large_ext_pytorch",help="bert config dir") 15 | parser.add_argument("--pretrained_checkpoint", default="", type=str, help="pretrained checkpoint path") 16 | parser.add_argument("--max_length", type=int, default=128, help="max length of dataset") 17 | parser.add_argument("--batch_size", type=int, default=2, help="batch size") 18 | parser.add_argument("--lr", type=float, default=2e-5, help="learning rate") 19 | parser.add_argument("--workers", type=int, default=0, help="num workers for dataloader") 20 | parser.add_argument("--weight_decay", default=0.01, type=float, 21 | help="Weight decay if we apply some.") 22 | parser.add_argument("--warmup_steps", default=0, type=int, 23 | help="warmup steps used for scheduler.") 24 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 25 | help="Epsilon for Adam optimizer.") 26 | 27 | return parser 28 | -------------------------------------------------------------------------------- /NER/datasets/collate_functions.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import torch 4 | from typing import List 5 | 6 | 7 | def collate_to_max_length(batch: List[List[torch.Tensor]]) -> List[torch.Tensor]: 8 | """ 9 | pad to maximum length of this batch 10 | Args: 11 | batch: a batch of samples, each contains a list of field data(Tensor): 12 | tokens, token_type_ids, start_labels, end_labels, start_label_mask, end_label_mask, match_labels, sample_idx, label_idx 13 | Returns: 14 | output: list of field batched data, which shape is [batch, max_length] 15 | """ 16 | batch_size = len(batch) 17 | max_length = max(x[0].shape[0] for x in batch) 18 | output = [] 19 | 20 | for field_idx in range(6): 21 | pad_output = torch.full([batch_size, max_length], 0, dtype=batch[0][field_idx].dtype) 22 | for sample_idx in range(batch_size): 23 | data = batch[sample_idx][field_idx] 24 | pad_output[sample_idx][: data.shape[0]] = data 25 | output.append(pad_output) 26 | 27 | pad_match_labels = torch.zeros([batch_size, max_length, max_length], dtype=torch.long) 28 | for sample_idx in range(batch_size): 29 | data = batch[sample_idx][6] 30 | pad_match_labels[sample_idx, : data.shape[1], : data.shape[1]] = data 31 | output.append(pad_match_labels) 32 | 33 | output.append(torch.stack([x[-2] for x in batch])) 34 | output.append(torch.stack([x[-1] for x in batch])) 35 | 36 | return output 37 | 
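# --- Added usage note (not part of the original file): a minimal sketch of how
# this collate function is typically wired into a torch DataLoader; the dataset
# variable name below is illustrative.
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(mrc_ner_dataset, batch_size=4,
#                       collate_fn=collate_to_max_length)
#
# Each resulting batch then contains the six [batch, max_length] padded fields,
# the [batch, max_length, max_length] match-label matrix, and the stacked
# sample_idx / label_idx tensors, in that order.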
-------------------------------------------------------------------------------- /NER/parameters: -------------------------------------------------------------------------------- 1 | accumulate_grad_batches: 1 2 | adam_epsilon: 1.0e-08 3 | amp_backend: native 4 | amp_level: O2 5 | auto_lr_find: false 6 | auto_scale_batch_size: false 7 | auto_select_gpus: false 8 | batch_size: 32 9 | benchmark: false 10 | bert_config_dir: chinese_roberta_wwm_large_ext_pytorch 11 | bert_dropout: 0.1 12 | check_val_every_n_epoch: 1 13 | checkpoint_callback: true 14 | chinese: false 15 | data_dir: nested 16 | default_root_dir: null 17 | deterministic: false 18 | dice_smooth: 1.0e-08 19 | distributed_backend: null 20 | early_stop_callback: false 21 | fast_dev_run: false 22 | final_div_factor: 10000.0 23 | flat: false 24 | gradient_clip_val: 0 25 | limit_test_batches: 1.0 26 | limit_train_batches: 1.0 27 | limit_val_batches: 1.0 28 | log_gpu_memory: null 29 | log_save_interval: 100 30 | logger: true 31 | loss_type: bce 32 | lr: 2.0e-05 33 | gpus: 0 34 | max_epochs: 10 35 | max_length: 128 36 | max_steps: null 37 | min_epochs: 1 38 | min_steps: null 39 | mrc_dropout: 0.1 40 | num_nodes: 1 41 | num_processes: 1 42 | num_sanity_val_steps: 2 43 | optimizer: adamw 44 | overfit_batches: 0.0 45 | overfit_pct: null 46 | precision: 32 47 | prepare_data_per_node: true 48 | pretrained_checkpoint: '' 49 | process_position: 0 50 | profiler: null 51 | progress_bar_refresh_rate: 1 52 | reload_dataloaders_every_epoch: false 53 | replace_sampler_ddp: true 54 | resume_from_checkpoint: null 55 | row_log_interval: 50 56 | span_loss_candidates: all 57 | sync_batchnorm: false 58 | terminate_on_nan: false 59 | test_percent_check: null 60 | track_grad_norm: -1 61 | train_percent_check: null 62 | truncated_bptt_steps: null 63 | val_check_interval: 1.0 64 | val_percent_check: null 65 | warmup_steps: 0 66 | weight_decay: 0.01 67 | weight_end: 1.0 68 | weight_span: 1.0 69 | weight_start: 1.0 70 | weights_save_path: null 71 | weights_summary: top 72 | workers: 0 73 | -------------------------------------------------------------------------------- /NER/ner2mrc/genia2mrc.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import os 5 | from utils.bmes_decode import bmes_decode 6 | import json 7 | 8 | 9 | def convert_file(input_file, output_file, tag2query_file): 10 | """ 11 | Convert GENIA(xiaoya) data to MRC format 12 | """ 13 | all_data = json.load(open(input_file)) 14 | tag2query = json.load(open(tag2query_file)) 15 | 16 | output = [] 17 | origin_count = 0 18 | new_count = 0 19 | 20 | for data in all_data: 21 | origin_count += 1 22 | context = data["context"] 23 | label2positions = data["label"] 24 | for tag_idx, (tag, query) in enumerate(tag2query.items()): 25 | positions = label2positions.get(tag, []) 26 | mrc_sample = { 27 | "context": context, 28 | "query": query, 29 | "start_position": [int(x.split(";")[0]) for x in positions], 30 | "end_position": [int(x.split(";")[1]) for x in positions], 31 | "qas_id": f"{origin_count}.{tag_idx}" 32 | } 33 | output.append(mrc_sample) 34 | new_count += 1 35 | 36 | json.dump(output, open(output_file, "w"), ensure_ascii=False, indent=2) 37 | print(f"Convert {origin_count} samples to {new_count} samples and save to {output_file}") 38 | 39 | 40 | def main(): 41 | genia_raw_dir = "/mnt/mrc/genia/genia_raw" 42 | genia_mrc_dir = "/mnt/mrc/genia/genia_raw/mrc_format" 43 | tag2query_file = "queries/genia.json" 44 | os.makedirs(genia_mrc_dir, 
exist_ok=True) 45 | for phase in ["train", "dev", "test"]: 46 | old_file = os.path.join(genia_raw_dir, f"{phase}.genia.json") 47 | new_file = os.path.join(genia_mrc_dir, f"mrc-ner.{phase}") 48 | convert_file(old_file, new_file, tag2query_file) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /NER/evaluate.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import os 5 | from pytorch_lightning import Trainer 6 | 7 | from trainer import BertLabeling 8 | 9 | 10 | def evaluate(ckpt, hparams_file): 11 | """main""" 12 | 13 | trainer = Trainer(distributed_backend="ddp") 14 | 15 | model = BertLabeling.load_from_checkpoint( 16 | checkpoint_path=ckpt, 17 | hparams_file=hparams_file, 18 | map_location=None, 19 | batch_size=1, 20 | max_length=128, 21 | workers=0 22 | ) 23 | trainer.test(model=model) 24 | 25 | if __name__ == '__main__': 26 | # ace04 27 | # HPARAMS = "/mnt/mrc/train_logs/ace2004/ace2004_20200911reproduce_epoch15_lr3e-5_drop0.3_norm1.0_bsz32_hard_span_weight0.1_warmup0_maxlen128_newtrunc_debug/lightning_logs/version_0/hparams.yaml" 28 | # CHECKPOINTS = "/mnt/mrc/train_logs/ace2004/ace2004_20200911reproduce_epoch15_lr3e-5_drop0.3_norm1.0_bsz32_hard_span_weight0.1_warmup0_maxlen128_newtrunc_debug/epoch=10_v0.ckpt" 29 | # DIR = "/mnt/mrc/train_logs/ace2004/ace2004_20200910_lr3e-5_drop0.3_bert0.1_bsz32_hard_loss_bce_weight_span0.05" 30 | # CHECKPOINTS = [os.path.join(DIR, x) for x in os.listdir(DIR)] 31 | 32 | # ace04-large 33 | # HPARAMS = "/mnt/mrc/train_logs/ace2004/ace2004_20200910reproduce_lr3e-5_drop0.3_norm1.0_bsz32_hard_span_weight0.1_warmup0_maxlen128_newtrunc_debug/lightning_logs/version_2/hparams.yaml" 34 | # CHECKPOINTS = "/mnt/mrc/train_logs/ace2004/ace2004_20200910reproduce_lr3e-5_drop0.3_norm1.0_bsz32_hard_span_weight0.1_warmup0_maxlen128_newtrunc_debug/epoch=10.ckpt" 35 | 36 | # ace05 37 | # HPARAMS = "/mnt/mrc/train_logs/ace2005/ace2005_20200911_lr3e-5_drop0.3_norm1.0_bsz32_hard_span_weight0.1_warmup0_maxlen128_newtrunc_debug/lightning_logs/version_0/hparams.yaml" 38 | # CHECKPOINTS = "/mnt/mrc/train_logs/ace2005/ace2005_20200911_lr3e-5_drop0.3_norm1.0_bsz32_hard_span_weight0.1_warmup0_maxlen128_newtrunc_debug/epoch=15.ckpt" 39 | 40 | # zh_msra 41 | CHECKPOINTS = "E:\\data\\modelNER\\version_7\\checkpoints\\epoch=2.ckpt" 42 | HPARAMS = "E:\\data\\modelNER\\version_7\\hparams.yaml" 43 | 44 | evaluate(ckpt=CHECKPOINTS, hparams_file=HPARAMS) 45 | -------------------------------------------------------------------------------- /NER/ner2mrc/msra2mrc.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import os 5 | from utils.bmes_decode import bmes_decode 6 | import json 7 | 8 | 9 | def convert_file(input_file, output_file, tag2query_file): 10 | """ 11 | Convert MSRA raw data to MRC format 12 | """ 13 | origin_count = 0 14 | new_count = 0 15 | tag2query = json.load(open(tag2query_file)) 16 | mrc_samples = [] 17 | with open(input_file) as fin: 18 | for line in fin: 19 | line = line.strip() 20 | if not line: 21 | continue 22 | origin_count += 1 23 | src, labels = line.split("\t") 24 | tags = bmes_decode(char_label_list=[(char, label) for char, label in zip(src.split(), labels.split())]) 25 | for label, query in tag2query.items(): 26 | mrc_samples.append( 27 | { 28 | "context": src, 29 | "start_position": [tag.begin for tag in tags if tag.tag == label], 30 | 
"end_position": [tag.end-1 for tag in tags if tag.tag == label], 31 | "query": query 32 | } 33 | ) 34 | new_count += 1 35 | 36 | json.dump(mrc_samples, open(output_file, "w"), ensure_ascii=False, sort_keys=True, indent=2) 37 | print(f"Convert {origin_count} samples to {new_count} samples and save to {output_file}") 38 | 39 | 40 | def main(): 41 | # msra_raw_dir = "/mnt/mrc/zh_msra_yuxian" 42 | # msra_mrc_dir = "/mnt/mrc/zh_msra_yuxian/mrc_format" 43 | msra_raw_dir = "./queries/zh_msra" 44 | msra_mrc_dir = "/mnt/mrc/zh_msra/mrc_format" 45 | tag2query_file = "queries/zh_msra.json" 46 | os.makedirs(msra_mrc_dir, exist_ok=True) 47 | for phase in ["train", "dev", "test"]: 48 | old_file = os.path.join(msra_raw_dir, f"{phase}.tsv") 49 | new_file = os.path.join(msra_mrc_dir, f"mrc-ner.{phase}") 50 | # old_file = os.path.join(msra_raw_dir, f"mrc-ner.{phase}") 51 | # new_file = os.path.join(msra_mrc_dir, f"mrc-ner.{phase}") 52 | convert_file(old_file, new_file, tag2query_file) 53 | 54 | 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /NER/utils/bmes_decode.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | from typing import Tuple, List 5 | 6 | 7 | class Tag(object): 8 | def __init__(self, term, tag, begin, end): 9 | self.term = term 10 | self.tag = tag 11 | self.begin = begin 12 | self.end = end 13 | 14 | def to_tuple(self): 15 | return tuple([self.term, self.begin, self.end]) 16 | 17 | def __str__(self): 18 | return str({key: value for key, value in self.__dict__.items()}) 19 | 20 | def __repr__(self): 21 | return str({key: value for key, value in self.__dict__.items()}) 22 | 23 | 24 | def bmes_decode(char_label_list: List[Tuple[str, str]]) -> List[Tag]: 25 | """ 26 | decode inputs to tags 27 | Args: 28 | char_label_list: list of tuple (word, bmes-tag) 29 | Returns: 30 | tags 31 | Examples: 32 | >>> x = [("Hi", "O"), ("Beijing", "S-LOC")] 33 | >>> bmes_decode(x) 34 | [{'term': 'Beijing', 'tag': 'LOC', 'begin': 1, 'end': 2}] 35 | """ 36 | idx = 0 37 | length = len(char_label_list) 38 | tags = [] 39 | while idx < length: 40 | term, label = char_label_list[idx] 41 | current_label = label[0] 42 | 43 | # correct labels 44 | if current_label in ["M", "E"]: 45 | current_label = "B" 46 | if idx + 1 == length and current_label == "B": 47 | current_label = "S" 48 | 49 | # merge chars 50 | if current_label == "O": 51 | idx += 1 52 | continue 53 | if current_label == "S": 54 | tags.append(Tag(term, label[2:], idx, idx + 1)) 55 | idx += 1 56 | continue 57 | if current_label == "B": 58 | end = idx + 1 59 | while end + 1 < length and char_label_list[end][1][0] == "M": 60 | end += 1 61 | if char_label_list[end][1][0] == "E": # end with E 62 | entity = "".join(char_label_list[i][0] for i in range(idx, end + 1)) 63 | tags.append(Tag(entity, label[2:], idx, end + 1)) 64 | idx = end + 1 65 | else: # end with M/B 66 | entity = "".join(char_label_list[i][0] for i in range(idx, end)) 67 | tags.append(Tag(entity, label[2:], idx, end)) 68 | idx = end 69 | continue 70 | else: 71 | raise Exception("Invalid Inputs") 72 | return tags 73 | -------------------------------------------------------------------------------- /NER/README.md: -------------------------------------------------------------------------------- 1 | # A Unified MRC Framework for Named Entity Recognition 2 | The repository contains the code of the recent research advances in 
[Shannon.AI](http://www.shannonai.com). 3 | 4 | **A Unified MRC Framework for Named Entity Recognition**
5 | Xiaoya Li, Jingrong Feng, Yuxian Meng, Qinghong Han, Fei Wu and Jiwei Li
6 | In ACL 2020. [paper](https://arxiv.org/abs/1910.11476)
7 | If you find this repo helpful, please cite the following:
8 | ```latex
9 | @article{li2019unified,
10 | title={A Unified MRC Framework for Named Entity Recognition},
11 | author={Li, Xiaoya and Feng, Jingrong and Meng, Yuxian and Han, Qinghong and Wu, Fei and Li, Jiwei},
12 | journal={arXiv preprint arXiv:1910.11476},
13 | year={2019}
14 | }
15 | ```
16 | For any questions, please feel free to post GitHub issues.
17 | 18 | ## Install Requirements 19 | `pip install -r requirements.txt` 20 | 21 | We build our project on [pytorch-lightning.](https://github.com/PyTorchLightning/pytorch-lightning) 22 | If you want to know more about the arguments used in our training scripts, please 23 | refer to [pytorch-lightning documentation.](https://pytorch-lightning.readthedocs.io/en/latest/) 24 | 25 | ## Prepare Datasets 26 | You can [download](./ner2mrc/download.md) our preprocessed MRC-NER datasets or 27 | write your own preprocess scripts. We provide `ner2mrc/mrsa2mrc.py` for reference. 28 | 29 | ## Prepare Models 30 | For English Datasets, we use [BERT-Large](https://github.com/google-research/bert) 31 | 32 | For Chinese Datasets, we use [RoBERTa-wwm-ext-large](https://github.com/ymcui/Chinese-BERT-wwm) 33 | 34 | ## Train 35 | The main training procedure is in `trainer.py` 36 | 37 | Examples to start training are in `scripts/reproduce`. 38 | 39 | Note that you may need to change `DATA_DIR`, `BERT_DIR`, `OUTPUT_DIR` to your own 40 | dataset path, bert model path and log path, respectively. 41 | 42 | ## Evaluate 43 | `trainer.py` will automatically evaluate on dev set every `val_check_interval` epochs, 44 | and save the topk checkpoints to `default_root_dir`. 45 | 46 | To evaluate them, use `evaluate.py` 47 | 48 | # 模型经改造后使用如下 49 | ## 训练模型 50 | ``` 51 | 训练的时候依据自己的机器情况设置参数,参数过大容易导致显存溢出 52 | python trainer.py --data_dir entity_type_data --bert_config models/chinese_roberta_wwm_large_ext_pytorch --batch_size 16 --max_epochs 10 --gpus 1 53 | ``` 54 | ## 评估模型 55 | ``` 56 | 需要修改evaluate里面新模型的路径 57 | python evaluate.py 58 | 59 | ``` 60 | ## 统计详细不同实体信息 61 | ``` 62 | datasets/compute_acc_linux.py or datasets/compute_acc.py 63 | 64 | ``` 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | .idea/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # mac 133 | .DS_Store 134 | -------------------------------------------------------------------------------- /NER/models/bert_query_ner.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | from transformers import BertModel, BertPreTrainedModel 7 | 8 | from models.classifier import MultiNonLinearClassifier, SingleLinearClassifier 9 | 10 | 11 | class BertQueryNER(BertPreTrainedModel): 12 | def __init__(self, config): 13 | super(BertQueryNER, self).__init__(config) 14 | self.bert = BertModel(config) 15 | 16 | # self.start_outputs = nn.Linear(config.hidden_size, 2) 17 | # self.end_outputs = nn.Linear(config.hidden_size, 2) 18 | self.start_outputs = nn.Linear(config.hidden_size, 1) 19 | self.end_outputs = nn.Linear(config.hidden_size, 1) 20 | self.span_embedding = MultiNonLinearClassifier(config.hidden_size * 2, 1, config.mrc_dropout) 21 | # self.span_embedding = SingleLinearClassifier(config.hidden_size * 2, 1) 22 | 23 | self.hidden_size = config.hidden_size 24 | 25 | self.init_weights() 26 | 27 | def forward(self, input_ids, token_type_ids=None, attention_mask=None): 28 | """ 29 | Args: 30 | input_ids: bert input tokens, tensor of shape [seq_len] 31 | token_type_ids: 0 for query, 1 for context, tensor of shape [seq_len] 32 | attention_mask: attention mask, tensor of shape [seq_len] 33 | Returns: 34 | start_logits: start/non-start probs of shape [seq_len] 35 | end_logits: end/non-end probs of shape [seq_len] 36 | match_logits: start-end-match probs of shape [seq_len, 1] 37 | """ 38 | 39 | bert_outputs = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) 40 | sequence_heatmap = bert_outputs[0] # [batch, seq_len, hidden] 41 | batch_size, seq_len, hid_size = sequence_heatmap.size() 42 | 43 | start_logits = self.start_outputs(sequence_heatmap).squeeze(-1) # [batch, seq_len, 1] 44 | end_logits = self.end_outputs(sequence_heatmap).squeeze(-1) # [batch, seq_len, 1] 45 | 46 | # for every position $i$ in sequence, should concate $j$ to 47 | # predict if $i$ and $j$ are start_pos and end_pos for an entity. 
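# (added note, not in the original source) start_extend / end_extend below
# materialise a [batch, seq_len, seq_len, hidden*2] span matrix, so memory
# grows quadratically with sequence length; the reproduce scripts keep
# --max_length at 128, which bounds this cost.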
48 | # [batch, seq_len, seq_len, hidden] 49 | start_extend = sequence_heatmap.unsqueeze(2).expand(-1, -1, seq_len, -1) 50 | # [batch, seq_len, seq_len, hidden] 51 | end_extend = sequence_heatmap.unsqueeze(1).expand(-1, seq_len, -1, -1) 52 | # [batch, seq_len, seq_len, hidden*2] 53 | span_matrix = torch.cat([start_extend, end_extend], 3) 54 | # [batch, seq_len, seq_len] 55 | span_logits = self.span_embedding(span_matrix).squeeze(-1) 56 | 57 | return start_logits, end_logits, span_logits 58 | -------------------------------------------------------------------------------- /NER/loss/adaptive_dice_loss.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch import Tensor 7 | from typing import Optional 8 | 9 | 10 | class AdaptiveDiceLoss(nn.Module): 11 | """ 12 | Dice coefficient for short, is an F1-oriented statistic used to gauge the similarity of two sets. 13 | 14 | Math Function: 15 | https://arxiv.org/abs/1911.02855.pdf 16 | adaptive_dice_loss(p, y) = 1 - numerator / denominator 17 | numerator = 2 * \sum_{1}^{t} (1 - p_i) ** alpha * p_i * y_i + smooth 18 | denominator = \sum_{1}^{t} (1 - p_i) ** alpha * p_i + \sum_{1} ^{t} y_i + smooth 19 | 20 | Args: 21 | alpha: alpha in math function 22 | smooth (float, optional): smooth in math function 23 | square_denominator (bool, optional): [True, False], specifies whether to square the denominator in the loss function. 24 | with_logits (bool, optional): [True, False], specifies whether the input tensor is normalized by Sigmoid/Softmax funcs. 25 | True: the loss combines a `sigmoid` layer and the `BCELoss` in one single class. 26 | False: the loss contains `BCELoss`. 27 | Shape: 28 | - input: (*) 29 | - target: (*) 30 | - mask: (*) 0,1 mask for the input sequence. 
31 | - Output: Scalar loss 32 | Examples: 33 | >>> loss = AdaptiveDiceLoss() 34 | >>> input = torch.randn(3, 1, requires_grad=True) 35 | >>> target = torch.empty(3, dtype=torch.long).random_(5) 36 | >>> output = loss(input, target) 37 | >>> output.backward() 38 | """ 39 | def __init__(self, 40 | alpha: float = 0.1, 41 | smooth: Optional[float] = 1e-8, 42 | square_denominator: Optional[bool] = False, 43 | with_logits: Optional[bool] = True, 44 | reduction: Optional[str] = "mean") -> None: 45 | super(AdaptiveDiceLoss, self).__init__() 46 | 47 | self.reduction = reduction 48 | self.with_logits = with_logits 49 | self.alpha = alpha 50 | self.smooth = smooth 51 | self.square_denominator = square_denominator 52 | 53 | def forward(self, 54 | input: Tensor, 55 | target: Tensor, 56 | mask: Optional[Tensor] = None) -> Tensor: 57 | 58 | flat_input = input.view(-1) 59 | flat_target = target.view(-1) 60 | 61 | if self.with_logits: 62 | flat_input = torch.sigmoid(flat_input) 63 | 64 | if mask is not None: 65 | mask = mask.view(-1).float() 66 | flat_input = flat_input * mask 67 | flat_target = flat_target * mask 68 | 69 | intersection = torch.sum((1-flat_input)**self.alpha * flat_input * flat_target, -1) + self.smooth 70 | denominator = torch.sum((1-flat_input)**self.alpha * flat_input) + flat_target.sum() + self.smooth 71 | return 1 - 2 * intersection / denominator 72 | 73 | def __str__(self): 74 | return f"Adaptive Dice Loss, smooth:{self.smooth}; alpha:{self.alpha}" 75 | -------------------------------------------------------------------------------- /NER/loss/dice_loss.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch import Tensor 7 | from typing import Optional 8 | 9 | 10 | class DiceLoss(nn.Module): 11 | """ 12 | Dice coefficient for short, is an F1-oriented statistic used to gauge the similarity of two sets. 13 | Given two sets A and B, the vanilla dice coefficient between them is given as follows: 14 | Dice(A, B) = 2 * True_Positive / (2 * True_Positive + False_Positive + False_Negative) 15 | = 2 * |A and B| / (|A| + |B|) 16 | 17 | Math Function: 18 | U-NET: https://arxiv.org/abs/1505.04597.pdf 19 | dice_loss(p, y) = 1 - numerator / denominator 20 | numerator = 2 * \sum_{1}^{t} p_i * y_i + smooth 21 | denominator = \sum_{1}^{t} p_i + \sum_{1} ^{t} y_i + smooth 22 | if square_denominator is True, the denominator is \sum_{1}^{t} (p_i ** 2) + \sum_{1} ^{t} (y_i ** 2) + smooth 23 | V-NET: https://arxiv.org/abs/1606.04797.pdf 24 | Args: 25 | smooth (float, optional): a manual smooth value for numerator and denominator. 26 | square_denominator (bool, optional): [True, False], specifies whether to square the denominator in the loss function. 27 | with_logits (bool, optional): [True, False], specifies whether the input tensor is normalized by Sigmoid/Softmax funcs. 28 | True: the loss combines a `sigmoid` layer and the `BCELoss` in one single class. 29 | False: the loss contains `BCELoss`. 30 | Shape: 31 | - input: (*) 32 | - target: (*) 33 | - mask: (*) 0,1 mask for the input sequence. 
34 | - Output: Scalar loss 35 | Examples: 36 | >>> loss = DiceLoss() 37 | >>> input = torch.randn(3, 1, requires_grad=True) 38 | >>> target = torch.empty(3, dtype=torch.long).random_(5) 39 | >>> output = loss(input, target) 40 | >>> output.backward() 41 | """ 42 | def __init__(self, 43 | smooth: Optional[float] = 1e-8, 44 | square_denominator: Optional[bool] = False, 45 | with_logits: Optional[bool] = True, 46 | reduction: Optional[str] = "mean") -> None: 47 | super(DiceLoss, self).__init__() 48 | 49 | self.reduction = reduction 50 | self.with_logits = with_logits 51 | self.smooth = smooth 52 | self.square_denominator = square_denominator 53 | 54 | def forward(self, 55 | input: Tensor, 56 | target: Tensor, 57 | mask: Optional[Tensor] = None) -> Tensor: 58 | 59 | flat_input = input.view(-1) 60 | flat_target = target.view(-1) 61 | 62 | if self.with_logits: 63 | flat_input = torch.sigmoid(flat_input) 64 | 65 | if mask is not None: 66 | mask = mask.view(-1).float() 67 | flat_input = flat_input * mask 68 | flat_target = flat_target * mask 69 | 70 | interection = torch.sum(flat_input * flat_target, -1) 71 | if not self.square_denominator: 72 | return 1 - ((2 * interection + self.smooth) / 73 | (flat_input.sum() + flat_target.sum() + self.smooth)) 74 | else: 75 | return 1 - ((2 * interection + self.smooth) / 76 | (torch.sum(torch.square(flat_input,), -1) + torch.sum(torch.square(flat_target), -1) + self.smooth)) 77 | 78 | def __str__(self): 79 | return f"Dice Loss smooth:{self.smooth}" 80 | -------------------------------------------------------------------------------- /RE/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Chinese Relation Extraction by biGRU with Character and Sentence Attentions 3 | 4 | ### [中文Blog](http://www.crownpku.com//2017/08/19/%E7%94%A8Bi-GRU%E5%92%8C%E5%AD%97%E5%90%91%E9%87%8F%E5%81%9A%E7%AB%AF%E5%88%B0%E7%AB%AF%E7%9A%84%E4%B8%AD%E6%96%87%E5%85%B3%E7%B3%BB%E6%8A%BD%E5%8F%96.html) 5 | 6 | Bi-directional GRU with Word and Sentence Dual Attentions for End-to End Relation Extraction 7 | 8 | Original Code in https://github.com/thunlp/TensorFlow-NRE, modified for Chinese. 9 | 10 | Original paper [Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification](http://anthology.aclweb.org/P16-2034) and [Neural Relation Extraction with Selective Attention over Instances](http://aclweb.org/anthology/P16-1200) 11 | 12 | ![](http://www.crownpku.com/images/201708/1.jpg) 13 | 14 | ![](http://www.crownpku.com/images/201708/2.jpg) 15 | 16 | 17 | ## Requrements 18 | 19 | * Python (>=3.5) 20 | 21 | * TensorFlow (>=r1.0) 22 | 23 | * scikit-learn (>=0.18) 24 | 25 | 26 | ## Usage 27 | 28 | 29 | ### * Training: 30 | 31 | 1. Prepare data in origin_data/ , including relation types (relation2id.txt), training data (train.txt), testing data (test.txt) and Chinese word vectors (vec.txt). 32 | 33 | ``` 34 | Current sample data includes the following 12 relationships: 35 | unknown, 父母, 夫妻, 师生, 兄弟姐妹, 合作, 情侣, 祖孙, 好友, 亲戚, 同门, 上下级 36 | ``` 37 | 38 | 2. Organize data into npy files, which will be save at data/ 39 | ``` 40 | #python3 initial.py 41 | ``` 42 | 43 | 3. 
Training, models will be save at model/ 44 | ``` 45 | #python3 train_GRU.py 46 | ``` 47 | 48 | 49 | ### * Inference 50 | 51 | **If you have trained a new model, please remember to change the pathname in main_for_evaluation() and main() in test_GRU.py with your own model name.** 52 | 53 | ``` 54 | #python3 test_GRU.py 55 | ``` 56 | 57 | Program will ask for data input in the format of "name1 name2 sentence". 58 | 59 | We have pre-trained model in /model. To test the pre-trained model, simply initialize the data and run test_GRU.py: 60 | 61 | ``` 62 | #python3 initial.py 63 | #python3 test_GRU.py 64 | ``` 65 | 66 | 67 | ## Sample Results 68 | 69 | We make up some sentences and test the performance. The model gives good results, sometimes wrong but reasonable. 70 | 71 | More data is needed for better performance. 72 | 73 | ``` 74 | INFO:tensorflow:Restoring parameters from ./model/ATT_GRU_model-9000 75 | reading word embedding data... 76 | reading relation to id 77 | 78 | 实体1: 李晓华 79 | 实体2: 王大牛 80 | 李晓华和她的丈夫王大牛前日一起去英国旅行了。 81 | 关系是: 82 | No.1: 夫妻, Probability is 0.996217 83 | No.2: 父母, Probability is 0.00193673 84 | No.3: 兄弟姐妹, Probability is 0.00128172 85 | 86 | 实体1: 李晓华 87 | 实体2: 王大牛 88 | 李晓华和她的高中同学王大牛两个人前日一起去英国旅行。 89 | 关系是: 90 | No.1: 好友, Probability is 0.526823 91 | No.2: 兄弟姐妹, Probability is 0.177491 92 | No.3: 夫妻, Probability is 0.132977 93 | 94 | 实体1: 李晓华 95 | 实体2: 王大牛 96 | 王大牛命令李晓华在周末前完成这份代码。 97 | 关系是: 98 | No.1: 上下级, Probability is 0.965674 99 | No.2: 亲戚, Probability is 0.0185355 100 | No.3: 父母, Probability is 0.00953698 101 | 102 | 实体1: 李晓华 103 | 实体2: 王大牛 104 | 王大牛非常疼爱他的孙女李晓华小朋友。 105 | 关系是: 106 | No.1: 祖孙, Probability is 0.785542 107 | No.2: 好友, Probability is 0.0829895 108 | No.3: 同门, Probability is 0.0728216 109 | 110 | 实体1: 李晓华 111 | 实体2: 王大牛 112 | 谈起曾经一起求学的日子,王大牛非常怀念他的师妹李晓华。 113 | 关系是: 114 | No.1: 师生, Probability is 0.735982 115 | No.2: 同门, Probability is 0.159495 116 | No.3: 兄弟姐妹, Probability is 0.0440367 117 | 118 | 实体1: 李晓华 119 | 实体2: 王大牛 120 | 王大牛对于他的学生李晓华做出的成果非常骄傲! 
121 | 关系是: 122 | No.1: 师生, Probability is 0.994964 123 | No.2: 父母, Probability is 0.00460191 124 | No.3: 夫妻, Probability is 0.000108601 125 | 126 | 实体1: 李晓华 127 | 实体2: 王大牛 128 | 王大牛和李晓华是从小一起长大的好哥们 129 | 关系是: 130 | No.1: 兄弟姐妹, Probability is 0.852632 131 | No.2: 亲戚, Probability is 0.0477967 132 | No.3: 好友, Probability is 0.0433101 133 | 134 | 实体1: 李晓华 135 | 实体2: 王大牛 136 | 王大牛的表舅叫李晓华的二妈为大姐 137 | 关系是: 138 | No.1: 亲戚, Probability is 0.766272 139 | No.2: 父母, Probability is 0.162108 140 | No.3: 兄弟姐妹, Probability is 0.0623203 141 | 142 | 实体1: 李晓华 143 | 实体2: 王大牛 144 | 这篇论文是王大牛负责编程,李晓华负责写作的。 145 | 关系是: 146 | No.1: 合作, Probability is 0.907599 147 | No.2: unknown, Probability is 0.082604 148 | No.3: 上下级, Probability is 0.00730342 149 | 150 | 实体1: 李晓华 151 | 实体2: 王大牛 152 | 王大牛和李晓华为谁是论文的第一作者争得头破血流。 153 | 关系是: 154 | No.1: 合作, Probability is 0.819008 155 | No.2: 上下级, Probability is 0.116768 156 | No.3: 师生, Probability is 0.0448312 157 | ``` 158 | 159 | -------------------------------------------------------------------------------- /NER/metrics/functional/query_span_f1.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | 4 | import torch 5 | from tokenizers import BertWordPieceTokenizer 6 | 7 | from utils.bmes_decode import bmes_decode 8 | 9 | bert_path = "E:\data\chinese_roberta_wwm_large_ext_pytorch\\" 10 | json_path = "E:\\data\\nested\\mrc-ner.dev" 11 | is_chinese = True 12 | 13 | vocab_file = os.path.join(bert_path, "vocab.txt") 14 | tokenizer = BertWordPieceTokenizer(vocab_file) 15 | 16 | 17 | def query_span_f1(start_preds, end_preds, match_logits, start_label_mask, end_label_mask, match_labels, flat=False): 18 | """ 19 | Compute span f1 according to query-based model output 20 | Args: 21 | start_preds: [bsz, seq_len] 22 | end_preds: [bsz, seq_len] 23 | match_logits: [bsz, seq_len, seq_len] 24 | start_label_mask: [bsz, seq_len] 25 | end_label_mask: [bsz, seq_len] 26 | match_labels: [bsz, seq_len, seq_len] 27 | flat: if True, decode as flat-ner 28 | Returns: 29 | span-f1 counts, tensor of shape [3]: tp, fp, fn 30 | """ 31 | start_label_mask = start_label_mask.bool() 32 | end_label_mask = end_label_mask.bool() 33 | match_labels = match_labels.bool() 34 | bsz, seq_len = start_label_mask.size() 35 | # [bsz, seq_len, seq_len] 36 | match_preds = match_logits > 0 37 | # [bsz, seq_len] 38 | start_preds = start_preds.bool() 39 | # [bsz, seq_len] 40 | end_preds = end_preds.bool() 41 | 42 | match_preds = (match_preds 43 | & start_preds.unsqueeze(-1).expand(-1, -1, seq_len) 44 | & end_preds.unsqueeze(1).expand(-1, seq_len, -1)) 45 | match_label_mask = (start_label_mask.unsqueeze(-1).expand(-1, -1, seq_len) 46 | & end_label_mask.unsqueeze(1).expand(-1, seq_len, -1)) 47 | match_label_mask = torch.triu(match_label_mask, 0) # start should be less or equal to end 48 | match_preds = match_label_mask & match_preds 49 | 50 | tp = (match_labels & match_preds).long().sum() 51 | fp = (~match_labels & match_preds).long().sum() 52 | fn = (match_labels & ~match_preds).long().sum() 53 | return torch.stack([tp, fp, fn]) 54 | 55 | 56 | def extract_flat_spans(start_pred, end_pred, match_pred, label_mask): 57 | """ 58 | Extract flat-ner spans from start/end/match logits 59 | Args: 60 | start_pred: [seq_len], 1/True for start, 0/False for non-start 61 | end_pred: [seq_len, 2], 1/True for end, 0/False for non-end 62 | match_pred: [seq_len, seq_len], 1/True for match, 0/False for non-match 63 | label_mask: [seq_len], 1 for valid boundary. 
64 | Returns: 65 | tags: list of tuple (start, end) 66 | Examples: 67 | >>> start_pred = [0, 1] 68 | >>> end_pred = [0, 1] 69 | >>> match_pred = [[0, 0], [0, 1]] 70 | >>> label_mask = [1, 1] 71 | >>> extract_flat_spans(start_pred, end_pred, match_pred, label_mask) 72 | [(1, 2)] 73 | """ 74 | pseudo_tag = "TAG" 75 | pseudo_input = "a" 76 | 77 | bmes_labels = ["O"] * len(start_pred) 78 | start_positions = [idx for idx, tmp in enumerate(start_pred) if tmp and label_mask[idx]] 79 | end_positions = [idx for idx, tmp in enumerate(end_pred) if tmp and label_mask[idx]] 80 | 81 | for start_item in start_positions: 82 | bmes_labels[start_item] = f"B-{pseudo_tag}" 83 | for end_item in end_positions: 84 | bmes_labels[end_item] = f"E-{pseudo_tag}" 85 | 86 | for tmp_start in start_positions: 87 | tmp_end = [tmp for tmp in end_positions if tmp >= tmp_start] 88 | if len(tmp_end) == 0: 89 | continue 90 | else: 91 | tmp_end = min(tmp_end) 92 | if match_pred[tmp_start][tmp_end]: 93 | if tmp_start != tmp_end: 94 | for i in range(tmp_start + 1, tmp_end): 95 | bmes_labels[i] = f"M-{pseudo_tag}" 96 | else: 97 | bmes_labels[tmp_end] = f"S-{pseudo_tag}" 98 | 99 | tags = bmes_decode([(pseudo_input, label) for label in bmes_labels]) 100 | 101 | return [(tag.begin, tag.end) for tag in tags] 102 | 103 | 104 | def remove_overlap(spans): 105 | """ 106 | remove overlapped spans greedily for flat-ner 107 | Args: 108 | spans: list of tuple (start, end), which means [start, end] is a ner-span 109 | Returns: 110 | spans without overlap 111 | """ 112 | output = [] 113 | occupied = set() 114 | for start, end in spans: 115 | if any(x for x in range(start, end + 1)) in occupied: 116 | continue 117 | output.append((start, end)) 118 | for x in range(start, end + 1): 119 | occupied.add(x) 120 | return output 121 | -------------------------------------------------------------------------------- /RE/process_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2021/5/7 下午2:08 3 | # @Author : liuliping 4 | # @File : process_data.py.py 5 | # @description: 从原始数据抽取实体-关系 6 | 7 | import json 8 | import os 9 | import glob 10 | import copy 11 | import random 12 | 13 | total_count = {} 14 | 15 | def custom_RE(file_path, output_path): 16 | # 17 | with open(file_path, 'r', encoding='utf-8') as f: 18 | data = [json.loads(line.strip()) for line in f.readlines()] 19 | 20 | result = [] 21 | relation_count = {} 22 | for sub in data: # 行 23 | ner_relation = {} 24 | relation_set = set() 25 | ner_data = sub['sd_result']['items'] 26 | text = sub['text'] 27 | segments = [i for i in range(len(text)) if text[i] in {'。', '?', '!', ';', ':', ','}] 28 | for ner in ner_data: 29 | ner_text = ner['meta']['text'] 30 | start, end = ner['meta']['segment_range'] 31 | ner_label = ner['labels']['Entity'] 32 | relations = ner['labels'].get('Relation', []) 33 | for rel in relations: 34 | relation_set.add(rel) 35 | 36 | ner_relation[f'{ner_text}#{ner_label}#{start}#{end}'] = relations 37 | 38 | for rel in relation_set: 39 | tmp = [] 40 | for ner, rels in ner_relation.items(): 41 | 42 | if rel in rels and (len(tmp) == 0 or tmp[0].split('#')[1] != ner.split('#')[1]): 43 | tmp.append(ner) 44 | 45 | if len(tmp) == 2: 46 | ner_1 = tmp[0].split('#') 47 | ner_2 = tmp[1].split('#') 48 | 49 | index = [int(d) for d in ner_1[2:]] + [int(d) for d in ner_2[2:]] 50 | 51 | min_idx, max_idx = min(index), max(index) 52 | 53 | sub_text = text[:segments[0]] if segments and segments[0] >= max_idx else text[:] 54 | 
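# (added comment) the two loops below locate the punctuation-delimited segment
# boundaries enclosing both entity mentions (min_idx / max_idx), so each
# relation sample keeps only the sub-sentence that spans the two entities.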
55 | min_seg = 0 56 | max_seg = len(text) 57 | for i in range(len(segments) - 1): 58 | if segments[i] <= min_idx <= segments[i + 1]: 59 | min_seg = segments[i] 60 | break 61 | for i in range(len(segments) - 1): 62 | if segments[i] <= max_idx <= segments[i + 1]: 63 | max_seg = segments[i + 1] 64 | break 65 | 66 | if min_seg != 0 or max_seg != len(text): 67 | sub_text = text[min_seg + 1: max_seg] 68 | rel = rel.split('-'[0])[0].strip().replace('?', '') 69 | result.append([t.split('#')[0] for t in tmp] + [rel, sub_text]) 70 | relation_count[rel] = relation_count.setdefault(rel, 0) + 1 71 | total_count[rel] = total_count.setdefault(rel, 0) + 1 72 | break 73 | 74 | with open(os.path.join(output_path, 'custom_RE_{}.txt'.format(os.path.split(file_path)[1].split('.')[0])), 75 | 'w', encoding='utf-8') as fw: 76 | fw.write('\n'.join(['\t'.join(t) for t in result])) 77 | 78 | print(os.path.basename(file_path), relation_count) 79 | 80 | 81 | if __name__ == '__main__': 82 | input_path = '../data/糖尿病标注数据4.28/' # 原始数据path 83 | output_path = 'data/custom_RE' 84 | os.makedirs('data/custom_RE/', exist_ok=True) 85 | 86 | files = glob.glob(f'{input_path}/*.txt') 87 | for fil in files: 88 | custom_RE(fil, output_path) 89 | print('total', total_count) 90 | 91 | relations = {} 92 | files = glob.glob(f'{output_path}/*.txt') 93 | for fil in files: 94 | with open(fil, 'r', encoding='utf-8') as f: 95 | data = f.readlines() 96 | for sub in data: 97 | l = sub.strip().split('\t') 98 | relations.setdefault(l[2], []).append(sub) 99 | # 拆分数据集 train:dev:test=6:2:2 100 | train_data, dev_data, test_data = [], [], [] 101 | for rel, val in relations.items(): 102 | print(rel, len(val)) 103 | length = len(val) 104 | random.shuffle(val) 105 | train_data.extend(val[:length // 10 * 6]) 106 | dev_data.extend(val[length // 10 * 6: length // 10 * 8]) 107 | test_data.extend(val[length // 10 * 8:]) 108 | 109 | with open('data/train.txt', 'w', encoding='utf-8') as fw: 110 | random.shuffle(train_data) 111 | fw.write(''.join(train_data)) 112 | 113 | with open('data/dev.txt', 'w', encoding='utf-8') as fw: 114 | random.shuffle(dev_data) 115 | fw.write(''.join(dev_data)) 116 | 117 | with open('data/test.txt', 'w', encoding='utf-8') as fw: 118 | random.shuffle(test_data) 119 | fw.write(''.join(test_data)) 120 | -------------------------------------------------------------------------------- /RE/train_GRU.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import time 4 | import datetime 5 | import os 6 | import network 7 | # from tensorflow.contrib.tensorboard.plugins import projector 8 | 9 | FLAGS = tf.app.flags.FLAGS 10 | 11 | tf.app.flags.DEFINE_string('summary_dir', '.', 'path to store summary') 12 | 13 | import os 14 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 15 | # os.environ["CUDA_VISIBLE_DEVICES"] = "1" 16 | 17 | def main(_): 18 | # the path to save models 19 | save_path = './model' 20 | if not os.path.isdir(save_path): 21 | os.makedirs(save_path) 22 | 23 | print('reading wordembedding') 24 | wordembedding = np.load('./data/vec.npy', allow_pickle=True) 25 | 26 | print('reading training data') 27 | train_y = np.load('./data/train_y.npy', allow_pickle=True) 28 | train_word = np.load('./data/train_word.npy', allow_pickle=True) 29 | train_pos1 = np.load('./data/train_pos1.npy', allow_pickle=True) 30 | train_pos2 = np.load('./data/train_pos2.npy', allow_pickle=True) 31 | 32 | settings = network.Settings() 33 | settings.vocab_size = len(wordembedding) 34 
| settings.num_classes = len(train_y[0]) 35 | 36 | # big_num = settings.big_num 37 | 38 | with tf.Graph().as_default(): 39 | 40 | sess = tf.Session() 41 | with sess.as_default(): 42 | 43 | initializer = tf.contrib.layers.xavier_initializer() 44 | with tf.variable_scope("model", reuse=None, initializer=initializer): 45 | m = network.GRU(is_training=True, word_embeddings=wordembedding, settings=settings) 46 | global_step = tf.Variable(0, name="global_step", trainable=False) 47 | optimizer = tf.train.AdamOptimizer(0.0005) 48 | 49 | train_op = optimizer.minimize(m.final_loss, global_step=global_step) 50 | sess.run(tf.global_variables_initializer()) 51 | saver = tf.train.Saver(max_to_keep=None) 52 | 53 | merged_summary = tf.summary.merge_all() 54 | summary_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train_loss', sess.graph) 55 | 56 | def train_step(word_batch, pos1_batch, pos2_batch, y_batch, big_num): 57 | 58 | feed_dict = {} 59 | total_shape = [] 60 | total_num = 0 61 | total_word = [] 62 | total_pos1 = [] 63 | total_pos2 = [] 64 | for i in range(len(word_batch)): 65 | total_shape.append(total_num) 66 | total_num += len(word_batch[i]) 67 | for word in word_batch[i]: 68 | total_word.append(word) 69 | for pos1 in pos1_batch[i]: 70 | total_pos1.append(pos1) 71 | for pos2 in pos2_batch[i]: 72 | total_pos2.append(pos2) 73 | total_shape.append(total_num) 74 | total_shape = np.array(total_shape) 75 | total_word = np.array(total_word) 76 | total_pos1 = np.array(total_pos1) 77 | total_pos2 = np.array(total_pos2) 78 | 79 | feed_dict[m.total_shape] = total_shape 80 | feed_dict[m.input_word] = total_word 81 | feed_dict[m.input_pos1] = total_pos1 82 | feed_dict[m.input_pos2] = total_pos2 83 | feed_dict[m.input_y] = y_batch 84 | 85 | temp, step, loss, accuracy, summary, l2_loss, final_loss = sess.run( 86 | [train_op, global_step, m.total_loss, m.accuracy, merged_summary, m.l2_loss, m.final_loss], 87 | feed_dict) 88 | time_str = datetime.datetime.now().isoformat() 89 | accuracy = np.reshape(np.array(accuracy), (big_num)) 90 | acc = np.mean(accuracy) 91 | summary_writer.add_summary(summary, step) 92 | 93 | if step % 50 == 0: 94 | tempstr = "{}: step {}, softmax_loss {:g}, acc {:g}".format(time_str, step, loss, acc) 95 | print(tempstr) 96 | 97 | for one_epoch in range(settings.num_epochs): 98 | 99 | temp_order = list(range(len(train_word))) 100 | np.random.shuffle(temp_order) 101 | for i in range(int(len(temp_order) / float(settings.big_num))): 102 | 103 | temp_word = [] 104 | temp_pos1 = [] 105 | temp_pos2 = [] 106 | temp_y = [] 107 | 108 | temp_input = temp_order[i * settings.big_num:(i + 1) * settings.big_num] 109 | for k in temp_input: 110 | temp_word.append(train_word[k]) 111 | temp_pos1.append(train_pos1[k]) 112 | temp_pos2.append(train_pos2[k]) 113 | temp_y.append(train_y[k]) 114 | num = 0 115 | for single_word in temp_word: 116 | num += len(single_word) 117 | 118 | if num > 1500: 119 | print('out of range') 120 | continue 121 | 122 | temp_word = np.array(temp_word) 123 | temp_pos1 = np.array(temp_pos1) 124 | temp_pos2 = np.array(temp_pos2) 125 | temp_y = np.array(temp_y) 126 | 127 | train_step(temp_word, temp_pos1, temp_pos2, temp_y, settings.big_num) 128 | 129 | current_step = tf.train.global_step(sess, global_step) 130 | if current_step > 8000 and current_step % 100 == 0: 131 | print('saving model') 132 | path = saver.save(sess, save_path + 'ATT_GRU_model', global_step=current_step) 133 | tempstr = 'have saved model to ' + path 134 | print(tempstr) 135 | 136 | 137 | if __name__ == 
"__main__": 138 | tf.app.run() 139 | -------------------------------------------------------------------------------- /RE/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class Settings(object): 6 | def __init__(self): 7 | self.vocab_size = 16691 8 | self.num_steps = 70 9 | self.num_epochs = 200 10 | self.num_classes = 16 11 | self.gru_size = 230 12 | self.keep_prob = 0.5 13 | self.num_layers = 1 14 | self.pos_size = 5 15 | self.pos_num = 123 16 | # the number of entity pairs of each batch during training or testing 17 | self.big_num = 50 18 | 19 | 20 | class GRU: 21 | def __init__(self, is_training, word_embeddings, settings): 22 | 23 | self.num_steps = num_steps = settings.num_steps 24 | self.vocab_size = vocab_size = settings.vocab_size 25 | self.num_classes = num_classes = settings.num_classes 26 | self.gru_size = gru_size = settings.gru_size 27 | self.big_num = big_num = settings.big_num 28 | 29 | self.input_word = tf.placeholder(dtype=tf.int32, shape=[None, num_steps], name='input_word') 30 | self.input_pos1 = tf.placeholder(dtype=tf.int32, shape=[None, num_steps], name='input_pos1') 31 | self.input_pos2 = tf.placeholder(dtype=tf.int32, shape=[None, num_steps], name='input_pos2') 32 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y') 33 | self.total_shape = tf.placeholder(dtype=tf.int32, shape=[big_num + 1], name='total_shape') 34 | total_num = self.total_shape[-1] 35 | 36 | word_embedding = tf.get_variable(initializer=word_embeddings, name='word_embedding') 37 | pos1_embedding = tf.get_variable('pos1_embedding', [settings.pos_num, settings.pos_size]) 38 | pos2_embedding = tf.get_variable('pos2_embedding', [settings.pos_num, settings.pos_size]) 39 | 40 | attention_w = tf.get_variable('attention_omega', [gru_size, 1]) 41 | sen_a = tf.get_variable('attention_A', [gru_size]) 42 | sen_r = tf.get_variable('query_r', [gru_size, 1]) 43 | relation_embedding = tf.get_variable('relation_embedding', [self.num_classes, gru_size]) 44 | sen_d = tf.get_variable('bias_d', [self.num_classes]) 45 | 46 | gru_cell_forward = tf.contrib.rnn.GRUCell(gru_size) 47 | gru_cell_backward = tf.contrib.rnn.GRUCell(gru_size) 48 | 49 | if is_training and settings.keep_prob < 1: 50 | gru_cell_forward = tf.contrib.rnn.DropoutWrapper(gru_cell_forward, output_keep_prob=settings.keep_prob) 51 | gru_cell_backward = tf.contrib.rnn.DropoutWrapper(gru_cell_backward, output_keep_prob=settings.keep_prob) 52 | 53 | cell_forward = tf.contrib.rnn.MultiRNNCell([gru_cell_forward] * settings.num_layers) 54 | cell_backward = tf.contrib.rnn.MultiRNNCell([gru_cell_backward] * settings.num_layers) 55 | 56 | sen_repre = [] 57 | sen_alpha = [] 58 | sen_s = [] 59 | sen_out = [] 60 | self.prob = [] 61 | self.predictions = [] 62 | self.loss = [] 63 | self.accuracy = [] 64 | self.total_loss = 0.0 65 | 66 | self._initial_state_forward = cell_forward.zero_state(total_num, tf.float32) 67 | self._initial_state_backward = cell_backward.zero_state(total_num, tf.float32) 68 | 69 | # embedding layer 70 | inputs_forward = tf.concat(axis=2, values=[tf.nn.embedding_lookup(word_embedding, self.input_word), 71 | tf.nn.embedding_lookup(pos1_embedding, self.input_pos1), 72 | tf.nn.embedding_lookup(pos2_embedding, self.input_pos2)]) 73 | inputs_backward = tf.concat(axis=2, 74 | values=[tf.nn.embedding_lookup(word_embedding, tf.reverse(self.input_word, [1])), 75 | tf.nn.embedding_lookup(pos1_embedding, 
tf.reverse(self.input_pos1, [1])), 76 | tf.nn.embedding_lookup(pos2_embedding, 77 | tf.reverse(self.input_pos2, [1]))]) 78 | 79 | outputs_forward = [] 80 | 81 | state_forward = self._initial_state_forward 82 | 83 | # Bi-GRU layer 84 | with tf.variable_scope('GRU_FORWARD') as scope: 85 | for step in range(num_steps): 86 | if step > 0: 87 | scope.reuse_variables() 88 | (cell_output_forward, state_forward) = cell_forward(inputs_forward[:, step, :], state_forward) 89 | outputs_forward.append(cell_output_forward) 90 | 91 | outputs_backward = [] 92 | 93 | state_backward = self._initial_state_backward 94 | with tf.variable_scope('GRU_BACKWARD') as scope: 95 | for step in range(num_steps): 96 | if step > 0: 97 | scope.reuse_variables() 98 | (cell_output_backward, state_backward) = cell_backward(inputs_backward[:, step, :], state_backward) 99 | outputs_backward.append(cell_output_backward) 100 | 101 | output_forward = tf.reshape(tf.concat(axis=1, values=outputs_forward), [total_num, num_steps, gru_size]) 102 | output_backward = tf.reverse( 103 | tf.reshape(tf.concat(axis=1, values=outputs_backward), [total_num, num_steps, gru_size]), 104 | [1]) 105 | 106 | # word-level attention layer 107 | output_h = tf.add(output_forward, output_backward) 108 | attention_r = tf.reshape(tf.matmul(tf.reshape(tf.nn.softmax( 109 | tf.reshape(tf.matmul(tf.reshape(tf.tanh(output_h), [total_num * num_steps, gru_size]), attention_w), 110 | [total_num, num_steps])), [total_num, 1, num_steps]), output_h), [total_num, gru_size]) 111 | 112 | # sentence-level attention layer 113 | for i in range(big_num): 114 | 115 | sen_repre.append(tf.tanh(attention_r[self.total_shape[i]:self.total_shape[i + 1]])) 116 | batch_size = self.total_shape[i + 1] - self.total_shape[i] 117 | 118 | sen_alpha.append( 119 | tf.reshape(tf.nn.softmax(tf.reshape(tf.matmul(tf.multiply(sen_repre[i], sen_a), sen_r), [batch_size])), 120 | [1, batch_size])) 121 | 122 | sen_s.append(tf.reshape(tf.matmul(sen_alpha[i], sen_repre[i]), [gru_size, 1])) 123 | sen_out.append(tf.add(tf.reshape(tf.matmul(relation_embedding, sen_s[i]), [self.num_classes]), sen_d)) 124 | 125 | self.prob.append(tf.nn.softmax(sen_out[i])) 126 | 127 | with tf.name_scope("output"): 128 | self.predictions.append(tf.argmax(self.prob[i], 0, name="predictions")) 129 | 130 | with tf.name_scope("loss"): 131 | self.loss.append( 132 | tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=sen_out[i], labels=self.input_y[i]))) 133 | if i == 0: 134 | self.total_loss = self.loss[i] 135 | else: 136 | self.total_loss += self.loss[i] 137 | 138 | # tf.summary.scalar('loss',self.total_loss) 139 | # tf.scalar_summary(['loss'],[self.total_loss]) 140 | with tf.name_scope("accuracy"): 141 | self.accuracy.append( 142 | tf.reduce_mean(tf.cast(tf.equal(self.predictions[i], tf.argmax(self.input_y[i], 0)), "float"), 143 | name="accuracy")) 144 | 145 | # tf.summary.scalar('loss',self.total_loss) 146 | tf.summary.scalar('loss', self.total_loss) 147 | # regularization 148 | self.l2_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(0.0001), 149 | weights_list=tf.trainable_variables()) 150 | self.final_loss = self.total_loss + self.l2_loss 151 | tf.summary.scalar('l2_loss', self.l2_loss) 152 | tf.summary.scalar('final_loss', self.final_loss) 153 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # DiaKG: an Annotated Diabetes Dataset for 
Medical Knowledge Graph Construction 3 | 4 | This is the source code of the DiaKG [paper](https://arxiv.org/abs/2105.15033). 5 | 6 | ## DataSet 7 | 8 | ### Overview 9 | The DiaKG dataset is derived from 41 diabetes guidelines and consensus statements drawn from authoritative Chinese journals, covering basic research, clinical research, drug usage, clinical cases, diagnosis and treatment methods, etc. The dataset covers a wide range of recent research content and hotspots. The annotation was carried out by 2 seasoned endocrinologists and 6 M.D. candidates, and finally produced a high-quality diabetes database containing 22,050 entities and 6,890 relations in total. 10 | 11 | ### Get the Data 12 | The codebase only provides some sample annotation files. If you want to download the full set, please apply at the [Tianchi Platform](https://tianchi.aliyun.com/dataset/dataDetail?dataId=88836). 13 | 14 | ### Data Format 15 | The dataset is organized as a hierarchical "document-paragraph-sentence" structure. All entities and relations are attached at the sentence level. Below is an example: 16 | 17 | ``` 18 | { 19 | "doc_id": "1", // string, document id 20 | "paragraphs": [ // array, paragraphs 21 | { 22 | "paragraph_id": "0", // string, paragraph id 23 | "paragraph": "中国成人2型糖尿病胰岛素促泌剂应用的专家共识", // string, paragraph text 24 | "sentences": [ // array, sentences 25 | { 26 | "sentence_id": "0", // string, sentence id 27 | "sentence": "中国成人2型糖尿病胰岛素促泌剂应用的专家共识", // string, sentence text 28 | "start_idx": 0, // int, sentence start index in the current paragraph 29 | "end_idx": 22, // int, sentence end index in the current paragraph 30 | "entities": [ // array, entities in the current sentence 31 | { 32 | "entity_id": "T0", // string, entity id 33 | "entity": "2型糖尿病", // string, entity text 34 | "entity_type": "Disease", // string, entity type 35 | "start_idx": 4, // int, entity start index in the sentence 36 | "end_idx": 9 // int, entity end index in the sentence 37 | }, 38 | { 39 | "entity_id": "T1", 40 | "entity": "2型", 41 | "entity_type": "Class", 42 | "start_idx": 4, 43 | "end_idx": 6 44 | }, 45 | { 46 | "entity_id": "T2", 47 | "entity": "胰岛素促泌剂", 48 | "entity_type": "Drug", 49 | "start_idx": 9, 50 | "end_idx": 15 51 | } 52 | ], 53 | "relations": [ // array, relations in the current sentence 54 | { 55 | "relation_type": "Drug_Disease", // string, relation type 56 | "relation_id": "R0", // string, relation id 57 | "head_entity_id": "T2", // string, head entity id 58 | "tail_entity_id": "T0" // string, tail entity id 59 | }, 60 | { 61 | "relation_type": "Class_Disease", 62 | "relation_id": "R1", 63 | "head_entity_id": "T1", 64 | "tail_entity_id": "T0" 65 | } 66 | ] 67 | } 68 | ] 69 | }, 70 | { 71 | "paragraph_id": "1", // string, paragraph id 72 | "paragraph": "xxx", // string, paragraph text 73 | "sentences": [ 74 | ... 75 | ] 76 | }, 77 | ... 
78 | ] 79 | } 80 | ``` 81 | 82 | ### Data Statistic 83 | 84 | #### Entity 85 | 86 | 87 | |Entity | Freq | Fraction(%) | Avg Length |Entity | Freq | Fraction(%) | Avg Length | 88 | |-----|--------|--------|---------|-------|-----------|----------|----------| 89 | |Disease |5743 |26.05% |7.27 |Frequency |156 |0.71% |4.71 90 | |Class |1262 |5.72% |4.27 |Method |399 |1.81% |6.09 91 | |Reason |175 |0.79% |7.34 |Treatment |756 |3.43% |7.97 92 | |Pathogenesis |202 |0.92% |10.27|Operation |133 |0.60% |9.02 93 | |Symptom |479 |2.17% |5.82 |ADE |874 |3.96% |5.06 94 | |Test |489 |2.22% |6.1 |Anatomy |1876 |8.51% |3.1 95 | |Test_items |2718 |12.33% |7.65 |Level |280 |1.27% |2.93 96 | |Test_Value |1356 |6.15% |9.49 |Duration |69 |0.31% |3.68 97 | |Drug |4782 |21.69% |7.79 |Amount |301 |1.37% |6.74 98 | |Total |22050|100% |6.5 99 | 100 | 101 | #### Relation 102 | 103 | |Relation|Freq |Fraction(%)|Avg Cross-sentence Number |Relation|Freq |Fraction(%)|Avg Cross-sentence Number | 104 | |-----------|------|-------|---------|--------|----------|------|-------| 105 | |Test_items_Disease |1171 |17%  |2.3 |Class_Disease |854 |12.39% |2.13 | 106 | |Anatomy_Disease |1072 |15.56% |2.07 |Reason_Disease |164 |2.38% |2.42 | 107 | |Drug_Disease |1315 |19.09% |2.5 |Duration_Drug |61 |0.89% |2.79 | 108 | |Method_Drug |185 |2.69% |2.41 |Symptom_Disease |283 |4.11% |2.08 | 109 | |Treatment_Disease |354 |5.14% |2.6 |Amount_Drug |195 |2.83% |2.62 | 110 | |Pathogenesis_Disease|130 |1.89% |1.97 |ADE_Drug |693 |10.06% |2.65 | 111 | |Test_Disease |271 |3.93% |2.27 |Frequency_Drug |103 |1.49% |1.97 | 112 | |Operation_Disese |37 |0.54% |2.57 113 | |Total |6890 |100% |2.33 | 114 | 115 | 116 | * Note: **Avg Cross-sentence Number** means the average sentences that the two entities that compose a relation locate, since the annotation is conducted on document level and cross-sentence relation is allowed. 117 | 118 | ## Experiments 119 | 120 | ### NER 121 | 122 | We use [MRC-BERT](https://github.com/changdejie/diaKG-code/tree/mrcforner) as our baseline model, and the source code is in the **NER** directory. 123 | 124 | #### How to run 125 | ``` 126 | cd NER 127 | 128 | ## Training: 129 | python trainer.py --data_dir entity_type_data --bert_config models/chinese_roberta_wwm_large_ext_pytorch --batch_size 16 --max_epochs 10 --gpus 1 130 | 131 | ## Inference: 132 | python evaluate.py 133 | 134 | ``` 135 | 136 | #### Results 137 | 138 | |Entity |precision|recall |F1 |Entity |precision|recall |F1 | 139 | |-------------|---------|---------|-------|-------------|---------|---------|-------| 140 | |Frequency |1.0 |0.9 |0.947 |ADE | 0.791 | 0.815 | 0.803 | 141 | |Method | 0.895 | 0.927 | 0.911 |Duration | 0.833 | 0.714 | 0.769 | 142 | |Class | 0.852 | 0.949 | 0.898 |Amount | 0.73 | 0.75 | 0.74 | 143 | |Drug | 0.881 | 0.902 | 0.892 |Operation | 0.75 | 0.714 | 0.732 | 144 | |Level | 0.841 | 0.902 | 0.871 |Treatment | 0.679 | 0.783 | 0.727 | 145 | |Anatomy | 0.834 | 0.869 | 0.851 |Test | 0.855 | 0.609 | 0.711 | 146 | |Disease | 0.794 | 0.91 | 0.848 |Pathogenesis | 0.595 | 0.667 | 0.629 | 147 | |Test\_Items | 0.823 | 0.815 | 0.818 |Symptom | 0.535 | 0.535 | 0.535 | 148 | |Test\_Value | 0.828 | 0.787 | 0.807 |Reason | 0.333 | 0.3 | 0.316 | 149 | |total |0.814 |0.853 |0.833 | 150 | 151 | 152 | ### RE 153 | 154 | We use [Bi-directional GRU-Attention](https://github.com/crownpku/Information-Extraction-Chinese) as our baseline model, and the source code is in the **RE** directory. 
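Before training, the annotated JSON described in the *Data Format* section is flattened into plain-text relation instances: judging from `RE/initial.py`, every line of `data/train.txt` / `data/test.txt` carries four tab-separated fields, `entity_1`, `entity_2`, `relation_type`, and `sentence`. The snippet below is a minimal, illustrative sketch (not part of the released scripts) of how one annotated sentence maps to that format; putting the head entity in the first column is an assumption of this sketch.

```
# Illustrative sketch: flatten one annotated sentence (format shown in "Data Format")
# into the tab-separated lines consumed by RE/initial.py:
#   entity_1 \t entity_2 \t relation_type \t sentence
sentence = {
    "sentence": "中国成人2型糖尿病胰岛素促泌剂应用的专家共识",
    "entities": [
        {"entity_id": "T0", "entity": "2型糖尿病"},
        {"entity_id": "T2", "entity": "胰岛素促泌剂"},
    ],
    "relations": [
        {"relation_type": "Drug_Disease", "head_entity_id": "T2", "tail_entity_id": "T0"},
    ],
}

id2entity = {e["entity_id"]: e["entity"] for e in sentence["entities"]}
for rel in sentence["relations"]:
    head = id2entity[rel["head_entity_id"]]  # assumed to fill the first column
    tail = id2entity[rel["tail_entity_id"]]
    print("\t".join([head, tail, rel["relation_type"], sentence["sentence"]]))
# -> 胰岛素促泌剂	2型糖尿病	Drug_Disease	中国成人2型糖尿病胰岛素促泌剂应用的专家共识
```

`initial.py` then reads these lines, locates each entity's character offset in the sentence, and builds the word-id and relative-position arrays that are saved as the `.npy` files loaded by `train_GRU.py`.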
155 | 156 | #### How to run 157 | 158 | Details in folder [RE/README.md](https://github.com/changdejie/diaKG-code/edit/main/RE/README.md) 159 | 160 | 161 | #### Results 162 | |Relation |precision |recall |F1 |Relation |precision |recall |F1 | 163 | |-------------|---------|---------|-------|-------------|---------|---------|-------| 164 | Class\_Disease | 0.968 | 0.874 | 0.918 |Duration\_Drug | 0.833 | 0.769 | 0.8 | 165 | ADE\_Drug | 0.892 | 0.892 | 0.892 |Frequency\_Drug | 0.750 | 0.783 | 0.766 | 166 | Drug\_Disease | 0.864 | 0.913 | 0.888 |Symptom\_Disease | 0.689 | 0.712 | 0.7 | 167 | Anatomy\_Disease | 0.869 | 0.864 | 0.867 |Reason\_Disease | 0.769 | 0.571 | 0.656 | 168 | Method\_Drug | 0.833 | 0.854 | 0.843 |Test\_Disease | 0.648 | 0.636 | 0.642 | 169 | Test\_Items\_Disease | 0.833 | 0.833 | 0.833 |Pathogenesis\_Disease | 0.486 | 0.692 | 0.571 | 170 | Treatment\_Disease | 0.771 | 0.877 | 0.821 |Operation\_Disese | 0.6 | 0.231 | 0.333 | 171 | Amount\_Drug | 0.850 | 0.791 | 0.819 | 172 | total |0.839 |0.837 |0.836 | 173 | 174 | 175 | ## Citation 176 | 177 | If you use DiaKG in your research, please cite our [paper](https://arxiv.org/abs/2105.15033): 178 | ``` 179 | @article{chang2021diakg, 180 | title={DiaKG: an Annotated Diabetes Dataset for Medical Knowledge Graph Construction}, 181 | author={Dejie Chang and Mosha Chen and Chaozhen Liu and Liping Liu and Dongdong Li and Wei Li and Fei Kong and Bangchang Liu and Xiaobin Luo and Ji Qi and Qiao Jin and Bin Xu}, 182 | journal={arXiv preprint arXiv:2105.15033}, 183 | year={2021} 184 | } 185 | ``` 186 | -------------------------------------------------------------------------------- /NER/datasets/mrc_ner_dataset.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import json 5 | import torch 6 | from tokenizers import BertWordPieceTokenizer 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class MRCNERDataset(Dataset): 11 | """ 12 | MRC NER Dataset 13 | Args: 14 | json_path: path to mrc-ner style json 15 | tokenizer: BertTokenizer 16 | max_length: int, max length of query+context 17 | possible_only: if True, only use possible samples that contain answer for the query/context 18 | is_chinese: is chinese dataset 19 | """ 20 | 21 | def __init__(self, json_path, tokenizer: BertWordPieceTokenizer, max_length: int = 128, possible_only=False, 22 | is_chinese=False, pad_to_maxlen=False): 23 | self.all_data = json.load(open(json_path, encoding="utf-8")) 24 | self.tokenzier = tokenizer 25 | self.max_length = max_length 26 | self.possible_only = possible_only 27 | if self.possible_only: 28 | self.all_data = [ 29 | x for x in self.all_data if x["start_position"] 30 | ] 31 | self.is_chinese = is_chinese 32 | self.pad_to_maxlen = pad_to_maxlen 33 | 34 | def __len__(self): 35 | return len(self.all_data) 36 | 37 | def __getitem__(self, item): 38 | """ 39 | Args: 40 | item: int, idx 41 | Returns: 42 | tokens: tokens of query + context, [seq_len] 43 | token_type_ids: token type ids, 0 for query, 1 for context, [seq_len] 44 | start_labels: start labels of NER in tokens, [seq_len] 45 | end_labels: end labelsof NER in tokens, [seq_len] 46 | label_mask: label mask, 1 for counting into loss, 0 for ignoring. 
[seq_len] 47 | match_labels: match labels, [seq_len, seq_len] 48 | sample_idx: sample id 49 | label_idx: label id 50 | 51 | """ 52 | data = self.all_data[item] 53 | tokenizer = self.tokenzier 54 | 55 | qas_id = data.get("qas_id", "0.0") 56 | sample_idx, label_idx = qas_id.split(".") 57 | sample_idx = torch.LongTensor([int(sample_idx)]) 58 | label_idx = torch.LongTensor([int(label_idx)]) 59 | 60 | query = data["query"] 61 | context = data["context"] 62 | start_positions = data["start_position"] 63 | end_positions = data["end_position"] 64 | if self.is_chinese: 65 | context = "".join(context.split()) 66 | end_positions = [x + 1 for x in end_positions] 67 | else: 68 | # add space offsets 69 | words = context.split() 70 | start_positions = [x + sum([len(w) for w in words[:x]]) for x in start_positions] 71 | end_positions = [x + sum([len(w) for w in words[:x + 1]]) for x in end_positions] 72 | 73 | query_context_tokens = tokenizer.encode(query, context, add_special_tokens=True) 74 | tokens = query_context_tokens.ids 75 | type_ids = query_context_tokens.type_ids 76 | offsets = query_context_tokens.offsets 77 | 78 | # find new start_positions/end_positions, considering 79 | # 1. we add query tokens at the beginning 80 | # 2. word-piece tokenize 81 | origin_offset2token_idx_start = {} 82 | origin_offset2token_idx_end = {} 83 | for token_idx in range(len(tokens)): 84 | # skip query tokens 85 | if type_ids[token_idx] == 0: 86 | continue 87 | token_start, token_end = offsets[token_idx] 88 | # skip [CLS] or [SEP] 89 | if token_start == token_end == 0: 90 | continue 91 | origin_offset2token_idx_start[token_start] = token_idx 92 | origin_offset2token_idx_end[token_end] = token_idx 93 | new_start_positions = [origin_offset2token_idx_start[start] for start in start_positions] 94 | new_end_positions = [origin_offset2token_idx_end[end] for end in end_positions] 95 | 96 | label_mask = [ 97 | (0 if type_ids[token_idx] == 0 or offsets[token_idx] == (0, 0) else 1) 98 | for token_idx in range(len(tokens)) 99 | ] 100 | start_label_mask = label_mask.copy() 101 | end_label_mask = label_mask.copy() 102 | 103 | # the start/end position must be whole word 104 | if not self.is_chinese: 105 | for token_idx in range(len(tokens)): 106 | current_word_idx = query_context_tokens.words[token_idx] 107 | next_word_idx = query_context_tokens.words[token_idx + 1] if token_idx + 1 < len(tokens) else None 108 | prev_word_idx = query_context_tokens.words[token_idx - 1] if token_idx - 1 > 0 else None 109 | if prev_word_idx is not None and current_word_idx == prev_word_idx: 110 | start_label_mask[token_idx] = 0 111 | if next_word_idx is not None and current_word_idx == next_word_idx: 112 | end_label_mask[token_idx] = 0 113 | 114 | assert all(start_label_mask[p] != 0 for p in new_start_positions) 115 | assert all(end_label_mask[p] != 0 for p in new_end_positions) 116 | 117 | assert len(new_start_positions) == len(new_end_positions) == len(start_positions) 118 | assert len(label_mask) == len(tokens) 119 | start_labels = [(1 if idx in new_start_positions else 0) 120 | for idx in range(len(tokens))] 121 | end_labels = [(1 if idx in new_end_positions else 0) 122 | for idx in range(len(tokens))] 123 | 124 | # truncate 125 | tokens = tokens[: self.max_length] 126 | type_ids = type_ids[: self.max_length] 127 | start_labels = start_labels[: self.max_length] 128 | end_labels = end_labels[: self.max_length] 129 | start_label_mask = start_label_mask[: self.max_length] 130 | end_label_mask = end_label_mask[: self.max_length] 131 | 132 | # make 
sure last token is [SEP] 133 | sep_token = tokenizer.token_to_id("[SEP]") 134 | if tokens[-1] != sep_token: 135 | assert len(tokens) == self.max_length 136 | tokens = tokens[: -1] + [sep_token] 137 | start_labels[-1] = 0 138 | end_labels[-1] = 0 139 | start_label_mask[-1] = 0 140 | end_label_mask[-1] = 0 141 | 142 | if self.pad_to_maxlen: 143 | tokens = self.pad(tokens, 0) 144 | type_ids = self.pad(type_ids, 1) 145 | start_labels = self.pad(start_labels) 146 | end_labels = self.pad(end_labels) 147 | start_label_mask = self.pad(start_label_mask) 148 | end_label_mask = self.pad(end_label_mask) 149 | 150 | seq_len = len(tokens) 151 | match_labels = torch.zeros([seq_len, seq_len], dtype=torch.long) 152 | for start, end in zip(new_start_positions, new_end_positions): 153 | if start >= seq_len or end >= seq_len: 154 | continue 155 | match_labels[start, end] = 1 156 | 157 | return [ 158 | torch.LongTensor(tokens), 159 | torch.LongTensor(type_ids), 160 | torch.LongTensor(start_labels), 161 | torch.LongTensor(end_labels), 162 | torch.LongTensor(start_label_mask), 163 | torch.LongTensor(end_label_mask), 164 | match_labels, 165 | sample_idx, 166 | label_idx 167 | ] 168 | 169 | def pad(self, lst, value=0, max_length=None): 170 | max_length = max_length or self.max_length 171 | while len(lst) < max_length: 172 | lst.append(value) 173 | return lst 174 | import numpy as np 175 | 176 | def run_dataset(): 177 | """test dataset""" 178 | import os 179 | from datasets.collate_functions import collate_to_max_length 180 | from torch.utils.data import DataLoader 181 | # zh datasets 182 | # bert_path = "/mnt/mrc/chinese_L-12_H-768_A-12" 183 | # json_path = "/mnt/mrc/zh_msra/mrc-ner.test" 184 | # # json_path = "/mnt/mrc/zh_onto4/mrc-ner.train" 185 | # is_chinese = True 186 | 187 | # en datasets 188 | bert_path = "E:\data\chinese_roberta_wwm_large_ext_pytorch\\" 189 | json_path = "E:\\data\\nested\\mrc-ner.evl" 190 | # json_path = "/mnt/mrc/genia/mrc-ner.train" 191 | is_chinese = True 192 | 193 | vocab_file = os.path.join(bert_path, "vocab.txt") 194 | tokenizer = BertWordPieceTokenizer(vocab_file) 195 | dataset = MRCNERDataset(json_path=json_path, tokenizer=tokenizer, 196 | is_chinese=is_chinese) 197 | 198 | dataloader = DataLoader(dataset, batch_size=32, 199 | collate_fn=collate_to_max_length) 200 | for batch in dataloader: 201 | for tokens, token_type_ids, start_labels, end_labels, start_label_mask, end_label_mask, match_labels, sample_idx, label_idx in zip( 202 | *batch): 203 | tokens = tokens.tolist() 204 | start_positions, end_positions = torch.where(match_labels > 0) 205 | start_positions = start_positions.tolist() 206 | end_positions = end_positions.tolist() 207 | if not start_positions: 208 | continue 209 | 210 | 211 | print("=" * 20) 212 | print(f"len: {len(tokens)}", tokenizer.decode(tokens, skip_special_tokens=False)) 213 | for start, end in zip(start_positions, end_positions): 214 | print(sample_idx) 215 | print(label_idx) 216 | 217 | print(str(sample_idx.item())+"---", str(label_idx.item()) + "---" + tokenizer.decode(tokens[start: end + 1])) 218 | 219 | 220 | 221 | if __name__ == '__main__': 222 | run_dataset() 223 | -------------------------------------------------------------------------------- /NER/datasets/compute_acc.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import json 5 | import torch 6 | from tokenizers import BertWordPieceTokenizer 7 | from torch.utils.data import Dataset 8 | import trainer 9 | from tqdm import tqdm 
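# NOTE: this script re-declares MRCNERDataset (see datasets/mrc_ner_dataset.py) and runs a
# trained checkpoint over the evaluation set one sample at a time (batch_size=1), accumulating
# tp/fp/fn per entity_label so that precision, recall and F1 can be reported for each entity
# type (see compute_result_by_query below).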
10 | 11 | class MRCNERDataset(Dataset): 12 | """ 13 | MRC NER Dataset 14 | Args: 15 | json_path: path to mrc-ner style json 16 | tokenizer: BertTokenizer 17 | max_length: int, max length of query+context 18 | possible_only: if True, only use possible samples that contain answer for the query/context 19 | is_chinese: is chinese dataset 20 | """ 21 | 22 | def __init__(self, json_path, tokenizer: BertWordPieceTokenizer, max_length: int = 128, possible_only=False, 23 | is_chinese=False, pad_to_maxlen=False): 24 | self.all_data = json.load(open(json_path, encoding="utf-8")) 25 | self.tokenzier = tokenizer 26 | self.max_length = max_length 27 | self.possible_only = possible_only 28 | if self.possible_only: 29 | self.all_data = [ 30 | x for x in self.all_data if x["start_position"] 31 | ] 32 | self.is_chinese = is_chinese 33 | self.pad_to_maxlen = pad_to_maxlen 34 | 35 | def __len__(self): 36 | return len(self.all_data) 37 | 38 | def __getitem__(self, item): 39 | """ 40 | Args: 41 | item: int, idx 42 | Returns: 43 | tokens: tokens of query + context, [seq_len] 44 | token_type_ids: token type ids, 0 for query, 1 for context, [seq_len] 45 | start_labels: start labels of NER in tokens, [seq_len] 46 | end_labels: end labelsof NER in tokens, [seq_len] 47 | label_mask: label mask, 1 for counting into loss, 0 for ignoring. [seq_len] 48 | match_labels: match labels, [seq_len, seq_len] 49 | sample_idx: sample id 50 | label_idx: label id 51 | 52 | """ 53 | data = self.all_data[item] 54 | tokenizer = self.tokenzier 55 | 56 | qas_id = data.get("qas_id", "0.0") 57 | sample_idx, label_idx = qas_id.split(".") 58 | sample_idx = torch.LongTensor([int(sample_idx)]) 59 | label_idx = torch.LongTensor([int(label_idx)]) 60 | 61 | query = data["query"] 62 | context = data["context"] 63 | start_positions = data["start_position"] 64 | end_positions = data["end_position"] 65 | 66 | if self.is_chinese: 67 | context = "".join(context.split()) 68 | end_positions = [x + 1 for x in end_positions] 69 | else: 70 | # add space offsets 71 | words = context.split() 72 | start_positions = [x + sum([len(w) for w in words[:x]]) for x in start_positions] 73 | end_positions = [x + sum([len(w) for w in words[:x + 1]]) for x in end_positions] 74 | 75 | query_context_tokens = tokenizer.encode(query, context, add_special_tokens=True) 76 | tokens = query_context_tokens.ids 77 | type_ids = query_context_tokens.type_ids 78 | offsets = query_context_tokens.offsets 79 | 80 | # find new start_positions/end_positions, considering 81 | # 1. we add query tokens at the beginning 82 | # 2. 
word-piece tokenize 83 | origin_offset2token_idx_start = {} 84 | origin_offset2token_idx_end = {} 85 | for token_idx in range(len(tokens)): 86 | # skip query tokens 87 | if type_ids[token_idx] == 0: 88 | continue 89 | token_start, token_end = offsets[token_idx] 90 | # skip [CLS] or [SEP] 91 | if token_start == token_end == 0: 92 | continue 93 | origin_offset2token_idx_start[token_start] = token_idx 94 | origin_offset2token_idx_end[token_end] = token_idx 95 | new_start_positions = [origin_offset2token_idx_start[start] for start in start_positions] 96 | new_end_positions = [origin_offset2token_idx_end[end] for end in end_positions] 97 | 98 | label_mask = [ 99 | (0 if type_ids[token_idx] == 0 or offsets[token_idx] == (0, 0) else 1) 100 | for token_idx in range(len(tokens)) 101 | ] 102 | start_label_mask = label_mask.copy() 103 | end_label_mask = label_mask.copy() 104 | 105 | # the start/end position must be whole word 106 | if not self.is_chinese: 107 | for token_idx in range(len(tokens)): 108 | current_word_idx = query_context_tokens.words[token_idx] 109 | next_word_idx = query_context_tokens.words[token_idx + 1] if token_idx + 1 < len(tokens) else None 110 | prev_word_idx = query_context_tokens.words[token_idx - 1] if token_idx - 1 > 0 else None 111 | if prev_word_idx is not None and current_word_idx == prev_word_idx: 112 | start_label_mask[token_idx] = 0 113 | if next_word_idx is not None and current_word_idx == next_word_idx: 114 | end_label_mask[token_idx] = 0 115 | 116 | assert all(start_label_mask[p] != 0 for p in new_start_positions) 117 | assert all(end_label_mask[p] != 0 for p in new_end_positions) 118 | 119 | assert len(new_start_positions) == len(new_end_positions) == len(start_positions) 120 | assert len(label_mask) == len(tokens) 121 | start_labels = [(1 if idx in new_start_positions else 0) 122 | for idx in range(len(tokens))] 123 | end_labels = [(1 if idx in new_end_positions else 0) 124 | for idx in range(len(tokens))] 125 | 126 | # truncate 127 | tokens = tokens[: self.max_length] 128 | type_ids = type_ids[: self.max_length] 129 | start_labels = start_labels[: self.max_length] 130 | end_labels = end_labels[: self.max_length] 131 | start_label_mask = start_label_mask[: self.max_length] 132 | end_label_mask = end_label_mask[: self.max_length] 133 | 134 | # make sure last token is [SEP] 135 | sep_token = tokenizer.token_to_id("[SEP]") 136 | if tokens[-1] != sep_token: 137 | assert len(tokens) == self.max_length 138 | tokens = tokens[: -1] + [sep_token] 139 | start_labels[-1] = 0 140 | end_labels[-1] = 0 141 | start_label_mask[-1] = 0 142 | end_label_mask[-1] = 0 143 | 144 | if self.pad_to_maxlen: 145 | tokens = self.pad(tokens, 0) 146 | type_ids = self.pad(type_ids, 1) 147 | start_labels = self.pad(start_labels) 148 | end_labels = self.pad(end_labels) 149 | start_label_mask = self.pad(start_label_mask) 150 | end_label_mask = self.pad(end_label_mask) 151 | 152 | seq_len = len(tokens) 153 | match_labels = torch.zeros([seq_len, seq_len], dtype=torch.long) 154 | for start, end in zip(new_start_positions, new_end_positions): 155 | if start >= seq_len or end >= seq_len: 156 | continue 157 | match_labels[start, end] = 1 158 | 159 | return [ 160 | torch.LongTensor(tokens), 161 | torch.LongTensor(type_ids), 162 | torch.LongTensor(start_labels), 163 | torch.LongTensor(end_labels), 164 | torch.LongTensor(start_label_mask), 165 | torch.LongTensor(end_label_mask), 166 | match_labels, 167 | sample_idx, 168 | label_idx 169 | ] 170 | 171 | def pad(self, lst, value=0, max_length=None): 172 | 
max_length = max_length or self.max_length 173 | while len(lst) < max_length: 174 | lst.append(value) 175 | return lst 176 | 177 | 178 | def load_model_for_compute(): 179 | CHECKPOINTS = "E:\\data\\modelNER\\version_7\\checkpoints\\epoch=2.ckpt" 180 | HPARAMS = "E:\\data\\modelNER\\version_7\\hparams.yaml" 181 | model = trainer.BertLabeling.load_from_checkpoint( 182 | checkpoint_path=CHECKPOINTS, 183 | hparams_file=HPARAMS, 184 | map_location=None, 185 | batch_size=1, 186 | max_length=128, 187 | workers=0 188 | ) 189 | return model 190 | 191 | 192 | def load_dataset_for_compute(): 193 | """test dataset""" 194 | import os 195 | from datasets.collate_functions import collate_to_max_length 196 | from torch.utils.data import DataLoader 197 | bert_path = "E:\data\chinese_roberta_wwm_large_ext_pytorch" 198 | json_path = "E:\\data\\nested\\mrc-ner.evl" 199 | is_chinese = True 200 | vocab_file = os.path.join(bert_path, "vocab.txt") 201 | tokenizer = BertWordPieceTokenizer(vocab_file) 202 | dataset = MRCNERDataset(json_path=json_path, tokenizer=tokenizer, 203 | is_chinese=is_chinese) 204 | dataloader = DataLoader(dataset, batch_size=1, 205 | collate_fn=collate_to_max_length) 206 | return dataloader, tokenizer 207 | 208 | 209 | 210 | def write_json_to_file(origin_path, des_path, data): 211 | with open(origin_path + des_path, "w", encoding="utf8") as fp: 212 | fp.write(json.dumps(data, ensure_ascii=False)) 213 | fp.write("\n") 214 | fp.flush() 215 | fp.close() 216 | 217 | def compute_result_by_query(): 218 | model = load_model_for_compute() 219 | data, tokenizer = load_dataset_for_compute() 220 | entity_total = {} 221 | with tqdm(total=len(data)) as pbar: 222 | try: 223 | for batch_idx, batch in enumerate(data): 224 | sample = data.dataset.all_data[batch_idx] 225 | entity_label = sample["entity_label"] 226 | if entity_label not in entity_total.keys(): 227 | entity_total[entity_label] = {"tp": 0, "fp": 0, "fn": 0} 228 | out_puts = model.validation_step(batch, batch_idx) 229 | tp, fp, fn = out_puts["span_f1_stats"] 230 | entity_total[entity_label]["tp"] = entity_total[entity_label]["tp"] + tp.item() 231 | entity_total[entity_label]["fp"] = entity_total[entity_label]["fp"] + fp.item() 232 | entity_total[entity_label]["fn"] = entity_total[entity_label]["fn"] + fn.item() 233 | pbar.update(1) 234 | except Exception as e: 235 | print("ERROR,PLEASE CHECK CODE!{}".format(e)) 236 | entity_total_result = {} 237 | tp_total = 0 238 | fp_total = 0 239 | fn_total = 0 240 | for k, v in entity_total.items(): 241 | tp = v["tp"] 242 | fp = v["fp"] 243 | fn = v["fn"] 244 | tp_total += tp 245 | fp_total += fp 246 | fn_total += fn 247 | P = tp * 100.0 / (tp + fp) 248 | R = tp * 100.0 / (tp + fn) 249 | F1 = 0 if (P + R) == 0 else (2 * P * R) / (P + R) 250 | entity_total_result[k] = {"precision": "{:.2f}".format(P), "recall": "{:.2f}".format(R), "f1": "{:.2f}".format(F1)} 251 | P_Tol = tp_total * 100.0 / (tp_total + fp_total) 252 | R_Tol = tp_total * 100.0 / (tp_total + fn_total) 253 | F1_Tol = 0 if (P_Tol + R_Tol) == 0 else (2 * P_Tol * R_Tol) / (P_Tol + R_Tol) 254 | write_json_to_file("", "query_entity_result.txt", entity_total_result) 255 | print("total precision:{:.2f},total recall:{:.2f},total f1:{:.2f}".format(P_Tol, R_Tol, F1_Tol)) 256 | 257 | 258 | if __name__ == '__main__': 259 | # run_dataset() 260 | compute_result_by_query() 261 | -------------------------------------------------------------------------------- /RE/initial.py: -------------------------------------------------------------------------------- 1 | 
import numpy as np 2 | import os 3 | 4 | 5 | # embedding the position 6 | def pos_embed(x): 7 | if x < -60: 8 | return 0 9 | if -60 <= x <= 60: 10 | return x + 61 11 | if x > 60: 12 | return 122 13 | 14 | 15 | # find the index of x in y, if x not in y, return -1 16 | def find_index(x, y): 17 | flag = -1 18 | for i in range(len(y)): 19 | if x != y[i]: 20 | continue 21 | else: 22 | return i 23 | return flag 24 | 25 | 26 | # reading data 27 | def init(): 28 | print('reading word embedding data...') 29 | vec = [] 30 | word2id = {} 31 | f = open('./data/vec.txt', encoding='utf-8') 32 | content = f.readline() 33 | content = content.strip().split() 34 | dim = int(content[1]) 35 | while True: 36 | content = f.readline() 37 | if content == '': 38 | break 39 | content = content.strip().split() 40 | word2id[content[0]] = len(word2id) 41 | content = content[1:] 42 | content = [(float)(i) for i in content] 43 | vec.append(content) 44 | f.close() 45 | word2id['UNK'] = len(word2id) 46 | word2id['BLANK'] = len(word2id) 47 | 48 | vec.append(np.random.normal(size=dim, loc=0, scale=0.05)) 49 | vec.append(np.random.normal(size=dim, loc=0, scale=0.05)) 50 | vec = np.array(vec, dtype=np.float32) 51 | 52 | print('reading relation to id') 53 | relation2id = {} 54 | f = open('./data/relation2id.txt', 'r', encoding='utf-8') 55 | while True: 56 | content = f.readline() 57 | if content == '': 58 | break 59 | content = content.strip().split() 60 | relation2id[content[0]] = int(content[1]) 61 | f.close() 62 | 63 | # length of sentence is 70 64 | fixlen = 70 65 | # max length of position embedding is 60 (-60~+60) 66 | maxlen = 60 67 | 68 | train_sen = {} # {entity pair:[[[label1-sentence 1],[label1-sentence 2]...],[[label2-sentence 1],[label2-sentence 2]...]} 69 | train_ans = {} # {entity pair:[label1,label2,...]} the label is one-hot vector 70 | 71 | print('reading train data...') 72 | f = open('./data/train.txt', 'r', encoding='utf-8') 73 | 74 | while True: 75 | content = f.readline() 76 | if content == '': 77 | break 78 | 79 | content = content.strip().split('\t') 80 | # get entity name 81 | en1 = content[0] 82 | en2 = content[1] 83 | relation = 0 84 | if content[2] not in relation2id: 85 | print(content[2]) 86 | relation = relation2id['NA'] 87 | else: 88 | relation = relation2id[content[2]] 89 | # put the same entity pair sentences into a dict 90 | tup = (en1, en2) 91 | label_tag = 0 92 | if tup not in train_sen: 93 | train_sen[tup] = [] 94 | train_sen[tup].append([]) 95 | y_id = relation 96 | label_tag = 0 97 | label = [0 for i in range(len(relation2id))] 98 | label[y_id] = 1 99 | train_ans[tup] = [] 100 | train_ans[tup].append(label) 101 | else: 102 | y_id = relation 103 | label_tag = 0 104 | label = [0 for i in range(len(relation2id))] 105 | label[y_id] = 1 106 | 107 | temp = find_index(label, train_ans[tup]) 108 | if temp == -1: 109 | train_ans[tup].append(label) 110 | label_tag = len(train_ans[tup]) - 1 111 | train_sen[tup].append([]) 112 | else: 113 | label_tag = temp 114 | 115 | sentence = content[3] 116 | 117 | en1pos = 0 118 | en2pos = 0 119 | 120 | #For Chinese 121 | en1pos = sentence.find(en1) 122 | if en1pos == -1: 123 | en1pos = 0 124 | en2pos = sentence.find(en2) 125 | if en2pos == -1: 126 | en2pos = 0 127 | 128 | output = [] 129 | 130 | #Embeding the position 131 | for i in range(fixlen): 132 | word = word2id['BLANK'] 133 | rel_e1 = pos_embed(i - en1pos) 134 | rel_e2 = pos_embed(i - en2pos) 135 | output.append([word, rel_e1, rel_e2]) 136 | 137 | for i in range(min(fixlen, len(sentence))): 138 | word = 
0 139 | if sentence[i] not in word2id: 140 | word = word2id['UNK'] 141 | else: 142 | word = word2id[sentence[i]] 143 | 144 | output[i][0] = word 145 | 146 | train_sen[tup][label_tag].append(output) 147 | 148 | print('reading test data ...') 149 | 150 | test_sen = {} # {entity pair:[[sentence 1],[sentence 2]...]} 151 | test_ans = {} # {entity pair:[labels,...]} the labels is N-hot vector (N is the number of multi-label) 152 | 153 | f = open('./data/test.txt', 'r', encoding='utf-8') 154 | 155 | while True: 156 | content = f.readline() 157 | if content == '': 158 | break 159 | 160 | content = content.strip().split('\t') 161 | en1 = content[0] 162 | en2 = content[1] 163 | relation = 0 164 | if content[2] not in relation2id: 165 | relation = relation2id['NA'] 166 | else: 167 | relation = relation2id[content[2]] 168 | tup = (en1, en2) 169 | 170 | if tup not in test_sen: 171 | test_sen[tup] = [] 172 | y_id = relation 173 | label_tag = 0 174 | label = [0 for i in range(len(relation2id))] 175 | label[y_id] = 1 176 | test_ans[tup] = label 177 | else: 178 | y_id = relation 179 | test_ans[tup][y_id] = 1 180 | 181 | sentence = content[3] 182 | 183 | en1pos = 0 184 | en2pos = 0 185 | 186 | #For Chinese 187 | en1pos = sentence.find(en1) 188 | if en1pos == -1: 189 | en1pos = 0 190 | en2pos = sentence.find(en2) 191 | if en2pos == -1: 192 | en2pos = 0 193 | 194 | output = [] 195 | 196 | for i in range(fixlen): 197 | word = word2id['BLANK'] 198 | rel_e1 = pos_embed(i - en1pos) 199 | rel_e2 = pos_embed(i - en2pos) 200 | output.append([word, rel_e1, rel_e2]) 201 | 202 | for i in range(min(fixlen, len(sentence))): 203 | word = 0 204 | if sentence[i] not in word2id: 205 | word = word2id['UNK'] 206 | else: 207 | word = word2id[sentence[i]] 208 | 209 | output[i][0] = word 210 | test_sen[tup].append(output) 211 | 212 | train_x = [] 213 | train_y = [] 214 | test_x = [] 215 | test_y = [] 216 | 217 | if not os.path.exists("data"): 218 | os.makedirs("data") 219 | 220 | print('organizing train data') 221 | f = open('./data/train_q&a.txt', 'w', encoding='utf-8') 222 | temp = 0 223 | for i in train_sen: 224 | if len(train_ans[i]) != len(train_sen[i]): 225 | print('ERROR') 226 | lenth = len(train_ans[i]) 227 | for j in range(lenth): 228 | train_x.append(train_sen[i][j]) 229 | train_y.append(train_ans[i][j]) 230 | f.write(str(temp) + '\t' + i[0] + '\t' + i[1] + '\t' + str(np.argmax(train_ans[i][j])) + '\n') 231 | temp += 1 232 | f.close() 233 | 234 | print('organizing test data') 235 | f = open('./data/test_q&a.txt', 'w', encoding='utf-8') 236 | temp = 0 237 | for i in test_sen: 238 | test_x.append(test_sen[i]) 239 | test_y.append(test_ans[i]) 240 | tempstr = '' 241 | for j in range(len(test_ans[i])): 242 | if test_ans[i][j] != 0: 243 | tempstr = tempstr + str(j) + '\t' 244 | f.write(str(temp) + '\t' + i[0] + '\t' + i[1] + '\t' + tempstr + '\n') 245 | temp += 1 246 | f.close() 247 | 248 | train_x = np.array(train_x) 249 | train_y = np.array(train_y) 250 | test_x = np.array(test_x) 251 | test_y = np.array(test_y) 252 | 253 | np.save('./data/vec.npy', vec) 254 | np.save('./data/train_x.npy', train_x) 255 | np.save('./data/train_y.npy', train_y) 256 | np.save('./data/testall_x.npy', test_x) 257 | np.save('./data/testall_y.npy', test_y) 258 | 259 | 260 | def seperate(): 261 | print('reading training data') 262 | x_train = np.load('./data/train_x.npy', allow_pickle=True) 263 | 264 | train_word = [] 265 | train_pos1 = [] 266 | train_pos2 = [] 267 | 268 | print('seprating train data') 269 | for i in range(len(x_train)): 270 | word 
= [] 271 | pos1 = [] 272 | pos2 = [] 273 | for j in x_train[i]: 274 | temp_word = [] 275 | temp_pos1 = [] 276 | temp_pos2 = [] 277 | for k in j: 278 | temp_word.append(k[0]) 279 | temp_pos1.append(k[1]) 280 | temp_pos2.append(k[2]) 281 | word.append(temp_word) 282 | pos1.append(temp_pos1) 283 | pos2.append(temp_pos2) 284 | train_word.append(word) 285 | train_pos1.append(pos1) 286 | train_pos2.append(pos2) 287 | 288 | train_word = np.array(train_word) 289 | train_pos1 = np.array(train_pos1) 290 | train_pos2 = np.array(train_pos2) 291 | np.save('./data/train_word.npy', train_word) 292 | np.save('./data/train_pos1.npy', train_pos1) 293 | np.save('./data/train_pos2.npy', train_pos2) 294 | 295 | print('seperating test all data') 296 | x_test = np.load('./data/testall_x.npy', allow_pickle=True) 297 | test_word = [] 298 | test_pos1 = [] 299 | test_pos2 = [] 300 | 301 | for i in range(len(x_test)): 302 | word = [] 303 | pos1 = [] 304 | pos2 = [] 305 | for j in x_test[i]: 306 | temp_word = [] 307 | temp_pos1 = [] 308 | temp_pos2 = [] 309 | for k in j: 310 | temp_word.append(k[0]) 311 | temp_pos1.append(k[1]) 312 | temp_pos2.append(k[2]) 313 | word.append(temp_word) 314 | pos1.append(temp_pos1) 315 | pos2.append(temp_pos2) 316 | test_word.append(word) 317 | test_pos1.append(pos1) 318 | test_pos2.append(pos2) 319 | 320 | test_word = np.array(test_word) 321 | test_pos1 = np.array(test_pos1) 322 | test_pos2 = np.array(test_pos2) 323 | 324 | np.save('./data/testall_word.npy', test_word) 325 | np.save('./data/testall_pos1.npy', test_pos1) 326 | np.save('./data/testall_pos2.npy', test_pos2) 327 | 328 | 329 | 330 | # get answer metric for PR curve evaluation 331 | def getans(): 332 | test_y = np.load('./data/testall_y.npy', allow_pickle=True) 333 | eval_y = [] 334 | for i in test_y: 335 | eval_y.append(i[1:]) 336 | allans = np.reshape(eval_y, (-1)) 337 | np.save('./data/allans.npy', allans) 338 | 339 | 340 | def get_metadata(): 341 | fwrite = open('./data/metadata.tsv', 'w', encoding='utf-8') 342 | f = open('./origin_data/vec.txt', encoding='utf-8') 343 | f.readline() 344 | while True: 345 | content = f.readline().strip() 346 | if content == '': 347 | break 348 | name = content.split()[0] 349 | fwrite.write(name + '\n') 350 | f.close() 351 | fwrite.close() 352 | 353 | 354 | init() 355 | seperate() 356 | getans() 357 | get_metadata() 358 | -------------------------------------------------------------------------------- /NER/datasets/compute_acc_linux.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import json 5 | import torch 6 | from tokenizers import BertWordPieceTokenizer 7 | from torch.utils.data import Dataset 8 | import trainer 9 | from tqdm import tqdm 10 | 11 | class MRCNERDataset(Dataset): 12 | """ 13 | MRC NER Dataset 14 | Args: 15 | json_path: path to mrc-ner style json 16 | tokenizer: BertTokenizer 17 | max_length: int, max length of query+context 18 | possible_only: if True, only use possible samples that contain answer for the query/context 19 | is_chinese: is chinese dataset 20 | """ 21 | 22 | def __init__(self, json_path, tokenizer: BertWordPieceTokenizer, max_length: int = 128, possible_only=False, 23 | is_chinese=False, pad_to_maxlen=False): 24 | self.all_data = json.load(open(json_path, encoding="utf-8")) 25 | self.tokenzier = tokenizer 26 | self.max_length = max_length 27 | self.possible_only = possible_only 28 | if self.possible_only: 29 | self.all_data = [ 30 | x for x in self.all_data if x["start_position"] 31 | ] 
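            # (the list comprehension above keeps only samples whose query has at least one gold start position)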
32 | self.is_chinese = is_chinese 33 | self.pad_to_maxlen = pad_to_maxlen 34 | 35 | def __len__(self): 36 | return len(self.all_data) 37 | 38 | def __getitem__(self, item): 39 | """ 40 | Args: 41 | item: int, idx 42 | Returns: 43 | tokens: tokens of query + context, [seq_len] 44 | token_type_ids: token type ids, 0 for query, 1 for context, [seq_len] 45 | start_labels: start labels of NER in tokens, [seq_len] 46 | end_labels: end labelsof NER in tokens, [seq_len] 47 | label_mask: label mask, 1 for counting into loss, 0 for ignoring. [seq_len] 48 | match_labels: match labels, [seq_len, seq_len] 49 | sample_idx: sample id 50 | label_idx: label id 51 | 52 | """ 53 | data = self.all_data[item] 54 | tokenizer = self.tokenzier 55 | 56 | qas_id = data.get("qas_id", "0.0") 57 | sample_idx, label_idx = qas_id.split(".") 58 | sample_idx = torch.LongTensor([int(sample_idx)]) 59 | label_idx = torch.LongTensor([int(label_idx)]) 60 | 61 | query = data["query"] 62 | context = data["context"] 63 | start_positions = data["start_position"] 64 | end_positions = data["end_position"] 65 | 66 | if self.is_chinese: 67 | context = "".join(context.split()) 68 | end_positions = [x + 1 for x in end_positions] 69 | else: 70 | # add space offsets 71 | words = context.split() 72 | start_positions = [x + sum([len(w) for w in words[:x]]) for x in start_positions] 73 | end_positions = [x + sum([len(w) for w in words[:x + 1]]) for x in end_positions] 74 | 75 | query_context_tokens = tokenizer.encode(query, context, add_special_tokens=True) 76 | tokens = query_context_tokens.ids 77 | type_ids = query_context_tokens.type_ids 78 | offsets = query_context_tokens.offsets 79 | 80 | # find new start_positions/end_positions, considering 81 | # 1. we add query tokens at the beginning 82 | # 2. 
word-piece tokenize 83 | origin_offset2token_idx_start = {} 84 | origin_offset2token_idx_end = {} 85 | for token_idx in range(len(tokens)): 86 | # skip query tokens 87 | if type_ids[token_idx] == 0: 88 | continue 89 | token_start, token_end = offsets[token_idx] 90 | # skip [CLS] or [SEP] 91 | if token_start == token_end == 0: 92 | continue 93 | origin_offset2token_idx_start[token_start] = token_idx 94 | origin_offset2token_idx_end[token_end] = token_idx 95 | new_start_positions = [origin_offset2token_idx_start[start] for start in start_positions] 96 | new_end_positions = [origin_offset2token_idx_end[end] for end in end_positions] 97 | 98 | label_mask = [ 99 | (0 if type_ids[token_idx] == 0 or offsets[token_idx] == (0, 0) else 1) 100 | for token_idx in range(len(tokens)) 101 | ] 102 | start_label_mask = label_mask.copy() 103 | end_label_mask = label_mask.copy() 104 | 105 | # the start/end position must be whole word 106 | if not self.is_chinese: 107 | for token_idx in range(len(tokens)): 108 | current_word_idx = query_context_tokens.words[token_idx] 109 | next_word_idx = query_context_tokens.words[token_idx + 1] if token_idx + 1 < len(tokens) else None 110 | prev_word_idx = query_context_tokens.words[token_idx - 1] if token_idx - 1 > 0 else None 111 | if prev_word_idx is not None and current_word_idx == prev_word_idx: 112 | start_label_mask[token_idx] = 0 113 | if next_word_idx is not None and current_word_idx == next_word_idx: 114 | end_label_mask[token_idx] = 0 115 | 116 | assert all(start_label_mask[p] != 0 for p in new_start_positions) 117 | assert all(end_label_mask[p] != 0 for p in new_end_positions) 118 | 119 | assert len(new_start_positions) == len(new_end_positions) == len(start_positions) 120 | assert len(label_mask) == len(tokens) 121 | start_labels = [(1 if idx in new_start_positions else 0) 122 | for idx in range(len(tokens))] 123 | end_labels = [(1 if idx in new_end_positions else 0) 124 | for idx in range(len(tokens))] 125 | 126 | # truncate 127 | tokens = tokens[: self.max_length] 128 | type_ids = type_ids[: self.max_length] 129 | start_labels = start_labels[: self.max_length] 130 | end_labels = end_labels[: self.max_length] 131 | start_label_mask = start_label_mask[: self.max_length] 132 | end_label_mask = end_label_mask[: self.max_length] 133 | 134 | # make sure last token is [SEP] 135 | sep_token = tokenizer.token_to_id("[SEP]") 136 | if tokens[-1] != sep_token: 137 | assert len(tokens) == self.max_length 138 | tokens = tokens[: -1] + [sep_token] 139 | start_labels[-1] = 0 140 | end_labels[-1] = 0 141 | start_label_mask[-1] = 0 142 | end_label_mask[-1] = 0 143 | 144 | if self.pad_to_maxlen: 145 | tokens = self.pad(tokens, 0) 146 | type_ids = self.pad(type_ids, 1) 147 | start_labels = self.pad(start_labels) 148 | end_labels = self.pad(end_labels) 149 | start_label_mask = self.pad(start_label_mask) 150 | end_label_mask = self.pad(end_label_mask) 151 | 152 | seq_len = len(tokens) 153 | match_labels = torch.zeros([seq_len, seq_len], dtype=torch.long) 154 | for start, end in zip(new_start_positions, new_end_positions): 155 | if start >= seq_len or end >= seq_len: 156 | continue 157 | match_labels[start, end] = 1 158 | 159 | return [ 160 | torch.LongTensor(tokens), 161 | torch.LongTensor(type_ids), 162 | torch.LongTensor(start_labels), 163 | torch.LongTensor(end_labels), 164 | torch.LongTensor(start_label_mask), 165 | torch.LongTensor(end_label_mask), 166 | match_labels, 167 | sample_idx, 168 | label_idx 169 | ] 170 | 171 | def pad(self, lst, value=0, max_length=None): 172 | 
max_length = max_length or self.max_length 173 | while len(lst) < max_length: 174 | lst.append(value) 175 | return lst 176 | 177 | 178 | def load_model_for_compute(): 179 | CHECKPOINTS = "/ml/home/mcldd/ner-rel/diabete-kno-dataset-code/mrc-for-flat-nested-ner-master/lightning_logs/version_17/checkpoints/epoch=7.ckpt" 180 | HPARAMS = "/ml/home/mcldd/ner-rel/diabete-kno-dataset-code/mrc-for-flat-nested-ner-master/lightning_logs/version_17/hparams.yaml" 181 | 182 | model = trainer.BertLabeling.load_from_checkpoint( 183 | checkpoint_path=CHECKPOINTS, 184 | hparams_file=HPARAMS, 185 | map_location=None, 186 | batch_size=1, 187 | max_length=128, 188 | workers=0 189 | ) 190 | return model 191 | 192 | 193 | def load_dataset_for_compute(): 194 | """test dataset""" 195 | import os 196 | from datasets.collate_functions import collate_to_max_length 197 | from torch.utils.data import DataLoader 198 | bert_path = "/ml/home/mcldd/ner-rel/diabete-kno-dataset-code/mrc-for-flat-nested-ner-master/models/chinese_roberta_wwm_large_ext_pytorch" 199 | json_path = "/ml/home/mcldd/ner-rel/diabete-kno-dataset-code/mrc-for-flat-nested-ner-master/entity_type_data/mrc-ner.test" 200 | 201 | is_chinese = False 202 | vocab_file = os.path.join(bert_path, "vocab.txt") 203 | tokenizer = BertWordPieceTokenizer(vocab_file) 204 | dataset = MRCNERDataset(json_path=json_path, tokenizer=tokenizer, 205 | is_chinese=is_chinese) 206 | dataloader = DataLoader(dataset, batch_size=1, 207 | collate_fn=collate_to_max_length) 208 | return dataloader, tokenizer 209 | 210 | 211 | 212 | 213 | def write_json_to_file(origin_path, des_path, data): 214 | with open(origin_path + des_path, "w", encoding="utf8") as fp: 215 | fp.write(json.dumps(data, ensure_ascii=False)) 216 | fp.write("\n") 217 | fp.flush() 218 | fp.close() 219 | 220 | 221 | def compute_result_by_query(): 222 | model = load_model_for_compute() 223 | data, tokenizer = load_dataset_for_compute() 224 | entity_total = {} 225 | with tqdm(total=len(data)) as pbar: 226 | try: 227 | for batch_idx, batch in enumerate(data): 228 | sample = data.dataset.all_data[batch_idx] 229 | entity_label = sample["entity_label"] 230 | if entity_label not in entity_total.keys(): 231 | entity_total[entity_label] = {"tp": 0, "fp": 0, "fn": 0} 232 | out_puts = model.validation_step(batch, batch_idx) 233 | tp, fp, fn = out_puts["span_f1_stats"] 234 | entity_total[entity_label]["tp"] = entity_total[entity_label]["tp"] + tp.item() 235 | entity_total[entity_label]["fp"] = entity_total[entity_label]["fp"] + fp.item() 236 | entity_total[entity_label]["fn"] = entity_total[entity_label]["fn"] + fn.item() 237 | pbar.update(1) 238 | except Exception as e: 239 | print("ERROR,PLEASE CHECK CODE!{}".format(e)) 240 | entity_total_result = {} 241 | tp_total = 0 242 | fp_total = 0 243 | fn_total = 0 244 | for k, v in entity_total.items(): 245 | tp = v["tp"] 246 | fp = v["fp"] 247 | fn = v["fn"] 248 | tp_total += tp 249 | fp_total += fp 250 | fn_total += fn 251 | P = tp * 100.0 / (tp + fp) 252 | R = tp * 100.0 / (tp + fn) 253 | F1 = 0 if (P + R) == 0 else (2 * P * R) / (P + R) 254 | entity_total_result[k] = {"precision": "{:.2f}%".format(P), "recall": "{:.2f}%".format(R), "f1": "{:.2f}%".format(F1)} 255 | P_Tol = tp_total * 100.0 / (tp_total + fp_total) 256 | R_Tol = tp_total * 100.0 / (tp_total + fn_total) 257 | F1_Tol = 0 if (P_Tol + R_Tol) == 0 else (2 * P_Tol * R_Tol) / (P_Tol + R_Tol) 258 | write_json_to_file("", "query_entity_result.txt", entity_total_result) 259 | print("total precision:{:.2f},total recall:{:.2f},total 
f1:{:.2f}".format(P_Tol, R_Tol, F1_Tol)) 260 | 261 | 262 | if __name__ == '__main__': 263 | # run_dataset() 264 | compute_result_by_query() 265 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /RE/test_GRU.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import time 6 | import datetime 7 | import os 8 | import network 9 | from sklearn import metrics 10 | 11 | FLAGS = tf.app.flags.FLAGS 12 | 13 | import os 14 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 15 | 16 | # embedding the position 17 | def pos_embed(x): 18 | if x < -60: 19 | return 0 20 | if -60 <= x <= 60: 21 | return x + 61 22 | if x > 60: 23 | return 122 24 | 25 | 26 | def main_for_evaluation(): 27 | # def main(_): 28 | pathname = "./model_custom/ATT_GRU_model-" 29 | 30 | wordembedding = np.load('chinese_RE/custom_RE/vec.npy') 31 | 32 | test_settings = network.Settings() 33 | test_settings.vocab_size = 16693 34 | test_settings.num_classes = 16 35 | test_settings.big_num = 5561 36 | 37 | big_num_test = test_settings.big_num 38 | 39 | with tf.Graph().as_default(): 40 | 41 | sess = tf.Session() 42 | with sess.as_default(): 43 | 44 | def test_step(word_batch, pos1_batch, pos2_batch, y_batch): 45 | 46 | feed_dict = {} 47 | total_shape = [] 48 | total_num = 0 49 | total_word = [] 50 | total_pos1 = [] 51 | total_pos2 = [] 52 | 53 | for i in range(len(word_batch)): 54 | total_shape.append(total_num) 55 | total_num += len(word_batch[i]) 56 | for word in word_batch[i]: 57 | total_word.append(word) 58 | for pos1 in pos1_batch[i]: 59 | total_pos1.append(pos1) 60 | for pos2 in pos2_batch[i]: 61 | total_pos2.append(pos2) 62 | 63 | total_shape.append(total_num) 64 | total_shape = np.array(total_shape) 65 | total_word = np.array(total_word) 66 | total_pos1 = np.array(total_pos1) 67 | total_pos2 = np.array(total_pos2) 68 | 69 | feed_dict[mtest.total_shape] = total_shape 70 | feed_dict[mtest.input_word] = total_word 71 | feed_dict[mtest.input_pos1] = total_pos1 72 | feed_dict[mtest.input_pos2] = total_pos2 73 | feed_dict[mtest.input_y] = y_batch 74 | 75 | loss, accuracy, prob = sess.run( 76 | [mtest.loss, mtest.accuracy, mtest.prob], feed_dict) 77 | return prob, accuracy 78 | 79 | 80 | with tf.variable_scope("model"): 81 | mtest = network.GRU(is_training=False, word_embeddings=wordembedding, settings=test_settings) 82 | 83 | names_to_vars = {v.op.name: v for v in 
tf.global_variables()} 84 | saver = tf.train.Saver(names_to_vars) 85 | 86 | 87 | #testlist = range(1000, 1800, 100) 88 | testlist = [8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800] 89 | 90 | for model_iter in testlist: 91 | # for compatibility purposes only, name key changes from tf 0.x to 1.x, compat_layer 92 | saver.restore(sess, pathname + str(model_iter)) 93 | 94 | 95 | time_str = datetime.datetime.now().isoformat() 96 | print(time_str) 97 | print('Evaluating all test data and saving data for the PR curve') 98 | 99 | test_y = np.load('chinese_RE/custom_RE/testall_y.npy', allow_pickle=True) 100 | test_word = np.load('chinese_RE/custom_RE/testall_word.npy', allow_pickle=True) 101 | test_pos1 = np.load('chinese_RE/custom_RE/testall_pos1.npy', allow_pickle=True) 102 | test_pos2 = np.load('chinese_RE/custom_RE/testall_pos2.npy', allow_pickle=True) 103 | allprob = [] 104 | acc = [] 105 | for i in range(int(len(test_word) / float(test_settings.big_num))): 106 | prob, accuracy = test_step(test_word[i * test_settings.big_num:(i + 1) * test_settings.big_num], 107 | test_pos1[i * test_settings.big_num:(i + 1) * test_settings.big_num], 108 | test_pos2[i * test_settings.big_num:(i + 1) * test_settings.big_num], 109 | test_y[i * test_settings.big_num:(i + 1) * test_settings.big_num]) 110 | acc.append(np.mean(np.reshape(np.array(accuracy), (test_settings.big_num)))) 111 | prob = np.reshape(np.array(prob), (test_settings.big_num, test_settings.num_classes)) 112 | for single_prob in prob: 113 | allprob.append(single_prob[1:]) 114 | allprob = np.reshape(np.array(allprob), (-1)) 115 | order = np.argsort(-allprob) 116 | 117 | print('saving all test results...') 118 | current_step = model_iter 119 | 120 | np.save('./out/allprob_iter_' + str(current_step) + '.npy', allprob) 121 | allans = np.load('chinese_RE/custom_RE/allans.npy', allow_pickle=True) 122 | 123 | # calculate the PR curve area 124 | average_precision = metrics.average_precision_score(allans, allprob) 125 | print('PR curve area:' + str(average_precision)) 126 | 127 | 128 | def main(_): 129 | 130 | #If you retrain the model, please remember to change the path to your own model below: 131 | pathname = "./model_custom/ATT_GRU_model-8800" 132 | 133 | wordembedding = np.load('chinese_RE/custom_RE/vec.npy') 134 | test_settings = network.Settings() 135 | test_settings.vocab_size = 16693 136 | test_settings.num_classes = 16 137 | test_settings.big_num = 1 138 | 139 | with tf.Graph().as_default(): 140 | sess = tf.Session() 141 | with sess.as_default(): 142 | def test_step(word_batch, pos1_batch, pos2_batch, y_batch): 143 | 144 | feed_dict = {} 145 | total_shape = [] 146 | total_num = 0 147 | total_word = [] 148 | total_pos1 = [] 149 | total_pos2 = [] 150 | 151 | for i in range(len(word_batch)): 152 | total_shape.append(total_num) 153 | total_num += len(word_batch[i]) 154 | for word in word_batch[i]: 155 | total_word.append(word) 156 | for pos1 in pos1_batch[i]: 157 | total_pos1.append(pos1) 158 | for pos2 in pos2_batch[i]: 159 | total_pos2.append(pos2) 160 | 161 | total_shape.append(total_num) 162 | total_shape = np.array(total_shape) 163 | total_word = np.array(total_word) 164 | total_pos1 = np.array(total_pos1) 165 | total_pos2 = np.array(total_pos2) 166 | 167 | feed_dict[mtest.total_shape] = total_shape 168 | feed_dict[mtest.input_word] = total_word 169 | feed_dict[mtest.input_pos1] = total_pos1 170 | feed_dict[mtest.input_pos2] = total_pos2 171 | feed_dict[mtest.input_y] = y_batch 172 | 173 | loss, accuracy, prob = sess.run( 174 | [mtest.loss,
mtest.accuracy, mtest.prob], feed_dict) 175 | return prob, accuracy 176 | 177 | 178 | with tf.variable_scope("model"): 179 | mtest = network.GRU(is_training=False, word_embeddings=wordembedding, settings=test_settings) 180 | 181 | names_to_vars = {v.op.name: v for v in tf.global_variables()} 182 | saver = tf.train.Saver(names_to_vars) 183 | saver.restore(sess, pathname) 184 | 185 | print('reading word embedding data...') 186 | vec = [] 187 | word2id = {} 188 | f = open('./origin_data/vec.txt', encoding='utf-8') 189 | content = f.readline() 190 | content = content.strip().split() 191 | dim = int(content[1]) 192 | while True: 193 | content = f.readline() 194 | if content == '': 195 | break 196 | content = content.strip().split() 197 | word2id[content[0]] = len(word2id) 198 | content = content[1:] 199 | content = [(float)(i) for i in content] 200 | vec.append(content) 201 | f.close() 202 | word2id['UNK'] = len(word2id) 203 | word2id['BLANK'] = len(word2id) 204 | 205 | print('reading relation to id') 206 | relation2id = {} 207 | id2relation = {} 208 | f = open('chinese_RE/custom_RE/relation2id.txt', 'r', encoding='utf-8') 209 | while True: 210 | content = f.readline() 211 | if content == '': 212 | break 213 | content = content.strip().split() 214 | relation2id[content[0]] = int(content[1]) 215 | id2relation[int(content[1])] = content[0] 216 | 217 | f.close() 218 | 219 | y_true = [] 220 | y_pred = [] 221 | 222 | with open('chinese_RE/custom_RE/test.txt', encoding='utf-8') as f: 223 | for orgline in f: 224 | line = orgline.strip() 225 | # break 226 | # infile.close() 227 | entity1, entity2, rel, sentence = line.split('\t', 3) 228 | 229 | # print("entity 1: " + en1) 230 | # print("entity 2: " + en2) 231 | # print(sentence) 232 | # relation = 0 233 | en1 = entity1.split('###')[0] 234 | en2 = entity2.split('###')[0] 235 | en1pos = sentence.find(en1) 236 | if en1pos == -1: 237 | en1pos = 0 238 | en2pos = sentence.find(en2) 239 | if en2pos == -1: 240 | en2pos = 0 241 | output = [] 242 | # length of sentence is 70 243 | fixlen = 70 244 | # max length of position embedding is 60 (-60~+60) 245 | maxlen = 60 246 | 247 | #Encoding test x 248 | for i in range(fixlen): 249 | word = word2id['BLANK'] 250 | rel_e1 = pos_embed(i - en1pos) 251 | rel_e2 = pos_embed(i - en2pos) 252 | output.append([word, rel_e1, rel_e2]) 253 | 254 | for i in range(min(fixlen, len(sentence))): 255 | 256 | word = 0 257 | if sentence[i] not in word2id: 258 | word = word2id['UNK'] 259 | 260 | else: 261 | word = word2id[sentence[i]] 262 | 263 | output[i][0] = word 264 | test_x = [] 265 | test_x.append([output]) 266 | 267 | #Encoding test y 268 | label = [0 for i in range(len(relation2id))] 269 | label[relation2id[rel]] = 1 270 | test_y = [] 271 | test_y.append(label) 272 | 273 | test_x = np.array(test_x) 274 | test_y = np.array(test_y) 275 | 276 | test_word = [] 277 | test_pos1 = [] 278 | test_pos2 = [] 279 | 280 | for i in range(len(test_x)): 281 | word = [] 282 | pos1 = [] 283 | pos2 = [] 284 | for j in test_x[i]: 285 | temp_word = [] 286 | temp_pos1 = [] 287 | temp_pos2 = [] 288 | for k in j: 289 | temp_word.append(k[0]) 290 | temp_pos1.append(k[1]) 291 | temp_pos2.append(k[2]) 292 | word.append(temp_word) 293 | pos1.append(temp_pos1) 294 | pos2.append(temp_pos2) 295 | 296 | test_word.append(word) 297 | test_pos1.append(pos1) 298 | test_pos2.append(pos2) 299 | 300 | test_word = np.array(test_word) 301 | test_pos1 = np.array(test_pos1) 302 | test_pos2 = np.array(test_pos2) 303 | 304 | prob, accuracy = test_step(test_word, test_pos1,
test_pos2, test_y) 305 | prob = np.reshape(np.array(prob), (1, test_settings.num_classes))[0] 306 | 307 | top_id = prob.argsort()[-1] 308 | 309 | y_true.append(rel) 310 | y_pred.append(id2relation[top_id]) 311 | 312 | print('accuracy:', metrics.accuracy_score(y_true, y_pred)) # prediction accuracy 313 | 314 | print('macro-averaged precision:', metrics.precision_score(y_true, y_pred, average='macro')) # macro-averaged precision 315 | print('micro-averaged precision:', metrics.precision_score(y_true, y_pred, average='micro')) # micro-averaged precision 316 | print('weighted-averaged precision:', metrics.precision_score(y_true, y_pred, average='weighted')) # weighted-averaged precision 317 | 318 | print('macro-averaged recall:', metrics.recall_score(y_true, y_pred, average='macro')) # macro-averaged recall 319 | print('micro-averaged recall:', metrics.recall_score(y_true, y_pred, average='micro')) # micro-averaged recall 320 | print('weighted-averaged recall:', metrics.recall_score(y_true, y_pred, average='weighted')) # weighted-averaged recall 321 | 322 | print('macro-averaged F1-score:', 323 | metrics.f1_score(y_true, y_pred, labels=[id2relation[i] for i in list(range(16))], average='macro')) # macro-averaged F1-score 324 | print('micro-averaged F1-score:', 325 | metrics.f1_score(y_true, y_pred, labels=[id2relation[i] for i in list(range(16))], average='micro')) # micro-averaged F1-score 326 | print('weighted-averaged F1-score:', 327 | metrics.f1_score(y_true, y_pred, labels=[id2relation[i] for i in list(range(16))], average='weighted')) # weighted-averaged F1-score 328 | 329 | # print('confusion matrix:\n', metrics.confusion_matrix(y_true, y_pred, labels=list(range(16)))) # confusion matrix 330 | print('classification report:\n', metrics.classification_report(y_true, y_pred, labels=[id2relation[i] for i in list(range(16))], digits=3)) # classification report 331 | 332 | if __name__ == "__main__": 333 | tf.app.run() 334 | -------------------------------------------------------------------------------- /NER/trainer.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | import argparse 5 | import os 6 | from collections import namedtuple 7 | from typing import Dict 8 | 9 | import pytorch_lightning as pl 10 | import torch 11 | from pytorch_lightning import Trainer 12 | from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint 13 | from tokenizers import BertWordPieceTokenizer 14 | from torch import Tensor 15 | from torch.nn.modules import CrossEntropyLoss, BCEWithLogitsLoss 16 | from torch.utils.data import DataLoader 17 | from transformers import AdamW 18 | from torch.optim import SGD 19 | 20 | from datasets.mrc_ner_dataset import MRCNERDataset 21 | from datasets.truncate_dataset import TruncateDataset 22 | from datasets.collate_functions import collate_to_max_length 23 | from metrics.query_span_f1 import QuerySpanF1 24 | from models.bert_query_ner import BertQueryNER 25 | from models.query_ner_config import BertQueryNerConfig 26 | from loss import * 27 | from utils.get_parser import get_parser 28 | from utils.radom_seed import set_random_seed 29 | import logging 30 | set_random_seed(0) 31 | 32 | 33 | class BertLabeling(pl.LightningModule): 34 | """MRC-NER Trainer""" 35 | 36 | def __init__(self, args: argparse.Namespace): 37 | """Initialize a model, tokenizer and config.""" 38 | super().__init__() 39 | if isinstance(args, argparse.Namespace): 40 | self.save_hyperparameters(args) 41 | self.args = args 42 | else: 43 | # eval mode 44 | TmpArgs = namedtuple("tmp_args", field_names=list(args.keys())) 45 | self.args = args = TmpArgs(**args) 46 | 47 | self.bert_dir = args.bert_config_dir 48 | self.data_dir = self.args.data_dir 49 | 50 | bert_config = BertQueryNerConfig.from_pretrained(args.bert_config_dir, 51 |
hidden_dropout_prob=args.bert_dropout, 52 | attention_probs_dropout_prob=args.bert_dropout, 53 | mrc_dropout=args.mrc_dropout) 54 | 55 | self.model = BertQueryNER.from_pretrained(args.bert_config_dir, 56 | config=bert_config) 57 | # logging.info(str(self.model)) 58 | logging.info(str(args.__dict__ if isinstance(args, argparse.ArgumentParser) else args)) 59 | # self.ce_loss = CrossEntropyLoss(reduction="none") 60 | self.loss_type = args.loss_type 61 | # self.loss_type = "bce" 62 | if self.loss_type == "bce": 63 | self.bce_loss = BCEWithLogitsLoss(reduction="none") 64 | else: 65 | self.dice_loss = DiceLoss(with_logits=True, smooth=args.dice_smooth) 66 | # todo(yuxian): since the match loss is O(n^2), its loss weight should be adjusted specially 67 | weight_sum = args.weight_start + args.weight_end + args.weight_span 68 | self.weight_start = args.weight_start / weight_sum 69 | self.weight_end = args.weight_end / weight_sum 70 | self.weight_span = args.weight_span / weight_sum 71 | self.flat_ner = args.flat 72 | self.span_f1 = QuerySpanF1(flat=self.flat_ner) 73 | self.chinese = args.chinese 74 | self.optimizer = args.optimizer 75 | self.span_loss_candidates = args.span_loss_candidates 76 | 77 | @staticmethod 78 | def add_model_specific_args(parent_parser): 79 | parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False) 80 | parser.add_argument("--mrc_dropout", type=float, default=0.1, 81 | help="mrc dropout rate") 82 | parser.add_argument("--bert_dropout", type=float, default=0.1, 83 | help="bert dropout rate") 84 | parser.add_argument("--weight_start", type=float, default=1.0) 85 | parser.add_argument("--weight_end", type=float, default=1.0) 86 | parser.add_argument("--weight_span", type=float, default=1.0) 87 | parser.add_argument("--flat", action="store_true", help="is flat ner") 88 | parser.add_argument("--span_loss_candidates", choices=["all", "pred_and_gold", "gold"], 89 | default="all", help="Candidates used to compute span loss") 90 | parser.add_argument("--chinese", action="store_true", 91 | help="is chinese dataset") 92 | parser.add_argument("--loss_type", choices=["bce", "dice"], default="bce", 93 | help="loss type") 94 | parser.add_argument("--optimizer", choices=["adamw", "sgd"], default="adamw", 95 | help="optimizer type") 96 | parser.add_argument("--dice_smooth", type=float, default=1e-8, 97 | help="smooth value of dice loss") 98 | parser.add_argument("--final_div_factor", type=float, default=1e4, 99 | help="final div factor of linear decay scheduler") 100 | return parser 101 | 102 | def configure_optimizers(self): 103 | """Prepare optimizer and schedule (linear warmup and decay)""" 104 | no_decay = ["bias", "LayerNorm.weight"] 105 | optimizer_grouped_parameters = [ 106 | { 107 | "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 108 | "weight_decay": self.args.weight_decay, 109 | }, 110 | { 111 | "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 112 | "weight_decay": 0.0, 113 | }, 114 | ] 115 | if self.optimizer == "adamw": 116 | optimizer = AdamW(optimizer_grouped_parameters, 117 | betas=(0.9, 0.98), # according to RoBERTa paper 118 | lr=self.args.lr, 119 | eps=self.args.adam_epsilon,) 120 | else: 121 | optimizer = SGD(optimizer_grouped_parameters, lr=self.args.lr, momentum=0.9) 122 | num_gpus = len([x for x in str(self.args.gpus).split(",") if x.strip()]) 123 | t_total = (len(self.train_dataloader()) // (self.args.accumulate_grad_batches * num_gpus) + 1) * self.args.max_epochs 124 | scheduler =
torch.optim.lr_scheduler.OneCycleLR( 125 | optimizer, max_lr=self.args.lr, pct_start=float(self.args.warmup_steps/t_total), 126 | final_div_factor=self.args.final_div_factor, 127 | total_steps=t_total, anneal_strategy='linear' 128 | ) 129 | return [optimizer], [{"scheduler": scheduler, "interval": "step"}] 130 | 131 | def forward(self, input_ids, attention_mask, token_type_ids): 132 | """""" 133 | return self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 134 | 135 | def compute_loss(self, start_logits, end_logits, span_logits, 136 | start_labels, end_labels, match_labels, start_label_mask, end_label_mask): 137 | batch_size, seq_len = start_logits.size() 138 | 139 | start_float_label_mask = start_label_mask.view(-1).float() 140 | end_float_label_mask = end_label_mask.view(-1).float() 141 | match_label_row_mask = start_label_mask.bool().unsqueeze(-1).expand(-1, -1, seq_len) 142 | match_label_col_mask = end_label_mask.bool().unsqueeze(-2).expand(-1, seq_len, -1) 143 | match_label_mask = match_label_row_mask & match_label_col_mask 144 | match_label_mask = torch.triu(match_label_mask, 0) # start should be less equal to end 145 | 146 | if self.span_loss_candidates == "all": 147 | # naive mask 148 | float_match_label_mask = match_label_mask.view(batch_size, -1).float() 149 | else: 150 | # use only pred or golden start/end to compute match loss 151 | start_preds = start_logits > 0 152 | end_preds = end_logits > 0 153 | if self.span_loss_candidates == "gold": 154 | match_candidates = ((start_labels.unsqueeze(-1).expand(-1, -1, seq_len) > 0) 155 | & (end_labels.unsqueeze(-2).expand(-1, seq_len, -1) > 0)) 156 | else: 157 | match_candidates = torch.logical_or( 158 | (start_preds.unsqueeze(-1).expand(-1, -1, seq_len) 159 | & end_preds.unsqueeze(-2).expand(-1, seq_len, -1)), 160 | (start_labels.unsqueeze(-1).expand(-1, -1, seq_len) 161 | & end_labels.unsqueeze(-2).expand(-1, seq_len, -1)) 162 | ) 163 | match_label_mask = match_label_mask & match_candidates 164 | float_match_label_mask = match_label_mask.view(batch_size, -1).float() 165 | if self.loss_type == "bce": 166 | start_loss = self.bce_loss(start_logits.view(-1), start_labels.view(-1).float()) 167 | start_loss = (start_loss * start_float_label_mask).sum() / start_float_label_mask.sum() 168 | end_loss = self.bce_loss(end_logits.view(-1), end_labels.view(-1).float()) 169 | end_loss = (end_loss * end_float_label_mask).sum() / end_float_label_mask.sum() 170 | match_loss = self.bce_loss(span_logits.view(batch_size, -1), match_labels.view(batch_size, -1).float()) 171 | match_loss = match_loss * float_match_label_mask 172 | match_loss = match_loss.sum() / (float_match_label_mask.sum() + 1e-10) 173 | else: 174 | start_loss = self.dice_loss(start_logits, start_labels.float(), start_float_label_mask) 175 | end_loss = self.dice_loss(end_logits, end_labels.float(), end_float_label_mask) 176 | match_loss = self.dice_loss(span_logits, match_labels.float(), float_match_label_mask) 177 | 178 | return start_loss, end_loss, match_loss 179 | 180 | def training_step(self, batch, batch_idx): 181 | """""" 182 | tf_board_logs = { 183 | "lr": self.trainer.optimizers[0].param_groups[0]['lr'] 184 | } 185 | tokens, token_type_ids, start_labels, end_labels, start_label_mask, end_label_mask, match_labels, sample_idx, label_idx = batch 186 | 187 | # num_tasks * [bsz, length, num_labels] 188 | attention_mask = (tokens != 0).long() 189 | start_logits, end_logits, span_logits = self(tokens, attention_mask, token_type_ids) 190 | 191 | start_loss, 
end_loss, match_loss = self.compute_loss(start_logits=start_logits, 192 | end_logits=end_logits, 193 | span_logits=span_logits, 194 | start_labels=start_labels, 195 | end_labels=end_labels, 196 | match_labels=match_labels, 197 | start_label_mask=start_label_mask, 198 | end_label_mask=end_label_mask 199 | ) 200 | 201 | total_loss = self.weight_start * start_loss + self.weight_end * end_loss + self.weight_span * match_loss 202 | 203 | tf_board_logs[f"train_loss"] = total_loss 204 | tf_board_logs[f"start_loss"] = start_loss 205 | tf_board_logs[f"end_loss"] = end_loss 206 | tf_board_logs[f"match_loss"] = match_loss 207 | 208 | return {'loss': total_loss, 'log': tf_board_logs} 209 | 210 | def validation_step(self, batch, batch_idx): 211 | """""" 212 | 213 | output = {} 214 | 215 | tokens, token_type_ids, start_labels, end_labels, start_label_mask, end_label_mask, match_labels, sample_idx, label_idx = batch 216 | 217 | attention_mask = (tokens != 0).long() 218 | start_logits, end_logits, span_logits = self(tokens, attention_mask, token_type_ids) 219 | 220 | start_loss, end_loss, match_loss = self.compute_loss(start_logits=start_logits, 221 | end_logits=end_logits, 222 | span_logits=span_logits, 223 | start_labels=start_labels, 224 | end_labels=end_labels, 225 | match_labels=match_labels, 226 | start_label_mask=start_label_mask, 227 | end_label_mask=end_label_mask 228 | ) 229 | 230 | total_loss = self.weight_start * start_loss + self.weight_end * end_loss + self.weight_span * match_loss 231 | 232 | output[f"val_loss"] = total_loss 233 | output[f"start_loss"] = start_loss 234 | output[f"end_loss"] = end_loss 235 | output[f"match_loss"] = match_loss 236 | 237 | start_preds, end_preds = start_logits > 0, end_logits > 0 238 | span_f1_stats = self.span_f1(start_preds=start_preds, end_preds=end_preds, match_logits=span_logits, 239 | start_label_mask=start_label_mask, end_label_mask=end_label_mask, 240 | match_labels=match_labels) 241 | output["span_f1_stats"] = span_f1_stats 242 | 243 | return output 244 | 245 | def validation_epoch_end(self, outputs): 246 | """""" 247 | avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() 248 | tensorboard_logs = {'val_loss': avg_loss} 249 | 250 | all_counts = torch.stack([x[f'span_f1_stats'] for x in outputs]).sum(0) 251 | span_tp, span_fp, span_fn = all_counts 252 | span_recall = span_tp / (span_tp + span_fn + 1e-10) 253 | span_precision = span_tp / (span_tp + span_fp + 1e-10) 254 | span_f1 = span_precision * span_recall * 2 / (span_recall + span_precision + 1e-10) 255 | tensorboard_logs[f"span_precision"] = span_precision 256 | tensorboard_logs[f"span_recall"] = span_recall 257 | tensorboard_logs[f"span_f1"] = span_f1 258 | 259 | return {'val_loss': avg_loss, 'log': tensorboard_logs} 260 | 261 | def test_step(self, batch, batch_idx): 262 | """""" 263 | return self.validation_step(batch, batch_idx) 264 | 265 | def test_epoch_end( 266 | self, 267 | outputs 268 | ) -> Dict[str, Dict[str, Tensor]]: 269 | """""" 270 | return self.validation_epoch_end(outputs) 271 | 272 | def train_dataloader(self) -> DataLoader: 273 | return self.get_dataloader("train") 274 | # return self.get_dataloader("dev", 100) 275 | 276 | def val_dataloader(self): 277 | return self.get_dataloader("dev") 278 | 279 | def test_dataloader(self): 280 | return self.get_dataloader("test") 281 | # return self.get_dataloader("dev") 282 | 283 | def get_dataloader(self, prefix="train", limit: int = None) -> DataLoader: 284 | """get training dataloader""" 285 | """ 286 | load_mmap_dataset 287 | """ 
288 | json_path = os.path.join(self.data_dir, f"mrc-ner.{prefix}") 289 | vocab_path = os.path.join(self.bert_dir, "vocab.txt") 290 | dataset = MRCNERDataset(json_path=json_path, 291 | tokenizer=BertWordPieceTokenizer(vocab_path), 292 | max_length=self.args.max_length, 293 | is_chinese=self.chinese, 294 | pad_to_maxlen=False 295 | ) 296 | 297 | if limit is not None: 298 | dataset = TruncateDataset(dataset, limit) 299 | 300 | dataloader = DataLoader( 301 | dataset=dataset, 302 | batch_size=self.args.batch_size, 303 | num_workers=self.args.workers, 304 | shuffle=True if prefix == "train" else False, 305 | collate_fn=collate_to_max_length 306 | ) 307 | 308 | return dataloader 309 | 310 | 311 | def run_dataloader(): 312 | """test dataloader""" 313 | parser = get_parser() 314 | 315 | # add model specific args 316 | parser = BertLabeling.add_model_specific_args(parser) 317 | 318 | # add all the available trainer options to argparse 319 | # ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli 320 | parser = Trainer.add_argparse_args(parser) 321 | 322 | args = parser.parse_args() 323 | args.workers = 0 324 | args.default_root_dir = "/mnt/data/mrc/train_logs/debug" 325 | 326 | model = BertLabeling(args) 327 | from tokenizers import BertWordPieceTokenizer 328 | tokenizer = BertWordPieceTokenizer(os.path.join(args.bert_config_dir, "vocab.txt")) 329 | 330 | loader = model.get_dataloader("dev", limit=1000) 331 | for d in loader: 332 | input_ids = d[0][0].tolist() 333 | match_labels = d[-1][0] 334 | start_positions, end_positions = torch.where(match_labels > 0) 335 | start_positions = start_positions.tolist() 336 | end_positions = end_positions.tolist() 337 | if not start_positions: 338 | continue 339 | print("="*20) 340 | print(tokenizer.decode(input_ids, skip_special_tokens=False)) 341 | for start, end in zip(start_positions, end_positions): 342 | print(tokenizer.decode(input_ids[start: end+1])) 343 | 344 | 345 | def main(): 346 | """main""" 347 | parser = get_parser() 348 | 349 | # add model specific args 350 | parser = BertLabeling.add_model_specific_args(parser) 351 | 352 | # add all the available trainer options to argparse 353 | # ie: now --gpus --num_nodes ... 
--fast_dev_run all work in the cli 354 | parser = Trainer.add_argparse_args(parser) 355 | 356 | args = parser.parse_args() 357 | 358 | model = BertLabeling(args) 359 | if args.pretrained_checkpoint: 360 | model.load_state_dict(torch.load(args.pretrained_checkpoint, 361 | map_location=torch.device('cpu'))["state_dict"]) 362 | 363 | checkpoint_callback = ModelCheckpoint( 364 | filepath=args.default_root_dir, 365 | save_top_k=10, 366 | verbose=True, 367 | monitor="span_f1", 368 | period=-1, 369 | mode="max", 370 | ) 371 | trainer = Trainer.from_argparse_args( 372 | args, 373 | checkpoint_callback=checkpoint_callback 374 | ) 375 | 376 | trainer.fit(model) 377 | 378 | 379 | if __name__ == '__main__': 380 | #run_dataloader() 381 | main() 382 | -------------------------------------------------------------------------------- /NER/datasets/doc-paragraph-sentence-id/mrc-ner.test-id: -------------------------------------------------------------------------------- 1 | 37,65,0 2 | 41,117,0 3 | 11,65,0 4 | 32,3,0 5 | 16,0,0 6 | 20,48,0 7 | 13,43,0 8 | 18,7,1 9 | 30,4,0 10 | 29,24,0 11 | 37,32,0 12 | 17,56,0 13 | 17,77,0 14 | 8,30,0 15 | 1,17,0 16 | 41,85,1 17 | 11,51,0 18 | 28,18,0 19 | 18,73,0 20 | 14,10,1 21 | 23,54,0 22 | 14,38,0 23 | 4,19,0 24 | 20,62,0 25 | 4,28,0 26 | 39,72,0 27 | 15,45,0 28 | 15,82,0 29 | 28,13,0 30 | 39,87,0 31 | 6,49,0 32 | 2,22,2 33 | 17,92,4 34 | 7,24,0 35 | 2,22,2 36 | 30,18,1 37 | 41,84,0 38 | 37,54,0 39 | 40,62,3 40 | 21,5,0 41 | 34,4,0 42 | 36,7,0 43 | 20,51,5 44 | 27,11,0 45 | 29,32,0 46 | 3,2,1 47 | 17,38,2 48 | 4,1,4 49 | 14,10,0 50 | 25,14,0 51 | 1,37,0 52 | 17,93,0 53 | 34,26,1 54 | 16,21,1 55 | 33,68,0 56 | 13,37,0 57 | 22,21,0 58 | 18,7,1 59 | 31,10,0 60 | 19,29,0 61 | 15,16,0 62 | 2,6,0 63 | 31,8,0 64 | 21,24,0 65 | 8,11,0 66 | 26,7,0 67 | 28,22,0 68 | 33,67,0 69 | 12,11,2 70 | 17,46,0 71 | 29,0,0 72 | 16,4,2 73 | 38,19,0 74 | 41,27,1 75 | 25,16,0 76 | 13,78,0 77 | 22,53,0 78 | 14,34,1 79 | 39,13,1 80 | 15,44,0 81 | 13,6,0 82 | 34,2,0 83 | 32,47,0 84 | 7,39,0 85 | 27,10,0 86 | 17,33,0 87 | 7,24,0 88 | 13,68,0 89 | 18,64,0 90 | 5,8,0 91 | 10,9,0 92 | 33,20,0 93 | 8,34,0 94 | 3,14,0 95 | 30,12,0 96 | 9,14,0 97 | 9,26,0 98 | 28,6,0 99 | 26,16,0 100 | 41,80,0 101 | 13,51,0 102 | 17,79,0 103 | 35,9,2 104 | 34,43,0 105 | 7,53,0 106 | 23,38,0 107 | 27,9,0 108 | 39,52,0 109 | 37,68,0 110 | 3,13,0 111 | 14,3,0 112 | 2,6,0 113 | 20,32,0 114 | 9,22,0 115 | 41,89,0 116 | 15,16,0 117 | 39,83,0 118 | 40,103,0 119 | 40,102,0 120 | 11,66,1 121 | 14,31,0 122 | 13,74,0 123 | 34,10,6 124 | 18,3,0 125 | 15,72,0 126 | 1,6,0 127 | 15,84,0 128 | 6,50,0 129 | 39,32,0 130 | 16,11,0 131 | 41,47,1 132 | 39,10,0 133 | 11,72,0 134 | 40,73,0 135 | 9,13,0 136 | 1,4,0 137 | 41,3,2 138 | 25,18,0 139 | 23,1,0 140 | 20,38,0 141 | 30,35,0 142 | 7,30,0 143 | 36,22,0 144 | 15,42,0 145 | 15,32,0 146 | 16,51,0 147 | 19,22,0 148 | 40,38,0 149 | 6,8,0 150 | 27,13,0 151 | 13,9,0 152 | 18,16,0 153 | 30,50,0 154 | 14,30,0 155 | 34,33,0 156 | 13,26,0 157 | 21,24,0 158 | 41,39,0 159 | 11,54,0 160 | 34,10,5 161 | 23,49,0 162 | 11,56,1 163 | 34,8,5 164 | 37,60,1 165 | 33,4,0 166 | 41,76,1 167 | 21,30,0 168 | 22,9,0 169 | 17,89,0 170 | 41,46,2 171 | 41,78,3 172 | 39,34,0 173 | 41,47,1 174 | 6,33,0 175 | 34,44,0 176 | 27,15,0 177 | 28,1,0 178 | 30,35,0 179 | 37,39,2 180 | 3,25,0 181 | 4,25,0 182 | 34,26,0 183 | 17,88,0 184 | 27,13,0 185 | 41,80,0 186 | 4,20,0 187 | 6,37,0 188 | 12,5,0 189 | 19,13,2 190 | 27,26,0 191 | 40,13,0 192 | 26,5,0 193 | 6,22,0 194 | 14,40,0 195 | 41,46,0 196 | 40,63,1 197 | 29,8,0 198 | 
22,73,0 199 | 8,21,0 200 | 6,46,0 201 | 41,122,0 202 | 14,22,0 203 | 18,53,0 204 | 7,0,0 205 | 13,49,0 206 | 12,26,0 207 | 13,80,2 208 | 3,11,1 209 | 14,2,0 210 | 22,72,0 211 | 28,9,0 212 | 19,20,0 213 | 11,24,0 214 | 37,103,0 215 | 8,37,0 216 | 12,25,1 217 | 12,49,0 218 | 34,40,0 219 | 40,31,0 220 | 30,12,3 221 | 6,5,0 222 | 16,16,0 223 | 6,29,0 224 | 17,11,0 225 | 30,59,0 226 | 6,42,0 227 | 4,10,1 228 | 41,41,0 229 | 16,36,0 230 | 17,60,0 231 | 37,30,0 232 | 30,20,0 233 | 14,5,1 234 | 40,50,1 235 | 17,35,4 236 | 18,4,2 237 | 37,100,0 238 | 32,24,0 239 | 15,71,0 240 | 13,30,0 241 | 41,51,0 242 | 13,6,0 243 | 39,48,1 244 | 34,5,0 245 | 6,50,0 246 | 23,47,0 247 | 20,57,0 248 | 28,23,0 249 | 20,31,3 250 | 40,100,1 251 | 6,61,0 252 | 21,40,0 253 | 18,60,0 254 | 38,16,0 255 | 39,66,0 256 | 33,62,1 257 | 32,10,0 258 | 33,69,0 259 | 8,25,0 260 | 11,27,1 261 | 20,61,0 262 | 37,69,3 263 | 13,66,0 264 | 21,7,0 265 | 7,36,0 266 | 5,8,0 267 | 17,2,1 268 | 31,8,0 269 | 39,56,1 270 | 33,32,0 271 | 30,48,0 272 | 4,18,0 273 | 37,68,0 274 | 1,58,0 275 | 22,9,2 276 | 32,21,0 277 | 12,5,2 278 | 17,72,0 279 | 13,54,0 280 | 27,19,0 281 | 18,46,0 282 | 26,10,0 283 | 41,61,0 284 | 41,37,1 285 | 9,16,0 286 | 41,15,0 287 | 23,62,1 288 | 41,46,1 289 | 39,63,0 290 | 4,12,0 291 | 37,51,0 292 | 32,41,0 293 | 16,11,0 294 | 38,15,0 295 | 27,2,0 296 | 18,19,0 297 | 14,25,0 298 | 12,19,0 299 | 33,75,0 300 | 37,40,0 301 | 41,84,0 302 | 18,64,1 303 | 41,78,0 304 | 21,28,0 305 | 6,4,0 306 | 30,29,0 307 | 33,3,0 308 | 8,41,0 309 | 19,16,0 310 | 2,15,0 311 | 10,28,0 312 | 15,16,0 313 | 10,29,0 314 | 24,20,0 315 | 31,10,0 316 | 12,47,0 317 | 6,46,2 318 | 37,17,0 319 | 1,3,0 320 | 37,19,0 321 | 14,15,3 322 | 12,10,0 323 | 29,15,0 324 | 12,17,0 325 | 39,55,0 326 | 27,28,0 327 | 35,9,2 328 | 10,16,0 329 | 12,30,1 330 | 12,23,0 331 | 7,41,0 332 | 41,96,0 333 | 34,32,2 334 | 12,33,1 335 | 13,69,3 336 | 1,44,0 337 | 6,49,0 338 | 3,6,0 339 | 32,21,1 340 | 33,70,0 341 | 36,25,0 342 | 16,51,0 343 | 14,15,0 344 | 27,0,0 345 | 6,19,0 346 | 39,3,0 347 | 18,10,0 348 | 39,31,0 349 | 7,24,0 350 | 39,59,0 351 | 9,24,0 352 | 39,20,2 353 | 8,35,0 354 | 15,16,0 355 | 34,35,0 356 | 34,45,1 357 | 12,23,0 358 | 41,5,2 359 | 40,68,1 360 | 17,71,0 361 | 37,72,0 362 | 13,78,0 363 | 7,41,0 364 | 13,49,0 365 | 24,7,2 366 | 22,65,0 367 | 40,12,0 368 | 2,22,1 369 | 10,43,0 370 | 15,64,2 371 | 25,12,0 372 | 6,19,1 373 | 37,51,0 374 | 41,74,1 375 | 17,50,1 376 | 34,10,3 377 | 34,38,0 378 | 8,28,0 379 | 32,51,0 380 | 4,4,0 381 | 36,19,0 382 | 39,63,0 383 | 35,24,2 384 | 4,22,0 385 | 32,18,0 386 | 17,80,0 387 | 36,30,2 388 | 35,9,0 389 | 40,83,0 390 | 16,26,0 391 | 13,37,0 392 | 41,33,0 393 | 41,78,3 394 | 15,84,0 395 | 6,57,0 396 | 36,8,2 397 | 27,4,0 398 | 24,12,0 399 | 18,60,0 400 | 6,46,3 401 | 40,27,0 402 | 18,68,0 403 | 17,72,0 404 | 40,83,3 405 | 15,34,0 406 | 25,21,0 407 | 21,44,0 408 | 28,6,0 409 | 41,85,1 410 | 40,63,2 411 | 6,31,0 412 | 30,41,1 413 | 31,12,0 414 | 28,6,1 415 | 13,44,0 416 | 39,53,1 417 | 12,37,0 418 | 9,25,0 419 | 18,56,3 420 | 14,18,0 421 | 8,29,0 422 | 22,46,0 423 | 36,25,0 424 | 30,14,0 425 | 7,23,0 426 | 24,24,0 427 | 33,22,0 428 | 30,39,0 429 | 33,37,1 430 | 6,57,0 431 | 11,68,0 432 | 37,47,2 433 | 23,39,0 434 | 22,77,0 435 | 41,30,0 436 | 21,14,1 437 | 10,25,0 438 | 41,75,0 439 | 13,71,0 440 | 39,27,0 441 | 29,36,0 442 | 1,59,0 443 | 37,68,0 444 | 41,3,1 445 | 23,59,0 446 | 7,44,0 447 | 40,110,0 448 | 17,93,0 449 | 11,76,2 450 | 33,54,0 451 | 40,2,0 452 | 41,107,0 453 | 32,27,0 454 | 27,30,0 455 | 27,26,0 456 | 17,18,0 457 | 
36,5,0 458 | 24,29,0 459 | 8,10,0 460 | 11,5,0 461 | 16,42,0 462 | 30,51,0 463 | 14,17,3 464 | 19,26,2 465 | 41,88,0 466 | 2,14,0 467 | 9,11,0 468 | 12,13,0 469 | 34,25,1 470 | 23,2,0 471 | 6,19,1 472 | 6,46,0 473 | 26,14,1 474 | 14,10,0 475 | 24,22,0 476 | 13,45,0 477 | 18,21,0 478 | 41,91,0 479 | 17,35,0 480 | 25,17,0 481 | 15,79,0 482 | 21,36,0 483 | 9,19,0 484 | 13,9,0 485 | 33,56,0 486 | 10,7,0 487 | 10,21,0 488 | 41,22,0 489 | 7,38,0 490 | 16,36,0 491 | 23,14,0 492 | 3,14,0 493 | 39,35,0 494 | 25,16,0 495 | 20,60,0 496 | 17,34,2 497 | 41,103,0 498 | 15,6,0 499 | 34,39,0 500 | 6,44,1 501 | 11,59,0 502 | 9,28,0 503 | 27,10,0 504 | 12,42,0 505 | 39,88,2 506 | 23,31,0 507 | 18,51,0 508 | 6,58,0 509 | 30,57,1 510 | 23,64,0 511 | 20,59,1 512 | 24,13,0 513 | 28,21,0 514 | 2,9,0 515 | 12,29,0 516 | 13,5,0 517 | 17,6,0 518 | 14,1,0 519 | 17,80,0 520 | 12,46,1 521 | 17,67,0 522 | 11,29,1 523 | 23,29,0 524 | 15,57,0 525 | 39,20,0 526 | 41,116,0 527 | 38,7,0 528 | 15,81,0 529 | 13,13,0 530 | 14,47,0 531 | 28,21,0 532 | 10,23,0 533 | 30,27,0 534 | 18,38,0 535 | 13,78,0 536 | 6,52,0 537 | 2,8,0 538 | 38,18,0 539 | 25,8,0 540 | 10,41,0 541 | 23,52,0 542 | 18,65,0 543 | 41,39,0 544 | 14,30,1 545 | 28,15,0 546 | 39,90,3 547 | 20,0,0 548 | 16,52,0 549 | 15,79,0 550 | 19,16,0 551 | 40,19,0 552 | 13,49,0 553 | 40,29,1 554 | 34,5,0 555 | 11,1,0 556 | 22,31,0 557 | 6,60,0 558 | 34,4,0 559 | 6,33,0 560 | 16,38,2 561 | 40,110,1 562 | 39,6,1 563 | 32,37,0 564 | 30,39,0 565 | 12,20,1 566 | 26,14,0 567 | 12,50,0 568 | 7,35,0 569 | 16,34,0 570 | 41,3,0 571 | 13,49,0 572 | 30,21,0 573 | 29,11,1 574 | 13,60,0 575 | 7,56,0 576 | 27,25,0 577 | 37,31,0 578 | 11,43,0 579 | 21,23,1 580 | 3,13,0 581 | 21,5,0 582 | 3,8,0 583 | 16,44,0 584 | 9,17,0 585 | 16,53,3 586 | 26,1,0 587 | 34,32,2 588 | 8,30,1 589 | 36,43,1 590 | 41,59,1 591 | 14,7,1 592 | 39,63,0 593 | 29,24,0 594 | 16,12,0 595 | 16,23,0 596 | 32,35,0 597 | 17,3,0 598 | 15,44,0 599 | 21,14,7 600 | 6,25,0 601 | 41,76,3 602 | 13,45,0 603 | 6,24,0 604 | 32,24,1 605 | 12,33,1 606 | 6,6,0 607 | 7,34,0 608 | 24,20,0 609 | 25,19,0 610 | 32,10,0 611 | 8,19,4 612 | 41,100,0 613 | 41,74,1 614 | 3,11,1 615 | 36,34,0 616 | 20,47,0 617 | 18,41,0 618 | 11,48,0 619 | 29,30,0 620 | 21,29,1 621 | 23,0,0 622 | 36,30,0 623 | 13,41,0 624 | 36,14,0 625 | 7,48,0 626 | 18,60,1 627 | 41,94,0 628 | 29,33,0 629 | 17,32,1 630 | 18,32,0 631 | 24,32,3 632 | 1,42,0 633 | 4,17,0 634 | 40,52,3 635 | 21,7,1 636 | 7,7,0 637 | 27,18,0 638 | 15,70,0 639 | 21,53,0 640 | 21,18,0 641 | 14,2,0 642 | 38,7,0 643 | 23,43,0 644 | 10,18,0 645 | 21,24,0 646 | 25,27,0 647 | 19,9,0 648 | 7,40,0 649 | 37,97,2 650 | 16,40,0 651 | 41,75,0 652 | 6,56,0 653 | 15,4,1 654 | 28,8,0 655 | 40,18,0 656 | 12,46,0 657 | 17,5,0 658 | 30,58,2 659 | 37,31,0 660 | 22,69,0 661 | 18,12,0 662 | 32,2,2 663 | 33,33,0 664 | 32,3,0 665 | 6,46,4 666 | 8,32,0 667 | 33,17,1 668 | 7,50,0 669 | 15,31,0 670 | 13,27,0 671 | 4,21,4 672 | 8,19,0 673 | 32,37,1 674 | 10,20,0 675 | 36,3,3 676 | 2,22,1 677 | 32,29,0 678 | 6,12,0 679 | 14,33,1 680 | 18,72,0 681 | 32,17,0 682 | 6,25,0 683 | 26,25,0 684 | 26,27,0 685 | 20,4,0 686 | 41,101,1 687 | 35,17,0 688 | 1,33,0 689 | 13,60,0 690 | 33,56,0 691 | 3,19,0 692 | 37,76,0 693 | 34,27,2 694 | 13,23,0 695 | 24,23,0 696 | 11,64,0 697 | 12,7,0 698 | 37,95,0 699 | 34,8,2 700 | 39,33,0 701 | 35,24,3 702 | 33,70,0 703 | 28,0,0 704 | 22,16,0 705 | 15,80,1 706 | 12,47,0 707 | 3,3,0 708 | 1,24,0 709 | 24,29,0 710 | 32,40,0 711 | 18,7,2 712 | 6,46,1 713 | 10,44,2 714 | 41,10,0 715 | 10,43,0 716 | 21,26,0 717 | 
4,9,0 718 | 24,23,0 719 | 12,27,0 720 | 33,52,0 721 | 10,6,0 722 | 10,37,0 723 | 26,13,0 724 | 13,70,0 725 | 15,14,0 726 | 13,15,0 727 | 15,39,0 728 | 22,55,0 729 | 11,19,0 730 | 12,29,0 731 | 2,9,1 732 | 12,14,0 733 | 41,22,0 734 | 6,12,0 735 | 41,74,0 736 | 25,19,0 737 | 18,7,1 738 | 18,27,0 739 | 29,29,0 740 | 41,32,0 741 | 18,24,0 742 | 9,6,0 743 | 4,7,0 744 | 33,73,3 745 | 4,4,0 746 | 18,18,0 747 | 10,46,0 748 | 27,12,0 749 | 34,48,0 750 | 37,79,1 751 | 23,31,0 752 | 13,71,0 753 | 40,23,0 754 | 6,40,0 755 | 30,1,0 756 | 26,13,0 757 | 39,9,0 758 | 11,17,1 759 | 4,11,0 760 | 34,6,0 761 | 6,4,0 762 | 13,9,0 763 | 37,73,0 764 | 23,5,0 765 | 11,53,0 766 | 20,37,0 767 | 7,31,0 768 | 16,17,0 769 | 6,39,0 770 | 15,63,0 771 | 13,32,0 772 | 28,0,0 773 | 22,28,0 774 | 41,79,0 775 | 22,84,0 776 | 19,1,0 777 | 39,17,0 778 | 39,7,0 779 | 4,29,1 780 | 4,19,0 781 | 41,59,0 782 | 39,46,0 783 | 23,26,0 784 | 15,87,0 785 | 1,49,0 786 | 30,27,0 787 | 28,9,0 788 | 13,40,0 789 | 16,30,0 790 | 15,82,0 791 | 37,69,2 792 | 6,47,0 793 | 37,33,1 794 | 36,0,0 795 | 39,34,0 796 | 8,27,0 797 | 33,17,0 798 | 6,13,0 799 | 20,37,1 800 | 27,2,0 801 | 37,69,1 802 | 5,1,0 803 | 41,51,1 804 | 30,42,0 805 | 37,69,1 806 | 37,71,0 807 | 39,71,0 808 | 37,59,0 809 | 31,9,0 810 | 3,14,0 811 | 34,22,0 812 | 12,18,0 813 | 3,3,2 814 | 33,19,0 815 | 29,2,2 816 | 15,82,0 817 | 13,3,0 818 | 41,57,1 819 | 6,51,0 820 | 39,63,0 821 | 16,44,0 822 | 24,10,0 823 | 33,30,1 824 | 22,31,0 825 | 30,40,0 826 | 13,26,0 827 | 6,31,0 828 | 15,14,0 829 | 24,29,0 830 | 6,62,2 831 | 34,3,0 832 | 18,67,0 833 | 19,20,2 834 | 39,87,0 835 | 36,40,0 836 | 13,54,0 837 | 36,4,0 838 | 24,26,0 839 | 23,58,0 840 | 41,89,0 841 | 41,52,0 842 | 40,94,0 843 | 6,32,0 844 | 41,46,1 845 | 41,88,0 846 | 26,25,0 847 | 12,33,0 848 | 14,31,0 849 | 37,21,0 850 | 17,33,0 851 | 40,95,1 852 | 39,57,0 853 | 32,42,0 854 | 12,52,0 855 | 2,14,0 856 | 9,5,0 857 | 1,4,1 858 | 12,46,2 859 | 26,24,1 860 | 39,45,3 861 | 41,61,0 862 | 16,1,0 863 | 6,31,0 864 | 11,5,1 865 | 41,30,0 866 | 26,25,0 867 | 29,22,1 868 | 34,38,3 869 | 40,94,1 870 | 41,78,3 871 | 13,45,0 872 | 36,5,0 873 | 15,46,0 874 | 11,26,0 875 | 40,38,0 876 | 15,77,0 877 | 17,31,0 878 | 40,22,0 879 | 12,34,2 880 | 38,12,0 881 | 13,69,3 882 | 8,24,0 883 | 20,54,0 884 | 13,21,0 885 | 17,38,1 886 | 34,42,0 887 | 25,17,0 888 | 33,44,0 889 | 12,2,0 890 | 11,26,0 891 | 40,0,0 892 | 24,25,0 893 | 29,32,3 894 | 41,72,1 895 | 26,3,0 896 | 39,5,0 897 | 14,14,1 898 | 34,9,0 899 | 15,3,0 900 | 39,35,0 901 | 6,57,0 902 | 34,19,0 903 | 18,56,2 904 | 7,17,0 905 | 18,33,1 906 | 14,31,0 907 | 39,30,0 908 | 17,37,0 909 | 7,56,1 910 | 36,22,0 911 | 25,21,0 912 | 2,19,0 913 | 27,25,0 914 | 16,28,0 915 | 39,45,0 916 | 33,75,0 917 | 6,13,0 918 | 4,26,0 919 | 17,85,0 920 | 34,32,1 921 | 13,9,0 922 | 13,72,0 923 | 40,45,0 924 | 39,45,2 925 | 16,33,0 926 | 17,37,0 927 | 11,29,0 928 | 17,70,0 929 | 8,29,0 930 | 17,60,1 931 | 13,77,0 932 | 29,4,1 933 | 17,56,1 934 | 9,28,0 935 | 39,20,0 936 | 34,39,0 937 | 32,20,0 938 | 15,79,0 939 | 11,35,0 940 | 21,36,0 941 | 27,2,0 942 | 36,28,0 943 | 22,75,0 944 | 15,2,0 945 | 37,63,0 946 | 21,51,1 947 | 22,81,0 948 | 17,84,2 949 | 2,1,0 950 | 10,12,0 951 | 28,9,0 952 | 34,5,0 953 | 33,42,0 954 | 1,14,0 955 | 41,101,0 956 | 6,43,0 957 | 6,56,0 958 | 35,5,1 959 | 12,47,0 960 | 9,27,0 961 | 13,18,0 962 | 17,90,1 963 | 39,53,0 964 | 30,51,0 965 | 16,45,0 966 | 36,8,3 967 | 2,22,0 968 | 9,14,0 969 | 11,66,0 970 | 4,1,0 971 | 8,19,2 972 | 17,57,0 973 | 25,20,0 974 | 17,61,0 975 | 18,12,0 976 | 7,55,0 977 | 9,27,0 
978 | 2,0,0 979 | 13,2,0 980 | 23,54,0 981 | 2,15,1 982 | 7,34,0 983 | 36,12,0 984 | 9,22,0 985 | 37,20,0 986 | 41,90,1 987 | 19,24,0 988 | 13,59,0 989 | 7,53,0 990 | 33,37,0 991 | 1,14,1 992 | 17,96,0 993 | 11,60,0 994 | 23,58,0 995 | 37,39,1 996 | 19,26,1 997 | 19,29,0 998 | 21,29,0 999 | 20,33,3 1000 | 3,13,0 1001 | 41,73,0 1002 | 40,15,0 1003 | 41,40,0 1004 | 18,19,0 1005 | 19,13,0 1006 | 29,33,0 1007 | 14,33,1 1008 | 19,15,0 1009 | 38,1,0 1010 | 26,12,0 1011 | 41,59,0 1012 | 36,25,0 1013 | 8,30,0 1014 | 32,55,0 1015 | 35,14,0 1016 | 11,56,0 1017 | 13,39,0 1018 | 36,30,0 1019 | 37,72,0 1020 | 12,30,1 1021 | 39,4,2 1022 | 1,53,0 1023 | 11,52,0 1024 | 16,19,0 1025 | 6,7,0 1026 | 1,2,0 1027 | 8,39,0 1028 | 1,31,3 1029 | 32,29,0 1030 | 13,3,0 1031 | 24,7,0 1032 | 25,27,0 1033 | 21,1,0 1034 | 13,21,0 1035 | 11,63,0 1036 | 6,19,0 1037 | 41,76,4 1038 | 32,3,0 1039 | 38,12,0 1040 | 24,24,0 1041 | 25,19,0 1042 | 26,11,0 1043 | 11,24,0 1044 | 17,89,0 1045 | 32,24,0 1046 | 17,93,0 1047 | 21,34,0 1048 | 34,15,0 1049 | 38,20,0 1050 | 30,38,0 1051 | 8,10,0 1052 | 2,7,1 1053 | 20,10,0 1054 | 13,18,0 1055 | 29,32,0 1056 | 15,32,0 1057 | 14,7,0 1058 | 16,41,0 1059 | 39,65,1 1060 | 17,56,1 1061 | 13,50,0 1062 | 14,28,0 1063 | 23,2,0 1064 | 18,53,0 1065 | 6,48,0 1066 | 37,6,4 1067 | 23,17,0 1068 | 35,11,0 1069 | 3,11,0 1070 | 15,83,0 1071 | 15,23,0 1072 | 19,10,0 1073 | 36,41,0 1074 | 14,35,0 1075 | 23,58,0 1076 | 13,6,0 1077 | 13,8,0 1078 | 4,17,0 1079 | 37,42,0 1080 | 24,28,0 1081 | 32,37,1 1082 | 34,31,1 1083 | 24,18,0 1084 | 12,37,0 1085 | 9,3,0 1086 | 2,12,0 1087 | 13,39,1 1088 | 13,44,0 1089 | 6,40,0 1090 | 7,33,0 1091 | 40,82,0 1092 | 39,53,1 1093 | 22,60,0 1094 | 15,56,0 1095 | 22,84,0 1096 | 34,21,0 1097 | 10,24,1 1098 | 4,7,0 1099 | 38,21,0 1100 | 39,6,0 1101 | 14,1,0 1102 | 37,42,0 1103 | 33,62,0 1104 | 20,36,0 1105 | 3,8,0 1106 | 21,5,0 1107 | 41,86,0 1108 | 15,21,0 1109 | 7,25,1 1110 | 15,4,1 1111 | 35,11,2 1112 | 32,40,0 1113 | 39,41,1 1114 | 39,63,0 1115 | 35,3,0 1116 | 10,42,3 1117 | 17,67,0 1118 | 11,45,0 1119 | 23,27,1 1120 | 7,21,0 1121 | 39,60,0 1122 | 13,32,0 1123 | 17,16,0 1124 | 13,12,0 1125 | 14,5,0 1126 | 30,58,0 1127 | 15,4,1 1128 | 7,33,0 1129 | 26,10,0 1130 | 1,53,0 1131 | 29,9,1 1132 | 13,44,0 1133 | 13,34,0 1134 | 1,37,0 1135 | 7,7,0 1136 | 15,74,0 1137 | 39,45,0 1138 | 4,13,0 1139 | 37,100,0 1140 | 40,32,0 1141 | 34,12,0 1142 | 18,38,0 1143 | 16,16,0 1144 | 16,31,0 1145 | 6,34,0 1146 | 41,65,1 1147 | 17,39,0 1148 | 14,40,0 1149 | 23,60,0 1150 | 9,8,0 1151 | 30,58,2 1152 | 16,25,0 1153 | 37,69,3 1154 | 10,7,0 1155 | 8,39,0 1156 | 4,0,0 1157 | 29,8,0 1158 | 41,105,0 1159 | 12,30,2 1160 | 18,7,0 1161 | 8,12,0 1162 | 23,14,2 1163 | 1,47,0 1164 | 6,11,0 1165 | 15,14,0 1166 | 18,10,0 1167 | 27,5,0 1168 | 7,46,0 1169 | 30,6,0 1170 | 7,46,0 1171 | 9,21,0 1172 | 6,5,0 1173 | 14,29,0 1174 | 13,32,0 1175 | 33,51,0 1176 | 18,10,0 1177 | 17,20,0 1178 | 17,0,0 1179 | 39,30,0 1180 | 14,4,0 1181 | 33,23,0 1182 | 36,4,0 1183 | 12,2,0 1184 | 41,101,1 1185 | 30,15,0 1186 | 39,5,1 1187 | 1,20,0 1188 | 3,20,0 1189 | 1,54,0 1190 | 36,1,1 1191 | 3,1,0 1192 | 30,55,4 1193 | 17,79,1 1194 | 7,46,0 1195 | 16,33,0 1196 | 7,25,4 1197 | 17,4,0 1198 | 23,42,0 1199 | 6,46,0 1200 | 6,62,0 1201 | 2,9,1 1202 | 1,29,0 1203 | 7,1,2 1204 | 6,22,0 1205 | 16,34,0 1206 | 33,62,0 1207 | 7,40,0 1208 | 27,28,0 1209 | 19,5,1 1210 | 41,14,0 1211 | 11,46,0 1212 | 34,33,0 1213 | 15,20,0 1214 | 40,85,1 1215 | 39,25,0 1216 | 2,11,0 1217 | 37,57,0 1218 | 8,27,0 1219 | 16,17,0 1220 | 6,15,0 1221 | 14,7,0 1222 | 28,1,0 1223 | 
15,86,0 1224 | 11,70,0 1225 | 14,31,0 1226 | 29,32,0 1227 | 32,15,0 1228 | 1,58,0 1229 | 27,10,0 1230 | 16,1,1 1231 | 11,21,0 1232 | 17,2,0 1233 | 21,9,0 1234 | 23,53,0 1235 | 41,43,1 1236 | 28,13,0 1237 | 33,23,0 1238 | 38,3,0 1239 | 6,34,0 1240 | 11,53,0 1241 | 13,80,0 1242 | 34,44,0 1243 | 16,38,1 1244 | 15,65,0 1245 | 13,37,1 1246 | 15,83,0 1247 | 37,26,0 1248 | 36,30,0 1249 | 41,43,1 1250 | 41,75,0 1251 | 8,35,1 1252 | 36,2,0 1253 | 12,50,0 1254 | 2,24,0 1255 | 11,48,1 1256 | 33,15,0 1257 | 41,95,0 1258 | 37,77,1 1259 | 23,4,0 1260 | 18,50,0 1261 | 28,27,0 1262 | 41,72,0 1263 | 21,24,0 1264 | 23,51,0 1265 | 26,16,0 1266 | 18,59,0 1267 | 33,37,0 1268 | 34,11,1 1269 | 15,25,0 1270 | 6,26,0 1271 | 36,30,0 1272 | 32,38,0 1273 | 12,31,0 1274 | 3,16,0 1275 | 41,120,0 1276 | 40,102,0 1277 | 29,22,3 1278 | 16,21,0 1279 | 37,3,0 1280 | 41,93,0 1281 | 21,16,0 1282 | 12,23,1 1283 | 11,48,0 1284 | 1,8,0 1285 | 17,63,0 1286 | 6,45,0 1287 | 20,8,0 1288 | 6,16,0 1289 | 1,2,0 1290 | 32,41,0 1291 | 17,57,1 1292 | 39,4,1 1293 | 6,6,0 1294 | 6,25,0 1295 | 13,36,0 1296 | 21,2,1 1297 | 9,18,0 1298 | 33,13,0 1299 | 37,6,0 1300 | 26,4,0 1301 | 40,60,0 1302 | 6,46,1 1303 | 6,49,0 1304 | 41,106,0 1305 | 35,9,0 1306 | 22,88,0 1307 | 35,9,0 1308 | 12,5,0 1309 | 32,50,0 1310 | 30,8,0 1311 | 33,30,1 1312 | 15,76,0 1313 | 16,33,0 1314 | 3,14,0 1315 | 37,63,1 1316 | 34,38,1 1317 | 37,25,1 1318 | 37,35,0 1319 | 18,44,0 1320 | 13,10,2 1321 | 13,73,0 1322 | 16,15,0 1323 | 22,36,0 1324 | 10,28,0 1325 | 38,9,0 1326 | 14,16,0 1327 | 13,37,0 1328 | 24,8,0 1329 | 41,3,1 1330 | 15,4,0 1331 | 12,19,0 1332 | 3,0,0 1333 | 9,27,0 1334 | 7,55,1 1335 | 4,20,0 1336 | 16,41,0 1337 | 4,22,0 1338 | 39,35,0 1339 | 23,61,0 1340 | 39,72,0 1341 | 9,26,0 1342 | 41,71,0 1343 | 21,53,0 1344 | 25,11,0 1345 | 33,46,0 1346 | 21,6,0 1347 | 23,60,0 1348 | 27,18,0 1349 | 24,28,0 1350 | 21,22,0 1351 | 17,23,0 1352 | 2,5,0 1353 | 15,77,0 1354 | 16,12,0 1355 | 39,1,0 1356 | 7,3,0 1357 | 36,4,0 1358 | 35,10,0 1359 | 37,77,0 1360 | 24,32,0 1361 | 24,27,0 1362 | 4,25,0 1363 | 10,42,2 1364 | 31,4,0 1365 | 29,39,0 1366 | 1,49,0 1367 | 34,38,2 1368 | 30,36,0 1369 | 26,4,0 1370 | 8,30,0 1371 | 6,20,0 1372 | 12,33,0 1373 | 27,20,0 1374 | 7,21,1 1375 | 32,46,1 1376 | 16,37,0 1377 | 20,6,0 1378 | 14,10,0 1379 | 37,69,1 1380 | 23,34,0 1381 | 9,13,0 1382 | 41,55,1 1383 | 37,74,1 1384 | 22,17,0 1385 | 33,52,0 1386 | 6,42,0 1387 | 11,26,1 1388 | 15,38,0 1389 | 12,20,0 1390 | 9,18,0 1391 | 28,15,0 1392 | 24,28,0 1393 | 4,19,0 1394 | 11,31,0 1395 | 12,18,0 1396 | 10,42,1 1397 | 12,51,0 1398 | 18,79,0 1399 | 22,58,0 1400 | 6,40,0 1401 | 33,20,0 1402 | 3,20,1 1403 | 40,60,0 1404 | 37,73,0 1405 | 39,40,0 1406 | 4,7,0 1407 | 24,6,1 1408 | 36,3,0 1409 | 19,8,0 1410 | 6,47,0 1411 | 8,30,1 1412 | 41,97,0 1413 | 7,23,1 1414 | 38,15,1 1415 | 11,41,0 1416 | 11,53,0 1417 | 37,48,0 1418 | 39,40,0 1419 | 9,15,0 1420 | 41,15,0 1421 | 15,40,0 1422 | 40,28,0 1423 | 31,4,1 1424 | 37,58,0 1425 | 6,8,0 1426 | 1,51,0 1427 | 20,51,2 1428 | 32,36,0 1429 | 7,21,0 1430 | 41,74,0 1431 | 16,21,0 1432 | 26,6,0 1433 | 11,3,5 1434 | 40,31,2 1435 | 38,12,0 1436 | 19,20,0 1437 | 1,23,0 1438 | 18,10,1 1439 | 29,36,0 1440 | 1,33,0 1441 | 36,47,0 1442 | 20,37,0 1443 | 37,30,0 1444 | 21,36,0 1445 | 34,3,2 1446 | 41,121,0 1447 | 1,37,0 1448 | 33,22,0 1449 | 22,51,0 1450 | 10,55,1 1451 | 39,35,0 1452 | 21,10,0 1453 | 39,26,0 1454 | 16,21,0 1455 | 41,33,0 1456 | 1,4,0 1457 | 41,80,0 1458 | 34,40,2 1459 | 37,51,0 1460 | 10,46,0 1461 | 27,10,0 1462 | 6,52,0 1463 | 33,23,0 1464 | 13,37,2 1465 | 13,45,0 
1466 | 6,61,1 1467 | 24,24,0 1468 | 5,1,0 1469 | 41,32,0 1470 | 40,62,0 1471 | 1,23,0 1472 | 33,11,0 1473 | 41,49,0 1474 | 39,57,0 1475 | 37,70,0 1476 | 32,40,0 1477 | 10,26,1 1478 | 36,23,0 1479 | 13,30,0 1480 | 29,2,2 1481 | 16,29,0 1482 | 9,28,0 1483 | 18,46,0 1484 | 17,29,1 1485 | 12,27,1 1486 | 24,16,1 1487 | 7,19,0 1488 | 7,41,0 1489 | 12,50,0 1490 | 16,26,0 1491 | 41,68,2 1492 | 32,43,0 1493 | 34,38,0 1494 | 14,8,0 1495 | 15,61,0 1496 | 11,71,0 1497 | 17,15,0 1498 | 34,45,1 1499 | 8,41,0 1500 | 2,6,0 1501 | 33,58,0 1502 | 6,45,0 1503 | 1,29,0 1504 | 41,41,0 1505 | 32,18,0 1506 | 9,22,0 1507 | 15,31,0 1508 | 3,16,0 1509 | 16,15,0 1510 | 16,41,0 1511 | 10,43,0 1512 | 32,27,0 1513 | 16,38,2 1514 | 17,34,0 1515 | 41,123,1 1516 | 10,24,0 1517 | 27,27,0 1518 | 41,46,0 1519 | 16,34,0 1520 | 2,5,0 1521 | 39,84,0 1522 | 15,34,0 1523 | 19,28,0 1524 | 2,6,1 1525 | 17,21,0 1526 | 35,21,2 1527 | 8,7,0 1528 | 20,32,0 1529 | 25,24,0 1530 | 30,7,0 1531 | 1,44,0 1532 | 11,31,0 1533 | 39,80,1 1534 | 9,9,0 1535 | 21,17,0 1536 | 16,48,0 1537 | 30,19,0 1538 | 24,32,0 1539 | 27,12,0 1540 | 36,23,0 1541 | 15,47,0 1542 | 15,34,0 1543 | 15,10,0 1544 | 21,50,0 1545 | 6,67,0 1546 | 18,59,0 1547 | 16,48,0 1548 | 19,18,0 1549 | 15,2,0 1550 | 29,2,0 1551 | 40,54,0 1552 | 10,7,0 1553 | 16,19,0 1554 | 39,73,0 1555 | 34,8,0 1556 | 28,30,0 1557 | 33,68,0 1558 | 33,2,0 1559 | 33,25,0 1560 | 12,46,0 1561 | 36,27,0 1562 | 20,62,1 1563 | 17,79,0 1564 | 41,48,0 1565 | 6,50,0 1566 | 16,23,0 1567 | 30,45,0 1568 | 29,34,0 1569 | 17,50,0 1570 | 20,31,0 1571 | 18,10,0 1572 | 6,35,0 1573 | 32,55,1 1574 | 23,34,0 1575 | 31,9,0 1576 | 29,32,0 1577 | 11,52,0 1578 | 37,27,0 1579 | 36,6,0 1580 | 7,58,0 1581 | 23,58,0 1582 | 1,26,0 1583 | 35,11,0 1584 | 17,62,0 1585 | 41,38,1 1586 | 11,36,0 1587 | 37,79,2 1588 | 24,28,0 1589 | 4,12,0 1590 | 29,11,3 1591 | 19,15,0 1592 | 8,32,0 1593 | 29,9,2 1594 | 17,69,0 1595 | 23,48,0 1596 | 34,33,0 1597 | 20,2,0 1598 | 16,15,0 1599 | 30,51,0 1600 | 23,46,0 1601 | 34,10,3 1602 | 18,49,0 1603 | 15,53,0 1604 | 37,47,0 1605 | 11,34,0 1606 | 39,71,0 1607 | 39,83,0 1608 | 37,61,0 1609 | 19,9,0 1610 | 16,24,2 1611 | 41,76,0 1612 | 18,56,1 1613 | 34,41,0 1614 | 16,48,0 1615 | 10,21,0 1616 | 37,60,0 1617 | 25,4,0 1618 | 6,43,0 1619 | 13,49,0 1620 | 34,9,0 1621 | 7,55,1 1622 | 18,50,0 1623 | 35,4,2 1624 | 17,33,0 1625 | 13,35,0 1626 | 41,55,1 1627 | 30,33,0 1628 | 27,11,0 1629 | 37,28,0 1630 | 6,52,0 1631 | 33,75,0 1632 | 17,70,0 1633 | 2,21,0 1634 | 6,0,0 1635 | 13,14,0 1636 | 24,30,0 1637 | -------------------------------------------------------------------------------- /NER/datasets/doc-paragraph-sentence-id/mrc-ner.dev-id: -------------------------------------------------------------------------------- 1 | 16,53,0 2 | 6,61,0 3 | 15,75,0 4 | 41,48,0 5 | 30,27,0 6 | 16,28,1 7 | 13,27,0 8 | 21,27,1 9 | 24,16,2 10 | 18,77,1 11 | 32,42,0 12 | 36,47,0 13 | 30,21,0 14 | 15,83,0 15 | 27,15,0 16 | 30,17,0 17 | 37,39,2 18 | 12,20,0 19 | 28,27,0 20 | 9,19,0 21 | 7,59,0 22 | 41,119,0 23 | 3,21,0 24 | 39,85,0 25 | 29,9,0 26 | 16,18,1 27 | 39,86,0 28 | 6,55,0 29 | 24,31,0 30 | 12,13,0 31 | 11,12,0 32 | 40,13,1 33 | 35,19,0 34 | 2,11,0 35 | 39,80,1 36 | 15,39,0 37 | 36,10,0 38 | 34,29,1 39 | 41,80,0 40 | 18,42,0 41 | 39,31,0 42 | 3,11,0 43 | 13,65,0 44 | 41,85,0 45 | 29,16,0 46 | 24,8,0 47 | 27,27,0 48 | 4,24,0 49 | 32,11,0 50 | 7,11,0 51 | 7,21,0 52 | 35,14,0 53 | 7,35,0 54 | 20,12,0 55 | 41,112,0 56 | 1,22,0 57 | 1,2,0 58 | 10,1,0 59 | 14,5,1 60 | 11,26,0 61 | 39,88,1 62 | 37,2,0 63 | 11,57,0 64 | 34,10,4 65 | 
7,36,0 66 | 9,18,0 67 | 14,30,0 68 | 22,62,0 69 | 28,11,0 70 | 15,12,0 71 | 41,113,0 72 | 12,30,4 73 | 30,57,2 74 | 16,31,0 75 | 9,17,0 76 | 23,52,0 77 | 30,51,0 78 | 40,28,0 79 | 39,74,1 80 | 39,2,0 81 | 39,47,0 82 | 12,24,0 83 | 13,65,0 84 | 14,31,0 85 | 27,25,0 86 | 26,11,0 87 | 39,45,0 88 | 41,105,0 89 | 17,49,0 90 | 30,7,0 91 | 15,57,0 92 | 14,6,1 93 | 14,38,0 94 | 34,51,0 95 | 12,49,0 96 | 30,43,0 97 | 24,13,0 98 | 37,69,1 99 | 2,4,0 100 | 12,11,0 101 | 22,86,0 102 | 33,69,0 103 | 9,24,0 104 | 20,37,0 105 | 8,14,0 106 | 14,29,2 107 | 18,16,0 108 | 1,49,0 109 | 41,53,0 110 | 41,55,1 111 | 27,28,0 112 | 33,17,0 113 | 17,82,0 114 | 13,13,0 115 | 16,30,0 116 | 41,36,1 117 | 32,14,0 118 | 30,29,0 119 | 41,49,2 120 | 39,26,1 121 | 22,53,0 122 | 24,16,1 123 | 40,45,0 124 | 13,37,3 125 | 25,6,0 126 | 28,7,0 127 | 4,8,0 128 | 38,3,0 129 | 34,31,2 130 | 37,23,0 131 | 35,21,0 132 | 16,7,0 133 | 41,61,0 134 | 16,14,0 135 | 13,29,0 136 | 3,8,0 137 | 17,31,0 138 | 41,26,0 139 | 41,85,1 140 | 34,10,5 141 | 41,61,1 142 | 41,31,0 143 | 22,77,0 144 | 27,15,2 145 | 9,5,0 146 | 7,10,0 147 | 13,26,0 148 | 23,19,0 149 | 39,11,0 150 | 1,22,0 151 | 16,39,5 152 | 2,9,4 153 | 25,4,0 154 | 39,45,0 155 | 17,47,0 156 | 10,41,0 157 | 17,10,0 158 | 3,14,0 159 | 17,1,0 160 | 8,8,0 161 | 39,70,0 162 | 34,9,0 163 | 14,16,0 164 | 18,68,0 165 | 11,10,0 166 | 30,53,1 167 | 34,42,2 168 | 41,91,0 169 | 30,7,0 170 | 14,5,0 171 | 8,1,1 172 | 27,15,1 173 | 17,84,0 174 | 23,42,0 175 | 41,104,0 176 | 23,14,1 177 | 8,12,0 178 | 4,24,0 179 | 11,54,0 180 | 6,3,0 181 | 16,13,0 182 | 11,63,0 183 | 7,11,4 184 | 24,3,0 185 | 6,39,0 186 | 25,16,0 187 | 6,61,1 188 | 33,16,0 189 | 16,18,1 190 | 6,53,2 191 | 13,74,0 192 | 7,37,1 193 | 19,28,0 194 | 10,26,0 195 | 22,26,0 196 | 9,13,0 197 | 17,35,3 198 | 33,40,0 199 | 9,7,0 200 | 6,50,0 201 | 14,15,2 202 | 13,79,0 203 | 2,19,0 204 | 6,14,0 205 | 39,54,0 206 | 20,33,3 207 | 7,10,0 208 | 30,29,0 209 | 27,23,0 210 | 40,19,1 211 | 16,4,2 212 | 41,61,0 213 | 2,12,1 214 | 33,46,0 215 | 11,58,0 216 | 13,80,2 217 | 25,7,0 218 | 34,49,1 219 | 12,22,0 220 | 15,20,1 221 | 41,37,1 222 | 6,39,0 223 | 34,20,0 224 | 13,39,1 225 | 6,46,4 226 | 9,28,0 227 | 38,12,0 228 | 1,7,3 229 | 17,53,0 230 | 34,31,1 231 | 9,19,0 232 | 36,39,0 233 | 30,55,1 234 | 4,5,0 235 | 40,32,2 236 | 37,73,0 237 | 24,3,0 238 | 40,66,0 239 | 37,3,0 240 | 8,34,2 241 | 16,25,0 242 | 2,24,0 243 | 23,47,0 244 | 11,45,0 245 | 41,57,1 246 | 36,31,0 247 | 17,44,0 248 | 3,3,0 249 | 23,51,0 250 | 1,50,0 251 | 11,76,2 252 | 37,1,1 253 | 7,14,0 254 | 29,11,2 255 | 35,17,0 256 | 21,32,0 257 | 17,38,3 258 | 4,16,0 259 | 12,42,0 260 | 8,11,0 261 | 32,32,1 262 | 39,17,1 263 | 24,31,0 264 | 33,7,0 265 | 37,39,0 266 | 34,43,0 267 | 8,25,4 268 | 17,35,0 269 | 18,65,0 270 | 29,37,0 271 | 13,73,0 272 | 14,24,0 273 | 15,73,0 274 | 24,25,0 275 | 32,44,0 276 | 16,44,0 277 | 39,26,2 278 | 17,71,0 279 | 17,73,0 280 | 37,69,2 281 | 16,23,0 282 | 17,31,0 283 | 40,103,0 284 | 34,38,0 285 | 22,67,0 286 | 18,46,0 287 | 9,7,1 288 | 11,42,0 289 | 15,18,0 290 | 12,32,0 291 | 13,41,0 292 | 34,34,1 293 | 30,57,1 294 | 9,24,0 295 | 20,1,0 296 | 4,12,0 297 | 15,22,0 298 | 6,57,0 299 | 26,13,1 300 | 13,0,0 301 | 21,12,0 302 | 26,2,0 303 | 20,62,1 304 | 41,73,0 305 | 21,50,0 306 | 34,24,0 307 | 39,31,1 308 | 26,9,0 309 | 17,47,0 310 | 18,23,0 311 | 6,45,0 312 | 9,24,0 313 | 8,22,0 314 | 36,27,0 315 | 16,52,0 316 | 10,17,0 317 | 23,25,0 318 | 16,4,0 319 | 7,3,0 320 | 21,53,0 321 | 3,5,1 322 | 41,103,0 323 | 34,40,0 324 | 25,22,0 325 | 27,16,0 326 | 13,77,0 327 | 16,4,0 
328 | 12,33,0 329 | 41,21,0 330 | 18,38,0 331 | 39,26,0 332 | 7,19,0 333 | 13,26,0 334 | 16,6,0 335 | 3,5,0 336 | 41,99,0 337 | 29,12,0 338 | 30,39,0 339 | 37,35,0 340 | 41,36,0 341 | 33,11,0 342 | 19,10,0 343 | 13,79,0 344 | 39,8,0 345 | 32,42,0 346 | 11,37,0 347 | 9,25,0 348 | 9,10,0 349 | 34,49,1 350 | 7,25,3 351 | 39,57,0 352 | 1,4,0 353 | 7,44,0 354 | 17,13,0 355 | 29,5,0 356 | 24,3,0 357 | 3,12,0 358 | 3,16,0 359 | 28,16,0 360 | 26,3,0 361 | 41,89,0 362 | 20,36,0 363 | 20,50,0 364 | 21,1,0 365 | 13,15,0 366 | 33,62,0 367 | 18,60,0 368 | 15,70,0 369 | 34,31,1 370 | 7,21,2 371 | 39,82,0 372 | 7,24,0 373 | 16,29,0 374 | 16,54,0 375 | 15,6,0 376 | 33,30,2 377 | 16,7,0 378 | 1,47,0 379 | 2,9,0 380 | 16,16,0 381 | 41,76,5 382 | 9,10,0 383 | 22,62,0 384 | 16,53,0 385 | 16,40,0 386 | 13,75,0 387 | 6,20,0 388 | 16,12,0 389 | 7,24,0 390 | 18,16,0 391 | 13,41,0 392 | 6,9,0 393 | 23,33,0 394 | 36,38,0 395 | 12,44,0 396 | 12,23,0 397 | 18,7,1 398 | 25,9,0 399 | 24,22,0 400 | 37,66,0 401 | 23,64,0 402 | 17,55,0 403 | 40,29,0 404 | 6,11,3 405 | 6,4,0 406 | 39,40,0 407 | 11,77,0 408 | 21,7,0 409 | 30,38,0 410 | 41,78,0 411 | 7,58,0 412 | 23,18,0 413 | 41,74,0 414 | 15,60,0 415 | 37,50,1 416 | 24,29,0 417 | 19,18,0 418 | 19,3,0 419 | 36,17,0 420 | 12,15,0 421 | 20,1,1 422 | 6,33,0 423 | 13,3,1 424 | 37,88,0 425 | 17,64,0 426 | 9,15,0 427 | 39,64,2 428 | 4,10,0 429 | 13,27,0 430 | 35,3,1 431 | 30,9,0 432 | 16,23,0 433 | 32,44,0 434 | 13,44,0 435 | 24,32,3 436 | 12,17,0 437 | 6,36,0 438 | 37,67,0 439 | 20,12,0 440 | 13,69,2 441 | 39,12,5 442 | 3,16,0 443 | 22,58,0 444 | 3,11,0 445 | 28,12,0 446 | 15,68,0 447 | 1,19,0 448 | 22,40,0 449 | 10,5,0 450 | 6,17,0 451 | 25,19,0 452 | 33,11,0 453 | 26,24,0 454 | 41,20,0 455 | 4,8,2 456 | 18,59,0 457 | 13,74,0 458 | 3,19,0 459 | 11,39,0 460 | 4,25,0 461 | 2,11,0 462 | 16,30,0 463 | 6,52,0 464 | 4,11,0 465 | 1,39,0 466 | 19,13,3 467 | 37,38,0 468 | 21,40,0 469 | 35,14,1 470 | 11,63,0 471 | 10,34,0 472 | 37,20,0 473 | 30,23,0 474 | 10,3,0 475 | 25,14,4 476 | 41,105,0 477 | 37,102,0 478 | 41,5,1 479 | 34,31,1 480 | 23,64,0 481 | 13,60,0 482 | 41,5,3 483 | 29,36,0 484 | 28,30,0 485 | 9,21,0 486 | 40,16,1 487 | 41,59,0 488 | 27,20,0 489 | 19,13,1 490 | 30,6,0 491 | 21,53,0 492 | 11,56,0 493 | 34,41,0 494 | 41,86,0 495 | 15,23,0 496 | 7,5,0 497 | 6,45,0 498 | 33,51,0 499 | 30,48,0 500 | 38,11,0 501 | 8,9,0 502 | 27,11,0 503 | 16,14,2 504 | 39,48,0 505 | 16,6,0 506 | 17,57,0 507 | 20,6,0 508 | 32,47,0 509 | 41,49,0 510 | 13,37,2 511 | 25,1,0 512 | 12,24,0 513 | 12,10,0 514 | 15,77,0 515 | 9,9,0 516 | 37,60,1 517 | 11,72,0 518 | 15,45,0 519 | 2,9,2 520 | 17,27,0 521 | 16,44,0 522 | 15,82,0 523 | 23,60,0 524 | 14,15,3 525 | 16,24,0 526 | 9,24,0 527 | 24,14,0 528 | 33,68,0 529 | 6,33,0 530 | 19,8,0 531 | 7,23,1 532 | 17,81,0 533 | 34,2,0 534 | 14,47,0 535 | 41,61,1 536 | 17,32,0 537 | 41,110,0 538 | 3,2,0 539 | 4,28,0 540 | 3,2,0 541 | 30,51,0 542 | 26,9,1 543 | 18,29,0 544 | 17,32,0 545 | 18,15,1 546 | 23,1,0 547 | 17,70,0 548 | 34,22,0 549 | 23,2,0 550 | 41,12,0 551 | 28,30,0 552 | 34,38,2 553 | 33,67,0 554 | 32,45,0 555 | 37,63,1 556 | 8,30,2 557 | 33,75,0 558 | 21,20,0 559 | 40,53,4 560 | 37,72,0 561 | 15,12,0 562 | 4,9,0 563 | 30,43,0 564 | 34,11,1 565 | 19,9,0 566 | 41,74,1 567 | 24,27,0 568 | 9,1,2 569 | 36,19,0 570 | 3,4,0 571 | 24,25,0 572 | 41,78,2 573 | 34,39,3 574 | 17,92,0 575 | 11,8,2 576 | 33,44,1 577 | 37,69,1 578 | 11,19,1 579 | 16,39,0 580 | 23,14,2 581 | 41,61,0 582 | 27,23,0 583 | 23,25,0 584 | 16,12,0 585 | 41,111,0 586 | 9,1,1 587 | 16,23,0 588 | 
38,3,0 589 | 37,20,0 590 | 37,69,1 591 | 14,44,0 592 | 15,69,0 593 | 24,17,0 594 | 32,11,0 595 | 6,26,0 596 | 6,6,1 597 | 29,2,0 598 | 15,37,0 599 | 37,69,1 600 | 26,21,0 601 | 35,0,0 602 | 39,66,0 603 | 33,15,0 604 | 15,20,0 605 | 13,10,1 606 | 1,58,0 607 | 37,100,0 608 | 8,19,0 609 | 18,7,2 610 | 29,22,0 611 | 24,10,1 612 | 40,112,0 613 | 16,40,1 614 | 6,37,0 615 | 15,12,0 616 | 1,24,0 617 | 25,23,0 618 | 10,55,0 619 | 12,1,0 620 | 14,33,0 621 | 41,47,1 622 | 25,4,0 623 | 27,7,0 624 | 15,10,0 625 | 17,34,0 626 | 14,39,0 627 | 17,35,0 628 | 36,29,0 629 | 19,27,0 630 | 17,31,0 631 | 33,37,2 632 | 3,7,0 633 | 41,90,0 634 | 10,31,0 635 | 27,27,0 636 | 22,48,0 637 | 2,24,0 638 | 13,3,1 639 | 14,17,1 640 | 30,23,0 641 | 6,6,0 642 | 41,85,1 643 | 16,4,2 644 | 13,67,0 645 | 6,46,0 646 | 7,13,0 647 | 15,19,0 648 | 12,25,1 649 | 1,31,0 650 | 30,18,1 651 | 18,23,0 652 | 29,15,0 653 | 15,10,0 654 | 3,3,1 655 | 38,15,1 656 | 36,32,0 657 | 13,18,0 658 | 30,27,0 659 | 39,8,0 660 | 34,49,1 661 | 17,33,0 662 | 15,3,0 663 | 10,23,0 664 | 16,1,0 665 | 11,48,0 666 | 8,32,0 667 | 33,54,1 668 | 18,75,0 669 | 17,73,0 670 | 15,77,0 671 | 29,36,0 672 | 30,33,0 673 | 21,27,1 674 | 34,10,2 675 | 13,24,0 676 | 37,66,0 677 | 37,103,0 678 | 15,12,0 679 | 15,24,0 680 | 11,77,0 681 | 2,18,0 682 | 7,24,0 683 | 10,44,2 684 | 25,19,0 685 | 12,33,1 686 | 13,71,0 687 | 35,17,0 688 | 33,48,2 689 | 14,10,0 690 | 37,43,0 691 | 11,32,0 692 | 1,42,0 693 | 17,5,1 694 | 11,18,0 695 | 15,16,0 696 | 14,7,2 697 | 15,20,0 698 | 7,16,0 699 | 8,30,2 700 | 1,21,0 701 | 41,107,1 702 | 41,76,0 703 | 12,51,0 704 | 14,40,1 705 | 29,24,0 706 | 28,26,0 707 | 41,3,0 708 | 9,30,0 709 | 33,75,0 710 | 11,33,0 711 | 41,76,6 712 | 18,53,2 713 | 37,7,0 714 | 29,35,0 715 | 10,47,0 716 | 17,81,0 717 | 11,53,0 718 | 33,36,0 719 | 28,11,0 720 | 11,56,0 721 | 11,48,0 722 | 6,48,0 723 | 34,45,1 724 | 30,31,0 725 | 33,73,1 726 | 1,31,0 727 | 37,13,2 728 | 10,0,0 729 | 39,11,0 730 | 39,71,0 731 | 8,1,0 732 | 12,11,2 733 | 37,5,0 734 | 40,70,0 735 | 36,33,0 736 | 12,21,0 737 | 19,27,0 738 | 40,72,0 739 | 4,3,1 740 | 4,20,1 741 | 20,24,2 742 | 41,77,0 743 | 31,10,0 744 | 30,14,0 745 | 21,36,0 746 | 12,51,0 747 | 41,75,0 748 | 13,22,0 749 | 16,7,0 750 | 36,22,0 751 | 8,22,0 752 | 15,69,0 753 | 6,4,0 754 | 37,74,0 755 | 34,31,1 756 | 15,77,0 757 | 30,41,0 758 | 36,25,0 759 | 16,43,0 760 | 13,14,0 761 | 16,32,0 762 | 22,75,0 763 | 29,32,4 764 | 13,40,0 765 | 6,35,0 766 | 11,43,1 767 | 41,86,0 768 | 39,8,1 769 | 4,3,0 770 | 15,75,0 771 | 41,110,1 772 | 40,100,0 773 | 39,82,0 774 | 3,1,0 775 | 27,27,0 776 | 19,26,2 777 | 13,15,0 778 | 4,20,0 779 | 6,6,1 780 | 6,48,0 781 | 24,9,0 782 | 20,31,1 783 | 36,9,0 784 | 11,61,0 785 | 41,6,0 786 | 11,22,1 787 | 41,76,0 788 | 4,8,0 789 | 16,2,1 790 | 24,32,0 791 | 33,1,0 792 | 14,21,0 793 | 13,76,0 794 | 27,20,0 795 | 35,0,0 796 | 8,1,0 797 | 17,60,0 798 | 21,34,0 799 | 9,28,0 800 | 15,15,0 801 | 16,36,0 802 | 16,53,0 803 | 12,23,0 804 | 41,32,0 805 | 41,103,0 806 | 18,69,0 807 | 37,9,0 808 | 27,26,0 809 | 11,65,0 810 | 26,3,0 811 | 22,12,0 812 | 41,54,0 813 | 33,60,0 814 | 2,5,0 815 | 36,42,0 816 | 11,39,0 817 | 15,0,0 818 | 37,10,0 819 | 13,65,0 820 | 24,17,0 821 | 37,88,0 822 | 6,6,2 823 | 3,25,0 824 | 8,24,0 825 | 23,46,0 826 | 11,67,0 827 | 27,18,0 828 | 41,21,0 829 | 32,25,0 830 | 39,75,0 831 | 18,27,0 832 | 6,36,0 833 | 41,32,0 834 | 21,13,0 835 | 17,34,1 836 | 41,87,0 837 | 13,74,1 838 | 16,49,1 839 | 21,4,0 840 | 41,74,1 841 | 40,54,0 842 | 41,101,0 843 | 9,28,1 844 | 40,22,0 845 | 37,25,1 846 | 33,3,0 847 | 37,54,0 
848 | 6,29,0 849 | 9,11,0 850 | 35,9,3 851 | 8,34,0 852 | 28,31,0 853 | 14,7,0 854 | 37,85,0 855 | 8,27,0 856 | 19,12,1 857 | 33,32,1 858 | 41,97,0 859 | 27,15,0 860 | 13,56,0 861 | 6,34,0 862 | 41,75,0 863 | 9,9,0 864 | 37,33,0 865 | 14,2,0 866 | 30,25,0 867 | 6,66,0 868 | 5,3,0 869 | 41,55,1 870 | 4,29,0 871 | 7,48,0 872 | 10,44,2 873 | 29,32,0 874 | 12,35,0 875 | 37,21,0 876 | 39,79,0 877 | 23,43,0 878 | 4,1,2 879 | 40,90,0 880 | 34,38,0 881 | 25,19,1 882 | 34,32,2 883 | 37,71,0 884 | 18,54,0 885 | 17,89,0 886 | 9,19,0 887 | 15,32,0 888 | 13,66,0 889 | 26,14,1 890 | 32,31,0 891 | 1,2,0 892 | 37,79,1 893 | 37,29,0 894 | 41,61,0 895 | 13,30,1 896 | 18,66,0 897 | 12,20,1 898 | 20,23,0 899 | 1,58,3 900 | 14,17,3 901 | 21,1,0 902 | 6,46,0 903 | 6,24,0 904 | 36,16,0 905 | 10,39,0 906 | 30,5,0 907 | 23,46,0 908 | 17,92,3 909 | 7,60,0 910 | 41,98,0 911 | 1,33,1 912 | 14,30,0 913 | 39,26,1 914 | 37,36,0 915 | 6,50,0 916 | 3,8,0 917 | 16,15,0 918 | 37,69,3 919 | 16,49,0 920 | 17,17,0 921 | 16,29,0 922 | 9,6,0 923 | 3,22,1 924 | 39,46,0 925 | 10,22,0 926 | 11,76,0 927 | 6,37,0 928 | 4,29,0 929 | 16,20,0 930 | 14,47,0 931 | 16,4,0 932 | 3,22,0 933 | 34,27,0 934 | 32,2,0 935 | 30,44,0 936 | 27,23,0 937 | 26,27,0 938 | 16,20,0 939 | 16,40,0 940 | 41,63,0 941 | 34,5,0 942 | 30,56,0 943 | 13,23,0 944 | 13,32,0 945 | 24,32,3 946 | 38,12,0 947 | 12,41,0 948 | 13,37,0 949 | 17,12,0 950 | 9,28,0 951 | 39,25,0 952 | 7,34,0 953 | 41,106,3 954 | 8,22,0 955 | 16,39,1 956 | 37,20,0 957 | 23,2,0 958 | 15,24,0 959 | 33,75,0 960 | 30,29,0 961 | 33,73,3 962 | 16,5,0 963 | 16,51,0 964 | 34,27,1 965 | 20,4,0 966 | 17,62,2 967 | 40,90,0 968 | 14,28,0 969 | 19,10,0 970 | 7,21,0 971 | 37,48,0 972 | 32,25,0 973 | 40,82,0 974 | 20,12,0 975 | 13,10,2 976 | 36,43,2 977 | 28,31,0 978 | 12,51,0 979 | 8,21,0 980 | 34,10,3 981 | 26,22,0 982 | 7,51,0 983 | 15,12,0 984 | 41,123,2 985 | 33,62,0 986 | 32,35,0 987 | 13,9,1 988 | 35,9,0 989 | 30,52,0 990 | 37,6,3 991 | 39,11,2 992 | 26,3,1 993 | 22,26,0 994 | 2,19,0 995 | 20,37,0 996 | 6,60,0 997 | 15,67,0 998 | 41,88,0 999 | 18,45,0 1000 | 18,7,0 1001 | 12,35,0 1002 | 13,80,0 1003 | 9,12,0 1004 | 34,10,0 1005 | 7,60,0 1006 | 27,23,0 1007 | 13,23,0 1008 | 9,8,0 1009 | 41,46,1 1010 | 18,31,0 1011 | 39,48,0 1012 | 37,74,1 1013 | 33,37,0 1014 | 13,70,0 1015 | 17,78,0 1016 | 28,27,0 1017 | 11,54,0 1018 | 15,8,0 1019 | 11,66,0 1020 | 41,58,0 1021 | 26,4,1 1022 | 1,2,0 1023 | 21,14,5 1024 | 27,23,0 1025 | 8,21,0 1026 | 18,35,1 1027 | 9,11,0 1028 | 33,42,0 1029 | 17,38,1 1030 | 28,8,0 1031 | 3,10,0 1032 | 21,30,0 1033 | 37,54,0 1034 | 4,31,0 1035 | 8,35,2 1036 | 15,5,0 1037 | 23,54,0 1038 | 41,75,0 1039 | 13,62,0 1040 | 23,45,0 1041 | 15,14,0 1042 | 13,32,0 1043 | 17,11,0 1044 | 2,22,0 1045 | 15,79,0 1046 | 22,86,0 1047 | 36,1,0 1048 | 8,34,2 1049 | 11,32,0 1050 | 11,22,0 1051 | 38,10,0 1052 | 9,16,0 1053 | 6,31,0 1054 | 39,60,0 1055 | 27,28,0 1056 | 15,30,0 1057 | 13,35,0 1058 | 41,18,0 1059 | 14,16,0 1060 | 1,59,0 1061 | 24,20,0 1062 | 15,83,0 1063 | 33,25,0 1064 | 6,32,0 1065 | 7,50,0 1066 | 15,16,0 1067 | 10,5,0 1068 | 36,6,0 1069 | 32,18,2 1070 | 41,46,0 1071 | 13,30,0 1072 | 17,8,0 1073 | 33,46,0 1074 | 13,69,2 1075 | 34,8,4 1076 | 28,12,0 1077 | 36,24,0 1078 | 23,66,0 1079 | 27,10,0 1080 | 12,23,0 1081 | 7,40,0 1082 | 19,5,0 1083 | 13,11,0 1084 | 7,5,0 1085 | 22,1,2 1086 | 21,43,0 1087 | 12,18,0 1088 | 3,13,0 1089 | 6,42,1 1090 | 6,67,0 1091 | 34,32,3 1092 | 18,19,0 1093 | 33,17,1 1094 | 30,55,1 1095 | 15,38,0 1096 | 13,61,0 1097 | 3,21,0 1098 | 30,23,0 1099 | 12,5,0 1100 | 9,20,0 
1101 | 11,59,0 1102 | 37,39,1 1103 | 34,44,0 1104 | 12,14,0 1105 | 39,24,1 1106 | 25,6,1 1107 | 1,17,0 1108 | 22,24,0 1109 | 1,15,0 1110 | 40,85,0 1111 | 18,60,1 1112 | 6,49,0 1113 | 24,8,0 1114 | 9,26,0 1115 | 18,68,0 1116 | 39,75,0 1117 | 6,33,0 1118 | 17,63,0 1119 | 34,32,1 1120 | 34,52,2 1121 | 36,26,0 1122 | 4,15,0 1123 | 12,30,0 1124 | 4,8,2 1125 | 24,1,0 1126 | 41,104,0 1127 | 13,71,0 1128 | 41,8,0 1129 | 6,11,1 1130 | 3,18,0 1131 | 33,40,0 1132 | 6,66,0 1133 | 4,8,0 1134 | 37,6,0 1135 | 13,47,0 1136 | 30,21,0 1137 | 13,43,0 1138 | 27,7,0 1139 | 5,6,0 1140 | 13,60,1 1141 | 11,66,0 1142 | 13,28,0 1143 | 9,3,0 1144 | 25,21,0 1145 | 13,47,0 1146 | 4,1,0 1147 | 41,61,0 1148 | 37,50,0 1149 | 27,30,0 1150 | 6,53,0 1151 | 9,15,0 1152 | 22,39,0 1153 | 36,17,0 1154 | 36,21,0 1155 | 40,21,0 1156 | 6,13,0 1157 | 6,1,0 1158 | 22,58,0 1159 | 15,4,3 1160 | 41,82,0 1161 | 8,14,0 1162 | 13,44,0 1163 | 14,31,0 1164 | 41,119,1 1165 | 22,65,0 1166 | 17,13,0 1167 | 7,25,4 1168 | 17,60,0 1169 | 12,49,0 1170 | 39,33,0 1171 | 27,6,0 1172 | 39,81,0 1173 | 30,7,0 1174 | 29,25,0 1175 | 30,41,0 1176 | 32,44,0 1177 | 15,18,0 1178 | 41,57,1 1179 | 4,12,0 1180 | 30,28,0 1181 | 10,20,1 1182 | 27,8,0 1183 | 34,38,0 1184 | 41,9,0 1185 | 11,19,0 1186 | 11,44,0 1187 | 18,25,0 1188 | 15,44,0 1189 | 30,51,0 1190 | 12,46,0 1191 | 41,35,1 1192 | 30,8,0 1193 | 34,10,4 1194 | 14,14,0 1195 | 36,38,0 1196 | 33,11,0 1197 | 10,20,0 1198 | 15,8,0 1199 | 37,26,0 1200 | 12,30,0 1201 | 12,30,1 1202 | 14,17,0 1203 | 16,25,0 1204 | 13,63,0 1205 | 18,76,0 1206 | 32,15,1 1207 | 39,80,0 1208 | 40,77,0 1209 | 4,3,0 1210 | 13,80,1 1211 | 12,11,0 1212 | 7,26,0 1213 | 23,60,0 1214 | 34,44,1 1215 | 28,17,1 1216 | 37,73,1 1217 | 15,40,0 1218 | 13,20,0 1219 | 17,60,0 1220 | 33,19,0 1221 | 40,55,2 1222 | 24,24,0 1223 | 41,54,0 1224 | 34,32,1 1225 | 18,14,0 1226 | 22,62,0 1227 | 9,1,0 1228 | 7,39,0 1229 | 29,4,0 1230 | 38,12,0 1231 | 41,65,1 1232 | 37,27,0 1233 | 13,2,0 1234 | 20,55,1 1235 | 11,43,0 1236 | 41,82,0 1237 | 34,10,1 1238 | 36,34,0 1239 | 33,46,0 1240 | 10,44,1 1241 | 1,0,0 1242 | 26,13,0 1243 | 16,4,0 1244 | 34,44,1 1245 | 33,59,0 1246 | 17,7,0 1247 | 18,53,3 1248 | 15,46,0 1249 | 21,14,6 1250 | 7,38,0 1251 | 12,19,0 1252 | 32,6,0 1253 | 24,26,0 1254 | 19,16,1 1255 | 29,9,0 1256 | 6,46,2 1257 | 11,58,0 1258 | 11,32,0 1259 | 18,5,1 1260 | 35,2,0 1261 | 15,79,0 1262 | 36,24,0 1263 | 9,24,0 1264 | 39,72,0 1265 | 4,23,0 1266 | 12,50,0 1267 | 8,10,0 1268 | 1,44,0 1269 | 36,41,0 1270 | 9,22,0 1271 | 8,9,0 1272 | 8,14,2 1273 | 15,4,0 1274 | 6,34,0 1275 | 39,9,0 1276 | 19,15,0 1277 | 34,11,0 1278 | 36,24,0 1279 | 41,97,0 1280 | 22,88,0 1281 | 8,36,0 1282 | 33,13,0 1283 | 37,18,0 1284 | 15,70,0 1285 | 13,10,2 1286 | 12,37,0 1287 | 41,72,0 1288 | 15,69,0 1289 | 16,1,0 1290 | 40,84,0 1291 | 1,26,0 1292 | 29,22,0 1293 | 15,82,0 1294 | 24,25,0 1295 | 13,74,0 1296 | 12,32,0 1297 | 22,46,0 1298 | 34,36,0 1299 | 11,24,0 1300 | 12,19,0 1301 | 41,61,0 1302 | 36,10,0 1303 | 17,33,0 1304 | 24,29,0 1305 | 41,17,0 1306 | 37,73,0 1307 | 40,68,1 1308 | 21,52,0 1309 | 19,12,1 1310 | 41,54,0 1311 | 36,3,0 1312 | 6,46,3 1313 | 18,12,0 1314 | 2,0,0 1315 | 8,34,0 1316 | 34,22,0 1317 | 23,17,0 1318 | 4,6,0 1319 | 36,20,0 1320 | 8,16,0 1321 | 30,55,2 1322 | 14,1,0 1323 | 27,5,0 1324 | 14,29,1 1325 | 24,14,0 1326 | 39,72,0 1327 | 14,11,0 1328 | 15,16,0 1329 | 15,71,0 1330 | 41,109,0 1331 | 6,25,0 1332 | 17,48,0 1333 | 9,1,2 1334 | 37,1,0 1335 | 36,3,2 1336 | 15,19,0 1337 | 15,38,1 1338 | 41,82,0 1339 | 12,5,0 1340 | 16,25,0 1341 | 20,61,0 1342 | 30,21,0 1343 | 
2,7,0 1344 | 30,53,0 1345 | 10,40,1 1346 | 14,16,0 1347 | 9,11,0 1348 | 17,73,0 1349 | 11,18,0 1350 | 16,7,0 1351 | 7,2,0 1352 | 8,41,0 1353 | 27,8,0 1354 | 24,28,0 1355 | 5,13,0 1356 | 14,45,0 1357 | 2,24,0 1358 | 6,46,1 1359 | 14,29,0 1360 | 27,13,0 1361 | 11,52,0 1362 | 11,66,1 1363 | 1,28,0 1364 | 32,42,0 1365 | 41,112,0 1366 | 11,5,1 1367 | 24,32,0 1368 | 8,19,3 1369 | 3,17,0 1370 | 12,42,0 1371 | 39,26,0 1372 | 38,15,0 1373 | 8,7,1 1374 | 29,13,0 1375 | 34,31,1 1376 | 37,99,0 1377 | 40,28,0 1378 | 23,37,0 1379 | 25,24,0 1380 | 27,25,0 1381 | 7,29,0 1382 | 22,51,0 1383 | 10,49,1 1384 | 39,47,0 1385 | 1,58,0 1386 | 41,119,1 1387 | 37,35,0 1388 | 29,9,2 1389 | 13,60,1 1390 | 41,91,0 1391 | 38,10,0 1392 | 18,35,0 1393 | 33,13,0 1394 | 14,34,2 1395 | 34,49,1 1396 | 33,74,1 1397 | 30,35,0 1398 | 39,31,1 1399 | 21,52,0 1400 | 39,59,1 1401 | 23,12,0 1402 | 15,72,0 1403 | 19,9,0 1404 | 37,83,0 1405 | 35,22,1 1406 | 14,38,0 1407 | 24,6,0 1408 | 11,63,0 1409 | 37,37,0 1410 | 14,15,2 1411 | 14,2,1 1412 | 25,11,0 1413 | 37,61,0 1414 | 34,45,1 1415 | 41,76,3 1416 | 21,3,0 1417 | 27,10,0 1418 | 29,27,0 1419 | 15,64,0 1420 | 41,57,0 1421 | 7,50,0 1422 | 37,27,0 1423 | 41,85,0 1424 | 3,0,0 1425 | 15,82,0 1426 | 6,15,0 1427 | 41,78,0 1428 | 32,27,0 1429 | 19,28,0 1430 | 30,34,0 1431 | 1,56,0 1432 | 2,14,0 1433 | 6,39,0 1434 | 14,12,0 1435 | 28,28,0 1436 | 6,37,0 1437 | 14,35,0 1438 | 23,20,0 1439 | 17,94,0 1440 | 34,42,0 1441 | 16,16,0 1442 | 39,46,0 1443 | 13,37,0 1444 | 10,32,0 1445 | 8,22,0 1446 | 8,23,0 1447 | 30,33,0 1448 | 40,29,0 1449 | 13,55,0 1450 | 1,44,0 1451 | 15,56,0 1452 | 21,30,0 1453 | 29,0,0 1454 | 8,33,0 1455 | 13,6,0 1456 | 26,24,0 1457 | 41,47,1 1458 | 2,15,0 1459 | 20,36,0 1460 | 14,27,0 1461 | 16,39,0 1462 | 15,53,0 1463 | 4,1,4 1464 | 16,36,0 1465 | 35,21,0 1466 | 8,19,3 1467 | 22,40,0 1468 | 38,4,0 1469 | 15,10,0 1470 | 41,61,0 1471 | 9,30,0 1472 | 20,36,0 1473 | 1,10,3 1474 | 6,57,0 1475 | 14,20,0 1476 | 14,15,0 1477 | 38,9,0 1478 | 14,30,0 1479 | 37,22,0 1480 | 14,0,0 1481 | 13,30,1 1482 | 7,22,0 1483 | 14,29,2 1484 | 13,26,0 1485 | 35,11,2 1486 | 25,12,0 1487 | 6,26,0 1488 | 6,53,0 1489 | 13,72,0 1490 | 16,12,0 1491 | 17,61,0 1492 | 8,14,0 1493 | 11,69,0 1494 | 21,23,0 1495 | 6,9,0 1496 | 41,88,0 1497 | 21,35,0 1498 | 10,17,0 1499 | 6,17,0 1500 | 30,17,0 1501 | 33,70,0 1502 | 39,34,0 1503 | 13,47,0 1504 | 11,58,0 1505 | 6,48,0 1506 | 37,14,0 1507 | 6,66,0 1508 | 33,20,0 1509 | 13,53,0 1510 | 41,31,0 1511 | 15,51,0 1512 | 35,16,0 1513 | 13,47,0 1514 | 37,1,1 1515 | 33,15,0 1516 | 27,20,0 1517 | 4,10,3 1518 | 30,27,0 1519 | 4,21,3 1520 | 12,52,0 1521 | 14,42,0 1522 | 3,6,0 1523 | 34,34,0 1524 | 22,28,0 1525 | 26,3,1 1526 | 7,39,0 1527 | 12,32,0 1528 | 7,63,0 1529 | 15,29,0 1530 | 30,59,0 1531 | 10,44,0 1532 | 30,39,0 1533 | 6,48,0 1534 | 22,12,0 1535 | 4,15,0 1536 | 12,17,0 1537 | 6,9,1 1538 | 10,46,1 1539 | 4,20,0 1540 | 24,25,0 1541 | 30,39,0 1542 | 38,5,0 1543 | 13,11,0 1544 | 24,22,0 1545 | 33,59,0 1546 | 35,5,4 1547 | 17,2,0 1548 | 41,17,0 1549 | 33,67,0 1550 | 17,62,2 1551 | 39,24,1 1552 | 9,1,0 1553 | 41,15,0 1554 | 22,12,0 1555 | 41,47,2 1556 | 3,5,0 1557 | 34,32,1 1558 | 16,20,0 1559 | 23,27,0 1560 | 13,74,1 1561 | 34,31,1 1562 | 41,59,0 1563 | 8,33,0 1564 | 28,27,0 1565 | 13,64,0 1566 | 12,25,1 1567 | 17,32,1 1568 | 7,41,0 1569 | 6,34,0 1570 | 21,27,0 1571 | 28,10,0 1572 | 37,15,0 1573 | 13,69,1 1574 | 18,67,0 1575 | 12,1,0 1576 | 9,21,0 1577 | 35,6,0 1578 | 2,19,1 1579 | 15,30,0 1580 | 32,33,0 1581 | 14,38,0 1582 | 24,10,0 1583 | 17,82,0 1584 | 4,17,0 1585 | 3,1,0 
1586 | 3,13,0 1587 | 8,34,1 1588 | 39,45,1 1589 | 21,5,0 1590 | 11,29,2 1591 | 6,62,2 1592 | 17,86,0 1593 | 38,9,0 1594 | 27,4,0 1595 | 38,10,0 1596 | 6,22,0 1597 | 41,11,0 1598 | 37,74,1 1599 | 14,18,0 1600 | 36,32,0 1601 | 12,10,0 1602 | 29,32,0 1603 | 34,5,0 1604 | 6,37,0 1605 | 13,2,0 1606 | 41,49,0 1607 | 17,67,0 1608 | 9,12,0 1609 | 9,10,0 1610 | 13,77,0 1611 | 41,119,1 1612 | 33,15,0 1613 | 1,44,0 1614 | 8,14,0 1615 | 12,37,0 1616 | 6,58,0 1617 | 13,9,0 1618 | 16,19,0 1619 | 27,7,0 1620 | 13,44,0 1621 | 35,21,3 1622 | 41,73,0 1623 | 37,66,0 1624 | 4,26,0 1625 | 24,24,0 1626 | 29,32,4 1627 | 33,46,0 1628 | 41,38,0 1629 | 32,30,1 1630 | 25,22,0 1631 | 18,73,0 1632 | 11,52,0 1633 | 19,15,0 1634 | 17,79,3 1635 | 37,4,0 1636 | 17,33,0 1637 | --------------------------------------------------------------------------------
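
The `mrc-ner.dev-id` and `mrc-ner.test-id` files above appear to hold one `doc,paragraph,sentence` index triple per example, identifying where each MRC-NER instance originates in the annotated corpus. A minimal sketch of reading such a file, assuming that interpretation (the helper `load_id_file` is hypothetical and not part of the repository):

```python
# Hypothetical helper: parse a *-id file such as mrc-ner.dev-id, where each
# line seems to contain a "doc,paragraph,sentence" index triple.
from typing import List, Tuple


def load_id_file(path: str) -> List[Tuple[int, int, int]]:
    """Return (doc_id, paragraph_id, sentence_id) triples, one per line."""
    triples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            doc_id, para_id, sent_id = (int(x) for x in line.split(","))
            triples.append((doc_id, para_id, sent_id))
    return triples


if __name__ == "__main__":
    ids = load_id_file("NER/datasets/doc-paragraph-sentence-id/mrc-ner.dev-id")
    print(len(ids), "examples; first triple:", ids[0])
```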